diff --git a/docs/reference/query-dsl/script-score-query.asciidoc b/docs/reference/query-dsl/script-score-query.asciidoc index ee68d3e40fe..56c4f7c41b8 100644 --- a/docs/reference/query-dsl/script-score-query.asciidoc +++ b/docs/reference/query-dsl/script-score-query.asciidoc @@ -182,60 +182,44 @@ different from the query's vector, 0 is used for missing dimensions in the calculations of vector functions. -[[random-functions]] -===== Random functions -There are two predefined ways to produce random values: -`randomNotReproducible` and `randomReproducible`. +[[random-score-function]] +===== Random score function +`random_score` function generates scores that are uniformly distributed +from 0 up to but not including 1. -`randomNotReproducible()` uses `java.util.Random` class -to generate a random value of the type `long`. -The generated values are not reproducible between requests' invocations. +`randomScore` function has the following syntax: +`randomScore(, )`. +It has a required parameter - `seed` as an integer value, +and an optional parameter - `fieldName` as a string value. [source,js] -------------------------------------------------- "script" : { - "source" : "randomNotReproducible()" + "source" : "randomScore(100, '_seq_no')" +} +-------------------------------------------------- +// NOTCONSOLE + +If the `fieldName` parameter is omitted, the internal Lucene +document ids will be used as a source of randomness. This is very efficient, +but unfortunately not reproducible since documents might be renumbered +by merges. + +[source,js] +-------------------------------------------------- +"script" : { + "source" : "randomScore(100)" } -------------------------------------------------- // NOTCONSOLE -`randomReproducible(String seedValue, int seed)` produces -reproducible random values of type `long`. This function requires -more computational time and memory than the non-reproducible version. - -A good candidate for the `seedValue` is document field values that -are unique across documents and already pre-calculated and preloaded -in the memory. For example, values of the document's `_seq_no` field -is a good candidate, as documents on the same shard have unique values -for the `_seq_no` field. - -[source,js] --------------------------------------------------- -"script" : { - "source" : "randomReproducible(Long.toString(doc['_seq_no'].value), 100)" -} --------------------------------------------------- -// NOTCONSOLE - - -A drawback of using `_seq_no` is that generated values change if -documents are updated. Another drawback is not absolute uniqueness, as -documents from different shards with the same sequence numbers -generate the same random values. - -If you need random values to be distinct across different shards, -you can use a field with unique values across shards, -such as `_id`, but watch out for the memory usage as all -these unique values need to be loaded into memory. - -[source,js] --------------------------------------------------- -"script" : { - "source" : "randomReproducible(doc['_id'].value, 100)" -} --------------------------------------------------- -// NOTCONSOLE +Note that documents that are within the same shard and have the +same value for field will get the same score, so it is usually desirable +to use a field that has unique values for all documents across a shard. +A good default choice might be to use the `_seq_no` +field, whose only drawback is that scores will change if the document is +updated since update operations also update the value of the `_seq_no` field. [[decay-functions]] @@ -349,8 +333,8 @@ the following script: ===== `random_score` -Use `randomReproducible` and `randomNotReproducible` functions -as described in <>. +Use `randomScore` function +as described in <>. ===== `field_value_factor` diff --git a/modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/org.elasticsearch.score.txt b/modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/org.elasticsearch.score.txt index 3d7b29826c7..03ec9275aa8 100644 --- a/modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/org.elasticsearch.score.txt +++ b/modules/lang-painless/src/main/resources/org/elasticsearch/painless/spi/org.elasticsearch.score.txt @@ -19,11 +19,14 @@ # This file contains a whitelist for functions to be used in Score context +class org.elasticsearch.script.ScoreScript no_import { +} + static_import { double saturation(double, double) from_class org.elasticsearch.script.ScoreScriptUtils double sigmoid(double, double, double) from_class org.elasticsearch.script.ScoreScriptUtils - double randomReproducible(String, int) from_class org.elasticsearch.script.ScoreScriptUtils - double randomNotReproducible() bound_to org.elasticsearch.script.ScoreScriptUtils$RandomNotReproducible + double randomScore(org.elasticsearch.script.ScoreScript, int, String) bound_to org.elasticsearch.script.ScoreScriptUtils$RandomScoreField + double randomScore(org.elasticsearch.script.ScoreScript, int) bound_to org.elasticsearch.script.ScoreScriptUtils$RandomScoreDoc double decayGeoLinear(String, String, String, double, GeoPoint) bound_to org.elasticsearch.script.ScoreScriptUtils$DecayGeoLinear double decayGeoExp(String, String, String, double, GeoPoint) bound_to org.elasticsearch.script.ScoreScriptUtils$DecayGeoExp double decayGeoGauss(String, String, String, double, GeoPoint) bound_to org.elasticsearch.script.ScoreScriptUtils$DecayGeoGauss diff --git a/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/80_script_score.yml b/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/80_script_score.yml index a3135777c95..cf55810058d 100644 --- a/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/80_script_score.yml +++ b/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/80_script_score.yml @@ -72,61 +72,6 @@ setup: - match: { hits.hits.1._id: d2 } - match: { hits.hits.2._id: d1 } ---- -"Random functions": - - do: - indices.create: - index: test - body: - settings: - number_of_shards: 2 - mappings: - properties: - f1: - type: keyword - - do: - index: - index: test - id: 1 - body: {"f1": "v1"} - - do: - index: - index: test - id: 2 - body: {"f1": "v2"} - - do: - index: - index: test - id: 3 - body: {"f1": "v3"} - - - do: - indices.refresh: {} - - - do: - search: - rest_total_hits_as_int: true - index: test - body: - query: - script_score: - query: {match_all: {} } - script: - source: "randomReproducible(Long.toString(doc['_seq_no'].value), 100)" - - match: { hits.total: 3 } - - - do: - search: - rest_total_hits_as_int: true - index: test - body: - query: - script_score: - query: {match_all: {} } - script: - source: "randomNotReproducible()" - - match: { hits.total: 3 } - --- "Decay geo functions": - do: diff --git a/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/85_script_score_random_score.yml b/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/85_script_score_random_score.yml new file mode 100644 index 00000000000..d1f0e6ff125 --- /dev/null +++ b/modules/lang-painless/src/test/resources/rest-api-spec/test/painless/85_script_score_random_score.yml @@ -0,0 +1,146 @@ +# Integration tests for ScriptScoreQuery using Painless + +setup: +- skip: + version: " - 7.09.99" + reason: "random score function of script score was added in 7.1" + +--- +"Random score function with _seq_no field": + - do: + indices.create: + index: test + body: + settings: + number_of_shards: 2 + mappings: + properties: + f1: + type: keyword + + - do: + bulk: + refresh: true + body: + - '{"index": {"_index": "test"}}' + - '{"f1": "v0"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v1"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v2"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v3"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v4"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v5"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v6"}' + + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + script_score: + query: {match_all: {} } + script: + source: "randomScore(100, '_seq_no')" + # stash ids to check for reproducibility of ranking + - set: { hits.hits.0._id: id0 } + - set: { hits.hits.1._id: id1 } + - set: { hits.hits.2._id: id2 } + - set: { hits.hits.3._id: id3 } + - set: { hits.hits.4._id: id4 } + - set: { hits.hits.5._id: id5 } + - set: { hits.hits.6._id: id6 } + + # check that ranking is reproducible + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + script_score: + query: {match_all: {} } + script: + source: "randomScore(100, '_seq_no')" + - match: { hits.hits.0._id: $id0 } + - match: { hits.hits.1._id: $id1 } + - match: { hits.hits.2._id: $id2 } + - match: { hits.hits.3._id: $id3 } + - match: { hits.hits.4._id: $id4 } + - match: { hits.hits.5._id: $id5 } + - match: { hits.hits.6._id: $id6 } + +--- +"Random score function with internal doc Ids": + - do: + indices.create: + index: test + body: + settings: + number_of_shards: 1 + mappings: + properties: + f1: + type: keyword + + - do: + bulk: + refresh: true + body: + - '{"index": {"_index": "test"}}' + - '{"f1": "v0"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v1"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v2"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v3"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v4"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v5"}' + - '{"index": {"_index": "test"}}' + - '{"f1": "v6"}' + + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + script_score: + query: {match_all: {} } + script: + source: "randomScore(100)" + # stash ids to check for reproducibility of ranking + - set: { hits.hits.0._id: id0 } + - set: { hits.hits.1._id: id1 } + - set: { hits.hits.2._id: id2 } + - set: { hits.hits.3._id: id3 } + - set: { hits.hits.4._id: id4 } + - set: { hits.hits.5._id: id5 } + - set: { hits.hits.6._id: id6 } + + # check that ranking is reproducible + - do: + search: + rest_total_hits_as_int: true + index: test + body: + query: + script_score: + query: {match_all: {} } + script: + source: "randomScore(100)" + - match: { hits.hits.0._id: $id0 } + - match: { hits.hits.1._id: $id1 } + - match: { hits.hits.2._id: $id2 } + - match: { hits.hits.3._id: $id3 } + - match: { hits.hits.4._id: $id4 } + - match: { hits.hits.5._id: $id5 } + - match: { hits.hits.6._id: $id6 } diff --git a/server/src/main/java/org/elasticsearch/common/lucene/search/function/ScriptScoreFunction.java b/server/src/main/java/org/elasticsearch/common/lucene/search/function/ScriptScoreFunction.java index 8e51bc5951d..960df44a625 100644 --- a/server/src/main/java/org/elasticsearch/common/lucene/search/function/ScriptScoreFunction.java +++ b/server/src/main/java/org/elasticsearch/common/lucene/search/function/ScriptScoreFunction.java @@ -50,11 +50,24 @@ public class ScriptScoreFunction extends ScoreFunction { private final ScoreScript.LeafFactory script; + private final int shardId; + private final String indexName; + public ScriptScoreFunction(Script sScript, ScoreScript.LeafFactory script) { super(CombineFunction.REPLACE); this.sScript = sScript; this.script = script; + this.indexName = null; + this.shardId = -1; + } + + public ScriptScoreFunction(Script sScript, ScoreScript.LeafFactory script, String indexName, int shardId) { + super(CombineFunction.REPLACE); + this.sScript = sScript; + this.script = script; + this.indexName = indexName; + this.shardId = shardId; } @Override @@ -62,6 +75,8 @@ public class ScriptScoreFunction extends ScoreFunction { final ScoreScript leafScript = script.newInstance(ctx); final CannedScorer scorer = new CannedScorer(); leafScript.setScorer(scorer); + leafScript._setIndexName(indexName); + leafScript._setShard(shardId); return new LeafScoreFunction() { @Override public double score(int docId, float subQueryScore) throws IOException { diff --git a/server/src/main/java/org/elasticsearch/index/query/functionscore/ScriptScoreFunctionBuilder.java b/server/src/main/java/org/elasticsearch/index/query/functionscore/ScriptScoreFunctionBuilder.java index a860bd19d7c..accfd2f6569 100644 --- a/server/src/main/java/org/elasticsearch/index/query/functionscore/ScriptScoreFunctionBuilder.java +++ b/server/src/main/java/org/elasticsearch/index/query/functionscore/ScriptScoreFunctionBuilder.java @@ -94,7 +94,7 @@ public class ScriptScoreFunctionBuilder extends ScoreFunctionBuilder 0.0; + private final int docBase; + private int docId; + private int shardId = -1; + private String indexName = null; + public ScoreScript(Map params, SearchLookup lookup, LeafReaderContext leafContext) { // null check needed b/c of expression engine subclass if (lookup == null) { @@ -69,11 +74,13 @@ public abstract class ScoreScript { assert leafContext == null; this.params = null; this.leafLookup = null; + this.docBase = 0; } else { this.leafLookup = lookup.getLeafSearchLookup(leafContext); params = new HashMap<>(params); params.putAll(leafLookup.asMap()); this.params = new DeprecationMap(params, DEPRECATIONS, "score-script"); + this.docBase = leafContext.docBase; } } @@ -91,6 +98,7 @@ public abstract class ScoreScript { /** Set the current document to run the script on next. */ public void setDocument(int docid) { + this.docId = docid; leafLookup.setDocument(docid); } @@ -104,10 +112,74 @@ public abstract class ScoreScript { }; } + /** + * Accessed as _score in the painless script + * @return the score of the inner query + */ public double get_score() { return scoreSupplier.getAsDouble(); } + + /** + * Starting a name with underscore, so that the user cannot access this function directly through a script + * It is only used within predefined painless functions. + * @return the internal document ID + */ + public int _getDocId() { + return docId; + } + + /** + * Starting a name with underscore, so that the user cannot access this function directly through a script + * It is only used within predefined painless functions. + * @return the internal document ID with the base + */ + public int _getDocBaseId() { + return docBase + docId; + } + + /** + * Starting a name with underscore, so that the user cannot access this function directly through a script + * It is only used within predefined painless functions. + * @return shard id or throws an exception if shard is not set up for this script instance + */ + public int _getShardId() { + if (shardId > -1) { + return shardId; + } else { + throw new IllegalArgumentException("shard id can not be looked up!"); + } + } + + /** + * Starting a name with underscore, so that the user cannot access this function directly through a script + * It is only used within predefined painless functions. + * @return index name or throws an exception if the index name is not set up for this script instance + */ + public String _getIndex() { + if (indexName != null) { + return indexName; + } else { + throw new IllegalArgumentException("index name can not be looked up!"); + } + } + + /** + * Starting a name with underscore, so that the user cannot access this function directly through a script + */ + public void _setShard(int shardId) { + this.shardId = shardId; + } + + /** + * Starting a name with underscore, so that the user cannot access this function directly through a script + */ + public void _setIndexName(String indexName) { + this.indexName = indexName; + } + + /** A factory to construct {@link ScoreScript} instances. */ public interface LeafFactory { diff --git a/server/src/main/java/org/elasticsearch/script/ScoreScriptUtils.java b/server/src/main/java/org/elasticsearch/script/ScoreScriptUtils.java index 273b8fcf855..c7d6e889397 100644 --- a/server/src/main/java/org/elasticsearch/script/ScoreScriptUtils.java +++ b/server/src/main/java/org/elasticsearch/script/ScoreScriptUtils.java @@ -21,22 +21,20 @@ package org.elasticsearch.script; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.StringHelper; -import org.elasticsearch.common.Randomness; +import org.elasticsearch.ExceptionsHelper; import org.elasticsearch.common.geo.GeoDistance; import org.elasticsearch.common.geo.GeoPoint; import org.elasticsearch.common.geo.GeoUtils; import org.elasticsearch.common.time.DateMathParser; import org.elasticsearch.common.unit.DistanceUnit; import org.elasticsearch.common.unit.TimeValue; +import org.elasticsearch.index.fielddata.ScriptDocValues; import org.elasticsearch.index.mapper.DateFieldMapper; import java.time.ZoneId; -import java.util.Random; -/** - * ScoringScriptImpl can be used as {@link ScoreScript} - * to run a previously compiled Painless script. - */ +import static com.carrotsearch.hppc.BitMixer.mix32; + public final class ScoreScriptUtils { /****** STATIC FUNCTIONS that can be used by users for score calculations **/ @@ -53,26 +51,50 @@ public final class ScoreScriptUtils { return Math.pow(value,a) / (Math.pow(k,a) + Math.pow(value,a)); } + // random score based on the documents' values of the given field + public static final class RandomScoreField { + private final ScoreScript scoreScript; + private final ScriptDocValues docValues; + private final int saltedSeed; - // reproducible random - public static double randomReproducible(String seedValue, int seed) { - int hash = StringHelper.murmurhash3_x86_32(new BytesRef(seedValue), seed); - return (hash & 0x00FFFFFF) / (float)(1 << 24); // only use the lower 24 bits to construct a float from 0.0-1.0 - } - // not reproducible random - public static final class RandomNotReproducible { - private final Random rnd; + public RandomScoreField(ScoreScript scoreScript, int seed, String fieldName) { + this.scoreScript = scoreScript; + this.docValues = scoreScript.getDoc().get(fieldName); + int salt = (scoreScript._getIndex().hashCode() << 10) | scoreScript._getShardId(); + this.saltedSeed = mix32(salt ^ seed); - public RandomNotReproducible() { - this.rnd = Randomness.get(); } - public double randomNotReproducible() { - return rnd.nextDouble(); + public double randomScore() { + try { + docValues.setNextDocId(scoreScript._getDocId()); + String seedValue = String.valueOf(docValues.get(0)); + int hash = StringHelper.murmurhash3_x86_32(new BytesRef(seedValue), saltedSeed); + return (hash & 0x00FFFFFF) / (float)(1 << 24); // only use the lower 24 bits to construct a float from 0.0-1.0 + } catch (Exception e) { + throw ExceptionsHelper.convertToElastic(e); + } } } + // random score based on the internal Lucene document Ids + public static final class RandomScoreDoc { + private final ScoreScript scoreScript; + private final int saltedSeed; + + public RandomScoreDoc(ScoreScript scoreScript, int seed) { + this.scoreScript = scoreScript; + int salt = (scoreScript._getIndex().hashCode() << 10) | scoreScript._getShardId(); + this.saltedSeed = mix32(salt ^ seed); + } + + public double randomScore() { + String seedValue = Integer.toString(scoreScript._getDocBaseId()); + int hash = StringHelper.murmurhash3_x86_32(new BytesRef(seedValue), saltedSeed); + return (hash & 0x00FFFFFF) / (float)(1 << 24); // only use the lower 24 bits to construct a float from 0.0-1.0 + } + } // **** Decay functions on geo field public static final class DecayGeoLinear {