Add randomScore function in script_score query (#40186)
To give the script_score query the same features as the function_score query, we need to add a randomScore function. This function produces different random scores on different index shards. It can also produce random scores based on the internal Lucene document IDs.
This commit is contained in:
parent
85848af8cf
commit
24755209b4
|
@ -182,60 +182,44 @@ different from the query's vector, 0 is used for missing dimensions
|
|||
in the calculations of vector functions.
|
||||
|
||||
|
||||
[[random-functions]]
|
||||
===== Random functions
|
||||
There are two predefined ways to produce random values:
|
||||
`randomNotReproducible` and `randomReproducible`.
|
||||
[[random-score-function]]
|
||||
===== Random score function
|
||||
`random_score` function generates scores that are uniformly distributed
|
||||
from 0 up to but not including 1.
|
||||
|
||||
`randomNotReproducible()` uses `java.util.Random` class
|
||||
to generate a random value of the type `long`.
|
||||
The generated values are not reproducible between requests' invocations.
|
||||
`randomScore` function has the following syntax:
|
||||
`randomScore(<seed>, <fieldName>)`.
|
||||
It has a required parameter - `seed` as an integer value,
|
||||
and an optional parameter - `fieldName` as a string value.
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
"script" : {
|
||||
"source" : "randomNotReproducible()"
|
||||
"source" : "randomScore(100, '_seq_no')"
|
||||
}
|
||||
--------------------------------------------------
|
||||
// NOTCONSOLE
|
||||
|
||||
If the `fieldName` parameter is omitted, the internal Lucene
|
||||
document ids will be used as a source of randomness. This is very efficient,
|
||||
but unfortunately not reproducible since documents might be renumbered
|
||||
by merges.
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
"script" : {
|
||||
"source" : "randomScore(100)"
|
||||
}
|
||||
--------------------------------------------------
|
||||
// NOTCONSOLE
|
||||
|
||||
|
||||
`randomReproducible(String seedValue, int seed)` produces
|
||||
reproducible random values of type `long`. This function requires
|
||||
more computational time and memory than the non-reproducible version.
|
||||
|
||||
A good candidate for the `seedValue` is document field values that
|
||||
are unique across documents and already pre-calculated and preloaded
|
||||
in the memory. For example, values of the document's `_seq_no` field
|
||||
are a good candidate, as documents on the same shard have unique values
|
||||
for the `_seq_no` field.
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
"script" : {
|
||||
"source" : "randomReproducible(Long.toString(doc['_seq_no'].value), 100)"
|
||||
}
|
||||
--------------------------------------------------
|
||||
// NOTCONSOLE
|
||||
|
||||
|
||||
A drawback of using `_seq_no` is that generated values change if
|
||||
documents are updated. Another drawback is not absolute uniqueness, as
|
||||
documents from different shards with the same sequence numbers
|
||||
generate the same random values.
|
||||
|
||||
If you need random values to be distinct across different shards,
|
||||
you can use a field with unique values across shards,
|
||||
such as `_id`, but watch out for the memory usage as all
|
||||
these unique values need to be loaded into memory.
|
||||
|
||||
[source,js]
|
||||
--------------------------------------------------
|
||||
"script" : {
|
||||
"source" : "randomReproducible(doc['_id'].value, 100)"
|
||||
}
|
||||
--------------------------------------------------
|
||||
// NOTCONSOLE
|
||||
Note that documents that are within the same shard and have the
|
||||
same value for the field will get the same score, so it is usually desirable
|
||||
to use a field that has unique values for all documents across a shard.
|
||||
A good default choice might be to use the `_seq_no`
|
||||
field, whose only drawback is that scores will change if the document is
|
||||
updated since update operations also update the value of the `_seq_no` field.
|
||||
|
||||
|
||||
[[decay-functions]]
|
||||
|
@ -349,8 +333,8 @@ the following script:
|
|||
|
||||
===== `random_score`
|
||||
|
||||
Use `randomReproducible` and `randomNotReproducible` functions
|
||||
as described in <<random-functions, random functions>>.
|
||||
Use `randomScore` function
|
||||
as described in <<random-score-function, random score function>>.
|
||||
|
||||
|
||||
===== `field_value_factor`
|
||||
|
|
|
@ -19,11 +19,14 @@
|
|||
|
||||
# This file contains a whitelist for functions to be used in Score context
|
||||
|
||||
class org.elasticsearch.script.ScoreScript no_import {
|
||||
}
|
||||
|
||||
static_import {
|
||||
double saturation(double, double) from_class org.elasticsearch.script.ScoreScriptUtils
|
||||
double sigmoid(double, double, double) from_class org.elasticsearch.script.ScoreScriptUtils
|
||||
double randomReproducible(String, int) from_class org.elasticsearch.script.ScoreScriptUtils
|
||||
double randomNotReproducible() bound_to org.elasticsearch.script.ScoreScriptUtils$RandomNotReproducible
|
||||
double randomScore(org.elasticsearch.script.ScoreScript, int, String) bound_to org.elasticsearch.script.ScoreScriptUtils$RandomScoreField
|
||||
double randomScore(org.elasticsearch.script.ScoreScript, int) bound_to org.elasticsearch.script.ScoreScriptUtils$RandomScoreDoc
|
||||
double decayGeoLinear(String, String, String, double, GeoPoint) bound_to org.elasticsearch.script.ScoreScriptUtils$DecayGeoLinear
|
||||
double decayGeoExp(String, String, String, double, GeoPoint) bound_to org.elasticsearch.script.ScoreScriptUtils$DecayGeoExp
|
||||
double decayGeoGauss(String, String, String, double, GeoPoint) bound_to org.elasticsearch.script.ScoreScriptUtils$DecayGeoGauss
|
||||
|
|
|
@ -72,61 +72,6 @@ setup:
|
|||
- match: { hits.hits.1._id: d2 }
|
||||
- match: { hits.hits.2._id: d1 }
|
||||
|
||||
---
|
||||
"Random functions":
|
||||
- do:
|
||||
indices.create:
|
||||
index: test
|
||||
body:
|
||||
settings:
|
||||
number_of_shards: 2
|
||||
mappings:
|
||||
properties:
|
||||
f1:
|
||||
type: keyword
|
||||
- do:
|
||||
index:
|
||||
index: test
|
||||
id: 1
|
||||
body: {"f1": "v1"}
|
||||
- do:
|
||||
index:
|
||||
index: test
|
||||
id: 2
|
||||
body: {"f1": "v2"}
|
||||
- do:
|
||||
index:
|
||||
index: test
|
||||
id: 3
|
||||
body: {"f1": "v3"}
|
||||
|
||||
- do:
|
||||
indices.refresh: {}
|
||||
|
||||
- do:
|
||||
search:
|
||||
rest_total_hits_as_int: true
|
||||
index: test
|
||||
body:
|
||||
query:
|
||||
script_score:
|
||||
query: {match_all: {} }
|
||||
script:
|
||||
source: "randomReproducible(Long.toString(doc['_seq_no'].value), 100)"
|
||||
- match: { hits.total: 3 }
|
||||
|
||||
- do:
|
||||
search:
|
||||
rest_total_hits_as_int: true
|
||||
index: test
|
||||
body:
|
||||
query:
|
||||
script_score:
|
||||
query: {match_all: {} }
|
||||
script:
|
||||
source: "randomNotReproducible()"
|
||||
- match: { hits.total: 3 }
|
||||
|
||||
---
|
||||
"Decay geo functions":
|
||||
- do:
|
||||
|
|
|
@ -0,0 +1,146 @@
|
|||
# Integration tests for ScriptScoreQuery using Painless
|
||||
|
||||
setup:
|
||||
- skip:
|
||||
version: " - 7.0.99"
|
||||
reason: "random score function of script score was added in 7.1"
|
||||
|
||||
---
|
||||
"Random score function with _seq_no field":
|
||||
- do:
|
||||
indices.create:
|
||||
index: test
|
||||
body:
|
||||
settings:
|
||||
number_of_shards: 2
|
||||
mappings:
|
||||
properties:
|
||||
f1:
|
||||
type: keyword
|
||||
|
||||
- do:
|
||||
bulk:
|
||||
refresh: true
|
||||
body:
|
||||
- '{"index": {"_index": "test"}}'
|
||||
- '{"f1": "v0"}'
|
||||
- '{"index": {"_index": "test"}}'
|
||||
- '{"f1": "v1"}'
|
||||
- '{"index": {"_index": "test"}}'
|
||||
- '{"f1": "v2"}'
|
||||
- '{"index": {"_index": "test"}}'
|
||||
- '{"f1": "v3"}'
|
||||
- '{"index": {"_index": "test"}}'
|
||||
- '{"f1": "v4"}'
|
||||
- '{"index": {"_index": "test"}}'
|
||||
- '{"f1": "v5"}'
|
||||
- '{"index": {"_index": "test"}}'
|
||||
- '{"f1": "v6"}'
|
||||
|
||||
- do:
|
||||
search:
|
||||
rest_total_hits_as_int: true
|
||||
index: test
|
||||
body:
|
||||
query:
|
||||
script_score:
|
||||
query: {match_all: {} }
|
||||
script:
|
||||
source: "randomScore(100, '_seq_no')"
|
||||
# stash ids to check for reproducibility of ranking
|
||||
- set: { hits.hits.0._id: id0 }
|
||||
- set: { hits.hits.1._id: id1 }
|
||||
- set: { hits.hits.2._id: id2 }
|
||||
- set: { hits.hits.3._id: id3 }
|
||||
- set: { hits.hits.4._id: id4 }
|
||||
- set: { hits.hits.5._id: id5 }
|
||||
- set: { hits.hits.6._id: id6 }
|
||||
|
||||
# check that ranking is reproducible
|
||||
- do:
|
||||
search:
|
||||
rest_total_hits_as_int: true
|
||||
index: test
|
||||
body:
|
||||
query:
|
||||
script_score:
|
||||
query: {match_all: {} }
|
||||
script:
|
||||
source: "randomScore(100, '_seq_no')"
|
||||
- match: { hits.hits.0._id: $id0 }
|
||||
- match: { hits.hits.1._id: $id1 }
|
||||
- match: { hits.hits.2._id: $id2 }
|
||||
- match: { hits.hits.3._id: $id3 }
|
||||
- match: { hits.hits.4._id: $id4 }
|
||||
- match: { hits.hits.5._id: $id5 }
|
||||
- match: { hits.hits.6._id: $id6 }
|
||||
|
||||
---
|
||||
"Random score function with internal doc Ids":
|
||||
- do:
|
||||
indices.create:
|
||||
index: test
|
||||
body:
|
||||
settings:
|
||||
number_of_shards: 1
|
||||
mappings:
|
||||
properties:
|
||||
f1:
|
||||
type: keyword
|
||||
|
||||
- do:
|
||||
bulk:
|
||||
refresh: true
|
||||
body:
|
||||
- '{"index": {"_index": "test"}}'
|
||||
- '{"f1": "v0"}'
|
||||
- '{"index": {"_index": "test"}}'
|
||||
- '{"f1": "v1"}'
|
||||
- '{"index": {"_index": "test"}}'
|
||||
- '{"f1": "v2"}'
|
||||
- '{"index": {"_index": "test"}}'
|
||||
- '{"f1": "v3"}'
|
||||
- '{"index": {"_index": "test"}}'
|
||||
- '{"f1": "v4"}'
|
||||
- '{"index": {"_index": "test"}}'
|
||||
- '{"f1": "v5"}'
|
||||
- '{"index": {"_index": "test"}}'
|
||||
- '{"f1": "v6"}'
|
||||
|
||||
- do:
|
||||
search:
|
||||
rest_total_hits_as_int: true
|
||||
index: test
|
||||
body:
|
||||
query:
|
||||
script_score:
|
||||
query: {match_all: {} }
|
||||
script:
|
||||
source: "randomScore(100)"
|
||||
# stash ids to check for reproducibility of ranking
|
||||
- set: { hits.hits.0._id: id0 }
|
||||
- set: { hits.hits.1._id: id1 }
|
||||
- set: { hits.hits.2._id: id2 }
|
||||
- set: { hits.hits.3._id: id3 }
|
||||
- set: { hits.hits.4._id: id4 }
|
||||
- set: { hits.hits.5._id: id5 }
|
||||
- set: { hits.hits.6._id: id6 }
|
||||
|
||||
# check that ranking is reproducible
|
||||
- do:
|
||||
search:
|
||||
rest_total_hits_as_int: true
|
||||
index: test
|
||||
body:
|
||||
query:
|
||||
script_score:
|
||||
query: {match_all: {} }
|
||||
script:
|
||||
source: "randomScore(100)"
|
||||
- match: { hits.hits.0._id: $id0 }
|
||||
- match: { hits.hits.1._id: $id1 }
|
||||
- match: { hits.hits.2._id: $id2 }
|
||||
- match: { hits.hits.3._id: $id3 }
|
||||
- match: { hits.hits.4._id: $id4 }
|
||||
- match: { hits.hits.5._id: $id5 }
|
||||
- match: { hits.hits.6._id: $id6 }
|
|
@ -50,11 +50,24 @@ public class ScriptScoreFunction extends ScoreFunction {
|
|||
|
||||
private final ScoreScript.LeafFactory script;
|
||||
|
||||
private final int shardId;
|
||||
private final String indexName;
|
||||
|
||||
|
||||
public ScriptScoreFunction(Script sScript, ScoreScript.LeafFactory script) {
|
||||
super(CombineFunction.REPLACE);
|
||||
this.sScript = sScript;
|
||||
this.script = script;
|
||||
this.indexName = null;
|
||||
this.shardId = -1;
|
||||
}
|
||||
|
||||
public ScriptScoreFunction(Script sScript, ScoreScript.LeafFactory script, String indexName, int shardId) {
|
||||
super(CombineFunction.REPLACE);
|
||||
this.sScript = sScript;
|
||||
this.script = script;
|
||||
this.indexName = indexName;
|
||||
this.shardId = shardId;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -62,6 +75,8 @@ public class ScriptScoreFunction extends ScoreFunction {
|
|||
final ScoreScript leafScript = script.newInstance(ctx);
|
||||
final CannedScorer scorer = new CannedScorer();
|
||||
leafScript.setScorer(scorer);
|
||||
leafScript._setIndexName(indexName);
|
||||
leafScript._setShard(shardId);
|
||||
return new LeafScoreFunction() {
|
||||
@Override
|
||||
public double score(int docId, float subQueryScore) throws IOException {
|
||||
|
|
|
@ -94,7 +94,7 @@ public class ScriptScoreFunctionBuilder extends ScoreFunctionBuilder<ScriptScore
|
|||
try {
|
||||
ScoreScript.Factory factory = context.getScriptService().compile(script, ScoreScript.CONTEXT);
|
||||
ScoreScript.LeafFactory searchScript = factory.newFactory(script.getParams(), context.lookup());
|
||||
return new ScriptScoreFunction(script, searchScript);
|
||||
return new ScriptScoreFunction(script, searchScript, context.index().getName(), context.getShardId());
|
||||
} catch (Exception e) {
|
||||
throw new QueryShardException(context, "script_score: the script could not be loaded", e);
|
||||
}
|
||||
|
|
|
@ -62,6 +62,11 @@ public abstract class ScoreScript {
|
|||
|
||||
private DoubleSupplier scoreSupplier = () -> 0.0;
|
||||
|
||||
private final int docBase;
|
||||
private int docId;
|
||||
private int shardId = -1;
|
||||
private String indexName = null;
|
||||
|
||||
public ScoreScript(Map<String, Object> params, SearchLookup lookup, LeafReaderContext leafContext) {
|
||||
// null check needed b/c of expression engine subclass
|
||||
if (lookup == null) {
|
||||
|
@ -69,11 +74,13 @@ public abstract class ScoreScript {
|
|||
assert leafContext == null;
|
||||
this.params = null;
|
||||
this.leafLookup = null;
|
||||
this.docBase = 0;
|
||||
} else {
|
||||
this.leafLookup = lookup.getLeafSearchLookup(leafContext);
|
||||
params = new HashMap<>(params);
|
||||
params.putAll(leafLookup.asMap());
|
||||
this.params = new DeprecationMap(params, DEPRECATIONS, "score-script");
|
||||
this.docBase = leafContext.docBase;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -91,6 +98,7 @@ public abstract class ScoreScript {
|
|||
|
||||
/** Set the current document to run the script on next. */
|
||||
public void setDocument(int docid) {
|
||||
this.docId = docid;
|
||||
leafLookup.setDocument(docid);
|
||||
}
|
||||
|
||||
|
@ -104,10 +112,74 @@ public abstract class ScoreScript {
|
|||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Accessed as _score in the painless script
|
||||
* @return the score of the inner query
|
||||
*/
|
||||
public double get_score() {
|
||||
return scoreSupplier.getAsDouble();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Starting a name with underscore, so that the user cannot access this function directly through a script
|
||||
* It is only used within predefined painless functions.
|
||||
* @return the internal document ID
|
||||
*/
|
||||
public int _getDocId() {
|
||||
return docId;
|
||||
}
|
||||
|
||||
/**
|
||||
* Starting a name with underscore, so that the user cannot access this function directly through a script
|
||||
* It is only used within predefined painless functions.
|
||||
* @return the internal document ID with the base
|
||||
*/
|
||||
public int _getDocBaseId() {
|
||||
return docBase + docId;
|
||||
}
|
||||
|
||||
/**
|
||||
* Starting a name with underscore, so that the user cannot access this function directly through a script
|
||||
* It is only used within predefined painless functions.
|
||||
* @return shard id or throws an exception if shard is not set up for this script instance
|
||||
*/
|
||||
public int _getShardId() {
|
||||
if (shardId > -1) {
|
||||
return shardId;
|
||||
} else {
|
||||
throw new IllegalArgumentException("shard id can not be looked up!");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Starting a name with underscore, so that the user cannot access this function directly through a script
|
||||
* It is only used within predefined painless functions.
|
||||
* @return index name or throws an exception if the index name is not set up for this script instance
|
||||
*/
|
||||
public String _getIndex() {
|
||||
if (indexName != null) {
|
||||
return indexName;
|
||||
} else {
|
||||
throw new IllegalArgumentException("index name can not be looked up!");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Starting a name with underscore, so that the user cannot access this function directly through a script
|
||||
*/
|
||||
public void _setShard(int shardId) {
|
||||
this.shardId = shardId;
|
||||
}
|
||||
|
||||
/**
|
||||
* Starting a name with underscore, so that the user cannot access this function directly through a script
|
||||
*/
|
||||
public void _setIndexName(String indexName) {
|
||||
this.indexName = indexName;
|
||||
}
|
||||
|
||||
|
||||
/** A factory to construct {@link ScoreScript} instances. */
|
||||
public interface LeafFactory {
|
||||
|
||||
|
|
|
@ -21,22 +21,20 @@ package org.elasticsearch.script;
|
|||
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.StringHelper;
|
||||
import org.elasticsearch.common.Randomness;
|
||||
import org.elasticsearch.ExceptionsHelper;
|
||||
import org.elasticsearch.common.geo.GeoDistance;
|
||||
import org.elasticsearch.common.geo.GeoPoint;
|
||||
import org.elasticsearch.common.geo.GeoUtils;
|
||||
import org.elasticsearch.common.time.DateMathParser;
|
||||
import org.elasticsearch.common.unit.DistanceUnit;
|
||||
import org.elasticsearch.common.unit.TimeValue;
|
||||
import org.elasticsearch.index.fielddata.ScriptDocValues;
|
||||
import org.elasticsearch.index.mapper.DateFieldMapper;
|
||||
|
||||
import java.time.ZoneId;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
* ScoringScriptImpl can be used as {@link ScoreScript}
|
||||
* to run a previously compiled Painless script.
|
||||
*/
|
||||
import static com.carrotsearch.hppc.BitMixer.mix32;
|
||||
|
||||
public final class ScoreScriptUtils {
|
||||
|
||||
/****** STATIC FUNCTIONS that can be used by users for score calculations **/
|
||||
|
@ -53,26 +51,50 @@ public final class ScoreScriptUtils {
|
|||
return Math.pow(value,a) / (Math.pow(k,a) + Math.pow(value,a));
|
||||
}
|
||||
|
||||
// random score based on the documents' values of the given field
|
||||
public static final class RandomScoreField {
|
||||
private final ScoreScript scoreScript;
|
||||
private final ScriptDocValues docValues;
|
||||
private final int saltedSeed;
|
||||
|
||||
// reproducible random
|
||||
public static double randomReproducible(String seedValue, int seed) {
|
||||
int hash = StringHelper.murmurhash3_x86_32(new BytesRef(seedValue), seed);
|
||||
return (hash & 0x00FFFFFF) / (float)(1 << 24); // only use the lower 24 bits to construct a float from 0.0-1.0
|
||||
}
|
||||
|
||||
// not reproducible random
|
||||
public static final class RandomNotReproducible {
|
||||
private final Random rnd;
|
||||
public RandomScoreField(ScoreScript scoreScript, int seed, String fieldName) {
|
||||
this.scoreScript = scoreScript;
|
||||
this.docValues = scoreScript.getDoc().get(fieldName);
|
||||
int salt = (scoreScript._getIndex().hashCode() << 10) | scoreScript._getShardId();
|
||||
this.saltedSeed = mix32(salt ^ seed);
|
||||
|
||||
public RandomNotReproducible() {
|
||||
this.rnd = Randomness.get();
|
||||
}
|
||||
|
||||
public double randomNotReproducible() {
|
||||
return rnd.nextDouble();
|
||||
public double randomScore() {
|
||||
try {
|
||||
docValues.setNextDocId(scoreScript._getDocId());
|
||||
String seedValue = String.valueOf(docValues.get(0));
|
||||
int hash = StringHelper.murmurhash3_x86_32(new BytesRef(seedValue), saltedSeed);
|
||||
return (hash & 0x00FFFFFF) / (float)(1 << 24); // only use the lower 24 bits to construct a float from 0.0-1.0
|
||||
} catch (Exception e) {
|
||||
throw ExceptionsHelper.convertToElastic(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// random score based on the internal Lucene document Ids
|
||||
public static final class RandomScoreDoc {
|
||||
private final ScoreScript scoreScript;
|
||||
private final int saltedSeed;
|
||||
|
||||
public RandomScoreDoc(ScoreScript scoreScript, int seed) {
|
||||
this.scoreScript = scoreScript;
|
||||
int salt = (scoreScript._getIndex().hashCode() << 10) | scoreScript._getShardId();
|
||||
this.saltedSeed = mix32(salt ^ seed);
|
||||
}
|
||||
|
||||
public double randomScore() {
|
||||
String seedValue = Integer.toString(scoreScript._getDocBaseId());
|
||||
int hash = StringHelper.murmurhash3_x86_32(new BytesRef(seedValue), saltedSeed);
|
||||
return (hash & 0x00FFFFFF) / (float)(1 << 24); // only use the lower 24 bits to construct a float from 0.0-1.0
|
||||
}
|
||||
}
|
||||
|
||||
// **** Decay functions on geo field
|
||||
public static final class DecayGeoLinear {
|
||||
|
|
Loading…
Reference in New Issue