Move the `murmur3` field to a plugin and fix defaults.

This moves the `murmur3` field to the `mapper-murmur3` plugin and fixes its
defaults so that values will not be indexed by default, as the only purpose
of this field is to speed up `cardinality` aggregations on high-cardinality
string fields, which only requires doc values.

I also removed the `rehash` option from the `cardinality` aggregation as it
doesn't bring much value (rehashing is cheap), and dropping it eliminates the
coupling between the `cardinality` aggregation and the `murmur3` field.

Close #12874
This commit is contained in:
Adrien Grand 2015-08-17 12:47:14 +02:00
parent 7765b0497d
commit a91b3fcbb9
26 changed files with 477 additions and 200 deletions

View File

@ -101,8 +101,7 @@ public class DocumentMapperParser {
.put(ObjectMapper.NESTED_CONTENT_TYPE, new ObjectMapper.TypeParser())
.put(TypeParsers.MULTI_FIELD_CONTENT_TYPE, TypeParsers.multiFieldConverterTypeParser)
.put(CompletionFieldMapper.CONTENT_TYPE, new CompletionFieldMapper.TypeParser())
.put(GeoPointFieldMapper.CONTENT_TYPE, new GeoPointFieldMapper.TypeParser())
.put(Murmur3FieldMapper.CONTENT_TYPE, new Murmur3FieldMapper.TypeParser());
.put(GeoPointFieldMapper.CONTENT_TYPE, new GeoPointFieldMapper.TypeParser());
if (ShapesAvailability.JTS_AVAILABLE) {
typeParsersBuilder.put(GeoShapeFieldMapper.CONTENT_TYPE, new GeoShapeFieldMapper.TypeParser());

View File

@ -84,10 +84,6 @@ public final class MapperBuilders {
return new LongFieldMapper.Builder(name);
}
public static Murmur3FieldMapper.Builder murmur3Field(String name) {
return new Murmur3FieldMapper.Builder(name);
}
public static FloatFieldMapper.Builder floatField(String name) {
return new FloatFieldMapper.Builder(name);
}

View File

@ -86,6 +86,7 @@ public class PluginManager {
"elasticsearch-delete-by-query",
"elasticsearch-lang-javascript",
"elasticsearch-lang-python",
"elasticsearch-mapper-murmur3",
"elasticsearch-mapper-size"
).build();

View File

@ -56,7 +56,6 @@ import java.util.Map;
public class CardinalityAggregator extends NumericMetricsAggregator.SingleValue {
private final int precision;
private final boolean rehash;
private final ValuesSource valuesSource;
// Expensive to initialize, so we only initialize it when we have an actual value source
@ -66,11 +65,10 @@ public class CardinalityAggregator extends NumericMetricsAggregator.SingleValue
private Collector collector;
private ValueFormatter formatter;
public CardinalityAggregator(String name, ValuesSource valuesSource, boolean rehash, int precision, ValueFormatter formatter,
public CardinalityAggregator(String name, ValuesSource valuesSource, int precision, ValueFormatter formatter,
AggregationContext context, Aggregator parent, List<PipelineAggregator> pipelineAggregators, Map<String, Object> metaData) throws IOException {
super(name, context, parent, pipelineAggregators, metaData);
this.valuesSource = valuesSource;
this.rehash = rehash;
this.precision = precision;
this.counts = valuesSource == null ? null : new HyperLogLogPlusPlus(precision, context.bigArrays(), 1);
this.formatter = formatter;
@ -85,13 +83,6 @@ public class CardinalityAggregator extends NumericMetricsAggregator.SingleValue
if (valuesSource == null) {
return new EmptyCollector();
}
// if rehash is false then the value source is either already hashed, or the user explicitly
// requested not to hash the values (perhaps they already hashed the values themselves before indexing the doc)
// so we can just work with the original value source as is
if (!rehash) {
MurmurHash3Values hashValues = MurmurHash3Values.cast(((ValuesSource.Numeric) valuesSource).longValues(ctx));
return new DirectCollector(counts, hashValues);
}
if (valuesSource instanceof ValuesSource.Numeric) {
ValuesSource.Numeric source = (ValuesSource.Numeric) valuesSource;

View File

@ -19,7 +19,6 @@
package org.elasticsearch.search.aggregations.metrics.cardinality;
import org.elasticsearch.search.aggregations.AggregationExecutionException;
import org.elasticsearch.search.aggregations.Aggregator;
import org.elasticsearch.search.aggregations.bucket.SingleBucketAggregator;
import org.elasticsearch.search.aggregations.pipeline.PipelineAggregator;
@ -35,12 +34,10 @@ import java.util.Map;
final class CardinalityAggregatorFactory extends ValuesSourceAggregatorFactory<ValuesSource> {
private final long precisionThreshold;
private final boolean rehash;
CardinalityAggregatorFactory(String name, ValuesSourceConfig config, long precisionThreshold, boolean rehash) {
CardinalityAggregatorFactory(String name, ValuesSourceConfig config, long precisionThreshold) {
super(name, InternalCardinality.TYPE.name(), config);
this.precisionThreshold = precisionThreshold;
this.rehash = rehash;
}
private int precision(Aggregator parent) {
@ -50,16 +47,13 @@ final class CardinalityAggregatorFactory extends ValuesSourceAggregatorFactory<V
@Override
protected Aggregator createUnmapped(AggregationContext context, Aggregator parent, List<PipelineAggregator> pipelineAggregators, Map<String, Object> metaData)
throws IOException {
return new CardinalityAggregator(name, null, true, precision(parent), config.formatter(), context, parent, pipelineAggregators, metaData);
return new CardinalityAggregator(name, null, precision(parent), config.formatter(), context, parent, pipelineAggregators, metaData);
}
@Override
protected Aggregator doCreateInternal(ValuesSource valuesSource, AggregationContext context, Aggregator parent,
boolean collectsFromSingleBucket, List<PipelineAggregator> pipelineAggregators, Map<String, Object> metaData) throws IOException {
if (!(valuesSource instanceof ValuesSource.Numeric) && !rehash) {
throw new AggregationExecutionException("Turning off rehashing for cardinality aggregation [" + name + "] on non-numeric values in not allowed");
}
return new CardinalityAggregator(name, valuesSource, rehash, precision(parent), config.formatter(), context, parent, pipelineAggregators,
return new CardinalityAggregator(name, valuesSource, precision(parent), config.formatter(), context, parent, pipelineAggregators,
metaData);
}

View File

@ -21,11 +21,9 @@ package org.elasticsearch.search.aggregations.metrics.cardinality;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.mapper.core.Murmur3FieldMapper;
import org.elasticsearch.search.SearchParseException;
import org.elasticsearch.search.aggregations.Aggregator;
import org.elasticsearch.search.aggregations.AggregatorFactory;
import org.elasticsearch.search.aggregations.support.ValuesSourceConfig;
import org.elasticsearch.search.aggregations.support.ValuesSourceParser;
import org.elasticsearch.search.internal.SearchContext;
@ -35,6 +33,7 @@ import java.io.IOException;
public class CardinalityParser implements Aggregator.Parser {
private static final ParseField PRECISION_THRESHOLD = new ParseField("precision_threshold");
private static final ParseField REHASH = new ParseField("rehash").withAllDeprecated("no replacement - values will always be rehashed");
@Override
public String type() {
@ -44,10 +43,9 @@ public class CardinalityParser implements Aggregator.Parser {
@Override
public AggregatorFactory parse(String name, XContentParser parser, SearchContext context) throws IOException {
ValuesSourceParser vsParser = ValuesSourceParser.any(name, InternalCardinality.TYPE, context).formattable(false).build();
ValuesSourceParser<?> vsParser = ValuesSourceParser.any(name, InternalCardinality.TYPE, context).formattable(false).build();
long precisionThreshold = -1;
Boolean rehash = null;
XContentParser.Token token;
String currentFieldName = null;
@ -57,8 +55,8 @@ public class CardinalityParser implements Aggregator.Parser {
} else if (vsParser.token(currentFieldName, token, parser)) {
continue;
} else if (token.isValue()) {
if ("rehash".equals(currentFieldName)) {
rehash = parser.booleanValue();
if (context.parseFieldMatcher().match(currentFieldName, REHASH)) {
// ignore
} else if (context.parseFieldMatcher().match(currentFieldName, PRECISION_THRESHOLD)) {
precisionThreshold = parser.longValue();
} else {
@ -70,15 +68,7 @@ public class CardinalityParser implements Aggregator.Parser {
}
}
ValuesSourceConfig<?> config = vsParser.config();
if (rehash == null && config.fieldContext() != null && config.fieldContext().fieldType() instanceof Murmur3FieldMapper.Murmur3FieldType) {
rehash = false;
} else if (rehash == null) {
rehash = true;
}
return new CardinalityAggregatorFactory(name, config, precisionThreshold, rehash);
return new CardinalityAggregatorFactory(name, vsParser.config(), precisionThreshold);
}

View File

@ -43,6 +43,7 @@ OFFICIAL PLUGINS
- elasticsearch-delete-by-query
- elasticsearch-lang-javascript
- elasticsearch-lang-python
- elasticsearch-mapper-murmur3
- elasticsearch-mapper-size

View File

@ -1116,7 +1116,7 @@ public class GetActionIT extends ESIntegTestCase {
@Test
public void testGeneratedNumberFieldsUnstored() throws IOException {
indexSingleDocumentWithNumericFieldsGeneratedFromText(false, randomBoolean());
String[] fieldsList = {"token_count", "text.token_count", "murmur", "text.murmur"};
String[] fieldsList = {"token_count", "text.token_count"};
// before refresh - document is only in translog
assertGetFieldsAlwaysNull(indexOrAlias(), "doc", "1", fieldsList);
refresh();
@ -1130,7 +1130,7 @@ public class GetActionIT extends ESIntegTestCase {
@Test
public void testGeneratedNumberFieldsStored() throws IOException {
indexSingleDocumentWithNumericFieldsGeneratedFromText(true, randomBoolean());
String[] fieldsList = {"token_count", "text.token_count", "murmur", "text.murmur"};
String[] fieldsList = {"token_count", "text.token_count"};
// before refresh - document is only in translog
assertGetFieldsNull(indexOrAlias(), "doc", "1", fieldsList);
assertGetFieldsException(indexOrAlias(), "doc", "1", fieldsList);
@ -1159,10 +1159,6 @@ public class GetActionIT extends ESIntegTestCase {
" \"analyzer\": \"standard\",\n" +
" \"store\": \"" + storedString + "\"" +
" },\n" +
" \"murmur\": {\n" +
" \"type\": \"murmur3\",\n" +
" \"store\": \"" + storedString + "\"" +
" },\n" +
" \"text\": {\n" +
" \"type\": \"string\",\n" +
" \"fields\": {\n" +
@ -1170,10 +1166,6 @@ public class GetActionIT extends ESIntegTestCase {
" \"type\": \"token_count\",\n" +
" \"analyzer\": \"standard\",\n" +
" \"store\": \"" + storedString + "\"" +
" },\n" +
" \"murmur\": {\n" +
" \"type\": \"murmur3\",\n" +
" \"store\": \"" + storedString + "\"" +
" }\n" +
" }\n" +
" }" +
@ -1185,7 +1177,6 @@ public class GetActionIT extends ESIntegTestCase {
assertAcked(prepareCreate("test").addAlias(new Alias("alias")).setSource(createIndexSource));
ensureGreen();
String doc = "{\n" +
" \"murmur\": \"Some value that can be hashed\",\n" +
" \"token_count\": \"A text with five words.\",\n" +
" \"text\": \"A text with five words.\"\n" +
"}\n";

View File

@ -550,6 +550,7 @@ public class PluginManagerIT extends ESIntegTestCase {
PluginManager.checkForOfficialPlugins("elasticsearch-delete-by-query");
PluginManager.checkForOfficialPlugins("elasticsearch-lang-javascript");
PluginManager.checkForOfficialPlugins("elasticsearch-lang-python");
PluginManager.checkForOfficialPlugins("elasticsearch-mapper-murmur3");
try {
PluginManager.checkForOfficialPlugins("elasticsearch-mapper-attachment");

View File

@ -61,54 +61,23 @@ public class CardinalityIT extends ESIntegTestCase {
jsonBuilder().startObject().startObject("type").startObject("properties")
.startObject("str_value")
.field("type", "string")
.startObject("fields")
.startObject("hash")
.field("type", "murmur3")
.endObject()
.endObject()
.endObject()
.startObject("str_values")
.field("type", "string")
.startObject("fields")
.startObject("hash")
.field("type", "murmur3")
.endObject()
.endObject()
.endObject()
.startObject("l_value")
.field("type", "long")
.startObject("fields")
.startObject("hash")
.field("type", "murmur3")
.endObject()
.endObject()
.endObject()
.startObject("l_values")
.field("type", "long")
.startObject("fields")
.startObject("hash")
.field("type", "murmur3")
.endObject()
.endObject()
.endObject()
.startObject("d_value")
.field("type", "double")
.startObject("fields")
.startObject("hash")
.field("type", "murmur3")
.endObject()
.endObject()
.endObject()
.startObject("d_values")
.field("type", "double")
.startObject("fields")
.startObject("hash")
.field("type", "murmur3")
.endObject()
.endObject()
.endObject()
.endObject()
.endObject().endObject()).execute().actionGet();
.endObject().endObject().endObject()).execute().actionGet();
numDocs = randomIntBetween(2, 100);
precisionThreshold = randomIntBetween(0, 1 << randomInt(20));
@ -145,12 +114,12 @@ public class CardinalityIT extends ESIntegTestCase {
assertThat(count.getValue(), greaterThan(0L));
}
}
private String singleNumericField(boolean hash) {
return (randomBoolean() ? "l_value" : "d_value") + (hash ? ".hash" : "");
private String singleNumericField() {
return randomBoolean() ? "l_value" : "d_value";
}
private String multiNumericField(boolean hash) {
return (randomBoolean() ? "l_values" : "d_values") + (hash ? ".hash" : "");
return randomBoolean() ? "l_values" : "d_values";
}
@Test
@ -195,24 +164,10 @@ public class CardinalityIT extends ESIntegTestCase {
assertCount(count, numDocs);
}
@Test
public void singleValuedStringHashed() throws Exception {
SearchResponse response = client().prepareSearch("idx").setTypes("type")
.addAggregation(cardinality("cardinality").precisionThreshold(precisionThreshold).field("str_value.hash"))
.execute().actionGet();
assertSearchResponse(response);
Cardinality count = response.getAggregations().get("cardinality");
assertThat(count, notNullValue());
assertThat(count.getName(), equalTo("cardinality"));
assertCount(count, numDocs);
}
@Test
public void singleValuedNumeric() throws Exception {
SearchResponse response = client().prepareSearch("idx").setTypes("type")
.addAggregation(cardinality("cardinality").precisionThreshold(precisionThreshold).field(singleNumericField(false)))
.addAggregation(cardinality("cardinality").precisionThreshold(precisionThreshold).field(singleNumericField()))
.execute().actionGet();
assertSearchResponse(response);
@ -229,7 +184,7 @@ public class CardinalityIT extends ESIntegTestCase {
SearchResponse searchResponse = client().prepareSearch("idx").setQuery(matchAllQuery())
.addAggregation(
global("global").subAggregation(
cardinality("cardinality").precisionThreshold(precisionThreshold).field(singleNumericField(false))))
cardinality("cardinality").precisionThreshold(precisionThreshold).field(singleNumericField())))
.execute().actionGet();
assertSearchResponse(searchResponse);
@ -254,7 +209,7 @@ public class CardinalityIT extends ESIntegTestCase {
@Test
public void singleValuedNumericHashed() throws Exception {
SearchResponse response = client().prepareSearch("idx").setTypes("type")
.addAggregation(cardinality("cardinality").precisionThreshold(precisionThreshold).field(singleNumericField(true)))
.addAggregation(cardinality("cardinality").precisionThreshold(precisionThreshold).field(singleNumericField()))
.execute().actionGet();
assertSearchResponse(response);
@ -279,20 +234,6 @@ public class CardinalityIT extends ESIntegTestCase {
assertCount(count, numDocs * 2);
}
@Test
public void multiValuedStringHashed() throws Exception {
SearchResponse response = client().prepareSearch("idx").setTypes("type")
.addAggregation(cardinality("cardinality").precisionThreshold(precisionThreshold).field("str_values.hash"))
.execute().actionGet();
assertSearchResponse(response);
Cardinality count = response.getAggregations().get("cardinality");
assertThat(count, notNullValue());
assertThat(count.getName(), equalTo("cardinality"));
assertCount(count, numDocs * 2);
}
@Test
public void multiValuedNumeric() throws Exception {
SearchResponse response = client().prepareSearch("idx").setTypes("type")
@ -356,7 +297,7 @@ public class CardinalityIT extends ESIntegTestCase {
SearchResponse response = client().prepareSearch("idx").setTypes("type")
.addAggregation(
cardinality("cardinality").precisionThreshold(precisionThreshold).script(
new Script("doc['" + singleNumericField(false) + "'].value")))
new Script("doc['" + singleNumericField() + "'].value")))
.execute().actionGet();
assertSearchResponse(response);
@ -417,7 +358,7 @@ public class CardinalityIT extends ESIntegTestCase {
public void singleValuedNumericValueScript() throws Exception {
SearchResponse response = client().prepareSearch("idx").setTypes("type")
.addAggregation(
cardinality("cardinality").precisionThreshold(precisionThreshold).field(singleNumericField(false))
cardinality("cardinality").precisionThreshold(precisionThreshold).field(singleNumericField())
.script(new Script("_value")))
.execute().actionGet();
@ -464,23 +405,4 @@ public class CardinalityIT extends ESIntegTestCase {
}
}
@Test
public void asSubAggHashed() throws Exception {
SearchResponse response = client().prepareSearch("idx").setTypes("type")
.addAggregation(terms("terms").field("str_value")
.collectMode(randomFrom(SubAggCollectionMode.values()))
.subAggregation(cardinality("cardinality").precisionThreshold(precisionThreshold).field("str_values.hash")))
.execute().actionGet();
assertSearchResponse(response);
Terms terms = response.getAggregations().get("terms");
for (Terms.Bucket bucket : terms.getBuckets()) {
Cardinality count = bucket.getAggregations().get("cardinality");
assertThat(count, notNullValue());
assertThat(count.getName(), equalTo("cardinality"));
assertCount(count, 2);
}
}
}

View File

@ -0,0 +1,101 @@
[[mapper-murmur3]]
=== Mapper Murmur3 Plugin
The mapper-murmur3 plugin provides the ability to compute hashes of field values
at index-time and store them in the index. This can sometimes be helpful when
running cardinality aggregations on high-cardinality and large string fields.
[[mapper-murmur3-install]]
[float]
==== Installation
This plugin can be installed using the plugin manager:
[source,sh]
----------------------------------------------------------------
sudo bin/plugin install mapper-murmur3
----------------------------------------------------------------
The plugin must be installed on every node in the cluster, and each node must
be restarted after installation.
[[mapper-murmur3-remove]]
[float]
==== Removal
The plugin can be removed with the following command:
[source,sh]
----------------------------------------------------------------
sudo bin/plugin remove mapper-murmur3
----------------------------------------------------------------
The node must be stopped before removing the plugin.
[[mapper-murmur3-usage]]
==== Using the `murmur3` field
The `murmur3` field is typically used within a multi-field, so that both the original
value and its hash are stored in the index:
[source,js]
--------------------------
PUT my_index
{
"mappings": {
"my_type": {
"properties": {
"my_field": {
"type": "string",
"fields": {
"hash": {
"type": "murmur3"
}
}
}
}
}
}
}
--------------------------
// AUTOSENSE
Such a mapping would allow you to refer to `my_field.hash` in order to get hashes
of the values of the `my_field` field. This is only useful in order to run
`cardinality` aggregations:
[source,js]
--------------------------
# Example documents
PUT my_index/my_type/1
{
"my_field": "This is a document"
}
PUT my_index/my_type/2
{
"my_field": "This is another document"
}
GET my_index/_search
{
"aggs": {
"my_field_cardinality": {
"cardinality": {
"field": "my_field.hash" <1>
}
}
}
}
--------------------------
// AUTOSENSE
<1> Counting unique values on the `my_field.hash` field
Running a `cardinality` aggregation on the `my_field` field directly would
yield the same result, however using `my_field.hash` instead might result in
a speed-up if the field has a high cardinality. On the other hand, it is
discouraged to use the `murmur3` field on numeric fields and string fields
that are not almost unique as the use of a `murmur3` field is unlikely to
bring significant speed-ups, while increasing the amount of disk space required
to store the index.

View File

@ -14,5 +14,10 @@ The mapper-size plugin provides the `_size` meta field which, when enabled,
indexes the size in bytes of the original
{ref}/mapping-source-field.html[`_source`] field.
include::mapper-size.asciidoc[]
<<mapper-murmur3>>::
The mapper-murmur3 plugin allows hashes to be computed at index-time and stored
in the index for later use with the `cardinality` aggregation.
include::mapper-size.asciidoc[]
include::mapper-murmur3.asciidoc[]

View File

@ -23,9 +23,9 @@ match a query:
==== Precision control
This aggregation also supports the `precision_threshold` and `rehash` options:
This aggregation also supports the `precision_threshold` option:
experimental[The `precision_threshold` and `rehash` options are specific to the current internal implementation of the `cardinality` agg, which may change in the future]
experimental[The `precision_threshold` option is specific to the current internal implementation of the `cardinality` agg, which may change in the future]
[source,js]
--------------------------------------------------
@ -34,8 +34,7 @@ experimental[The `precision_threshold` and `rehash` options are specific to the
"author_count" : {
"cardinality" : {
"field" : "author_hash",
"precision_threshold": 100, <1>
"rehash": false <2>
"precision_threshold": 100 <1>
}
}
}
@ -49,11 +48,6 @@ supported value is 40000, thresholds above this number will have the same
effect as a threshold of 40000.
Default value depends on the number of parent aggregations that create multiple
buckets (such as terms or histograms).
<2> If you computed a hash on client-side, stored it into your documents and want
Elasticsearch to use them to compute counts using this hash function without
rehashing values, it is possible to specify `rehash: false`. Default value is
`true`. Please note that the hash must be indexed as a long when `rehash` is
false.
==== Counts are approximate
@ -86,47 +80,11 @@ counting millions of items.
==== Pre-computed hashes
If you don't want Elasticsearch to re-compute hashes on every run of this
aggregation, it is possible to use pre-computed hashes, either by computing a
hash on client-side, indexing it and specifying `rehash: false`, or by using
the special `murmur3` field mapper, typically in the context of a `multi-field`
in the mapping:
[source,js]
--------------------------------------------------
{
"author": {
"type": "string",
"fields": {
"hash": {
"type": "murmur3"
}
}
}
}
--------------------------------------------------
With such a mapping, Elasticsearch is going to compute hashes of the `author`
field at indexing time and store them in the `author.hash` field. This
way, unique counts can be computed using the cardinality aggregation by only
loading the hashes into memory, not the values of the `author` field, and
without computing hashes on the fly:
[source,js]
--------------------------------------------------
{
"aggs" : {
"author_count" : {
"cardinality" : {
"field" : "author.hash"
}
}
}
}
--------------------------------------------------
NOTE: `rehash` is automatically set to `false` when computing unique counts on
a `murmur3` field.
On string fields that have a high cardinality, it might be faster to store the
hash of your field values in your index and then run the cardinality aggregation
on this field. This can either be done by providing hash values from client-side
or by letting Elasticsearch compute hash values for you by using the
{plugins}/mapper-murmur3.html[`mapper-murmur3`] plugin.
NOTE: Pre-computing hashes is usually only useful on very large and/or
high-cardinality fields as it saves CPU and memory. However, on numeric

View File

@ -33,6 +33,7 @@ document:
<<search-suggesters-completion,Completion datatype>>::
`completion` to provide auto-complete suggestions
<<token-count>>:: `token_count` to count the number of tokens in a string
{plugins}/mapper-murmur3.html[`mapper-murmur3`]:: `murmur3` to compute hashes of values at index-time and store them in the index
Attachment datatype::

View File

@ -41,6 +41,16 @@ can install the plugin with:
The `_shutdown` API has been removed without a replacement. Nodes should be
managed via the operating system and the provided start/stop scripts.
==== `murmur3` is now a plugin
The `murmur3` field, which indexes hashes of the field values, has been moved
out of core and is available as a plugin. It can be installed as:
[source,sh]
------------------
./bin/plugin install mapper-murmur3
------------------
==== `_size` is now a plugin
The `_size` meta-data field, which indexes the size in bytes of the original

View File

@ -0,0 +1 @@
This plugin has no third party dependencies

View File

@ -0,0 +1,43 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Licensed to Elasticsearch under one or more contributor
license agreements. See the NOTICE file distributed with this work for additional
information regarding copyright ownership. ElasticSearch licenses this file to you
under the Apache License, Version 2.0 (the "License"); you may not use this
file except in compliance with the License. You may obtain a copy of the
License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by
applicable law or agreed to in writing, software distributed under the License
is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the specific language
governing permissions and limitations under the License. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.elasticsearch.plugin</groupId>
<artifactId>elasticsearch-plugin</artifactId>
<version>2.1.0-SNAPSHOT</version>
</parent>
<artifactId>elasticsearch-mapper-murmur3</artifactId>
<name>Elasticsearch Mapper Murmur3 plugin</name>
<description>The Mapper Murmur3 plugin allows to compute hashes of a field's values at index-time and to store them in the index.</description>
<properties>
<elasticsearch.plugin.classname>org.elasticsearch.plugin.mapper.MapperMurmur3Plugin</elasticsearch.plugin.classname>
<tests.rest.suite>mapper_murmur3</tests.rest.suite>
<tests.rest.load_packaged>false</tests.rest.load_packaged>
</properties>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,65 @@
# Integration tests for Mapper Murmur3 components
#
---
"Mapper Murmur3":
- do:
indices.create:
index: test
body:
mappings:
type1: { "properties": { "foo": { "type": "string", "fields": { "hash": { "type": "murmur3" } } } } }
- do:
index:
index: test
type: type1
id: 0
body: { "foo": null }
- do:
indices.refresh: {}
- do:
search:
body: { "aggs": { "foo_count": { "cardinality": { "field": "foo.hash" } } } }
- match: { aggregations.foo_count.value: 0 }
- do:
index:
index: test
type: type1
id: 1
body: { "foo": "bar" }
- do:
index:
index: test
type: type1
id: 2
body: { "foo": "baz" }
- do:
index:
index: test
type: type1
id: 3
body: { "foo": "quux" }
- do:
index:
index: test
type: type1
id: 4
body: { "foo": "bar" }
- do:
indices.refresh: {}
- do:
search:
body: { "aggs": { "foo_count": { "cardinality": { "field": "foo.hash" } } } }
- match: { aggregations.foo_count.value: 3 }

View File

@ -17,9 +17,10 @@
* under the License.
*/
package org.elasticsearch.index.mapper.core;
package org.elasticsearch.index.mapper.murmur3;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.Version;
import org.elasticsearch.common.Explicit;
@ -31,12 +32,13 @@ import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.mapper.core.LongFieldMapper;
import org.elasticsearch.index.mapper.core.NumberFieldMapper;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import static org.elasticsearch.index.mapper.MapperBuilders.murmur3Field;
import static org.elasticsearch.index.mapper.core.TypeParsers.parseNumberField;
public class Murmur3FieldMapper extends LongFieldMapper {
@ -45,6 +47,9 @@ public class Murmur3FieldMapper extends LongFieldMapper {
public static class Defaults extends LongFieldMapper.Defaults {
public static final MappedFieldType FIELD_TYPE = new Murmur3FieldType();
static {
FIELD_TYPE.freeze();
}
}
public static class Builder extends NumberFieldMapper.Builder<Builder, Murmur3FieldMapper> {
@ -65,6 +70,17 @@ public class Murmur3FieldMapper extends LongFieldMapper {
return fieldMapper;
}
@Override
protected void setupFieldType(BuilderContext context) {
super.setupFieldType(context);
if (context.indexCreatedVersion().onOrAfter(Version.V_2_0_0)) {
fieldType.setIndexOptions(IndexOptions.NONE);
defaultFieldType.setIndexOptions(IndexOptions.NONE);
fieldType.setHasDocValues(true);
defaultFieldType.setHasDocValues(true);
}
}
@Override
protected NamedAnalyzer makeNumberAnalyzer(int precisionStep) {
return NumericLongAnalyzer.buildNamedAnalyzer(precisionStep);
@ -80,7 +96,7 @@ public class Murmur3FieldMapper extends LongFieldMapper {
@Override
@SuppressWarnings("unchecked")
public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
Builder builder = murmur3Field(name);
Builder builder = new Builder(name);
// tweaking these settings is no longer allowed, the entire purpose of murmur3 fields is to store a hash
if (parserContext.indexVersionCreated().onOrAfter(Version.V_2_0_0_beta1)) {
@ -92,6 +108,10 @@ public class Murmur3FieldMapper extends LongFieldMapper {
}
}
if (parserContext.indexVersionCreated().before(Version.V_2_0_0)) {
builder.indexOptions(IndexOptions.DOCS);
}
parseNumberField(builder, name, node, parserContext);
// Because this mapper extends LongFieldMapper the null_value field will be added to the JSON when transferring cluster state
// between nodes so we have to remove the entry here so that the validation doesn't fail
@ -104,7 +124,8 @@ public class Murmur3FieldMapper extends LongFieldMapper {
// this only exists so a check can be done to match the field type to using murmur3 hashing...
public static class Murmur3FieldType extends LongFieldMapper.LongFieldType {
public Murmur3FieldType() {}
public Murmur3FieldType() {
}
protected Murmur3FieldType(Murmur3FieldType ref) {
super(ref);

View File

@ -0,0 +1,36 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.mapper.murmur3;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.AbstractIndexComponent;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.mapper.MapperService;
/**
 * Index-level component whose only job is to make the {@code murmur3} field
 * type known to the index's mapping parser. All of the work happens in the
 * constructor; the class declares no other members.
 */
public class RegisterMurmur3FieldMapper extends AbstractIndexComponent {
/**
 * Registers a {@link Murmur3FieldMapper.TypeParser} under
 * {@link Murmur3FieldMapper#CONTENT_TYPE} on the document mapper parser
 * obtained from the given mapper service.
 *
 * @param index         the index this component belongs to, passed to the superclass
 * @param indexSettings the index settings, passed to the superclass
 * @param mapperService the mapper service whose document mapper parser receives the type parser
 */
@Inject
public RegisterMurmur3FieldMapper(Index index, Settings indexSettings, MapperService mapperService) {
super(index, indexSettings);
// Registration is a constructor side effect: instantiating this component is what enables the field type.
mapperService.documentMapperParser().putTypeParser(Murmur3FieldMapper.CONTENT_TYPE, new Murmur3FieldMapper.TypeParser());
}
}

View File

@ -0,0 +1,31 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.plugin.mapper;
import org.elasticsearch.common.inject.AbstractModule;
import org.elasticsearch.index.mapper.murmur3.RegisterMurmur3FieldMapper;
/**
 * Injection module that binds {@link RegisterMurmur3FieldMapper} as an eager
 * singleton.
 */
public class MapperMurmur3IndexModule extends AbstractModule {

    /**
     * Declares the single binding of this module.
     */
    @Override
    protected void configure() {
        final Class<RegisterMurmur3FieldMapper> registrar = RegisterMurmur3FieldMapper.class;
        bind(registrar).asEagerSingleton();
    }
}

View File

@ -0,0 +1,45 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.plugin.mapper;
import org.elasticsearch.common.inject.Module;
import org.elasticsearch.plugins.AbstractPlugin;
import java.util.Collection;
import java.util.Collections;
/**
 * Plugin entry point for the {@code murmur3} mapper. It reports the plugin's
 * name and description and contributes {@link MapperMurmur3IndexModule} as its
 * only index-level module.
 */
public class MapperMurmur3Plugin extends AbstractPlugin {

    /** Name reported by {@link #name()}. */
    private static final String NAME = "mapper-murmur3";

    /** Text reported by {@link #description()}. */
    private static final String DESCRIPTION =
            "A mapper that allows to precompute murmur3 hashes of values at index-time and store them in the index";

    /**
     * @return the plugin name, {@code "mapper-murmur3"}
     */
    @Override
    public String name() {
        return NAME;
    }

    /**
     * @return a one-sentence, human-readable description of the plugin
     */
    @Override
    public String description() {
        return DESCRIPTION;
    }

    /**
     * @return a single-element collection holding {@link MapperMurmur3IndexModule}
     */
    @Override
    public Collection<Class<? extends Module>> indexModules() {
        final Class<? extends Module> indexModule = MapperMurmur3IndexModule.class;
        return Collections.<Class<? extends Module>>singleton(indexModule);
    }
}

View File

@ -0,0 +1,42 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.mapper.murmur3;
import com.carrotsearch.randomizedtesting.annotations.Name;
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import org.elasticsearch.test.rest.ESRestTestCase;
import org.elasticsearch.test.rest.RestTestCandidate;
import org.elasticsearch.test.rest.parser.RestTestParseException;
import java.io.IOException;
/**
 * REST test suite for the murmur3 mapper plugin. Each instance wraps one
 * {@link RestTestCandidate} produced by {@link #parameters()}.
 */
public class MapperMurmur3RestIT extends ESRestTestCase {
/**
 * @param testCandidate the candidate handed through to {@link ESRestTestCase}; bound under the parameter name "yaml"
 */
public MapperMurmur3RestIT(@Name("yaml") RestTestCandidate testCandidate) {
super(testCandidate);
}
/**
 * Parameters factory for this suite; delegates entirely to {@code createParameters(0, 1)}.
 * NOTE(review): the arguments presumably mean "partition 0 of 1", i.e. run every candidate; confirm against ESRestTestCase.
 *
 * @return one {@code Object[]} of constructor arguments per test candidate
 * @throws IOException declared for the delegate call
 * @throws RestTestParseException declared for the delegate call
 */
@ParametersFactory
public static Iterable<Object[]> parameters() throws IOException, RestTestParseException {
return createParameters(0, 1);
}
}

View File

@ -17,9 +17,11 @@
* under the License.
*/
package org.elasticsearch.index.mapper.core;
package org.elasticsearch.index.mapper.murmur3;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
@ -28,9 +30,12 @@ import org.elasticsearch.index.IndexService;
import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.DocumentMapperParser;
import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.ParsedDocument;
import org.elasticsearch.test.ESSingleNodeTestCase;
import org.junit.Before;
import java.util.Arrays;
public class Murmur3FieldMapperTests extends ESSingleNodeTestCase {
IndexService indexService;
@ -40,6 +45,22 @@ public class Murmur3FieldMapperTests extends ESSingleNodeTestCase {
public void before() {
    // Create the "test" index and keep a handle on its mapping parser for the tests.
    indexService = createIndex("test");
    parser = indexService.mapperService().documentMapperParser();

    // murmur3 is no longer registered by DocumentMapperParser itself, so the
    // type parser has to be added explicitly before any mapping is parsed.
    final Murmur3FieldMapper.TypeParser murmur3Parser = new Murmur3FieldMapper.TypeParser();
    parser.putTypeParser(Murmur3FieldMapper.CONTENT_TYPE, murmur3Parser);
}
/**
 * Checks the defaults of a {@code murmur3} field declared with nothing but its
 * type: parsing a document with one value must produce exactly one Lucene field
 * that has {@link IndexOptions#NONE} index options and
 * {@link DocValuesType#SORTED_NUMERIC} doc values.
 */
public void testDefaults() throws Exception {
    // Mapping with a single property, "field", whose only setting is its type.
    final String mappingSource = XContentFactory.jsonBuilder()
            .startObject()
                .startObject("type")
                    .startObject("properties")
                        .startObject("field")
                            .field("type", "murmur3")
                        .endObject()
                    .endObject()
                .endObject()
            .endObject()
            .string();
    final DocumentMapper docMapper = parser.parse(mappingSource);

    // Parse a one-field document through that mapping.
    final ParsedDocument indexed = docMapper.parse("test", "type", "1",
            XContentFactory.jsonBuilder()
                    .startObject()
                        .field("field", "value")
                    .endObject()
                    .bytes());

    final IndexableField[] hashFields = indexed.rootDoc().getFields("field");
    assertNotNull(hashFields);
    // The array contents double as the failure message.
    assertEquals(Arrays.toString(hashFields), 1, hashFields.length);

    final IndexableField hashField = hashFields[0];
    assertEquals(IndexOptions.NONE, hashField.fieldType().indexOptions());
    assertEquals(DocValuesType.SORTED_NUMERIC, hashField.fieldType().docValuesType());
}
public void testDocValuesSettingNotAllowed() throws Exception {
@ -100,6 +121,7 @@ public class Murmur3FieldMapperTests extends ESSingleNodeTestCase {
Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.V_1_4_2.id).build();
indexService = createIndex("test_bwc", settings);
parser = indexService.mapperService().documentMapperParser();
parser.putTypeParser(Murmur3FieldMapper.CONTENT_TYPE, new Murmur3FieldMapper.TypeParser());
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties").startObject("field")
.field("type", "murmur3")
@ -115,6 +137,7 @@ public class Murmur3FieldMapperTests extends ESSingleNodeTestCase {
Settings settings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, Version.V_1_4_2.id).build();
indexService = createIndex("test_bwc", settings);
parser = indexService.mapperService().documentMapperParser();
parser.putTypeParser(Murmur3FieldMapper.CONTENT_TYPE, new Murmur3FieldMapper.TypeParser());
String mapping = XContentFactory.jsonBuilder().startObject().startObject("type")
.startObject("properties").startObject("field")
.field("type", "murmur3")

View File

@ -436,6 +436,7 @@
<module>delete-by-query</module>
<module>lang-python</module>
<module>lang-javascript</module>
<module>mapper-murmur3</module>
<module>mapper-size</module>
<module>jvm-example</module>
<module>site-example</module>

View File

@ -333,6 +333,14 @@
<overWrite>true</overWrite>
</artifactItem>
<artifactItem>
<groupId>org.elasticsearch.plugin</groupId>
<artifactId>elasticsearch-mapper-murmur3</artifactId>
<version>${elasticsearch.version}</version>
<type>zip</type>
<overWrite>true</overWrite>
</artifactItem>
<artifactItem>
<groupId>org.elasticsearch.plugin</groupId>
<artifactId>elasticsearch-mapper-size</artifactId>