From 7690b40ec656f2161da605c6b132edeb41928043 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Fri, 1 Nov 2013 21:00:59 -0400 Subject: [PATCH] Allow string fields to store token counts To use this one you send a string to a field of type 'token_count'. This makes the most sense with a multi-field. --- .../mapping/types/core-types.asciidoc | 43 ++++ .../index/mapper/DocumentMapperParser.java | 1 + .../index/mapper/MapperBuilders.java | 4 + .../index/mapper/core/IntegerFieldMapper.java | 7 + .../index/mapper/core/StringFieldMapper.java | 111 ++++++--- .../mapper/core/TokenCountFieldMapper.java | 198 ++++++++++++++++ ...TokenCountFieldMapperIntegrationTests.java | 222 ++++++++++++++++++ .../core/TokenCountFieldMapperTests.java | 92 ++++++++ 8 files changed, 643 insertions(+), 35 deletions(-) create mode 100644 src/main/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapper.java create mode 100644 src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperIntegrationTests.java create mode 100644 src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperTests.java diff --git a/docs/reference/mapping/types/core-types.asciidoc b/docs/reference/mapping/types/core-types.asciidoc index 0d65ff62ca8..0f3209cb244 100644 --- a/docs/reference/mapping/types/core-types.asciidoc +++ b/docs/reference/mapping/types/core-types.asciidoc @@ -212,6 +212,49 @@ defaults to `true` or to the parent `object` type setting. |======================================================================= +[float] +[[token_count]] +==== Token Count +added[0.90.8] +The `token_count` type maps to the JSON string type but indexes and stores +the number of tokens in the string rather than the string itself. 
For +example: + +[source,js] +-------------------------------------------------- +{ + "tweet" : { + "properties" : { + "message" : { + "type" : "multi_field", + "fields" : { + "name": { + "type": "string" + }, + "word_count": { + "type" : "token_count", + "store" : "yes", + "analyzer" : "standard" + } + } + } + } + } +} +-------------------------------------------------- + +All the configuration that can be specified for a number can be specified +for a token_count. The only extra configuration is the required +`analyzer` field which specifies which analyzer to use to break the string +into tokens. For best performance, use an analyzer with no token filters. + +[NOTE] +=================================================================== +Technically the `token_count` type sums position increments rather than +counting tokens. This means that even if the analyzer filters out stop +words they are included in the count. +=================================================================== + [float] [[date]] ==== Date diff --git a/src/main/java/org/elasticsearch/index/mapper/DocumentMapperParser.java b/src/main/java/org/elasticsearch/index/mapper/DocumentMapperParser.java index 6ddbea639b2..b2626990c5a 100644 --- a/src/main/java/org/elasticsearch/index/mapper/DocumentMapperParser.java +++ b/src/main/java/org/elasticsearch/index/mapper/DocumentMapperParser.java @@ -95,6 +95,7 @@ public class DocumentMapperParser extends AbstractIndexComponent { .put(DateFieldMapper.CONTENT_TYPE, new DateFieldMapper.TypeParser()) .put(IpFieldMapper.CONTENT_TYPE, new IpFieldMapper.TypeParser()) .put(StringFieldMapper.CONTENT_TYPE, new StringFieldMapper.TypeParser()) + .put(TokenCountFieldMapper.CONTENT_TYPE, new TokenCountFieldMapper.TypeParser()) .put(ObjectMapper.CONTENT_TYPE, new ObjectMapper.TypeParser()) .put(ObjectMapper.NESTED_CONTENT_TYPE, new ObjectMapper.TypeParser()) .put(MultiFieldMapper.CONTENT_TYPE, new MultiFieldMapper.TypeParser()) diff --git 
a/src/main/java/org/elasticsearch/index/mapper/MapperBuilders.java b/src/main/java/org/elasticsearch/index/mapper/MapperBuilders.java index 7b798c79091..478a7f683d1 100644 --- a/src/main/java/org/elasticsearch/index/mapper/MapperBuilders.java +++ b/src/main/java/org/elasticsearch/index/mapper/MapperBuilders.java @@ -133,6 +133,10 @@ public final class MapperBuilders { return new IntegerFieldMapper.Builder(name); } + public static TokenCountFieldMapper.Builder tokenCountField(String name) { + return new TokenCountFieldMapper.Builder(name); + } + public static LongFieldMapper.Builder longField(String name) { return new LongFieldMapper.Builder(name); } diff --git a/src/main/java/org/elasticsearch/index/mapper/core/IntegerFieldMapper.java b/src/main/java/org/elasticsearch/index/mapper/core/IntegerFieldMapper.java index 451ae4a94f4..f9867d032ba 100644 --- a/src/main/java/org/elasticsearch/index/mapper/core/IntegerFieldMapper.java +++ b/src/main/java/org/elasticsearch/index/mapper/core/IntegerFieldMapper.java @@ -313,7 +313,10 @@ public class IntegerFieldMapper extends NumberFieldMapper { } } } + addIntegerFields(fields, value, boost); + } + protected void addIntegerFields(List fields, int value, float boost) { if (fieldType.indexed() || fieldType.stored()) { CustomIntegerNumericField field = new CustomIntegerNumericField(this, value, fieldType); field.setBoost(boost); @@ -326,6 +329,10 @@ public class IntegerFieldMapper extends NumberFieldMapper { } } + protected Integer nullValue() { + return nullValue; + } + @Override protected String contentType() { return CONTENT_TYPE; diff --git a/src/main/java/org/elasticsearch/index/mapper/core/StringFieldMapper.java b/src/main/java/org/elasticsearch/index/mapper/core/StringFieldMapper.java index 61cbdab4ef3..515dad5e89f 100644 --- a/src/main/java/org/elasticsearch/index/mapper/core/StringFieldMapper.java +++ b/src/main/java/org/elasticsearch/index/mapper/core/StringFieldMapper.java @@ -259,57 +259,69 @@ public class 
StringFieldMapper extends AbstractFieldMapper implements Al @Override protected void parseCreateField(ParseContext context, List fields) throws IOException { - String value = nullValue; - float boost = this.boost; - if (context.externalValueSet()) { - value = (String) context.externalValue(); - } else { - XContentParser parser = context.parser(); - if (parser.currentToken() == XContentParser.Token.VALUE_NULL) { - value = nullValue; - } else if (parser.currentToken() == XContentParser.Token.START_OBJECT) { - XContentParser.Token token; - String currentFieldName = null; - while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { - if (token == XContentParser.Token.FIELD_NAME) { - currentFieldName = parser.currentName(); - } else { - if ("value".equals(currentFieldName) || "_value".equals(currentFieldName)) { - value = parser.textOrNull(); - } else if ("boost".equals(currentFieldName) || "_boost".equals(currentFieldName)) { - boost = parser.floatValue(); - } else { - throw new ElasticSearchIllegalArgumentException("unknown property [" + currentFieldName + "]"); - } - } - } - } else { - value = parser.textOrNull(); - } - } - if (value == null) { + ValueAndBoost valueAndBoost = parseCreateFieldForString(context, nullValue, boost); + if (valueAndBoost.value() == null) { return; } - if (ignoreAbove > 0 && value.length() > ignoreAbove) { + if (ignoreAbove > 0 && valueAndBoost.value().length() > ignoreAbove) { return; } if (context.includeInAll(includeInAll, this)) { - context.allEntries().addText(names.fullName(), value, boost); + context.allEntries().addText(names.fullName(), valueAndBoost.value(), valueAndBoost.boost()); } if (fieldType.indexed() || fieldType.stored()) { - Field field = new StringField(names.indexName(), value, fieldType); - field.setBoost(boost); + Field field = new StringField(names.indexName(), valueAndBoost.value(), fieldType); + field.setBoost(valueAndBoost.boost()); fields.add(field); } if (hasDocValues()) { - fields.add(new 
SortedSetDocValuesField(names.indexName(), new BytesRef(value))); + fields.add(new SortedSetDocValuesField(names.indexName(), new BytesRef(valueAndBoost.value()))); } if (fields.isEmpty()) { - context.ignoredValue(names.indexName(), value); + context.ignoredValue(names.indexName(), valueAndBoost.value()); } } + /** + * Parse a field as though it were a string. + * @param context parse context used during parsing + * @param nullValue value to use for null + * @param defaultBoost default boost value returned unless overwritten in the field + * @return the parsed field and the boost either parsed or defaulted + * @throws IOException if thrown while parsing + */ + public static ValueAndBoost parseCreateFieldForString(ParseContext context, String nullValue, float defaultBoost) throws IOException { + if (context.externalValueSet()) { + return new ValueAndBoost((String) context.externalValue(), defaultBoost); + } + XContentParser parser = context.parser(); + if (parser.currentToken() == XContentParser.Token.VALUE_NULL) { + return new ValueAndBoost(nullValue, defaultBoost); + } + if (parser.currentToken() == XContentParser.Token.START_OBJECT) { + XContentParser.Token token; + String currentFieldName = null; + String value = nullValue; + float boost = defaultBoost; + while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { + if (token == XContentParser.Token.FIELD_NAME) { + currentFieldName = parser.currentName(); + } else { + if ("value".equals(currentFieldName) || "_value".equals(currentFieldName)) { + value = parser.textOrNull(); + } else if ("boost".equals(currentFieldName) || "_boost".equals(currentFieldName)) { + boost = parser.floatValue(); + } else { + throw new ElasticSearchIllegalArgumentException("unknown property [" + currentFieldName + "]"); + } + } + } + return new ValueAndBoost(value, boost); + } + return new ValueAndBoost(parser.textOrNull(), defaultBoost); + } + @Override protected String contentType() { return CONTENT_TYPE; @@ -437,4 
+449,33 @@ public class StringFieldMapper extends AbstractFieldMapper implements Al value = null; } } + + /** + * Parsed value and boost to be returned from {@link #parseCreateFieldForString}. + */ + public static class ValueAndBoost { + private final String value; + private final float boost; + + public ValueAndBoost(String value, float boost) { + this.value = value; + this.boost = boost; + } + + /** + * Value of string field. + * @return value of string field + */ + public String value() { + return value; + } + + /** + * Boost either parsed from the document or defaulted. + * @return boost either parsed from the document or defaulted + */ + public float boost() { + return boost; + } + } } diff --git a/src/main/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapper.java b/src/main/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapper.java new file mode 100644 index 00000000000..51751ff894f --- /dev/null +++ b/src/main/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapper.java @@ -0,0 +1,198 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.mapper.core; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.elasticsearch.common.Explicit; +import org.elasticsearch.common.Strings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.index.analysis.NamedAnalyzer; +import org.elasticsearch.index.codec.docvaluesformat.DocValuesFormatProvider; +import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider; +import org.elasticsearch.index.mapper.*; +import org.elasticsearch.index.mapper.core.StringFieldMapper.ValueAndBoost; +import org.elasticsearch.index.similarity.SimilarityProvider; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import static org.elasticsearch.common.xcontent.support.XContentMapValues.nodeIntegerValue; +import static org.elasticsearch.index.mapper.MapperBuilders.tokenCountField; +import static org.elasticsearch.index.mapper.core.TypeParsers.parseNumberField; + +/** + * A {@link FieldMapper} that takes a string and writes a count of the tokens in that string + * to the index. In most ways the mapper acts just like an {@link IntegerFieldMapper}. 
+ */ +public class TokenCountFieldMapper extends IntegerFieldMapper { + public static final String CONTENT_TYPE = "token_count"; + + public static class Defaults extends IntegerFieldMapper.Defaults { + } + + public static class Builder extends NumberFieldMapper.Builder<Builder, TokenCountFieldMapper> { + private Integer nullValue = Defaults.NULL_VALUE; + private NamedAnalyzer analyzer; + + public Builder(String name) { + super(name, new FieldType(Defaults.FIELD_TYPE)); + builder = this; + } + + public Builder nullValue(int nullValue) { + this.nullValue = nullValue; + return this; + } + + public Builder analyzer(NamedAnalyzer analyzer) { + this.analyzer = analyzer; + return this; + } + + public NamedAnalyzer analyzer() { + return analyzer; + } + + @Override + public TokenCountFieldMapper build(BuilderContext context) { + fieldType.setOmitNorms(fieldType.omitNorms() && boost == 1.0f); + TokenCountFieldMapper fieldMapper = new TokenCountFieldMapper(buildNames(context), precisionStep, boost, fieldType, nullValue, + ignoreMalformed(context), postingsProvider, docValuesProvider, similarity, fieldDataSettings, context.indexSettings(), + analyzer); + fieldMapper.includeInAll(includeInAll); + return fieldMapper; + } + } + + public static class TypeParser implements Mapper.TypeParser { + @Override + @SuppressWarnings("unchecked") + public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException { + TokenCountFieldMapper.Builder builder = tokenCountField(name); + parseNumberField(builder, name, node, parserContext); + for (Map.Entry<String, Object> entry : node.entrySet()) { + String propName = Strings.toUnderscoreCase(entry.getKey()); + Object propNode = entry.getValue(); + if (propName.equals("null_value")) { + builder.nullValue(nodeIntegerValue(propNode)); + } else if (propName.equals("analyzer")) { + NamedAnalyzer analyzer = parserContext.analysisService().analyzer(propNode.toString()); + if (analyzer == null) { + throw new MapperParsingException("Analyzer [" + 
propNode.toString() + "] not found for field [" + name + "]"); + } + builder.analyzer(analyzer); + } + } + if (builder.analyzer() == null) { + throw new MapperParsingException("Analyzer must be set for field [" + name + "] but wasn't."); + } + return builder; + } + } + + private NamedAnalyzer analyzer; + + protected TokenCountFieldMapper(Names names, int precisionStep, float boost, FieldType fieldType, Integer nullValue, + Explicit<Boolean> ignoreMalformed, PostingsFormatProvider postingsProvider, DocValuesFormatProvider docValuesProvider, + SimilarityProvider similarity, Settings fieldDataSettings, Settings indexSettings, NamedAnalyzer analyzer) { + super(names, precisionStep, boost, fieldType, nullValue, ignoreMalformed, postingsProvider, docValuesProvider, similarity, + fieldDataSettings, indexSettings); + this.analyzer = analyzer; + } + + @Override + protected void parseCreateField(ParseContext context, List<Field> fields) throws IOException { + ValueAndBoost valueAndBoost = StringFieldMapper.parseCreateFieldForString(context, null /* Our null value is an Integer so it is substituted below */, boost); + if (valueAndBoost.value() == null && nullValue() == null) { + return; + } + + if (fieldType.indexed() || fieldType.stored() || hasDocValues()) { + int count; + if (valueAndBoost.value() == null) { + count = nullValue(); + } else { + count = countPositions(analyzer.analyzer().tokenStream(name(), valueAndBoost.value())); + } + addIntegerFields(fields, count, valueAndBoost.boost()); + } + if (fields.isEmpty()) { + context.ignoredValue(names.indexName(), valueAndBoost.value()); + } + } + + /** + * Count position increments in a token stream. Package private for testing. 
+ * @param tokenStream token stream to count + * @return number of position increments in a token stream + * @throws IOException if tokenStream throws it + */ + static int countPositions(TokenStream tokenStream) throws IOException { + try { + int count = 0; + PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class); + tokenStream.reset(); + while (tokenStream.incrementToken()) { + count += position.getPositionIncrement(); + } + tokenStream.end(); + count += position.getPositionIncrement(); + return count; + } finally { + tokenStream.close(); + } + } + + /** + * Name of analyzer. + * @return name of analyzer + */ + public String analyzer() { + return analyzer.name(); + } + + @Override + protected String contentType() { + return CONTENT_TYPE; + } + + @Override + public void merge(Mapper mergeWith, MergeContext mergeContext) throws MergeMappingException { + super.merge(mergeWith, mergeContext); + if (!this.getClass().equals(mergeWith.getClass())) { + return; + } + if (!mergeContext.mergeFlags().simulate()) { + this.analyzer = ((TokenCountFieldMapper) mergeWith).analyzer; + } + } + + @Override + protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException { + super.doXContentBody(builder, includeDefaults, params); + + builder.field("analyzer", analyzer()); + } +} diff --git a/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperIntegrationTests.java b/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperIntegrationTests.java new file mode 100644 index 00000000000..c493cb6bc89 --- /dev/null +++ b/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperIntegrationTests.java @@ -0,0 +1,222 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.mapper.core; + +import com.carrotsearch.randomizedtesting.annotations.Name; +import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; +import com.google.common.collect.ImmutableList; +import org.apache.lucene.util.LuceneTestCase; +import org.elasticsearch.ElasticSearchException; +import org.elasticsearch.action.bulk.BulkResponse; +import org.elasticsearch.action.index.IndexRequestBuilder; +import org.elasticsearch.action.search.SearchRequestBuilder; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.search.SearchHit; +import org.elasticsearch.search.facet.terms.TermsFacet; +import org.elasticsearch.search.facet.terms.TermsFacetBuilder; +import org.elasticsearch.test.ElasticsearchIntegrationTest; +import org.junit.Test; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; +import static org.hamcrest.Matchers.*; + +public class TokenCountFieldMapperIntegrationTests extends ElasticsearchIntegrationTest { + @ParametersFactory + public static Iterable buildParameters() { + List parameters = new ArrayList(); + for (boolean storeCountedFields : new boolean[] { true, false }) { + for (boolean loadCountedFields : new boolean[] { true, false 
}) { + parameters.add(new Object[] { storeCountedFields, loadCountedFields }); + } + } + return parameters; + } + + private final boolean storeCountedFields; + private final boolean loadCountedFields; + + public TokenCountFieldMapperIntegrationTests(@Name("storeCountedFields") boolean storeCountedFields, + @Name("loadCountedFields") boolean loadCountedFields) { + this.storeCountedFields = storeCountedFields; + this.loadCountedFields = loadCountedFields; + } + + /** + * It is possible to get the token count in a search response. + */ + @Test + public void searchReturnsTokenCount() throws ElasticSearchException, IOException { + init(); + + assertSearchReturns(searchById("single"), "single"); + assertSearchReturns(searchById("bulk1"), "bulk1"); + assertSearchReturns(searchById("bulk2"), "bulk2"); + assertSearchReturns(searchById("multi"), "multi"); + assertSearchReturns(searchById("multibulk1"), "multibulk1"); + assertSearchReturns(searchById("multibulk2"), "multibulk2"); + } + + /** + * It is possible to search by token count. + */ + @Test + public void searchByTokenCount() throws ElasticSearchException, IOException { + init(); + + assertSearchReturns(searchByNumericRange(4, 4).get(), "single"); + assertSearchReturns(searchByNumericRange(10, 10).get(), "multibulk2"); + assertSearchReturns(searchByNumericRange(7, 10).get(), "multi", "multibulk1", "multibulk2"); + assertSearchReturns(searchByNumericRange(1, 10).get(), "single", "bulk1", "bulk2", "multi", "multibulk1", "multibulk2"); + assertSearchReturns(searchByNumericRange(12, 12).get()); + } + + /** + * It is possible to facet by token count. 
+ */ + @Test + public void facetByTokenCount() throws ElasticSearchException, IOException { + init(); + + String facetField = randomFrom(ImmutableList.of( + "foo.token_count", "foo.token_count_unstored", "foo.token_count_with_doc_values")); + SearchResponse result = searchByNumericRange(1, 10) + .addFacet(new TermsFacetBuilder("facet").field(facetField)).get(); + assertSearchReturns(result, "single", "bulk1", "bulk2", "multi", "multibulk1", "multibulk2"); + assertThat(result.getFacets().facets().size(), equalTo(1)); + TermsFacet facet = (TermsFacet) result.getFacets().facets().get(0); + assertThat(facet.getEntries().size(), equalTo(9)); + } + + private void init() throws ElasticSearchException, IOException { + prepareCreate("test").addMapping("test", jsonBuilder().startObject() + .startObject("test") + .startObject("properties") + .startObject("foo") + .field("type", "multi_field") + .startObject("fields") + .startObject("foo") + .field("type", "string") + .field("store", storeCountedFields) + .field("analyzer", "simple") + .endObject() + .startObject("token_count") + .field("type", "token_count") + .field("analyzer", "standard") + .field("store", true) + .endObject() + .startObject("token_count_unstored") + .field("type", "token_count") + .field("analyzer", "standard") + .endObject() + .startObject("token_count_with_doc_values") + .field("type", "token_count") + .field("analyzer", "standard") + .startObject("fielddata") + .field("format", LuceneTestCase.defaultCodecSupportsSortedSet() ? 
"doc_values" : null) + .endObject() + .endObject() + .endObject() + .endObject() + .endObject() + .endObject().endObject()).get(); + ensureGreen(); + + assertTrue(prepareIndex("single", "I have four terms").get().isCreated()); + BulkResponse bulk = client().prepareBulk() + .add(prepareIndex("bulk1", "bulk three terms")) + .add(prepareIndex("bulk2", "this has five bulk terms")).get(); + assertFalse(bulk.buildFailureMessage(), bulk.hasFailures()); + assertTrue(prepareIndex("multi", "two terms", "wow now I have seven lucky terms").get().isCreated()); + bulk = client().prepareBulk() + .add(prepareIndex("multibulk1", "one", "oh wow now I have eight unlucky terms")) + .add(prepareIndex("multibulk2", "six is a bunch of terms", "ten! ten terms is just crazy! too many too count!")).get(); + assertFalse(bulk.buildFailureMessage(), bulk.hasFailures()); + + assertThat(refresh().getFailedShards(), equalTo(0)); + } + + private IndexRequestBuilder prepareIndex(String id, String... texts) throws IOException { + return client().prepareIndex("test", "test", id).setSource("foo", texts); + } + + private SearchResponse searchById(String id) { + return prepareSearch().setQuery(QueryBuilders.termQuery("_id", id)).get(); + } + + private SearchRequestBuilder searchByNumericRange(int low, int high) { + return prepareSearch().setQuery(QueryBuilders.rangeQuery(randomFrom( + ImmutableList.of("foo.token_count", "foo.token_count_unstored", "foo.token_count_with_doc_values") + )).gte(low).lte(high)); + } + + private SearchRequestBuilder prepareSearch() { + SearchRequestBuilder request = client().prepareSearch("test").setTypes("test"); + request.addField("foo.token_count"); + if (loadCountedFields) { + request.addField("foo"); + } + return request; + } + + private void assertSearchReturns(SearchResponse result, String... 
ids) { + assertThat(result.getHits().getTotalHits(), equalTo((long) ids.length)); + assertThat(result.getHits().hits().length, equalTo(ids.length)); + List foundIds = new ArrayList(); + for (SearchHit hit : result.getHits()) { + foundIds.add(hit.id()); + } + assertThat(foundIds, containsInAnyOrder(ids)); + for (SearchHit hit : result.getHits()) { + String id = hit.id(); + if (id.equals("single")) { + assertSearchHit(hit, 4); + } else if (id.equals("bulk1")) { + assertSearchHit(hit, 3); + } else if (id.equals("bulk2")) { + assertSearchHit(hit, 5); + } else if (id.equals("multi")) { + assertSearchHit(hit, 2, 7); + } else if (id.equals("multibulk1")) { + assertSearchHit(hit, 1, 8); + } else if (id.equals("multibulk2")) { + assertSearchHit(hit, 6, 10); + } else { + throw new ElasticSearchException("Unexpected response!"); + } + } + } + + private void assertSearchHit(SearchHit hit, int... termCounts) { + assertThat(hit.field("foo.token_count"), not(nullValue())); + assertThat(hit.field("foo.token_count").values().size(), equalTo(termCounts.length)); + for (int i = 0; i < termCounts.length; i++) { + assertThat((Integer) hit.field("foo.token_count").values().get(i), equalTo(termCounts[i])); + } + + if (loadCountedFields && storeCountedFields) { + assertThat(hit.field("foo").values().size(), equalTo(termCounts.length)); + } + } +} diff --git a/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperTests.java b/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperTests.java new file mode 100644 index 00000000000..dad48295db8 --- /dev/null +++ b/src/test/java/org/elasticsearch/index/mapper/core/TokenCountFieldMapperTests.java @@ -0,0 +1,92 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.mapper.core; + +import org.apache.lucene.analysis.CannedTokenStream; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.elasticsearch.common.xcontent.XContentFactory; +import org.elasticsearch.index.mapper.DocumentMapper; +import org.elasticsearch.index.mapper.MapperTestUtils; +import org.elasticsearch.test.ElasticsearchTestCase; +import org.junit.Test; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; + +import static org.elasticsearch.index.mapper.DocumentMapper.MergeFlags.mergeFlags; +import static org.hamcrest.Matchers.equalTo; + +/** + * Test for {@link TokenCountFieldMapper}. 
+ */ +public class TokenCountFieldMapperTests extends ElasticsearchTestCase { + @Test + public void testMerge() throws IOException { + String stage1Mapping = XContentFactory.jsonBuilder().startObject() + .startObject("person") + .startObject("properties") + .startObject("tc") + .field("type", "token_count") + .field("analyzer", "keyword") + .endObject() + .endObject() + .endObject().endObject().string(); + DocumentMapper stage1 = MapperTestUtils.newParser().parse(stage1Mapping); + + String stage2Mapping = XContentFactory.jsonBuilder().startObject() + .startObject("person") + .startObject("properties") + .startObject("tc") + .field("type", "token_count") + .field("analyzer", "standard") + .endObject() + .endObject() + .endObject().endObject().string(); + DocumentMapper stage2 = MapperTestUtils.newParser().parse(stage2Mapping); + + DocumentMapper.MergeResult mergeResult = stage1.merge(stage2, mergeFlags().simulate(true)); + assertThat(mergeResult.hasConflicts(), equalTo(false)); + // Just simulated so merge hasn't happened yet + assertThat(((TokenCountFieldMapper) stage1.mappers().smartName("tc").mapper()).analyzer(), equalTo("keyword")); + + mergeResult = stage1.merge(stage2, mergeFlags().simulate(false)); + assertThat(mergeResult.hasConflicts(), equalTo(false)); + // Not simulated this time so the analyzer from stage2 has been merged in + assertThat(((TokenCountFieldMapper) stage1.mappers().smartName("tc").mapper()).analyzer(), equalTo("standard")); + } + + @Test + public void testCountPositions() throws IOException { + // We're looking to make sure that we: + Token t1 = new Token(); // Don't count tokens without an increment + t1.setPositionIncrement(0); + Token t2 = new Token(); + t2.setPositionIncrement(1); // Count normal tokens with one increment + Token t3 = new Token(); + t3.setPositionIncrement(2); // Count funny tokens with more than one increment + int finalTokenIncrement = 4; // Count the final token increment on the rare token streams that have them + Token[] tokens = new 
Token[] {t1, t2, t3}; + Collections.shuffle(Arrays.asList(tokens), getRandom()); + TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens); + assertThat(TokenCountFieldMapper.countPositions(tokenStream), equalTo(7)); + } +}