Allow string fields to store token counts
To use it, send a string to a field of type `token_count`; the field indexes the number of tokens in the string rather than the string itself. This makes the most sense as part of a `multi_field`, so the original string is still indexed too.
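For example, roughly (the index and field names here are only illustrative, and the Java API calls mirror the integration test added below):

[source,java]
--------------------------------------------------
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.index.query.QueryBuilders;

public class WordCountExample {
    // Assumes "message" is mapped as a multi_field whose stored "word_count"
    // sub-field has type "token_count", as in the documentation example below.
    static Number fetchWordCount(Client client) {
        client.prepareIndex("tweets", "tweet", "1")
                .setSource("message", "some text to count").get();
        client.admin().indices().prepareRefresh("tweets").get();

        SearchResponse response = client.prepareSearch("tweets")
                .setQuery(QueryBuilders.matchAllQuery())
                .addField("message.word_count") // the count comes back like any stored number
                .get();
        return (Number) response.getHits().getAt(0).field("message.word_count").value();
    }
}
--------------------------------------------------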
parent 3494ac252e
commit 7690b40ec6
@@ -212,6 +212,49 @@ defaults to `true` or to the parent `object` type setting.
 |=======================================================================
 
+[float]
+[[token_count]]
+==== Token Count
+
+added[0.90.8]
+
+The `token_count` type maps to the JSON string type but indexes and stores
+the number of tokens in the string rather than the string itself. For
+example:
+
+[source,js]
+--------------------------------------------------
+{
+    "tweet" : {
+        "properties" : {
+            "message" : {
+                "type" : "multi_field",
+                "fields" : {
+                    "message": {
+                        "type": "string"
+                    },
+                    "word_count": {
+                        "type" : "token_count",
+                        "store" : "yes",
+                        "analyzer" : "standard"
+                    }
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+All the configuration that can be specified for a number can also be
+specified for a `token_count`. The only extra configuration is the
+required `analyzer` setting, which names the analyzer used to break the
+string into tokens. For best performance, use an analyzer without token
+filters.
+
+[NOTE]
+===================================================================
+Technically the `token_count` type sums position increments rather than
+counting tokens. This means that even if the analyzer filters out stop
+words they are included in the count.
+===================================================================
+
 [float]
 [[date]]
 ==== Date
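The note above about summing position increments can be checked against plain Lucene. A minimal sketch, assuming Lucene 4.x and `StandardAnalyzer`'s default English stop set (this demo class is not part of the commit); it uses the same counting loop as `countPositions` later in this commit:

[source,java]
--------------------------------------------------
import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

public class PositionIncrementDemo {
    public static void main(String[] args) throws IOException {
        // "the" is removed by the stop filter, but it bumps the position
        // increment of "quick" to 2, so summing increments still counts it.
        TokenStream stream = new StandardAnalyzer(Version.LUCENE_46)
                .tokenStream("field", "the quick fox");
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        int count = 0;
        while (stream.incrementToken()) {
            count += posIncr.getPositionIncrement();
        }
        stream.end();
        count += posIncr.getPositionIncrement(); // trailing increment, usually 0 here
        stream.close();
        System.out.println(count); // prints 3, not 2
    }
}
--------------------------------------------------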
@@ -95,6 +95,7 @@ public class DocumentMapperParser extends AbstractIndexComponent {
                 .put(DateFieldMapper.CONTENT_TYPE, new DateFieldMapper.TypeParser())
                 .put(IpFieldMapper.CONTENT_TYPE, new IpFieldMapper.TypeParser())
                 .put(StringFieldMapper.CONTENT_TYPE, new StringFieldMapper.TypeParser())
+                .put(TokenCountFieldMapper.CONTENT_TYPE, new TokenCountFieldMapper.TypeParser())
                 .put(ObjectMapper.CONTENT_TYPE, new ObjectMapper.TypeParser())
                 .put(ObjectMapper.NESTED_CONTENT_TYPE, new ObjectMapper.TypeParser())
                 .put(MultiFieldMapper.CONTENT_TYPE, new MultiFieldMapper.TypeParser())
@@ -133,6 +133,10 @@ public final class MapperBuilders {
         return new IntegerFieldMapper.Builder(name);
     }
 
+    public static TokenCountFieldMapper.Builder tokenCountField(String name) {
+        return new TokenCountFieldMapper.Builder(name);
+    }
+
     public static LongFieldMapper.Builder longField(String name) {
         return new LongFieldMapper.Builder(name);
     }
@@ -313,7 +313,10 @@ public class IntegerFieldMapper extends NumberFieldMapper<Integer> {
                 }
             }
         }
+        addIntegerFields(fields, value, boost);
+    }
+
+    protected void addIntegerFields(List<Field> fields, int value, float boost) {
         if (fieldType.indexed() || fieldType.stored()) {
             CustomIntegerNumericField field = new CustomIntegerNumericField(this, value, fieldType);
             field.setBoost(boost);
@@ -326,6 +329,10 @@ public class IntegerFieldMapper extends NumberFieldMapper<Integer> {
         }
     }
 
+    protected Integer nullValue() {
+        return nullValue;
+    }
+
     @Override
     protected String contentType() {
         return CONTENT_TYPE;
@@ -259,57 +259,69 @@ public class StringFieldMapper extends AbstractFieldMapper<String> implements Al
 
     @Override
     protected void parseCreateField(ParseContext context, List<Field> fields) throws IOException {
-        String value = nullValue;
-        float boost = this.boost;
-        if (context.externalValueSet()) {
-            value = (String) context.externalValue();
-        } else {
-            XContentParser parser = context.parser();
-            if (parser.currentToken() == XContentParser.Token.VALUE_NULL) {
-                value = nullValue;
-            } else if (parser.currentToken() == XContentParser.Token.START_OBJECT) {
-                XContentParser.Token token;
-                String currentFieldName = null;
-                while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
-                    if (token == XContentParser.Token.FIELD_NAME) {
-                        currentFieldName = parser.currentName();
-                    } else {
-                        if ("value".equals(currentFieldName) || "_value".equals(currentFieldName)) {
-                            value = parser.textOrNull();
-                        } else if ("boost".equals(currentFieldName) || "_boost".equals(currentFieldName)) {
-                            boost = parser.floatValue();
-                        } else {
-                            throw new ElasticSearchIllegalArgumentException("unknown property [" + currentFieldName + "]");
-                        }
-                    }
-                }
-            } else {
-                value = parser.textOrNull();
-            }
-        }
-        if (value == null) {
+        ValueAndBoost valueAndBoost = parseCreateFieldForString(context, nullValue, boost);
+        if (valueAndBoost.value() == null) {
             return;
         }
-        if (ignoreAbove > 0 && value.length() > ignoreAbove) {
+        if (ignoreAbove > 0 && valueAndBoost.value().length() > ignoreAbove) {
             return;
         }
         if (context.includeInAll(includeInAll, this)) {
-            context.allEntries().addText(names.fullName(), value, boost);
+            context.allEntries().addText(names.fullName(), valueAndBoost.value(), valueAndBoost.boost());
         }
 
         if (fieldType.indexed() || fieldType.stored()) {
-            Field field = new StringField(names.indexName(), value, fieldType);
-            field.setBoost(boost);
+            Field field = new StringField(names.indexName(), valueAndBoost.value(), fieldType);
+            field.setBoost(valueAndBoost.boost());
             fields.add(field);
         }
         if (hasDocValues()) {
-            fields.add(new SortedSetDocValuesField(names.indexName(), new BytesRef(value)));
+            fields.add(new SortedSetDocValuesField(names.indexName(), new BytesRef(valueAndBoost.value())));
         }
         if (fields.isEmpty()) {
-            context.ignoredValue(names.indexName(), value);
+            context.ignoredValue(names.indexName(), valueAndBoost.value());
         }
     }
 
+    /**
+     * Parse a field as though it were a string.
+     * @param context parse context used during parsing
+     * @param nullValue value to use for null
+     * @param defaultBoost default boost value returned unless overwritten in the field
+     * @return the parsed field and the boost either parsed or defaulted
+     * @throws IOException if thrown while parsing
+     */
+    public static ValueAndBoost parseCreateFieldForString(ParseContext context, String nullValue, float defaultBoost) throws IOException {
+        if (context.externalValueSet()) {
+            return new ValueAndBoost((String) context.externalValue(), defaultBoost);
+        }
+        XContentParser parser = context.parser();
+        if (parser.currentToken() == XContentParser.Token.VALUE_NULL) {
+            return new ValueAndBoost(nullValue, defaultBoost);
+        }
+        if (parser.currentToken() == XContentParser.Token.START_OBJECT) {
+            XContentParser.Token token;
+            String currentFieldName = null;
+            String value = nullValue;
+            float boost = defaultBoost;
+            while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
+                if (token == XContentParser.Token.FIELD_NAME) {
+                    currentFieldName = parser.currentName();
+                } else {
+                    if ("value".equals(currentFieldName) || "_value".equals(currentFieldName)) {
+                        value = parser.textOrNull();
+                    } else if ("boost".equals(currentFieldName) || "_boost".equals(currentFieldName)) {
+                        boost = parser.floatValue();
+                    } else {
+                        throw new ElasticSearchIllegalArgumentException("unknown property [" + currentFieldName + "]");
+                    }
+                }
+            }
+            return new ValueAndBoost(value, boost);
+        }
+        return new ValueAndBoost(parser.textOrNull(), defaultBoost);
+    }
+
     @Override
     protected String contentType() {
         return CONTENT_TYPE;
@@ -437,4 +449,33 @@ public class StringFieldMapper extends AbstractFieldMapper<String> implements Al
             value = null;
         }
     }
+
+    /**
+     * Parsed value and boost to be returned from {@link #parseCreateFieldForString}.
+     */
+    public static class ValueAndBoost {
+        private final String value;
+        private final float boost;
+
+        public ValueAndBoost(String value, float boost) {
+            this.value = value;
+            this.boost = boost;
+        }
+
+        /**
+         * Value of string field.
+         * @return value of string field
+         */
+        public String value() {
+            return value;
+        }
+
+        /**
+         * Boost either parsed from the document or defaulted.
+         * @return boost either parsed from the document or defaulted
+         */
+        public float boost() {
+            return boost;
+        }
+    }
 }
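Extracting `parseCreateFieldForString` keeps the per-value boost syntax working for both mappers. A hypothetical document body that exercises it (the field name is illustrative; this builds the JSON with the same XContentBuilder API the tests use):

[source,java]
--------------------------------------------------
import java.io.IOException;

import org.elasticsearch.common.xcontent.XContentBuilder;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

public class BoostSyntaxExample {
    // Wrapping the string in an object lets a per-value boost ride along;
    // parseCreateFieldForString returns both pieces as a ValueAndBoost.
    static XContentBuilder doc() throws IOException {
        return jsonBuilder().startObject()
                .startObject("message")
                    .field("value", "some text")
                    .field("boost", 2.0f)
                .endObject()
            .endObject();
    }
}
--------------------------------------------------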
@@ -0,0 +1,198 @@
/*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.mapper.core;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.elasticsearch.common.Explicit;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.codec.docvaluesformat.DocValuesFormatProvider;
import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
import org.elasticsearch.index.mapper.*;
import org.elasticsearch.index.mapper.core.StringFieldMapper.ValueAndBoost;
import org.elasticsearch.index.similarity.SimilarityProvider;

import java.io.IOException;
import java.util.List;
import java.util.Map;

import static org.elasticsearch.common.xcontent.support.XContentMapValues.nodeIntegerValue;
import static org.elasticsearch.index.mapper.MapperBuilders.tokenCountField;
import static org.elasticsearch.index.mapper.core.TypeParsers.parseNumberField;

/**
 * A {@link FieldMapper} that takes a string and writes a count of the tokens in that string
 * to the index. In most ways the mapper acts just like an {@link IntegerFieldMapper}.
 */
public class TokenCountFieldMapper extends IntegerFieldMapper {
    public static final String CONTENT_TYPE = "token_count";

    public static class Defaults extends IntegerFieldMapper.Defaults {
    }

    public static class Builder extends NumberFieldMapper.Builder<Builder, TokenCountFieldMapper> {
        private Integer nullValue = Defaults.NULL_VALUE;
        private NamedAnalyzer analyzer;

        public Builder(String name) {
            super(name, new FieldType(Defaults.FIELD_TYPE));
            builder = this;
        }

        public Builder nullValue(int nullValue) {
            this.nullValue = nullValue;
            return this;
        }

        public Builder analyzer(NamedAnalyzer analyzer) {
            this.analyzer = analyzer;
            return this;
        }

        public NamedAnalyzer analyzer() {
            return analyzer;
        }

        @Override
        public TokenCountFieldMapper build(BuilderContext context) {
            // Norms are only useful here when there is an index time boost
            fieldType.setOmitNorms(fieldType.omitNorms() && boost == 1.0f);
            TokenCountFieldMapper fieldMapper = new TokenCountFieldMapper(buildNames(context), precisionStep, boost, fieldType, nullValue,
                    ignoreMalformed(context), postingsProvider, docValuesProvider, similarity, fieldDataSettings, context.indexSettings(),
                    analyzer);
            fieldMapper.includeInAll(includeInAll);
            return fieldMapper;
        }
    }

    public static class TypeParser implements Mapper.TypeParser {
        @Override
        @SuppressWarnings("unchecked")
        public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
            TokenCountFieldMapper.Builder builder = tokenCountField(name);
            parseNumberField(builder, name, node, parserContext);
            for (Map.Entry<String, Object> entry : node.entrySet()) {
                String propName = Strings.toUnderscoreCase(entry.getKey());
                Object propNode = entry.getValue();
                if (propName.equals("null_value")) {
                    builder.nullValue(nodeIntegerValue(propNode));
                } else if (propName.equals("analyzer")) {
                    NamedAnalyzer analyzer = parserContext.analysisService().analyzer(propNode.toString());
                    if (analyzer == null) {
                        throw new MapperParsingException("Analyzer [" + propNode.toString() + "] not found for field [" + name + "]");
                    }
                    builder.analyzer(analyzer);
                }
            }
            if (builder.analyzer() == null) {
                throw new MapperParsingException("Analyzer must be set for field [" + name + "] but wasn't.");
            }
            return builder;
        }
    }

    private NamedAnalyzer analyzer;

    protected TokenCountFieldMapper(Names names, int precisionStep, float boost, FieldType fieldType, Integer nullValue,
            Explicit<Boolean> ignoreMalformed, PostingsFormatProvider postingsProvider, DocValuesFormatProvider docValuesProvider,
            SimilarityProvider similarity, Settings fieldDataSettings, Settings indexSettings, NamedAnalyzer analyzer) {
        super(names, precisionStep, boost, fieldType, nullValue, ignoreMalformed, postingsProvider, docValuesProvider, similarity,
                fieldDataSettings, indexSettings);
        this.analyzer = analyzer;
    }

    @Override
    protected void parseCreateField(ParseContext context, List<Field> fields) throws IOException {
        ValueAndBoost valueAndBoost = StringFieldMapper.parseCreateFieldForString(context, null /* Our null value is an int so we handle it ourselves below */, boost);
        if (valueAndBoost.value() == null && nullValue() == null) {
            return;
        }

        if (fieldType.indexed() || fieldType.stored() || hasDocValues()) {
            int count;
            if (valueAndBoost.value() == null) {
                count = nullValue();
            } else {
                count = countPositions(analyzer.analyzer().tokenStream(name(), valueAndBoost.value()));
            }
            addIntegerFields(fields, count, valueAndBoost.boost());
        }
        if (fields.isEmpty()) {
            context.ignoredValue(names.indexName(), valueAndBoost.value());
        }
    }

    /**
     * Count position increments in a token stream. Package private for testing.
     * @param tokenStream token stream to count
     * @return number of position increments in a token stream
     * @throws IOException if tokenStream throws it
     */
    static int countPositions(TokenStream tokenStream) throws IOException {
        try {
            int count = 0;
            PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                count += position.getPositionIncrement();
            }
            tokenStream.end();
            count += position.getPositionIncrement();
            return count;
        } finally {
            tokenStream.close();
        }
    }

    /**
     * Name of analyzer.
     * @return name of analyzer
     */
    public String analyzer() {
        return analyzer.name();
    }

    @Override
    protected String contentType() {
        return CONTENT_TYPE;
    }

    @Override
    public void merge(Mapper mergeWith, MergeContext mergeContext) throws MergeMappingException {
        super.merge(mergeWith, mergeContext);
        if (!this.getClass().equals(mergeWith.getClass())) {
            return;
        }
        if (!mergeContext.mergeFlags().simulate()) {
            this.analyzer = ((TokenCountFieldMapper) mergeWith).analyzer;
        }
    }

    @Override
    protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException {
        super.doXContentBody(builder, includeDefaults, params);

        builder.field("analyzer", analyzer());
    }
}
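The TypeParser above makes `analyzer` mandatory because there is no sensible default for counting tokens. A minimal sketch of a mapping it would reject (field name illustrative; the error text matches the exception thrown above):

[source,java]
--------------------------------------------------
import java.io.IOException;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

public class AnalyzerIsRequired {
    // Parsing this mapping fails with
    // "Analyzer must be set for field [word_count] but wasn't."
    static String invalidMapping() throws IOException {
        return jsonBuilder().startObject()
                .startObject("doc").startObject("properties")
                    .startObject("word_count")
                        .field("type", "token_count") // missing required "analyzer"
                    .endObject()
                .endObject().endObject()
            .endObject().string();
    }
}
--------------------------------------------------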
@@ -0,0 +1,222 @@
/*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.mapper.core;

import com.carrotsearch.randomizedtesting.annotations.Name;
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import com.google.common.collect.ImmutableList;
import org.apache.lucene.util.LuceneTestCase;
import org.elasticsearch.ElasticSearchException;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.facet.terms.TermsFacet;
import org.elasticsearch.search.facet.terms.TermsFacetBuilder;
import org.elasticsearch.test.ElasticsearchIntegrationTest;
import org.junit.Test;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.hamcrest.Matchers.*;

public class TokenCountFieldMapperIntegrationTests extends ElasticsearchIntegrationTest {
    @ParametersFactory
    public static Iterable<Object[]> buildParameters() {
        List<Object[]> parameters = new ArrayList<Object[]>();
        for (boolean storeCountedFields : new boolean[] { true, false }) {
            for (boolean loadCountedFields : new boolean[] { true, false }) {
                parameters.add(new Object[] { storeCountedFields, loadCountedFields });
            }
        }
        return parameters;
    }

    private final boolean storeCountedFields;
    private final boolean loadCountedFields;

    public TokenCountFieldMapperIntegrationTests(@Name("storeCountedFields") boolean storeCountedFields,
            @Name("loadCountedFields") boolean loadCountedFields) {
        this.storeCountedFields = storeCountedFields;
        this.loadCountedFields = loadCountedFields;
    }

    /**
     * It is possible to get the token count in a search response.
     */
    @Test
    public void searchReturnsTokenCount() throws ElasticSearchException, IOException {
        init();

        assertSearchReturns(searchById("single"), "single");
        assertSearchReturns(searchById("bulk1"), "bulk1");
        assertSearchReturns(searchById("bulk2"), "bulk2");
        assertSearchReturns(searchById("multi"), "multi");
        assertSearchReturns(searchById("multibulk1"), "multibulk1");
        assertSearchReturns(searchById("multibulk2"), "multibulk2");
    }

    /**
     * It is possible to search by token count.
     */
    @Test
    public void searchByTokenCount() throws ElasticSearchException, IOException {
        init();

        assertSearchReturns(searchByNumericRange(4, 4).get(), "single");
        assertSearchReturns(searchByNumericRange(10, 10).get(), "multibulk2");
        assertSearchReturns(searchByNumericRange(7, 10).get(), "multi", "multibulk1", "multibulk2");
        assertSearchReturns(searchByNumericRange(1, 10).get(), "single", "bulk1", "bulk2", "multi", "multibulk1", "multibulk2");
        assertSearchReturns(searchByNumericRange(12, 12).get());
    }

    /**
     * It is possible to facet on the token count.
     */
    @Test
    public void facetByTokenCount() throws ElasticSearchException, IOException {
        init();

        String facetField = randomFrom(ImmutableList.of(
                "foo.token_count", "foo.token_count_unstored", "foo.token_count_with_doc_values"));
        SearchResponse result = searchByNumericRange(1, 10)
                .addFacet(new TermsFacetBuilder("facet").field(facetField)).get();
        assertSearchReturns(result, "single", "bulk1", "bulk2", "multi", "multibulk1", "multibulk2");
        assertThat(result.getFacets().facets().size(), equalTo(1));
        TermsFacet facet = (TermsFacet) result.getFacets().facets().get(0);
        assertThat(facet.getEntries().size(), equalTo(9));
    }

    private void init() throws ElasticSearchException, IOException {
        prepareCreate("test").addMapping("test", jsonBuilder().startObject()
                .startObject("test")
                    .startObject("properties")
                        .startObject("foo")
                            .field("type", "multi_field")
                            .startObject("fields")
                                .startObject("foo")
                                    .field("type", "string")
                                    .field("store", storeCountedFields)
                                    .field("analyzer", "simple")
                                .endObject()
                                .startObject("token_count")
                                    .field("type", "token_count")
                                    .field("analyzer", "standard")
                                    .field("store", true)
                                .endObject()
                                .startObject("token_count_unstored")
                                    .field("type", "token_count")
                                    .field("analyzer", "standard")
                                .endObject()
                                .startObject("token_count_with_doc_values")
                                    .field("type", "token_count")
                                    .field("analyzer", "standard")
                                    .startObject("fielddata")
                                        .field("format", LuceneTestCase.defaultCodecSupportsSortedSet() ? "doc_values" : null)
                                    .endObject()
                                .endObject()
                            .endObject()
                        .endObject()
                    .endObject()
                .endObject().endObject()).get();
        ensureGreen();

        assertTrue(prepareIndex("single", "I have four terms").get().isCreated());
        BulkResponse bulk = client().prepareBulk()
                .add(prepareIndex("bulk1", "bulk three terms"))
                .add(prepareIndex("bulk2", "this has five bulk terms")).get();
        assertFalse(bulk.buildFailureMessage(), bulk.hasFailures());
        assertTrue(prepareIndex("multi", "two terms", "wow now I have seven lucky terms").get().isCreated());
        bulk = client().prepareBulk()
                .add(prepareIndex("multibulk1", "one", "oh wow now I have eight unlucky terms"))
                .add(prepareIndex("multibulk2", "six is a bunch of terms", "ten! ten terms is just crazy! too many too count!")).get();
        assertFalse(bulk.buildFailureMessage(), bulk.hasFailures());

        assertThat(refresh().getFailedShards(), equalTo(0));
    }

    private IndexRequestBuilder prepareIndex(String id, String... texts) throws IOException {
        return client().prepareIndex("test", "test", id).setSource("foo", texts);
    }

    private SearchResponse searchById(String id) {
        return prepareSearch().setQuery(QueryBuilders.termQuery("_id", id)).get();
    }

    private SearchRequestBuilder searchByNumericRange(int low, int high) {
        return prepareSearch().setQuery(QueryBuilders.rangeQuery(randomFrom(
                ImmutableList.of("foo.token_count", "foo.token_count_unstored", "foo.token_count_with_doc_values")
        )).gte(low).lte(high));
    }

    private SearchRequestBuilder prepareSearch() {
        SearchRequestBuilder request = client().prepareSearch("test").setTypes("test");
        request.addField("foo.token_count");
        if (loadCountedFields) {
            request.addField("foo");
        }
        return request;
    }

    private void assertSearchReturns(SearchResponse result, String... ids) {
        assertThat(result.getHits().getTotalHits(), equalTo((long) ids.length));
        assertThat(result.getHits().hits().length, equalTo(ids.length));
        List<String> foundIds = new ArrayList<String>();
        for (SearchHit hit : result.getHits()) {
            foundIds.add(hit.id());
        }
        assertThat(foundIds, containsInAnyOrder(ids));
        for (SearchHit hit : result.getHits()) {
            String id = hit.id();
            if (id.equals("single")) {
                assertSearchHit(hit, 4);
            } else if (id.equals("bulk1")) {
                assertSearchHit(hit, 3);
            } else if (id.equals("bulk2")) {
                assertSearchHit(hit, 5);
            } else if (id.equals("multi")) {
                assertSearchHit(hit, 2, 7);
            } else if (id.equals("multibulk1")) {
                assertSearchHit(hit, 1, 8);
            } else if (id.equals("multibulk2")) {
                assertSearchHit(hit, 6, 10);
            } else {
                throw new ElasticSearchException("Unexpected response!");
            }
        }
    }

    private void assertSearchHit(SearchHit hit, int... termCounts) {
        assertThat(hit.field("foo.token_count"), not(nullValue()));
        assertThat(hit.field("foo.token_count").values().size(), equalTo(termCounts.length));
        for (int i = 0; i < termCounts.length; i++) {
            assertThat((Integer) hit.field("foo.token_count").values().get(i), equalTo(termCounts[i]));
        }

        if (loadCountedFields && storeCountedFields) {
            assertThat(hit.field("foo").values().size(), equalTo(termCounts.length));
        }
    }
}
@@ -0,0 +1,92 @@
/*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.mapper.core;

import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.MapperTestUtils;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;

import static org.elasticsearch.index.mapper.DocumentMapper.MergeFlags.mergeFlags;
import static org.hamcrest.Matchers.equalTo;

/**
 * Test for {@link TokenCountFieldMapper}.
 */
public class TokenCountFieldMapperTests extends ElasticsearchTestCase {
    @Test
    public void testMerge() throws IOException {
        String stage1Mapping = XContentFactory.jsonBuilder().startObject()
                .startObject("person")
                    .startObject("properties")
                        .startObject("tc")
                            .field("type", "token_count")
                            .field("analyzer", "keyword")
                        .endObject()
                    .endObject()
                .endObject().endObject().string();
        DocumentMapper stage1 = MapperTestUtils.newParser().parse(stage1Mapping);

        String stage2Mapping = XContentFactory.jsonBuilder().startObject()
                .startObject("person")
                    .startObject("properties")
                        .startObject("tc")
                            .field("type", "token_count")
                            .field("analyzer", "standard")
                        .endObject()
                    .endObject()
                .endObject().endObject().string();
        DocumentMapper stage2 = MapperTestUtils.newParser().parse(stage2Mapping);

        DocumentMapper.MergeResult mergeResult = stage1.merge(stage2, mergeFlags().simulate(true));
        assertThat(mergeResult.hasConflicts(), equalTo(false));
        // Just simulated so the merge hasn't happened yet
        assertThat(((TokenCountFieldMapper) stage1.mappers().smartName("tc").mapper()).analyzer(), equalTo("keyword"));

        mergeResult = stage1.merge(stage2, mergeFlags().simulate(false));
        assertThat(mergeResult.hasConflicts(), equalTo(false));
        // Now the merge has actually happened so the analyzer is updated
        assertThat(((TokenCountFieldMapper) stage1.mappers().smartName("tc").mapper()).analyzer(), equalTo("standard"));
    }

    @Test
    public void testCountPositions() throws IOException {
        // We're looking to make sure that we:
        Token t1 = new Token();      // Don't count tokens without an increment
        t1.setPositionIncrement(0);
        Token t2 = new Token();
        t2.setPositionIncrement(1);  // Count normal tokens with one increment
        Token t3 = new Token();
        t3.setPositionIncrement(2);  // Count funny tokens with more than one increment
        int finalTokenIncrement = 4; // Count the final token increment on the rare token streams that have them
        Token[] tokens = new Token[] {t1, t2, t3};
        Collections.shuffle(Arrays.asList(tokens), getRandom());
        TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens);
        assertThat(TokenCountFieldMapper.countPositions(tokenStream), equalTo(7));
    }
}