From 4a45579506513f3a88cd51d79d2ff0c342021f93 Mon Sep 17 00:00:00 2001 From: Fabien Baligand Date: Fri, 21 Apr 2017 00:53:28 +0200 Subject: [PATCH] token_count type : add an option to count tokens (fix #23227) (#24175) Add option "enable_position_increments" with default value true. If option is set to false, indexed value is the number of tokens (not position increments count) --- .../index/mapper/TokenCountFieldMapper.java | 53 +++++++++++++++---- .../TokenCountFieldMapperIntegrationIT.java | 35 ++++++++---- .../mapper/TokenCountFieldMapperTests.java | 52 ++++++++++++------ .../mapping/types/token-count.asciidoc | 12 ++--- 4 files changed, 108 insertions(+), 44 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/index/mapper/TokenCountFieldMapper.java b/core/src/main/java/org/elasticsearch/index/mapper/TokenCountFieldMapper.java index a2d40cd08bd..2ed6658e87c 100644 --- a/core/src/main/java/org/elasticsearch/index/mapper/TokenCountFieldMapper.java +++ b/core/src/main/java/org/elasticsearch/index/mapper/TokenCountFieldMapper.java @@ -22,10 +22,8 @@ package org.elasticsearch.index.mapper; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexableField; -import org.elasticsearch.Version; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.index.analysis.NamedAnalyzer; @@ -36,6 +34,7 @@ import java.util.List; import java.util.Map; import static org.elasticsearch.common.xcontent.support.XContentMapValues.nodeIntegerValue; +import static org.elasticsearch.common.xcontent.support.XContentMapValues.nodeBooleanValue; import static org.elasticsearch.index.mapper.TypeParsers.parseField; /** @@ -47,10 +46,12 @@ public class TokenCountFieldMapper extends FieldMapper { public static class Defaults { public static final MappedFieldType FIELD_TYPE = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER); + public static final boolean DEFAULT_POSITION_INCREMENTS = true; } public static class Builder extends FieldMapper.Builder { private NamedAnalyzer analyzer; + private boolean enablePositionIncrements = Defaults.DEFAULT_POSITION_INCREMENTS; public Builder(String name) { super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE); @@ -66,18 +67,26 @@ public class TokenCountFieldMapper extends FieldMapper { return analyzer; } + public Builder enablePositionIncrements(boolean enablePositionIncrements) { + this.enablePositionIncrements = enablePositionIncrements; + return this; + } + + public boolean enablePositionIncrements() { + return enablePositionIncrements; + } + @Override public TokenCountFieldMapper build(BuilderContext context) { setupFieldType(context); return new TokenCountFieldMapper(name, fieldType, defaultFieldType, - context.indexSettings(), analyzer, multiFieldsBuilder.build(this, context), copyTo); + context.indexSettings(), analyzer, enablePositionIncrements, multiFieldsBuilder.build(this, context), copyTo); } } public static class TypeParser implements Mapper.TypeParser { @Override - @SuppressWarnings("unchecked") - public Mapper.Builder parse(String name, Map node, ParserContext parserContext) throws MapperParsingException { + public Mapper.Builder parse(String name, Map node, ParserContext parserContext) throws MapperParsingException { TokenCountFieldMapper.Builder builder = new TokenCountFieldMapper.Builder(name); for (Iterator> iterator = node.entrySet().iterator(); iterator.hasNext();) { Map.Entry entry = iterator.next(); @@ -93,6 +102,9 @@ public class TokenCountFieldMapper extends FieldMapper { } builder.analyzer(analyzer); iterator.remove(); + } else if (propName.equals("enable_position_increments")) { + builder.enablePositionIncrements(nodeBooleanValue(propNode)); + iterator.remove(); } } parseField(builder, name, node, parserContext); @@ -104,11 +116,13 @@ public class TokenCountFieldMapper extends FieldMapper { } private NamedAnalyzer analyzer; + private boolean enablePositionIncrements; protected TokenCountFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType, - Settings indexSettings, NamedAnalyzer analyzer, MultiFields multiFields, CopyTo copyTo) { + Settings indexSettings, NamedAnalyzer analyzer, boolean enablePositionIncrements, MultiFields multiFields, CopyTo copyTo) { super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo); this.analyzer = analyzer; + this.enablePositionIncrements = enablePositionIncrements; } @Override @@ -124,7 +138,7 @@ public class TokenCountFieldMapper extends FieldMapper { if (value == null) { tokenCount = (Integer) fieldType().nullValue(); } else { - tokenCount = countPositions(analyzer, name(), value); + tokenCount = countPositions(analyzer, name(), value, enablePositionIncrements); } boolean indexed = fieldType().indexOptions() != IndexOptions.NONE; @@ -138,19 +152,26 @@ public class TokenCountFieldMapper extends FieldMapper { * @param analyzer analyzer to create token stream * @param fieldName field name to pass to analyzer * @param fieldValue field value to pass to analyzer + * @param enablePositionIncrements should we count position increments ? * @return number of position increments in a token stream * @throws IOException if tokenStream throws it */ - static int countPositions(Analyzer analyzer, String fieldName, String fieldValue) throws IOException { + static int countPositions(Analyzer analyzer, String fieldName, String fieldValue, boolean enablePositionIncrements) throws IOException { try (TokenStream tokenStream = analyzer.tokenStream(fieldName, fieldValue)) { int count = 0; PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { - count += position.getPositionIncrement(); + if (enablePositionIncrements) { + count += position.getPositionIncrement(); + } else { + count += Math.min(1, position.getPositionIncrement()); + } } tokenStream.end(); - count += position.getPositionIncrement(); + if (enablePositionIncrements) { + count += position.getPositionIncrement(); + } return count; } } @@ -163,6 +184,14 @@ public class TokenCountFieldMapper extends FieldMapper { return analyzer.name(); } + /** + * Indicates if position increments are counted. + * @return true if position increments are counted + */ + public boolean enablePositionIncrements() { + return enablePositionIncrements; + } + @Override protected String contentType() { return CONTENT_TYPE; @@ -172,12 +201,16 @@ public class TokenCountFieldMapper extends FieldMapper { protected void doMerge(Mapper mergeWith, boolean updateAllTypes) { super.doMerge(mergeWith, updateAllTypes); this.analyzer = ((TokenCountFieldMapper) mergeWith).analyzer; + this.enablePositionIncrements = ((TokenCountFieldMapper) mergeWith).enablePositionIncrements; } @Override protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException { super.doXContentBody(builder, includeDefaults, params); builder.field("analyzer", analyzer()); + if (includeDefaults || enablePositionIncrements() != Defaults.DEFAULT_POSITION_INCREMENTS) { + builder.field("enable_position_increments", enablePositionIncrements()); + } } } diff --git a/core/src/test/java/org/elasticsearch/index/mapper/TokenCountFieldMapperIntegrationIT.java b/core/src/test/java/org/elasticsearch/index/mapper/TokenCountFieldMapperIntegrationIT.java index 48307b6349c..75b588df85a 100644 --- a/core/src/test/java/org/elasticsearch/index/mapper/TokenCountFieldMapperIntegrationIT.java +++ b/core/src/test/java/org/elasticsearch/index/mapper/TokenCountFieldMapperIntegrationIT.java @@ -131,6 +131,12 @@ public class TokenCountFieldMapperIntegrationIT extends ESIntegTestCase { .field("analyzer", "standard") .field("doc_values", true) .endObject() + .startObject("token_count_without_position_increments") + .field("type", "token_count") + .field("analyzer", "english") + .field("enable_position_increments", false) + .field("store", true) + .endObject() .endObject() .endObject() .endObject() @@ -169,6 +175,7 @@ public class TokenCountFieldMapperIntegrationIT extends ESIntegTestCase { private SearchRequestBuilder prepareSearch() { SearchRequestBuilder request = client().prepareSearch("test").setTypes("test"); request.addStoredField("foo.token_count"); + request.addStoredField("foo.token_count_without_position_increments"); if (loadCountedFields) { request.addStoredField("foo"); } @@ -186,32 +193,38 @@ public class TokenCountFieldMapperIntegrationIT extends ESIntegTestCase { for (SearchHit hit : result.getHits()) { String id = hit.getId(); if (id.equals("single")) { - assertSearchHit(hit, 4); + assertSearchHit(hit, new int[]{4}, new int[]{4}); } else if (id.equals("bulk1")) { - assertSearchHit(hit, 3); + assertSearchHit(hit, new int[]{3}, new int[]{3}); } else if (id.equals("bulk2")) { - assertSearchHit(hit, 5); + assertSearchHit(hit, new int[]{5}, new int[]{4}); } else if (id.equals("multi")) { - assertSearchHit(hit, 2, 7); + assertSearchHit(hit, new int[]{2, 7}, new int[]{2, 7}); } else if (id.equals("multibulk1")) { - assertSearchHit(hit, 1, 8); + assertSearchHit(hit, new int[]{1, 8}, new int[]{1, 8}); } else if (id.equals("multibulk2")) { - assertSearchHit(hit, 6, 10); + assertSearchHit(hit, new int[]{6, 10}, new int[]{3, 9}); } else { throw new ElasticsearchException("Unexpected response!"); } } } - private void assertSearchHit(SearchHit hit, int... termCounts) { + private void assertSearchHit(SearchHit hit, int[] standardTermCounts, int[] englishTermCounts) { assertThat(hit.field("foo.token_count"), not(nullValue())); - assertThat(hit.field("foo.token_count").getValues().size(), equalTo(termCounts.length)); - for (int i = 0; i < termCounts.length; i++) { - assertThat((Integer) hit.field("foo.token_count").getValues().get(i), equalTo(termCounts[i])); + assertThat(hit.field("foo.token_count").getValues().size(), equalTo(standardTermCounts.length)); + for (int i = 0; i < standardTermCounts.length; i++) { + assertThat((Integer) hit.field("foo.token_count").getValues().get(i), equalTo(standardTermCounts[i])); + } + + assertThat(hit.field("foo.token_count_without_position_increments"), not(nullValue())); + assertThat(hit.field("foo.token_count_without_position_increments").getValues().size(), equalTo(englishTermCounts.length)); + for (int i = 0; i < englishTermCounts.length; i++) { + assertThat((Integer) hit.field("foo.token_count_without_position_increments").getValues().get(i), equalTo(englishTermCounts[i])); } if (loadCountedFields && storeCountedFields) { - assertThat(hit.field("foo").getValues().size(), equalTo(termCounts.length)); + assertThat(hit.field("foo").getValues().size(), equalTo(standardTermCounts.length)); } } } diff --git a/core/src/test/java/org/elasticsearch/index/mapper/TokenCountFieldMapperTests.java b/core/src/test/java/org/elasticsearch/index/mapper/TokenCountFieldMapperTests.java index 02128a4254a..0f976e12f39 100644 --- a/core/src/test/java/org/elasticsearch/index/mapper/TokenCountFieldMapperTests.java +++ b/core/src/test/java/org/elasticsearch/index/mapper/TokenCountFieldMapperTests.java @@ -24,23 +24,18 @@ import org.apache.lucene.analysis.CannedTokenStream; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; -import org.elasticsearch.Version; -import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.compress.CompressedXContent; -import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.index.IndexService; import org.elasticsearch.plugins.Plugin; import org.elasticsearch.test.ESSingleNodeTestCase; import org.elasticsearch.test.InternalSettingsPlugin; -import org.elasticsearch.test.VersionUtils; import java.io.IOException; import java.util.Arrays; import java.util.Collection; import java.util.Collections; -import static com.carrotsearch.randomizedtesting.RandomizedTest.getRandom; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; @@ -80,26 +75,49 @@ public class TokenCountFieldMapperTests extends ESSingleNodeTestCase { assertThat(((TokenCountFieldMapper) stage2.mappers().smartNameFieldMapper("tc")).analyzer(), equalTo("standard")); } - public void testCountPositions() throws IOException { - // We're looking to make sure that we: - Token t1 = new Token(); // Don't count tokens without an increment + /** + * When position increments are counted, we're looking to make sure that we: + - don't count tokens without an increment + - count normal tokens with one increment + - count funny tokens with more than one increment + - count the final token increments on the rare token streams that have them + */ + public void testCountPositionsWithIncrements() throws IOException { + Analyzer analyzer = createMockAnalyzer(); + assertThat(TokenCountFieldMapper.countPositions(analyzer, "", "", true), equalTo(7)); + } + + /** + * When position increments are not counted (only positions are counted), we're looking to make sure that we: + - don't count tokens without an increment + - count normal tokens with one increment + - count funny tokens with more than one increment as only one + - don't count the final token increments on the rare token streams that have them + */ + public void testCountPositionsWithoutIncrements() throws IOException { + Analyzer analyzer = createMockAnalyzer(); + assertThat(TokenCountFieldMapper.countPositions(analyzer, "", "", false), equalTo(2)); + } + + private Analyzer createMockAnalyzer() { + Token t1 = new Token(); // Token without an increment t1.setPositionIncrement(0); Token t2 = new Token(); - t2.setPositionIncrement(1); // Count normal tokens with one increment + t2.setPositionIncrement(1); // Normal token with one increment Token t3 = new Token(); - t2.setPositionIncrement(2); // Count funny tokens with more than one increment - int finalTokenIncrement = 4; // Count the final token increment on the rare token streams that have them + t2.setPositionIncrement(2); // Funny token with more than one increment + int finalTokenIncrement = 4; // Final token increment Token[] tokens = new Token[] {t1, t2, t3}; Collections.shuffle(Arrays.asList(tokens), random()); final TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens); // TODO: we have no CannedAnalyzer? Analyzer analyzer = new Analyzer() { - @Override - public TokenStreamComponents createComponents(String fieldName) { - return new TokenStreamComponents(new MockTokenizer(), tokenStream); - } - }; - assertThat(TokenCountFieldMapper.countPositions(analyzer, "", ""), equalTo(7)); + @Override + public TokenStreamComponents createComponents(String fieldName) { + return new TokenStreamComponents(new MockTokenizer(), tokenStream); + } + }; + return analyzer; } @Override diff --git a/docs/reference/mapping/types/token-count.asciidoc b/docs/reference/mapping/types/token-count.asciidoc index 704a94de0f7..60a95ec1d40 100644 --- a/docs/reference/mapping/types/token-count.asciidoc +++ b/docs/reference/mapping/types/token-count.asciidoc @@ -48,12 +48,6 @@ GET my_index/_search <2> The `name.length` field is a `token_count` <> which will index the number of tokens in the `name` field. <3> This query matches only the document containing `Rachel Alice Williams`, as it contains three tokens. -[NOTE] -=================================================================== -Technically the `token_count` type sums position increments rather than -counting tokens. This means that even if the analyzer filters out stop -words they are included in the count. -=================================================================== [[token-count-params]] ==== Parameters for `token_count` fields @@ -68,6 +62,12 @@ The following parameters are accepted by `token_count` fields: value. Required. For best performance, use an analyzer without token filters. +`enable_position_increments`:: + +Indicates if position increments should be counted. +Set to `false` if you don't want to count tokens removed by analyzer filters (like <>). +Defaults to `true`. + <>:: Mapping field-level query time boosting. Accepts a floating point number, defaults