Add option "enable_position_increments" with default value `true`. If the option is set to `false`, the indexed value is the number of tokens (not the position-increment count).
This commit is contained in:
parent
525101b64d
commit
4a45579506
|
@ -22,10 +22,8 @@ package org.elasticsearch.index.mapper;
|
|||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexOptions;
|
||||
import org.apache.lucene.index.IndexableField;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.index.analysis.NamedAnalyzer;
|
||||
|
@ -36,6 +34,7 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
import static org.elasticsearch.common.xcontent.support.XContentMapValues.nodeIntegerValue;
|
||||
import static org.elasticsearch.common.xcontent.support.XContentMapValues.nodeBooleanValue;
|
||||
import static org.elasticsearch.index.mapper.TypeParsers.parseField;
|
||||
|
||||
/**
|
||||
|
@ -47,10 +46,12 @@ public class TokenCountFieldMapper extends FieldMapper {
|
|||
|
||||
public static class Defaults {
|
||||
public static final MappedFieldType FIELD_TYPE = new NumberFieldMapper.NumberFieldType(NumberFieldMapper.NumberType.INTEGER);
|
||||
public static final boolean DEFAULT_POSITION_INCREMENTS = true;
|
||||
}
|
||||
|
||||
public static class Builder extends FieldMapper.Builder<Builder, TokenCountFieldMapper> {
|
||||
private NamedAnalyzer analyzer;
|
||||
private boolean enablePositionIncrements = Defaults.DEFAULT_POSITION_INCREMENTS;
|
||||
|
||||
public Builder(String name) {
|
||||
super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE);
|
||||
|
@ -66,18 +67,26 @@ public class TokenCountFieldMapper extends FieldMapper {
|
|||
return analyzer;
|
||||
}
|
||||
|
||||
public Builder enablePositionIncrements(boolean enablePositionIncrements) {
|
||||
this.enablePositionIncrements = enablePositionIncrements;
|
||||
return this;
|
||||
}
|
||||
|
||||
public boolean enablePositionIncrements() {
|
||||
return enablePositionIncrements;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenCountFieldMapper build(BuilderContext context) {
|
||||
setupFieldType(context);
|
||||
return new TokenCountFieldMapper(name, fieldType, defaultFieldType,
|
||||
context.indexSettings(), analyzer, multiFieldsBuilder.build(this, context), copyTo);
|
||||
context.indexSettings(), analyzer, enablePositionIncrements, multiFieldsBuilder.build(this, context), copyTo);
|
||||
}
|
||||
}
|
||||
|
||||
public static class TypeParser implements Mapper.TypeParser {
|
||||
@Override
|
||||
@SuppressWarnings("unchecked")
|
||||
public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
|
||||
public Mapper.Builder<?,?> parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
|
||||
TokenCountFieldMapper.Builder builder = new TokenCountFieldMapper.Builder(name);
|
||||
for (Iterator<Map.Entry<String, Object>> iterator = node.entrySet().iterator(); iterator.hasNext();) {
|
||||
Map.Entry<String, Object> entry = iterator.next();
|
||||
|
@ -93,6 +102,9 @@ public class TokenCountFieldMapper extends FieldMapper {
|
|||
}
|
||||
builder.analyzer(analyzer);
|
||||
iterator.remove();
|
||||
} else if (propName.equals("enable_position_increments")) {
|
||||
builder.enablePositionIncrements(nodeBooleanValue(propNode));
|
||||
iterator.remove();
|
||||
}
|
||||
}
|
||||
parseField(builder, name, node, parserContext);
|
||||
|
@ -104,11 +116,13 @@ public class TokenCountFieldMapper extends FieldMapper {
|
|||
}
|
||||
|
||||
private NamedAnalyzer analyzer;
|
||||
private boolean enablePositionIncrements;
|
||||
|
||||
protected TokenCountFieldMapper(String simpleName, MappedFieldType fieldType, MappedFieldType defaultFieldType,
|
||||
Settings indexSettings, NamedAnalyzer analyzer, MultiFields multiFields, CopyTo copyTo) {
|
||||
Settings indexSettings, NamedAnalyzer analyzer, boolean enablePositionIncrements, MultiFields multiFields, CopyTo copyTo) {
|
||||
super(simpleName, fieldType, defaultFieldType, indexSettings, multiFields, copyTo);
|
||||
this.analyzer = analyzer;
|
||||
this.enablePositionIncrements = enablePositionIncrements;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -124,7 +138,7 @@ public class TokenCountFieldMapper extends FieldMapper {
|
|||
if (value == null) {
|
||||
tokenCount = (Integer) fieldType().nullValue();
|
||||
} else {
|
||||
tokenCount = countPositions(analyzer, name(), value);
|
||||
tokenCount = countPositions(analyzer, name(), value, enablePositionIncrements);
|
||||
}
|
||||
|
||||
boolean indexed = fieldType().indexOptions() != IndexOptions.NONE;
|
||||
|
@ -138,19 +152,26 @@ public class TokenCountFieldMapper extends FieldMapper {
|
|||
* @param analyzer analyzer to create token stream
|
||||
* @param fieldName field name to pass to analyzer
|
||||
* @param fieldValue field value to pass to analyzer
|
||||
* @param enablePositionIncrements whether position increments should be counted
|
||||
* @return the number of position increments in the token stream, or the number of tokens if {@code enablePositionIncrements} is {@code false}
|
||||
* @throws IOException if tokenStream throws it
|
||||
*/
|
||||
static int countPositions(Analyzer analyzer, String fieldName, String fieldValue) throws IOException {
|
||||
static int countPositions(Analyzer analyzer, String fieldName, String fieldValue, boolean enablePositionIncrements) throws IOException {
|
||||
try (TokenStream tokenStream = analyzer.tokenStream(fieldName, fieldValue)) {
|
||||
int count = 0;
|
||||
PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
|
||||
tokenStream.reset();
|
||||
while (tokenStream.incrementToken()) {
|
||||
count += position.getPositionIncrement();
|
||||
if (enablePositionIncrements) {
|
||||
count += position.getPositionIncrement();
|
||||
} else {
|
||||
count += Math.min(1, position.getPositionIncrement());
|
||||
}
|
||||
}
|
||||
tokenStream.end();
|
||||
count += position.getPositionIncrement();
|
||||
if (enablePositionIncrements) {
|
||||
count += position.getPositionIncrement();
|
||||
}
|
||||
return count;
|
||||
}
|
||||
}
|
||||
|
@ -163,6 +184,14 @@ public class TokenCountFieldMapper extends FieldMapper {
|
|||
return analyzer.name();
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicates if position increments are counted.
|
||||
* @return <code>true</code> if position increments are counted
|
||||
*/
|
||||
public boolean enablePositionIncrements() {
|
||||
return enablePositionIncrements;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String contentType() {
|
||||
return CONTENT_TYPE;
|
||||
|
@ -172,12 +201,16 @@ public class TokenCountFieldMapper extends FieldMapper {
|
|||
protected void doMerge(Mapper mergeWith, boolean updateAllTypes) {
|
||||
super.doMerge(mergeWith, updateAllTypes);
|
||||
this.analyzer = ((TokenCountFieldMapper) mergeWith).analyzer;
|
||||
this.enablePositionIncrements = ((TokenCountFieldMapper) mergeWith).enablePositionIncrements;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException {
|
||||
super.doXContentBody(builder, includeDefaults, params);
|
||||
builder.field("analyzer", analyzer());
|
||||
if (includeDefaults || enablePositionIncrements() != Defaults.DEFAULT_POSITION_INCREMENTS) {
|
||||
builder.field("enable_position_increments", enablePositionIncrements());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -131,6 +131,12 @@ public class TokenCountFieldMapperIntegrationIT extends ESIntegTestCase {
|
|||
.field("analyzer", "standard")
|
||||
.field("doc_values", true)
|
||||
.endObject()
|
||||
.startObject("token_count_without_position_increments")
|
||||
.field("type", "token_count")
|
||||
.field("analyzer", "english")
|
||||
.field("enable_position_increments", false)
|
||||
.field("store", true)
|
||||
.endObject()
|
||||
.endObject()
|
||||
.endObject()
|
||||
.endObject()
|
||||
|
@ -169,6 +175,7 @@ public class TokenCountFieldMapperIntegrationIT extends ESIntegTestCase {
|
|||
private SearchRequestBuilder prepareSearch() {
|
||||
SearchRequestBuilder request = client().prepareSearch("test").setTypes("test");
|
||||
request.addStoredField("foo.token_count");
|
||||
request.addStoredField("foo.token_count_without_position_increments");
|
||||
if (loadCountedFields) {
|
||||
request.addStoredField("foo");
|
||||
}
|
||||
|
@ -186,32 +193,38 @@ public class TokenCountFieldMapperIntegrationIT extends ESIntegTestCase {
|
|||
for (SearchHit hit : result.getHits()) {
|
||||
String id = hit.getId();
|
||||
if (id.equals("single")) {
|
||||
assertSearchHit(hit, 4);
|
||||
assertSearchHit(hit, new int[]{4}, new int[]{4});
|
||||
} else if (id.equals("bulk1")) {
|
||||
assertSearchHit(hit, 3);
|
||||
assertSearchHit(hit, new int[]{3}, new int[]{3});
|
||||
} else if (id.equals("bulk2")) {
|
||||
assertSearchHit(hit, 5);
|
||||
assertSearchHit(hit, new int[]{5}, new int[]{4});
|
||||
} else if (id.equals("multi")) {
|
||||
assertSearchHit(hit, 2, 7);
|
||||
assertSearchHit(hit, new int[]{2, 7}, new int[]{2, 7});
|
||||
} else if (id.equals("multibulk1")) {
|
||||
assertSearchHit(hit, 1, 8);
|
||||
assertSearchHit(hit, new int[]{1, 8}, new int[]{1, 8});
|
||||
} else if (id.equals("multibulk2")) {
|
||||
assertSearchHit(hit, 6, 10);
|
||||
assertSearchHit(hit, new int[]{6, 10}, new int[]{3, 9});
|
||||
} else {
|
||||
throw new ElasticsearchException("Unexpected response!");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void assertSearchHit(SearchHit hit, int... termCounts) {
|
||||
private void assertSearchHit(SearchHit hit, int[] standardTermCounts, int[] englishTermCounts) {
|
||||
assertThat(hit.field("foo.token_count"), not(nullValue()));
|
||||
assertThat(hit.field("foo.token_count").getValues().size(), equalTo(termCounts.length));
|
||||
for (int i = 0; i < termCounts.length; i++) {
|
||||
assertThat((Integer) hit.field("foo.token_count").getValues().get(i), equalTo(termCounts[i]));
|
||||
assertThat(hit.field("foo.token_count").getValues().size(), equalTo(standardTermCounts.length));
|
||||
for (int i = 0; i < standardTermCounts.length; i++) {
|
||||
assertThat((Integer) hit.field("foo.token_count").getValues().get(i), equalTo(standardTermCounts[i]));
|
||||
}
|
||||
|
||||
assertThat(hit.field("foo.token_count_without_position_increments"), not(nullValue()));
|
||||
assertThat(hit.field("foo.token_count_without_position_increments").getValues().size(), equalTo(englishTermCounts.length));
|
||||
for (int i = 0; i < englishTermCounts.length; i++) {
|
||||
assertThat((Integer) hit.field("foo.token_count_without_position_increments").getValues().get(i), equalTo(englishTermCounts[i]));
|
||||
}
|
||||
|
||||
if (loadCountedFields && storeCountedFields) {
|
||||
assertThat(hit.field("foo").getValues().size(), equalTo(termCounts.length));
|
||||
assertThat(hit.field("foo").getValues().size(), equalTo(standardTermCounts.length));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,23 +24,18 @@ import org.apache.lucene.analysis.CannedTokenStream;
|
|||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.elasticsearch.Version;
|
||||
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
||||
import org.elasticsearch.common.compress.CompressedXContent;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.common.xcontent.XContentFactory;
|
||||
import org.elasticsearch.index.IndexService;
|
||||
import org.elasticsearch.plugins.Plugin;
|
||||
import org.elasticsearch.test.ESSingleNodeTestCase;
|
||||
import org.elasticsearch.test.InternalSettingsPlugin;
|
||||
import org.elasticsearch.test.VersionUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
|
||||
import static com.carrotsearch.randomizedtesting.RandomizedTest.getRandom;
|
||||
import static org.hamcrest.Matchers.containsString;
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
|
||||
|
@ -80,26 +75,49 @@ public class TokenCountFieldMapperTests extends ESSingleNodeTestCase {
|
|||
assertThat(((TokenCountFieldMapper) stage2.mappers().smartNameFieldMapper("tc")).analyzer(), equalTo("standard"));
|
||||
}
|
||||
|
||||
public void testCountPositions() throws IOException {
|
||||
// We're looking to make sure that we:
|
||||
Token t1 = new Token(); // Don't count tokens without an increment
|
||||
/**
|
||||
* When position increments are counted, we're looking to make sure that we:
|
||||
- don't count tokens without an increment
|
||||
- count normal tokens with one increment
|
||||
- count funny tokens with more than one increment
|
||||
- count the final token increments on the rare token streams that have them
|
||||
*/
|
||||
public void testCountPositionsWithIncrements() throws IOException {
|
||||
Analyzer analyzer = createMockAnalyzer();
|
||||
assertThat(TokenCountFieldMapper.countPositions(analyzer, "", "", true), equalTo(7));
|
||||
}
|
||||
|
||||
/**
|
||||
* When position increments are not counted (only positions are counted), we're looking to make sure that we:
|
||||
- don't count tokens without an increment
|
||||
- count normal tokens with one increment
|
||||
- count funny tokens with more than one increment as only one
|
||||
- don't count the final token increments on the rare token streams that have them
|
||||
*/
|
||||
public void testCountPositionsWithoutIncrements() throws IOException {
|
||||
Analyzer analyzer = createMockAnalyzer();
|
||||
assertThat(TokenCountFieldMapper.countPositions(analyzer, "", "", false), equalTo(2));
|
||||
}
|
||||
|
||||
private Analyzer createMockAnalyzer() {
|
||||
Token t1 = new Token(); // Token without an increment
|
||||
t1.setPositionIncrement(0);
|
||||
Token t2 = new Token();
|
||||
t2.setPositionIncrement(1); // Count normal tokens with one increment
|
||||
t2.setPositionIncrement(1); // Normal token with one increment
|
||||
Token t3 = new Token();
|
||||
t2.setPositionIncrement(2); // Count funny tokens with more than one increment
|
||||
int finalTokenIncrement = 4; // Count the final token increment on the rare token streams that have them
|
||||
t2.setPositionIncrement(2); // Funny token with more than one increment
|
||||
int finalTokenIncrement = 4; // Final token increment
|
||||
Token[] tokens = new Token[] {t1, t2, t3};
|
||||
Collections.shuffle(Arrays.asList(tokens), random());
|
||||
final TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens);
|
||||
// TODO: we have no CannedAnalyzer?
|
||||
Analyzer analyzer = new Analyzer() {
|
||||
@Override
|
||||
public TokenStreamComponents createComponents(String fieldName) {
|
||||
return new TokenStreamComponents(new MockTokenizer(), tokenStream);
|
||||
}
|
||||
};
|
||||
assertThat(TokenCountFieldMapper.countPositions(analyzer, "", ""), equalTo(7));
|
||||
@Override
|
||||
public TokenStreamComponents createComponents(String fieldName) {
|
||||
return new TokenStreamComponents(new MockTokenizer(), tokenStream);
|
||||
}
|
||||
};
|
||||
return analyzer;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -48,12 +48,6 @@ GET my_index/_search
|
|||
<2> The `name.length` field is a `token_count` <<multi-fields,multi-field>> which will index the number of tokens in the `name` field.
|
||||
<3> This query matches only the document containing `Rachel Alice Williams`, as it contains three tokens.
|
||||
|
||||
[NOTE]
|
||||
===================================================================
|
||||
Technically the `token_count` type sums position increments rather than
|
||||
counting tokens. This means that even if the analyzer filters out stop
|
||||
words they are included in the count.
|
||||
===================================================================
|
||||
|
||||
[[token-count-params]]
|
||||
==== Parameters for `token_count` fields
|
||||
|
@ -68,6 +62,12 @@ The following parameters are accepted by `token_count` fields:
|
|||
value. Required. For best performance, use an analyzer without token
|
||||
filters.
|
||||
|
||||
`enable_position_increments`::
|
||||
|
||||
Indicates if position increments should be counted.
|
||||
Set to `false` if you don't want to count tokens removed by analyzer filters (like <<analysis-stop-tokenfilter,`stop`>>).
|
||||
Defaults to `true`.
|
||||
|
||||
<<mapping-boost,`boost`>>::
|
||||
|
||||
Mapping field-level query time boosting. Accepts a floating point number, defaults
|
||||
|
|
Loading…
Reference in New Issue