Allow string fields to store token counts
To use it, send a string to a field of type `token_count`; the field indexes the number of tokens in the string rather than the string itself. This makes the most sense as part of a `multi_field`, so the original string is still indexed too.
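For example, roughly (the index and field names here are only illustrative, and the Java API calls mirror the integration test added below):

[source,java]
--------------------------------------------------
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.index.query.QueryBuilders;

public class WordCountExample {
    // Assumes "message" is mapped as a multi_field whose stored "word_count"
    // sub-field has type "token_count", as in the documentation example below.
    static Number fetchWordCount(Client client) {
        client.prepareIndex("tweets", "tweet", "1")
                .setSource("message", "some text to count").get();
        client.admin().indices().prepareRefresh("tweets").get();

        SearchResponse response = client.prepareSearch("tweets")
                .setQuery(QueryBuilders.matchAllQuery())
                .addField("message.word_count") // the count comes back like any stored number
                .get();
        return (Number) response.getHits().getAt(0).field("message.word_count").value();
    }
}
--------------------------------------------------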
parent 3494ac252e
commit 7690b40ec6
@@ -212,6 +212,49 @@ defaults to `true` or to the parent `object` type setting.
 |=======================================================================
 
+[float]
+[[token_count]]
+==== Token Count
+
+added[0.90.8]
+
+The `token_count` type maps to the JSON string type but indexes and stores
+the number of tokens in the string rather than the string itself. For
+example:
+
+[source,js]
+--------------------------------------------------
+{
+    "tweet" : {
+        "properties" : {
+            "message" : {
+                "type" : "multi_field",
+                "fields" : {
+                    "message": {
+                        "type": "string"
+                    },
+                    "word_count": {
+                        "type" : "token_count",
+                        "store" : "yes",
+                        "analyzer" : "standard"
+                    }
+                }
+            }
+        }
+    }
+}
+--------------------------------------------------
+
+All the configuration that can be specified for a number can also be
+specified for a `token_count`. The only extra configuration is the
+required `analyzer` setting, which names the analyzer used to break the
+string into tokens. For best performance, use an analyzer without token
+filters.
+
+[NOTE]
+===================================================================
+Technically the `token_count` type sums position increments rather than
+counting tokens. This means that even if the analyzer filters out stop
+words they are included in the count.
+===================================================================
+
 [float]
 [[date]]
 ==== Date
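The note above about summing position increments can be checked against plain Lucene. A minimal sketch, assuming Lucene 4.x and `StandardAnalyzer`'s default English stop set (this demo class is not part of the commit); it uses the same counting loop as `countPositions` later in this commit:

[source,java]
--------------------------------------------------
import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

public class PositionIncrementDemo {
    public static void main(String[] args) throws IOException {
        // "the" is removed by the stop filter, but it bumps the position
        // increment of "quick" to 2, so summing increments still counts it.
        TokenStream stream = new StandardAnalyzer(Version.LUCENE_46)
                .tokenStream("field", "the quick fox");
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        int count = 0;
        while (stream.incrementToken()) {
            count += posIncr.getPositionIncrement();
        }
        stream.end();
        count += posIncr.getPositionIncrement(); // trailing increment, usually 0 here
        stream.close();
        System.out.println(count); // prints 3, not 2
    }
}
--------------------------------------------------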
@@ -95,6 +95,7 @@ public class DocumentMapperParser extends AbstractIndexComponent {
                 .put(DateFieldMapper.CONTENT_TYPE, new DateFieldMapper.TypeParser())
                 .put(IpFieldMapper.CONTENT_TYPE, new IpFieldMapper.TypeParser())
                 .put(StringFieldMapper.CONTENT_TYPE, new StringFieldMapper.TypeParser())
+                .put(TokenCountFieldMapper.CONTENT_TYPE, new TokenCountFieldMapper.TypeParser())
                 .put(ObjectMapper.CONTENT_TYPE, new ObjectMapper.TypeParser())
                 .put(ObjectMapper.NESTED_CONTENT_TYPE, new ObjectMapper.TypeParser())
                 .put(MultiFieldMapper.CONTENT_TYPE, new MultiFieldMapper.TypeParser())
@@ -133,6 +133,10 @@ public final class MapperBuilders {
         return new IntegerFieldMapper.Builder(name);
     }
 
+    public static TokenCountFieldMapper.Builder tokenCountField(String name) {
+        return new TokenCountFieldMapper.Builder(name);
+    }
+
     public static LongFieldMapper.Builder longField(String name) {
         return new LongFieldMapper.Builder(name);
     }
@@ -313,7 +313,10 @@ public class IntegerFieldMapper extends NumberFieldMapper<Integer> {
                 }
             }
         }
+        addIntegerFields(fields, value, boost);
+    }
+
+    protected void addIntegerFields(List<Field> fields, int value, float boost) {
         if (fieldType.indexed() || fieldType.stored()) {
             CustomIntegerNumericField field = new CustomIntegerNumericField(this, value, fieldType);
             field.setBoost(boost);
@@ -326,6 +329,10 @@ public class IntegerFieldMapper extends NumberFieldMapper<Integer> {
         }
     }
 
+    protected Integer nullValue() {
+        return nullValue;
+    }
+
     @Override
     protected String contentType() {
         return CONTENT_TYPE;
@@ -259,57 +259,69 @@ public class StringFieldMapper extends AbstractFieldMapper<String> implements Al
 
     @Override
     protected void parseCreateField(ParseContext context, List<Field> fields) throws IOException {
-        String value = nullValue;
-        float boost = this.boost;
-        if (context.externalValueSet()) {
-            value = (String) context.externalValue();
-        } else {
-            XContentParser parser = context.parser();
-            if (parser.currentToken() == XContentParser.Token.VALUE_NULL) {
-                value = nullValue;
-            } else if (parser.currentToken() == XContentParser.Token.START_OBJECT) {
-                XContentParser.Token token;
-                String currentFieldName = null;
-                while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
-                    if (token == XContentParser.Token.FIELD_NAME) {
-                        currentFieldName = parser.currentName();
-                    } else {
-                        if ("value".equals(currentFieldName) || "_value".equals(currentFieldName)) {
-                            value = parser.textOrNull();
-                        } else if ("boost".equals(currentFieldName) || "_boost".equals(currentFieldName)) {
-                            boost = parser.floatValue();
-                        } else {
-                            throw new ElasticSearchIllegalArgumentException("unknown property [" + currentFieldName + "]");
-                        }
-                    }
-                }
-            } else {
-                value = parser.textOrNull();
-            }
-        }
-        if (value == null) {
+        ValueAndBoost valueAndBoost = parseCreateFieldForString(context, nullValue, boost);
+        if (valueAndBoost.value() == null) {
             return;
         }
-        if (ignoreAbove > 0 && value.length() > ignoreAbove) {
+        if (ignoreAbove > 0 && valueAndBoost.value().length() > ignoreAbove) {
             return;
         }
         if (context.includeInAll(includeInAll, this)) {
-            context.allEntries().addText(names.fullName(), value, boost);
+            context.allEntries().addText(names.fullName(), valueAndBoost.value(), valueAndBoost.boost());
         }
 
         if (fieldType.indexed() || fieldType.stored()) {
-            Field field = new StringField(names.indexName(), value, fieldType);
-            field.setBoost(boost);
+            Field field = new StringField(names.indexName(), valueAndBoost.value(), fieldType);
+            field.setBoost(valueAndBoost.boost());
             fields.add(field);
         }
         if (hasDocValues()) {
-            fields.add(new SortedSetDocValuesField(names.indexName(), new BytesRef(value)));
+            fields.add(new SortedSetDocValuesField(names.indexName(), new BytesRef(valueAndBoost.value())));
         }
         if (fields.isEmpty()) {
-            context.ignoredValue(names.indexName(), value);
+            context.ignoredValue(names.indexName(), valueAndBoost.value());
         }
     }
 
+    /**
+     * Parse a field as though it were a string.
+     * @param context parse context used during parsing
+     * @param nullValue value to use for null
+     * @param defaultBoost default boost value returned unless overwritten in the field
+     * @return the parsed field and the boost either parsed or defaulted
+     * @throws IOException if thrown while parsing
+     */
+    public static ValueAndBoost parseCreateFieldForString(ParseContext context, String nullValue, float defaultBoost) throws IOException {
+        if (context.externalValueSet()) {
+            return new ValueAndBoost((String) context.externalValue(), defaultBoost);
+        }
+        XContentParser parser = context.parser();
+        if (parser.currentToken() == XContentParser.Token.VALUE_NULL) {
+            return new ValueAndBoost(nullValue, defaultBoost);
+        }
+        if (parser.currentToken() == XContentParser.Token.START_OBJECT) {
+            XContentParser.Token token;
+            String currentFieldName = null;
+            String value = nullValue;
+            float boost = defaultBoost;
+            while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
+                if (token == XContentParser.Token.FIELD_NAME) {
+                    currentFieldName = parser.currentName();
+                } else {
+                    if ("value".equals(currentFieldName) || "_value".equals(currentFieldName)) {
+                        value = parser.textOrNull();
+                    } else if ("boost".equals(currentFieldName) || "_boost".equals(currentFieldName)) {
+                        boost = parser.floatValue();
+                    } else {
+                        throw new ElasticSearchIllegalArgumentException("unknown property [" + currentFieldName + "]");
+                    }
+                }
+            }
+            return new ValueAndBoost(value, boost);
+        }
+        return new ValueAndBoost(parser.textOrNull(), defaultBoost);
+    }
+
     @Override
     protected String contentType() {
         return CONTENT_TYPE;
@@ -437,4 +449,33 @@ public class StringFieldMapper extends AbstractFieldMapper<String> implements Al
             value = null;
         }
     }
+
+    /**
+     * Parsed value and boost to be returned from {@link #parseCreateFieldForString}.
+     */
+    public static class ValueAndBoost {
+        private final String value;
+        private final float boost;
+
+        public ValueAndBoost(String value, float boost) {
+            this.value = value;
+            this.boost = boost;
+        }
+
+        /**
+         * Value of string field.
+         * @return value of string field
+         */
+        public String value() {
+            return value;
+        }
+
+        /**
+         * Boost either parsed from the document or defaulted.
+         * @return boost either parsed from the document or defaulted
+         */
+        public float boost() {
+            return boost;
+        }
+    }
 }
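Extracting `parseCreateFieldForString` keeps the per-value boost syntax working for both mappers. A hypothetical document body that exercises it (the field name is illustrative; this builds the JSON with the same XContentBuilder API the tests use):

[source,java]
--------------------------------------------------
import java.io.IOException;

import org.elasticsearch.common.xcontent.XContentBuilder;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

public class BoostSyntaxExample {
    // Wrapping the string in an object lets a per-value boost ride along;
    // parseCreateFieldForString returns both pieces as a ValueAndBoost.
    static XContentBuilder doc() throws IOException {
        return jsonBuilder().startObject()
                .startObject("message")
                    .field("value", "some text")
                    .field("boost", 2.0f)
                .endObject()
            .endObject();
    }
}
--------------------------------------------------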
@@ -0,0 +1,198 @@
/*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.mapper.core;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.elasticsearch.common.Explicit;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.codec.docvaluesformat.DocValuesFormatProvider;
import org.elasticsearch.index.codec.postingsformat.PostingsFormatProvider;
import org.elasticsearch.index.mapper.*;
import org.elasticsearch.index.mapper.core.StringFieldMapper.ValueAndBoost;
import org.elasticsearch.index.similarity.SimilarityProvider;

import java.io.IOException;
import java.util.List;
import java.util.Map;

import static org.elasticsearch.common.xcontent.support.XContentMapValues.nodeIntegerValue;
import static org.elasticsearch.index.mapper.MapperBuilders.tokenCountField;
import static org.elasticsearch.index.mapper.core.TypeParsers.parseNumberField;

/**
 * A {@link FieldMapper} that takes a string and writes a count of the tokens in that string
 * to the index. In most ways the mapper acts just like an {@link IntegerFieldMapper}.
 */
public class TokenCountFieldMapper extends IntegerFieldMapper {
    public static final String CONTENT_TYPE = "token_count";

    public static class Defaults extends IntegerFieldMapper.Defaults {
    }

    public static class Builder extends NumberFieldMapper.Builder<Builder, TokenCountFieldMapper> {
        private Integer nullValue = Defaults.NULL_VALUE;
        private NamedAnalyzer analyzer;

        public Builder(String name) {
            super(name, new FieldType(Defaults.FIELD_TYPE));
            builder = this;
        }

        public Builder nullValue(int nullValue) {
            this.nullValue = nullValue;
            return this;
        }

        public Builder analyzer(NamedAnalyzer analyzer) {
            this.analyzer = analyzer;
            return this;
        }

        public NamedAnalyzer analyzer() {
            return analyzer;
        }

        @Override
        public TokenCountFieldMapper build(BuilderContext context) {
            // Norms are only useful here when there is an index time boost
            fieldType.setOmitNorms(fieldType.omitNorms() && boost == 1.0f);
            TokenCountFieldMapper fieldMapper = new TokenCountFieldMapper(buildNames(context), precisionStep, boost, fieldType, nullValue,
                    ignoreMalformed(context), postingsProvider, docValuesProvider, similarity, fieldDataSettings, context.indexSettings(),
                    analyzer);
            fieldMapper.includeInAll(includeInAll);
            return fieldMapper;
        }
    }

    public static class TypeParser implements Mapper.TypeParser {
        @Override
        @SuppressWarnings("unchecked")
        public Mapper.Builder parse(String name, Map<String, Object> node, ParserContext parserContext) throws MapperParsingException {
            TokenCountFieldMapper.Builder builder = tokenCountField(name);
            parseNumberField(builder, name, node, parserContext);
            for (Map.Entry<String, Object> entry : node.entrySet()) {
                String propName = Strings.toUnderscoreCase(entry.getKey());
                Object propNode = entry.getValue();
                if (propName.equals("null_value")) {
                    builder.nullValue(nodeIntegerValue(propNode));
                } else if (propName.equals("analyzer")) {
                    NamedAnalyzer analyzer = parserContext.analysisService().analyzer(propNode.toString());
                    if (analyzer == null) {
                        throw new MapperParsingException("Analyzer [" + propNode.toString() + "] not found for field [" + name + "]");
                    }
                    builder.analyzer(analyzer);
                }
            }
            if (builder.analyzer() == null) {
                throw new MapperParsingException("Analyzer must be set for field [" + name + "] but wasn't.");
            }
            return builder;
        }
    }

    private NamedAnalyzer analyzer;

    protected TokenCountFieldMapper(Names names, int precisionStep, float boost, FieldType fieldType, Integer nullValue,
            Explicit<Boolean> ignoreMalformed, PostingsFormatProvider postingsProvider, DocValuesFormatProvider docValuesProvider,
            SimilarityProvider similarity, Settings fieldDataSettings, Settings indexSettings, NamedAnalyzer analyzer) {
        super(names, precisionStep, boost, fieldType, nullValue, ignoreMalformed, postingsProvider, docValuesProvider, similarity,
                fieldDataSettings, indexSettings);
        this.analyzer = analyzer;
    }

    @Override
    protected void parseCreateField(ParseContext context, List<Field> fields) throws IOException {
        ValueAndBoost valueAndBoost = StringFieldMapper.parseCreateFieldForString(context, null /* Our null value is an int so we handle it ourselves below */, boost);
        if (valueAndBoost.value() == null && nullValue() == null) {
            return;
        }

        if (fieldType.indexed() || fieldType.stored() || hasDocValues()) {
            int count;
            if (valueAndBoost.value() == null) {
                count = nullValue();
            } else {
                count = countPositions(analyzer.analyzer().tokenStream(name(), valueAndBoost.value()));
            }
            addIntegerFields(fields, count, valueAndBoost.boost());
        }
        if (fields.isEmpty()) {
            context.ignoredValue(names.indexName(), valueAndBoost.value());
        }
    }

    /**
     * Count position increments in a token stream. Package private for testing.
     * @param tokenStream token stream to count
     * @return number of position increments in a token stream
     * @throws IOException if tokenStream throws it
     */
    static int countPositions(TokenStream tokenStream) throws IOException {
        try {
            int count = 0;
            PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                count += position.getPositionIncrement();
            }
            tokenStream.end();
            count += position.getPositionIncrement();
            return count;
        } finally {
            tokenStream.close();
        }
    }

    /**
     * Name of analyzer.
     * @return name of analyzer
     */
    public String analyzer() {
        return analyzer.name();
    }

    @Override
    protected String contentType() {
        return CONTENT_TYPE;
    }

    @Override
    public void merge(Mapper mergeWith, MergeContext mergeContext) throws MergeMappingException {
        super.merge(mergeWith, mergeContext);
        if (!this.getClass().equals(mergeWith.getClass())) {
            return;
        }
        if (!mergeContext.mergeFlags().simulate()) {
            this.analyzer = ((TokenCountFieldMapper) mergeWith).analyzer;
        }
    }

    @Override
    protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, Params params) throws IOException {
        super.doXContentBody(builder, includeDefaults, params);

        builder.field("analyzer", analyzer());
    }
}
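The TypeParser above makes `analyzer` mandatory because there is no sensible default for counting tokens. A minimal sketch of a mapping it would reject (field name illustrative; the error text matches the exception thrown above):

[source,java]
--------------------------------------------------
import java.io.IOException;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

public class AnalyzerIsRequired {
    // Parsing this mapping fails with
    // "Analyzer must be set for field [word_count] but wasn't."
    static String invalidMapping() throws IOException {
        return jsonBuilder().startObject()
                .startObject("doc").startObject("properties")
                    .startObject("word_count")
                        .field("type", "token_count") // missing required "analyzer"
                    .endObject()
                .endObject().endObject()
            .endObject().string();
    }
}
--------------------------------------------------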
@@ -0,0 +1,222 @@
/*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.mapper.core;

import com.carrotsearch.randomizedtesting.annotations.Name;
import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
import com.google.common.collect.ImmutableList;
import org.apache.lucene.util.LuceneTestCase;
import org.elasticsearch.ElasticSearchException;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.facet.terms.TermsFacet;
import org.elasticsearch.search.facet.terms.TermsFacetBuilder;
import org.elasticsearch.test.ElasticsearchIntegrationTest;
import org.junit.Test;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.hamcrest.Matchers.*;

public class TokenCountFieldMapperIntegrationTests extends ElasticsearchIntegrationTest {
    @ParametersFactory
    public static Iterable<Object[]> buildParameters() {
        List<Object[]> parameters = new ArrayList<Object[]>();
        for (boolean storeCountedFields : new boolean[] { true, false }) {
            for (boolean loadCountedFields : new boolean[] { true, false }) {
                parameters.add(new Object[] { storeCountedFields, loadCountedFields });
            }
        }
        return parameters;
    }

    private final boolean storeCountedFields;
    private final boolean loadCountedFields;

    public TokenCountFieldMapperIntegrationTests(@Name("storeCountedFields") boolean storeCountedFields,
            @Name("loadCountedFields") boolean loadCountedFields) {
        this.storeCountedFields = storeCountedFields;
        this.loadCountedFields = loadCountedFields;
    }

    /**
     * It is possible to get the token count in a search response.
     */
    @Test
    public void searchReturnsTokenCount() throws ElasticSearchException, IOException {
        init();

        assertSearchReturns(searchById("single"), "single");
        assertSearchReturns(searchById("bulk1"), "bulk1");
        assertSearchReturns(searchById("bulk2"), "bulk2");
        assertSearchReturns(searchById("multi"), "multi");
        assertSearchReturns(searchById("multibulk1"), "multibulk1");
        assertSearchReturns(searchById("multibulk2"), "multibulk2");
    }

    /**
     * It is possible to search by token count.
     */
    @Test
    public void searchByTokenCount() throws ElasticSearchException, IOException {
        init();

        assertSearchReturns(searchByNumericRange(4, 4).get(), "single");
        assertSearchReturns(searchByNumericRange(10, 10).get(), "multibulk2");
        assertSearchReturns(searchByNumericRange(7, 10).get(), "multi", "multibulk1", "multibulk2");
        assertSearchReturns(searchByNumericRange(1, 10).get(), "single", "bulk1", "bulk2", "multi", "multibulk1", "multibulk2");
        assertSearchReturns(searchByNumericRange(12, 12).get());
    }

    /**
     * It is possible to facet on the token count.
     */
    @Test
    public void facetByTokenCount() throws ElasticSearchException, IOException {
        init();

        String facetField = randomFrom(ImmutableList.of(
                "foo.token_count", "foo.token_count_unstored", "foo.token_count_with_doc_values"));
        SearchResponse result = searchByNumericRange(1, 10)
                .addFacet(new TermsFacetBuilder("facet").field(facetField)).get();
        assertSearchReturns(result, "single", "bulk1", "bulk2", "multi", "multibulk1", "multibulk2");
        assertThat(result.getFacets().facets().size(), equalTo(1));
        TermsFacet facet = (TermsFacet) result.getFacets().facets().get(0);
        assertThat(facet.getEntries().size(), equalTo(9));
    }

    private void init() throws ElasticSearchException, IOException {
        prepareCreate("test").addMapping("test", jsonBuilder().startObject()
                .startObject("test")
                    .startObject("properties")
                        .startObject("foo")
                            .field("type", "multi_field")
                            .startObject("fields")
                                .startObject("foo")
                                    .field("type", "string")
                                    .field("store", storeCountedFields)
                                    .field("analyzer", "simple")
                                .endObject()
                                .startObject("token_count")
                                    .field("type", "token_count")
                                    .field("analyzer", "standard")
                                    .field("store", true)
                                .endObject()
                                .startObject("token_count_unstored")
                                    .field("type", "token_count")
                                    .field("analyzer", "standard")
                                .endObject()
                                .startObject("token_count_with_doc_values")
                                    .field("type", "token_count")
                                    .field("analyzer", "standard")
                                    .startObject("fielddata")
                                        .field("format", LuceneTestCase.defaultCodecSupportsSortedSet() ? "doc_values" : null)
                                    .endObject()
                                .endObject()
                            .endObject()
                        .endObject()
                    .endObject()
                .endObject().endObject()).get();
        ensureGreen();

        assertTrue(prepareIndex("single", "I have four terms").get().isCreated());
        BulkResponse bulk = client().prepareBulk()
                .add(prepareIndex("bulk1", "bulk three terms"))
                .add(prepareIndex("bulk2", "this has five bulk terms")).get();
        assertFalse(bulk.buildFailureMessage(), bulk.hasFailures());
        assertTrue(prepareIndex("multi", "two terms", "wow now I have seven lucky terms").get().isCreated());
        bulk = client().prepareBulk()
                .add(prepareIndex("multibulk1", "one", "oh wow now I have eight unlucky terms"))
                .add(prepareIndex("multibulk2", "six is a bunch of terms", "ten! ten terms is just crazy! too many too count!")).get();
        assertFalse(bulk.buildFailureMessage(), bulk.hasFailures());

        assertThat(refresh().getFailedShards(), equalTo(0));
    }

    private IndexRequestBuilder prepareIndex(String id, String... texts) throws IOException {
        return client().prepareIndex("test", "test", id).setSource("foo", texts);
    }

    private SearchResponse searchById(String id) {
        return prepareSearch().setQuery(QueryBuilders.termQuery("_id", id)).get();
    }

    private SearchRequestBuilder searchByNumericRange(int low, int high) {
        return prepareSearch().setQuery(QueryBuilders.rangeQuery(randomFrom(
                ImmutableList.of("foo.token_count", "foo.token_count_unstored", "foo.token_count_with_doc_values")
        )).gte(low).lte(high));
    }

    private SearchRequestBuilder prepareSearch() {
        SearchRequestBuilder request = client().prepareSearch("test").setTypes("test");
        request.addField("foo.token_count");
        if (loadCountedFields) {
            request.addField("foo");
        }
        return request;
    }

    private void assertSearchReturns(SearchResponse result, String... ids) {
        assertThat(result.getHits().getTotalHits(), equalTo((long) ids.length));
        assertThat(result.getHits().hits().length, equalTo(ids.length));
        List<String> foundIds = new ArrayList<String>();
        for (SearchHit hit : result.getHits()) {
            foundIds.add(hit.id());
        }
        assertThat(foundIds, containsInAnyOrder(ids));
        for (SearchHit hit : result.getHits()) {
            String id = hit.id();
            if (id.equals("single")) {
                assertSearchHit(hit, 4);
            } else if (id.equals("bulk1")) {
                assertSearchHit(hit, 3);
            } else if (id.equals("bulk2")) {
                assertSearchHit(hit, 5);
            } else if (id.equals("multi")) {
                assertSearchHit(hit, 2, 7);
            } else if (id.equals("multibulk1")) {
                assertSearchHit(hit, 1, 8);
            } else if (id.equals("multibulk2")) {
                assertSearchHit(hit, 6, 10);
            } else {
                throw new ElasticSearchException("Unexpected response!");
            }
        }
    }

    private void assertSearchHit(SearchHit hit, int... termCounts) {
        assertThat(hit.field("foo.token_count"), not(nullValue()));
        assertThat(hit.field("foo.token_count").values().size(), equalTo(termCounts.length));
        for (int i = 0; i < termCounts.length; i++) {
            assertThat((Integer) hit.field("foo.token_count").values().get(i), equalTo(termCounts[i]));
        }

        if (loadCountedFields && storeCountedFields) {
            assertThat(hit.field("foo").values().size(), equalTo(termCounts.length));
        }
    }
}
@@ -0,0 +1,92 @@
/*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.mapper.core;

import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.MapperTestUtils;
import org.elasticsearch.test.ElasticsearchTestCase;
import org.junit.Test;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;

import static org.elasticsearch.index.mapper.DocumentMapper.MergeFlags.mergeFlags;
import static org.hamcrest.Matchers.equalTo;

/**
 * Test for {@link TokenCountFieldMapper}.
 */
public class TokenCountFieldMapperTests extends ElasticsearchTestCase {
    @Test
    public void testMerge() throws IOException {
        String stage1Mapping = XContentFactory.jsonBuilder().startObject()
                .startObject("person")
                    .startObject("properties")
                        .startObject("tc")
                            .field("type", "token_count")
                            .field("analyzer", "keyword")
                        .endObject()
                    .endObject()
                .endObject().endObject().string();
        DocumentMapper stage1 = MapperTestUtils.newParser().parse(stage1Mapping);

        String stage2Mapping = XContentFactory.jsonBuilder().startObject()
                .startObject("person")
                    .startObject("properties")
                        .startObject("tc")
                            .field("type", "token_count")
                            .field("analyzer", "standard")
                        .endObject()
                    .endObject()
                .endObject().endObject().string();
        DocumentMapper stage2 = MapperTestUtils.newParser().parse(stage2Mapping);

        DocumentMapper.MergeResult mergeResult = stage1.merge(stage2, mergeFlags().simulate(true));
        assertThat(mergeResult.hasConflicts(), equalTo(false));
        // Just simulated so the merge hasn't happened yet
        assertThat(((TokenCountFieldMapper) stage1.mappers().smartName("tc").mapper()).analyzer(), equalTo("keyword"));

        mergeResult = stage1.merge(stage2, mergeFlags().simulate(false));
        assertThat(mergeResult.hasConflicts(), equalTo(false));
        // Now the merge has actually happened so the analyzer is updated
        assertThat(((TokenCountFieldMapper) stage1.mappers().smartName("tc").mapper()).analyzer(), equalTo("standard"));
    }

    @Test
    public void testCountPositions() throws IOException {
        // We're looking to make sure that we:
        Token t1 = new Token();      // Don't count tokens without an increment
        t1.setPositionIncrement(0);
        Token t2 = new Token();
        t2.setPositionIncrement(1);  // Count normal tokens with one increment
        Token t3 = new Token();
        t3.setPositionIncrement(2);  // Count funny tokens with more than one increment
        int finalTokenIncrement = 4; // Count the final token increment on the rare token streams that have them
        Token[] tokens = new Token[] {t1, t2, t3};
        Collections.shuffle(Arrays.asList(tokens), getRandom());
        TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens);
        assertThat(TokenCountFieldMapper.countPositions(tokenStream), equalTo(7));
    }
}