From ad986eb2fcd793c6cf2fc90a48daaa345e0d0abb Mon Sep 17 00:00:00 2001 From: David Pilato Date: Sat, 26 Jul 2014 00:03:28 +0200 Subject: [PATCH] Add support for multi-fields Now https://github.com/elasticsearch/elasticsearch/pull/6867 is merged in elasticsearch core code (branch 1.x - es 1.4), we can support multi fields in mapper attachment plugin. ``` DELETE /test PUT /test { "settings": { "number_of_shards": 1 } } PUT /test/person/_mapping { "person": { "properties": { "file": { "type": "attachment", "path": "full", "fields": { "file": { "type": "string", "fields": { "store": { "type": "string", "store": true } } }, "content_type": { "type": "string", "fields": { "store": { "type": "string", "store": true }, "untouched": { "type": "string", "index": "not_analyzed", "store": true } } } } } } } } PUT /test/person/1?refresh=true { "file": "IkdvZCBTYXZlIHRoZSBRdWVlbiIgKGFsdGVybmF0aXZlbHkgIkdvZCBTYXZlIHRoZSBLaW5nIg==" } GET /test/person/_search { "fields": [ "file.store", "file.content_type.store" ], "aggs": { "store": { "terms": { "field": "file.content_type.store" } }, "untouched": { "terms": { "field": "file.content_type.untouched" } } } } ``` It gives: ```js { "took": 3, "timed_out": false, "_shards": { "total": 1, "successful": 1, "failed": 0 }, "hits": { "total": 1, "max_score": 1, "hits": [ { "_index": "test", "_type": "person", "_id": "1", "_score": 1, "fields": { "file.store": [ "\"God Save the Queen\" (alternatively \"God Save the King\"\n" ], "file.content_type.store": [ "text/plain; charset=ISO-8859-1" ] } } ] }, "aggregations": { "store": { "doc_count_error_upper_bound": 0, "buckets": [ { "key": "1", "doc_count": 1 }, { "key": "8859", "doc_count": 1 }, { "key": "charset", "doc_count": 1 }, { "key": "iso", "doc_count": 1 }, { "key": "plain", "doc_count": 1 }, { "key": "text", "doc_count": 1 } ] }, "untouched": { "doc_count_error_upper_bound": 0, "buckets": [ { "key": "text/plain; charset=ISO-8859-1", "doc_count": 1 } ] } } } ``` Note that using shorter 
definition works as well: ``` DELETE /test PUT /test { "settings": { "number_of_shards": 1 } } PUT /test/person/_mapping { "person": { "properties": { "file": { "type": "attachment" } } } } PUT /test/person/1?refresh=true { "file": "IkdvZCBTYXZlIHRoZSBRdWVlbiIgKGFsdGVybmF0aXZlbHkgIkdvZCBTYXZlIHRoZSBLaW5nIg==" } GET /test/person/_search { "query": { "match": { "file": "king" } } } ``` gives: ```js { "took": 53, "timed_out": false, "_shards": { "total": 1, "successful": 1, "failed": 0 }, "hits": { "total": 1, "max_score": 0.095891505, "hits": [ { "_index": "test", "_type": "person", "_id": "1", "_score": 0.095891505, "_source": { "file": "IkdvZCBTYXZlIHRoZSBRdWVlbiIgKGFsdGVybmF0aXZlbHkgIkdvZCBTYXZlIHRoZSBLaW5nIg==" } } ] } } ``` Closes #57. (cherry picked from commit 432d7c0) --- README.md | 185 ++++++++++- .../mapper/attachment/AttachmentMapper.java | 292 +++++++++++------- .../mapper/xcontent/MapperTestUtils.java | 82 +++++ .../MultifieldAttachmentMapperTests.java | 73 ++++- .../SimpleAttachmentIntegrationTests.java | 10 +- .../mapper/multifield/multifield-mapping.json | 5 +- 6 files changed, 510 insertions(+), 137 deletions(-) create mode 100644 src/test/java/org/elasticsearch/index/mapper/xcontent/MapperTestUtils.java diff --git a/README.md b/README.md index 07a429f31dd..3d41f8ea3e6 100644 --- a/README.md +++ b/README.md @@ -25,11 +25,14 @@ Please read documentation relative to the version you are using: The `attachment` type allows to index different "attachment" type field (encoded as `base64`), for example, microsoft office formats, open document formats, ePub, HTML, and so on (full list can be found [here](http://tika.apache.org/1.5/formats.html)). -The `attachment` type is provided as a plugin extension. The plugin is a simple zip file that can be downloaded and placed under `$ES_HOME/plugins` location. It will be automatically detected and the `attachment` type will be added. +The `attachment` type is provided as a plugin extension. 
The plugin is a simple zip file that can be downloaded and +placed under the `$ES_HOME/plugins/mapper-attachments` location. When the node starts, the plugin is automatically detected +and the `attachment` type is added. Using the attachment type is simple, in your mapping JSON, simply set a certain JSON element as attachment, for example: ```javascript +PUT /test/person/_mapping { "person" : { "properties" : { @@ -42,6 +45,7 @@ Using the attachment type is simple, in your mapping JSON, simply set a certain In this case, the JSON to index can be: ```javascript +PUT /test/person/1 { "my_attachment" : "... base64 encoded attachment ..." } @@ -49,7 +53,8 @@ In this case, the JSON to index can be: Or it is possible to use more elaborated JSON if content type, resource name or language need to be set explicitly: -```javascript +``` +PUT /test/person/1 { "my_attachment" : { "_content_type" : "application/pdf", @@ -60,7 +65,8 @@ Or it is possible to use more elaborated JSON if content type, resource name or } ``` -The `attachment` type not only indexes the content of the doc, but also automatically adds meta data on the attachment as well (when available). +The `attachment` type not only indexes the content of the doc, but also automatically adds meta data on the attachment +as well (when available). The metadata supported are: @@ -75,9 +81,11 @@ The metadata supported are: They can be queried using the "dot notation", for example: `my_attachment.author`. -Both the meta data and the actual content are simple core type mappers (string, date, ...), thus, they can be controlled in the mappings. For example: +Both the meta data and the actual content are simple core type mappers (string, date, ...), thus, they can be controlled +in the mappings. 
For example: ```javascript +PUT /test/person/_mapping { "person" : { "properties" : { @@ -99,12 +107,98 @@ Both the meta data and the actual content are simple core type mappers (string, } ``` -In the above example, the actual content indexed is mapped under `fields` name `file`, and we decide not to index it, so it will only be available in the `_all` field. The other fields map to their respective metadata names, but there is no need to specify the `type` (like `string` or `date`) since it is already known. +In the above example, the actual content indexed is mapped under `fields` name `file`, and we decide not to index it, so +it will only be available in the `_all` field. The other fields map to their respective metadata names, but there is no +need to specify the `type` (like `string` or `date`) since it is already known. + +Querying or accessing metadata +------------------------------ + +If you need to query on metadata fields, use the attachment field name, followed by a dot and the metadata field name. 
For example: + +``` +DELETE /test +PUT /test +PUT /test/person/_mapping +{ + "person": { + "properties": { + "file": { + "type": "attachment", + "path": "full", + "fields": { + "content_type": { + "type": "string", + "store": true + } + } + } + } + } +} +PUT /test/person/1?refresh=true +{ + "file": "IkdvZCBTYXZlIHRoZSBRdWVlbiIgKGFsdGVybmF0aXZlbHkgIkdvZCBTYXZlIHRoZSBLaW5nIg==" +} +GET /test/person/_search +{ + "fields": [ "file.content_type" ], + "query": { + "match": { + "file.content_type": "text plain" + } + } +} +``` + +Will give you: + +``` +{ + "took": 2, + "timed_out": false, + "_shards": { + "total": 5, + "successful": 5, + "failed": 0 + }, + "hits": { + "total": 1, + "max_score": 0.16273327, + "hits": [ + { + "_index": "test", + "_type": "person", + "_id": "1", + "_score": 0.16273327, + "fields": { + "file.content_type": [ + "text/plain; charset=ISO-8859-1" + ] + } + } + ] + } +} +``` Indexed Characters ------------------ -By default, `100000` characters are extracted when indexing the content. This default value can be changed by setting the `index.mapping.attachment.indexed_chars` setting. It can also be provided on a per document indexed using the `_indexed_chars` parameter. `-1` can be set to extract all text, but note that all the text needs to be allowed to be represented in memory. +By default, `100000` characters are extracted when indexing the content. This default value can be changed by setting +the `index.mapping.attachment.indexed_chars` setting. It can also be provided on a per document indexed using the +`_indexed_chars` parameter. `-1` can be set to extract all text, but note that all the text needs to be allowed to be +represented in memory: + +``` +PUT /test/person/1 +{ + "my_attachment" : { + "_indexed_chars" : -1, + "_content" : "... base64 encoded attachment ..." 
+ } +} +``` Metadata parsing error handling ------------------------------- @@ -135,22 +229,79 @@ Note that you can force language using `_language` field when sending your actua Highlighting attachments ------------------------ -If you want to highlight your attachment content, you will need to store your file content and set `term_vector` as follow: +If you want to highlight your attachment content, you will need to set `"store": true` and `"term_vector":"with_positions_offsets"` +for your attachment field. Here is a full script which does it: ``` -PUT test/my_type/_mapping +DELETE /test +PUT /test +PUT /test/person/_mapping { - "my_type" : { - "properties" : { - "my_html_file" : { - "type" : "attachment", - "fields" : { - "title" : { "store" : "yes" }, - "my_html_file" : { "term_vector":"with_positions_offsets", "store":"yes" } - } - } + "person": { + "properties": { + "file": { + "type": "attachment", + "path": "full", + "fields": { + "file": { + "type": "string", + "term_vector":"with_positions_offsets", + "store": true + } } + } } + } +} +PUT /test/person/1?refresh=true +{ + "file": "IkdvZCBTYXZlIHRoZSBRdWVlbiIgKGFsdGVybmF0aXZlbHkgIkdvZCBTYXZlIHRoZSBLaW5nIg==" +} +GET /test/person/_search +{ + "fields": [], + "query": { + "match": { + "file": "king queen" + } + }, + "highlight": { + "fields": { + "file": { + } + } + } +} +``` + +It gives back: + +```js +{ + "took": 9, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "failed": 0 + }, + "hits": { + "total": 1, + "max_score": 0.13561106, + "hits": [ + { + "_index": "test", + "_type": "person", + "_id": "1", + "_score": 0.13561106, + "highlight": { + "file": [ + "\"God Save the Queen\" (alternatively \"God Save the King\"\n" + ] + } + } + ] + } } ``` diff --git a/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java b/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java index 0fdca08904f..432e98602a1 100644 --- 
a/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java +++ b/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java @@ -19,22 +19,26 @@ package org.elasticsearch.index.mapper.attachment; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; import org.apache.tika.language.LanguageIdentifier; import org.apache.tika.metadata.Metadata; import org.elasticsearch.common.io.stream.BytesStreamInput; import org.elasticsearch.common.logging.ESLogger; import org.elasticsearch.common.logging.ESLoggerFactory; +import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.index.fielddata.FieldDataType; import org.elasticsearch.index.mapper.*; -import org.elasticsearch.index.mapper.core.DateFieldMapper; -import org.elasticsearch.index.mapper.core.IntegerFieldMapper; -import org.elasticsearch.index.mapper.core.StringFieldMapper; +import org.elasticsearch.index.mapper.core.AbstractFieldMapper; import java.io.IOException; +import java.util.List; import java.util.Map; import static org.elasticsearch.index.mapper.MapperBuilders.*; +import static org.elasticsearch.index.mapper.core.TypeParsers.parseMultiField; import static org.elasticsearch.index.mapper.core.TypeParsers.parsePathType; import static org.elasticsearch.plugin.mapper.attachments.tika.TikaInstance.tika; @@ -57,7 +61,7 @@ import static org.elasticsearch.plugin.mapper.attachments.tika.TikaInstance.tika * _content_length = Specify the maximum amount of characters to extract from the attachment. If not specified, then the default for * tika is 100,000 characters. Caution is required when setting large values as this can cause memory issues. 
*/ -public class AttachmentMapper implements Mapper { +public class AttachmentMapper extends AbstractFieldMapper { private static ESLogger logger = ESLoggerFactory.getLogger(AttachmentMapper.class.getName()); @@ -67,7 +71,18 @@ public class AttachmentMapper implements Mapper { public static final ContentPath.Type PATH_TYPE = ContentPath.Type.FULL; } - public static class Builder extends Mapper.Builder { + public static class FieldNames { + public static final String TITLE = "title"; + public static final String NAME = "name"; + public static final String AUTHOR = "author"; + public static final String KEYWORDS = "keywords"; + public static final String DATE = "date"; + public static final String CONTENT_TYPE = "content_type"; + public static final String CONTENT_LENGTH = "content_length"; + public static final String LANGUAGE = "language"; + } + + public static class Builder extends AbstractFieldMapper.Builder { private ContentPath.Type pathType = Defaults.PATH_TYPE; @@ -79,24 +94,24 @@ public class AttachmentMapper implements Mapper { private Mapper.Builder contentBuilder; - private Mapper.Builder titleBuilder = stringField("title"); + private Mapper.Builder titleBuilder = stringField(FieldNames.TITLE); - private Mapper.Builder nameBuilder = stringField("name"); + private Mapper.Builder nameBuilder = stringField(FieldNames.NAME); - private Mapper.Builder authorBuilder = stringField("author"); + private Mapper.Builder authorBuilder = stringField(FieldNames.AUTHOR); - private Mapper.Builder keywordsBuilder = stringField("keywords"); + private Mapper.Builder keywordsBuilder = stringField(FieldNames.KEYWORDS); - private Mapper.Builder dateBuilder = dateField("date"); + private Mapper.Builder dateBuilder = dateField(FieldNames.DATE); - private Mapper.Builder contentTypeBuilder = stringField("content_type"); + private Mapper.Builder contentTypeBuilder = stringField(FieldNames.CONTENT_TYPE); - private Mapper.Builder contentLengthBuilder = integerField("content_length"); 
+ private Mapper.Builder contentLengthBuilder = integerField(FieldNames.CONTENT_LENGTH); - private Mapper.Builder languageBuilder = stringField("language"); + private Mapper.Builder languageBuilder = stringField(FieldNames.LANGUAGE); public Builder(String name) { - super(name); + super(name, new FieldType(AbstractFieldMapper.Defaults.FIELD_TYPE)); this.builder = this; this.contentBuilder = stringField(name); } @@ -194,7 +209,9 @@ public class AttachmentMapper implements Mapper { langDetect = Boolean.FALSE; } - return new AttachmentMapper(name, pathType, defaultIndexedChars, ignoreErrors, langDetect, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper, contentLength, language); + return new AttachmentMapper(buildNames(context), pathType, defaultIndexedChars, ignoreErrors, langDetect, contentMapper, + dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper, contentLength, + language, multiFieldsBuilder.build(this, context), copyTo); } } @@ -221,6 +238,20 @@ public class AttachmentMapper implements Mapper { */ public static class TypeParser implements Mapper.TypeParser { + private Mapper.Builder findMapperBuilder(Map propNode, String propName, ParserContext parserContext) { + String type; + Object typeNode = propNode.get("type"); + if (typeNode != null) { + type = typeNode.toString(); + } else { + type = "string"; + } + Mapper.TypeParser typeParser = parserContext.typeParser(type); + Mapper.Builder mapperBuilder = typeParser.parse(propName, (Map) propNode, parserContext); + + return mapperBuilder; + } + @SuppressWarnings({"unchecked"}) @Override public Mapper.Builder parse(String name, Map node, ParserContext parserContext) throws MapperParsingException { @@ -235,37 +266,41 @@ public class AttachmentMapper implements Mapper { Map fieldsNode = (Map) fieldNode; for (Map.Entry entry1 : fieldsNode.entrySet()) { String propName = entry1.getKey(); - Object propNode = entry1.getValue(); + Map 
propNode = (Map) entry1.getValue(); - boolean isString = false; - if (propNode != null && propNode instanceof Map) { - Object oType = ((Map) propNode).get("type"); - if (oType != null && oType.equals(StringFieldMapper.CONTENT_TYPE)) { - isString = true; + Mapper.Builder mapperBuilder = findMapperBuilder(propNode, propName, parserContext); + parseMultiField((AbstractFieldMapper.Builder) mapperBuilder, fieldName, (Map) fieldNode, parserContext, propName, propNode); + + if (propName.equals(name)) { + builder.content(mapperBuilder); + } else { + switch (propName) { + case FieldNames.DATE: + builder.date(mapperBuilder); + break; + case FieldNames.AUTHOR: + builder.author(mapperBuilder); + break; + case FieldNames.CONTENT_LENGTH: + builder.contentLength(mapperBuilder); + break; + case FieldNames.CONTENT_TYPE: + builder.contentType(mapperBuilder); + break; + case FieldNames.KEYWORDS: + builder.keywords(mapperBuilder); + break; + case FieldNames.LANGUAGE: + builder.language(mapperBuilder); + break; + case FieldNames.TITLE: + builder.title(mapperBuilder); + break; + case FieldNames.NAME: + builder.name(mapperBuilder); + break; } } - - if (name.equals(propName)) { - // that is the content - builder.content(parserContext.typeParser(StringFieldMapper.CONTENT_TYPE).parse(name, (Map) propNode, parserContext)); - } else if ("date".equals(propName)) { - // If a specific format is already defined here, we should use it - builder.date(parserContext.typeParser(isString ? 
StringFieldMapper.CONTENT_TYPE : DateFieldMapper.CONTENT_TYPE).parse("date", (Map) propNode, parserContext)); - } else if ("title".equals(propName)) { - builder.title(parserContext.typeParser(StringFieldMapper.CONTENT_TYPE).parse("title", (Map) propNode, parserContext)); - } else if ("name".equals(propName)) { - builder.name(parserContext.typeParser(StringFieldMapper.CONTENT_TYPE).parse("name", (Map) propNode, parserContext)); - } else if ("author".equals(propName)) { - builder.author(parserContext.typeParser(StringFieldMapper.CONTENT_TYPE).parse("author", (Map) propNode, parserContext)); - } else if ("keywords".equals(propName)) { - builder.keywords(parserContext.typeParser(StringFieldMapper.CONTENT_TYPE).parse("keywords", (Map) propNode, parserContext)); - } else if ("content_type".equals(propName)) { - builder.contentType(parserContext.typeParser(StringFieldMapper.CONTENT_TYPE).parse("content_type", (Map) propNode, parserContext)); - } else if ("content_length".equals(propName)) { - builder.contentLength(parserContext.typeParser(IntegerFieldMapper.CONTENT_TYPE).parse("content_length", (Map) propNode, parserContext)); - } else if ("language".equals(propName)) { - builder.language(parserContext.typeParser(StringFieldMapper.CONTENT_TYPE).parse("language", (Map) propNode, parserContext)); - } } } } @@ -274,8 +309,6 @@ public class AttachmentMapper implements Mapper { } } - private final String name; - private final ContentPath.Type pathType; private final int defaultIndexedChars; @@ -302,10 +335,13 @@ public class AttachmentMapper implements Mapper { private final Mapper languageMapper; - public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Boolean ignoreErrors, Boolean defaultLangDetect, Mapper contentMapper, + public AttachmentMapper(Names names, ContentPath.Type pathType, int defaultIndexedChars, Boolean ignoreErrors, + Boolean defaultLangDetect, Mapper contentMapper, Mapper dateMapper, Mapper titleMapper, Mapper nameMapper, 
Mapper authorMapper, - Mapper keywordsMapper, Mapper contentTypeMapper, Mapper contentLengthMapper, Mapper languageMapper) { - this.name = name; + Mapper keywordsMapper, Mapper contentTypeMapper, Mapper contentLengthMapper, + Mapper languageMapper, MultiFields multiFields, CopyTo copyTo) { + super(names, 1.0f, AbstractFieldMapper.Defaults.FIELD_TYPE, false, null, null, null, null, null, null, null, + ImmutableSettings.EMPTY, multiFields, copyTo); this.pathType = pathType; this.defaultIndexedChars = defaultIndexedChars; this.ignoreErrors = ignoreErrors; @@ -322,8 +358,18 @@ public class AttachmentMapper implements Mapper { } @Override - public String name() { - return name; + public Object value(Object value) { + return null; + } + + @Override + public FieldType defaultFieldType() { + return AbstractFieldMapper.Defaults.FIELD_TYPE; + } + + @Override + public FieldDataType defaultFieldDataType() { + return null; } @Override @@ -393,7 +439,7 @@ public class AttachmentMapper implements Mapper { return; } - context.externalValue(parsedContent); + context = context.createExternalValueContext(parsedContent); contentMapper.parse(context); if (langDetect) { @@ -404,78 +450,99 @@ public class AttachmentMapper implements Mapper { LanguageIdentifier identifier = new LanguageIdentifier(parsedContent); language = identifier.getLanguage(); } - context.externalValue(language); + context = context.createExternalValueContext(language); languageMapper.parse(context); } catch(Throwable t) { logger.warn("Cannot detect language: {}", t.getMessage()); } } - try { - context.externalValue(name); - nameMapper.parse(context); - } catch(MapperParsingException e){ - if (!ignoreErrors) throw e; - if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing name: {}", e.getMessage()); - } - - try { - context.externalValue(metadata.get(Metadata.DATE)); - dateMapper.parse(context); - } catch(MapperParsingException e){ - if (!ignoreErrors) throw e; - if 
(logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing date: {}: {}", e.getMessage(), context.externalValue()); - } - - try { - context.externalValue(metadata.get(Metadata.TITLE)); - titleMapper.parse(context); - } catch(MapperParsingException e){ - if (!ignoreErrors) throw e; - if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing title: {}: {}", e.getMessage(), context.externalValue()); - } - - try { - context.externalValue(metadata.get(Metadata.AUTHOR)); - authorMapper.parse(context); - } catch(MapperParsingException e){ - if (!ignoreErrors) throw e; - if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing author: {}: {}", e.getMessage(), context.externalValue()); - } - - try { - context.externalValue(metadata.get(Metadata.KEYWORDS)); - keywordsMapper.parse(context); - } catch(MapperParsingException e){ - if (!ignoreErrors) throw e; - if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing keywords: {}: {}", e.getMessage(), context.externalValue()); - } - - try { - if (contentType != null) { - context.externalValue(contentType); - } else { - context.externalValue(metadata.get(Metadata.CONTENT_TYPE)); + if (name != null) { + try { + context = context.createExternalValueContext(name); + nameMapper.parse(context); + } catch(MapperParsingException e){ + if (!ignoreErrors) throw e; + if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing name: {}", e.getMessage()); } - contentTypeMapper.parse(context); - } catch(MapperParsingException e){ - if (!ignoreErrors) throw e; - if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_type: {}: {}", e.getMessage(), context.externalValue()); + } + + if (metadata.get(Metadata.DATE) != null) { + try { + context = context.createExternalValueContext(metadata.get(Metadata.DATE)); + 
dateMapper.parse(context); + } catch(MapperParsingException e){ + if (!ignoreErrors) throw e; + if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing date: {}: {}", e.getMessage(), context.externalValue()); + } + } + + if (metadata.get(Metadata.TITLE) != null) { + try { + context = context.createExternalValueContext(metadata.get(Metadata.TITLE)); + titleMapper.parse(context); + } catch(MapperParsingException e){ + if (!ignoreErrors) throw e; + if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing title: {}: {}", e.getMessage(), context.externalValue()); + } + } + + if (metadata.get(Metadata.AUTHOR) != null) { + try { + context = context.createExternalValueContext(metadata.get(Metadata.AUTHOR)); + authorMapper.parse(context); + } catch(MapperParsingException e){ + if (!ignoreErrors) throw e; + if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing author: {}: {}", e.getMessage(), context.externalValue()); + } + } + + if (metadata.get(Metadata.KEYWORDS) != null) { + try { + context = context.createExternalValueContext(metadata.get(Metadata.KEYWORDS)); + keywordsMapper.parse(context); + } catch(MapperParsingException e){ + if (!ignoreErrors) throw e; + if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing keywords: {}: {}", e.getMessage(), context.externalValue()); + } + } + + if (contentType == null) { + contentType = metadata.get(Metadata.CONTENT_TYPE); + } + if (contentType != null) { + try { + context = context.createExternalValueContext(contentType); + contentTypeMapper.parse(context); + } catch(MapperParsingException e){ + if (!ignoreErrors) throw e; + if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_type: {}: {}", e.getMessage(), context.externalValue()); + } + } + + int length = content.length; + // If we have CONTENT_LENGTH from Tika we use it + 
if (metadata.get(Metadata.CONTENT_LENGTH) != null) { + length = Integer.parseInt(metadata.get(Metadata.CONTENT_LENGTH)); } try { - if (metadata.get(Metadata.CONTENT_LENGTH) != null) { - // We try to get CONTENT_LENGTH from Tika first - context.externalValue(metadata.get(Metadata.CONTENT_LENGTH)); - } else { - // Otherwise, we use our byte[] length - context.externalValue(content.length); - } + context = context.createExternalValueContext(length); contentLengthMapper.parse(context); } catch(MapperParsingException e){ if (!ignoreErrors) throw e; if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_length: {}: {}", e.getMessage(), context.externalValue()); } + +// multiFields.parse(this, context); + if (copyTo != null) { + copyTo.parse(context); + } + } + + @Override + protected void parseCreateField(ParseContext parseContext, List fields) throws IOException { + } @Override @@ -515,7 +582,7 @@ public class AttachmentMapper implements Mapper { @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { - builder.startObject(name); + builder.startObject(name()); builder.field("type", CONTENT_TYPE); builder.field("path", pathType.name().toLowerCase()); @@ -529,9 +596,16 @@ public class AttachmentMapper implements Mapper { contentTypeMapper.toXContent(builder, params); contentLengthMapper.toXContent(builder, params); languageMapper.toXContent(builder, params); + multiFields.toXContent(builder, params); builder.endObject(); + multiFields.toXContent(builder, params); builder.endObject(); return builder; } + + @Override + protected String contentType() { + return CONTENT_TYPE; + } } diff --git a/src/test/java/org/elasticsearch/index/mapper/xcontent/MapperTestUtils.java b/src/test/java/org/elasticsearch/index/mapper/xcontent/MapperTestUtils.java new file mode 100644 index 00000000000..776de3b35e3 --- /dev/null +++ 
b/src/test/java/org/elasticsearch/index/mapper/xcontent/MapperTestUtils.java @@ -0,0 +1,82 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.mapper.xcontent; + +import org.elasticsearch.common.inject.Injector; +import org.elasticsearch.common.inject.ModulesBuilder; +import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.settings.SettingsModule; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.EnvironmentModule; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.IndexNameModule; +import org.elasticsearch.index.analysis.AnalysisModule; +import org.elasticsearch.index.analysis.AnalysisService; +import org.elasticsearch.index.codec.docvaluesformat.DocValuesFormatService; +import org.elasticsearch.index.codec.postingsformat.PostingsFormatService; +import org.elasticsearch.index.fielddata.IndexFieldDataService; +import org.elasticsearch.index.mapper.DocumentMapperParser; +import org.elasticsearch.index.mapper.MapperService; +import org.elasticsearch.index.settings.IndexSettingsModule; +import org.elasticsearch.index.similarity.SimilarityLookupService; +import 
org.elasticsearch.indices.analysis.IndicesAnalysisModule; +import org.elasticsearch.indices.analysis.IndicesAnalysisService; +import org.elasticsearch.indices.fielddata.breaker.NoneCircuitBreakerService; + +public class MapperTestUtils { + + public static DocumentMapperParser newParser() { + return new DocumentMapperParser(new Index("test"), ImmutableSettings.Builder.EMPTY_SETTINGS, newAnalysisService(), new PostingsFormatService(new Index("test")), + new DocValuesFormatService(new Index("test")), newSimilarityLookupService(), null); + } + + public static DocumentMapperParser newParser(Settings indexSettings) { + return new DocumentMapperParser(new Index("test"), indexSettings, newAnalysisService(indexSettings), new PostingsFormatService(new Index("test")), + new DocValuesFormatService(new Index("test")), newSimilarityLookupService(), null); + } + + public static MapperService newMapperService() { + return newMapperService(new Index("test"), ImmutableSettings.Builder.EMPTY_SETTINGS); + } + + public static MapperService newMapperService(Index index, Settings indexSettings) { + return new MapperService(index, indexSettings, new Environment(), newAnalysisService(), new IndexFieldDataService(index, new NoneCircuitBreakerService()), + new PostingsFormatService(index), new DocValuesFormatService(index), newSimilarityLookupService(), null); + } + + public static AnalysisService newAnalysisService() { + return newAnalysisService(ImmutableSettings.Builder.EMPTY_SETTINGS); + } + + public static AnalysisService newAnalysisService(Settings indexSettings) { + Injector parentInjector = new ModulesBuilder().add(new SettingsModule(indexSettings), new EnvironmentModule(new Environment(ImmutableSettings.Builder.EMPTY_SETTINGS)), new IndicesAnalysisModule()).createInjector(); + Injector injector = new ModulesBuilder().add( + new IndexSettingsModule(new Index("test"), indexSettings), + new IndexNameModule(new Index("test")), + new AnalysisModule(indexSettings, 
parentInjector.getInstance(IndicesAnalysisService.class))).createChildInjector(parentInjector); + + return injector.getInstance(AnalysisService.class); + } + + public static SimilarityLookupService newSimilarityLookupService() { + return new SimilarityLookupService(new Index("test"), ImmutableSettings.Builder.EMPTY_SETTINGS); + } +} diff --git a/src/test/java/org/elasticsearch/index/mapper/xcontent/MultifieldAttachmentMapperTests.java b/src/test/java/org/elasticsearch/index/mapper/xcontent/MultifieldAttachmentMapperTests.java index 24ee3521388..63e11c56ffd 100644 --- a/src/test/java/org/elasticsearch/index/mapper/xcontent/MultifieldAttachmentMapperTests.java +++ b/src/test/java/org/elasticsearch/index/mapper/xcontent/MultifieldAttachmentMapperTests.java @@ -19,11 +19,15 @@ package org.elasticsearch.index.mapper.xcontent; +import org.elasticsearch.common.Base64; import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.index.Index; import org.elasticsearch.index.analysis.AnalysisService; import org.elasticsearch.index.mapper.DocumentMapper; import org.elasticsearch.index.mapper.DocumentMapperParser; +import org.elasticsearch.index.mapper.MapperService; +import org.elasticsearch.index.mapper.ParsedDocument; import org.elasticsearch.index.mapper.attachment.AttachmentMapper; import org.elasticsearch.index.mapper.core.DateFieldMapper; import org.elasticsearch.index.mapper.core.StringFieldMapper; @@ -32,7 +36,7 @@ import org.junit.Before; import org.junit.Test; import static org.elasticsearch.common.io.Streams.copyToStringFromClasspath; -import static org.hamcrest.Matchers.instanceOf; +import static org.hamcrest.Matchers.*; /** * @@ -74,4 +78,71 @@ public class MultifieldAttachmentMapperTests extends ElasticsearchTestCase { assertThat(docMapper.mappers().fullName("file.content_type").mapper(), instanceOf(StringFieldMapper.class)); 
assertThat(docMapper.mappers().fullName("file.content_type.suggest").mapper(), instanceOf(StringFieldMapper.class)); } + + @Test + public void testExternalValues() throws Exception { + String originalText = "This is an elasticsearch mapper attachment test."; + String contentType = "text/plain; charset=ISO-8859-1"; + String forcedName = "dummyname.txt"; + + String bytes = Base64.encodeBytes(originalText.getBytes()); + + MapperService mapperService = MapperTestUtils.newMapperService(); + mapperService.documentMapperParser().putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser()); + + String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/multifield/multifield-mapping.json"); + + DocumentMapper documentMapper = mapperService.documentMapperParser().parse(mapping); + + ParsedDocument doc = documentMapper.parse("person", "1", XContentFactory.jsonBuilder() + .startObject() + .field("file", bytes) + .endObject() + .bytes()); + + assertThat(doc.rootDoc().getField("file"), notNullValue()); + assertThat(doc.rootDoc().getField("file").stringValue(), is(originalText + "\n")); + + assertThat(doc.rootDoc().getField("file.content_type"), notNullValue()); + assertThat(doc.rootDoc().getField("file.content_type").stringValue(), is(contentType)); + assertThat(doc.rootDoc().getField("file.content_type.suggest"), notNullValue()); + assertThat(doc.rootDoc().getField("file.content_type.suggest").stringValue(), is(contentType)); + assertThat(doc.rootDoc().getField("file.content_length"), notNullValue()); + assertThat(doc.rootDoc().getField("file.content_length").numericValue().intValue(), is(originalText.length())); + + assertThat(doc.rootDoc().getField("file.suggest"), notNullValue()); + assertThat(doc.rootDoc().getField("file.suggest").stringValue(), is(originalText + "\n")); + + // Let's force some values + doc = documentMapper.parse("person", "1", XContentFactory.jsonBuilder() + .startObject() + .startObject("file") + .field("content", 
bytes) + .field("_name", forcedName) + .endObject() + .endObject() + .bytes()); + + assertThat(doc.rootDoc().getField("file"), notNullValue()); + assertThat(doc.rootDoc().getField("file").stringValue(), is(originalText + "\n")); + + assertThat(doc.rootDoc().getField("file.content_type"), notNullValue()); + assertThat(doc.rootDoc().getField("file.content_type").stringValue(), is(contentType)); + assertThat(doc.rootDoc().getField("file.content_type.suggest"), notNullValue()); + assertThat(doc.rootDoc().getField("file.content_type.suggest").stringValue(), is(contentType)); + assertThat(doc.rootDoc().getField("file.content_length"), notNullValue()); + assertThat(doc.rootDoc().getField("file.content_length").numericValue().intValue(), is(originalText.length())); + + assertThat(doc.rootDoc().getField("file.suggest"), notNullValue()); + assertThat(doc.rootDoc().getField("file.suggest").stringValue(), is(originalText + "\n")); + + assertThat(doc.rootDoc().getField("file.name"), notNullValue()); + assertThat(doc.rootDoc().getField("file.name").stringValue(), is(forcedName)); + // In mapping we have default store:false + assertThat(doc.rootDoc().getField("file.name").fieldType().stored(), is(false)); + assertThat(doc.rootDoc().getField("file.name.suggest"), notNullValue()); + assertThat(doc.rootDoc().getField("file.name.suggest").stringValue(), is(forcedName)); + // In mapping we set store:true for suggest subfield + assertThat(doc.rootDoc().getField("file.name.suggest").fieldType().stored(), is(true)); + } } diff --git a/src/test/java/org/elasticsearch/plugin/mapper/attachments/test/SimpleAttachmentIntegrationTests.java b/src/test/java/org/elasticsearch/plugin/mapper/attachments/test/SimpleAttachmentIntegrationTests.java index c3a902d6e87..ac4f3af6e72 100644 --- a/src/test/java/org/elasticsearch/plugin/mapper/attachments/test/SimpleAttachmentIntegrationTests.java +++ b/src/test/java/org/elasticsearch/plugin/mapper/attachments/test/SimpleAttachmentIntegrationTests.java @@ 
-20,8 +20,8 @@ package org.elasticsearch.plugin.mapper.attachments.test; import org.elasticsearch.action.count.CountResponse; -import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.mapper.MapperParsingException; import org.elasticsearch.plugins.PluginsService; @@ -32,7 +32,6 @@ import org.junit.Test; import static org.elasticsearch.client.Requests.putMappingRequest; import static org.elasticsearch.common.io.Streams.copyToBytesFromClasspath; import static org.elasticsearch.common.io.Streams.copyToStringFromClasspath; -import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder; import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; import static org.elasticsearch.index.query.QueryBuilders.queryString; import static org.hamcrest.Matchers.equalTo; @@ -58,13 +57,6 @@ public class SimpleAttachmentIntegrationTests extends ElasticsearchIntegrationTe createIndex("test"); } - @Override - public Settings indexSettings() { - return settingsBuilder() - .put("index.numberOfReplicas", 0) - .build(); - } - @Test public void testSimpleAttachment() throws Exception { String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json"); diff --git a/src/test/resources/org/elasticsearch/index/mapper/multifield/multifield-mapping.json b/src/test/resources/org/elasticsearch/index/mapper/multifield/multifield-mapping.json index 1f037957233..5ee42d3fb74 100644 --- a/src/test/resources/org/elasticsearch/index/mapper/multifield/multifield-mapping.json +++ b/src/test/resources/org/elasticsearch/index/mapper/multifield/multifield-mapping.json @@ -26,7 +26,10 @@ "name": { "type": "string", "fields": { - "suggest": { "type": "string" } + "suggest": { + "type": "string", + "store": true + } } }, "author": {