diff --git a/README.md b/README.md index 42f67d6f707..b8d25ef6817 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ Mapper Attachments Type for ElasticSearch -================================== +========================================= The mapper attachments plugin adds the `attachment` type to ElasticSearch using Tika. @@ -36,48 +36,72 @@ The `attachment` type is provided as a plugin extension. The plugin is a simple Using the attachment type is simple, in your mapping JSON, simply set a certain JSON element as attachment, for example: - { - "person" : { - "properties" : { - "my_attachment" : { "type" : "attachment" } - } +```javascript +{ + "person" : { + "properties" : { + "my_attachment" : { "type" : "attachment" } } } +} +``` In this case, the JSON to index can be: - { - "my_attachment" : "... base64 encoded attachment ..." - } +```javascript +{ + "my_attachment" : "... base64 encoded attachment ..." +} +``` Or it is possible to use more elaborated JSON if content type or resource name need to be set explicitly: - { - "my_attachment" : { - "_content_type" : "application/pdf", - "_name" : "resource/name/of/my.pdf", - "content" : "... base64 encoded attachment ..." - } +```javascript +{ + "my_attachment" : { + "_content_type" : "application/pdf", + "_name" : "resource/name/of/my.pdf", + "content" : "... base64 encoded attachment ..." } +} +``` -The `attachment` type not only indexes the content of the doc, but also automatically adds meta data on the attachment as well (when available). The metadata supported are: `date`, `title`, `author`, and `keywords`. They can be queried using the "dot notation", for example: `my_attachment.author`. +The `attachment` type not only indexes the content of the doc, but also automatically adds meta data on the attachment as well (when available). 
+ +The metadata supported are: + +* `date` +* `title` +* `name` (only available if you set `_name`, see above) +* `author` +* `keywords` +* `content_type` +* `content_length` (the original content length before text extraction, i.e. the file size) + +They can be queried using the "dot notation", for example: `my_attachment.author`. Both the meta data and the actual content are simple core type mappers (string, date, ...), thus, they can be controlled in the mappings. For example: - { - "person" : { - "properties" : { - "file" : { - "type" : "attachment", - "fields" : { - "file" : {"index" : "no"}, - "date" : {"store" : "yes"}, - "author" : {"analyzer" : "myAnalyzer"} - } +```javascript +{ + "person" : { + "properties" : { + "file" : { + "type" : "attachment", + "fields" : { + "file" : {"index" : "no"}, + "title" : {"store" : "yes"}, + "date" : {"store" : "yes"}, + "author" : {"analyzer" : "myAnalyzer"}, + "keywords" : {"store" : "yes"}, + "content_type" : {"store" : "yes"}, + "content_length" : {"store" : "yes"} } } } } +} +``` In the above example, the actual content indexed is mapped under `fields` name `file`, and we decide not to index it, so it will only be available in the `_all` field. The other fields map to their respective metadata names, but there is no need to specify the `type` (like `string` or `date`) since it is already known. 
diff --git a/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java b/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java index 12f583a935f..9705e07c882 100644 --- a/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java +++ b/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java @@ -28,14 +28,14 @@ import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.index.mapper.*; import org.elasticsearch.index.mapper.core.DateFieldMapper; +import org.elasticsearch.index.mapper.core.IntegerFieldMapper; import org.elasticsearch.index.mapper.core.StringFieldMapper; import org.elasticsearch.index.mapper.multifield.MultiFieldMapper; import java.io.IOException; import java.util.Map; -import static org.elasticsearch.index.mapper.MapperBuilders.dateField; -import static org.elasticsearch.index.mapper.MapperBuilders.stringField; +import static org.elasticsearch.index.mapper.MapperBuilders.*; import static org.elasticsearch.index.mapper.core.TypeParsers.parsePathType; import static org.elasticsearch.plugin.mapper.attachments.tika.TikaInstance.tika; @@ -90,6 +90,8 @@ public class AttachmentMapper implements Mapper { private Mapper.Builder contentTypeBuilder = stringField("content_type"); + private Mapper.Builder contentLengthBuilder = integerField("content_length"); + public Builder(String name) { super(name); this.builder = this; @@ -136,6 +138,11 @@ public class AttachmentMapper implements Mapper { return this; } + public Builder contentLength(Mapper.Builder contentType) { + this.contentLengthBuilder = contentType; + return this; + } + @Override public AttachmentMapper build(BuilderContext context) { ContentPath.Type origPathType = context.path().pathType(); @@ -152,6 +159,7 @@ public class AttachmentMapper implements Mapper { Mapper nameMapper = nameBuilder.build(context); Mapper keywordsMapper = 
keywordsBuilder.build(context); Mapper contentTypeMapper = contentTypeBuilder.build(context); + Mapper contentLength = contentLengthBuilder.build(context); context.path().remove(); context.path().pathType(origPathType); @@ -170,7 +178,7 @@ public class AttachmentMapper implements Mapper { ignoreErrors = Boolean.TRUE; } - return new AttachmentMapper(name, pathType, defaultIndexedChars, ignoreErrors, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper); + return new AttachmentMapper(name, pathType, defaultIndexedChars, ignoreErrors, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper, contentLength); } } @@ -185,7 +193,12 @@ public class AttachmentMapper implements Mapper { * fields : { * field1 : {type : "binary"}, * title : {store : "yes"}, - * date : {store : "yes"} + * date : {store : "yes"}, + * name : {store : "yes"}, + * author : {store : "yes"}, + * keywords : {store : "yes"}, + * content_type : {store : "yes"}, + * content_length : {store : "yes"} * } * } * @@ -232,6 +245,8 @@ public class AttachmentMapper implements Mapper { builder.keywords(parserContext.typeParser(isMultifield? MultiFieldMapper.CONTENT_TYPE:StringFieldMapper.CONTENT_TYPE).parse("keywords", (Map) propNode, parserContext)); } else if ("content_type".equals(propName)) { builder.contentType(parserContext.typeParser(isMultifield? MultiFieldMapper.CONTENT_TYPE:StringFieldMapper.CONTENT_TYPE).parse("content_type", (Map) propNode, parserContext)); + } else if ("content_length".equals(propName)) { + builder.contentLength(parserContext.typeParser(isMultifield? 
MultiFieldMapper.CONTENT_TYPE: IntegerFieldMapper.CONTENT_TYPE).parse("content_length", (Map) propNode, parserContext)); } } } @@ -263,9 +278,11 @@ public class AttachmentMapper implements Mapper { private final Mapper contentTypeMapper; + private final Mapper contentLengthMapper; + public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Boolean ignoreErrors, Mapper contentMapper, Mapper dateMapper, Mapper titleMapper, Mapper nameMapper, Mapper authorMapper, - Mapper keywordsMapper, Mapper contentTypeMapper) { + Mapper keywordsMapper, Mapper contentTypeMapper, Mapper contentLengthMapper) { this.name = name; this.pathType = pathType; this.defaultIndexedChars = defaultIndexedChars; @@ -277,6 +294,7 @@ public class AttachmentMapper implements Mapper { this.authorMapper = authorMapper; this.keywordsMapper = keywordsMapper; this.contentTypeMapper = contentTypeMapper; + this.contentLengthMapper = contentLengthMapper; } @Override @@ -388,6 +406,20 @@ public class AttachmentMapper implements Mapper { if (!ignoreErrors) throw e; if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_type: {}: {}", e.getMessage(), context.externalValue()); } + + try { + if (metadata.get(Metadata.CONTENT_LENGTH) != null) { + // We try to get CONTENT_LENGTH from Tika first + context.externalValue(metadata.get(Metadata.CONTENT_LENGTH)); + } else { + // Otherwise, we use our byte[] length + context.externalValue(content.length); + } + contentLengthMapper.parse(context); + } catch(MapperParsingException e){ + if (!ignoreErrors) throw e; + if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_length: {}: {}", e.getMessage(), context.externalValue()); + } } @Override @@ -404,6 +436,7 @@ public class AttachmentMapper implements Mapper { authorMapper.traverse(fieldMapperListener); keywordsMapper.traverse(fieldMapperListener); 
contentTypeMapper.traverse(fieldMapperListener); + contentLengthMapper.traverse(fieldMapperListener); } @Override @@ -419,6 +452,7 @@ public class AttachmentMapper implements Mapper { authorMapper.close(); keywordsMapper.close(); contentTypeMapper.close(); + contentLengthMapper.close(); } @Override @@ -435,6 +469,7 @@ public class AttachmentMapper implements Mapper { dateMapper.toXContent(builder, params); keywordsMapper.toXContent(builder, params); contentTypeMapper.toXContent(builder, params); + contentLengthMapper.toXContent(builder, params); builder.endObject(); builder.endObject(); diff --git a/src/test/java/org/elasticsearch/index/mapper/xcontent/MetadataMapperTest.java b/src/test/java/org/elasticsearch/index/mapper/xcontent/MetadataMapperTest.java index 2190448e156..945485d8340 100644 --- a/src/test/java/org/elasticsearch/index/mapper/xcontent/MetadataMapperTest.java +++ b/src/test/java/org/elasticsearch/index/mapper/xcontent/MetadataMapperTest.java @@ -25,7 +25,7 @@ import static org.hamcrest.Matchers.*; */ public class MetadataMapperTest { - protected void checkDate(String filename, Settings settings, Long expected) throws IOException { + protected void checkMeta(String filename, Settings settings, Long expectedDate, Long expectedLength) throws IOException { DocumentMapperParser mapperParser = new DocumentMapperParser(new Index("test"), settings, new AnalysisService(new Index("test")), null, null); mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser()); @@ -45,45 +45,45 @@ public class MetadataMapperTest { Document doc = docMapper.parse(json).rootDoc(); assertThat(doc.get(docMapper.mappers().smartName("file").mapper().names().indexName()), containsString("World")); assertThat(doc.get(docMapper.mappers().smartName("file.name").mapper().names().indexName()), equalTo(filename)); - if (expected == null) { + if (expectedDate == null) { 
assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()), nullValue()); } else { - assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()).numericValue().longValue(), is(expected)); + assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()).numericValue().longValue(), is(expectedDate)); } assertThat(doc.get(docMapper.mappers().smartName("file.title").mapper().names().indexName()), equalTo("Hello")); assertThat(doc.get(docMapper.mappers().smartName("file.author").mapper().names().indexName()), equalTo("kimchy")); assertThat(doc.get(docMapper.mappers().smartName("file.keywords").mapper().names().indexName()), equalTo("elasticsearch,cool,bonsai")); assertThat(doc.get(docMapper.mappers().smartName("file.content_type").mapper().names().indexName()), equalTo("text/html; charset=ISO-8859-1")); + assertThat(doc.getField(docMapper.mappers().smartName("file.content_length").mapper().names().indexName()).numericValue().longValue(), is(expectedLength)); } @Test public void testIgnoreWithoutDate() throws Exception { - checkDate("htmlWithoutDateMeta.html", ImmutableSettings.builder().build(), null); + checkMeta("htmlWithoutDateMeta.html", ImmutableSettings.builder().build(), null, 300L); } @Test public void testIgnoreWithEmptyDate() throws Exception { - checkDate("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().build(), null); + checkMeta("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().build(), null, 334L); } @Test public void testIgnoreWithCorrectDate() throws Exception { - checkDate("htmlWithValidDateMeta.html", ImmutableSettings.builder().build(), 1354233600000L); + checkMeta("htmlWithValidDateMeta.html", ImmutableSettings.builder().build(), 1354233600000L, 344L); } @Test public void testWithoutDate() throws Exception { - checkDate("htmlWithoutDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", 
false).build(), null); + checkMeta("htmlWithoutDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null, 300L); } @Test(expectedExceptions = MapperParsingException.class) public void testWithEmptyDate() throws Exception { - checkDate("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null); + checkMeta("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null, null); } @Test public void testWithCorrectDate() throws Exception { - checkDate("htmlWithValidDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), 1354233600000L); + checkMeta("htmlWithValidDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), 1354233600000L, 344L); } - }