Add content_length metadata

We now generate a `content_length` field based on file size.
Closes #26.
This commit is contained in:
David Pilato 2013-08-20 16:03:31 +02:00
parent 406e295c6c
commit 8c340535d2
3 changed files with 100 additions and 41 deletions

View File

@ -1,5 +1,5 @@
Mapper Attachments Type for ElasticSearch Mapper Attachments Type for ElasticSearch
================================== =========================================
The mapper attachments plugin adds the `attachment` type to ElasticSearch using Tika. The mapper attachments plugin adds the `attachment` type to ElasticSearch using Tika.
@ -36,48 +36,72 @@ The `attachment` type is provided as a plugin extension. The plugin is a simple
Using the attachment type is simple, in your mapping JSON, simply set a certain JSON element as attachment, for example: Using the attachment type is simple, in your mapping JSON, simply set a certain JSON element as attachment, for example:
{ ```javascript
"person" : { {
"properties" : { "person" : {
"my_attachment" : { "type" : "attachment" } "properties" : {
} "my_attachment" : { "type" : "attachment" }
} }
} }
}
```
In this case, the JSON to index can be: In this case, the JSON to index can be:
{ ```javascript
"my_attachment" : "... base64 encoded attachment ..." {
} "my_attachment" : "... base64 encoded attachment ..."
}
```
Or it is possible to use more elaborated JSON if content type or resource name need to be set explicitly: Or it is possible to use more elaborated JSON if content type or resource name need to be set explicitly:
{ ```javascript
"my_attachment" : { {
"_content_type" : "application/pdf", "my_attachment" : {
"_name" : "resource/name/of/my.pdf", "_content_type" : "application/pdf",
"content" : "... base64 encoded attachment ..." "_name" : "resource/name/of/my.pdf",
} "content" : "... base64 encoded attachment ..."
} }
}
```
The `attachment` type not only indexes the content of the doc, but also automatically adds meta data on the attachment as well (when available). The metadata supported are: `date`, `title`, `author`, and `keywords`. They can be queried using the "dot notation", for example: `my_attachment.author`. The `attachment` type not only indexes the content of the doc, but also automatically adds meta data on the attachment as well (when available).
The metadata supported are:
* `date`
* `title`
* `name` (only available if you set `_name`, see above)
* `author`
* `keywords`
* `content_type`
* `content_length`: the original content length before text extraction (i.e. the file size)
They can be queried using the "dot notation", for example: `my_attachment.author`.
Both the meta data and the actual content are simple core type mappers (string, date, ...), thus, they can be controlled in the mappings. For example: Both the meta data and the actual content are simple core type mappers (string, date, ...), thus, they can be controlled in the mappings. For example:
{ ```javascript
"person" : { {
"properties" : { "person" : {
"file" : { "properties" : {
"type" : "attachment", "file" : {
"fields" : { "type" : "attachment",
"file" : {"index" : "no"}, "fields" : {
"date" : {"store" : "yes"}, "file" : {"index" : "no"},
"author" : {"analyzer" : "myAnalyzer"} "title" : {store : "yes"},
} "date" : {"store" : "yes"},
"author" : {"analyzer" : "myAnalyzer"},
"keywords" : {store : "yes"},
"content_type" : {store : "yes"},
"content_length" : {store : "yes"}
} }
} }
} }
} }
}
```
In the above example, the actual content indexed is mapped under `fields` name `file`, and we decide not to index it, so it will only be available in the `_all` field. The other fields map to their respective metadata names, but there is no need to specify the `type` (like `string` or `date`) since it is already known. In the above example, the actual content indexed is mapped under `fields` name `file`, and we decide not to index it, so it will only be available in the `_all` field. The other fields map to their respective metadata names, but there is no need to specify the `type` (like `string` or `date`) since it is already known.

View File

@ -28,14 +28,14 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.mapper.*; import org.elasticsearch.index.mapper.*;
import org.elasticsearch.index.mapper.core.DateFieldMapper; import org.elasticsearch.index.mapper.core.DateFieldMapper;
import org.elasticsearch.index.mapper.core.IntegerFieldMapper;
import org.elasticsearch.index.mapper.core.StringFieldMapper; import org.elasticsearch.index.mapper.core.StringFieldMapper;
import org.elasticsearch.index.mapper.multifield.MultiFieldMapper; import org.elasticsearch.index.mapper.multifield.MultiFieldMapper;
import java.io.IOException; import java.io.IOException;
import java.util.Map; import java.util.Map;
import static org.elasticsearch.index.mapper.MapperBuilders.dateField; import static org.elasticsearch.index.mapper.MapperBuilders.*;
import static org.elasticsearch.index.mapper.MapperBuilders.stringField;
import static org.elasticsearch.index.mapper.core.TypeParsers.parsePathType; import static org.elasticsearch.index.mapper.core.TypeParsers.parsePathType;
import static org.elasticsearch.plugin.mapper.attachments.tika.TikaInstance.tika; import static org.elasticsearch.plugin.mapper.attachments.tika.TikaInstance.tika;
@ -90,6 +90,8 @@ public class AttachmentMapper implements Mapper {
private Mapper.Builder contentTypeBuilder = stringField("content_type"); private Mapper.Builder contentTypeBuilder = stringField("content_type");
private Mapper.Builder contentLengthBuilder = integerField("content_length");
public Builder(String name) { public Builder(String name) {
super(name); super(name);
this.builder = this; this.builder = this;
@ -136,6 +138,11 @@ public class AttachmentMapper implements Mapper {
return this; return this;
} }
/**
 * Replaces the builder used for the {@code content_length} sub-field.
 *
 * @param contentLength the mapper builder to use for the content length
 * @return this builder, so that calls can be chained
 */
public Builder contentLength(Mapper.Builder contentLength) {
    this.contentLengthBuilder = contentLength;
    return this;
}
@Override @Override
public AttachmentMapper build(BuilderContext context) { public AttachmentMapper build(BuilderContext context) {
ContentPath.Type origPathType = context.path().pathType(); ContentPath.Type origPathType = context.path().pathType();
@ -152,6 +159,7 @@ public class AttachmentMapper implements Mapper {
Mapper nameMapper = nameBuilder.build(context); Mapper nameMapper = nameBuilder.build(context);
Mapper keywordsMapper = keywordsBuilder.build(context); Mapper keywordsMapper = keywordsBuilder.build(context);
Mapper contentTypeMapper = contentTypeBuilder.build(context); Mapper contentTypeMapper = contentTypeBuilder.build(context);
Mapper contentLength = contentLengthBuilder.build(context);
context.path().remove(); context.path().remove();
context.path().pathType(origPathType); context.path().pathType(origPathType);
@ -170,7 +178,7 @@ public class AttachmentMapper implements Mapper {
ignoreErrors = Boolean.TRUE; ignoreErrors = Boolean.TRUE;
} }
return new AttachmentMapper(name, pathType, defaultIndexedChars, ignoreErrors, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper); return new AttachmentMapper(name, pathType, defaultIndexedChars, ignoreErrors, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper, contentLength);
} }
} }
@ -185,7 +193,12 @@ public class AttachmentMapper implements Mapper {
* fields : { * fields : {
* field1 : {type : "binary"}, * field1 : {type : "binary"},
* title : {store : "yes"}, * title : {store : "yes"},
* date : {store : "yes"} * date : {store : "yes"},
* name : {store : "yes"},
* author : {store : "yes"},
* keywords : {store : "yes"},
* content_type : {store : "yes"},
* content_length : {store : "yes"}
* } * }
* } * }
* </pre> * </pre>
@ -232,6 +245,8 @@ public class AttachmentMapper implements Mapper {
builder.keywords(parserContext.typeParser(isMultifield? MultiFieldMapper.CONTENT_TYPE:StringFieldMapper.CONTENT_TYPE).parse("keywords", (Map<String, Object>) propNode, parserContext)); builder.keywords(parserContext.typeParser(isMultifield? MultiFieldMapper.CONTENT_TYPE:StringFieldMapper.CONTENT_TYPE).parse("keywords", (Map<String, Object>) propNode, parserContext));
} else if ("content_type".equals(propName)) { } else if ("content_type".equals(propName)) {
builder.contentType(parserContext.typeParser(isMultifield? MultiFieldMapper.CONTENT_TYPE:StringFieldMapper.CONTENT_TYPE).parse("content_type", (Map<String, Object>) propNode, parserContext)); builder.contentType(parserContext.typeParser(isMultifield? MultiFieldMapper.CONTENT_TYPE:StringFieldMapper.CONTENT_TYPE).parse("content_type", (Map<String, Object>) propNode, parserContext));
} else if ("content_length".equals(propName)) {
builder.contentLength(parserContext.typeParser(isMultifield? MultiFieldMapper.CONTENT_TYPE: IntegerFieldMapper.CONTENT_TYPE).parse("content_length", (Map<String, Object>) propNode, parserContext));
} }
} }
} }
@ -263,9 +278,11 @@ public class AttachmentMapper implements Mapper {
private final Mapper contentTypeMapper; private final Mapper contentTypeMapper;
private final Mapper contentLengthMapper;
public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Boolean ignoreErrors, Mapper contentMapper, public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Boolean ignoreErrors, Mapper contentMapper,
Mapper dateMapper, Mapper titleMapper, Mapper nameMapper, Mapper authorMapper, Mapper dateMapper, Mapper titleMapper, Mapper nameMapper, Mapper authorMapper,
Mapper keywordsMapper, Mapper contentTypeMapper) { Mapper keywordsMapper, Mapper contentTypeMapper, Mapper contentLengthMapper) {
this.name = name; this.name = name;
this.pathType = pathType; this.pathType = pathType;
this.defaultIndexedChars = defaultIndexedChars; this.defaultIndexedChars = defaultIndexedChars;
@ -277,6 +294,7 @@ public class AttachmentMapper implements Mapper {
this.authorMapper = authorMapper; this.authorMapper = authorMapper;
this.keywordsMapper = keywordsMapper; this.keywordsMapper = keywordsMapper;
this.contentTypeMapper = contentTypeMapper; this.contentTypeMapper = contentTypeMapper;
this.contentLengthMapper = contentLengthMapper;
} }
@Override @Override
@ -388,6 +406,20 @@ public class AttachmentMapper implements Mapper {
if (!ignoreErrors) throw e; if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_type: {}: {}", e.getMessage(), context.externalValue()); if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_type: {}: {}", e.getMessage(), context.externalValue());
} }
try {
if (metadata.get(Metadata.CONTENT_LENGTH) != null) {
// We try to get CONTENT_LENGTH from Tika first
context.externalValue(metadata.get(Metadata.CONTENT_LENGTH));
} else {
// Otherwise, we use our byte[] length
context.externalValue(content.length);
}
contentLengthMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_length: {}: {}", e.getMessage(), context.externalValue());
}
} }
@Override @Override
@ -404,6 +436,7 @@ public class AttachmentMapper implements Mapper {
authorMapper.traverse(fieldMapperListener); authorMapper.traverse(fieldMapperListener);
keywordsMapper.traverse(fieldMapperListener); keywordsMapper.traverse(fieldMapperListener);
contentTypeMapper.traverse(fieldMapperListener); contentTypeMapper.traverse(fieldMapperListener);
contentLengthMapper.traverse(fieldMapperListener);
} }
@Override @Override
@ -419,6 +452,7 @@ public class AttachmentMapper implements Mapper {
authorMapper.close(); authorMapper.close();
keywordsMapper.close(); keywordsMapper.close();
contentTypeMapper.close(); contentTypeMapper.close();
contentLengthMapper.close();
} }
@Override @Override
@ -435,6 +469,7 @@ public class AttachmentMapper implements Mapper {
dateMapper.toXContent(builder, params); dateMapper.toXContent(builder, params);
keywordsMapper.toXContent(builder, params); keywordsMapper.toXContent(builder, params);
contentTypeMapper.toXContent(builder, params); contentTypeMapper.toXContent(builder, params);
contentLengthMapper.toXContent(builder, params);
builder.endObject(); builder.endObject();
builder.endObject(); builder.endObject();

View File

@ -25,7 +25,7 @@ import static org.hamcrest.Matchers.*;
*/ */
public class MetadataMapperTest { public class MetadataMapperTest {
protected void checkDate(String filename, Settings settings, Long expected) throws IOException { protected void checkMeta(String filename, Settings settings, Long expectedDate, Long expectedLength) throws IOException {
DocumentMapperParser mapperParser = new DocumentMapperParser(new Index("test"), settings, new AnalysisService(new Index("test")), null, null); DocumentMapperParser mapperParser = new DocumentMapperParser(new Index("test"), settings, new AnalysisService(new Index("test")), null, null);
mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser()); mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser());
@ -45,45 +45,45 @@ public class MetadataMapperTest {
Document doc = docMapper.parse(json).rootDoc(); Document doc = docMapper.parse(json).rootDoc();
assertThat(doc.get(docMapper.mappers().smartName("file").mapper().names().indexName()), containsString("World")); assertThat(doc.get(docMapper.mappers().smartName("file").mapper().names().indexName()), containsString("World"));
assertThat(doc.get(docMapper.mappers().smartName("file.name").mapper().names().indexName()), equalTo(filename)); assertThat(doc.get(docMapper.mappers().smartName("file.name").mapper().names().indexName()), equalTo(filename));
if (expected == null) { if (expectedDate == null) {
assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()), nullValue()); assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()), nullValue());
} else { } else {
assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()).numericValue().longValue(), is(expected)); assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()).numericValue().longValue(), is(expectedDate));
} }
assertThat(doc.get(docMapper.mappers().smartName("file.title").mapper().names().indexName()), equalTo("Hello")); assertThat(doc.get(docMapper.mappers().smartName("file.title").mapper().names().indexName()), equalTo("Hello"));
assertThat(doc.get(docMapper.mappers().smartName("file.author").mapper().names().indexName()), equalTo("kimchy")); assertThat(doc.get(docMapper.mappers().smartName("file.author").mapper().names().indexName()), equalTo("kimchy"));
assertThat(doc.get(docMapper.mappers().smartName("file.keywords").mapper().names().indexName()), equalTo("elasticsearch,cool,bonsai")); assertThat(doc.get(docMapper.mappers().smartName("file.keywords").mapper().names().indexName()), equalTo("elasticsearch,cool,bonsai"));
assertThat(doc.get(docMapper.mappers().smartName("file.content_type").mapper().names().indexName()), equalTo("text/html; charset=ISO-8859-1")); assertThat(doc.get(docMapper.mappers().smartName("file.content_type").mapper().names().indexName()), equalTo("text/html; charset=ISO-8859-1"));
assertThat(doc.getField(docMapper.mappers().smartName("file.content_length").mapper().names().indexName()).numericValue().longValue(), is(expectedLength));
} }
@Test @Test
public void testIgnoreWithoutDate() throws Exception { public void testIgnoreWithoutDate() throws Exception {
checkDate("htmlWithoutDateMeta.html", ImmutableSettings.builder().build(), null); checkMeta("htmlWithoutDateMeta.html", ImmutableSettings.builder().build(), null, 300L);
} }
@Test @Test
public void testIgnoreWithEmptyDate() throws Exception { public void testIgnoreWithEmptyDate() throws Exception {
checkDate("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().build(), null); checkMeta("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().build(), null, 334L);
} }
@Test @Test
public void testIgnoreWithCorrectDate() throws Exception { public void testIgnoreWithCorrectDate() throws Exception {
checkDate("htmlWithValidDateMeta.html", ImmutableSettings.builder().build(), 1354233600000L); checkMeta("htmlWithValidDateMeta.html", ImmutableSettings.builder().build(), 1354233600000L, 344L);
} }
@Test @Test
public void testWithoutDate() throws Exception { public void testWithoutDate() throws Exception {
checkDate("htmlWithoutDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null); checkMeta("htmlWithoutDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null, 300L);
} }
@Test(expectedExceptions = MapperParsingException.class) @Test(expectedExceptions = MapperParsingException.class)
public void testWithEmptyDate() throws Exception { public void testWithEmptyDate() throws Exception {
checkDate("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null); checkMeta("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null, null);
} }
@Test @Test
public void testWithCorrectDate() throws Exception { public void testWithCorrectDate() throws Exception {
checkDate("htmlWithValidDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), 1354233600000L); checkMeta("htmlWithValidDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), 1354233600000L, 344L);
} }
} }