Add content_length metadata

We now generate the `content_length` field based on the file size.
Closes #26.
This commit is contained in:
David Pilato 2013-08-20 16:03:31 +02:00
parent 406e295c6c
commit 8c340535d2
3 changed files with 100 additions and 41 deletions

View File

@ -1,5 +1,5 @@
Mapper Attachments Type for ElasticSearch
==================================
=========================================
The mapper attachments plugin adds the `attachment` type to ElasticSearch using Tika.
@ -36,48 +36,72 @@ The `attachment` type is provided as a plugin extension. The plugin is a simple
Using the attachment type is simple, in your mapping JSON, simply set a certain JSON element as attachment, for example:
{
```javascript
{
"person" : {
"properties" : {
"my_attachment" : { "type" : "attachment" }
}
}
}
}
```
In this case, the JSON to index can be:
{
```javascript
{
"my_attachment" : "... base64 encoded attachment ..."
}
}
```
Or it is possible to use a more elaborate JSON structure if the content type or resource name needs to be set explicitly:
{
```javascript
{
"my_attachment" : {
"_content_type" : "application/pdf",
"_name" : "resource/name/of/my.pdf",
"content" : "... base64 encoded attachment ..."
}
}
}
```
The `attachment` type not only indexes the content of the doc, but also automatically adds meta data on the attachment as well (when available). The metadata supported are: `date`, `title`, `author`, and `keywords`. They can be queried using the "dot notation", for example: `my_attachment.author`.
The `attachment` type not only indexes the content of the doc, but also automatically adds metadata about the attachment (when available).
The metadata supported are:
* `date`
* `title`
* `name` (only available if you set `_name`, see above)
* `author`
* `keywords`
* `content_type`
* `content_length` (the length of the original content before text extraction, i.e. the file size)
They can be queried using the "dot notation", for example: `my_attachment.author`.
Both the meta data and the actual content are simple core type mappers (string, date, ...), thus, they can be controlled in the mappings. For example:
{
```javascript
{
"person" : {
"properties" : {
"file" : {
"type" : "attachment",
"fields" : {
"file" : {"index" : "no"},
"title" : {"store" : "yes"},
"date" : {"store" : "yes"},
"author" : {"analyzer" : "myAnalyzer"}
}
"author" : {"analyzer" : "myAnalyzer"},
"keywords" : {"store" : "yes"},
"content_type" : {"store" : "yes"},
"content_length" : {"store" : "yes"}
}
}
}
}
}
```
In the above example, the actual content indexed is mapped under `fields` name `file`, and we decide not to index it, so it will only be available in the `_all` field. The other fields map to their respective metadata names, but there is no need to specify the `type` (like `string` or `date`) since it is already known.

View File

@ -28,14 +28,14 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.mapper.*;
import org.elasticsearch.index.mapper.core.DateFieldMapper;
import org.elasticsearch.index.mapper.core.IntegerFieldMapper;
import org.elasticsearch.index.mapper.core.StringFieldMapper;
import org.elasticsearch.index.mapper.multifield.MultiFieldMapper;
import java.io.IOException;
import java.util.Map;
import static org.elasticsearch.index.mapper.MapperBuilders.dateField;
import static org.elasticsearch.index.mapper.MapperBuilders.stringField;
import static org.elasticsearch.index.mapper.MapperBuilders.*;
import static org.elasticsearch.index.mapper.core.TypeParsers.parsePathType;
import static org.elasticsearch.plugin.mapper.attachments.tika.TikaInstance.tika;
@ -90,6 +90,8 @@ public class AttachmentMapper implements Mapper {
private Mapper.Builder contentTypeBuilder = stringField("content_type");
private Mapper.Builder contentLengthBuilder = integerField("content_length");
public Builder(String name) {
super(name);
this.builder = this;
@ -136,6 +138,11 @@ public class AttachmentMapper implements Mapper {
return this;
}
/**
 * Overrides the builder used for the {@code content_length} sub-field.
 *
 * @param contentLength builder that replaces the default
 *                      {@code integerField("content_length")} builder
 * @return this builder, to allow call chaining
 */
public Builder contentLength(Mapper.Builder contentLength) {
    // The parameter was previously named "contentType" (copy-paste from the
    // sibling setter); the name now matches the field it populates.
    this.contentLengthBuilder = contentLength;
    return this;
}
@Override
public AttachmentMapper build(BuilderContext context) {
ContentPath.Type origPathType = context.path().pathType();
@ -152,6 +159,7 @@ public class AttachmentMapper implements Mapper {
Mapper nameMapper = nameBuilder.build(context);
Mapper keywordsMapper = keywordsBuilder.build(context);
Mapper contentTypeMapper = contentTypeBuilder.build(context);
Mapper contentLength = contentLengthBuilder.build(context);
context.path().remove();
context.path().pathType(origPathType);
@ -170,7 +178,7 @@ public class AttachmentMapper implements Mapper {
ignoreErrors = Boolean.TRUE;
}
return new AttachmentMapper(name, pathType, defaultIndexedChars, ignoreErrors, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper);
return new AttachmentMapper(name, pathType, defaultIndexedChars, ignoreErrors, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper, contentLength);
}
}
@ -185,7 +193,12 @@ public class AttachmentMapper implements Mapper {
* fields : {
* field1 : {type : "binary"},
* title : {store : "yes"},
* date : {store : "yes"}
* date : {store : "yes"},
* name : {store : "yes"},
* author : {store : "yes"},
* keywords : {store : "yes"},
* content_type : {store : "yes"},
* content_length : {store : "yes"}
* }
* }
* </pre>
@ -232,6 +245,8 @@ public class AttachmentMapper implements Mapper {
builder.keywords(parserContext.typeParser(isMultifield? MultiFieldMapper.CONTENT_TYPE:StringFieldMapper.CONTENT_TYPE).parse("keywords", (Map<String, Object>) propNode, parserContext));
} else if ("content_type".equals(propName)) {
builder.contentType(parserContext.typeParser(isMultifield? MultiFieldMapper.CONTENT_TYPE:StringFieldMapper.CONTENT_TYPE).parse("content_type", (Map<String, Object>) propNode, parserContext));
} else if ("content_length".equals(propName)) {
builder.contentLength(parserContext.typeParser(isMultifield? MultiFieldMapper.CONTENT_TYPE: IntegerFieldMapper.CONTENT_TYPE).parse("content_length", (Map<String, Object>) propNode, parserContext));
}
}
}
@ -263,9 +278,11 @@ public class AttachmentMapper implements Mapper {
private final Mapper contentTypeMapper;
private final Mapper contentLengthMapper;
public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Boolean ignoreErrors, Mapper contentMapper,
Mapper dateMapper, Mapper titleMapper, Mapper nameMapper, Mapper authorMapper,
Mapper keywordsMapper, Mapper contentTypeMapper) {
Mapper keywordsMapper, Mapper contentTypeMapper, Mapper contentLengthMapper) {
this.name = name;
this.pathType = pathType;
this.defaultIndexedChars = defaultIndexedChars;
@ -277,6 +294,7 @@ public class AttachmentMapper implements Mapper {
this.authorMapper = authorMapper;
this.keywordsMapper = keywordsMapper;
this.contentTypeMapper = contentTypeMapper;
this.contentLengthMapper = contentLengthMapper;
}
@Override
@ -388,6 +406,20 @@ public class AttachmentMapper implements Mapper {
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_type: {}: {}", e.getMessage(), context.externalValue());
}
try {
if (metadata.get(Metadata.CONTENT_LENGTH) != null) {
// We try to get CONTENT_LENGTH from Tika first
context.externalValue(metadata.get(Metadata.CONTENT_LENGTH));
} else {
// Otherwise, we use our byte[] length
context.externalValue(content.length);
}
contentLengthMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_length: {}: {}", e.getMessage(), context.externalValue());
}
}
@Override
@ -404,6 +436,7 @@ public class AttachmentMapper implements Mapper {
authorMapper.traverse(fieldMapperListener);
keywordsMapper.traverse(fieldMapperListener);
contentTypeMapper.traverse(fieldMapperListener);
contentLengthMapper.traverse(fieldMapperListener);
}
@Override
@ -419,6 +452,7 @@ public class AttachmentMapper implements Mapper {
authorMapper.close();
keywordsMapper.close();
contentTypeMapper.close();
contentLengthMapper.close();
}
@Override
@ -435,6 +469,7 @@ public class AttachmentMapper implements Mapper {
dateMapper.toXContent(builder, params);
keywordsMapper.toXContent(builder, params);
contentTypeMapper.toXContent(builder, params);
contentLengthMapper.toXContent(builder, params);
builder.endObject();
builder.endObject();

View File

@ -25,7 +25,7 @@ import static org.hamcrest.Matchers.*;
*/
public class MetadataMapperTest {
protected void checkDate(String filename, Settings settings, Long expected) throws IOException {
protected void checkMeta(String filename, Settings settings, Long expectedDate, Long expectedLength) throws IOException {
DocumentMapperParser mapperParser = new DocumentMapperParser(new Index("test"), settings, new AnalysisService(new Index("test")), null, null);
mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser());
@ -45,45 +45,45 @@ public class MetadataMapperTest {
Document doc = docMapper.parse(json).rootDoc();
assertThat(doc.get(docMapper.mappers().smartName("file").mapper().names().indexName()), containsString("World"));
assertThat(doc.get(docMapper.mappers().smartName("file.name").mapper().names().indexName()), equalTo(filename));
if (expected == null) {
if (expectedDate == null) {
assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()), nullValue());
} else {
assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()).numericValue().longValue(), is(expected));
assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()).numericValue().longValue(), is(expectedDate));
}
assertThat(doc.get(docMapper.mappers().smartName("file.title").mapper().names().indexName()), equalTo("Hello"));
assertThat(doc.get(docMapper.mappers().smartName("file.author").mapper().names().indexName()), equalTo("kimchy"));
assertThat(doc.get(docMapper.mappers().smartName("file.keywords").mapper().names().indexName()), equalTo("elasticsearch,cool,bonsai"));
assertThat(doc.get(docMapper.mappers().smartName("file.content_type").mapper().names().indexName()), equalTo("text/html; charset=ISO-8859-1"));
assertThat(doc.getField(docMapper.mappers().smartName("file.content_length").mapper().names().indexName()).numericValue().longValue(), is(expectedLength));
}
@Test
public void testIgnoreWithoutDate() throws Exception {
checkDate("htmlWithoutDateMeta.html", ImmutableSettings.builder().build(), null);
checkMeta("htmlWithoutDateMeta.html", ImmutableSettings.builder().build(), null, 300L);
}
@Test
public void testIgnoreWithEmptyDate() throws Exception {
checkDate("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().build(), null);
checkMeta("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().build(), null, 334L);
}
@Test
public void testIgnoreWithCorrectDate() throws Exception {
checkDate("htmlWithValidDateMeta.html", ImmutableSettings.builder().build(), 1354233600000L);
checkMeta("htmlWithValidDateMeta.html", ImmutableSettings.builder().build(), 1354233600000L, 344L);
}
@Test
public void testWithoutDate() throws Exception {
checkDate("htmlWithoutDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null);
checkMeta("htmlWithoutDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null, 300L);
}
@Test(expectedExceptions = MapperParsingException.class)
public void testWithEmptyDate() throws Exception {
checkDate("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null);
checkMeta("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null, null);
}
@Test
public void testWithCorrectDate() throws Exception {
checkDate("htmlWithValidDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), 1354233600000L);
checkMeta("htmlWithValidDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), 1354233600000L, 344L);
}
}