Add content_length metadata
We now generate the `content_length` field based on file size. Closes #26.
This commit is contained in:
parent
406e295c6c
commit
8c340535d2
76
README.md
76
README.md
|
@ -1,5 +1,5 @@
|
||||||
Mapper Attachments Type for ElasticSearch
|
Mapper Attachments Type for ElasticSearch
|
||||||
==================================
|
=========================================
|
||||||
|
|
||||||
The mapper attachments plugin adds the `attachment` type to ElasticSearch using Tika.
|
The mapper attachments plugin adds the `attachment` type to ElasticSearch using Tika.
|
||||||
|
|
||||||
|
@ -36,48 +36,72 @@ The `attachment` type is provided as a plugin extension. The plugin is a simple
|
||||||
|
|
||||||
Using the attachment type is simple, in your mapping JSON, simply set a certain JSON element as attachment, for example:
|
Using the attachment type is simple, in your mapping JSON, simply set a certain JSON element as attachment, for example:
|
||||||
|
|
||||||
{
|
```javascript
|
||||||
"person" : {
|
{
|
||||||
"properties" : {
|
"person" : {
|
||||||
"my_attachment" : { "type" : "attachment" }
|
"properties" : {
|
||||||
}
|
"my_attachment" : { "type" : "attachment" }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
In this case, the JSON to index can be:
|
In this case, the JSON to index can be:
|
||||||
|
|
||||||
{
|
```javascript
|
||||||
"my_attachment" : "... base64 encoded attachment ..."
|
{
|
||||||
}
|
"my_attachment" : "... base64 encoded attachment ..."
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
Or it is possible to use more elaborate JSON if the content type or resource name needs to be set explicitly:
|
Or it is possible to use more elaborate JSON if the content type or resource name needs to be set explicitly:
|
||||||
|
|
||||||
{
|
```javascript
|
||||||
"my_attachment" : {
|
{
|
||||||
"_content_type" : "application/pdf",
|
"my_attachment" : {
|
||||||
"_name" : "resource/name/of/my.pdf",
|
"_content_type" : "application/pdf",
|
||||||
"content" : "... base64 encoded attachment ..."
|
"_name" : "resource/name/of/my.pdf",
|
||||||
}
|
"content" : "... base64 encoded attachment ..."
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
The `attachment` type not only indexes the content of the doc, but also automatically adds meta data on the attachment as well (when available). The metadata supported are: `date`, `title`, `author`, and `keywords`. They can be queried using the "dot notation", for example: `my_attachment.author`.
|
The `attachment` type not only indexes the content of the doc, but also automatically adds metadata about the attachment (when available).
|
||||||
|
|
||||||
|
The supported metadata fields are:
|
||||||
|
|
||||||
|
* `date`
|
||||||
|
* `title`
|
||||||
|
* `name` (only available if you set `_name`; see above)
|
||||||
|
* `author`
|
||||||
|
* `keywords`
|
||||||
|
* `content_type`
|
||||||
|
* `content_length`: the original content length before text extraction (i.e. the file size)
|
||||||
|
|
||||||
|
They can be queried using the "dot notation", for example: `my_attachment.author`.
|
||||||
|
|
||||||
Both the meta data and the actual content are simple core type mappers (string, date, ...), thus, they can be controlled in the mappings. For example:
|
Both the meta data and the actual content are simple core type mappers (string, date, ...), thus, they can be controlled in the mappings. For example:
|
||||||
|
|
||||||
{
|
```javascript
|
||||||
"person" : {
|
{
|
||||||
"properties" : {
|
"person" : {
|
||||||
"file" : {
|
"properties" : {
|
||||||
"type" : "attachment",
|
"file" : {
|
||||||
"fields" : {
|
"type" : "attachment",
|
||||||
"file" : {"index" : "no"},
|
"fields" : {
|
||||||
"date" : {"store" : "yes"},
|
"file" : {"index" : "no"},
|
||||||
"author" : {"analyzer" : "myAnalyzer"}
|
"title" : {"store" : "yes"},
|
||||||
}
|
"date" : {"store" : "yes"},
|
||||||
|
"author" : {"analyzer" : "myAnalyzer"},
|
||||||
|
"keywords" : {"store" : "yes"},
|
||||||
|
"content_type" : {"store" : "yes"},
|
||||||
|
"content_length" : {"store" : "yes"}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
In the above example, the actual content indexed is mapped under `fields` name `file`, and we decide not to index it, so it will only be available in the `_all` field. The other fields map to their respective metadata names, but there is no need to specify the `type` (like `string` or `date`) since it is already known.
|
In the above example, the actual content indexed is mapped under `fields` name `file`, and we decide not to index it, so it will only be available in the `_all` field. The other fields map to their respective metadata names, but there is no need to specify the `type` (like `string` or `date`) since it is already known.
|
||||||
|
|
||||||
|
|
|
@ -28,14 +28,14 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||||
import org.elasticsearch.common.xcontent.XContentParser;
|
import org.elasticsearch.common.xcontent.XContentParser;
|
||||||
import org.elasticsearch.index.mapper.*;
|
import org.elasticsearch.index.mapper.*;
|
||||||
import org.elasticsearch.index.mapper.core.DateFieldMapper;
|
import org.elasticsearch.index.mapper.core.DateFieldMapper;
|
||||||
|
import org.elasticsearch.index.mapper.core.IntegerFieldMapper;
|
||||||
import org.elasticsearch.index.mapper.core.StringFieldMapper;
|
import org.elasticsearch.index.mapper.core.StringFieldMapper;
|
||||||
import org.elasticsearch.index.mapper.multifield.MultiFieldMapper;
|
import org.elasticsearch.index.mapper.multifield.MultiFieldMapper;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import static org.elasticsearch.index.mapper.MapperBuilders.dateField;
|
import static org.elasticsearch.index.mapper.MapperBuilders.*;
|
||||||
import static org.elasticsearch.index.mapper.MapperBuilders.stringField;
|
|
||||||
import static org.elasticsearch.index.mapper.core.TypeParsers.parsePathType;
|
import static org.elasticsearch.index.mapper.core.TypeParsers.parsePathType;
|
||||||
import static org.elasticsearch.plugin.mapper.attachments.tika.TikaInstance.tika;
|
import static org.elasticsearch.plugin.mapper.attachments.tika.TikaInstance.tika;
|
||||||
|
|
||||||
|
@ -90,6 +90,8 @@ public class AttachmentMapper implements Mapper {
|
||||||
|
|
||||||
private Mapper.Builder contentTypeBuilder = stringField("content_type");
|
private Mapper.Builder contentTypeBuilder = stringField("content_type");
|
||||||
|
|
||||||
|
private Mapper.Builder contentLengthBuilder = integerField("content_length");
|
||||||
|
|
||||||
public Builder(String name) {
|
public Builder(String name) {
|
||||||
super(name);
|
super(name);
|
||||||
this.builder = this;
|
this.builder = this;
|
||||||
|
@ -136,6 +138,11 @@ public class AttachmentMapper implements Mapper {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Builder contentLength(Mapper.Builder contentType) {
|
||||||
|
this.contentLengthBuilder = contentType;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public AttachmentMapper build(BuilderContext context) {
|
public AttachmentMapper build(BuilderContext context) {
|
||||||
ContentPath.Type origPathType = context.path().pathType();
|
ContentPath.Type origPathType = context.path().pathType();
|
||||||
|
@ -152,6 +159,7 @@ public class AttachmentMapper implements Mapper {
|
||||||
Mapper nameMapper = nameBuilder.build(context);
|
Mapper nameMapper = nameBuilder.build(context);
|
||||||
Mapper keywordsMapper = keywordsBuilder.build(context);
|
Mapper keywordsMapper = keywordsBuilder.build(context);
|
||||||
Mapper contentTypeMapper = contentTypeBuilder.build(context);
|
Mapper contentTypeMapper = contentTypeBuilder.build(context);
|
||||||
|
Mapper contentLength = contentLengthBuilder.build(context);
|
||||||
context.path().remove();
|
context.path().remove();
|
||||||
|
|
||||||
context.path().pathType(origPathType);
|
context.path().pathType(origPathType);
|
||||||
|
@ -170,7 +178,7 @@ public class AttachmentMapper implements Mapper {
|
||||||
ignoreErrors = Boolean.TRUE;
|
ignoreErrors = Boolean.TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
return new AttachmentMapper(name, pathType, defaultIndexedChars, ignoreErrors, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper);
|
return new AttachmentMapper(name, pathType, defaultIndexedChars, ignoreErrors, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper, contentLength);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -185,7 +193,12 @@ public class AttachmentMapper implements Mapper {
|
||||||
* fields : {
|
* fields : {
|
||||||
* field1 : {type : "binary"},
|
* field1 : {type : "binary"},
|
||||||
* title : {store : "yes"},
|
* title : {store : "yes"},
|
||||||
* date : {store : "yes"}
|
* date : {store : "yes"},
|
||||||
|
* name : {store : "yes"},
|
||||||
|
* author : {store : "yes"},
|
||||||
|
* keywords : {store : "yes"},
|
||||||
|
* content_type : {store : "yes"},
|
||||||
|
* content_length : {store : "yes"}
|
||||||
* }
|
* }
|
||||||
* }
|
* }
|
||||||
* </pre>
|
* </pre>
|
||||||
|
@ -232,6 +245,8 @@ public class AttachmentMapper implements Mapper {
|
||||||
builder.keywords(parserContext.typeParser(isMultifield? MultiFieldMapper.CONTENT_TYPE:StringFieldMapper.CONTENT_TYPE).parse("keywords", (Map<String, Object>) propNode, parserContext));
|
builder.keywords(parserContext.typeParser(isMultifield? MultiFieldMapper.CONTENT_TYPE:StringFieldMapper.CONTENT_TYPE).parse("keywords", (Map<String, Object>) propNode, parserContext));
|
||||||
} else if ("content_type".equals(propName)) {
|
} else if ("content_type".equals(propName)) {
|
||||||
builder.contentType(parserContext.typeParser(isMultifield? MultiFieldMapper.CONTENT_TYPE:StringFieldMapper.CONTENT_TYPE).parse("content_type", (Map<String, Object>) propNode, parserContext));
|
builder.contentType(parserContext.typeParser(isMultifield? MultiFieldMapper.CONTENT_TYPE:StringFieldMapper.CONTENT_TYPE).parse("content_type", (Map<String, Object>) propNode, parserContext));
|
||||||
|
} else if ("content_length".equals(propName)) {
|
||||||
|
builder.contentLength(parserContext.typeParser(isMultifield? MultiFieldMapper.CONTENT_TYPE: IntegerFieldMapper.CONTENT_TYPE).parse("content_length", (Map<String, Object>) propNode, parserContext));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -263,9 +278,11 @@ public class AttachmentMapper implements Mapper {
|
||||||
|
|
||||||
private final Mapper contentTypeMapper;
|
private final Mapper contentTypeMapper;
|
||||||
|
|
||||||
|
private final Mapper contentLengthMapper;
|
||||||
|
|
||||||
public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Boolean ignoreErrors, Mapper contentMapper,
|
public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Boolean ignoreErrors, Mapper contentMapper,
|
||||||
Mapper dateMapper, Mapper titleMapper, Mapper nameMapper, Mapper authorMapper,
|
Mapper dateMapper, Mapper titleMapper, Mapper nameMapper, Mapper authorMapper,
|
||||||
Mapper keywordsMapper, Mapper contentTypeMapper) {
|
Mapper keywordsMapper, Mapper contentTypeMapper, Mapper contentLengthMapper) {
|
||||||
this.name = name;
|
this.name = name;
|
||||||
this.pathType = pathType;
|
this.pathType = pathType;
|
||||||
this.defaultIndexedChars = defaultIndexedChars;
|
this.defaultIndexedChars = defaultIndexedChars;
|
||||||
|
@ -277,6 +294,7 @@ public class AttachmentMapper implements Mapper {
|
||||||
this.authorMapper = authorMapper;
|
this.authorMapper = authorMapper;
|
||||||
this.keywordsMapper = keywordsMapper;
|
this.keywordsMapper = keywordsMapper;
|
||||||
this.contentTypeMapper = contentTypeMapper;
|
this.contentTypeMapper = contentTypeMapper;
|
||||||
|
this.contentLengthMapper = contentLengthMapper;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -388,6 +406,20 @@ public class AttachmentMapper implements Mapper {
|
||||||
if (!ignoreErrors) throw e;
|
if (!ignoreErrors) throw e;
|
||||||
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_type: {}: {}", e.getMessage(), context.externalValue());
|
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_type: {}: {}", e.getMessage(), context.externalValue());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
if (metadata.get(Metadata.CONTENT_LENGTH) != null) {
|
||||||
|
// We try to get CONTENT_LENGTH from Tika first
|
||||||
|
context.externalValue(metadata.get(Metadata.CONTENT_LENGTH));
|
||||||
|
} else {
|
||||||
|
// Otherwise, we use our byte[] length
|
||||||
|
context.externalValue(content.length);
|
||||||
|
}
|
||||||
|
contentLengthMapper.parse(context);
|
||||||
|
} catch(MapperParsingException e){
|
||||||
|
if (!ignoreErrors) throw e;
|
||||||
|
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_length: {}: {}", e.getMessage(), context.externalValue());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -404,6 +436,7 @@ public class AttachmentMapper implements Mapper {
|
||||||
authorMapper.traverse(fieldMapperListener);
|
authorMapper.traverse(fieldMapperListener);
|
||||||
keywordsMapper.traverse(fieldMapperListener);
|
keywordsMapper.traverse(fieldMapperListener);
|
||||||
contentTypeMapper.traverse(fieldMapperListener);
|
contentTypeMapper.traverse(fieldMapperListener);
|
||||||
|
contentLengthMapper.traverse(fieldMapperListener);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -419,6 +452,7 @@ public class AttachmentMapper implements Mapper {
|
||||||
authorMapper.close();
|
authorMapper.close();
|
||||||
keywordsMapper.close();
|
keywordsMapper.close();
|
||||||
contentTypeMapper.close();
|
contentTypeMapper.close();
|
||||||
|
contentLengthMapper.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -435,6 +469,7 @@ public class AttachmentMapper implements Mapper {
|
||||||
dateMapper.toXContent(builder, params);
|
dateMapper.toXContent(builder, params);
|
||||||
keywordsMapper.toXContent(builder, params);
|
keywordsMapper.toXContent(builder, params);
|
||||||
contentTypeMapper.toXContent(builder, params);
|
contentTypeMapper.toXContent(builder, params);
|
||||||
|
contentLengthMapper.toXContent(builder, params);
|
||||||
builder.endObject();
|
builder.endObject();
|
||||||
|
|
||||||
builder.endObject();
|
builder.endObject();
|
||||||
|
|
|
@ -25,7 +25,7 @@ import static org.hamcrest.Matchers.*;
|
||||||
*/
|
*/
|
||||||
public class MetadataMapperTest {
|
public class MetadataMapperTest {
|
||||||
|
|
||||||
protected void checkDate(String filename, Settings settings, Long expected) throws IOException {
|
protected void checkMeta(String filename, Settings settings, Long expectedDate, Long expectedLength) throws IOException {
|
||||||
DocumentMapperParser mapperParser = new DocumentMapperParser(new Index("test"), settings, new AnalysisService(new Index("test")), null, null);
|
DocumentMapperParser mapperParser = new DocumentMapperParser(new Index("test"), settings, new AnalysisService(new Index("test")), null, null);
|
||||||
mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser());
|
mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser());
|
||||||
|
|
||||||
|
@ -45,45 +45,45 @@ public class MetadataMapperTest {
|
||||||
Document doc = docMapper.parse(json).rootDoc();
|
Document doc = docMapper.parse(json).rootDoc();
|
||||||
assertThat(doc.get(docMapper.mappers().smartName("file").mapper().names().indexName()), containsString("World"));
|
assertThat(doc.get(docMapper.mappers().smartName("file").mapper().names().indexName()), containsString("World"));
|
||||||
assertThat(doc.get(docMapper.mappers().smartName("file.name").mapper().names().indexName()), equalTo(filename));
|
assertThat(doc.get(docMapper.mappers().smartName("file.name").mapper().names().indexName()), equalTo(filename));
|
||||||
if (expected == null) {
|
if (expectedDate == null) {
|
||||||
assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()), nullValue());
|
assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()), nullValue());
|
||||||
} else {
|
} else {
|
||||||
assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()).numericValue().longValue(), is(expected));
|
assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()).numericValue().longValue(), is(expectedDate));
|
||||||
}
|
}
|
||||||
assertThat(doc.get(docMapper.mappers().smartName("file.title").mapper().names().indexName()), equalTo("Hello"));
|
assertThat(doc.get(docMapper.mappers().smartName("file.title").mapper().names().indexName()), equalTo("Hello"));
|
||||||
assertThat(doc.get(docMapper.mappers().smartName("file.author").mapper().names().indexName()), equalTo("kimchy"));
|
assertThat(doc.get(docMapper.mappers().smartName("file.author").mapper().names().indexName()), equalTo("kimchy"));
|
||||||
assertThat(doc.get(docMapper.mappers().smartName("file.keywords").mapper().names().indexName()), equalTo("elasticsearch,cool,bonsai"));
|
assertThat(doc.get(docMapper.mappers().smartName("file.keywords").mapper().names().indexName()), equalTo("elasticsearch,cool,bonsai"));
|
||||||
assertThat(doc.get(docMapper.mappers().smartName("file.content_type").mapper().names().indexName()), equalTo("text/html; charset=ISO-8859-1"));
|
assertThat(doc.get(docMapper.mappers().smartName("file.content_type").mapper().names().indexName()), equalTo("text/html; charset=ISO-8859-1"));
|
||||||
|
assertThat(doc.getField(docMapper.mappers().smartName("file.content_length").mapper().names().indexName()).numericValue().longValue(), is(expectedLength));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testIgnoreWithoutDate() throws Exception {
|
public void testIgnoreWithoutDate() throws Exception {
|
||||||
checkDate("htmlWithoutDateMeta.html", ImmutableSettings.builder().build(), null);
|
checkMeta("htmlWithoutDateMeta.html", ImmutableSettings.builder().build(), null, 300L);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testIgnoreWithEmptyDate() throws Exception {
|
public void testIgnoreWithEmptyDate() throws Exception {
|
||||||
checkDate("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().build(), null);
|
checkMeta("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().build(), null, 334L);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testIgnoreWithCorrectDate() throws Exception {
|
public void testIgnoreWithCorrectDate() throws Exception {
|
||||||
checkDate("htmlWithValidDateMeta.html", ImmutableSettings.builder().build(), 1354233600000L);
|
checkMeta("htmlWithValidDateMeta.html", ImmutableSettings.builder().build(), 1354233600000L, 344L);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testWithoutDate() throws Exception {
|
public void testWithoutDate() throws Exception {
|
||||||
checkDate("htmlWithoutDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null);
|
checkMeta("htmlWithoutDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null, 300L);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test(expectedExceptions = MapperParsingException.class)
|
@Test(expectedExceptions = MapperParsingException.class)
|
||||||
public void testWithEmptyDate() throws Exception {
|
public void testWithEmptyDate() throws Exception {
|
||||||
checkDate("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null);
|
checkMeta("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null, null);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testWithCorrectDate() throws Exception {
|
public void testWithCorrectDate() throws Exception {
|
||||||
checkDate("htmlWithValidDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), 1354233600000L);
|
checkMeta("htmlWithValidDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), 1354233600000L, 344L);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue