Add content_length metadata
We now generate the `content_length` field based on file size. Closes #26.
This commit is contained in:
parent
406e295c6c
commit
8c340535d2
76
README.md
76
README.md
|
@ -1,5 +1,5 @@
|
|||
Mapper Attachments Type for ElasticSearch
|
||||
==================================
|
||||
=========================================
|
||||
|
||||
The mapper attachments plugin adds the `attachment` type to ElasticSearch using Tika.
|
||||
|
||||
|
@ -36,48 +36,72 @@ The `attachment` type is provided as a plugin extension. The plugin is a simple
|
|||
|
||||
Using the attachment type is simple: in your mapping JSON, set the type of a JSON element to `attachment`, for example:
|
||||
|
||||
{
|
||||
"person" : {
|
||||
"properties" : {
|
||||
"my_attachment" : { "type" : "attachment" }
|
||||
}
|
||||
```javascript
|
||||
{
|
||||
"person" : {
|
||||
"properties" : {
|
||||
"my_attachment" : { "type" : "attachment" }
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
In this case, the JSON to index can be:
|
||||
|
||||
{
|
||||
"my_attachment" : "... base64 encoded attachment ..."
|
||||
}
|
||||
```javascript
|
||||
{
|
||||
"my_attachment" : "... base64 encoded attachment ..."
|
||||
}
|
||||
```
|
||||
|
||||
Alternatively, a more elaborate JSON form can be used if the content type or resource name needs to be set explicitly:
|
||||
|
||||
{
|
||||
"my_attachment" : {
|
||||
"_content_type" : "application/pdf",
|
||||
"_name" : "resource/name/of/my.pdf",
|
||||
"content" : "... base64 encoded attachment ..."
|
||||
}
|
||||
```javascript
|
||||
{
|
||||
"my_attachment" : {
|
||||
"_content_type" : "application/pdf",
|
||||
"_name" : "resource/name/of/my.pdf",
|
||||
"content" : "... base64 encoded attachment ..."
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The `attachment` type not only indexes the content of the doc, but also automatically adds meta data on the attachment as well (when available). The metadata supported are: `date`, `title`, `author`, and `keywords`. They can be queried using the "dot notation", for example: `my_attachment.author`.
|
||||
The `attachment` type not only indexes the content of the doc, but also automatically adds metadata about the attachment (when available).
|
||||
|
||||
The metadata supported are:
|
||||
|
||||
* `date`
|
||||
* `title`
|
||||
* `name` (only available if you set `_name`; see above)
|
||||
* `author`
|
||||
* `keywords`
|
||||
* `content_type`
|
||||
* `content_length` (the original content length before text extraction, i.e. the file size)
|
||||
|
||||
They can be queried using the "dot notation", for example: `my_attachment.author`.
|
||||
|
||||
Both the meta data and the actual content are simple core type mappers (string, date, ...), thus, they can be controlled in the mappings. For example:
|
||||
|
||||
{
|
||||
"person" : {
|
||||
"properties" : {
|
||||
"file" : {
|
||||
"type" : "attachment",
|
||||
"fields" : {
|
||||
"file" : {"index" : "no"},
|
||||
"date" : {"store" : "yes"},
|
||||
"author" : {"analyzer" : "myAnalyzer"}
|
||||
}
|
||||
```javascript
|
||||
{
|
||||
"person" : {
|
||||
"properties" : {
|
||||
"file" : {
|
||||
"type" : "attachment",
|
||||
"fields" : {
|
||||
"file" : {"index" : "no"},
|
||||
"title" : {"store" : "yes"},
|
||||
"date" : {"store" : "yes"},
|
||||
"author" : {"analyzer" : "myAnalyzer"},
|
||||
"keywords" : {"store" : "yes"},
|
||||
"content_type" : {"store" : "yes"},
|
||||
"content_length" : {"store" : "yes"}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
In the above example, the actual content indexed is mapped under `fields` name `file`, and we decide not to index it, so it will only be available in the `_all` field. The other fields map to their respective metadata names, but there is no need to specify the `type` (like `string` or `date`) since it is already known.
|
||||
|
||||
|
|
|
@ -28,14 +28,14 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
|
|||
import org.elasticsearch.common.xcontent.XContentParser;
|
||||
import org.elasticsearch.index.mapper.*;
|
||||
import org.elasticsearch.index.mapper.core.DateFieldMapper;
|
||||
import org.elasticsearch.index.mapper.core.IntegerFieldMapper;
|
||||
import org.elasticsearch.index.mapper.core.StringFieldMapper;
|
||||
import org.elasticsearch.index.mapper.multifield.MultiFieldMapper;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.elasticsearch.index.mapper.MapperBuilders.dateField;
|
||||
import static org.elasticsearch.index.mapper.MapperBuilders.stringField;
|
||||
import static org.elasticsearch.index.mapper.MapperBuilders.*;
|
||||
import static org.elasticsearch.index.mapper.core.TypeParsers.parsePathType;
|
||||
import static org.elasticsearch.plugin.mapper.attachments.tika.TikaInstance.tika;
|
||||
|
||||
|
@ -90,6 +90,8 @@ public class AttachmentMapper implements Mapper {
|
|||
|
||||
private Mapper.Builder contentTypeBuilder = stringField("content_type");
|
||||
|
||||
private Mapper.Builder contentLengthBuilder = integerField("content_length");
|
||||
|
||||
public Builder(String name) {
|
||||
super(name);
|
||||
this.builder = this;
|
||||
|
@ -136,6 +138,11 @@ public class AttachmentMapper implements Mapper {
|
|||
return this;
|
||||
}
|
||||
|
||||
public Builder contentLength(Mapper.Builder contentType) {
|
||||
this.contentLengthBuilder = contentType;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public AttachmentMapper build(BuilderContext context) {
|
||||
ContentPath.Type origPathType = context.path().pathType();
|
||||
|
@ -152,6 +159,7 @@ public class AttachmentMapper implements Mapper {
|
|||
Mapper nameMapper = nameBuilder.build(context);
|
||||
Mapper keywordsMapper = keywordsBuilder.build(context);
|
||||
Mapper contentTypeMapper = contentTypeBuilder.build(context);
|
||||
Mapper contentLength = contentLengthBuilder.build(context);
|
||||
context.path().remove();
|
||||
|
||||
context.path().pathType(origPathType);
|
||||
|
@ -170,7 +178,7 @@ public class AttachmentMapper implements Mapper {
|
|||
ignoreErrors = Boolean.TRUE;
|
||||
}
|
||||
|
||||
return new AttachmentMapper(name, pathType, defaultIndexedChars, ignoreErrors, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper);
|
||||
return new AttachmentMapper(name, pathType, defaultIndexedChars, ignoreErrors, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper, contentLength);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -185,7 +193,12 @@ public class AttachmentMapper implements Mapper {
|
|||
* fields : {
|
||||
* field1 : {type : "binary"},
|
||||
* title : {store : "yes"},
|
||||
* date : {store : "yes"}
|
||||
* date : {store : "yes"},
|
||||
* name : {store : "yes"},
|
||||
* author : {store : "yes"},
|
||||
* keywords : {store : "yes"},
|
||||
* content_type : {store : "yes"},
|
||||
* content_length : {store : "yes"}
|
||||
* }
|
||||
* }
|
||||
* </pre>
|
||||
|
@ -232,6 +245,8 @@ public class AttachmentMapper implements Mapper {
|
|||
builder.keywords(parserContext.typeParser(isMultifield? MultiFieldMapper.CONTENT_TYPE:StringFieldMapper.CONTENT_TYPE).parse("keywords", (Map<String, Object>) propNode, parserContext));
|
||||
} else if ("content_type".equals(propName)) {
|
||||
builder.contentType(parserContext.typeParser(isMultifield? MultiFieldMapper.CONTENT_TYPE:StringFieldMapper.CONTENT_TYPE).parse("content_type", (Map<String, Object>) propNode, parserContext));
|
||||
} else if ("content_length".equals(propName)) {
|
||||
builder.contentLength(parserContext.typeParser(isMultifield? MultiFieldMapper.CONTENT_TYPE: IntegerFieldMapper.CONTENT_TYPE).parse("content_length", (Map<String, Object>) propNode, parserContext));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -263,9 +278,11 @@ public class AttachmentMapper implements Mapper {
|
|||
|
||||
private final Mapper contentTypeMapper;
|
||||
|
||||
private final Mapper contentLengthMapper;
|
||||
|
||||
public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Boolean ignoreErrors, Mapper contentMapper,
|
||||
Mapper dateMapper, Mapper titleMapper, Mapper nameMapper, Mapper authorMapper,
|
||||
Mapper keywordsMapper, Mapper contentTypeMapper) {
|
||||
Mapper keywordsMapper, Mapper contentTypeMapper, Mapper contentLengthMapper) {
|
||||
this.name = name;
|
||||
this.pathType = pathType;
|
||||
this.defaultIndexedChars = defaultIndexedChars;
|
||||
|
@ -277,6 +294,7 @@ public class AttachmentMapper implements Mapper {
|
|||
this.authorMapper = authorMapper;
|
||||
this.keywordsMapper = keywordsMapper;
|
||||
this.contentTypeMapper = contentTypeMapper;
|
||||
this.contentLengthMapper = contentLengthMapper;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -388,6 +406,20 @@ public class AttachmentMapper implements Mapper {
|
|||
if (!ignoreErrors) throw e;
|
||||
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_type: {}: {}", e.getMessage(), context.externalValue());
|
||||
}
|
||||
|
||||
try {
|
||||
if (metadata.get(Metadata.CONTENT_LENGTH) != null) {
|
||||
// We try to get CONTENT_LENGTH from Tika first
|
||||
context.externalValue(metadata.get(Metadata.CONTENT_LENGTH));
|
||||
} else {
|
||||
// Otherwise, we use our byte[] length
|
||||
context.externalValue(content.length);
|
||||
}
|
||||
contentLengthMapper.parse(context);
|
||||
} catch(MapperParsingException e){
|
||||
if (!ignoreErrors) throw e;
|
||||
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_length: {}: {}", e.getMessage(), context.externalValue());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -404,6 +436,7 @@ public class AttachmentMapper implements Mapper {
|
|||
authorMapper.traverse(fieldMapperListener);
|
||||
keywordsMapper.traverse(fieldMapperListener);
|
||||
contentTypeMapper.traverse(fieldMapperListener);
|
||||
contentLengthMapper.traverse(fieldMapperListener);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -419,6 +452,7 @@ public class AttachmentMapper implements Mapper {
|
|||
authorMapper.close();
|
||||
keywordsMapper.close();
|
||||
contentTypeMapper.close();
|
||||
contentLengthMapper.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -435,6 +469,7 @@ public class AttachmentMapper implements Mapper {
|
|||
dateMapper.toXContent(builder, params);
|
||||
keywordsMapper.toXContent(builder, params);
|
||||
contentTypeMapper.toXContent(builder, params);
|
||||
contentLengthMapper.toXContent(builder, params);
|
||||
builder.endObject();
|
||||
|
||||
builder.endObject();
|
||||
|
|
|
@ -25,7 +25,7 @@ import static org.hamcrest.Matchers.*;
|
|||
*/
|
||||
public class MetadataMapperTest {
|
||||
|
||||
protected void checkDate(String filename, Settings settings, Long expected) throws IOException {
|
||||
protected void checkMeta(String filename, Settings settings, Long expectedDate, Long expectedLength) throws IOException {
|
||||
DocumentMapperParser mapperParser = new DocumentMapperParser(new Index("test"), settings, new AnalysisService(new Index("test")), null, null);
|
||||
mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser());
|
||||
|
||||
|
@ -45,45 +45,45 @@ public class MetadataMapperTest {
|
|||
Document doc = docMapper.parse(json).rootDoc();
|
||||
assertThat(doc.get(docMapper.mappers().smartName("file").mapper().names().indexName()), containsString("World"));
|
||||
assertThat(doc.get(docMapper.mappers().smartName("file.name").mapper().names().indexName()), equalTo(filename));
|
||||
if (expected == null) {
|
||||
if (expectedDate == null) {
|
||||
assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()), nullValue());
|
||||
} else {
|
||||
assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()).numericValue().longValue(), is(expected));
|
||||
assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()).numericValue().longValue(), is(expectedDate));
|
||||
}
|
||||
assertThat(doc.get(docMapper.mappers().smartName("file.title").mapper().names().indexName()), equalTo("Hello"));
|
||||
assertThat(doc.get(docMapper.mappers().smartName("file.author").mapper().names().indexName()), equalTo("kimchy"));
|
||||
assertThat(doc.get(docMapper.mappers().smartName("file.keywords").mapper().names().indexName()), equalTo("elasticsearch,cool,bonsai"));
|
||||
assertThat(doc.get(docMapper.mappers().smartName("file.content_type").mapper().names().indexName()), equalTo("text/html; charset=ISO-8859-1"));
|
||||
assertThat(doc.getField(docMapper.mappers().smartName("file.content_length").mapper().names().indexName()).numericValue().longValue(), is(expectedLength));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIgnoreWithoutDate() throws Exception {
|
||||
checkDate("htmlWithoutDateMeta.html", ImmutableSettings.builder().build(), null);
|
||||
checkMeta("htmlWithoutDateMeta.html", ImmutableSettings.builder().build(), null, 300L);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIgnoreWithEmptyDate() throws Exception {
|
||||
checkDate("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().build(), null);
|
||||
checkMeta("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().build(), null, 334L);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIgnoreWithCorrectDate() throws Exception {
|
||||
checkDate("htmlWithValidDateMeta.html", ImmutableSettings.builder().build(), 1354233600000L);
|
||||
checkMeta("htmlWithValidDateMeta.html", ImmutableSettings.builder().build(), 1354233600000L, 344L);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWithoutDate() throws Exception {
|
||||
checkDate("htmlWithoutDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null);
|
||||
checkMeta("htmlWithoutDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null, 300L);
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = MapperParsingException.class)
|
||||
public void testWithEmptyDate() throws Exception {
|
||||
checkDate("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null);
|
||||
checkMeta("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null, null);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWithCorrectDate() throws Exception {
|
||||
checkDate("htmlWithValidDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), 1354233600000L);
|
||||
checkMeta("htmlWithValidDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), 1354233600000L, 344L);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue