From 019d0f9a2610f6d1b2eace9cc47d38ca9d8a0bbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Camblor?= Date: Fri, 30 Nov 2012 20:25:31 +0100 Subject: [PATCH] Don't reject full document in case of invalid metadata From original PR #17 from @fcamblor If you try to index a document with an invalid metadata, the full document is rejected. For example: ```html Hello World ``` has a non parseable date. This fix add a new option that ignore parsing errors `"index.mapping.attachment.ignore_errors":true` (default to `true`). Closes #17, #38. --- README.md | 8 ++ .../mapper/attachment/AttachmentMapper.java | 81 ++++++++++++----- .../mapper/xcontent/MetadataMapperTest.java | 89 +++++++++++++++++++ .../xcontent/htmlWithEmptyDateMeta.html | 11 +++ .../xcontent/htmlWithValidDateMeta.html | 11 +++ .../mapper/xcontent/htmlWithoutDateMeta.html | 10 +++ .../index/mapper/xcontent/test-mapping.json | 8 +- 7 files changed, 194 insertions(+), 24 deletions(-) create mode 100644 src/test/java/org/elasticsearch/index/mapper/xcontent/MetadataMapperTest.java create mode 100644 src/test/resources/org/elasticsearch/index/mapper/xcontent/htmlWithEmptyDateMeta.html create mode 100644 src/test/resources/org/elasticsearch/index/mapper/xcontent/htmlWithValidDateMeta.html create mode 100644 src/test/resources/org/elasticsearch/index/mapper/xcontent/htmlWithoutDateMeta.html diff --git a/README.md b/README.md index 695d2c0cd39..42f67d6f707 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,14 @@ Note, this feature is support since `1.3.0` version. The plugin uses [Apache Tika](http://lucene.apache.org/tika/) to parse attachments, so many formats are supported, listed [here](http://lucene.apache.org/tika/0.10/formats.html). +Metadata parsing error handling +------------------------------- + +While extracting metadata content, errors could happen for example when parsing dates. +Since version `1.9.0`, parsing errors are ignored so your document is indexed. + +You can disable this feature by setting the `index.mapping.attachment.ignore_errors` setting to `false`. + License ------- diff --git a/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java b/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java index 9692642395f..12f583a935f 100644 --- a/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java +++ b/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java @@ -22,6 +22,8 @@ package org.elasticsearch.index.mapper.attachment; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.elasticsearch.common.io.stream.BytesStreamInput; +import org.elasticsearch.common.logging.ESLogger; +import org.elasticsearch.common.logging.ESLoggerFactory; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.index.mapper.*; @@ -58,6 +60,8 @@ import static org.elasticsearch.plugin.mapper.attachments.tika.TikaInstance.tika */ public class AttachmentMapper implements Mapper { + private static ESLogger logger = ESLoggerFactory.getLogger(AttachmentMapper.class.getName()); + public static final String CONTENT_TYPE = "attachment"; public static class Defaults { @@ -70,6 +74,8 @@ public class AttachmentMapper implements Mapper { private Integer defaultIndexedChars = null; + private Boolean ignoreErrors = null; + private Mapper.Builder contentBuilder; private Mapper.Builder titleBuilder = stringField("title"); @@ -95,11 +101,6 @@ public class AttachmentMapper implements Mapper { return this; } - public Builder defaultIndexedChars(int defaultIndexedChars) { - this.defaultIndexedChars = defaultIndexedChars; - return this; - } - public Builder content(Mapper.Builder content) { this.contentBuilder = content; return this; @@ -155,14 +156,21 @@ public class AttachmentMapper implements Mapper { context.path().pathType(origPathType); - if (defaultIndexedChars != null && context.indexSettings() != null) { + if (defaultIndexedChars == null && context.indexSettings() != null) { defaultIndexedChars = context.indexSettings().getAsInt("index.mapping.attachment.indexed_chars", 100000); } if (defaultIndexedChars == null) { defaultIndexedChars = 100000; } - return new AttachmentMapper(name, pathType, defaultIndexedChars, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper); + if (ignoreErrors == null && context.indexSettings() != null) { + ignoreErrors = context.indexSettings().getAsBoolean("index.mapping.attachment.ignore_errors", Boolean.TRUE); + } + if (ignoreErrors == null) { + ignoreErrors = Boolean.TRUE; + } + + return new AttachmentMapper(name, pathType, defaultIndexedChars, ignoreErrors, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper); } } @@ -239,6 +247,8 @@ public class AttachmentMapper implements Mapper { private final int defaultIndexedChars; + private final boolean ignoreErrors; + private final Mapper contentMapper; private final Mapper dateMapper; @@ -253,12 +263,13 @@ public class AttachmentMapper implements Mapper { private final Mapper contentTypeMapper; - public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Mapper contentMapper, + public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Boolean ignoreErrors, Mapper contentMapper, Mapper dateMapper, Mapper titleMapper, Mapper nameMapper, Mapper authorMapper, Mapper keywordsMapper, Mapper contentTypeMapper) { this.name = name; this.pathType = pathType; this.defaultIndexedChars = defaultIndexedChars; + this.ignoreErrors = ignoreErrors; this.contentMapper = contentMapper; this.dateMapper = dateMapper; this.titleMapper = titleMapper; @@ -330,23 +341,53 @@ public class AttachmentMapper implements Mapper { contentMapper.parse(context); - context.externalValue(name); - nameMapper.parse(context); + try { + context.externalValue(name); + nameMapper.parse(context); + } catch(MapperParsingException e){ + if (!ignoreErrors) throw e; + if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing name: {}", e.getMessage()); + } - context.externalValue(metadata.get(Metadata.DATE)); - dateMapper.parse(context); + try { + context.externalValue(metadata.get(Metadata.DATE)); + dateMapper.parse(context); + } catch(MapperParsingException e){ + if (!ignoreErrors) throw e; + if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing date: {}: {}", e.getMessage(), context.externalValue()); + } - context.externalValue(metadata.get(Metadata.TITLE)); - titleMapper.parse(context); + try { + context.externalValue(metadata.get(Metadata.TITLE)); + titleMapper.parse(context); + } catch(MapperParsingException e){ + if (!ignoreErrors) throw e; + if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing title: {}: {}", e.getMessage(), context.externalValue()); + } - context.externalValue(metadata.get(Metadata.AUTHOR)); - authorMapper.parse(context); + try { + context.externalValue(metadata.get(Metadata.AUTHOR)); + authorMapper.parse(context); + } catch(MapperParsingException e){ + if (!ignoreErrors) throw e; + if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing author: {}: {}", e.getMessage(), context.externalValue()); + } - context.externalValue(metadata.get(Metadata.KEYWORDS)); - keywordsMapper.parse(context); + try { + context.externalValue(metadata.get(Metadata.KEYWORDS)); + keywordsMapper.parse(context); + } catch(MapperParsingException e){ + if (!ignoreErrors) throw e; + if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing keywords: {}: {}", e.getMessage(), context.externalValue()); + } - context.externalValue(metadata.get(Metadata.CONTENT_TYPE)); - contentTypeMapper.parse(context); + try { + context.externalValue(metadata.get(Metadata.CONTENT_TYPE)); + contentTypeMapper.parse(context); + } catch(MapperParsingException e){ + if (!ignoreErrors) throw e; + if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_type: {}: {}", e.getMessage(), context.externalValue()); + } } @Override diff --git a/src/test/java/org/elasticsearch/index/mapper/xcontent/MetadataMapperTest.java b/src/test/java/org/elasticsearch/index/mapper/xcontent/MetadataMapperTest.java new file mode 100644 index 00000000000..ea037ecc220 --- /dev/null +++ b/src/test/java/org/elasticsearch/index/mapper/xcontent/MetadataMapperTest.java @@ -0,0 +1,89 @@ +package org.elasticsearch.index.mapper.xcontent; + +import org.apache.lucene.document.Document; +import org.elasticsearch.common.bytes.BytesReference; +import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.analysis.AnalysisService; +import org.elasticsearch.index.mapper.DocumentMapper; +import org.elasticsearch.index.mapper.DocumentMapperParser; +import org.elasticsearch.index.mapper.MapperParsingException; +import org.elasticsearch.index.mapper.attachment.AttachmentMapper; +import org.testng.annotations.Test; + +import java.io.IOException; + +import static org.elasticsearch.common.io.Streams.copyToBytesFromClasspath; +import static org.elasticsearch.common.io.Streams.copyToStringFromClasspath; +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.*; + +/** + * Test for https://github.com/elasticsearch/elasticsearch-mapper-attachments/issues/38 + */ +public class MetadataMapperTest { + + protected void checkDate(String filename, Settings settings, Long expected) throws IOException { + DocumentMapperParser mapperParser = new DocumentMapperParser(new Index("test"), settings, new AnalysisService(new Index("test")), null, null); + mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser()); + + String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json"); + DocumentMapper docMapper = mapperParser.parse(mapping); + byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/" + filename); + + BytesReference json = jsonBuilder() + .startObject() + .field("_id", 1) + .startObject("file") + .field("_name", "htmlWithoutDateMeta.html") + .field("content", html) + .endObject() + .endObject().bytes(); + + Document doc = docMapper.parse(json).rootDoc(); + assertThat(doc.get(docMapper.mappers().smartName("file").mapper().names().indexName()), containsString("World")); + assertThat(doc.get(docMapper.mappers().smartName("file.name").mapper().names().indexName()), equalTo("htmlWithoutDateMeta.html")); + if (expected == null) { + assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()), nullValue()); + } else { + assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()).numericValue().longValue(), is(expected)); + } + assertThat(doc.get(docMapper.mappers().smartName("file.title").mapper().names().indexName()), equalTo("Hello")); + assertThat(doc.get(docMapper.mappers().smartName("file.author").mapper().names().indexName()), equalTo("kimchy")); + assertThat(doc.get(docMapper.mappers().smartName("file.keywords").mapper().names().indexName()), equalTo("elasticsearch,cool,bonsai")); + assertThat(doc.get(docMapper.mappers().smartName("file.content_type").mapper().names().indexName()), equalTo("text/html; charset=ISO-8859-1")); + } + + @Test + public void testIgnoreWithoutDate() throws Exception { + checkDate("htmlWithoutDateMeta.html", ImmutableSettings.builder().build(), null); + } + + @Test + public void testIgnoreWithEmptyDate() throws Exception { + checkDate("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().build(), null); + } + + @Test + public void testIgnoreWithCorrectDate() throws Exception { + checkDate("htmlWithValidDateMeta.html", ImmutableSettings.builder().build(), 1354233600000L); + } + + @Test + public void testWithoutDate() throws Exception { + checkDate("htmlWithoutDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null); + } + + @Test(expectedExceptions = MapperParsingException.class) + public void testWithEmptyDate() throws Exception { + checkDate("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null); + } + + @Test + public void testWithCorrectDate() throws Exception { + checkDate("htmlWithValidDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), 1354233600000L); + } + +} diff --git a/src/test/resources/org/elasticsearch/index/mapper/xcontent/htmlWithEmptyDateMeta.html b/src/test/resources/org/elasticsearch/index/mapper/xcontent/htmlWithEmptyDateMeta.html new file mode 100644 index 00000000000..f151208e384 --- /dev/null +++ b/src/test/resources/org/elasticsearch/index/mapper/xcontent/htmlWithEmptyDateMeta.html @@ -0,0 +1,11 @@ + + + + Hello + + + + +World + diff --git a/src/test/resources/org/elasticsearch/index/mapper/xcontent/htmlWithValidDateMeta.html b/src/test/resources/org/elasticsearch/index/mapper/xcontent/htmlWithValidDateMeta.html new file mode 100644 index 00000000000..79b5a6234ec --- /dev/null +++ b/src/test/resources/org/elasticsearch/index/mapper/xcontent/htmlWithValidDateMeta.html @@ -0,0 +1,11 @@ + + + + Hello + + + + +World + diff --git a/src/test/resources/org/elasticsearch/index/mapper/xcontent/htmlWithoutDateMeta.html b/src/test/resources/org/elasticsearch/index/mapper/xcontent/htmlWithoutDateMeta.html new file mode 100644 index 00000000000..3322fa3a734 --- /dev/null +++ b/src/test/resources/org/elasticsearch/index/mapper/xcontent/htmlWithoutDateMeta.html @@ -0,0 +1,10 @@ + + + + Hello + + + +World + diff --git a/src/test/resources/org/elasticsearch/index/mapper/xcontent/test-mapping.json b/src/test/resources/org/elasticsearch/index/mapper/xcontent/test-mapping.json index 4f2a9288a7b..c8680cf0644 100644 --- a/src/test/resources/org/elasticsearch/index/mapper/xcontent/test-mapping.json +++ b/src/test/resources/org/elasticsearch/index/mapper/xcontent/test-mapping.json @@ -1,9 +1,9 @@ { - person:{ - properties:{ + "person":{ + "properties":{ "file":{ - type:"attachment" + "type":"attachment" } } } -} \ No newline at end of file +}