Don't reject full document in case of invalid metadata

From original PR #17 from @fcamblor

If you try to index a document that contains invalid metadata, the whole document is rejected.

For example:

```html
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html lang="fr">
<head>
<title>Hello</title>
<meta name="date" content="">
<meta name="Author" content="kimchy">
<meta name="Keywords" content="elasticsearch,cool,bonsai">
</head>
<body>World</body>
</html>
```

has a non-parseable date (the `content` attribute of the `date` meta tag is empty).

This fix adds a new option that ignores parsing errors: `"index.mapping.attachment.ignore_errors":true` (defaults to `true`).

Closes #17, #38.
This commit is contained in:
Frédéric Camblor 2012-11-30 20:25:31 +01:00 committed by David Pilato
parent d7a2e7e2ff
commit 019d0f9a26
7 changed files with 194 additions and 24 deletions

View File

@ -90,6 +90,14 @@ Note, this feature is support since `1.3.0` version.
The plugin uses [Apache Tika](http://lucene.apache.org/tika/) to parse attachments, so many formats are supported, listed [here](http://lucene.apache.org/tika/0.10/formats.html). The plugin uses [Apache Tika](http://lucene.apache.org/tika/) to parse attachments, so many formats are supported, listed [here](http://lucene.apache.org/tika/0.10/formats.html).
Metadata parsing error handling
-------------------------------
While extracting metadata content, errors can occur, for example when parsing dates.
Since version `1.9.0`, parsing errors are ignored so your document is indexed.
You can disable this feature by setting the `index.mapping.attachment.ignore_errors` setting to `false`.
License License
------- -------

View File

@ -22,6 +22,8 @@ package org.elasticsearch.index.mapper.attachment;
import org.apache.tika.exception.TikaException; import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Metadata;
import org.elasticsearch.common.io.stream.BytesStreamInput; import org.elasticsearch.common.io.stream.BytesStreamInput;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.ESLoggerFactory;
import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.mapper.*; import org.elasticsearch.index.mapper.*;
@ -58,6 +60,8 @@ import static org.elasticsearch.plugin.mapper.attachments.tika.TikaInstance.tika
*/ */
public class AttachmentMapper implements Mapper { public class AttachmentMapper implements Mapper {
private static ESLogger logger = ESLoggerFactory.getLogger(AttachmentMapper.class.getName());
public static final String CONTENT_TYPE = "attachment"; public static final String CONTENT_TYPE = "attachment";
public static class Defaults { public static class Defaults {
@ -70,6 +74,8 @@ public class AttachmentMapper implements Mapper {
private Integer defaultIndexedChars = null; private Integer defaultIndexedChars = null;
private Boolean ignoreErrors = null;
private Mapper.Builder contentBuilder; private Mapper.Builder contentBuilder;
private Mapper.Builder titleBuilder = stringField("title"); private Mapper.Builder titleBuilder = stringField("title");
@ -95,11 +101,6 @@ public class AttachmentMapper implements Mapper {
return this; return this;
} }
public Builder defaultIndexedChars(int defaultIndexedChars) {
this.defaultIndexedChars = defaultIndexedChars;
return this;
}
public Builder content(Mapper.Builder content) { public Builder content(Mapper.Builder content) {
this.contentBuilder = content; this.contentBuilder = content;
return this; return this;
@ -155,14 +156,21 @@ public class AttachmentMapper implements Mapper {
context.path().pathType(origPathType); context.path().pathType(origPathType);
if (defaultIndexedChars != null && context.indexSettings() != null) { if (defaultIndexedChars == null && context.indexSettings() != null) {
defaultIndexedChars = context.indexSettings().getAsInt("index.mapping.attachment.indexed_chars", 100000); defaultIndexedChars = context.indexSettings().getAsInt("index.mapping.attachment.indexed_chars", 100000);
} }
if (defaultIndexedChars == null) { if (defaultIndexedChars == null) {
defaultIndexedChars = 100000; defaultIndexedChars = 100000;
} }
return new AttachmentMapper(name, pathType, defaultIndexedChars, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper); if (ignoreErrors == null && context.indexSettings() != null) {
ignoreErrors = context.indexSettings().getAsBoolean("index.mapping.attachment.ignore_errors", Boolean.TRUE);
}
if (ignoreErrors == null) {
ignoreErrors = Boolean.TRUE;
}
return new AttachmentMapper(name, pathType, defaultIndexedChars, ignoreErrors, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper);
} }
} }
@ -239,6 +247,8 @@ public class AttachmentMapper implements Mapper {
private final int defaultIndexedChars; private final int defaultIndexedChars;
private final boolean ignoreErrors;
private final Mapper contentMapper; private final Mapper contentMapper;
private final Mapper dateMapper; private final Mapper dateMapper;
@ -253,12 +263,13 @@ public class AttachmentMapper implements Mapper {
private final Mapper contentTypeMapper; private final Mapper contentTypeMapper;
public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Mapper contentMapper, public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Boolean ignoreErrors, Mapper contentMapper,
Mapper dateMapper, Mapper titleMapper, Mapper nameMapper, Mapper authorMapper, Mapper dateMapper, Mapper titleMapper, Mapper nameMapper, Mapper authorMapper,
Mapper keywordsMapper, Mapper contentTypeMapper) { Mapper keywordsMapper, Mapper contentTypeMapper) {
this.name = name; this.name = name;
this.pathType = pathType; this.pathType = pathType;
this.defaultIndexedChars = defaultIndexedChars; this.defaultIndexedChars = defaultIndexedChars;
this.ignoreErrors = ignoreErrors;
this.contentMapper = contentMapper; this.contentMapper = contentMapper;
this.dateMapper = dateMapper; this.dateMapper = dateMapper;
this.titleMapper = titleMapper; this.titleMapper = titleMapper;
@ -330,23 +341,53 @@ public class AttachmentMapper implements Mapper {
contentMapper.parse(context); contentMapper.parse(context);
context.externalValue(name); try {
nameMapper.parse(context); context.externalValue(name);
nameMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing name: {}", e.getMessage());
}
context.externalValue(metadata.get(Metadata.DATE)); try {
dateMapper.parse(context); context.externalValue(metadata.get(Metadata.DATE));
dateMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing date: {}: {}", e.getMessage(), context.externalValue());
}
context.externalValue(metadata.get(Metadata.TITLE)); try {
titleMapper.parse(context); context.externalValue(metadata.get(Metadata.TITLE));
titleMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing title: {}: {}", e.getMessage(), context.externalValue());
}
context.externalValue(metadata.get(Metadata.AUTHOR)); try {
authorMapper.parse(context); context.externalValue(metadata.get(Metadata.AUTHOR));
authorMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing author: {}: {}", e.getMessage(), context.externalValue());
}
context.externalValue(metadata.get(Metadata.KEYWORDS)); try {
keywordsMapper.parse(context); context.externalValue(metadata.get(Metadata.KEYWORDS));
keywordsMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing keywords: {}: {}", e.getMessage(), context.externalValue());
}
context.externalValue(metadata.get(Metadata.CONTENT_TYPE)); try {
contentTypeMapper.parse(context); context.externalValue(metadata.get(Metadata.CONTENT_TYPE));
contentTypeMapper.parse(context);
} catch(MapperParsingException e){
if (!ignoreErrors) throw e;
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_type: {}: {}", e.getMessage(), context.externalValue());
}
} }
@Override @Override

View File

@ -0,0 +1,89 @@
package org.elasticsearch.index.mapper.xcontent;
import org.apache.lucene.document.Document;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.analysis.AnalysisService;
import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.DocumentMapperParser;
import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.attachment.AttachmentMapper;
import org.testng.annotations.Test;
import java.io.IOException;
import static org.elasticsearch.common.io.Streams.copyToBytesFromClasspath;
import static org.elasticsearch.common.io.Streams.copyToStringFromClasspath;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.*;
/**
 * Test for https://github.com/elasticsearch/elasticsearch-mapper-attachments/issues/38
 *
 * <p>Indexes small HTML fixtures whose {@code date} meta tag is missing, empty or valid,
 * and checks how the attachment mapper behaves with the
 * {@code index.mapping.attachment.ignore_errors} setting left unset or explicitly set to
 * {@code false}.
 */
public class MetadataMapperTest {

    /** Classpath directory that holds the mapping and the HTML fixtures used by this test. */
    private static final String RESOURCE_DIR = "/org/elasticsearch/index/mapper/xcontent/";

    /**
     * Parses the given HTML fixture through an attachment mapping and verifies the extracted fields.
     *
     * @param filename name of the HTML fixture, relative to {@link #RESOURCE_DIR}; it is also sent
     *                 as the attachment {@code _name} so the indexed name matches the file under test
     * @param settings index settings handed to the {@link DocumentMapperParser}
     * @param expected expected value of the {@code file.date} field, or {@code null} when no date
     *                 field is expected in the resulting document
     * @throws IOException if a classpath resource cannot be read or the JSON source cannot be built
     */
    protected void checkDate(String filename, Settings settings, Long expected) throws IOException {
        DocumentMapperParser mapperParser = new DocumentMapperParser(
                new Index("test"), settings, new AnalysisService(new Index("test")), null, null);
        mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser());

        String mapping = copyToStringFromClasspath(RESOURCE_DIR + "test-mapping.json");
        DocumentMapper docMapper = mapperParser.parse(mapping);
        byte[] html = copyToBytesFromClasspath(RESOURCE_DIR + filename);

        // Use the real fixture name rather than a fixed one, so the name field reflects
        // the document that is actually being indexed.
        BytesReference json = jsonBuilder()
                .startObject()
                    .field("_id", 1)
                    .startObject("file")
                        .field("_name", filename)
                        .field("content", html)
                    .endObject()
                .endObject().bytes();

        Document doc = docMapper.parse(json).rootDoc();

        assertThat(doc.get(indexName(docMapper, "file")), containsString("World"));
        assertThat(doc.get(indexName(docMapper, "file.name")), equalTo(filename));
        if (expected == null) {
            assertThat(doc.getField(indexName(docMapper, "file.date")), nullValue());
        } else {
            assertThat(doc.getField(indexName(docMapper, "file.date")).numericValue().longValue(), is(expected));
        }
        assertThat(doc.get(indexName(docMapper, "file.title")), equalTo("Hello"));
        assertThat(doc.get(indexName(docMapper, "file.author")), equalTo("kimchy"));
        assertThat(doc.get(indexName(docMapper, "file.keywords")), equalTo("elasticsearch,cool,bonsai"));
        assertThat(doc.get(indexName(docMapper, "file.content_type")), equalTo("text/html; charset=ISO-8859-1"));
    }

    /** Resolves the index name of the mapper registered under the given smart name. */
    private static String indexName(DocumentMapper docMapper, String smartName) {
        return docMapper.mappers().smartName(smartName).mapper().names().indexName();
    }

    /** No settings given, no date meta tag: the document is indexed without a date field. */
    @Test
    public void testIgnoreWithoutDate() throws Exception {
        checkDate("htmlWithoutDateMeta.html", ImmutableSettings.builder().build(), null);
    }

    /** No settings given, empty date meta tag: the document is indexed without a date field. */
    @Test
    public void testIgnoreWithEmptyDate() throws Exception {
        checkDate("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().build(), null);
    }

    /** No settings given, valid date meta tag: the date field carries the expected numeric value. */
    @Test
    public void testIgnoreWithCorrectDate() throws Exception {
        checkDate("htmlWithValidDateMeta.html", ImmutableSettings.builder().build(), 1354233600000L);
    }

    /** {@code ignore_errors=false}, no date meta tag: the document is indexed without a date field. */
    @Test
    public void testWithoutDate() throws Exception {
        checkDate("htmlWithoutDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null);
    }

    /** {@code ignore_errors=false}, empty date meta tag: parsing must fail with a {@link MapperParsingException}. */
    @Test(expectedExceptions = MapperParsingException.class)
    public void testWithEmptyDate() throws Exception {
        checkDate("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null);
    }

    /** {@code ignore_errors=false}, valid date meta tag: the date field carries the expected numeric value. */
    @Test
    public void testWithCorrectDate() throws Exception {
        checkDate("htmlWithValidDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), 1354233600000L);
    }
}

View File

@ -0,0 +1,11 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html lang="fr">
<head>
<title>Hello</title>
<meta name="date" content="">
<meta name="Author" content="kimchy">
<meta name="Keywords" content="elasticsearch,cool,bonsai">
</head>
<body>World</body>
</html>

View File

@ -0,0 +1,11 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html lang="fr">
<head>
<title>Hello</title>
<meta name="date" content="2012-11-30">
<meta name="Author" content="kimchy">
<meta name="Keywords" content="elasticsearch,cool,bonsai">
</head>
<body>World</body>
</html>

View File

@ -0,0 +1,10 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html lang="fr">
<head>
<title>Hello</title>
<meta name="Author" content="kimchy">
<meta name="Keywords" content="elasticsearch,cool,bonsai">
</head>
<body>World</body>
</html>

View File

@ -1,8 +1,8 @@
{ {
person:{ "person":{
properties:{ "properties":{
"file":{ "file":{
type:"attachment" "type":"attachment"
} }
} }
} }