Don't reject full document in case of invalid metadata
From original PR #17 from @fcamblor If you try to index a document with an invalid metadata, the full document is rejected. For example: ```html <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html lang="fr"> <head> <title>Hello</title> <meta name="date" content=""> <meta name="Author" content="kimchy"> <meta name="Keywords" content="elasticsearch,cool,bonsai"> </head> <body>World</body> </html> ``` has a non parseable date. This fix add a new option that ignore parsing errors `"index.mapping.attachment.ignore_errors":true` (default to `true`). Closes #17, #38.
This commit is contained in:
parent
d7a2e7e2ff
commit
019d0f9a26
|
@ -90,6 +90,14 @@ Note, this feature is support since `1.3.0` version.
|
||||||
|
|
||||||
The plugin uses [Apache Tika](http://lucene.apache.org/tika/) to parse attachments, so many formats are supported, listed [here](http://lucene.apache.org/tika/0.10/formats.html).
|
The plugin uses [Apache Tika](http://lucene.apache.org/tika/) to parse attachments, so many formats are supported, listed [here](http://lucene.apache.org/tika/0.10/formats.html).
|
||||||
|
|
||||||
|
Metadata parsing error handling
|
||||||
|
-------------------------------
|
||||||
|
|
||||||
|
While extracting metadata content, errors could happen for example when parsing dates.
|
||||||
|
Since version `1.9.0`, parsing errors are ignored so your document is indexed.
|
||||||
|
|
||||||
|
You can disable this feature by setting the `index.mapping.attachment.ignore_errors` setting to `false`.
|
||||||
|
|
||||||
License
|
License
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
|
|
@ -22,6 +22,8 @@ package org.elasticsearch.index.mapper.attachment;
|
||||||
import org.apache.tika.exception.TikaException;
|
import org.apache.tika.exception.TikaException;
|
||||||
import org.apache.tika.metadata.Metadata;
|
import org.apache.tika.metadata.Metadata;
|
||||||
import org.elasticsearch.common.io.stream.BytesStreamInput;
|
import org.elasticsearch.common.io.stream.BytesStreamInput;
|
||||||
|
import org.elasticsearch.common.logging.ESLogger;
|
||||||
|
import org.elasticsearch.common.logging.ESLoggerFactory;
|
||||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||||
import org.elasticsearch.common.xcontent.XContentParser;
|
import org.elasticsearch.common.xcontent.XContentParser;
|
||||||
import org.elasticsearch.index.mapper.*;
|
import org.elasticsearch.index.mapper.*;
|
||||||
|
@ -58,6 +60,8 @@ import static org.elasticsearch.plugin.mapper.attachments.tika.TikaInstance.tika
|
||||||
*/
|
*/
|
||||||
public class AttachmentMapper implements Mapper {
|
public class AttachmentMapper implements Mapper {
|
||||||
|
|
||||||
|
private static ESLogger logger = ESLoggerFactory.getLogger(AttachmentMapper.class.getName());
|
||||||
|
|
||||||
public static final String CONTENT_TYPE = "attachment";
|
public static final String CONTENT_TYPE = "attachment";
|
||||||
|
|
||||||
public static class Defaults {
|
public static class Defaults {
|
||||||
|
@ -70,6 +74,8 @@ public class AttachmentMapper implements Mapper {
|
||||||
|
|
||||||
private Integer defaultIndexedChars = null;
|
private Integer defaultIndexedChars = null;
|
||||||
|
|
||||||
|
private Boolean ignoreErrors = null;
|
||||||
|
|
||||||
private Mapper.Builder contentBuilder;
|
private Mapper.Builder contentBuilder;
|
||||||
|
|
||||||
private Mapper.Builder titleBuilder = stringField("title");
|
private Mapper.Builder titleBuilder = stringField("title");
|
||||||
|
@ -95,11 +101,6 @@ public class AttachmentMapper implements Mapper {
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Builder defaultIndexedChars(int defaultIndexedChars) {
|
|
||||||
this.defaultIndexedChars = defaultIndexedChars;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Builder content(Mapper.Builder content) {
|
public Builder content(Mapper.Builder content) {
|
||||||
this.contentBuilder = content;
|
this.contentBuilder = content;
|
||||||
return this;
|
return this;
|
||||||
|
@ -155,14 +156,21 @@ public class AttachmentMapper implements Mapper {
|
||||||
|
|
||||||
context.path().pathType(origPathType);
|
context.path().pathType(origPathType);
|
||||||
|
|
||||||
if (defaultIndexedChars != null && context.indexSettings() != null) {
|
if (defaultIndexedChars == null && context.indexSettings() != null) {
|
||||||
defaultIndexedChars = context.indexSettings().getAsInt("index.mapping.attachment.indexed_chars", 100000);
|
defaultIndexedChars = context.indexSettings().getAsInt("index.mapping.attachment.indexed_chars", 100000);
|
||||||
}
|
}
|
||||||
if (defaultIndexedChars == null) {
|
if (defaultIndexedChars == null) {
|
||||||
defaultIndexedChars = 100000;
|
defaultIndexedChars = 100000;
|
||||||
}
|
}
|
||||||
|
|
||||||
return new AttachmentMapper(name, pathType, defaultIndexedChars, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper);
|
if (ignoreErrors == null && context.indexSettings() != null) {
|
||||||
|
ignoreErrors = context.indexSettings().getAsBoolean("index.mapping.attachment.ignore_errors", Boolean.TRUE);
|
||||||
|
}
|
||||||
|
if (ignoreErrors == null) {
|
||||||
|
ignoreErrors = Boolean.TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
return new AttachmentMapper(name, pathType, defaultIndexedChars, ignoreErrors, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -239,6 +247,8 @@ public class AttachmentMapper implements Mapper {
|
||||||
|
|
||||||
private final int defaultIndexedChars;
|
private final int defaultIndexedChars;
|
||||||
|
|
||||||
|
private final boolean ignoreErrors;
|
||||||
|
|
||||||
private final Mapper contentMapper;
|
private final Mapper contentMapper;
|
||||||
|
|
||||||
private final Mapper dateMapper;
|
private final Mapper dateMapper;
|
||||||
|
@ -253,12 +263,13 @@ public class AttachmentMapper implements Mapper {
|
||||||
|
|
||||||
private final Mapper contentTypeMapper;
|
private final Mapper contentTypeMapper;
|
||||||
|
|
||||||
public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Mapper contentMapper,
|
public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Boolean ignoreErrors, Mapper contentMapper,
|
||||||
Mapper dateMapper, Mapper titleMapper, Mapper nameMapper, Mapper authorMapper,
|
Mapper dateMapper, Mapper titleMapper, Mapper nameMapper, Mapper authorMapper,
|
||||||
Mapper keywordsMapper, Mapper contentTypeMapper) {
|
Mapper keywordsMapper, Mapper contentTypeMapper) {
|
||||||
this.name = name;
|
this.name = name;
|
||||||
this.pathType = pathType;
|
this.pathType = pathType;
|
||||||
this.defaultIndexedChars = defaultIndexedChars;
|
this.defaultIndexedChars = defaultIndexedChars;
|
||||||
|
this.ignoreErrors = ignoreErrors;
|
||||||
this.contentMapper = contentMapper;
|
this.contentMapper = contentMapper;
|
||||||
this.dateMapper = dateMapper;
|
this.dateMapper = dateMapper;
|
||||||
this.titleMapper = titleMapper;
|
this.titleMapper = titleMapper;
|
||||||
|
@ -330,23 +341,53 @@ public class AttachmentMapper implements Mapper {
|
||||||
contentMapper.parse(context);
|
contentMapper.parse(context);
|
||||||
|
|
||||||
|
|
||||||
context.externalValue(name);
|
try {
|
||||||
nameMapper.parse(context);
|
context.externalValue(name);
|
||||||
|
nameMapper.parse(context);
|
||||||
|
} catch(MapperParsingException e){
|
||||||
|
if (!ignoreErrors) throw e;
|
||||||
|
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing name: {}", e.getMessage());
|
||||||
|
}
|
||||||
|
|
||||||
context.externalValue(metadata.get(Metadata.DATE));
|
try {
|
||||||
dateMapper.parse(context);
|
context.externalValue(metadata.get(Metadata.DATE));
|
||||||
|
dateMapper.parse(context);
|
||||||
|
} catch(MapperParsingException e){
|
||||||
|
if (!ignoreErrors) throw e;
|
||||||
|
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing date: {}: {}", e.getMessage(), context.externalValue());
|
||||||
|
}
|
||||||
|
|
||||||
context.externalValue(metadata.get(Metadata.TITLE));
|
try {
|
||||||
titleMapper.parse(context);
|
context.externalValue(metadata.get(Metadata.TITLE));
|
||||||
|
titleMapper.parse(context);
|
||||||
|
} catch(MapperParsingException e){
|
||||||
|
if (!ignoreErrors) throw e;
|
||||||
|
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing title: {}: {}", e.getMessage(), context.externalValue());
|
||||||
|
}
|
||||||
|
|
||||||
context.externalValue(metadata.get(Metadata.AUTHOR));
|
try {
|
||||||
authorMapper.parse(context);
|
context.externalValue(metadata.get(Metadata.AUTHOR));
|
||||||
|
authorMapper.parse(context);
|
||||||
|
} catch(MapperParsingException e){
|
||||||
|
if (!ignoreErrors) throw e;
|
||||||
|
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing author: {}: {}", e.getMessage(), context.externalValue());
|
||||||
|
}
|
||||||
|
|
||||||
context.externalValue(metadata.get(Metadata.KEYWORDS));
|
try {
|
||||||
keywordsMapper.parse(context);
|
context.externalValue(metadata.get(Metadata.KEYWORDS));
|
||||||
|
keywordsMapper.parse(context);
|
||||||
|
} catch(MapperParsingException e){
|
||||||
|
if (!ignoreErrors) throw e;
|
||||||
|
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing keywords: {}: {}", e.getMessage(), context.externalValue());
|
||||||
|
}
|
||||||
|
|
||||||
context.externalValue(metadata.get(Metadata.CONTENT_TYPE));
|
try {
|
||||||
contentTypeMapper.parse(context);
|
context.externalValue(metadata.get(Metadata.CONTENT_TYPE));
|
||||||
|
contentTypeMapper.parse(context);
|
||||||
|
} catch(MapperParsingException e){
|
||||||
|
if (!ignoreErrors) throw e;
|
||||||
|
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_type: {}: {}", e.getMessage(), context.externalValue());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -0,0 +1,89 @@
|
||||||
|
package org.elasticsearch.index.mapper.xcontent;
|
||||||
|
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.elasticsearch.common.bytes.BytesReference;
|
||||||
|
import org.elasticsearch.common.settings.ImmutableSettings;
|
||||||
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.index.Index;
|
||||||
|
import org.elasticsearch.index.analysis.AnalysisService;
|
||||||
|
import org.elasticsearch.index.mapper.DocumentMapper;
|
||||||
|
import org.elasticsearch.index.mapper.DocumentMapperParser;
|
||||||
|
import org.elasticsearch.index.mapper.MapperParsingException;
|
||||||
|
import org.elasticsearch.index.mapper.attachment.AttachmentMapper;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import static org.elasticsearch.common.io.Streams.copyToBytesFromClasspath;
|
||||||
|
import static org.elasticsearch.common.io.Streams.copyToStringFromClasspath;
|
||||||
|
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
|
||||||
|
import static org.hamcrest.MatcherAssert.assertThat;
|
||||||
|
import static org.hamcrest.Matchers.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test for https://github.com/elasticsearch/elasticsearch-mapper-attachments/issues/38
|
||||||
|
*/
|
||||||
|
public class MetadataMapperTest {
|
||||||
|
|
||||||
|
protected void checkDate(String filename, Settings settings, Long expected) throws IOException {
|
||||||
|
DocumentMapperParser mapperParser = new DocumentMapperParser(new Index("test"), settings, new AnalysisService(new Index("test")), null, null);
|
||||||
|
mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser());
|
||||||
|
|
||||||
|
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json");
|
||||||
|
DocumentMapper docMapper = mapperParser.parse(mapping);
|
||||||
|
byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/" + filename);
|
||||||
|
|
||||||
|
BytesReference json = jsonBuilder()
|
||||||
|
.startObject()
|
||||||
|
.field("_id", 1)
|
||||||
|
.startObject("file")
|
||||||
|
.field("_name", "htmlWithoutDateMeta.html")
|
||||||
|
.field("content", html)
|
||||||
|
.endObject()
|
||||||
|
.endObject().bytes();
|
||||||
|
|
||||||
|
Document doc = docMapper.parse(json).rootDoc();
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file").mapper().names().indexName()), containsString("World"));
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file.name").mapper().names().indexName()), equalTo("htmlWithoutDateMeta.html"));
|
||||||
|
if (expected == null) {
|
||||||
|
assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()), nullValue());
|
||||||
|
} else {
|
||||||
|
assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()).numericValue().longValue(), is(expected));
|
||||||
|
}
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file.title").mapper().names().indexName()), equalTo("Hello"));
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file.author").mapper().names().indexName()), equalTo("kimchy"));
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file.keywords").mapper().names().indexName()), equalTo("elasticsearch,cool,bonsai"));
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file.content_type").mapper().names().indexName()), equalTo("text/html; charset=ISO-8859-1"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testIgnoreWithoutDate() throws Exception {
|
||||||
|
checkDate("htmlWithoutDateMeta.html", ImmutableSettings.builder().build(), null);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testIgnoreWithEmptyDate() throws Exception {
|
||||||
|
checkDate("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().build(), null);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testIgnoreWithCorrectDate() throws Exception {
|
||||||
|
checkDate("htmlWithValidDateMeta.html", ImmutableSettings.builder().build(), 1354233600000L);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testWithoutDate() throws Exception {
|
||||||
|
checkDate("htmlWithoutDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(expectedExceptions = MapperParsingException.class)
|
||||||
|
public void testWithEmptyDate() throws Exception {
|
||||||
|
checkDate("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testWithCorrectDate() throws Exception {
|
||||||
|
checkDate("htmlWithValidDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), 1354233600000L);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,11 @@
|
||||||
|
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||||
|
"http://www.w3.org/TR/html4/loose.dtd">
|
||||||
|
<html lang="fr">
|
||||||
|
<head>
|
||||||
|
<title>Hello</title>
|
||||||
|
<meta name="date" content="">
|
||||||
|
<meta name="Author" content="kimchy">
|
||||||
|
<meta name="Keywords" content="elasticsearch,cool,bonsai">
|
||||||
|
</head>
|
||||||
|
<body>World</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,11 @@
|
||||||
|
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||||
|
"http://www.w3.org/TR/html4/loose.dtd">
|
||||||
|
<html lang="fr">
|
||||||
|
<head>
|
||||||
|
<title>Hello</title>
|
||||||
|
<meta name="date" content="2012-11-30">
|
||||||
|
<meta name="Author" content="kimchy">
|
||||||
|
<meta name="Keywords" content="elasticsearch,cool,bonsai">
|
||||||
|
</head>
|
||||||
|
<body>World</body>
|
||||||
|
</html>
|
|
@ -0,0 +1,10 @@
|
||||||
|
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||||
|
"http://www.w3.org/TR/html4/loose.dtd">
|
||||||
|
<html lang="fr">
|
||||||
|
<head>
|
||||||
|
<title>Hello</title>
|
||||||
|
<meta name="Author" content="kimchy">
|
||||||
|
<meta name="Keywords" content="elasticsearch,cool,bonsai">
|
||||||
|
</head>
|
||||||
|
<body>World</body>
|
||||||
|
</html>
|
|
@ -1,8 +1,8 @@
|
||||||
{
|
{
|
||||||
person:{
|
"person":{
|
||||||
properties:{
|
"properties":{
|
||||||
"file":{
|
"file":{
|
||||||
type:"attachment"
|
"type":"attachment"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue