Don't reject full document in case of invalid metadata
From original PR #17 from @fcamblor If you try to index a document with an invalid metadata, the full document is rejected. For example: ```html <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html lang="fr"> <head> <title>Hello</title> <meta name="date" content=""> <meta name="Author" content="kimchy"> <meta name="Keywords" content="elasticsearch,cool,bonsai"> </head> <body>World</body> </html> ``` has a non parseable date. This fix add a new option that ignore parsing errors `"index.mapping.attachment.ignore_errors":true` (default to `true`). Closes #17, #38.
This commit is contained in:
parent
d7a2e7e2ff
commit
019d0f9a26
|
@ -90,6 +90,14 @@ Note, this feature is support since `1.3.0` version.
|
|||
|
||||
The plugin uses [Apache Tika](http://lucene.apache.org/tika/) to parse attachments, so many formats are supported, listed [here](http://lucene.apache.org/tika/0.10/formats.html).
|
||||
|
||||
Metadata parsing error handling
|
||||
-------------------------------
|
||||
|
||||
While extracting metadata content, errors could happen for example when parsing dates.
|
||||
Since version `1.9.0`, parsing errors are ignored so your document is indexed.
|
||||
|
||||
You can disable this feature by setting the `index.mapping.attachment.ignore_errors` setting to `false`.
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
|
|
|
@ -22,6 +22,8 @@ package org.elasticsearch.index.mapper.attachment;
|
|||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.elasticsearch.common.io.stream.BytesStreamInput;
|
||||
import org.elasticsearch.common.logging.ESLogger;
|
||||
import org.elasticsearch.common.logging.ESLoggerFactory;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.common.xcontent.XContentParser;
|
||||
import org.elasticsearch.index.mapper.*;
|
||||
|
@ -58,6 +60,8 @@ import static org.elasticsearch.plugin.mapper.attachments.tika.TikaInstance.tika
|
|||
*/
|
||||
public class AttachmentMapper implements Mapper {
|
||||
|
||||
private static ESLogger logger = ESLoggerFactory.getLogger(AttachmentMapper.class.getName());
|
||||
|
||||
public static final String CONTENT_TYPE = "attachment";
|
||||
|
||||
public static class Defaults {
|
||||
|
@ -70,6 +74,8 @@ public class AttachmentMapper implements Mapper {
|
|||
|
||||
private Integer defaultIndexedChars = null;
|
||||
|
||||
private Boolean ignoreErrors = null;
|
||||
|
||||
private Mapper.Builder contentBuilder;
|
||||
|
||||
private Mapper.Builder titleBuilder = stringField("title");
|
||||
|
@ -95,11 +101,6 @@ public class AttachmentMapper implements Mapper {
|
|||
return this;
|
||||
}
|
||||
|
||||
public Builder defaultIndexedChars(int defaultIndexedChars) {
|
||||
this.defaultIndexedChars = defaultIndexedChars;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder content(Mapper.Builder content) {
|
||||
this.contentBuilder = content;
|
||||
return this;
|
||||
|
@ -155,14 +156,21 @@ public class AttachmentMapper implements Mapper {
|
|||
|
||||
context.path().pathType(origPathType);
|
||||
|
||||
if (defaultIndexedChars != null && context.indexSettings() != null) {
|
||||
if (defaultIndexedChars == null && context.indexSettings() != null) {
|
||||
defaultIndexedChars = context.indexSettings().getAsInt("index.mapping.attachment.indexed_chars", 100000);
|
||||
}
|
||||
if (defaultIndexedChars == null) {
|
||||
defaultIndexedChars = 100000;
|
||||
}
|
||||
|
||||
return new AttachmentMapper(name, pathType, defaultIndexedChars, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper);
|
||||
if (ignoreErrors == null && context.indexSettings() != null) {
|
||||
ignoreErrors = context.indexSettings().getAsBoolean("index.mapping.attachment.ignore_errors", Boolean.TRUE);
|
||||
}
|
||||
if (ignoreErrors == null) {
|
||||
ignoreErrors = Boolean.TRUE;
|
||||
}
|
||||
|
||||
return new AttachmentMapper(name, pathType, defaultIndexedChars, ignoreErrors, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -239,6 +247,8 @@ public class AttachmentMapper implements Mapper {
|
|||
|
||||
private final int defaultIndexedChars;
|
||||
|
||||
private final boolean ignoreErrors;
|
||||
|
||||
private final Mapper contentMapper;
|
||||
|
||||
private final Mapper dateMapper;
|
||||
|
@ -253,12 +263,13 @@ public class AttachmentMapper implements Mapper {
|
|||
|
||||
private final Mapper contentTypeMapper;
|
||||
|
||||
public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Mapper contentMapper,
|
||||
public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Boolean ignoreErrors, Mapper contentMapper,
|
||||
Mapper dateMapper, Mapper titleMapper, Mapper nameMapper, Mapper authorMapper,
|
||||
Mapper keywordsMapper, Mapper contentTypeMapper) {
|
||||
this.name = name;
|
||||
this.pathType = pathType;
|
||||
this.defaultIndexedChars = defaultIndexedChars;
|
||||
this.ignoreErrors = ignoreErrors;
|
||||
this.contentMapper = contentMapper;
|
||||
this.dateMapper = dateMapper;
|
||||
this.titleMapper = titleMapper;
|
||||
|
@ -330,23 +341,53 @@ public class AttachmentMapper implements Mapper {
|
|||
contentMapper.parse(context);
|
||||
|
||||
|
||||
context.externalValue(name);
|
||||
nameMapper.parse(context);
|
||||
try {
|
||||
context.externalValue(name);
|
||||
nameMapper.parse(context);
|
||||
} catch(MapperParsingException e){
|
||||
if (!ignoreErrors) throw e;
|
||||
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing name: {}", e.getMessage());
|
||||
}
|
||||
|
||||
context.externalValue(metadata.get(Metadata.DATE));
|
||||
dateMapper.parse(context);
|
||||
try {
|
||||
context.externalValue(metadata.get(Metadata.DATE));
|
||||
dateMapper.parse(context);
|
||||
} catch(MapperParsingException e){
|
||||
if (!ignoreErrors) throw e;
|
||||
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing date: {}: {}", e.getMessage(), context.externalValue());
|
||||
}
|
||||
|
||||
context.externalValue(metadata.get(Metadata.TITLE));
|
||||
titleMapper.parse(context);
|
||||
try {
|
||||
context.externalValue(metadata.get(Metadata.TITLE));
|
||||
titleMapper.parse(context);
|
||||
} catch(MapperParsingException e){
|
||||
if (!ignoreErrors) throw e;
|
||||
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing title: {}: {}", e.getMessage(), context.externalValue());
|
||||
}
|
||||
|
||||
context.externalValue(metadata.get(Metadata.AUTHOR));
|
||||
authorMapper.parse(context);
|
||||
try {
|
||||
context.externalValue(metadata.get(Metadata.AUTHOR));
|
||||
authorMapper.parse(context);
|
||||
} catch(MapperParsingException e){
|
||||
if (!ignoreErrors) throw e;
|
||||
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing author: {}: {}", e.getMessage(), context.externalValue());
|
||||
}
|
||||
|
||||
context.externalValue(metadata.get(Metadata.KEYWORDS));
|
||||
keywordsMapper.parse(context);
|
||||
try {
|
||||
context.externalValue(metadata.get(Metadata.KEYWORDS));
|
||||
keywordsMapper.parse(context);
|
||||
} catch(MapperParsingException e){
|
||||
if (!ignoreErrors) throw e;
|
||||
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing keywords: {}: {}", e.getMessage(), context.externalValue());
|
||||
}
|
||||
|
||||
context.externalValue(metadata.get(Metadata.CONTENT_TYPE));
|
||||
contentTypeMapper.parse(context);
|
||||
try {
|
||||
context.externalValue(metadata.get(Metadata.CONTENT_TYPE));
|
||||
contentTypeMapper.parse(context);
|
||||
} catch(MapperParsingException e){
|
||||
if (!ignoreErrors) throw e;
|
||||
if (logger.isDebugEnabled()) logger.debug("Ignoring MapperParsingException catch while parsing content_type: {}: {}", e.getMessage(), context.externalValue());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -0,0 +1,89 @@
|
|||
package org.elasticsearch.index.mapper.xcontent;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.elasticsearch.common.bytes.BytesReference;
|
||||
import org.elasticsearch.common.settings.ImmutableSettings;
|
||||
import org.elasticsearch.common.settings.Settings;
|
||||
import org.elasticsearch.index.Index;
|
||||
import org.elasticsearch.index.analysis.AnalysisService;
|
||||
import org.elasticsearch.index.mapper.DocumentMapper;
|
||||
import org.elasticsearch.index.mapper.DocumentMapperParser;
|
||||
import org.elasticsearch.index.mapper.MapperParsingException;
|
||||
import org.elasticsearch.index.mapper.attachment.AttachmentMapper;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.elasticsearch.common.io.Streams.copyToBytesFromClasspath;
|
||||
import static org.elasticsearch.common.io.Streams.copyToStringFromClasspath;
|
||||
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
|
||||
import static org.hamcrest.MatcherAssert.assertThat;
|
||||
import static org.hamcrest.Matchers.*;
|
||||
|
||||
/**
|
||||
* Test for https://github.com/elasticsearch/elasticsearch-mapper-attachments/issues/38
|
||||
*/
|
||||
public class MetadataMapperTest {
|
||||
|
||||
protected void checkDate(String filename, Settings settings, Long expected) throws IOException {
|
||||
DocumentMapperParser mapperParser = new DocumentMapperParser(new Index("test"), settings, new AnalysisService(new Index("test")), null, null);
|
||||
mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser());
|
||||
|
||||
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json");
|
||||
DocumentMapper docMapper = mapperParser.parse(mapping);
|
||||
byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/" + filename);
|
||||
|
||||
BytesReference json = jsonBuilder()
|
||||
.startObject()
|
||||
.field("_id", 1)
|
||||
.startObject("file")
|
||||
.field("_name", "htmlWithoutDateMeta.html")
|
||||
.field("content", html)
|
||||
.endObject()
|
||||
.endObject().bytes();
|
||||
|
||||
Document doc = docMapper.parse(json).rootDoc();
|
||||
assertThat(doc.get(docMapper.mappers().smartName("file").mapper().names().indexName()), containsString("World"));
|
||||
assertThat(doc.get(docMapper.mappers().smartName("file.name").mapper().names().indexName()), equalTo("htmlWithoutDateMeta.html"));
|
||||
if (expected == null) {
|
||||
assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()), nullValue());
|
||||
} else {
|
||||
assertThat(doc.getField(docMapper.mappers().smartName("file.date").mapper().names().indexName()).numericValue().longValue(), is(expected));
|
||||
}
|
||||
assertThat(doc.get(docMapper.mappers().smartName("file.title").mapper().names().indexName()), equalTo("Hello"));
|
||||
assertThat(doc.get(docMapper.mappers().smartName("file.author").mapper().names().indexName()), equalTo("kimchy"));
|
||||
assertThat(doc.get(docMapper.mappers().smartName("file.keywords").mapper().names().indexName()), equalTo("elasticsearch,cool,bonsai"));
|
||||
assertThat(doc.get(docMapper.mappers().smartName("file.content_type").mapper().names().indexName()), equalTo("text/html; charset=ISO-8859-1"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIgnoreWithoutDate() throws Exception {
|
||||
checkDate("htmlWithoutDateMeta.html", ImmutableSettings.builder().build(), null);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIgnoreWithEmptyDate() throws Exception {
|
||||
checkDate("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().build(), null);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIgnoreWithCorrectDate() throws Exception {
|
||||
checkDate("htmlWithValidDateMeta.html", ImmutableSettings.builder().build(), 1354233600000L);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWithoutDate() throws Exception {
|
||||
checkDate("htmlWithoutDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null);
|
||||
}
|
||||
|
||||
@Test(expectedExceptions = MapperParsingException.class)
|
||||
public void testWithEmptyDate() throws Exception {
|
||||
checkDate("htmlWithEmptyDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), null);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWithCorrectDate() throws Exception {
|
||||
checkDate("htmlWithValidDateMeta.html", ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(), 1354233600000L);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html lang="fr">
|
||||
<head>
|
||||
<title>Hello</title>
|
||||
<meta name="date" content="">
|
||||
<meta name="Author" content="kimchy">
|
||||
<meta name="Keywords" content="elasticsearch,cool,bonsai">
|
||||
</head>
|
||||
<body>World</body>
|
||||
</html>
|
|
@ -0,0 +1,11 @@
|
|||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html lang="fr">
|
||||
<head>
|
||||
<title>Hello</title>
|
||||
<meta name="date" content="2012-11-30">
|
||||
<meta name="Author" content="kimchy">
|
||||
<meta name="Keywords" content="elasticsearch,cool,bonsai">
|
||||
</head>
|
||||
<body>World</body>
|
||||
</html>
|
|
@ -0,0 +1,10 @@
|
|||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html lang="fr">
|
||||
<head>
|
||||
<title>Hello</title>
|
||||
<meta name="Author" content="kimchy">
|
||||
<meta name="Keywords" content="elasticsearch,cool,bonsai">
|
||||
</head>
|
||||
<body>World</body>
|
||||
</html>
|
|
@ -1,8 +1,8 @@
|
|||
{
|
||||
person:{
|
||||
properties:{
|
||||
"person":{
|
||||
"properties":{
|
||||
"file":{
|
||||
type:"attachment"
|
||||
"type":"attachment"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue