Ignore encrypted documents
Original request: I am sending multiple PDF, Word, etc. attachments in one document to be indexed. Some of them (PDF) are encrypted, and I am getting a MapperParsingException caused by org.apache.tika.exception.TikaException: Unable to extract PDF content, caused by org.apache.pdfbox.exceptions.WrappedIOException: Error decrypting document. I was wondering if the attachment mapper could expose some switch to ignore the documents it cannot extract? As we now have the `ignore_errors` option, we can support this. See #38, which relates to this option. Closes #18.
This commit is contained in:
parent
d6aa2f0615
commit
b35ad804df
|
@ -356,7 +356,9 @@ public class AttachmentMapper implements Mapper {
|
||||||
// Set the maximum length of strings returned by the parseToString method, -1 sets no limit
|
// Set the maximum length of strings returned by the parseToString method, -1 sets no limit
|
||||||
parsedContent = tika().parseToString(new BytesStreamInput(content, false), metadata, indexedChars);
|
parsedContent = tika().parseToString(new BytesStreamInput(content, false), metadata, indexedChars);
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
throw new MapperParsingException("Failed to extract [" + indexedChars + "] characters of text for [" + name + "]", e);
|
// #18: we could ignore errors when Tika does not parse data
|
||||||
|
if (!ignoreErrors) throw new MapperParsingException("Failed to extract [" + indexedChars + "] characters of text for [" + name + "]", e);
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
context.externalValue(parsedContent);
|
context.externalValue(parsedContent);
|
||||||
|
|
|
@ -0,0 +1,130 @@
|
||||||
|
package org.elasticsearch.index.mapper.xcontent;
|
||||||
|
|
||||||
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.elasticsearch.common.bytes.BytesReference;
|
||||||
|
import org.elasticsearch.common.settings.ImmutableSettings;
|
||||||
|
import org.elasticsearch.index.Index;
|
||||||
|
import org.elasticsearch.index.analysis.AnalysisService;
|
||||||
|
import org.elasticsearch.index.mapper.DocumentMapper;
|
||||||
|
import org.elasticsearch.index.mapper.DocumentMapperParser;
|
||||||
|
import org.elasticsearch.index.mapper.MapperParsingException;
|
||||||
|
import org.elasticsearch.index.mapper.attachment.AttachmentMapper;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import static org.elasticsearch.common.io.Streams.copyToBytesFromClasspath;
|
||||||
|
import static org.elasticsearch.common.io.Streams.copyToStringFromClasspath;
|
||||||
|
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
|
||||||
|
import static org.hamcrest.MatcherAssert.assertThat;
|
||||||
|
import static org.hamcrest.Matchers.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test for https://github.com/elasticsearch/elasticsearch-mapper-attachments/issues/18
|
||||||
|
* Note that we have converted /org/elasticsearch/index/mapper/xcontent/testContentLength.txt
|
||||||
|
* to a /org/elasticsearch/index/mapper/xcontent/encrypted.pdf with password `12345678`.
|
||||||
|
*/
|
||||||
|
public class EncryptedDocMapperTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMultipleDocsEncryptedLast() throws IOException {
|
||||||
|
DocumentMapperParser mapperParser = new DocumentMapperParser(new Index("test"), new AnalysisService(new Index("test")), null, null);
|
||||||
|
mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser());
|
||||||
|
|
||||||
|
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/multipledocs/test-mapping.json");
|
||||||
|
DocumentMapper docMapper = mapperParser.parse(mapping);
|
||||||
|
byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/htmlWithValidDateMeta.html");
|
||||||
|
byte[] pdf = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/encrypted.pdf");
|
||||||
|
|
||||||
|
BytesReference json = jsonBuilder()
|
||||||
|
.startObject()
|
||||||
|
.field("_id", 1)
|
||||||
|
.field("file1", html)
|
||||||
|
.field("file2", pdf)
|
||||||
|
.endObject().bytes();
|
||||||
|
|
||||||
|
Document doc = docMapper.parse(json).rootDoc();
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file1").mapper().names().indexName()), containsString("World"));
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file1.title").mapper().names().indexName()), equalTo("Hello"));
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file1.author").mapper().names().indexName()), equalTo("kimchy"));
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file1.keywords").mapper().names().indexName()), equalTo("elasticsearch,cool,bonsai"));
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file1.content_type").mapper().names().indexName()), equalTo("text/html; charset=ISO-8859-1"));
|
||||||
|
assertThat(doc.getField(docMapper.mappers().smartName("file1.content_length").mapper().names().indexName()).numericValue().longValue(), is(344L));
|
||||||
|
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file2").mapper().names().indexName()), nullValue());
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file2.title").mapper().names().indexName()), nullValue());
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file2.author").mapper().names().indexName()), nullValue());
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file2.keywords").mapper().names().indexName()), nullValue());
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file2.content_type").mapper().names().indexName()), nullValue());
|
||||||
|
assertThat(doc.getField(docMapper.mappers().smartName("file2.content_length").mapper().names().indexName()), nullValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMultipleDocsEncryptedFirst() throws IOException {
|
||||||
|
DocumentMapperParser mapperParser = new DocumentMapperParser(new Index("test"), new AnalysisService(new Index("test")), null, null);
|
||||||
|
mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser());
|
||||||
|
|
||||||
|
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/multipledocs/test-mapping.json");
|
||||||
|
DocumentMapper docMapper = mapperParser.parse(mapping);
|
||||||
|
byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/htmlWithValidDateMeta.html");
|
||||||
|
byte[] pdf = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/encrypted.pdf");
|
||||||
|
|
||||||
|
BytesReference json = jsonBuilder()
|
||||||
|
.startObject()
|
||||||
|
.field("_id", 1)
|
||||||
|
.field("file1", pdf)
|
||||||
|
.field("file2", html)
|
||||||
|
.endObject().bytes();
|
||||||
|
|
||||||
|
Document doc = docMapper.parse(json).rootDoc();
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file1").mapper().names().indexName()), nullValue());
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file1.title").mapper().names().indexName()), nullValue());
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file1.author").mapper().names().indexName()), nullValue());
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file1.keywords").mapper().names().indexName()), nullValue());
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file1.content_type").mapper().names().indexName()), nullValue());
|
||||||
|
assertThat(doc.getField(docMapper.mappers().smartName("file1.content_length").mapper().names().indexName()), nullValue());
|
||||||
|
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file2").mapper().names().indexName()), containsString("World"));
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file2.title").mapper().names().indexName()), equalTo("Hello"));
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file2.author").mapper().names().indexName()), equalTo("kimchy"));
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file2.keywords").mapper().names().indexName()), equalTo("elasticsearch,cool,bonsai"));
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file2.content_type").mapper().names().indexName()), equalTo("text/html; charset=ISO-8859-1"));
|
||||||
|
assertThat(doc.getField(docMapper.mappers().smartName("file2.content_length").mapper().names().indexName()).numericValue().longValue(), is(344L));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test(expectedExceptions = MapperParsingException.class)
|
||||||
|
public void testMultipleDocsEncryptedNotIgnoringErrors() throws IOException {
|
||||||
|
DocumentMapperParser mapperParser = new DocumentMapperParser(new Index("test"),
|
||||||
|
ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build(),
|
||||||
|
new AnalysisService(new Index("test")), null, null);
|
||||||
|
mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser());
|
||||||
|
|
||||||
|
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/multipledocs/test-mapping.json");
|
||||||
|
DocumentMapper docMapper = mapperParser.parse(mapping);
|
||||||
|
byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/htmlWithValidDateMeta.html");
|
||||||
|
byte[] pdf = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/encrypted.pdf");
|
||||||
|
|
||||||
|
BytesReference json = jsonBuilder()
|
||||||
|
.startObject()
|
||||||
|
.field("_id", 1)
|
||||||
|
.field("file1", pdf)
|
||||||
|
.field("file2", html)
|
||||||
|
.endObject().bytes();
|
||||||
|
|
||||||
|
Document doc = docMapper.parse(json).rootDoc();
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file1").mapper().names().indexName()), nullValue());
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file1.title").mapper().names().indexName()), nullValue());
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file1.author").mapper().names().indexName()), nullValue());
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file1.keywords").mapper().names().indexName()), nullValue());
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file1.content_type").mapper().names().indexName()), nullValue());
|
||||||
|
assertThat(doc.getField(docMapper.mappers().smartName("file1.content_length").mapper().names().indexName()), nullValue());
|
||||||
|
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file2").mapper().names().indexName()), containsString("World"));
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file2.title").mapper().names().indexName()), equalTo("Hello"));
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file2.author").mapper().names().indexName()), equalTo("kimchy"));
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file2.keywords").mapper().names().indexName()), equalTo("elasticsearch,cool,bonsai"));
|
||||||
|
assertThat(doc.get(docMapper.mappers().smartName("file2.content_type").mapper().names().indexName()), equalTo("text/html; charset=ISO-8859-1"));
|
||||||
|
assertThat(doc.getField(docMapper.mappers().smartName("file2.content_length").mapper().names().indexName()).numericValue().longValue(), is(344L));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,125 @@
|
||||||
|
/*
|
||||||
|
* Licensed to ElasticSearch and Shay Banon under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. ElasticSearch licenses this
|
||||||
|
* file to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing,
|
||||||
|
* software distributed under the License is distributed on an
|
||||||
|
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
* KIND, either express or implied. See the License for the
|
||||||
|
* specific language governing permissions and limitations
|
||||||
|
* under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.elasticsearch.plugin.mapper.attachments.test;
|
||||||
|
|
||||||
|
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
|
||||||
|
import org.elasticsearch.action.admin.cluster.health.ClusterHealthStatus;
|
||||||
|
import org.elasticsearch.action.count.CountResponse;
|
||||||
|
import org.elasticsearch.common.logging.ESLogger;
|
||||||
|
import org.elasticsearch.common.logging.Loggers;
|
||||||
|
import org.elasticsearch.common.network.NetworkUtils;
|
||||||
|
import org.elasticsearch.common.settings.ImmutableSettings;
|
||||||
|
import org.elasticsearch.common.settings.Settings;
|
||||||
|
import org.elasticsearch.index.mapper.MapperParsingException;
|
||||||
|
import org.elasticsearch.node.Node;
|
||||||
|
import org.testng.annotations.AfterClass;
|
||||||
|
import org.testng.annotations.AfterMethod;
|
||||||
|
import org.testng.annotations.BeforeClass;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
import static org.elasticsearch.client.Requests.*;
|
||||||
|
import static org.elasticsearch.common.io.Streams.copyToBytesFromClasspath;
|
||||||
|
import static org.elasticsearch.common.io.Streams.copyToStringFromClasspath;
|
||||||
|
import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
|
||||||
|
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
|
||||||
|
import static org.elasticsearch.index.query.QueryBuilders.fieldQuery;
|
||||||
|
import static org.elasticsearch.node.NodeBuilder.nodeBuilder;
|
||||||
|
import static org.hamcrest.MatcherAssert.assertThat;
|
||||||
|
import static org.hamcrest.Matchers.equalTo;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test case for issue https://github.com/elasticsearch/elasticsearch-mapper-attachments/issues/18
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public class MultipleAttachmentIntegrationTests {
|
||||||
|
|
||||||
|
private final ESLogger logger = Loggers.getLogger(getClass());
|
||||||
|
|
||||||
|
private Node node;
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public void setupServer() {
|
||||||
|
node = nodeBuilder().local(true).settings(settingsBuilder()
|
||||||
|
.put("path.data", "target/data")
|
||||||
|
.put("cluster.name", "test-cluster-" + NetworkUtils.getLocalAddress())
|
||||||
|
.put("gateway.type", "none")).node();
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterClass
|
||||||
|
public void closeServer() {
|
||||||
|
node.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void createIndex(Settings settings) {
|
||||||
|
logger.info("creating index [test]");
|
||||||
|
node.client().admin().indices().create(createIndexRequest("test").settings(settingsBuilder().put("index.numberOfReplicas", 0).put(settings))).actionGet();
|
||||||
|
logger.info("Running Cluster Health");
|
||||||
|
ClusterHealthResponse clusterHealth = node.client().admin().cluster().health(clusterHealthRequest().waitForGreenStatus()).actionGet();
|
||||||
|
logger.info("Done Cluster Health, status " + clusterHealth.getStatus());
|
||||||
|
assertThat(clusterHealth.isTimedOut(), equalTo(false));
|
||||||
|
assertThat(clusterHealth.getStatus(), equalTo(ClusterHealthStatus.GREEN));
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterMethod
|
||||||
|
public void deleteIndex() {
|
||||||
|
logger.info("deleting index [test]");
|
||||||
|
node.client().admin().indices().delete(deleteIndexRequest("test")).actionGet();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* When we want to ignore errors (default)
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testMultipleAttachmentsWithEncryptedDoc() throws Exception {
|
||||||
|
createIndex(ImmutableSettings.builder().build());
|
||||||
|
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/multipledocs/test-mapping.json");
|
||||||
|
byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/htmlWithValidDateMeta.html");
|
||||||
|
byte[] pdf = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/encrypted.pdf");
|
||||||
|
|
||||||
|
node.client().admin().indices().putMapping(putMappingRequest("test").type("person").source(mapping)).actionGet();
|
||||||
|
|
||||||
|
node.client().index(indexRequest("test").type("person")
|
||||||
|
.source(jsonBuilder().startObject().field("file1", html).field("file2", pdf).field("hello","world").endObject())).actionGet();
|
||||||
|
node.client().admin().indices().refresh(refreshRequest()).actionGet();
|
||||||
|
|
||||||
|
CountResponse countResponse = node.client().count(countRequest("test").query(fieldQuery("file1", "World"))).actionGet();
|
||||||
|
assertThat(countResponse.getCount(), equalTo(1l));
|
||||||
|
|
||||||
|
countResponse = node.client().count(countRequest("test").query(fieldQuery("hello", "World"))).actionGet();
|
||||||
|
assertThat(countResponse.getCount(), equalTo(1l));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* When we don't want to ignore errors
|
||||||
|
*/
|
||||||
|
@Test(expectedExceptions = MapperParsingException.class)
|
||||||
|
public void testMultipleAttachmentsWithEncryptedDocNotIgnoringErrors() throws Exception {
|
||||||
|
createIndex(ImmutableSettings.builder().put("index.mapping.attachment.ignore_errors", false).build());
|
||||||
|
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/multipledocs/test-mapping.json");
|
||||||
|
byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/htmlWithValidDateMeta.html");
|
||||||
|
byte[] pdf = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/encrypted.pdf");
|
||||||
|
|
||||||
|
node.client().admin().indices()
|
||||||
|
.putMapping(putMappingRequest("test").type("person").source(mapping)).actionGet();
|
||||||
|
|
||||||
|
node.client().index(indexRequest("test").type("person")
|
||||||
|
.source(jsonBuilder().startObject().field("file1", html).field("file2", pdf).field("hello","world").endObject())).actionGet();
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
{
|
||||||
|
"person":{
|
||||||
|
"properties":{
|
||||||
|
"file1":{
|
||||||
|
"type":"attachment"
|
||||||
|
},
|
||||||
|
"file2":{
|
||||||
|
"type":"attachment"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Binary file not shown.
Loading…
Reference in New Issue