fields;
+ if (fieldNames != null) {
+ fields = EnumSet.noneOf(Field.class);
+ for (String fieldName : fieldNames) {
+ try {
+ fields.add(Field.parse(fieldName));
+ } catch (Exception e) {
+ throw newConfigurationException(TYPE, processorTag, "fields", "illegal field option [" +
+ fieldName + "]. valid values are " + Arrays.toString(Field.values()));
+ }
+ }
+ } else {
+ fields = DEFAULT_FIELDS;
+ }
+
+ return new AttachmentProcessor(processorTag, sourceField, targetField, fields, indexedChars);
+ }
+ }
+
+ public enum Field {
+
+ CONTENT,
+ TITLE,
+ NAME,
+ AUTHOR,
+ KEYWORDS,
+ DATE,
+ CONTENT_TYPE,
+ CONTENT_LENGTH,
+ LANGUAGE;
+
+ public static Field parse(String value) {
+ return valueOf(value.toUpperCase(Locale.ROOT));
+ }
+
+ public String toLowerCase() {
+ return this.toString().toLowerCase(Locale.ROOT);
+ }
+ }
+}
diff --git a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/IngestAttachmentPlugin.java b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/IngestAttachmentPlugin.java
new file mode 100644
index 00000000000..8957fc2f841
--- /dev/null
+++ b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/IngestAttachmentPlugin.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.ingest.attachment;
+
+import org.elasticsearch.node.NodeModule;
+import org.elasticsearch.plugins.Plugin;
+
+import java.io.IOException;
+
+public class IngestAttachmentPlugin extends Plugin {
+
+ @Override
+ public String name() {
+ return "ingest-attachment";
+ }
+
+ @Override
+ public String description() {
+ return "Ingest processor that uses Tika to extract binary data";
+ }
+
+ public void onModule(NodeModule nodeModule) throws IOException {
+ nodeModule.registerProcessor(AttachmentProcessor.TYPE,
+ (templateService, registry) -> new AttachmentProcessor.Factory());
+ }
+}
diff --git a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java
new file mode 100644
index 00000000000..2ea977b4dd1
--- /dev/null
+++ b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java
@@ -0,0 +1,159 @@
+package org.elasticsearch.ingest.attachment;
+
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import org.apache.tika.Tika;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.elasticsearch.SpecialPermission;
+import org.elasticsearch.bootstrap.JarHell;
+import org.elasticsearch.common.SuppressForbidden;
+import org.elasticsearch.common.io.PathUtils;
+
+import java.io.ByteArrayInputStream;
+import java.io.FilePermission;
+import java.io.IOException;
+import java.lang.reflect.ReflectPermission;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.net.URLClassLoader;
+import java.nio.file.Path;
+import java.security.AccessControlContext;
+import java.security.AccessController;
+import java.security.PermissionCollection;
+import java.security.Permissions;
+import java.security.PrivilegedActionException;
+import java.security.PrivilegedExceptionAction;
+import java.security.ProtectionDomain;
+import java.security.SecurityPermission;
+import java.util.PropertyPermission;
+
+/**
+ * Runs tika with limited parsers and limited permissions.
+ *
+ * Do NOT make public
+ */
+final class TikaImpl {
+
+ /** subset of parsers for types we support */
+ private static final Parser PARSERS[] = new Parser[] {
+ // documents
+ new org.apache.tika.parser.html.HtmlParser(),
+ new org.apache.tika.parser.rtf.RTFParser(),
+ new org.apache.tika.parser.pdf.PDFParser(),
+ new org.apache.tika.parser.txt.TXTParser(),
+ new org.apache.tika.parser.microsoft.OfficeParser(),
+ new org.apache.tika.parser.microsoft.OldExcelParser(),
+ new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(),
+ new org.apache.tika.parser.odf.OpenDocumentParser(),
+ new org.apache.tika.parser.iwork.IWorkPackageParser(),
+ new org.apache.tika.parser.xml.DcXMLParser(),
+ new org.apache.tika.parser.epub.EpubParser(),
+ };
+
+ /** autodetector based on this subset */
+ private static final AutoDetectParser PARSER_INSTANCE = new AutoDetectParser(PARSERS);
+
+ /** singleton tika instance */
+ private static final Tika TIKA_INSTANCE = new Tika(PARSER_INSTANCE.getDetector(), PARSER_INSTANCE);
+
+ /**
+ * parses with tika, throwing any exception hit while parsing the document
+ */
+ // only package private for testing!
+ static String parse(final byte content[], final Metadata metadata, final int limit) throws TikaException, IOException {
+ // check that its not unprivileged code like a script
+ SecurityManager sm = System.getSecurityManager();
+ if (sm != null) {
+ sm.checkPermission(new SpecialPermission());
+ }
+
+ try {
+ return AccessController.doPrivileged(new PrivilegedExceptionAction<String>() {
+ @Override
+ public String run() throws TikaException, IOException {
+ return TIKA_INSTANCE.parseToString(new ByteArrayInputStream(content), metadata, limit);
+ }
+ }, RESTRICTED_CONTEXT);
+ } catch (PrivilegedActionException e) {
+ // checked exception from tika: unbox it
+ Throwable cause = e.getCause();
+ if (cause instanceof TikaException) {
+ throw (TikaException) cause;
+ } else if (cause instanceof IOException) {
+ throw (IOException) cause;
+ } else {
+ throw new AssertionError(cause);
+ }
+ }
+ }
+
+ // apply additional containment for parsers, this is intersected with the current permissions
+ // its hairy, but worth it so we don't have some XML flaw reading random crap from the FS
+ private static final AccessControlContext RESTRICTED_CONTEXT = new AccessControlContext(
+ new ProtectionDomain[] {
+ new ProtectionDomain(null, getRestrictedPermissions())
+ }
+ );
+
+ // compute some minimal permissions for parsers. they only get r/w access to the java temp directory,
+ // the ability to load some resources from JARs, and read sysprops
+ static PermissionCollection getRestrictedPermissions() {
+ Permissions perms = new Permissions();
+ // property/env access needed for parsing
+ perms.add(new PropertyPermission("*", "read"));
+ perms.add(new RuntimePermission("getenv.TIKA_CONFIG"));
+
+ // add permissions for resource access:
+ // classpath
+ addReadPermissions(perms, JarHell.parseClassPath());
+ // plugin jars
+ if (TikaImpl.class.getClassLoader() instanceof URLClassLoader) {
+ addReadPermissions(perms, ((URLClassLoader)TikaImpl.class.getClassLoader()).getURLs());
+ }
+ // jvm's java.io.tmpdir (needs read/write)
+ perms.add(new FilePermission(System.getProperty("java.io.tmpdir") + System.getProperty("file.separator") + "-",
+ "read,readlink,write,delete"));
+ // current hacks needed for POI/PDFbox issues:
+ perms.add(new SecurityPermission("putProviderProperty.BC"));
+ perms.add(new SecurityPermission("insertProvider"));
+ perms.add(new ReflectPermission("suppressAccessChecks"));
+ perms.setReadOnly();
+ return perms;
+ }
+
+ // add resources to (what is typically) a jar, but might not be (e.g. in tests/IDE)
+ @SuppressForbidden(reason = "adds access to jar resources")
+ static void addReadPermissions(Permissions perms, URL resources[]) {
+ try {
+ for (URL url : resources) {
+ Path path = PathUtils.get(url.toURI());
+ // resource itself
+ perms.add(new FilePermission(path.toString(), "read,readlink"));
+ // classes underneath
+ perms.add(new FilePermission(path.toString() + System.getProperty("file.separator") + "-", "read,readlink"));
+ }
+ } catch (URISyntaxException bogus) {
+ throw new RuntimeException(bogus);
+ }
+ }
+}
diff --git a/plugins/ingest-attachment/src/main/plugin-metadata/plugin-security.policy b/plugins/ingest-attachment/src/main/plugin-metadata/plugin-security.policy
new file mode 100644
index 00000000000..e23e9f4d0cf
--- /dev/null
+++ b/plugins/ingest-attachment/src/main/plugin-metadata/plugin-security.policy
@@ -0,0 +1,30 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+// NOTE: when modifying this file, look at restrictions in TikaImpl too
+grant {
+ // needed to apply additional sandboxing to tika parsing
+ permission java.security.SecurityPermission "createAccessControlContext";
+
+ // TODO: fix PDFBox not to actually install bouncy castle like this
+ permission java.security.SecurityPermission "putProviderProperty.BC";
+ permission java.security.SecurityPermission "insertProvider";
+ // TODO: fix POI XWPF to not do this: https://bz.apache.org/bugzilla/show_bug.cgi?id=58597
+ permission java.lang.reflect.ReflectPermission "suppressAccessChecks";
+};
diff --git a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorFactoryTests.java b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorFactoryTests.java
new file mode 100644
index 00000000000..469a0f8629e
--- /dev/null
+++ b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorFactoryTests.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.ingest.attachment;
+
+import org.elasticsearch.ElasticsearchParseException;
+import org.elasticsearch.ingest.core.AbstractProcessorFactory;
+import org.elasticsearch.test.ESTestCase;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.sameInstance;
+import static org.hamcrest.core.Is.is;
+
+public class AttachmentProcessorFactoryTests extends ESTestCase {
+
+ private AttachmentProcessor.Factory factory = new AttachmentProcessor.Factory();
+
+ public void testBuildDefaults() throws Exception {
+ Map<String, Object> config = new HashMap<>();
+ config.put("source_field", "_field");
+
+ String processorTag = randomAsciiOfLength(10);
+ config.put(AbstractProcessorFactory.TAG_KEY, processorTag);
+
+ AttachmentProcessor processor = factory.create(config);
+ assertThat(processor.getTag(), equalTo(processorTag));
+ assertThat(processor.getSourceField(), equalTo("_field"));
+ assertThat(processor.getTargetField(), equalTo("attachment"));
+ assertThat(processor.getFields(), sameInstance(AttachmentProcessor.Factory.DEFAULT_FIELDS));
+ }
+
+ public void testConfigureIndexedChars() throws Exception {
+ int indexedChars = randomIntBetween(1, 100000);
+ Map<String, Object> config = new HashMap<>();
+ config.put("source_field", "_field");
+ config.put("indexed_chars", indexedChars);
+
+ String processorTag = randomAsciiOfLength(10);
+ config.put(AbstractProcessorFactory.TAG_KEY, processorTag);
+ AttachmentProcessor processor = factory.create(config);
+ assertThat(processor.getTag(), equalTo(processorTag));
+ assertThat(processor.getIndexedChars(), is(indexedChars));
+ }
+
+ public void testBuildTargetField() throws Exception {
+ Map<String, Object> config = new HashMap<>();
+ config.put("source_field", "_field");
+ config.put("target_field", "_field");
+ AttachmentProcessor processor = factory.create(config);
+ assertThat(processor.getSourceField(), equalTo("_field"));
+ assertThat(processor.getTargetField(), equalTo("_field"));
+ }
+
+ public void testBuildFields() throws Exception {
+ Set<AttachmentProcessor.Field> fields = EnumSet.noneOf(AttachmentProcessor.Field.class);
+ List<String> fieldNames = new ArrayList<>();
+ int numFields = scaledRandomIntBetween(1, AttachmentProcessor.Field.values().length);
+ for (int i = 0; i < numFields; i++) {
+ AttachmentProcessor.Field field = AttachmentProcessor.Field.values()[i];
+ fields.add(field);
+ fieldNames.add(field.name().toLowerCase(Locale.ROOT));
+ }
+ Map<String, Object> config = new HashMap<>();
+ config.put("source_field", "_field");
+ config.put("fields", fieldNames);
+ AttachmentProcessor processor = factory.create(config);
+ assertThat(processor.getSourceField(), equalTo("_field"));
+ assertThat(processor.getFields(), equalTo(fields));
+ }
+
+ public void testBuildIllegalFieldOption() throws Exception {
+ Map<String, Object> config = new HashMap<>();
+ config.put("source_field", "_field");
+ config.put("fields", Collections.singletonList("invalid"));
+ try {
+ factory.create(config);
+ fail("exception expected");
+ } catch (ElasticsearchParseException e) {
+ assertThat(e.getMessage(), equalTo("[fields] illegal field option [invalid]. valid values are " +
+ "[CONTENT, TITLE, NAME, AUTHOR, KEYWORDS, DATE, CONTENT_TYPE, CONTENT_LENGTH, LANGUAGE]"));
+ }
+
+ config = new HashMap<>();
+ config.put("source_field", "_field");
+ config.put("fields", "invalid");
+ try {
+ factory.create(config);
+ fail("exception expected");
+ } catch (ElasticsearchParseException e) {
+ assertThat(e.getMessage(), equalTo("[fields] property isn't a list, but of type [java.lang.String]"));
+ }
+ }
+}
diff --git a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java
new file mode 100644
index 00000000000..d9c96de8e45
--- /dev/null
+++ b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java
@@ -0,0 +1,190 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.ingest.attachment;
+
+import org.apache.commons.io.IOUtils;
+import org.elasticsearch.ElasticsearchParseException;
+import org.elasticsearch.common.Base64;
+import org.elasticsearch.ingest.RandomDocumentPicks;
+import org.elasticsearch.ingest.core.IngestDocument;
+import org.elasticsearch.test.ESTestCase;
+import org.junit.Before;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
+import static org.hamcrest.Matchers.contains;
+import static org.hamcrest.Matchers.containsInAnyOrder;
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.hasSize;
+import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.not;
+import static org.hamcrest.Matchers.notNullValue;
+import static org.hamcrest.core.IsCollectionContaining.hasItem;
+
+public class AttachmentProcessorTests extends ESTestCase {
+
+ private AttachmentProcessor processor;
+
+ @Before
+ public void createStandardProcessor() throws IOException {
+ processor = new AttachmentProcessor(randomAsciiOfLength(10), "source_field",
+ "target_field", EnumSet.allOf(AttachmentProcessor.Field.class), 10000);
+ }
+
+ public void testEnglishTextDocument() throws Exception {
+ Map<String, Object> attachmentData = parseDocument("text-in-english.txt", processor);
+
+ assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type",
+ "content_length"));
+ assertThat(attachmentData.get("language"), is("en"));
+ assertThat(attachmentData.get("content"), is("\"God Save the Queen\" (alternatively \"God Save the King\""));
+ assertThat(attachmentData.get("content_type").toString(), containsString("text/plain"));
+ assertThat(attachmentData.get("content_length"), is(notNullValue()));
+ }
+
+ public void testEnglishTextDocumentWithRandomFields() throws Exception {
+ Set<AttachmentProcessor.Field> fields = EnumSet.noneOf(AttachmentProcessor.Field.class);
+ List<String> fieldNames = new ArrayList<>();
+ int numFields = scaledRandomIntBetween(1, AttachmentProcessor.Field.values().length);
+ for (int i = 0; i < numFields; i++) {
+ AttachmentProcessor.Field field = AttachmentProcessor.Field.values()[i];
+ fields.add(field);
+ fieldNames.add(field.name().toLowerCase(Locale.ROOT));
+ }
+
+ processor = new AttachmentProcessor(randomAsciiOfLength(10), "source_field",
+ "target_field", EnumSet.copyOf(fields), 10000);
+
+ Map<String, Object> attachmentData = parseDocument("text-in-english.txt", processor);
+ assertThat(attachmentData.keySet(), hasSize(1));
+ assertThat(attachmentData.keySet(), contains("content"));
+ }
+
+ public void testFrenchTextDocument() throws Exception {
+ Map<String, Object> attachmentData = parseDocument("text-in-french.txt", processor);
+
+ assertThat(attachmentData.keySet(), hasItem("language"));
+ assertThat(attachmentData.get("language"), is("fr"));
+ }
+
+ public void testUnknownLanguageDocument() throws Exception {
+ Map<String, Object> attachmentData = parseDocument("text-gibberish.txt", processor);
+
+ assertThat(attachmentData.keySet(), hasItem("language"));
+ // lt seems some standard for not detected
+ assertThat(attachmentData.get("language"), is("lt"));
+ }
+
+ public void testEmptyTextDocument() throws Exception {
+ Map<String, Object> attachmentData = parseDocument("text-empty.txt", processor);
+ assertThat(attachmentData.keySet(), not(hasItem("language")));
+ }
+
+ public void testWordDocument() throws Exception {
+ Map<String, Object> attachmentData = parseDocument("issue-104.docx", processor);
+
+ assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type",
+ "content_length"));
+ assertThat(attachmentData.get("content"), is(notNullValue()));
+ assertThat(attachmentData.get("language"), is("en"));
+ assertThat(attachmentData.get("date"), is("2012-10-12T11:17:00Z"));
+ assertThat(attachmentData.get("author"), is("Windows User"));
+ assertThat(attachmentData.get("content_length"), is(notNullValue()));
+ assertThat(attachmentData.get("content_type").toString(),
+ is("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
+ }
+
+ public void testEncryptedPdf() throws Exception {
+ try {
+ parseDocument("encrypted.pdf", processor); fail("expected ElasticsearchParseException");
+ } catch (ElasticsearchParseException e) {
+ assertThat(e.getDetailedMessage(), containsString("document is encrypted"));
+ }
+ }
+
+ public void testHtmlDocument() throws Exception {
+ Map<String, Object> attachmentData = parseDocument("htmlWithEmptyDateMeta.html", processor);
+
+ assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "author", "keywords", "title",
+ "content_type", "content_length"));
+ assertThat(attachmentData.get("language"), is("en"));
+ assertThat(attachmentData.get("content"), is(notNullValue()));
+ assertThat(attachmentData.get("content_length"), is(notNullValue()));
+ assertThat(attachmentData.get("author"), is("kimchy"));
+ assertThat(attachmentData.get("keywords"), is("elasticsearch,cool,bonsai"));
+ assertThat(attachmentData.get("title"), is("Hello"));
+ assertThat(attachmentData.get("content_type").toString(), containsString("text/html"));
+ }
+
+ public void testXHtmlDocument() throws Exception {
+ Map<String, Object> attachmentData = parseDocument("testXHTML.html", processor);
+
+ assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "author", "title",
+ "content_type", "content_length"));
+ assertThat(attachmentData.get("content_type").toString(), containsString("application/xhtml+xml"));
+ }
+
+ public void testEpubDocument() throws Exception {
+ Map<String, Object> attachmentData = parseDocument("testEPUB.epub", processor);
+
+ assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "author", "title",
+ "content_type", "content_length", "date", "keywords"));
+ assertThat(attachmentData.get("content_type").toString(), containsString("application/epub+zip"));
+ }
+
+ // no real detection, just rudimentary
+ public void testAsciidocDocument() throws Exception {
+ Map<String, Object> attachmentData = parseDocument("asciidoc.asciidoc", processor);
+
+ assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content_type", "content",
+ "content_length"));
+ assertThat(attachmentData.get("content_type").toString(), containsString("text/plain"));
+ }
+
+ private Map<String, Object> parseDocument(String file, AttachmentProcessor processor) throws Exception {
+ Map document = new HashMap<>();
+ document.put("source_field", getAsBase64(file));
+
+ IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document);
+ processor.execute(ingestDocument);
+
+ @SuppressWarnings("unchecked")
+ Map<String, Object> attachmentData = (Map<String, Object>) ingestDocument.getSourceAndMetadata()
+ .get("target_field");
+ return attachmentData;
+ }
+
+ protected String getAsBase64(String filename) throws Exception {
+ String path = "/org/elasticsearch/ingest/attachment/test/sample-files/" + filename;
+ try (InputStream is = AttachmentProcessorTests.class.getResourceAsStream(path)) {
+ byte bytes[] = IOUtils.toByteArray(is);
+ return Base64.encodeBytes(bytes);
+ }
+ }
+}
diff --git a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/IngestAttachmentRestIT.java b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/IngestAttachmentRestIT.java
new file mode 100644
index 00000000000..2399f854c91
--- /dev/null
+++ b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/IngestAttachmentRestIT.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.ingest.attachment;
+
+import com.carrotsearch.randomizedtesting.annotations.Name;
+import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
+import org.elasticsearch.test.rest.ESRestTestCase;
+import org.elasticsearch.test.rest.RestTestCandidate;
+import org.elasticsearch.test.rest.parser.RestTestParseException;
+
+import java.io.IOException;
+
+public class IngestAttachmentRestIT extends ESRestTestCase {
+
+ public IngestAttachmentRestIT(@Name("yaml") RestTestCandidate testCandidate) {
+ super(testCandidate);
+ }
+
+ @ParametersFactory
+ public static Iterable parameters() throws IOException, RestTestParseException {
+ return ESRestTestCase.createParameters(0, 1);
+ }
+}
+
diff --git a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/TikaDocTests.java b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/TikaDocTests.java
new file mode 100644
index 00000000000..0c63f65c247
--- /dev/null
+++ b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/TikaDocTests.java
@@ -0,0 +1,65 @@
+package org.elasticsearch.ingest.attachment;
+
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import org.apache.lucene.util.LuceneTestCase.SuppressFileSystems;
+import org.apache.lucene.util.TestUtil;
+import org.apache.tika.metadata.Metadata;
+import org.elasticsearch.test.ESTestCase;
+
+import java.nio.file.DirectoryStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+/**
+ * Evil test-coverage cheat, we parse a bunch of docs from tika
+ * so that we have a nice grab-bag variety, and assert some content
+ * comes back and no exception.
+ */
+@SuppressFileSystems("ExtrasFS") // don't try to parse extraN
+public class TikaDocTests extends ESTestCase {
+
+ /** some test files from tika test suite, zipped up */
+ static final String TIKA_FILES = "/org/elasticsearch/ingest/attachment/test/tika-files.zip";
+
+ public void testFiles() throws Exception {
+ Path tmp = createTempDir();
+ TestUtil.unzip(getClass().getResourceAsStream(TIKA_FILES), tmp);
+
+ try (DirectoryStream<Path> stream = Files.newDirectoryStream(tmp)) {
+ for (Path doc : stream) {
+ logger.debug("parsing: {}", doc);
+ assertParseable(doc);
+ }
+ }
+ }
+
+ void assertParseable(Path fileName) throws Exception {
+ try {
+ byte bytes[] = Files.readAllBytes(fileName);
+ String parsedContent = TikaImpl.parse(bytes, new Metadata(), -1);
+ assertNotNull(parsedContent);
+ assertFalse(parsedContent.isEmpty());
+ logger.debug("extracted content: {}", parsedContent);
+ } catch (Throwable e) {
+ throw new RuntimeException("parsing of filename: " + fileName.getFileName() + " failed", e);
+ }
+ }
+}
diff --git a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/TikaImplTests.java b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/TikaImplTests.java
new file mode 100644
index 00000000000..6fe0e94a8e9
--- /dev/null
+++ b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/TikaImplTests.java
@@ -0,0 +1,30 @@
+package org.elasticsearch.ingest.attachment;
+
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import org.elasticsearch.test.ESTestCase;
+
+public class TikaImplTests extends ESTestCase {
+
+ public void testTikaLoads() throws Exception {
+ Class.forName("org.elasticsearch.ingest.attachment.TikaImpl");
+ }
+
+}
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/asciidoc.asciidoc b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/asciidoc.asciidoc
new file mode 100644
index 00000000000..dc06d4e83dd
--- /dev/null
+++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/asciidoc.asciidoc
@@ -0,0 +1,5 @@
+[[tika-asciidoc]]
+= AsciiDoc test
+
+Here is a test of the asciidoc format.
+
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/encrypted.pdf b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/encrypted.pdf
new file mode 100644
index 00000000000..569a904a315
Binary files /dev/null and b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/encrypted.pdf differ
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/htmlWithEmptyDateMeta.html b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/htmlWithEmptyDateMeta.html
new file mode 100644
index 00000000000..c65e214aae4
--- /dev/null
+++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/htmlWithEmptyDateMeta.html
@@ -0,0 +1,11 @@
+
+
+
+ Hello
+
+
+
+
+Hello again. This is a test sentence to check for language detection.
+
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/htmlWithValidDateMeta.html b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/htmlWithValidDateMeta.html
new file mode 100644
index 00000000000..79b5a6234ec
--- /dev/null
+++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/htmlWithValidDateMeta.html
@@ -0,0 +1,11 @@
+
+
+
+ Hello
+
+
+
+
+World
+
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/htmlWithoutDateMeta.html b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/htmlWithoutDateMeta.html
new file mode 100644
index 00000000000..3322fa3a734
--- /dev/null
+++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/htmlWithoutDateMeta.html
@@ -0,0 +1,10 @@
+
+
+
+ Hello
+
+
+
+World
+
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-104.docx b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-104.docx
new file mode 100644
index 00000000000..f126e20b32e
Binary files /dev/null and b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-104.docx differ
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/testContentLength.txt b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/testContentLength.txt
new file mode 100644
index 00000000000..d392c2d0979
--- /dev/null
+++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/testContentLength.txt
@@ -0,0 +1,9 @@
+Begin
+
+BeforeLimit AfterLimit
+
+Broadway
+
+Nearing the end
+
+End
\ No newline at end of file
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/testEPUB.epub b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/testEPUB.epub
new file mode 100644
index 00000000000..a6fc2e634d5
Binary files /dev/null and b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/testEPUB.epub differ
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/testXHTML.html b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/testXHTML.html
new file mode 100644
index 00000000000..f5564f025d2
--- /dev/null
+++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/testXHTML.html
@@ -0,0 +1,29 @@
+
+
+
+ XHTML test document
+
+
+
+
+
+ This document tests the ability of Apache Tika to extract content
+ from an XHTML document .
+
+
+
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-empty.txt b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-empty.txt
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-gibberish.txt b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-gibberish.txt
new file mode 100644
index 00000000000..d4b05975c97
--- /dev/null
+++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-gibberish.txt
@@ -0,0 +1 @@
+sdokghdfsbhiughie eij fnseiuvn ifnvropigjnior bnriogbnvr osibnopribn giodbn isn prsbnvrbnirbnrpinb riunbiru ntibnriubnribuni nrbis
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-in-english.txt b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-in-english.txt
new file mode 100644
index 00000000000..08280926034
--- /dev/null
+++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-in-english.txt
@@ -0,0 +1 @@
+"God Save the Queen" (alternatively "God Save the King"
\ No newline at end of file
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-in-french.txt b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-in-french.txt
new file mode 100644
index 00000000000..e4619fb1b88
--- /dev/null
+++ b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/text-in-french.txt
@@ -0,0 +1 @@
+Allons enfants de la Patrie Le jour de gloire est arrivé. Contre nous de la tyrannie
\ No newline at end of file
diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files.zip b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files.zip
new file mode 100644
index 00000000000..10f5d507677
Binary files /dev/null and b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files.zip differ
diff --git a/plugins/ingest-attachment/src/test/resources/rest-api-spec/test/ingest_attachment/10_basic.yaml b/plugins/ingest-attachment/src/test/resources/rest-api-spec/test/ingest_attachment/10_basic.yaml
new file mode 100644
index 00000000000..ed752971fcb
--- /dev/null
+++ b/plugins/ingest-attachment/src/test/resources/rest-api-spec/test/ingest_attachment/10_basic.yaml
@@ -0,0 +1,5 @@
+"Ingest attachment plugin installed":
+ - do:
+ cluster.stats: {}
+
+ - match: { nodes.plugins.0.name: ingest-attachment }
diff --git a/plugins/ingest-attachment/src/test/resources/rest-api-spec/test/ingest_attachment/20_attachment_processor.yaml b/plugins/ingest-attachment/src/test/resources/rest-api-spec/test/ingest_attachment/20_attachment_processor.yaml
new file mode 100644
index 00000000000..85c2f0d245d
--- /dev/null
+++ b/plugins/ingest-attachment/src/test/resources/rest-api-spec/test/ingest_attachment/20_attachment_processor.yaml
@@ -0,0 +1,114 @@
+---
+"Test ingest attachment processor with defaults":
+ - do:
+ ingest.put_pipeline:
+ id: "my_pipeline"
+ body: >
+ {
+ "description": "_description",
+ "processors": [
+ {
+ "attachment" : {
+ "source_field" : "field1"
+ }
+ }
+ ]
+ }
+ - match: { acknowledged: true }
+
+ - do:
+ index:
+ index: test
+ type: test
+ id: 1
+ pipeline: "my_pipeline"
+ body: { field1: "VGhpcyBpcyBhbiBlbmdsaXNoIHRleHQgdG8gdGVzdCBpZiB0aGUgcGlwZWxpbmUgd29ya3M=" }
+
+ - do:
+ get:
+ index: test
+ type: test
+ id: 1
+ - match: { _source.field1: "VGhpcyBpcyBhbiBlbmdsaXNoIHRleHQgdG8gdGVzdCBpZiB0aGUgcGlwZWxpbmUgd29ya3M=" }
+ - length: { _source.attachment: 4 }
+ - match: { _source.attachment.content: "This is an english text to test if the pipeline works" }
+ - match: { _source.attachment.language: "en" }
+ - match: { _source.attachment.content_length: "54" }
+ - match: { _source.attachment.content_type: "text/plain; charset=ISO-8859-1" }
+
+---
+"Test attachment processor with fields":
+ - do:
+ cluster.health:
+ wait_for_status: green
+
+ - do:
+ ingest.put_pipeline:
+ id: "my_pipeline"
+ body: >
+ {
+ "description": "_description",
+ "processors": [
+ {
+ "attachment" : {
+ "source_field" : "field1",
+ "fields" : ["language"]
+ }
+ }
+ ]
+ }
+ - match: { acknowledged: true }
+
+ - do:
+ index:
+ index: test
+ type: test
+ id: 1
+ pipeline: "my_pipeline"
+ body: { field1: "VGhpcyBpcyBhbiBlbmdsaXNoIHRleHQgdG8gdGVzdCBpZiB0aGUgcGlwZWxpbmUgd29ya3MK" }
+
+ - do:
+ get:
+ index: test
+ type: test
+ id: 1
+ - match: { _source.field1: "VGhpcyBpcyBhbiBlbmdsaXNoIHRleHQgdG8gdGVzdCBpZiB0aGUgcGlwZWxpbmUgd29ya3MK" }
+ - length: { _source.attachment: 1 }
+ - match: { _source.attachment.language: "en" }
+
+---
+"Test indexed chars are configurable":
+ - do:
+ ingest.put_pipeline:
+ id: "my_pipeline"
+ body: >
+ {
+ "description": "_description",
+ "processors": [
+ {
+ "attachment" : {
+ "source_field" : "field1",
+ "indexed_chars": 30
+ }
+ }
+ ]
+ }
+ - match: { acknowledged: true }
+
+ - do:
+ index:
+ index: test
+ type: test
+ id: 1
+ pipeline: "my_pipeline"
+ body: { field1: "VGhpcyBpcyBhbiBlbmdsaXNoIHRleHQgdG8gdGVzdCBpZiB0aGUgcGlwZWxpbmUgd29ya3M=" }
+
+ - do:
+ get:
+ index: test
+ type: test
+ id: 1
+ - length: { _source.attachment: 4 }
+ - match: { _source.attachment.content: "This is an english text to tes" }
+ - match: { _source.attachment.language: "en" }
+ - match: { _source.attachment.content_length: "30" }
diff --git a/settings.gradle b/settings.gradle
index df2ce16c8bc..228b95ff511 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -25,6 +25,7 @@ List projects = [
'plugins:discovery-ec2',
'plugins:discovery-gce',
+ 'plugins:ingest-attachment',
 'plugins:ingest-geoip',
'plugins:lang-javascript',
'plugins:lang-painless',
'plugins:lang-python',