diff --git a/pom.xml b/pom.xml index 2d23d2e2596..ab68e422e84 100644 --- a/pom.xml +++ b/pom.xml @@ -35,6 +35,22 @@ + + sonatype-releases + Sonatype Releases + + true + always + warn + + + true + always + fail + + http://oss.sonatype.org/content/repositories/releases/ + default + @@ -90,6 +106,7 @@ **/*.json **/*.yml **/*.html + **/*.txt diff --git a/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java b/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java index 6d698009a88..3a319614f85 100644 --- a/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java +++ b/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java @@ -45,12 +45,15 @@ import static org.elasticsearch.plugin.mapper.attachments.tika.TikaInstance.tika * { * file1 : { * _content_type : "application/pdf", + * _content_length : 500000000, * _name : "..../something.pdf", * content : "" * } * } * * + * _content_length = Specify the maximum number of characters to extract from the attachment. If not specified, then the default for + * tika is 100,000 characters. Caution is required when setting large values as this can cause memory issues. 
* */ public class AttachmentMapper implements Mapper { @@ -237,6 +240,7 @@ public class AttachmentMapper implements Mapper { public void parse(ParseContext context) throws IOException { byte[] content = null; String contentType = null; + int contentLength = 100000; String name = null; XContentParser parser = context.parser(); @@ -256,10 +260,14 @@ public class AttachmentMapper implements Mapper { } else if ("_name".equals(currentFieldName)) { name = parser.text(); } + } else if (token == XContentParser.Token.VALUE_NUMBER) { + if ("_content_length".equals(currentFieldName)) { + contentLength = parser.intValue(); + } } } } - + Metadata metadata = new Metadata(); if (contentType != null) { metadata.add(Metadata.CONTENT_TYPE, contentType); @@ -270,9 +278,10 @@ public class AttachmentMapper implements Mapper { String parsedContent; try { - parsedContent = tika().parseToString(new FastByteArrayInputStream(content), metadata); + // Set the maximum length of strings returned by the parseToString method, -1 sets no limit + parsedContent = tika().parseToString(new FastByteArrayInputStream(content), metadata, contentLength); } catch (TikaException e) { - throw new MapperParsingException("Failed to extract text for [" + name + "]", e); + throw new MapperParsingException("Failed to extract [" + contentLength + "] characters of text for [" + name + "]", e); } context.externalValue(parsedContent); diff --git a/src/main/java/org/elasticsearch/plugin/mapper/attachments/tika/TikaExtended.java b/src/main/java/org/elasticsearch/plugin/mapper/attachments/tika/TikaExtended.java new file mode 100644 index 00000000000..b56e55c534f --- /dev/null +++ b/src/main/java/org/elasticsearch/plugin/mapper/attachments/tika/TikaExtended.java @@ -0,0 +1,61 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.plugin.mapper.attachments.tika; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.tika.Tika; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.WriteOutContentHandler; +import org.xml.sax.SAXException; + + +/** + * Extends the Tika class to provide a way of setting the maximumStringLength on a per-document basis. 
+ */ +public class TikaExtended extends Tika { + + public String parseToString(InputStream stream, Metadata metadata, int maxExtractedStringLength) throws IOException, TikaException { + + // setup + WriteOutContentHandler writeHandler = new WriteOutContentHandler(maxExtractedStringLength); + BodyContentHandler contentHandler = new BodyContentHandler(writeHandler); + Parser parser = getParser(); + ParseContext context = new ParseContext(); + context.set(Parser.class, parser); + + try { + parser.parse(stream, contentHandler, metadata, context); + } catch (SAXException e) { + if (!writeHandler.isWriteLimitReached(e)) { + throw new TikaException("Unexpected SAX processing failure", e); + } + } finally { + stream.close(); + } + + return writeHandler.toString(); + } +} diff --git a/src/main/java/org/elasticsearch/plugin/mapper/attachments/tika/TikaInstance.java b/src/main/java/org/elasticsearch/plugin/mapper/attachments/tika/TikaInstance.java index c5847491312..05c6ac95ad9 100644 --- a/src/main/java/org/elasticsearch/plugin/mapper/attachments/tika/TikaInstance.java +++ b/src/main/java/org/elasticsearch/plugin/mapper/attachments/tika/TikaInstance.java @@ -19,16 +19,15 @@ package org.elasticsearch.plugin.mapper.attachments.tika; -import org.apache.tika.Tika; /** * */ public class TikaInstance { - private static final Tika tika = new Tika(); + private static final TikaExtended tika = new TikaExtended(); - public static Tika tika() { + public static TikaExtended tika() { return tika; } } diff --git a/src/test/java/org/elasticsearch/index/mapper/xcontent/testContentLength.txt b/src/test/java/org/elasticsearch/index/mapper/xcontent/testContentLength.txt new file mode 100644 index 00000000000..d392c2d0979 --- /dev/null +++ b/src/test/java/org/elasticsearch/index/mapper/xcontent/testContentLength.txt @@ -0,0 +1,9 @@ +Begin + +BeforeLimit AfterLimit + +Broadway + +Nearing the end + +End \ No newline at end of file diff --git 
a/src/test/java/org/elasticsearch/plugin/mapper/attachments/test/SimpleAttachmentIntegrationTests.java b/src/test/java/org/elasticsearch/plugin/mapper/attachments/test/SimpleAttachmentIntegrationTests.java index aac92dcbf1d..3d7a68c7844 100644 --- a/src/test/java/org/elasticsearch/plugin/mapper/attachments/test/SimpleAttachmentIntegrationTests.java +++ b/src/test/java/org/elasticsearch/plugin/mapper/attachments/test/SimpleAttachmentIntegrationTests.java @@ -95,4 +95,42 @@ public class SimpleAttachmentIntegrationTests { countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "tests the ability"))).actionGet(); assertThat(countResponse.count(), equalTo(1l)); } + + @Test + public void testSimpleAttachmentContentLengthLimit() throws Exception { + String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json"); + byte[] txt = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/testContentLength.txt"); + final int CONTENT_LENGTH_LIMIT = 18; + + node.client().admin().indices().putMapping(putMappingRequest("test").type("person").source(mapping)).actionGet(); + + node.client().index(indexRequest("test").type("person") + .source(jsonBuilder().startObject().field("file").startObject().field("content", txt).field("_content_length", CONTENT_LENGTH_LIMIT).endObject())).actionGet(); + node.client().admin().indices().refresh(refreshRequest()).actionGet(); + + CountResponse countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "BeforeLimit"))).actionGet(); + assertThat(countResponse.count(), equalTo(1l)); + + countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "AfterLimit"))).actionGet(); + assertThat(countResponse.count(), equalTo(0l)); + } + + @Test + public void testSimpleAttachmentNoContentLengthLimit() throws Exception { + String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json"); + byte[] txt 
= copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/testContentLength.txt"); + final int CONTENT_LENGTH_LIMIT = -1; + + node.client().admin().indices().putMapping(putMappingRequest("test").type("person").source(mapping)).actionGet(); + + node.client().index(indexRequest("test").type("person") + .source(jsonBuilder().startObject().field("file").startObject().field("content", txt).field("_content_length", CONTENT_LENGTH_LIMIT).endObject())).actionGet(); + node.client().admin().indices().refresh(refreshRequest()).actionGet(); + + CountResponse countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "Begin"))).actionGet(); + assertThat(countResponse.count(), equalTo(1l)); + + countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "End"))).actionGet(); + assertThat(countResponse.count(), equalTo(1l)); + } } \ No newline at end of file