diff --git a/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java b/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java index 3a319614f85..c1ca3bb1d91 100644 --- a/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java +++ b/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java @@ -51,10 +51,9 @@ import static org.elasticsearch.plugin.mapper.attachments.tika.TikaInstance.tika * } * } * - * + *

* _content_length = Specify the maximum amount of characters to extract from the attachment. If not specified, then the default for - * tika is 100,000 characters. Caution is required when setting large values as this can cause memory issues. - * + * tika is 100,000 characters. Caution is required when setting large values as this can cause memory issues. */ public class AttachmentMapper implements Mapper { @@ -68,6 +67,8 @@ public class AttachmentMapper implements Mapper { private ContentPath.Type pathType = Defaults.PATH_TYPE; + private Integer defaultIndexedChars = null; + private StringFieldMapper.Builder contentBuilder; private StringFieldMapper.Builder titleBuilder = stringField("title"); @@ -91,6 +92,11 @@ public class AttachmentMapper implements Mapper { return this; } + public Builder defaultIndexedChars(int defaultIndexedChars) { + this.defaultIndexedChars = defaultIndexedChars; + return this; + } + public Builder content(StringFieldMapper.Builder content) { this.contentBuilder = content; return this; @@ -140,7 +146,14 @@ public class AttachmentMapper implements Mapper { context.path().pathType(origPathType); - return new AttachmentMapper(name, pathType, contentMapper, dateMapper, titleMapper, authorMapper, keywordsMapper, contentTypeMapper); + if (defaultIndexedChars == null && context.indexSettings() != null) { /* an explicit builder value wins; read the index setting only when none was given */ + defaultIndexedChars = context.indexSettings().getAsInt("index.mapping.attachment.indexed_chars", 100000); + } + if (defaultIndexedChars == null) { + defaultIndexedChars = 100000; + } + + return new AttachmentMapper(name, pathType, defaultIndexedChars, contentMapper, dateMapper, titleMapper, authorMapper, keywordsMapper, contentTypeMapper); } } @@ -159,8 +172,6 @@ public class AttachmentMapper implements Mapper { * } * } * - * - * */ public static class TypeParser implements Mapper.TypeParser { @@ -206,6 +217,8 @@ public class AttachmentMapper implements Mapper { private final ContentPath.Type pathType; + private final int defaultIndexedChars; + 
private final StringFieldMapper contentMapper; private final DateFieldMapper dateMapper; @@ -218,11 +231,12 @@ public class AttachmentMapper implements Mapper { private final StringFieldMapper contentTypeMapper; - public AttachmentMapper(String name, ContentPath.Type pathType, StringFieldMapper contentMapper, + public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, StringFieldMapper contentMapper, DateFieldMapper dateMapper, StringFieldMapper titleMapper, StringFieldMapper authorMapper, StringFieldMapper keywordsMapper, StringFieldMapper contentTypeMapper) { this.name = name; this.pathType = pathType; + this.defaultIndexedChars = defaultIndexedChars; this.contentMapper = contentMapper; this.dateMapper = dateMapper; this.titleMapper = titleMapper; @@ -240,7 +254,7 @@ public class AttachmentMapper implements Mapper { public void parse(ParseContext context) throws IOException { byte[] content = null; String contentType = null; - int contentLength = 100000; + int indexedChars = defaultIndexedChars; String name = null; XContentParser parser = context.parser(); @@ -261,13 +275,13 @@ public class AttachmentMapper implements Mapper { name = parser.text(); } } else if (token == XContentParser.Token.VALUE_NUMBER) { - if ("_content_length".equals(currentFieldName)) { - contentLength = parser.intValue(); - } + if ("_indexed_chars".equals(currentFieldName) || "_indexedChars".equals(currentFieldName)) { + indexedChars = parser.intValue(); + } } } } - + Metadata metadata = new Metadata(); if (contentType != null) { metadata.add(Metadata.CONTENT_TYPE, contentType); @@ -279,9 +293,9 @@ public class AttachmentMapper implements Mapper { String parsedContent; try { // Set the maximum length of strings returned by the parseToString method, -1 sets no limit - parsedContent = tika().parseToString(new FastByteArrayInputStream(content), metadata, contentLength); + parsedContent = tika().parseToString(new FastByteArrayInputStream(content), metadata, 
indexedChars); } catch (TikaException e) { - throw new MapperParsingException("Failed to extract [" + contentLength + "] characters of text for [" + name + "]", e); + throw new MapperParsingException("Failed to extract [" + indexedChars + "] characters of text for [" + name + "]", e); } context.externalValue(parsedContent); diff --git a/src/main/java/org/elasticsearch/plugin/mapper/attachments/tika/TikaExtended.java b/src/main/java/org/elasticsearch/plugin/mapper/attachments/tika/TikaExtended.java index b56e55c534f..6a731db3d65 100644 --- a/src/main/java/org/elasticsearch/plugin/mapper/attachments/tika/TikaExtended.java +++ b/src/main/java/org/elasticsearch/plugin/mapper/attachments/tika/TikaExtended.java @@ -19,9 +19,6 @@ package org.elasticsearch.plugin.mapper.attachments.tika; -import java.io.IOException; -import java.io.InputStream; - import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -31,31 +28,32 @@ import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.WriteOutContentHandler; import org.xml.sax.SAXException; +import java.io.IOException; +import java.io.InputStream; + /** * Extends the Tika class, so as to provide a way for setting the maximumStringLength on a per parse document basis. 
*/ public class TikaExtended extends Tika { - public String parseToString(InputStream stream, Metadata metadata, int maxExtractedStringLength) throws IOException, TikaException { - - // setup - WriteOutContentHandler writeHandler = new WriteOutContentHandler(maxExtractedStringLength); - BodyContentHandler contentHandler = new BodyContentHandler(writeHandler); - Parser parser = getParser(); - ParseContext context = new ParseContext(); - context.set(Parser.class, parser); - - try { - parser.parse(stream, contentHandler, metadata, context); - } catch (SAXException e) { - if (!writeHandler.isWriteLimitReached(e)) { - throw new TikaException("Unexpected SAX processing failure", e); - } - } finally { - stream.close(); - } - - return writeHandler.toString(); - } + public String parseToString(InputStream stream, Metadata metadata, int maxStringLength) + throws IOException, TikaException { + WriteOutContentHandler handler = + new WriteOutContentHandler(maxStringLength); + try { + ParseContext context = new ParseContext(); + context.set(Parser.class, getParser()); + getParser().parse( + stream, new BodyContentHandler(handler), metadata, context); + } catch (SAXException e) { + if (!handler.isWriteLimitReached(e)) { + // This should never happen with BodyContentHandler... 
+ throw new TikaException("Unexpected SAX processing failure", e); + } + } finally { + stream.close(); + } + return handler.toString(); + } } diff --git a/src/test/java/org/elasticsearch/plugin/mapper/attachments/test/SimpleAttachmentIntegrationTests.java b/src/test/java/org/elasticsearch/plugin/mapper/attachments/test/SimpleAttachmentIntegrationTests.java index 3d7a68c7844..f3cf3c9c60c 100644 --- a/src/test/java/org/elasticsearch/plugin/mapper/attachments/test/SimpleAttachmentIntegrationTests.java +++ b/src/test/java/org/elasticsearch/plugin/mapper/attachments/test/SimpleAttachmentIntegrationTests.java @@ -95,42 +95,42 @@ public class SimpleAttachmentIntegrationTests { countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "tests the ability"))).actionGet(); assertThat(countResponse.count(), equalTo(1l)); } - + @Test public void testSimpleAttachmentContentLengthLimit() throws Exception { - String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json"); - byte[] txt = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/testContentLength.txt"); - final int CONTENT_LENGTH_LIMIT = 18; - - node.client().admin().indices().putMapping(putMappingRequest("test").type("person").source(mapping)).actionGet(); - - node.client().index(indexRequest("test").type("person") - .source(jsonBuilder().startObject().field("file").startObject().field("content", txt).field("_content_length", CONTENT_LENGTH_LIMIT).endObject())).actionGet(); - node.client().admin().indices().refresh(refreshRequest()).actionGet(); - - CountResponse countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "BeforeLimit"))).actionGet(); - assertThat(countResponse.count(), equalTo(1l)); - - countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "AfterLimit"))).actionGet(); - assertThat(countResponse.count(), equalTo(0l)); + String mapping = 
copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json"); + byte[] txt = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/testContentLength.txt"); + final int CONTENT_LENGTH_LIMIT = 18; + + node.client().admin().indices().putMapping(putMappingRequest("test").type("person").source(mapping)).actionGet(); + + node.client().index(indexRequest("test").type("person") + .source(jsonBuilder().startObject().field("file").startObject().field("content", txt).field("_indexed_chars", CONTENT_LENGTH_LIMIT).endObject())).actionGet(); + node.client().admin().indices().refresh(refreshRequest()).actionGet(); + + CountResponse countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "BeforeLimit"))).actionGet(); + assertThat(countResponse.count(), equalTo(1l)); + + countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "AfterLimit"))).actionGet(); + assertThat(countResponse.count(), equalTo(0l)); } - + @Test public void testSimpleAttachmentNoContentLengthLimit() throws Exception { - String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json"); - byte[] txt = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/testContentLength.txt"); - final int CONTENT_LENGTH_LIMIT = -1; - - node.client().admin().indices().putMapping(putMappingRequest("test").type("person").source(mapping)).actionGet(); - - node.client().index(indexRequest("test").type("person") - .source(jsonBuilder().startObject().field("file").startObject().field("content", txt).field("_content_length", CONTENT_LENGTH_LIMIT).endObject())).actionGet(); - node.client().admin().indices().refresh(refreshRequest()).actionGet(); - - CountResponse countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "Begin"))).actionGet(); - assertThat(countResponse.count(), equalTo(1l)); - - countResponse = 
node.client().count(countRequest("test").query(fieldQuery("file", "End"))).actionGet(); - assertThat(countResponse.count(), equalTo(1l)); + String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json"); + byte[] txt = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/testContentLength.txt"); + final int CONTENT_LENGTH_LIMIT = -1; + + node.client().admin().indices().putMapping(putMappingRequest("test").type("person").source(mapping)).actionGet(); + + node.client().index(indexRequest("test").type("person") + .source(jsonBuilder().startObject().field("file").startObject().field("content", txt).field("_indexed_chars", CONTENT_LENGTH_LIMIT).endObject())).actionGet(); + node.client().admin().indices().refresh(refreshRequest()).actionGet(); + + CountResponse countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "Begin"))).actionGet(); + assertThat(countResponse.count(), equalTo(1l)); + + countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "End"))).actionGet(); + assertThat(countResponse.count(), equalTo(1l)); } } \ No newline at end of file