From 6a08ca673a41c5009491d8fb6dcb825e6c2025db Mon Sep 17 00:00:00 2001 From: Henac Date: Sun, 4 Mar 2012 16:09:21 +1100 Subject: [PATCH] Added the ability to specify the amount of text to extract and index from an attachment. --- pom.xml | 17 +++++++++ .../mapper/attachment/AttachmentMapper.java | 15 +++++++- .../mapper/xcontent/testContentLength.txt | 9 +++++ .../SimpleAttachmentIntegrationTests.java | 38 +++++++++++++++++++ 4 files changed, 77 insertions(+), 2 deletions(-) create mode 100644 src/test/java/org/elasticsearch/index/mapper/xcontent/testContentLength.txt diff --git a/pom.xml b/pom.xml index 9f234dbd977..4408f877f56 100644 --- a/pom.xml +++ b/pom.xml @@ -35,6 +35,22 @@ + + codehausSnapshots + Codehaus Snapshots + + true + always + warn + + + true + always + fail + + http://oss.sonatype.org/content/repositories/releases/ + default + @@ -90,6 +106,7 @@ **/*.json **/*.yml **/*.html + **/*.txt diff --git a/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java b/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java index 6d698009a88..bd82c0d79f1 100644 --- a/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java +++ b/src/main/java/org/elasticsearch/index/mapper/attachment/AttachmentMapper.java @@ -45,12 +45,15 @@ import static org.elasticsearch.plugin.mapper.attachments.tika.TikaInstance.tika * { * file1 : { * _content_type : "application/pdf", + * _content_length : 500000000, * _name : "..../something.pdf", * content : "" * } * } * * + * _content_length = Specify the maximum number of characters to extract from the attachment. It must be given as a JSON number (not a quoted string). If not specified, then the default for + * tika is 100,000 characters. Caution is required when setting large values as this can cause memory issues. A value of -1 removes the limit.
* */ public class AttachmentMapper implements Mapper { @@ -237,6 +240,7 @@ public class AttachmentMapper implements Mapper { public void parse(ParseContext context) throws IOException { byte[] content = null; String contentType = null; + int contentLength = 100000; String name = null; XContentParser parser = context.parser(); @@ -256,10 +260,14 @@ public class AttachmentMapper implements Mapper { } else if ("_name".equals(currentFieldName)) { name = parser.text(); } + } else if (token == XContentParser.Token.VALUE_NUMBER) { + if ("_content_length".equals(currentFieldName)) { + contentLength = parser.intValue(); + } } } } - + Metadata metadata = new Metadata(); if (contentType != null) { metadata.add(Metadata.CONTENT_TYPE, contentType); @@ -270,9 +278,12 @@ public class AttachmentMapper implements Mapper { String parsedContent; try { + // Set the maximum length of strings returned by the parseToString method, -1 sets no limit + tika().setMaxStringLength(contentLength); + parsedContent = tika().parseToString(new FastByteArrayInputStream(content), metadata); } catch (TikaException e) { - throw new MapperParsingException("Failed to extract text for [" + name + "]", e); + throw new MapperParsingException("Failed to extract [" + contentLength + "] characters of text for [" + name + "]", e); } context.externalValue(parsedContent); diff --git a/src/test/java/org/elasticsearch/index/mapper/xcontent/testContentLength.txt b/src/test/java/org/elasticsearch/index/mapper/xcontent/testContentLength.txt new file mode 100644 index 00000000000..d392c2d0979 --- /dev/null +++ b/src/test/java/org/elasticsearch/index/mapper/xcontent/testContentLength.txt @@ -0,0 +1,9 @@ +Begin + +BeforeLimit AfterLimit + +Broadway + +Nearing the end + +End \ No newline at end of file diff --git a/src/test/java/org/elasticsearch/plugin/mapper/attachments/test/SimpleAttachmentIntegrationTests.java b/src/test/java/org/elasticsearch/plugin/mapper/attachments/test/SimpleAttachmentIntegrationTests.java index 
aac92dcbf1d..3d7a68c7844 100644 --- a/src/test/java/org/elasticsearch/plugin/mapper/attachments/test/SimpleAttachmentIntegrationTests.java +++ b/src/test/java/org/elasticsearch/plugin/mapper/attachments/test/SimpleAttachmentIntegrationTests.java @@ -95,4 +95,42 @@ public class SimpleAttachmentIntegrationTests { countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "tests the ability"))).actionGet(); assertThat(countResponse.count(), equalTo(1l)); } + + @Test + public void testSimpleAttachmentContentLengthLimit() throws Exception { + String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json"); + byte[] txt = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/testContentLength.txt"); + final int CONTENT_LENGTH_LIMIT = 18; + + node.client().admin().indices().putMapping(putMappingRequest("test").type("person").source(mapping)).actionGet(); + + node.client().index(indexRequest("test").type("person") + .source(jsonBuilder().startObject().field("file").startObject().field("content", txt).field("_content_length", CONTENT_LENGTH_LIMIT).endObject())).actionGet(); + node.client().admin().indices().refresh(refreshRequest()).actionGet(); + + CountResponse countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "BeforeLimit"))).actionGet(); + assertThat(countResponse.count(), equalTo(1l)); + + countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "AfterLimit"))).actionGet(); + assertThat(countResponse.count(), equalTo(0l)); + } + + @Test + public void testSimpleAttachmentNoContentLengthLimit() throws Exception { + String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json"); + byte[] txt = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/testContentLength.txt"); + final int CONTENT_LENGTH_LIMIT = -1; + + 
node.client().admin().indices().putMapping(putMappingRequest("test").type("person").source(mapping)).actionGet(); + + node.client().index(indexRequest("test").type("person") + .source(jsonBuilder().startObject().field("file").startObject().field("content", txt).field("_content_length", CONTENT_LENGTH_LIMIT).endObject())).actionGet(); + node.client().admin().indices().refresh(refreshRequest()).actionGet(); + + CountResponse countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "Begin"))).actionGet(); + assertThat(countResponse.count(), equalTo(1l)); + + countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "End"))).actionGet(); + assertThat(countResponse.count(), equalTo(1l)); + } } \ No newline at end of file