Rename the per-document parameter to _indexed_chars (as documented), and add the index.mapping.attachment.indexed_chars setting to change the default globally (per index)

This commit is contained in:
Shay Banon 2012-03-07 21:53:41 +02:00
parent 59f38ff576
commit 0352c1436e
3 changed files with 82 additions and 70 deletions

View File

@ -51,10 +51,9 @@ import static org.elasticsearch.plugin.mapper.attachments.tika.TikaInstance.tika
* }
* }
* </pre>
*
* <p/>
_indexed_chars = Specify the maximum number of characters to extract from the attachment. If not specified, then the default for
* tika is 100,000 characters. Caution is required when setting large values as this can cause memory issues.
*
* tika is 100,000 characters. Caution is required when setting large values as this can cause memory issues.
*/
public class AttachmentMapper implements Mapper {
@ -68,6 +67,8 @@ public class AttachmentMapper implements Mapper {
private ContentPath.Type pathType = Defaults.PATH_TYPE;
private Integer defaultIndexedChars = null;
private StringFieldMapper.Builder contentBuilder;
private StringFieldMapper.Builder titleBuilder = stringField("title");
@ -91,6 +92,11 @@ public class AttachmentMapper implements Mapper {
return this;
}
/**
 * Sets the default maximum number of characters to extract from an attachment.
 * The mapper uses this value as the starting limit for every document; a document
 * may still override it with its own {@code _indexed_chars} field.
 *
 * @param defaultIndexedChars the default character limit to apply
 * @return this builder, to allow call chaining
 */
public Builder defaultIndexedChars(int defaultIndexedChars) {
this.defaultIndexedChars = defaultIndexedChars;
return this;
}
public Builder content(StringFieldMapper.Builder content) {
this.contentBuilder = content;
return this;
@ -140,7 +146,14 @@ public class AttachmentMapper implements Mapper {
context.path().pathType(origPathType);
return new AttachmentMapper(name, pathType, contentMapper, dateMapper, titleMapper, authorMapper, keywordsMapper, contentTypeMapper);
// Resolve the default character limit, in order of precedence:
//   1. a value set explicitly on this builder,
//   2. the per-index setting "index.mapping.attachment.indexed_chars",
//   3. the hard-coded fallback of 100000 characters.
// The index setting must only be consulted when the builder value is absent;
// checking "!= null" here would ignore the setting when nothing was configured
// and clobber an explicitly configured value when one was.
if (defaultIndexedChars == null && context.indexSettings() != null) {
    defaultIndexedChars = context.indexSettings().getAsInt("index.mapping.attachment.indexed_chars", 100000);
}
// No index settings available at all: fall back to the built-in default.
if (defaultIndexedChars == null) {
    defaultIndexedChars = 100000;
}
return new AttachmentMapper(name, pathType, defaultIndexedChars, contentMapper, dateMapper, titleMapper, authorMapper, keywordsMapper, contentTypeMapper);
}
}
@ -159,8 +172,6 @@ public class AttachmentMapper implements Mapper {
* }
* }
* </pre>
*
*
*/
public static class TypeParser implements Mapper.TypeParser {
@ -206,6 +217,8 @@ public class AttachmentMapper implements Mapper {
private final ContentPath.Type pathType;
private final int defaultIndexedChars;
private final StringFieldMapper contentMapper;
private final DateFieldMapper dateMapper;
@ -218,11 +231,12 @@ public class AttachmentMapper implements Mapper {
private final StringFieldMapper contentTypeMapper;
public AttachmentMapper(String name, ContentPath.Type pathType, StringFieldMapper contentMapper,
public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, StringFieldMapper contentMapper,
DateFieldMapper dateMapper, StringFieldMapper titleMapper, StringFieldMapper authorMapper,
StringFieldMapper keywordsMapper, StringFieldMapper contentTypeMapper) {
this.name = name;
this.pathType = pathType;
this.defaultIndexedChars = defaultIndexedChars;
this.contentMapper = contentMapper;
this.dateMapper = dateMapper;
this.titleMapper = titleMapper;
@ -240,7 +254,7 @@ public class AttachmentMapper implements Mapper {
public void parse(ParseContext context) throws IOException {
byte[] content = null;
String contentType = null;
int contentLength = 100000;
int indexedChars = defaultIndexedChars;
String name = null;
XContentParser parser = context.parser();
@ -261,13 +275,13 @@ public class AttachmentMapper implements Mapper {
name = parser.text();
}
} else if (token == XContentParser.Token.VALUE_NUMBER) {
if ("_content_length".equals(currentFieldName)) {
contentLength = parser.intValue();
}
if ("_indexed_chars".equals(currentFieldName) || "_indexedChars".equals(currentFieldName)) {
indexedChars = parser.intValue();
}
}
}
}
Metadata metadata = new Metadata();
if (contentType != null) {
metadata.add(Metadata.CONTENT_TYPE, contentType);
@ -279,9 +293,9 @@ public class AttachmentMapper implements Mapper {
String parsedContent;
try {
// Set the maximum length of strings returned by the parseToString method, -1 sets no limit
parsedContent = tika().parseToString(new FastByteArrayInputStream(content), metadata, contentLength);
parsedContent = tika().parseToString(new FastByteArrayInputStream(content), metadata, indexedChars);
} catch (TikaException e) {
throw new MapperParsingException("Failed to extract [" + contentLength + "] characters of text for [" + name + "]", e);
throw new MapperParsingException("Failed to extract [" + indexedChars + "] characters of text for [" + name + "]", e);
}
context.externalValue(parsedContent);

View File

@ -19,9 +19,6 @@
package org.elasticsearch.plugin.mapper.attachments.tika;
import java.io.IOException;
import java.io.InputStream;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
@ -31,31 +28,32 @@ import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.InputStream;
/**
* Extends the Tika class, so as to provide a way for setting the maximumStringLength on a per parse document basis.
*/
public class TikaExtended extends Tika {
/**
 * Parses the given stream and returns the extracted body text, with the amount of
 * text written bounded by {@code maxExtractedStringLength} for this call only.
 * The stream is always closed before this method returns.
 *
 * @param stream                   the document to parse; closed by this method
 * @param metadata                 metadata passed through to the parser
 * @param maxExtractedStringLength write limit handed to the {@code WriteOutContentHandler}
 *                                 (NOTE(review): the caller's comment says -1 means no limit — confirm against Tika)
 * @return the text collected by the write handler
 * @throws IOException   propagated from parsing or from closing the stream
 * @throws TikaException if parsing fails, or on a SAX failure that is not the write limit being reached
 */
public String parseToString(InputStream stream, Metadata metadata, int maxExtractedStringLength) throws IOException, TikaException {
// setup
WriteOutContentHandler writeHandler = new WriteOutContentHandler(maxExtractedStringLength);
BodyContentHandler contentHandler = new BodyContentHandler(writeHandler);
Parser parser = getParser();
ParseContext context = new ParseContext();
context.set(Parser.class, parser);
try {
parser.parse(stream, contentHandler, metadata, context);
} catch (SAXException e) {
// A SAXException caused by hitting the write limit is expected and ignored,
// so the text gathered so far is still returned; anything else is rethrown.
if (!writeHandler.isWriteLimitReached(e)) {
throw new TikaException("Unexpected SAX processing failure", e);
}
} finally {
stream.close();
}
return writeHandler.toString();
}
/**
 * Parses the given stream and returns the extracted body text, with the amount of
 * text written bounded by {@code maxStringLength} for this call only.
 * The stream is always closed before this method returns.
 *
 * @param stream          the document to parse; closed by this method
 * @param metadata        metadata passed through to the parser
 * @param maxStringLength write limit handed to the {@code WriteOutContentHandler}
 *                        (NOTE(review): the caller's comment says -1 means no limit — confirm against Tika)
 * @return the text collected by the write handler
 * @throws IOException   propagated from parsing or from closing the stream
 * @throws TikaException if parsing fails, or on a SAX failure that is not the write limit being reached
 */
public String parseToString(InputStream stream, Metadata metadata, int maxStringLength)
throws IOException, TikaException {
WriteOutContentHandler handler =
new WriteOutContentHandler(maxStringLength);
try {
ParseContext context = new ParseContext();
context.set(Parser.class, getParser());
getParser().parse(
stream, new BodyContentHandler(handler), metadata, context);
} catch (SAXException e) {
// A SAXException caused by hitting the write limit is expected and ignored,
// so the text gathered so far is still returned; anything else is rethrown.
if (!handler.isWriteLimitReached(e)) {
// This should never happen with BodyContentHandler...
throw new TikaException("Unexpected SAX processing failure", e);
}
} finally {
stream.close();
}
return handler.toString();
}
}

View File

@ -95,42 +95,42 @@ public class SimpleAttachmentIntegrationTests {
countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "tests the ability"))).actionGet();
assertThat(countResponse.count(), equalTo(1l));
}
@Test
public void testSimpleAttachmentContentLengthLimit() throws Exception {
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json");
byte[] txt = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/testContentLength.txt");
final int CONTENT_LENGTH_LIMIT = 18;
node.client().admin().indices().putMapping(putMappingRequest("test").type("person").source(mapping)).actionGet();
node.client().index(indexRequest("test").type("person")
.source(jsonBuilder().startObject().field("file").startObject().field("content", txt).field("_content_length", CONTENT_LENGTH_LIMIT).endObject())).actionGet();
node.client().admin().indices().refresh(refreshRequest()).actionGet();
CountResponse countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "BeforeLimit"))).actionGet();
assertThat(countResponse.count(), equalTo(1l));
countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "AfterLimit"))).actionGet();
assertThat(countResponse.count(), equalTo(0l));
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json");
byte[] txt = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/testContentLength.txt");
final int CONTENT_LENGTH_LIMIT = 18;
node.client().admin().indices().putMapping(putMappingRequest("test").type("person").source(mapping)).actionGet();
node.client().index(indexRequest("test").type("person")
.source(jsonBuilder().startObject().field("file").startObject().field("content", txt).field("_indexed_chars", CONTENT_LENGTH_LIMIT).endObject())).actionGet();
node.client().admin().indices().refresh(refreshRequest()).actionGet();
CountResponse countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "BeforeLimit"))).actionGet();
assertThat(countResponse.count(), equalTo(1l));
countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "AfterLimit"))).actionGet();
assertThat(countResponse.count(), equalTo(0l));
}
@Test
public void testSimpleAttachmentNoContentLengthLimit() throws Exception {
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json");
byte[] txt = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/testContentLength.txt");
final int CONTENT_LENGTH_LIMIT = -1;
node.client().admin().indices().putMapping(putMappingRequest("test").type("person").source(mapping)).actionGet();
node.client().index(indexRequest("test").type("person")
.source(jsonBuilder().startObject().field("file").startObject().field("content", txt).field("_content_length", CONTENT_LENGTH_LIMIT).endObject())).actionGet();
node.client().admin().indices().refresh(refreshRequest()).actionGet();
CountResponse countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "Begin"))).actionGet();
assertThat(countResponse.count(), equalTo(1l));
countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "End"))).actionGet();
assertThat(countResponse.count(), equalTo(1l));
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json");
byte[] txt = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/testContentLength.txt");
final int CONTENT_LENGTH_LIMIT = -1;
node.client().admin().indices().putMapping(putMappingRequest("test").type("person").source(mapping)).actionGet();
node.client().index(indexRequest("test").type("person")
.source(jsonBuilder().startObject().field("file").startObject().field("content", txt).field("_indexed_chars", CONTENT_LENGTH_LIMIT).endObject())).actionGet();
node.client().admin().indices().refresh(refreshRequest()).actionGet();
CountResponse countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "Begin"))).actionGet();
assertThat(countResponse.count(), equalTo(1l));
countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "End"))).actionGet();
assertThat(countResponse.count(), equalTo(1l));
}
}