Change the parameter to _indexed_chars per doc, and add the index.mapping.attachment.indexed_chars setting to change it globally (per index)
This commit is contained in:
parent
59f38ff576
commit
0352c1436e
|
@ -51,10 +51,9 @@ import static org.elasticsearch.plugin.mapper.attachments.tika.TikaInstance.tika
|
|||
* }
|
||||
* }
|
||||
* </pre>
|
||||
*
|
||||
* <p/>
|
||||
* _indexed_chars = Specify the maximum number of characters to extract from the attachment. If not specified, then the default for
|
||||
* tika is 100,000 characters. Caution is required when setting large values as this can cause memory issues.
|
||||
*
|
||||
* tika is 100,000 characters. Caution is required when setting large values as this can cause memory issues.
|
||||
*/
|
||||
public class AttachmentMapper implements Mapper {
|
||||
|
||||
|
@ -68,6 +67,8 @@ public class AttachmentMapper implements Mapper {
|
|||
|
||||
private ContentPath.Type pathType = Defaults.PATH_TYPE;
|
||||
|
||||
private Integer defaultIndexedChars = null;
|
||||
|
||||
private StringFieldMapper.Builder contentBuilder;
|
||||
|
||||
private StringFieldMapper.Builder titleBuilder = stringField("title");
|
||||
|
@ -91,6 +92,11 @@ public class AttachmentMapper implements Mapper {
|
|||
return this;
|
||||
}
|
||||
|
||||
/**
 * Sets the default indexed-character limit held by this builder.
 * <p/>
 * Whatever value the build step ends up with is passed to the AttachmentMapper
 * constructor and becomes the starting value of indexedChars in parse, which a
 * numeric _indexed_chars (or _indexedChars) field in the document overrides.
 * <p/>
 * NOTE(review): the build step shown in this diff overwrites a non-null value with the
 * index.mapping.attachment.indexed_chars setting whenever index settings are present
 * (the check is "defaultIndexedChars != null"), so a value set here may not survive,
 * and the setting is never read when no value was set; the condition looks inverted
 * ("== null" seems intended) -- confirm.
 *
 * @param defaultIndexedChars the limit to store on the builder
 * @return this builder
 */
public Builder defaultIndexedChars(int defaultIndexedChars) {
|
||||
this.defaultIndexedChars = defaultIndexedChars;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder content(StringFieldMapper.Builder content) {
|
||||
this.contentBuilder = content;
|
||||
return this;
|
||||
|
@ -140,7 +146,14 @@ public class AttachmentMapper implements Mapper {
|
|||
|
||||
context.path().pathType(origPathType);
|
||||
|
||||
return new AttachmentMapper(name, pathType, contentMapper, dateMapper, titleMapper, authorMapper, keywordsMapper, contentTypeMapper);
|
||||
if (defaultIndexedChars != null && context.indexSettings() != null) {
|
||||
defaultIndexedChars = context.indexSettings().getAsInt("index.mapping.attachment.indexed_chars", 100000);
|
||||
}
|
||||
if (defaultIndexedChars == null) {
|
||||
defaultIndexedChars = 100000;
|
||||
}
|
||||
|
||||
return new AttachmentMapper(name, pathType, defaultIndexedChars, contentMapper, dateMapper, titleMapper, authorMapper, keywordsMapper, contentTypeMapper);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -159,8 +172,6 @@ public class AttachmentMapper implements Mapper {
|
|||
* }
|
||||
* }
|
||||
* </pre>
|
||||
*
|
||||
*
|
||||
*/
|
||||
public static class TypeParser implements Mapper.TypeParser {
|
||||
|
||||
|
@ -206,6 +217,8 @@ public class AttachmentMapper implements Mapper {
|
|||
|
||||
private final ContentPath.Type pathType;
|
||||
|
||||
private final int defaultIndexedChars;
|
||||
|
||||
private final StringFieldMapper contentMapper;
|
||||
|
||||
private final DateFieldMapper dateMapper;
|
||||
|
@ -218,11 +231,12 @@ public class AttachmentMapper implements Mapper {
|
|||
|
||||
private final StringFieldMapper contentTypeMapper;
|
||||
|
||||
public AttachmentMapper(String name, ContentPath.Type pathType, StringFieldMapper contentMapper,
|
||||
public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, StringFieldMapper contentMapper,
|
||||
DateFieldMapper dateMapper, StringFieldMapper titleMapper, StringFieldMapper authorMapper,
|
||||
StringFieldMapper keywordsMapper, StringFieldMapper contentTypeMapper) {
|
||||
this.name = name;
|
||||
this.pathType = pathType;
|
||||
this.defaultIndexedChars = defaultIndexedChars;
|
||||
this.contentMapper = contentMapper;
|
||||
this.dateMapper = dateMapper;
|
||||
this.titleMapper = titleMapper;
|
||||
|
@ -240,7 +254,7 @@ public class AttachmentMapper implements Mapper {
|
|||
public void parse(ParseContext context) throws IOException {
|
||||
byte[] content = null;
|
||||
String contentType = null;
|
||||
int contentLength = 100000;
|
||||
int indexedChars = defaultIndexedChars;
|
||||
String name = null;
|
||||
|
||||
XContentParser parser = context.parser();
|
||||
|
@ -261,13 +275,13 @@ public class AttachmentMapper implements Mapper {
|
|||
name = parser.text();
|
||||
}
|
||||
} else if (token == XContentParser.Token.VALUE_NUMBER) {
|
||||
if ("_content_length".equals(currentFieldName)) {
|
||||
contentLength = parser.intValue();
|
||||
}
|
||||
if ("_indexed_chars".equals(currentFieldName) || "_indexedChars".equals(currentFieldName)) {
|
||||
indexedChars = parser.intValue();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Metadata metadata = new Metadata();
|
||||
if (contentType != null) {
|
||||
metadata.add(Metadata.CONTENT_TYPE, contentType);
|
||||
|
@ -279,9 +293,9 @@ public class AttachmentMapper implements Mapper {
|
|||
String parsedContent;
|
||||
try {
|
||||
// Set the maximum length of strings returned by the parseToString method, -1 sets no limit
|
||||
parsedContent = tika().parseToString(new FastByteArrayInputStream(content), metadata, contentLength);
|
||||
parsedContent = tika().parseToString(new FastByteArrayInputStream(content), metadata, indexedChars);
|
||||
} catch (TikaException e) {
|
||||
throw new MapperParsingException("Failed to extract [" + contentLength + "] characters of text for [" + name + "]", e);
|
||||
throw new MapperParsingException("Failed to extract [" + indexedChars + "] characters of text for [" + name + "]", e);
|
||||
}
|
||||
|
||||
context.externalValue(parsedContent);
|
||||
|
|
|
@ -19,9 +19,6 @@
|
|||
|
||||
package org.elasticsearch.plugin.mapper.attachments.tika;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.tika.Tika;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
|
@ -31,31 +28,32 @@ import org.apache.tika.sax.BodyContentHandler;
|
|||
import org.apache.tika.sax.WriteOutContentHandler;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
|
||||
/**
|
||||
* Extends the Tika class to provide a way of setting the maximum string length on a per-document basis at parse time.
|
||||
*/
|
||||
// NOTE(review): this listing shows two parseToString variants with identical parameter
// types -- presumably the removed and the added side of the diff; only one of them can
// exist in the compiled class.
public class TikaExtended extends Tika {
|
||||
|
||||
/**
 * Parses the given stream and returns the text gathered by a WriteOutContentHandler
 * that is constructed with the supplied limit.
 * <p/>
 * A SAXException for which the handler reports that its write limit was reached is
 * swallowed; any other SAXException is rethrown wrapped in a TikaException. The stream
 * is closed in all cases.
 *
 * @param stream                   the document to parse; closed before this method returns
 * @param metadata                 metadata passed through to the parser
 * @param maxExtractedStringLength limit given to the WriteOutContentHandler (a comment at the
 *                                 call site in AttachmentMapper states that -1 sets no limit)
 * @return the string value of the write handler after parsing
 * @throws IOException   declared by this method; may come from parsing or from closing the stream
 * @throws TikaException if parsing fails, including a SAX failure that is not the write limit
 */
public String parseToString(InputStream stream, Metadata metadata, int maxExtractedStringLength) throws IOException, TikaException {
|
||||
|
||||
// setup
|
||||
WriteOutContentHandler writeHandler = new WriteOutContentHandler(maxExtractedStringLength);
|
||||
BodyContentHandler contentHandler = new BodyContentHandler(writeHandler);
|
||||
Parser parser = getParser();
|
||||
ParseContext context = new ParseContext();
|
||||
// register the parser in the context under Parser.class
|
||||
context.set(Parser.class, parser);
|
||||
|
||||
try {
|
||||
parser.parse(stream, contentHandler, metadata, context);
|
||||
} catch (SAXException e) {
|
||||
// reaching the write limit is the expected way to stop early; anything else is an error
|
||||
if (!writeHandler.isWriteLimitReached(e)) {
|
||||
throw new TikaException("Unexpected SAX processing failure", e);
|
||||
}
|
||||
} finally {
|
||||
stream.close();
|
||||
}
|
||||
|
||||
return writeHandler.toString();
|
||||
}
|
||||
/**
 * Parses the given stream and returns the text gathered by a WriteOutContentHandler
 * that is constructed with the supplied limit.
 * <p/>
 * A SAXException for which the handler reports that its write limit was reached is
 * swallowed; any other SAXException is rethrown wrapped in a TikaException. The stream
 * is closed in all cases.
 *
 * @param stream          the document to parse; closed before this method returns
 * @param metadata        metadata passed through to the parser
 * @param maxStringLength limit given to the WriteOutContentHandler
 * @return the string value of the handler after parsing
 * @throws IOException   declared by this method; may come from parsing or from closing the stream
 * @throws TikaException if parsing fails, including a SAX failure that is not the write limit
 */
public String parseToString(InputStream stream, Metadata metadata, int maxStringLength)
|
||||
throws IOException, TikaException {
|
||||
WriteOutContentHandler handler =
|
||||
new WriteOutContentHandler(maxStringLength);
|
||||
try {
|
||||
ParseContext context = new ParseContext();
|
||||
// register the parser in the context under Parser.class
|
||||
context.set(Parser.class, getParser());
|
||||
getParser().parse(
|
||||
stream, new BodyContentHandler(handler), metadata, context);
|
||||
} catch (SAXException e) {
|
||||
// reaching the write limit is the expected way to stop early; anything else is an error
|
||||
if (!handler.isWriteLimitReached(e)) {
|
||||
// This should never happen with BodyContentHandler...
|
||||
throw new TikaException("Unexpected SAX processing failure", e);
|
||||
}
|
||||
} finally {
|
||||
stream.close();
|
||||
}
|
||||
return handler.toString();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -95,42 +95,42 @@ public class SimpleAttachmentIntegrationTests {
|
|||
countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "tests the ability"))).actionGet();
|
||||
assertThat(countResponse.count(), equalTo(1l));
|
||||
}
|
||||
|
||||
|
||||
/**
 * Checks that a per-document character limit truncates the indexed attachment text.
 * <p/>
 * Puts the test mapping on index "test" / type "person", indexes a document whose "file"
 * object carries the bytes of testContentLength.txt together with a limit of 18,
 * refreshes, and then expects one hit for "BeforeLimit" and no hit for "AfterLimit".
 * <p/>
 * NOTE(review): the body appears twice in this listing -- once sending the limit as
 * "_content_length" and once as "_indexed_chars" -- presumably the removed and the added
 * side of the diff; only one copy can exist in the compiled method.
 */
@Test
|
||||
public void testSimpleAttachmentContentLengthLimit() throws Exception {
|
||||
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json");
|
||||
byte[] txt = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/testContentLength.txt");
|
||||
final int CONTENT_LENGTH_LIMIT = 18;
|
||||
|
||||
node.client().admin().indices().putMapping(putMappingRequest("test").type("person").source(mapping)).actionGet();
|
||||
|
||||
node.client().index(indexRequest("test").type("person")
|
||||
.source(jsonBuilder().startObject().field("file").startObject().field("content", txt).field("_content_length", CONTENT_LENGTH_LIMIT).endObject())).actionGet();
|
||||
node.client().admin().indices().refresh(refreshRequest()).actionGet();
|
||||
|
||||
CountResponse countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "BeforeLimit"))).actionGet();
|
||||
assertThat(countResponse.count(), equalTo(1l));
|
||||
|
||||
countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "AfterLimit"))).actionGet();
|
||||
assertThat(countResponse.count(), equalTo(0l));
|
||||
// second copy of the body starts here; it differs only in the "_indexed_chars" field name
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json");
|
||||
byte[] txt = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/testContentLength.txt");
|
||||
final int CONTENT_LENGTH_LIMIT = 18;
|
||||
|
||||
node.client().admin().indices().putMapping(putMappingRequest("test").type("person").source(mapping)).actionGet();
|
||||
|
||||
node.client().index(indexRequest("test").type("person")
|
||||
.source(jsonBuilder().startObject().field("file").startObject().field("content", txt).field("_indexed_chars", CONTENT_LENGTH_LIMIT).endObject())).actionGet();
|
||||
node.client().admin().indices().refresh(refreshRequest()).actionGet();
|
||||
|
||||
CountResponse countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "BeforeLimit"))).actionGet();
|
||||
assertThat(countResponse.count(), equalTo(1l));
|
||||
|
||||
countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "AfterLimit"))).actionGet();
|
||||
assertThat(countResponse.count(), equalTo(0l));
|
||||
}
|
||||
|
||||
|
||||
/**
 * Checks that a per-document limit of -1 leaves the indexed attachment text untruncated.
 * <p/>
 * Puts the test mapping on index "test" / type "person", indexes a document whose "file"
 * object carries the bytes of testContentLength.txt together with a limit of -1,
 * refreshes, and then expects one hit for "Begin" and one hit for "End".
 * <p/>
 * NOTE(review): the body appears twice in this listing -- once sending the limit as
 * "_content_length" and once as "_indexed_chars" -- presumably the removed and the added
 * side of the diff; only one copy can exist in the compiled method.
 */
@Test
|
||||
public void testSimpleAttachmentNoContentLengthLimit() throws Exception {
|
||||
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json");
|
||||
byte[] txt = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/testContentLength.txt");
|
||||
final int CONTENT_LENGTH_LIMIT = -1;
|
||||
|
||||
node.client().admin().indices().putMapping(putMappingRequest("test").type("person").source(mapping)).actionGet();
|
||||
|
||||
node.client().index(indexRequest("test").type("person")
|
||||
.source(jsonBuilder().startObject().field("file").startObject().field("content", txt).field("_content_length", CONTENT_LENGTH_LIMIT).endObject())).actionGet();
|
||||
node.client().admin().indices().refresh(refreshRequest()).actionGet();
|
||||
|
||||
CountResponse countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "Begin"))).actionGet();
|
||||
assertThat(countResponse.count(), equalTo(1l));
|
||||
|
||||
countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "End"))).actionGet();
|
||||
assertThat(countResponse.count(), equalTo(1l));
|
||||
// second copy of the body starts here; it differs only in the "_indexed_chars" field name
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json");
|
||||
byte[] txt = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/testContentLength.txt");
|
||||
final int CONTENT_LENGTH_LIMIT = -1;
|
||||
|
||||
node.client().admin().indices().putMapping(putMappingRequest("test").type("person").source(mapping)).actionGet();
|
||||
|
||||
node.client().index(indexRequest("test").type("person")
|
||||
.source(jsonBuilder().startObject().field("file").startObject().field("content", txt).field("_indexed_chars", CONTENT_LENGTH_LIMIT).endObject())).actionGet();
|
||||
node.client().admin().indices().refresh(refreshRequest()).actionGet();
|
||||
|
||||
CountResponse countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "Begin"))).actionGet();
|
||||
assertThat(countResponse.count(), equalTo(1l));
|
||||
|
||||
countResponse = node.client().count(countRequest("test").query(fieldQuery("file", "End"))).actionGet();
|
||||
assertThat(countResponse.count(), equalTo(1l));
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue