Merge pull request #16574 from javanna/enhancement/attachment_improvements

Minor attachment processor improvements
Luca Cavanna 2016-02-10 13:20:00 +01:00
commit 65391e8a83
2 changed files with 35 additions and 26 deletions
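In brief: AttachmentProcessor#execute now buffers every extracted value in a single map and sets the target field once, instead of issuing one setFieldValue call per dotted sub-field; the never-populated NAME constant is removed from the Field enum; and the random-fields test is rewritten to run against an HTML document whose date metadata is empty.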

AttachmentProcessor.java

@@ -32,6 +32,7 @@ import org.elasticsearch.ingest.core.IngestDocument;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.EnumSet;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
@@ -66,66 +67,69 @@ public final class AttachmentProcessor extends AbstractProcessor {
     @Override
     public void execute(IngestDocument ingestDocument) {
         String base64Input = ingestDocument.getFieldValue(sourceField, String.class);
+        Map<String, Object> additionalFields = new HashMap<>();
+        Metadata metadata = new Metadata();
         try {
             byte[] decodedContent = Base64.decode(base64Input.getBytes(UTF_8));
-            Metadata metadata = new Metadata();
             String parsedContent = TikaImpl.parse(decodedContent, metadata, indexedChars);
             if (fields.contains(Field.CONTENT) && Strings.hasLength(parsedContent)) {
                 // somehow tika seems to append a newline at the end automatically, lets remove that again
-                ingestDocument.setFieldValue(targetField + "." + Field.CONTENT.toLowerCase(), parsedContent.trim());
+                additionalFields.put(Field.CONTENT.toLowerCase(), parsedContent.trim());
             }
             if (fields.contains(Field.LANGUAGE) && Strings.hasLength(parsedContent)) {
                 LanguageIdentifier identifier = new LanguageIdentifier(parsedContent);
                 String language = identifier.getLanguage();
-                ingestDocument.setFieldValue(targetField + "." + Field.LANGUAGE.toLowerCase(), language);
+                additionalFields.put(Field.LANGUAGE.toLowerCase(), language);
             }
             if (fields.contains(Field.DATE)) {
                 String createdDate = metadata.get(TikaCoreProperties.CREATED);
                 if (createdDate != null) {
-                    ingestDocument.setFieldValue(targetField + "." + Field.DATE.toLowerCase(), createdDate);
+                    additionalFields.put(Field.DATE.toLowerCase(), createdDate);
                 }
             }
             if (fields.contains(Field.TITLE)) {
                 String title = metadata.get(TikaCoreProperties.TITLE);
                 if (Strings.hasLength(title)) {
-                    ingestDocument.setFieldValue(targetField + "." + Field.TITLE.toLowerCase(), title);
+                    additionalFields.put(Field.TITLE.toLowerCase(), title);
                 }
             }
             if (fields.contains(Field.AUTHOR)) {
                 String author = metadata.get("Author");
                 if (Strings.hasLength(author)) {
-                    ingestDocument.setFieldValue(targetField + "." + Field.AUTHOR.toLowerCase(), author);
+                    additionalFields.put(Field.AUTHOR.toLowerCase(), author);
                 }
             }
             if (fields.contains(Field.KEYWORDS)) {
                 String keywords = metadata.get("Keywords");
                 if (Strings.hasLength(keywords)) {
-                    ingestDocument.setFieldValue(targetField + "." + Field.KEYWORDS.toLowerCase(), keywords);
+                    additionalFields.put(Field.KEYWORDS.toLowerCase(), keywords);
                 }
             }
             if (fields.contains(Field.CONTENT_TYPE)) {
                 String contentType = metadata.get(Metadata.CONTENT_TYPE);
                 if (Strings.hasLength(contentType)) {
-                    ingestDocument.setFieldValue(targetField + "." + Field.CONTENT_TYPE.toLowerCase(), contentType);
+                    additionalFields.put(Field.CONTENT_TYPE.toLowerCase(), contentType);
                 }
             }
             if (fields.contains(Field.CONTENT_LENGTH)) {
                 String contentLength = metadata.get(Metadata.CONTENT_LENGTH);
                 String length = Strings.hasLength(contentLength) ? contentLength : String.valueOf(parsedContent.length());
-                ingestDocument.setFieldValue(targetField + "." + Field.CONTENT_LENGTH.toLowerCase(), length);
+                additionalFields.put(Field.CONTENT_LENGTH.toLowerCase(), length);
             }
         } catch (Throwable e) {
             throw new ElasticsearchParseException("Error parsing document in field [{}]", e, sourceField);
         }
+        ingestDocument.setFieldValue(targetField, additionalFields);
     }

     @Override
@@ -183,7 +187,6 @@ public final class AttachmentProcessor extends AbstractProcessor {
         CONTENT,
         TITLE,
-        NAME,
         AUTHOR,
         KEYWORDS,
         DATE,
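The substantive change above: execute() no longer writes each extracted value straight into the document under a dotted path (targetField + "." + field name) but collects everything in the additionalFields map and attaches the whole map with one setFieldValue call after the try/catch. Below is a minimal standalone sketch of that collect-then-set pattern; the setFieldValue helper is a hypothetical stand-in for the real IngestDocument API, and the field values are made up for illustration.

import java.util.HashMap;
import java.util.Map;

public class CollectThenSetSketch {

    // Hypothetical stand-in for IngestDocument#setFieldValue, illustration only.
    static void setFieldValue(Map<String, Object> document, String field, Object value) {
        document.put(field, value);
    }

    public static void main(String[] args) {
        Map<String, Object> document = new HashMap<>();
        Map<String, Object> additionalFields = new HashMap<>();

        // Before the change, each value went straight to "attachment.content",
        // "attachment.language", and so on. Now every value is buffered first...
        additionalFields.put("content", "Hello world");
        additionalFields.put("language", "en");

        // ...and the target field is written exactly once, as a single map value.
        setFieldValue(document, "attachment", additionalFields);

        System.out.println(document); // the document now holds one nested map
    }
}

One observable consequence: the target field is now set unconditionally once parsing succeeds, so a document with no extractable metadata carries an empty map where previously the field was simply absent.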

AttachmentProcessorTests.java

@@ -27,18 +27,15 @@ import org.elasticsearch.ingest.core.IngestDocument;
 import org.elasticsearch.test.ESTestCase;
 import org.junit.Before;

-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.EnumSet;
 import java.util.HashMap;
-import java.util.List;
-import java.util.Locale;
+import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;

-import static org.hamcrest.Matchers.contains;
 import static org.hamcrest.Matchers.containsInAnyOrder;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.hasSize;
@@ -68,22 +65,31 @@ public class AttachmentProcessorTests extends ESTestCase {
         assertThat(attachmentData.get("content_length"), is(notNullValue()));
     }

-    public void testEnglishTextDocumentWithRandomFields() throws Exception {
-        Set<AttachmentProcessor.Field> fields = EnumSet.noneOf(AttachmentProcessor.Field.class);
-        List<String> fieldNames = new ArrayList<>();
-        int numFields = scaledRandomIntBetween(1, AttachmentProcessor.Field.values().length);
+    public void testHtmlDocumentWithRandomFields() throws Exception {
+        //date is not present in the html doc
+        ArrayList<AttachmentProcessor.Field> fieldsList = new ArrayList<>(EnumSet.complementOf(EnumSet.of
+            (AttachmentProcessor.Field.DATE)));
+        Set<AttachmentProcessor.Field> selectedFields = new HashSet<>();
+        int numFields = randomIntBetween(1, fieldsList.size());
+        String[] selectedFieldNames = new String[numFields];
         for (int i = 0; i < numFields; i++) {
-            AttachmentProcessor.Field field = AttachmentProcessor.Field.values()[i];
-            fields.add(field);
-            fieldNames.add(field.name().toLowerCase(Locale.ROOT));
+            AttachmentProcessor.Field field;
+            do {
+                field = randomFrom(fieldsList);
+            } while (selectedFields.add(field) == false);
+            selectedFieldNames[i] = field.toLowerCase();
+        }
+        if (randomBoolean()) {
+            selectedFields.add(AttachmentProcessor.Field.DATE);
         }
         processor = new AttachmentProcessor(randomAsciiOfLength(10), "source_field",
-                "target_field", EnumSet.copyOf(fields), 10000);
-        Map<String, Object> attachmentData = parseDocument("text-in-english.txt", processor);
-        assertThat(attachmentData.keySet(), hasSize(1));
-        assertThat(attachmentData.keySet(), contains("content"));
+            "target_field", selectedFields, 10000);
+        Map<String, Object> attachmentData = parseDocument("htmlWithEmptyDateMeta.html", processor);
+        assertThat(attachmentData.keySet(), hasSize(selectedFieldNames.length));
+        assertThat(attachmentData.keySet(), containsInAnyOrder(selectedFieldNames));
     }

     public void testFrenchTextDocument() throws Exception {
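The rewritten test selects a random subset of distinct fields by rejection sampling: it keeps drawing from the candidate list until Set#add returns true, i.e. the draw had not been picked before, and DATE is only ever added on top since the HTML fixture has no date to extract. A self-contained sketch of that selection idiom, using java.util.Random in place of the ESTestCase helpers randomIntBetween and randomFrom:

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;

public class DistinctRandomPicksSketch {
    public static void main(String[] args) {
        // Candidate names mirroring AttachmentProcessor.Field minus DATE.
        List<String> candidates = Arrays.asList("content", "title", "author",
            "keywords", "content_type", "content_length", "language");
        Random random = new Random();
        int numFields = 1 + random.nextInt(candidates.size()); // like randomIntBetween(1, size)

        Set<String> selected = new HashSet<>();
        String[] selectedNames = new String[numFields];
        for (int i = 0; i < numFields; i++) {
            String pick;
            do {
                pick = candidates.get(random.nextInt(candidates.size())); // like randomFrom(fieldsList)
            } while (selected.add(pick) == false); // false means duplicate: draw again
            selectedNames[i] = pick;
        }
        System.out.println(Arrays.toString(selectedNames));
    }
}

The do/while always terminates because numFields never exceeds the number of candidates, so a fresh pick is always available.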