Merge pull request #16574 from javanna/enhancement/attachment_improvements

Minor attachment processor improvements

commit 65391e8a83
@@ -32,6 +32,7 @@ import org.elasticsearch.ingest.core.IngestDocument;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.EnumSet;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
@@ -66,66 +67,69 @@ public final class AttachmentProcessor extends AbstractProcessor {
     @Override
     public void execute(IngestDocument ingestDocument) {
         String base64Input = ingestDocument.getFieldValue(sourceField, String.class);
+        Map<String, Object> additionalFields = new HashMap<>();
 
-        Metadata metadata = new Metadata();
         try {
             byte[] decodedContent = Base64.decode(base64Input.getBytes(UTF_8));
+            Metadata metadata = new Metadata();
             String parsedContent = TikaImpl.parse(decodedContent, metadata, indexedChars);
 
             if (fields.contains(Field.CONTENT) && Strings.hasLength(parsedContent)) {
                 // somehow tika seems to append a newline at the end automatically, lets remove that again
-                ingestDocument.setFieldValue(targetField + "." + Field.CONTENT.toLowerCase(), parsedContent.trim());
+                additionalFields.put(Field.CONTENT.toLowerCase(), parsedContent.trim());
             }
 
             if (fields.contains(Field.LANGUAGE) && Strings.hasLength(parsedContent)) {
                 LanguageIdentifier identifier = new LanguageIdentifier(parsedContent);
                 String language = identifier.getLanguage();
-                ingestDocument.setFieldValue(targetField + "." + Field.LANGUAGE.toLowerCase(), language);
+                additionalFields.put(Field.LANGUAGE.toLowerCase(), language);
             }
 
             if (fields.contains(Field.DATE)) {
                 String createdDate = metadata.get(TikaCoreProperties.CREATED);
                 if (createdDate != null) {
-                    ingestDocument.setFieldValue(targetField + "." + Field.DATE.toLowerCase(), createdDate);
+                    additionalFields.put(Field.DATE.toLowerCase(), createdDate);
                 }
             }
 
             if (fields.contains(Field.TITLE)) {
                 String title = metadata.get(TikaCoreProperties.TITLE);
                 if (Strings.hasLength(title)) {
-                    ingestDocument.setFieldValue(targetField + "." + Field.TITLE.toLowerCase(), title);
+                    additionalFields.put(Field.TITLE.toLowerCase(), title);
                 }
             }
 
             if (fields.contains(Field.AUTHOR)) {
                 String author = metadata.get("Author");
                 if (Strings.hasLength(author)) {
-                    ingestDocument.setFieldValue(targetField + "." + Field.AUTHOR.toLowerCase(), author);
+                    additionalFields.put(Field.AUTHOR.toLowerCase(), author);
                 }
             }
 
             if (fields.contains(Field.KEYWORDS)) {
                 String keywords = metadata.get("Keywords");
                 if (Strings.hasLength(keywords)) {
-                    ingestDocument.setFieldValue(targetField + "." + Field.KEYWORDS.toLowerCase(), keywords);
+                    additionalFields.put(Field.KEYWORDS.toLowerCase(), keywords);
                 }
             }
 
             if (fields.contains(Field.CONTENT_TYPE)) {
                 String contentType = metadata.get(Metadata.CONTENT_TYPE);
                 if (Strings.hasLength(contentType)) {
-                    ingestDocument.setFieldValue(targetField + "." + Field.CONTENT_TYPE.toLowerCase(), contentType);
+                    additionalFields.put(Field.CONTENT_TYPE.toLowerCase(), contentType);
                 }
             }
 
             if (fields.contains(Field.CONTENT_LENGTH)) {
                 String contentLength = metadata.get(Metadata.CONTENT_LENGTH);
                 String length = Strings.hasLength(contentLength) ? contentLength : String.valueOf(parsedContent.length());
-                ingestDocument.setFieldValue(targetField + "." + Field.CONTENT_LENGTH.toLowerCase(), length);
+                additionalFields.put(Field.CONTENT_LENGTH.toLowerCase(), length);
             }
         } catch (Throwable e) {
             throw new ElasticsearchParseException("Error parsing document in field [{}]", e, sourceField);
         }
+
+        ingestDocument.setFieldValue(targetField, additionalFields);
     }
 
     @Override
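The core of the hunk above: instead of writing each extracted value straight onto the document under a dotted path such as attachment.content, execute() now collects everything into a local map and attaches it to the target field with a single call. A minimal standalone sketch of that collect-then-set pattern, using a plain HashMap in place of IngestDocument (class and field names here are illustrative only):

import java.util.HashMap;
import java.util.Map;

public class CollectThenSetSketch {
    public static void main(String[] args) {
        // stand-in for the ingest document; the real processor works on IngestDocument
        Map<String, Object> document = new HashMap<>();

        // collect every extracted value locally first...
        Map<String, Object> additionalFields = new HashMap<>();
        additionalFields.put("content", "Hello world");
        additionalFields.put("content_type", "text/plain");
        additionalFields.put("content_length", String.valueOf("Hello world".length()));

        // ...then attach them to the target field in one write, rather than one
        // dotted-path write per field ("attachment.content", "attachment.content_type", ...)
        document.put("attachment", additionalFields);

        System.out.println(document); // e.g. {attachment={content=Hello world, content_type=text/plain, content_length=11}}
    }
}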
@@ -183,7 +187,6 @@ public final class AttachmentProcessor extends AbstractProcessor {
 
         CONTENT,
         TITLE,
-        NAME,
         AUTHOR,
         KEYWORDS,
         DATE,
@@ -27,18 +27,15 @@ import org.elasticsearch.ingest.core.IngestDocument;
 import org.elasticsearch.test.ESTestCase;
 import org.junit.Before;
 
-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.EnumSet;
 import java.util.HashMap;
-import java.util.List;
-import java.util.Locale;
+import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
 
-import static org.hamcrest.Matchers.contains;
 import static org.hamcrest.Matchers.containsInAnyOrder;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.hasSize;
@@ -68,22 +65,31 @@ public class AttachmentProcessorTests extends ESTestCase {
         assertThat(attachmentData.get("content_length"), is(notNullValue()));
     }
 
-    public void testEnglishTextDocumentWithRandomFields() throws Exception {
-        Set<AttachmentProcessor.Field> fields = EnumSet.noneOf(AttachmentProcessor.Field.class);
-        List<String> fieldNames = new ArrayList<>();
-        int numFields = scaledRandomIntBetween(1, AttachmentProcessor.Field.values().length);
+    public void testHtmlDocumentWithRandomFields() throws Exception {
+        //date is not present in the html doc
+        ArrayList<AttachmentProcessor.Field> fieldsList = new ArrayList<>(EnumSet.complementOf(EnumSet.of
+            (AttachmentProcessor.Field.DATE)));
+        Set<AttachmentProcessor.Field> selectedFields = new HashSet<>();
+
+        int numFields = randomIntBetween(1, fieldsList.size());
+        String[] selectedFieldNames = new String[numFields];
         for (int i = 0; i < numFields; i++) {
-            AttachmentProcessor.Field field = AttachmentProcessor.Field.values()[i];
-            fields.add(field);
-            fieldNames.add(field.name().toLowerCase(Locale.ROOT));
+            AttachmentProcessor.Field field;
+            do {
+                field = randomFrom(fieldsList);
+            } while (selectedFields.add(field) == false);
+            selectedFieldNames[i] = field.toLowerCase();
+        }
+        if (randomBoolean()) {
+            selectedFields.add(AttachmentProcessor.Field.DATE);
         }
 
         processor = new AttachmentProcessor(randomAsciiOfLength(10), "source_field",
-            "target_field", EnumSet.copyOf(fields), 10000);
+            "target_field", selectedFields, 10000);
 
-        Map<String, Object> attachmentData = parseDocument("text-in-english.txt", processor);
-        assertThat(attachmentData.keySet(), hasSize(1));
-        assertThat(attachmentData.keySet(), contains("content"));
+        Map<String, Object> attachmentData = parseDocument("htmlWithEmptyDateMeta.html", processor);
+        assertThat(attachmentData.keySet(), hasSize(selectedFieldNames.length));
+        assertThat(attachmentData.keySet(), containsInAnyOrder(selectedFieldNames));
     }
 
     public void testFrenchTextDocument() throws Exception {
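The rewritten test picks a random subset of fields without replacement by drawing candidates until Set.add reports one it has not seen before. A minimal standalone sketch of that idiom, using plain strings and java.util.Random in place of AttachmentProcessor.Field and the ESTestCase randomFrom/randomIntBetween helpers (all names here are illustrative):

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;

public class RandomSubsetSketch {
    public static void main(String[] args) {
        List<String> candidates = Arrays.asList("content", "title", "author", "keywords", "content_type");
        Random random = new Random();

        int numFields = 1 + random.nextInt(candidates.size());
        Set<String> selected = new HashSet<>();
        String[] selectedNames = new String[numFields];

        for (int i = 0; i < numFields; i++) {
            String field;
            do {
                // keep drawing until Set.add returns true, i.e. this field was not picked before
                field = candidates.get(random.nextInt(candidates.size()));
            } while (selected.add(field) == false);
            selectedNames[i] = field;
        }

        System.out.println(Arrays.toString(selectedNames));
    }
}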