Add language detection option
Based on PR #45, we add a new language detection option using the language detection feature available in Tika: https://tika.apache.org/1.4/detection.html#Language_Detection By default, language detection is disabled (`false`) as it could come with a cost. This default value can be changed by setting the `index.mapping.attachment.detect_language` setting. It can also be set per indexed document using the `_detect_language` parameter. Closes #45. Closes #44.
This commit is contained in:
parent
621995d0b4
commit
3d15cb0484
16
README.md
16
README.md
|
@ -63,6 +63,7 @@ The metadata supported are:
|
|||
* `keywords`
|
||||
* `content_type`
|
||||
* `content_length` is the original content_length before text extraction (aka file size)
|
||||
* `language`
|
||||
|
||||
They can be queried using the "dot notation", for example: `my_attachment.author`.
|
||||
|
||||
|
@ -81,7 +82,8 @@ Both the meta data and the actual content are simple core type mappers (string,
|
|||
"author" : {"analyzer" : "myAnalyzer"},
|
||||
"keywords" : {store : "yes"},
|
||||
"content_type" : {store : "yes"},
|
||||
"content_length" : {store : "yes"}
|
||||
"content_length" : {store : "yes"},
|
||||
"language" : {store : "yes"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -96,7 +98,7 @@ Indexed Characters
|
|||
|
||||
By default, `100000` characters are extracted when indexing the content. This default value can be changed by setting the `index.mapping.attachment.indexed_chars` setting. It can also be set per indexed document using the `_indexed_chars` parameter. `-1` can be set to extract all text, but note that all of the extracted text must then fit in memory.
|
||||
|
||||
Note, this feature is support since `1.3.0` version.
|
||||
Note, this feature is supported since `1.3.0` version.
|
||||
|
||||
Metadata parsing error handling
|
||||
-------------------------------
|
||||
|
@ -106,6 +108,16 @@ Since version `1.9.0`, parsing errors are ignored so your document is indexed.
|
|||
|
||||
You can disable this feature by setting the `index.mapping.attachment.ignore_errors` setting to `false`.
|
||||
|
||||
Language Detection
|
||||
------------------
|
||||
|
||||
By default, language detection is disabled (`false`) as it could come with a cost.
|
||||
This default value can be changed by setting the `index.mapping.attachment.detect_language` setting.
|
||||
It can also be set per indexed document using the `_detect_language` parameter.
|
||||
|
||||
Note, this feature is supported since `2.0.0` version.
|
||||
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
|
||||
package org.elasticsearch.index.mapper.attachment;
|
||||
|
||||
import org.apache.tika.language.LanguageIdentifier;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.elasticsearch.common.io.stream.BytesStreamInput;
|
||||
import org.elasticsearch.common.logging.ESLogger;
|
||||
|
@ -70,9 +71,11 @@ public class AttachmentMapper implements Mapper {
|
|||
|
||||
private ContentPath.Type pathType = Defaults.PATH_TYPE;
|
||||
|
||||
private Boolean ignoreErrors = null;
|
||||
|
||||
private Integer defaultIndexedChars = null;
|
||||
|
||||
private Boolean ignoreErrors = null;
|
||||
private Boolean langDetect = null;
|
||||
|
||||
private Mapper.Builder contentBuilder;
|
||||
|
||||
|
@ -90,6 +93,8 @@ public class AttachmentMapper implements Mapper {
|
|||
|
||||
private Mapper.Builder contentLengthBuilder = integerField("content_length");
|
||||
|
||||
private Mapper.Builder languageBuilder = stringField("language");
|
||||
|
||||
public Builder(String name) {
|
||||
super(name);
|
||||
this.builder = this;
|
||||
|
@ -141,6 +146,11 @@ public class AttachmentMapper implements Mapper {
|
|||
return this;
|
||||
}
|
||||
|
||||
public Builder language(Mapper.Builder language) {
|
||||
this.languageBuilder = language;
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public AttachmentMapper build(BuilderContext context) {
|
||||
ContentPath.Type origPathType = context.path().pathType();
|
||||
|
@ -158,6 +168,7 @@ public class AttachmentMapper implements Mapper {
|
|||
Mapper keywordsMapper = keywordsBuilder.build(context);
|
||||
Mapper contentTypeMapper = contentTypeBuilder.build(context);
|
||||
Mapper contentLength = contentLengthBuilder.build(context);
|
||||
Mapper language = languageBuilder.build(context);
|
||||
context.path().remove();
|
||||
|
||||
context.path().pathType(origPathType);
|
||||
|
@ -176,7 +187,14 @@ public class AttachmentMapper implements Mapper {
|
|||
ignoreErrors = Boolean.TRUE;
|
||||
}
|
||||
|
||||
return new AttachmentMapper(name, pathType, defaultIndexedChars, ignoreErrors, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper, contentLength);
|
||||
if (langDetect == null && context.indexSettings() != null) {
|
||||
langDetect = context.indexSettings().getAsBoolean("index.mapping.attachment.detect_language", Boolean.FALSE);
|
||||
}
|
||||
if (langDetect == null) {
|
||||
langDetect = Boolean.FALSE;
|
||||
}
|
||||
|
||||
return new AttachmentMapper(name, pathType, defaultIndexedChars, ignoreErrors, langDetect, contentMapper, dateMapper, titleMapper, nameMapper, authorMapper, keywordsMapper, contentTypeMapper, contentLength, language);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -245,6 +263,8 @@ public class AttachmentMapper implements Mapper {
|
|||
builder.contentType(parserContext.typeParser(StringFieldMapper.CONTENT_TYPE).parse("content_type", (Map<String, Object>) propNode, parserContext));
|
||||
} else if ("content_length".equals(propName)) {
|
||||
builder.contentLength(parserContext.typeParser(IntegerFieldMapper.CONTENT_TYPE).parse("content_length", (Map<String, Object>) propNode, parserContext));
|
||||
} else if ("language".equals(propName)) {
|
||||
builder.language(parserContext.typeParser(StringFieldMapper.CONTENT_TYPE).parse("language", (Map<String, Object>) propNode, parserContext));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -262,6 +282,8 @@ public class AttachmentMapper implements Mapper {
|
|||
|
||||
private final boolean ignoreErrors;
|
||||
|
||||
private final boolean defaultLangDetect;
|
||||
|
||||
private final Mapper contentMapper;
|
||||
|
||||
private final Mapper dateMapper;
|
||||
|
@ -278,13 +300,16 @@ public class AttachmentMapper implements Mapper {
|
|||
|
||||
private final Mapper contentLengthMapper;
|
||||
|
||||
public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Boolean ignoreErrors, Mapper contentMapper,
|
||||
private final Mapper languageMapper;
|
||||
|
||||
public AttachmentMapper(String name, ContentPath.Type pathType, int defaultIndexedChars, Boolean ignoreErrors, Boolean defaultLangDetect, Mapper contentMapper,
|
||||
Mapper dateMapper, Mapper titleMapper, Mapper nameMapper, Mapper authorMapper,
|
||||
Mapper keywordsMapper, Mapper contentTypeMapper, Mapper contentLengthMapper) {
|
||||
Mapper keywordsMapper, Mapper contentTypeMapper, Mapper contentLengthMapper, Mapper languageMapper) {
|
||||
this.name = name;
|
||||
this.pathType = pathType;
|
||||
this.defaultIndexedChars = defaultIndexedChars;
|
||||
this.ignoreErrors = ignoreErrors;
|
||||
this.defaultLangDetect = defaultLangDetect;
|
||||
this.contentMapper = contentMapper;
|
||||
this.dateMapper = dateMapper;
|
||||
this.titleMapper = titleMapper;
|
||||
|
@ -293,6 +318,7 @@ public class AttachmentMapper implements Mapper {
|
|||
this.keywordsMapper = keywordsMapper;
|
||||
this.contentTypeMapper = contentTypeMapper;
|
||||
this.contentLengthMapper = contentLengthMapper;
|
||||
this.languageMapper = languageMapper;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -305,7 +331,9 @@ public class AttachmentMapper implements Mapper {
|
|||
byte[] content = null;
|
||||
String contentType = null;
|
||||
int indexedChars = defaultIndexedChars;
|
||||
boolean langDetect = defaultLangDetect;
|
||||
String name = null;
|
||||
String language = null;
|
||||
|
||||
XContentParser parser = context.parser();
|
||||
XContentParser.Token token = parser.currentToken();
|
||||
|
@ -323,11 +351,17 @@ public class AttachmentMapper implements Mapper {
|
|||
contentType = parser.text();
|
||||
} else if ("_name".equals(currentFieldName)) {
|
||||
name = parser.text();
|
||||
} else if ("language".equals(currentFieldName)) {
|
||||
language = parser.text();
|
||||
}
|
||||
} else if (token == XContentParser.Token.VALUE_NUMBER) {
|
||||
if ("_indexed_chars".equals(currentFieldName) || "_indexedChars".equals(currentFieldName)) {
|
||||
indexedChars = parser.intValue();
|
||||
}
|
||||
} else if (token == XContentParser.Token.VALUE_BOOLEAN) {
|
||||
if ("_detect_language".equals(currentFieldName) || "_detectLanguage".equals(currentFieldName)) {
|
||||
langDetect = parser.booleanValue();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -347,7 +381,7 @@ public class AttachmentMapper implements Mapper {
|
|||
|
||||
String parsedContent;
|
||||
try {
|
||||
// Set the maximum length of strings returned by the parseToString method, -1 sets no limit
|
||||
// Set the maximum length of strings returned by the parseToString method, -1 sets no limit
|
||||
parsedContent = tika().parseToString(new BytesStreamInput(content, false), metadata, indexedChars);
|
||||
} catch (Throwable e) {
|
||||
// #18: we could ignore errors when Tika does not parse data
|
||||
|
@ -358,6 +392,20 @@ public class AttachmentMapper implements Mapper {
|
|||
context.externalValue(parsedContent);
|
||||
contentMapper.parse(context);
|
||||
|
||||
if (langDetect) {
|
||||
try {
|
||||
if (language != null) {
|
||||
metadata.add(Metadata.CONTENT_LANGUAGE, language);
|
||||
} else {
|
||||
LanguageIdentifier identifier = new LanguageIdentifier(parsedContent);
|
||||
language = identifier.getLanguage();
|
||||
}
|
||||
context.externalValue(language);
|
||||
languageMapper.parse(context);
|
||||
} catch(Throwable t) {
|
||||
logger.warn("Cannot detect language: {}", t.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
context.externalValue(name);
|
||||
|
@ -437,6 +485,7 @@ public class AttachmentMapper implements Mapper {
|
|||
keywordsMapper.traverse(fieldMapperListener);
|
||||
contentTypeMapper.traverse(fieldMapperListener);
|
||||
contentLengthMapper.traverse(fieldMapperListener);
|
||||
languageMapper.traverse(fieldMapperListener);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -453,6 +502,7 @@ public class AttachmentMapper implements Mapper {
|
|||
keywordsMapper.close();
|
||||
contentTypeMapper.close();
|
||||
contentLengthMapper.close();
|
||||
languageMapper.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -470,6 +520,7 @@ public class AttachmentMapper implements Mapper {
|
|||
keywordsMapper.toXContent(builder, params);
|
||||
contentTypeMapper.toXContent(builder, params);
|
||||
contentLengthMapper.toXContent(builder, params);
|
||||
languageMapper.toXContent(builder, params);
|
||||
builder.endObject();
|
||||
|
||||
builder.endObject();
|
||||
|
|
|
@ -0,0 +1,140 @@
|
|||
/*
|
||||
* Licensed to ElasticSearch and Shay Banon under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. ElasticSearch licenses this
|
||||
* file to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
*/
|
||||
|
||||
package org.elasticsearch.index.mapper.xcontent;
|
||||
|
||||
import org.elasticsearch.common.settings.ImmutableSettings;
|
||||
import org.elasticsearch.common.xcontent.XContentBuilder;
|
||||
import org.elasticsearch.index.Index;
|
||||
import org.elasticsearch.index.analysis.AnalysisService;
|
||||
import org.elasticsearch.index.mapper.DocumentMapper;
|
||||
import org.elasticsearch.index.mapper.DocumentMapperParser;
|
||||
import org.elasticsearch.index.mapper.ParseContext;
|
||||
import org.elasticsearch.index.mapper.attachment.AttachmentMapper;
|
||||
import org.elasticsearch.index.mapper.core.StringFieldMapper;
|
||||
import org.elasticsearch.test.ElasticsearchTestCase;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.elasticsearch.common.io.Streams.copyToBytesFromClasspath;
|
||||
import static org.elasticsearch.common.io.Streams.copyToStringFromClasspath;
|
||||
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
|
||||
import static org.hamcrest.Matchers.equalTo;
|
||||
import static org.hamcrest.Matchers.instanceOf;
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
public class LanguageDetectionAttachmentMapperTests extends ElasticsearchTestCase {
|
||||
|
||||
private DocumentMapper docMapper;
|
||||
|
||||
@Before
|
||||
public void setupMapperParser() throws IOException {
|
||||
setupMapperParser(true);
|
||||
}
|
||||
|
||||
public void setupMapperParser(boolean langDetect) throws IOException {
|
||||
DocumentMapperParser mapperParser = new DocumentMapperParser(new Index("test"),
|
||||
ImmutableSettings.settingsBuilder().put("index.mapping.attachment.detect_language", langDetect).build(),
|
||||
new AnalysisService(new Index("test")), null, null, null);
|
||||
mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser());
|
||||
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/language/language-mapping.json");
|
||||
docMapper = mapperParser.parse(mapping);
|
||||
|
||||
assertThat(docMapper.mappers().fullName("file.language").mapper(), instanceOf(StringFieldMapper.class));
|
||||
}
|
||||
|
||||
private void testLanguage(String filename, String expected, String... forcedLanguage) throws IOException {
|
||||
byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/" + filename);
|
||||
|
||||
XContentBuilder xcb = jsonBuilder()
|
||||
.startObject()
|
||||
.field("_id", 1)
|
||||
.startObject("file")
|
||||
.field("_name", filename)
|
||||
.field("content", html);
|
||||
|
||||
if (forcedLanguage.length > 0) {
|
||||
xcb.field("language", forcedLanguage[0]);
|
||||
}
|
||||
|
||||
xcb.endObject().endObject();
|
||||
|
||||
ParseContext.Document doc = docMapper.parse(xcb.bytes()).rootDoc();
|
||||
|
||||
// Our mapping should be kept as a String
|
||||
assertThat(doc.get(docMapper.mappers().smartName("file.language").mapper().names().indexName()), equalTo(expected));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFrDetection() throws Exception {
|
||||
testLanguage("text-in-french.txt", "fr");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEnDetection() throws Exception {
|
||||
testLanguage("text-in-english.txt", "en");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFrForced() throws Exception {
|
||||
testLanguage("text-in-english.txt", "fr", "fr");
|
||||
}
|
||||
|
||||
/**
|
||||
* This test gives strange results! detection of ":-)" gives "lt" as a result
|
||||
* @throws Exception
|
||||
*/
|
||||
@Test
|
||||
public void testNoLanguage() throws Exception {
|
||||
testLanguage("text-in-nolang.txt", "lt");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLangDetectDisabled() throws Exception {
|
||||
// We replace the mapper with another one which have index.mapping.attachment.detect_language = false
|
||||
setupMapperParser(false);
|
||||
testLanguage("text-in-english.txt", null);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLangDetectDocumentEnabled() throws Exception {
|
||||
// We replace the mapper with another one which have index.mapping.attachment.detect_language = false
|
||||
setupMapperParser(false);
|
||||
|
||||
byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/xcontent/text-in-english.txt");
|
||||
|
||||
XContentBuilder xcb = jsonBuilder()
|
||||
.startObject()
|
||||
.field("_id", 1)
|
||||
.startObject("file")
|
||||
.field("_name", "text-in-english.txt")
|
||||
.field("content", html)
|
||||
.field("_detect_language", true)
|
||||
.endObject().endObject();
|
||||
|
||||
ParseContext.Document doc = docMapper.parse(xcb.bytes()).rootDoc();
|
||||
|
||||
// Our mapping should be kept as a String
|
||||
assertThat(doc.get(docMapper.mappers().smartName("file.language").mapper().names().indexName()), equalTo("en"));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,13 @@
|
|||
{
|
||||
"person": {
|
||||
"properties": {
|
||||
"file": {
|
||||
"type": "attachment",
|
||||
"path": "full",
|
||||
"fields": {
|
||||
"language": { "type": "string" }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1 @@
|
|||
"God Save the Queen" (alternatively "God Save the King"
|
|
@ -0,0 +1 @@
|
|||
Allons enfants de la Patrie Le jour de gloire est arrivé. Contre nous de la tyrannie
|
Loading…
Reference in New Issue