Use `_language` field instead of `language`
When we want to force a language instead of using Tika language detection, we set the `language` field in documents. To be consistent with the other forced fields, `_content_type` and `_name`, we should prefix the `language` field with an underscore `_`, so `language` becomes `_language`. We first deprecate `language` in version 2.1.0 and will remove it in 2.3.0. Closes #68. (cherry picked from commit 2f46343)
This commit is contained in:
parent
7c1c2011bc
commit
94cf141108
14
README.md
14
README.md
|
@ -46,13 +46,14 @@ In this case, the JSON to index can be:
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
Or it is possible to use more elaborated JSON if content type or resource name need to be set explicitly:
|
Or it is possible to use more elaborate JSON if the content type, resource name or language needs to be set explicitly:
|
||||||
|
|
||||||
```javascript
|
```javascript
|
||||||
{
|
{
|
||||||
"my_attachment" : {
|
"my_attachment" : {
|
||||||
"_content_type" : "application/pdf",
|
"_content_type" : "application/pdf",
|
||||||
"_name" : "resource/name/of/my.pdf",
|
"_name" : "resource/name/of/my.pdf",
|
||||||
|
"_language" : "en",
|
||||||
"content" : "... base64 encoded attachment ..."
|
"content" : "... base64 encoded attachment ..."
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -121,7 +122,16 @@ By default, language detection is disabled (`false`) as it could come with a cos
|
||||||
This default value can be changed by setting the `index.mapping.attachment.detect_language` setting.
|
This default value can be changed by setting the `index.mapping.attachment.detect_language` setting.
|
||||||
It can also be provided on a per document indexed using the `_detect_language` parameter.
|
It can also be provided on a per document indexed using the `_detect_language` parameter.
|
||||||
|
|
||||||
Note, this feature is supported since `2.0.0` version.
|
Note that you can force the language using the `_language` field when sending your actual document:
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
{
|
||||||
|
"my_attachment" : {
|
||||||
|
"_language" : "en",
|
||||||
|
"content" : "... base64 encoded attachment ..."
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
Highlighting attachments
|
Highlighting attachments
|
||||||
------------------------
|
------------------------
|
||||||
|
|
|
@ -352,8 +352,11 @@ public class AttachmentMapper implements Mapper {
|
||||||
} else if ("_name".equals(currentFieldName)) {
|
} else if ("_name".equals(currentFieldName)) {
|
||||||
name = parser.text();
|
name = parser.text();
|
||||||
} else if ("language".equals(currentFieldName)) {
|
} else if ("language".equals(currentFieldName)) {
|
||||||
// TODO should be _language
|
// TODO deprecated form. Will be removed in 2.3
|
||||||
language = parser.text();
|
language = parser.text();
|
||||||
|
logger.debug("`language` is now deprecated. Use `_language`. See https://github.com/elasticsearch/elasticsearch-mapper-attachments/issues/68");
|
||||||
|
} else if ("_language".equals(currentFieldName)) {
|
||||||
|
language = parser.text();
|
||||||
}
|
}
|
||||||
} else if (token == XContentParser.Token.VALUE_NUMBER) {
|
} else if (token == XContentParser.Token.VALUE_NUMBER) {
|
||||||
if ("_indexed_chars".equals(currentFieldName) || "_indexedChars".equals(currentFieldName)) {
|
if ("_indexed_chars".equals(currentFieldName) || "_indexedChars".equals(currentFieldName)) {
|
||||||
|
|
|
@ -74,7 +74,7 @@ public class LanguageDetectionAttachmentMapperTests extends ElasticsearchTestCas
|
||||||
.field("content", html);
|
.field("content", html);
|
||||||
|
|
||||||
if (forcedLanguage.length > 0) {
|
if (forcedLanguage.length > 0) {
|
||||||
xcb.field("language", forcedLanguage[0]);
|
xcb.field("_language", forcedLanguage[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
xcb.endObject().endObject();
|
xcb.endObject().endObject();
|
||||||
|
|
Loading…
Reference in New Issue