diff --git a/docs/CHANGELOG.asciidoc b/docs/CHANGELOG.asciidoc index f4ecaf44a6c..66a81038f0d 100644 --- a/docs/CHANGELOG.asciidoc +++ b/docs/CHANGELOG.asciidoc @@ -146,6 +146,9 @@ The new <> field allows to know which fields got ignored at index time because of the <> option. ({pull}30140[#29658]) +A new analysis plugin called `analysis-nori` exposes the Lucene Korean +analysis module. ({pull}30397[#30397]) + [float] === Enhancements diff --git a/docs/build.gradle b/docs/build.gradle index 5057bead62d..e8c406594b2 100644 --- a/docs/build.gradle +++ b/docs/build.gradle @@ -32,6 +32,7 @@ integTestCluster { configFile 'analysis/synonym.txt' configFile 'analysis/stemmer_override.txt' configFile 'userdict_ja.txt' + configFile 'userdict_ko.txt' configFile 'KeywordTokenizer.rbbi' extraConfigFile 'hunspell/en_US/en_US.aff', '../server/src/test/resources/indices/analyze/conf_dir/hunspell/en_US/en_US.aff' extraConfigFile 'hunspell/en_US/en_US.dic', '../server/src/test/resources/indices/analyze/conf_dir/hunspell/en_US/en_US.dic' diff --git a/docs/plugins/analysis-nori.asciidoc b/docs/plugins/analysis-nori.asciidoc new file mode 100644 index 00000000000..1cec063b273 --- /dev/null +++ b/docs/plugins/analysis-nori.asciidoc @@ -0,0 +1,408 @@ +[[analysis-nori]] +=== Korean (nori) Analysis Plugin + +The Korean (nori) Analysis plugin integrates Lucene nori analysis +module into elasticsearch. It uses the https://bitbucket.org/eunjeon/mecab-ko-dic[mecab-ko-dic dictionary] +to perform morphological analysis of Korean texts. + +:plugin_name: analysis-nori +include::install_remove.asciidoc[] + +[[analysis-nori-analyzer]] +==== `nori` analyzer + +The `nori` analyzer consists of the following tokenizer and token filters: + +* <<analysis-nori-tokenizer,`nori_tokenizer`>> +* <<analysis-nori-speech,`nori_part_of_speech`>> token filter +* <<analysis-nori-readingform,`nori_readingform`>> token filter +* {ref}/analysis-lowercase-tokenfilter.html[`lowercase`] token filter + +It supports the `decompound_mode` and `user_dictionary` settings from +<<analysis-nori-tokenizer,`nori_tokenizer`>> and the `stoptags` setting from +<<analysis-nori-speech,`nori_part_of_speech`>>. 
+ +[[analysis-nori-tokenizer]] +==== `nori_tokenizer` + +The `nori_tokenizer` accepts the following settings: + +`decompound_mode`:: ++ +-- + +The decompound mode determines how the tokenizer handles compound tokens. +It can be set to: + +`none`:: + + No decomposition for compounds. Example output: + + 가거도항 + 가곡역 + +`discard`:: + + Decomposes compounds and discards the original form (*default*). Example output: + + 가곡역 => 가곡, 역 + +`mixed`:: + + Decomposes compounds and keeps the original form. Example output: + + 가곡역 => 가곡역, 가곡, 역 +-- + +`user_dictionary`:: ++ +-- +The Nori tokenizer uses the https://bitbucket.org/eunjeon/mecab-ko-dic[mecab-ko-dic dictionary] by default. +A `user_dictionary` with custom nouns (`NNG`) may be appended to the default dictionary. +The dictionary should have the following format: + +[source,txt] +----------------------- +<token> [<token 1> ... <token n>] +----------------------- + +The first token is mandatory and represents the custom noun that should be added to +the dictionary. For compound nouns the custom segmentation can be provided +after the first token (`[<token 1> ... <token n>]`). The segmentation of the +custom compound nouns is controlled by the `decompound_mode` setting. +-- + +As a demonstration of how the user dictionary can be used, save the following +dictionary to `$ES_HOME/config/userdict_ko.txt`: + +[source,txt] +----------------------- +c++ <1> +C샤프 +세종 +세종시 세종 시 <2> +----------------------- + + +<1> A simple noun +<2> A compound noun (`세종시`) followed by its decomposition: `세종` and `시`. 
+ + +Then create an analyzer as follows: + +[source,js] +-------------------------------------------------- +PUT nori_sample +{ + "settings": { + "index": { + "analysis": { + "tokenizer": { + "nori_user_dict": { + "type": "nori_tokenizer", + "decompound_mode": "mixed", + "user_dictionary": "userdict_ko.txt" + } + }, + "analyzer": { + "my_analyzer": { + "type": "custom", + "tokenizer": "nori_user_dict" + } + } + } + } + } +} + +GET nori_sample/_analyze +{ + "analyzer": "my_analyzer", + "text": "세종시" <1> +} +-------------------------------------------------- +// CONSOLE + +<1> Sejong city + +The above `analyze` request returns the following: + +[source,js] +-------------------------------------------------- +{ + "tokens" : [ { + "token" : "세종시", + "start_offset" : 0, + "end_offset" : 3, + "type" : "word", + "position" : 0, + "positionLength" : 2 <1> + }, { + "token" : "세종", + "start_offset" : 0, + "end_offset" : 2, + "type" : "word", + "position" : 0 + }, { + "token" : "시", + "start_offset" : 2, + "end_offset" : 3, + "type" : "word", + "position" : 1 + }] +} +-------------------------------------------------- +// TESTRESPONSE + +<1> This is a compound token that spans two positions (`mixed` mode). + +The `nori_tokenizer` sets a number of additional attributes per token that are used by token filters +to modify the stream. 
+You can view all these additional attributes with the following request: + +[source,js] +-------------------------------------------------- +GET _analyze +{ + "tokenizer": "nori_tokenizer", + "text": "뿌리가 깊은 나무는", <1> + "attributes" : ["posType", "leftPOS", "rightPOS", "morphemes", "reading"], + "explain": true +} +-------------------------------------------------- +// CONSOLE + +<1> A tree with deep roots + +Which responds with: + +[source,js] +-------------------------------------------------- +{ + "detail": { + "custom_analyzer": true, + "charfilters": [], + "tokenizer": { + "name": "nori_tokenizer", + "tokens": [ + { + "token": "뿌리", + "start_offset": 0, + "end_offset": 2, + "type": "word", + "position": 0, + "leftPOS": "NNG(General Noun)", + "morphemes": null, + "posType": "MORPHEME", + "reading": null, + "rightPOS": "NNG(General Noun)" + }, + { + "token": "가", + "start_offset": 2, + "end_offset": 3, + "type": "word", + "position": 1, + "leftPOS": "J(Ending Particle)", + "morphemes": null, + "posType": "MORPHEME", + "reading": null, + "rightPOS": "J(Ending Particle)" + }, + { + "token": "깊", + "start_offset": 4, + "end_offset": 5, + "type": "word", + "position": 2, + "leftPOS": "VA(Adjective)", + "morphemes": null, + "posType": "MORPHEME", + "reading": null, + "rightPOS": "VA(Adjective)" + }, + { + "token": "은", + "start_offset": 5, + "end_offset": 6, + "type": "word", + "position": 3, + "leftPOS": "E(Verbal endings)", + "morphemes": null, + "posType": "MORPHEME", + "reading": null, + "rightPOS": "E(Verbal endings)" + }, + { + "token": "나무", + "start_offset": 7, + "end_offset": 9, + "type": "word", + "position": 4, + "leftPOS": "NNG(General Noun)", + "morphemes": null, + "posType": "MORPHEME", + "reading": null, + "rightPOS": "NNG(General Noun)" + }, + { + "token": "는", + "start_offset": 9, + "end_offset": 10, + "type": "word", + "position": 5, + "leftPOS": "J(Ending Particle)", + "morphemes": null, + "posType": "MORPHEME", + "reading": null, + "rightPOS": 
"J(Ending Particle)" + } + ] + }, + "tokenfilters": [] + } +} +-------------------------------------------------- +// TESTRESPONSE + +[[analysis-nori-speech]] +==== `nori_part_of_speech` token filter + +The `nori_part_of_speech` token filter removes tokens that match a set of +part-of-speech tags. The list of supported tags and their meanings can be found here: +{lucene_version_path}/org/apache/lucene/analysis/ko/POS.Tag.html[Part of speech tags] + +It accepts the following setting: + +`stoptags`:: + + An array of part-of-speech tags that should be removed. + +and defaults to: + +``` +"stoptags": [ + "E", + "IC", + "J", + "MAG", "MAJ", "MM", + "SP", "SSC", "SSO", "SC", "SE", + "XPN", "XSA", "XSN", "XSV", + "UNA", "NA", "VSV" +] +``` + +For example: + +[source,js] +-------------------------------------------------- +PUT nori_sample +{ + "settings": { + "index": { + "analysis": { + "analyzer": { + "my_analyzer": { + "tokenizer": "nori_tokenizer", + "filter": [ + "my_posfilter" + ] + } + }, + "filter": { + "my_posfilter": { + "type": "nori_part_of_speech", + "stoptags": [ + "NR" <1> + ] + } + } + } + } + } +} + +GET nori_sample/_analyze +{ + "analyzer": "my_analyzer", + "text": "여섯 용이" <2> +} +-------------------------------------------------- +// CONSOLE + +<1> Korean numerals should be removed (`NR`) +<2> Six dragons + +Which responds with: + +[source,js] +-------------------------------------------------- +{ + "tokens" : [ { + "token" : "용", + "start_offset" : 3, + "end_offset" : 4, + "type" : "word", + "position" : 1 + }, { + "token" : "이", + "start_offset" : 4, + "end_offset" : 5, + "type" : "word", + "position" : 2 + } ] +} +-------------------------------------------------- +// TESTRESPONSE + +[[analysis-nori-readingform]] +==== `nori_readingform` token filter + +The `nori_readingform` token filter rewrites tokens written in Hanja to their Hangul form. 
+ +[source,js] +-------------------------------------------------- +PUT nori_sample +{ + "settings": { + "index":{ + "analysis":{ + "analyzer" : { + "my_analyzer" : { + "tokenizer" : "nori_tokenizer", + "filter" : ["nori_readingform"] + } + } + } + } + } +} + +GET nori_sample/_analyze +{ + "analyzer": "my_analyzer", + "text": "鄕歌" <1> +} +-------------------------------------------------- +// CONSOLE + +<1> A token written in Hanja: Hyangga + +Which responds with: + +[source,js] +-------------------------------------------------- +{ + "tokens" : [ { + "token" : "향가", <1> + "start_offset" : 0, + "end_offset" : 2, + "type" : "word", + "position" : 0 + }] +} +-------------------------------------------------- +// TESTRESPONSE + +<1> The Hanja form is replaced by the Hangul +translation. diff --git a/docs/plugins/analysis.asciidoc b/docs/plugins/analysis.asciidoc index c09c48640ea..875c87124ef 100644 --- a/docs/plugins/analysis.asciidoc +++ b/docs/plugins/analysis.asciidoc @@ -20,6 +20,10 @@ transliteration. Advanced analysis of Japanese using the http://www.atilika.org/[Kuromoji analyzer]. +<<analysis-nori,Nori>>:: + +Morphological analysis of Korean using the Lucene Nori analyzer. + <>:: Analyzes tokens into their phonetic equivalent using Soundex, Metaphone, @@ -59,6 +63,8 @@ include::analysis-icu.asciidoc[] include::analysis-kuromoji.asciidoc[] +include::analysis-nori.asciidoc[] + include::analysis-phonetic.asciidoc[] include::analysis-smartcn.asciidoc[] diff --git a/docs/reference/cat/plugins.asciidoc b/docs/reference/cat/plugins.asciidoc index ca35a23d305..a9915d7aaa2 100644 --- a/docs/reference/cat/plugins.asciidoc +++ b/docs/reference/cat/plugins.asciidoc @@ -16,10 +16,11 @@ Might look like: name component version description U7321H6 analysis-icu {version} The ICU Analysis plugin integrates Lucene ICU module into elasticsearch, adding ICU relates analysis components. 
U7321H6 analysis-kuromoji {version} The Japanese (kuromoji) Analysis plugin integrates Lucene kuromoji analysis module into elasticsearch. +U7321H6 analysis-nori {version} The Korean (nori) Analysis plugin integrates Lucene nori analysis module into elasticsearch. U7321H6 analysis-phonetic {version} The Phonetic Analysis plugin integrates phonetic token filter analysis with elasticsearch. U7321H6 analysis-smartcn {version} Smart Chinese Analysis plugin integrates Lucene Smart Chinese analysis module into elasticsearch. U7321H6 analysis-stempel {version} The Stempel (Polish) Analysis plugin integrates Lucene stempel (polish) analysis module into elasticsearch. -U7321H6 analysis-ukrainian {version} The Ukrainian Analysis plugin integrates the Lucene UkrainianMorfologikAnalyzer into elasticsearch. +U7321H6 analysis-ukrainian {version} The Ukrainian Analysis plugin integrates the Lucene UkrainianMorfologikAnalyzer into elasticsearch. U7321H6 discovery-azure-classic {version} The Azure Classic Discovery plugin allows to use Azure Classic API for the unicast discovery mechanism U7321H6 discovery-ec2 {version} The EC2 discovery plugin allows to use AWS API for the unicast discovery mechanism. U7321H6 discovery-file {version} Discovery file plugin enables unicast discovery from hosts stored in a file. 
diff --git a/docs/src/test/cluster/config/userdict_ko.txt b/docs/src/test/cluster/config/userdict_ko.txt new file mode 100644 index 00000000000..63c1c3a1e22 --- /dev/null +++ b/docs/src/test/cluster/config/userdict_ko.txt @@ -0,0 +1,5 @@ +# Additional nouns +c++ +C샤프 +세종 +세종시 세종 시 \ No newline at end of file diff --git a/plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_kuromoji/10_basic.yml b/plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml similarity index 100% rename from plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_kuromoji/10_basic.yml rename to plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml diff --git a/plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_kuromoji/20_search.yml b/plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_nori/20_search.yml similarity index 100% rename from plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_kuromoji/20_search.yml rename to plugins/analysis-kuromoji/src/test/resources/rest-api-spec/test/analysis_nori/20_search.yml diff --git a/plugins/analysis-nori/build.gradle b/plugins/analysis-nori/build.gradle new file mode 100644 index 00000000000..a9d3a1126dc --- /dev/null +++ b/plugins/analysis-nori/build.gradle @@ -0,0 +1,32 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +esplugin { + description 'The Korean (nori) Analysis plugin integrates Lucene nori analysis module into elasticsearch.' + classname 'org.elasticsearch.plugin.analysis.nori.AnalysisNoriPlugin' +} + +dependencies { + compile "org.apache.lucene:lucene-analyzers-nori:${versions.lucene}" +} + +dependencyLicenses { + mapping from: /lucene-.*/, to: 'lucene' +} + diff --git a/plugins/analysis-nori/licenses/lucene-LICENSE.txt b/plugins/analysis-nori/licenses/lucene-LICENSE.txt new file mode 100644 index 00000000000..28b134f5f8e --- /dev/null +++ b/plugins/analysis-nori/licenses/lucene-LICENSE.txt @@ -0,0 +1,475 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + + +Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was +derived from unicode conversion examples available at +http://www.unicode.org/Public/PROGRAMS/CVTUTF. Here is the copyright +from those sources: + +/* + * Copyright 2001-2004 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + + +Some code in core/src/java/org/apache/lucene/util/ArrayUtil.java was +derived from Python 2.4.2 sources available at +http://www.python.org. 
Full license is here: + + http://www.python.org/download/releases/2.4.2/license/ + +Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was +derived from Python 3.1.2 sources available at +http://www.python.org. Full license is here: + + http://www.python.org/download/releases/3.1.2/license/ + +Some code in core/src/java/org/apache/lucene/util/automaton was +derived from Brics automaton sources available at +www.brics.dk/automaton/. Here is the copyright from those sources: + +/* + * Copyright (c) 2001-2009 Anders Moeller + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +The levenshtein automata tables in core/src/java/org/apache/lucene/util/automaton +were automatically generated with the moman/finenight FSA package. +Here is the copyright for those sources: + +# Copyright (c) 2010, Jean-Philippe Barrette-LaPierre, +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was +derived from ICU (http://www.icu-project.org) +The full license is available here: + http://source.icu-project.org/repos/icu/icu/trunk/license.html + +/* + * Copyright (C) 1999-2010, International Business Machines + * Corporation and others. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, and/or sell copies of the + * Software, and to permit persons to whom the Software is furnished to do so, + * provided that the above copyright notice(s) and this permission notice appear + * in all copies of the Software and that both the above copyright notice(s) and + * this permission notice appear in supporting documentation. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE + * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR + * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER + * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall not + * be used in advertising or otherwise to promote the sale, use or other + * dealings in this Software without prior written authorization of the + * copyright holder. + */ + +The following license applies to the Snowball stemmers: + +Copyright (c) 2001, Dr Martin Porter +Copyright (c) 2002, Richard Boulton +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holders nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The following license applies to the KStemmer: + +Copyright © 2003, +Center for Intelligent Information Retrieval, +University of Massachusetts, Amherst. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this +list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. 
The names "Center for Intelligent Information Retrieval" and +"University of Massachusetts" must not be used to endorse or promote products +derived from this software without prior written permission. To obtain +permission, contact info@ciir.cs.umass.edu. + +THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. + +The following license applies to the Morfologik project: + +Copyright (c) 2006 Dawid Weiss +Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name of Morfologik nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +--- + +The dictionary comes from Morfologik project. Morfologik uses data from +Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and +is licenced on the terms of (inter alia) LGPL and Creative Commons +ShareAlike. The part-of-speech tags were added in Morfologik project and +are not found in the data from sjp.pl. The tagset is similar to IPI PAN +tagset. + +--- + +The following license applies to the Morfeusz project, +used by org.apache.lucene.analysis.morfologik. + +BSD-licensed dictionary of Polish (SGJP) +http://sgjp.pl/morfeusz/ + +Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński, + Marcin Woliński, Robert Wołosz + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. 
+ +THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS +OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR +BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN +IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/plugins/analysis-nori/licenses/lucene-NOTICE.txt b/plugins/analysis-nori/licenses/lucene-NOTICE.txt new file mode 100644 index 00000000000..4970d207895 --- /dev/null +++ b/plugins/analysis-nori/licenses/lucene-NOTICE.txt @@ -0,0 +1,204 @@ +Apache Lucene +Copyright 2001-2018 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Includes software from other Apache Software Foundation projects, +including, but not limited to: + - Apache Ant + - Apache Jakarta Regexp + - Apache Commons + - Apache Xerces + +ICU4J, (under analysis/icu) is licensed under an MIT styles license +and Copyright (c) 1995-2008 International Business Machines Corporation and others + +Some data files (under analysis/icu/src/data) are derived from Unicode data such +as the Unicode Character Database. See http://unicode.org/copyright.html for more +details. + +Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is +BSD-licensed, created by Anders Møller. 
See http://www.brics.dk/automaton/ + +The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were +automatically generated with the moman/finenight FSA library, created by +Jean-Philippe Barrette-LaPierre. This library is available under an MIT license, +see http://sites.google.com/site/rrettesite/moman and +http://bitbucket.org/jpbarrette/moman/overview/ + +The class org.apache.lucene.util.WeakIdentityMap was derived from +the Apache CXF project and is Apache License 2.0. + +The Google Code Prettify is Apache License 2.0. +See http://code.google.com/p/google-code-prettify/ + +JUnit (junit-4.10) is licensed under the Common Public License v. 1.0 +See http://junit.sourceforge.net/cpl-v10.html + +This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin +g Package (jaspell): http://jaspell.sourceforge.net/ +License: The BSD License (http://www.opensource.org/licenses/bsd-license.php) + +The snowball stemmers in + analysis/common/src/java/net/sf/snowball +were developed by Martin Porter and Richard Boulton. +The snowball stopword lists in + analysis/common/src/resources/org/apache/lucene/analysis/snowball +were developed by Martin Porter and Richard Boulton. +The full snowball package is available from + http://snowball.tartarus.org/ + +The KStem stemmer in + analysis/common/src/org/apache/lucene/analysis/en +was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) +under the BSD-license. + +The Arabic,Persian,Romanian,Bulgarian, Hindi and Bengali analyzers (common) come with a default +stopword list that is BSD-licensed created by Jacques Savoy. 
These files reside in: +analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt, +analysis/common/src/resources/org/apache/lucene/analysis/bn/stopwords.txt +See http://members.unine.ch/jacques.savoy/clef/index.html. + +The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers +(common) are based on BSD-licensed reference implementations created by Jacques Savoy and +Ljiljana Dolamic. These files reside in: +analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java +analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java + +The Stempel analyzer (stempel) includes BSD-licensed software developed +by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, +and Edmond Nolan. + +The Polish analyzer (stempel) comes with a default +stopword list that is BSD-licensed created by the Carrot2 project. 
The file resides +in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt. +See http://project.carrot2.org/license.html. + +The SmartChineseAnalyzer source code (smartcn) was +provided by Xiaoping Gao and copyright 2009 by www.imdict.net. + +WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/) +is derived from Unicode data such as the Unicode Character Database. +See http://unicode.org/copyright.html for more details. + +The Morfologik analyzer (morfologik) includes BSD-licensed software +developed by Dawid Weiss and Marcin Miłkowski (http://morfologik.blogspot.com/). + +Morfologik uses data from Polish ispell/myspell dictionary +(http://www.sjp.pl/slownik/en/) licenced on the terms of (inter alia) +LGPL and Creative Commons ShareAlike. + +Morfologic includes data from BSD-licensed dictionary of Polish (SGJP) +(http://sgjp.pl/morfeusz/) + +Servlet-api.jar and javax.servlet-*.jar are under the CDDL license, the original +source code for this can be found at http://www.eclipse.org/jetty/downloads.php + +=========================================================================== +Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ipadic-2.7.0-20070801 + +which can be obtained from + + http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz + +or + + http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz + +=========================================================================== +mecab-ipadic-2.7.0-20070801 Notice +=========================================================================== + +Nara Institute of Science and Technology (NAIST), +the copyright holders, disclaims all warranties with regard to this +software, including all implied warranties of merchantability and +fitness, in no 
event shall NAIST be liable for +any special, indirect or consequential damages or any damages +whatsoever resulting from loss of use, data or profits, whether in an +action of contract, negligence or other tortuous action, arising out +of or in connection with the use or performance of this software. + +A large portion of the dictionary entries +originate from ICOT Free Software. The following conditions for ICOT +Free Software applies to the current dictionary as well. + +Each User may also freely distribute the Program, whether in its +original form or modified, to any third party or parties, PROVIDED +that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear +on, or be attached to, the Program, which is distributed substantially +in the same form as set out herein and that such intended +distribution, if actually made, will neither violate or otherwise +contravene any of the laws and regulations of the countries having +jurisdiction over the User or the intended distribution itself. + +NO WARRANTY + +The program was produced on an experimental basis in the course of the +research and development conducted during the project and is provided +to users as so produced on an experimental basis. Accordingly, the +program is provided without any warranty whatsoever, whether express, +implied, statutory or otherwise. The term "warranty" used herein +includes, but is not limited to, any warranty of the quality, +performance, merchantability and fitness for a particular purpose of +the program and the nonexistence of any infringement or violation of +any right of any third party. + +Each user of the program will agree and understand, and be deemed to +have agreed and understood, that there is no warranty whatsoever for +the program and, accordingly, the entire risk arising from or +otherwise connected with the program is assumed by the user. 
+ +Therefore, neither ICOT, the copyright holder, or any other +organization that participated in or was otherwise related to the +development of the program and their respective officials, directors, +officers and other employees shall be held liable for any and all +damages, including, without limitation, general, special, incidental +and consequential damages, arising out of or otherwise in connection +with the use or inability to use the program or any product, material +or result produced or otherwise obtained by using the program, +regardless of whether they have been advised of, or otherwise had +knowledge of, the possibility of such damages at any time during the +project or thereafter. Each user will be deemed to have agreed to the +foregoing by his or her commencement of use of the program. The term +"use" as used herein includes, but is not limited to, the use, +modification, copying and distribution of the program and the +production of secondary products from the program. + +In the case where the program, whether in its original form or +modified, was distributed or delivered to or received by a user from +any person, organization or entity other than ICOT, unless it makes or +grants independently of ICOT any specific warranty to the user in +writing, such person, organization or entity, will also be exempted +from and not be held liable to the user for any such damages as noted +above as far as the program is concerned. 
+ +=========================================================================== +Nori Korean Morphological Analyzer - Apache Lucene Integration +=========================================================================== + +This software includes a binary and/or source version of data from + + mecab-ko-dic-2.0.3-20170922 + +which can be obtained from + + https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.0.3-20170922.tar.gz diff --git a/plugins/analysis-nori/licenses/lucene-analyzers-nori-7.4.0-snapshot-1ed95c097b.jar.sha1 b/plugins/analysis-nori/licenses/lucene-analyzers-nori-7.4.0-snapshot-1ed95c097b.jar.sha1 new file mode 100644 index 00000000000..b10ae670df5 --- /dev/null +++ b/plugins/analysis-nori/licenses/lucene-analyzers-nori-7.4.0-snapshot-1ed95c097b.jar.sha1 @@ -0,0 +1 @@ +a7daed3dc3a67674862002f315cd9193944de783 \ No newline at end of file diff --git a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriAnalyzerProvider.java b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriAnalyzerProvider.java new file mode 100644 index 00000000000..f85c3f94e34 --- /dev/null +++ b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriAnalyzerProvider.java @@ -0,0 +1,54 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import java.util.List; +import java.util.Set; +import org.apache.lucene.analysis.ko.KoreanAnalyzer; +import org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilter; +import org.apache.lucene.analysis.ko.KoreanTokenizer; +import org.apache.lucene.analysis.ko.dict.UserDictionary; +import org.apache.lucene.analysis.ko.POS; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; + +import static org.elasticsearch.index.analysis.NoriPartOfSpeechStopFilterFactory.resolvePOSList; + + +/** Analyzer provider that builds one Lucene KoreanAnalyzer from the index settings and hands it out through get(). NOTE(review): generic type arguments appear to have been stripped from this patch text (superclass, List, Set) - confirm against the real source file. */ public class NoriAnalyzerProvider extends AbstractIndexAnalyzerProvider { + /** Analyzer instance created once in the constructor and returned by every get() call. */ private final KoreanAnalyzer analyzer; + + /** Obtains the decompound mode and user dictionary through the NoriTokenizerFactory static helpers, reads the "stoptags" word list through Analysis.getWordList, then constructs the analyzer. */ public NoriAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { + super(indexSettings, name, settings); + final KoreanTokenizer.DecompoundMode mode = NoriTokenizerFactory.getMode(settings); + final UserDictionary userDictionary = NoriTokenizerFactory.getUserDictionary(env, settings); + final List tagList = Analysis.getWordList(env, settings, "stoptags"); + /* A null tag list selects KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS; otherwise the configured tags are converted by resolvePOSList. */ final Set stopTags = tagList != null ? 
resolvePOSList(tagList) : KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS; + /* NOTE(review): the meaning of the trailing boolean is not visible here; presumably Lucene's output-unknown-unigrams flag - confirm against the KoreanAnalyzer constructor. */ analyzer = new KoreanAnalyzer(userDictionary, mode, stopTags, false); + } + + /** Returns the analyzer built by the constructor; the same instance on every call. */ @Override + public KoreanAnalyzer get() { + return analyzer; + } + + +} diff --git a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriPartOfSpeechStopFilterFactory.java b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriPartOfSpeechStopFilterFactory.java new file mode 100644 index 00000000000..d893c35cefb --- /dev/null +++ b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriPartOfSpeechStopFilterFactory.java @@ -0,0 +1,55 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ko.KoreanPartOfSpeechStopFilter; +import org.apache.lucene.analysis.ko.POS; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** Token filter factory that wraps a stream in Lucene's KoreanPartOfSpeechStopFilter, using stop tags taken from the "stoptags" setting or Lucene's defaults. */ public class NoriPartOfSpeechStopFilterFactory extends AbstractTokenFilterFactory { + /** Part-of-speech tags handed to every KoreanPartOfSpeechStopFilter this factory creates. */ private final Set stopTags; + + /** Loads the "stoptags" word list; when Analysis.getWordList returns null, falls back to KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS. */ public NoriPartOfSpeechStopFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { + super(indexSettings, name, settings); + List tagList = Analysis.getWordList(env, settings, "stoptags"); + this.stopTags = tagList != null ? resolvePOSList(tagList) : KoreanPartOfSpeechStopFilter.DEFAULT_STOP_TAGS; + } + + /** Wraps tokenStream in a new KoreanPartOfSpeechStopFilter configured with this factory's stop tags. */ @Override + public TokenStream create(TokenStream tokenStream) { + return new KoreanPartOfSpeechStopFilter(tokenStream, stopTags); + } + + + /** Converts each tag name with POS.resolveTag and collects the results in a new HashSet. Not private, so NoriAnalyzerProvider can reuse it through a static import. NOTE(review): handling of an unknown tag name depends on POS.resolveTag - presumably it throws; confirm. 
*/ static Set resolvePOSList(List tagList) { + Set stopTags = new HashSet<>(); + for (String tag : tagList) { + stopTags.add(POS.resolveTag(tag)); + } + return stopTags; + } +} diff --git a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriReadingFormFilterFactory.java b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriReadingFormFilterFactory.java new file mode 100644 index 00000000000..aac6003c1b7 --- /dev/null +++ b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriReadingFormFilterFactory.java @@ -0,0 +1,37 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.ko.KoreanReadingFormFilter; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; + +/** Token filter factory that wraps a stream in Lucene's KoreanReadingFormFilter; it reads no settings of its own. */ public class NoriReadingFormFilterFactory extends AbstractTokenFilterFactory { + /** Forwards the index settings, name and settings to the superclass; the environment argument is not used in this constructor. 
*/ public NoriReadingFormFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { + super(indexSettings, name, settings); + } + + /** Returns a new KoreanReadingFormFilter over the given stream. */ @Override + public TokenStream create(TokenStream tokenStream) { + return new KoreanReadingFormFilter(tokenStream); + } +} diff --git a/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java new file mode 100644 index 00000000000..346cc84e5e6 --- /dev/null +++ b/plugins/analysis-nori/src/main/java/org/elasticsearch/index/analysis/NoriTokenizerFactory.java @@ -0,0 +1,72 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.ko.KoreanTokenizer; +import org.apache.lucene.analysis.ko.dict.UserDictionary; +import org.elasticsearch.ElasticsearchException; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; + +import java.io.IOException; +import java.io.Reader; +import java.util.Locale; + +/** Tokenizer factory that creates Lucene KoreanTokenizer instances configured with an optional user dictionary and a decompound mode. */ public class NoriTokenizerFactory extends AbstractTokenizerFactory { + /** Name of the setting passed to Analysis.getReaderFromFile to locate the user dictionary. */ private static final String USER_DICT_OPTION = "user_dictionary"; + + private final UserDictionary userDictionary; + private final KoreanTokenizer.DecompoundMode decompoundMode; + + /** Resolves the decompound mode and the user dictionary once, at construction time, through the static helpers below. */ public NoriTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { + super(indexSettings, name, settings); + decompoundMode = getMode(settings); + userDictionary = getUserDictionary(env, settings); + } + + /** Opens the user dictionary referenced by the "user_dictionary" setting. Returns null when Analysis.getReaderFromFile yields no reader. 
An IOException is rethrown as ElasticsearchException with the original as cause. The reader is closed by try-with-resources, which skips a null resource. */ public static UserDictionary getUserDictionary(Environment env, Settings settings) { + try (Reader reader = Analysis.getReaderFromFile(env, settings, USER_DICT_OPTION)) { + if (reader == null) { + return null; + } else { + return UserDictionary.open(reader); + } + } catch (IOException e) { + throw new ElasticsearchException("failed to load nori user dictionary", e); + } + } + + /** Returns the decompound mode named by the "decompound_mode" setting, or KoreanTokenizer.DEFAULT_DECOMPOUND when the setting is absent. The value is upper-cased with Locale.ENGLISH before the valueOf lookup, so matching is case-insensitive. NOTE(review): an unrecognized value is not validated here; presumably it surfaces as the IllegalArgumentException from valueOf - consider a clearer message. */ public static KoreanTokenizer.DecompoundMode getMode(Settings settings) { + KoreanTokenizer.DecompoundMode mode = KoreanTokenizer.DEFAULT_DECOMPOUND; + String modeSetting = 
settings.get("decompound_mode", null); + if (modeSetting != null) { + mode = KoreanTokenizer.DecompoundMode.valueOf(modeSetting.toUpperCase(Locale.ENGLISH)); + } + return mode; + } + + /** Creates a new KoreanTokenizer with the default token attribute factory and this factory's dictionary and mode. NOTE(review): the meaning of the trailing false is not visible here - confirm against the KoreanTokenizer constructor. */ @Override + public Tokenizer create() { + return new KoreanTokenizer(KoreanTokenizer.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, userDictionary, decompoundMode, false); + } + +} diff --git a/plugins/analysis-nori/src/main/java/org/elasticsearch/plugin/analysis/nori/AnalysisNoriPlugin.java b/plugins/analysis-nori/src/main/java/org/elasticsearch/plugin/analysis/nori/AnalysisNoriPlugin.java new file mode 100644 index 00000000000..6e9baa7acd2 --- /dev/null +++ b/plugins/analysis-nori/src/main/java/org/elasticsearch/plugin/analysis/nori/AnalysisNoriPlugin.java @@ -0,0 +1,57 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.plugin.analysis.nori; + +import org.apache.lucene.analysis.Analyzer; +import org.elasticsearch.index.analysis.AnalyzerProvider; +import org.elasticsearch.index.analysis.NoriAnalyzerProvider; +import org.elasticsearch.index.analysis.NoriPartOfSpeechStopFilterFactory; +import org.elasticsearch.index.analysis.NoriReadingFormFilterFactory; +import org.elasticsearch.index.analysis.NoriTokenizerFactory; +import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.index.analysis.TokenizerFactory; +import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; +import org.elasticsearch.plugins.AnalysisPlugin; +import org.elasticsearch.plugins.Plugin; + +import java.util.HashMap; +import java.util.Map; + +import static java.util.Collections.singletonMap; + +/** Plugin entry point that registers the nori analysis components under their public names. NOTE(review): the map return types read "Map>" because generic type arguments were stripped from this patch text - restore them from the real file before applying. */ public class AnalysisNoriPlugin extends Plugin implements AnalysisPlugin { + /** Registers the "nori_part_of_speech" and "nori_readingform" token filters, each bound to its factory constructor. */ @Override + public Map> getTokenFilters() { + Map> extra = new HashMap<>(); + extra.put("nori_part_of_speech", NoriPartOfSpeechStopFilterFactory::new); + extra.put("nori_readingform", NoriReadingFormFilterFactory::new); + return extra; + } + + /** Registers the "nori_tokenizer" tokenizer as a single-entry map. */ @Override + public Map> getTokenizers() { + return singletonMap("nori_tokenizer", NoriTokenizerFactory::new); + } + + /** Registers the "nori" analyzer as a single-entry map. 
*/ @Override + public Map>> getAnalyzers() { + return singletonMap("nori", NoriAnalyzerProvider::new); + } +} diff --git a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/AnalysisNoriFactoryTests.java b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/AnalysisNoriFactoryTests.java new file mode 100644 index 00000000000..1677ba94b87 --- /dev/null +++ b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/AnalysisNoriFactoryTests.java @@ -0,0 +1,48 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. 
Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.ko.KoreanTokenizerFactory; +import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase; +import org.elasticsearch.plugin.analysis.nori.AnalysisNoriPlugin; + +import java.util.HashMap; +import java.util.Map; + +/** Factory test for the nori plugin: extends the tokenizer and token-filter maps inherited from AnalysisFactoryTestCase with the Korean entries. */ public class AnalysisNoriFactoryTests extends AnalysisFactoryTestCase { + /** Hands a fresh AnalysisNoriPlugin to the base test case. 
*/ public AnalysisNoriFactoryTests() { + super(new AnalysisNoriPlugin()); + } + + /** Copies the inherited tokenizer map and adds the "korean" entry. NOTE(review): this entry maps to Lucene's own KoreanTokenizerFactory, whereas the filter entries in getTokenFilters map to the plugin's Nori* factories - confirm that NoriTokenizerFactory was not intended. */ @Override + protected Map> getTokenizers() { + Map> tokenizers = new HashMap<>(super.getTokenizers()); + tokenizers.put("korean", KoreanTokenizerFactory.class); + return tokenizers; + } + + /** Copies the inherited token-filter map and adds the "koreanpartofspeechstop" and "koreanreadingform" entries, mapped to the plugin's factory classes. */ @Override + protected Map> getTokenFilters() { + Map> filters = new HashMap<>(super.getTokenFilters()); + filters.put("koreanpartofspeechstop", NoriPartOfSpeechStopFilterFactory.class); + filters.put("koreanreadingform", NoriReadingFormFilterFactory.class); + return filters; + } +} diff --git a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java new file mode 100644 index 00000000000..fa5858a7bbb --- /dev/null +++ b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriAnalysisTests.java @@ -0,0 +1,147 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. 
See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.ko.KoreanAnalyzer;
+import org.apache.lucene.analysis.ko.KoreanTokenizer;
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.plugin.analysis.nori.AnalysisNoriPlugin;
+import org.elasticsearch.test.ESTestCase.TestAnalysis;
+import org.elasticsearch.test.ESTokenStreamTestCase;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import static org.hamcrest.Matchers.instanceOf;
+
+/**
+ * Unit tests for the analysis components registered by {@link AnalysisNoriPlugin}:
+ * the {@code nori} analyzer, the {@code nori_tokenizer} tokenizer and the
+ * {@code nori_part_of_speech} / {@code nori_readingform} token filters.
+ */
+public class NoriAnalysisTests extends ESTokenStreamTestCase {
+    /** With empty settings, every component is registered under its default name. */
+    public void testDefaultsNoriAnalysis() throws IOException {
+        TestAnalysis analysis = createTestAnalysis(Settings.EMPTY);
+
+        TokenizerFactory tokenizerFactory = analysis.tokenizer.get("nori_tokenizer");
+        assertThat(tokenizerFactory, instanceOf(NoriTokenizerFactory.class));
+
+        TokenFilterFactory filterFactory = analysis.tokenFilter.get("nori_part_of_speech");
+        assertThat(filterFactory, instanceOf(NoriPartOfSpeechStopFilterFactory.class));
+
+        filterFactory = analysis.tokenFilter.get("nori_readingform");
+        assertThat(filterFactory, instanceOf(NoriReadingFormFilterFactory.class));
+
+        IndexAnalyzers indexAnalyzers = analysis.indexAnalyzers;
+        NamedAnalyzer analyzer = indexAnalyzers.get("nori");
+        assertThat(analyzer.analyzer(), instanceOf(KoreanAnalyzer.class));
+    }
+
+    /** A custom {@code nori} analyzer honours the {@code stoptags} and {@code decompound_mode} settings. */
+    public void testNoriAnalyzer() throws Exception {
+        Settings settings = Settings.builder()
+            .put("index.analysis.analyzer.my_analyzer.type", "nori")
+            .put("index.analysis.analyzer.my_analyzer.stoptags", "NR, SP")
+            .put("index.analysis.analyzer.my_analyzer.decompound_mode", "mixed")
+            .build();
+        TestAnalysis analysis = createTestAnalysis(settings);
+        Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
+        // With the stop tags above only these two tokens are expected to survive.
+        try (TokenStream stream = analyzer.tokenStream("", "여섯 용이")) {
+            assertTokenStreamContents(stream, new String[] {"용", "이"});
+        }
+
+        // "mixed" is expected to emit the compound followed by its parts.
+        try (TokenStream stream = analyzer.tokenStream("", "가늠표")) {
+            assertTokenStreamContents(stream, new String[] {"가늠표", "가늠", "표"});
+        }
+    }
+
+    /** A custom {@code nori} analyzer picks up entries from the configured {@code user_dictionary}. */
+    public void testNoriAnalyzerUserDict() throws Exception {
+        Settings settings = Settings.builder()
+            .put("index.analysis.analyzer.my_analyzer.type", "nori")
+            .put("index.analysis.analyzer.my_analyzer.user_dictionary", "user_dict.txt")
+            .build();
+        TestAnalysis analysis = createTestAnalysis(settings);
+        Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");
+        try (TokenStream stream = analyzer.tokenStream("", "세종시")) {
+            assertTokenStreamContents(stream, new String[] {"세종", "시"});
+        }
+
+        try (TokenStream stream = analyzer.tokenStream("", "c++world")) {
+            assertTokenStreamContents(stream, new String[] {"c++", "world"});
+        }
+    }
+
+    /** A custom {@code nori_tokenizer} honours the {@code decompound_mode} setting. */
+    public void testNoriTokenizer() throws Exception {
+        Settings settings = Settings.builder()
+            .put("index.analysis.tokenizer.my_tokenizer.type", "nori_tokenizer")
+            .put("index.analysis.tokenizer.my_tokenizer.decompound_mode", "mixed")
+            .build();
+        TestAnalysis analysis = createTestAnalysis(settings);
+        Tokenizer tokenizer = analysis.tokenizer.get("my_tokenizer").create();
+        tokenizer.setReader(new StringReader("뿌리가 깊은 나무"));
+        assertTokenStreamContents(tokenizer, new String[] {"뿌리", "가", "깊", "은", "나무"});
+        tokenizer.setReader(new StringReader("가늠표"));
+        assertTokenStreamContents(tokenizer, new String[] {"가늠표", "가늠", "표"});
+    }
+
+    /** A custom {@code nori_part_of_speech} filter applies the configured {@code stoptags}. */
+    public void testNoriPartOfSpeech() throws IOException {
+        Settings settings = Settings.builder()
+            .put("index.analysis.filter.my_filter.type", "nori_part_of_speech")
+            .put("index.analysis.filter.my_filter.stoptags", "NR, SP")
+            .build();
+        TestAnalysis analysis = createTestAnalysis(settings);
+        TokenFilterFactory factory = analysis.tokenFilter.get("my_filter");
+        Tokenizer tokenizer = new KoreanTokenizer();
+        tokenizer.setReader(new StringReader("여섯 용이"));
+        TokenStream stream = factory.create(tokenizer);
+        assertTokenStreamContents(stream, new String[] {"용", "이"});
+    }
+
+    /** The {@code nori_readingform} filter rewrites the Hanja input to the expected Hangul token. */
+    public void testNoriReadingForm() throws IOException {
+        Settings settings = Settings.builder()
+            .put("index.analysis.filter.my_filter.type", "nori_readingform")
+            .build();
+        // Use the shared helper so the version/path.home bootstrap lives in one place.
+        TestAnalysis analysis = createTestAnalysis(settings);
+        TokenFilterFactory factory = analysis.tokenFilter.get("my_filter");
+        Tokenizer tokenizer = new KoreanTokenizer();
+        tokenizer.setReader(new StringReader("鄕歌"));
+        TokenStream stream = factory.create(tokenizer);
+        assertTokenStreamContents(stream, new String[] {"향가"});
+    }
+
+    /**
+     * Builds a {@link TestAnalysis} for the nori plugin from the given analysis settings.
+     * A fresh temporary home directory is created and the {@code user_dict.txt} test
+     * resource is copied into its {@code config} directory before the settings are applied.
+     *
+     * @param analysisSettings the {@code index.analysis.*} settings under test
+     * @return the analysis registry built with {@link AnalysisNoriPlugin}
+     * @throws IOException if the config directory or the dictionary copy cannot be created
+     */
+    private TestAnalysis createTestAnalysis(Settings analysisSettings) throws IOException {
+        Path home = createTempDir();
+        Path config = home.resolve("config");
+        Files.createDirectory(config);
+        // Close the resource stream once it is copied; it was previously left open.
+        try (InputStream dict = NoriAnalysisTests.class.getResourceAsStream("user_dict.txt")) {
+            Files.copy(dict, config.resolve("user_dict.txt"));
+        }
+        Settings settings = Settings.builder()
+            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
+            .put(Environment.PATH_HOME_SETTING.getKey(), home)
+            .put(analysisSettings)
+            .build();
+        return AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new AnalysisNoriPlugin());
+    }
+}
diff --git a/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriClientYamlTestSuiteIT.java b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriClientYamlTestSuiteIT.java
new file mode 100644
index 00000000000..5c393f617a8
--- /dev/null
+++ b/plugins/analysis-nori/src/test/java/org/elasticsearch/index/analysis/NoriClientYamlTestSuiteIT.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import com.carrotsearch.randomizedtesting.annotations.Name;
+import com.carrotsearch.randomizedtesting.annotations.ParametersFactory;
+
+import org.elasticsearch.test.rest.yaml.ClientYamlTestCandidate;
+import org.elasticsearch.test.rest.yaml.ESClientYamlSuiteTestCase;
+
+/**
+ * YAML REST test suite for the nori plugin. Each instance wraps one
+ * {@link ClientYamlTestCandidate} produced by {@link #parameters()}.
+ */
+public class NoriClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase {
+
+    public NoriClientYamlTestSuiteIT(@Name("yaml") ClientYamlTestCandidate testCandidate) {
+        super(testCandidate);
+    }
+
+    /**
+     * Supplies the constructor arguments for every test instance by delegating to
+     * {@link ESClientYamlSuiteTestCase#createParameters()}.
+     */
+    @ParametersFactory
+    public static Iterable<Object[]> parameters() throws Exception {
+        return ESClientYamlSuiteTestCase.createParameters();
+    }
+}
+
diff --git a/plugins/analysis-nori/src/test/resources/org/elasticsearch/index/analysis/user_dict.txt b/plugins/analysis-nori/src/test/resources/org/elasticsearch/index/analysis/user_dict.txt
new file mode 100644
index 00000000000..63c1c3a1e22
--- /dev/null
+++ b/plugins/analysis-nori/src/test/resources/org/elasticsearch/index/analysis/user_dict.txt
@@ -0,0 +1,5 @@
+# Additional nouns
+c++
+C샤프
+세종
+세종시 세종 시
\ No newline at end of file
diff --git a/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml b/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml
new file mode 100644
index 00000000000..a5aa9998da6
--- /dev/null
+++ b/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/10_basic.yml
@@ -0,0 +1,48 @@
+# Integration tests for Korean analysis components
+#
+---
+"Analyzer":
+  - do:
+      indices.analyze:
+        body:
+          text: 뿌리가 깊은 나무
+          analyzer: nori
+  - length: { tokens: 3 }
+  - match: { tokens.0.token: 뿌리 }
+  - match: { tokens.1.token: 깊 }
+  - match: { tokens.2.token: 나무 }
+---
+"Tokenizer":
+  - do:
+      indices.analyze:
+        body:
+          text: 뿌리가 깊은 나무
+          tokenizer: nori_tokenizer
+  - length: { tokens: 5 }
+  - match: { tokens.0.token: 뿌리 }
+  - match: { tokens.1.token: 가 }
+  - match: { tokens.2.token: 깊 }
+  - match: { tokens.3.token: 은 }
+
- match: { tokens.4.token: 나무 }
+---
+"Part of speech filter":
+  - do:
+      indices.analyze:
+        body:
+          text: 뿌리가 깊은 나무
+          tokenizer: nori_tokenizer
+          filter: [nori_part_of_speech]
+  - length: { tokens: 3 }
+  - match: { tokens.0.token: 뿌리 }
+  - match: { tokens.1.token: 깊 }
+  - match: { tokens.2.token: 나무 }
+---
+"Reading filter":
+  - do:
+      indices.analyze:
+        body:
+          text: 鄕歌
+          tokenizer: nori_tokenizer
+          filter: [nori_readingform]
+  - length: { tokens: 1 }
+  - match: { tokens.0.token: 향가 }
diff --git a/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/20_search.yml b/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/20_search.yml
new file mode 100644
index 00000000000..cfb0ec5ee94
--- /dev/null
+++ b/plugins/analysis-nori/src/test/resources/rest-api-spec/test/analysis_nori/20_search.yml
@@ -0,0 +1,32 @@
+# REST test: index one Korean document into a field analyzed with `nori`, then search it
+#
+---
+"Index Korean content":
+  - do:
+      indices.create:
+        index: test
+        body:
+          mappings:
+            type:
+              properties:
+                text:
+                  type: text
+                  analyzer: nori
+
+  - do:
+      index:
+        index: test
+        type: type
+        id: 1
+        body: { text: "뿌리가 깊은 나무는" }
+  - do:
+      indices.refresh: {}
+
+  - do:
+      search:
+        index: test
+        body:
+          query:
+            match:
+              text: 나무
+  - match: { hits.total: 1 }