Move more token filters to analysis-common module

The following token filters were moved: arabic_normalization, german_normalization, hindi_normalization, indic_normalization, persian_normalization, scandinavian_normalization, serbian_normalization, sorani_normalization, cjk_width and cjk_bigram.

Relates to #23658
parent e9dfb2a215
commit 8003171a0c
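For context on the mechanism: the commit removes the hard-coded tokenFilters.register(...) calls from AnalysisModule and registers the same filters through the AnalysisPlugin#getTokenFilters extension point in CommonAnalysisPlugin. A minimal sketch of that registration pattern, abbreviated from the diff below (only three of the ten filters shown; the map type follows the AnalysisProvider import visible in the diff):

    import java.util.HashMap;
    import java.util.Map;

    import org.elasticsearch.index.analysis.TokenFilterFactory;
    import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
    import org.elasticsearch.plugins.AnalysisPlugin;
    import org.elasticsearch.plugins.Plugin;

    public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
        @Override
        public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
            Map<String, AnalysisProvider<TokenFilterFactory>> filters = new HashMap<>();
            // Each filter keeps the name it had in AnalysisModule, so existing
            // index settings resolve to the same factory after the move.
            filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new);
            filters.put("cjk_width", CJKWidthFilterFactory::new);
            filters.put("cjk_bigram", CJKBigramFilterFactory::new);
            // ... the remaining normalization filters are registered the same way.
            return filters;
        }
    }

Because the factories move into the plugin's own package, their public constructors can become package-private, which is the recurring constructor change in the per-file diffs that follow.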
AnalysisModule.java

@@ -31,15 +31,12 @@ import org.elasticsearch.index.analysis.AnalysisRegistry;
 import org.elasticsearch.index.analysis.AnalyzerProvider;
 import org.elasticsearch.index.analysis.ApostropheFilterFactory;
 import org.elasticsearch.index.analysis.ArabicAnalyzerProvider;
-import org.elasticsearch.index.analysis.ArabicNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.ArmenianAnalyzerProvider;
 import org.elasticsearch.index.analysis.BasqueAnalyzerProvider;
 import org.elasticsearch.index.analysis.BrazilianAnalyzerProvider;
 import org.elasticsearch.index.analysis.BrazilianStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.BulgarianAnalyzerProvider;
-import org.elasticsearch.index.analysis.CJKBigramFilterFactory;
-import org.elasticsearch.index.analysis.CJKWidthFilterFactory;
 import org.elasticsearch.index.analysis.CatalanAnalyzerProvider;
 import org.elasticsearch.index.analysis.CharFilterFactory;
 import org.elasticsearch.index.analysis.ChineseAnalyzerProvider;
@@ -62,14 +59,11 @@ import org.elasticsearch.index.analysis.FrenchAnalyzerProvider;
 import org.elasticsearch.index.analysis.FrenchStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.GalicianAnalyzerProvider;
 import org.elasticsearch.index.analysis.GermanAnalyzerProvider;
-import org.elasticsearch.index.analysis.GermanNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.GermanStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.GreekAnalyzerProvider;
 import org.elasticsearch.index.analysis.HindiAnalyzerProvider;
-import org.elasticsearch.index.analysis.HindiNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.HungarianAnalyzerProvider;
 import org.elasticsearch.index.analysis.HunspellTokenFilterFactory;
-import org.elasticsearch.index.analysis.IndicNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.IndonesianAnalyzerProvider;
 import org.elasticsearch.index.analysis.IrishAnalyzerProvider;
 import org.elasticsearch.index.analysis.ItalianAnalyzerProvider;
@@ -88,7 +82,6 @@ import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory;
 import org.elasticsearch.index.analysis.PatternAnalyzerProvider;
 import org.elasticsearch.index.analysis.PatternTokenizerFactory;
 import org.elasticsearch.index.analysis.PersianAnalyzerProvider;
-import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.PortugueseAnalyzerProvider;
 import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
@@ -97,13 +90,10 @@ import org.elasticsearch.index.analysis.RomanianAnalyzerProvider;
 import org.elasticsearch.index.analysis.RussianAnalyzerProvider;
 import org.elasticsearch.index.analysis.RussianStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory;
-import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory;
-import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
 import org.elasticsearch.index.analysis.SimpleAnalyzerProvider;
 import org.elasticsearch.index.analysis.SnowballAnalyzerProvider;
 import org.elasticsearch.index.analysis.SoraniAnalyzerProvider;
-import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.SpanishAnalyzerProvider;
 import org.elasticsearch.index.analysis.StandardAnalyzerProvider;
 import org.elasticsearch.index.analysis.StandardHtmlStripAnalyzerProvider;
@@ -202,20 +192,10 @@ public final class AnalysisModule {
         tokenFilters.register("french_stem", FrenchStemTokenFilterFactory::new);
         tokenFilters.register("german_stem", GermanStemTokenFilterFactory::new);
         tokenFilters.register("russian_stem", RussianStemTokenFilterFactory::new);
-        tokenFilters.register("arabic_normalization", ArabicNormalizationFilterFactory::new);
-        tokenFilters.register("german_normalization", GermanNormalizationFilterFactory::new);
-        tokenFilters.register("hindi_normalization", HindiNormalizationFilterFactory::new);
-        tokenFilters.register("indic_normalization", IndicNormalizationFilterFactory::new);
-        tokenFilters.register("sorani_normalization", SoraniNormalizationFilterFactory::new);
-        tokenFilters.register("persian_normalization", PersianNormalizationFilterFactory::new);
-        tokenFilters.register("scandinavian_normalization", ScandinavianNormalizationFilterFactory::new);
         tokenFilters.register("scandinavian_folding", ScandinavianFoldingFilterFactory::new);
-        tokenFilters.register("serbian_normalization", SerbianNormalizationFilterFactory::new);

         tokenFilters.register("hunspell", requriesAnalysisSettings((indexSettings, env, name, settings) -> new HunspellTokenFilterFactory
                 (indexSettings, name, settings, hunspellService)));
-        tokenFilters.register("cjk_bigram", CJKBigramFilterFactory::new);
-        tokenFilters.register("cjk_width", CJKWidthFilterFactory::new);

         tokenFilters.register("apostrophe", ApostropheFilterFactory::new);
         tokenFilters.register("classic", ClassicFilterFactory::new);
ArabicNormalizationFilterFactory.java

@@ -16,17 +16,19 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;

 public class ArabicNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {

-    public ArabicNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    ArabicNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
     }
CJKBigramFilterFactory.java

@@ -17,7 +17,7 @@
  * under the License.
  */

-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.cjk.CJKBigramFilter;
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;

 import java.util.Arrays;
 import java.util.HashSet;
@@ -49,7 +50,7 @@ public final class CJKBigramFilterFactory extends AbstractTokenFilterFactory {
     private final int flags;
     private final boolean outputUnigrams;

-    public CJKBigramFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    CJKBigramFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
         outputUnigrams = settings.getAsBooleanLenientForPreEs6Indices(
                 indexSettings.getIndexVersionCreated(), "output_unigrams", false, deprecationLogger);
CJKWidthFilterFactory.java

@@ -17,17 +17,19 @@
  * under the License.
  */

-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.cjk.CJKWidthFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;

 public final class CJKWidthFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {

-    public CJKWidthFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
+    CJKWidthFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, name, settings);
     }
CommonAnalysisPlugin.java

@@ -71,6 +71,7 @@ import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
 import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
+import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.index.analysis.TokenizerFactory;
 import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;
@@ -118,6 +119,16 @@ public class CommonAnalysisPlugin extends Plugin implements AnalysisPlugin {
         filters.put("common_grams", requriesAnalysisSettings(CommonGramsTokenFilterFactory::new));
         filters.put("pattern_replace", requriesAnalysisSettings(PatternReplaceTokenFilterFactory::new));
         filters.put("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
+        filters.put("arabic_normalization", ArabicNormalizationFilterFactory::new);
+        filters.put("german_normalization", GermanNormalizationFilterFactory::new);
+        filters.put("hindi_normalization", HindiNormalizationFilterFactory::new);
+        filters.put("indic_normalization", IndicNormalizationFilterFactory::new);
+        filters.put("persian_normalization", PersianNormalizationFilterFactory::new);
+        filters.put("scandinavian_normalization", ScandinavianNormalizationFilterFactory::new);
+        filters.put("serbian_normalization", SerbianNormalizationFilterFactory::new);
+        filters.put("sorani_normalization", SoraniNormalizationFilterFactory::new);
+        filters.put("cjk_width", CJKWidthFilterFactory::new);
+        filters.put("cjk_bigram", CJKBigramFilterFactory::new);
         return filters;
     }
GermanNormalizationFilterFactory.java

@@ -16,20 +16,22 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.de.GermanNormalizationFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;

 /**
  * Factory for {@link GermanNormalizationFilter}
  */
 public class GermanNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {

-    public GermanNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    GermanNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
     }
HindiNormalizationFilterFactory.java

@@ -16,20 +16,22 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.hi.HindiNormalizationFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;

 /**
  * Factory for {@link HindiNormalizationFilter}
  */
 public class HindiNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {

-    public HindiNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    HindiNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
     }
IndicNormalizationFilterFactory.java

@@ -16,20 +16,22 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.in.IndicNormalizationFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;

 /**
  * Factory for {@link IndicNormalizationFilter}
  */
 public class IndicNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {

-    public IndicNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    IndicNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
     }
PersianNormalizationFilterFactory.java

@@ -16,17 +16,19 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;

 public class PersianNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {

-    public PersianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    PersianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
     }
ScandinavianNormalizationFilterFactory.java

@@ -16,20 +16,22 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;

 /**
  * Factory for {@link ScandinavianNormalizationFilter}
  */
 public class ScandinavianNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {

-    public ScandinavianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    ScandinavianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
     }
SerbianNormalizationFilterFactory.java

@@ -17,17 +17,19 @@
  * under the License.
  */

-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.sr.SerbianNormalizationFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.MultiTermAwareComponent;

 public class SerbianNormalizationFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent {

-    public SerbianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
+    SerbianNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
         super(indexSettings, name, settings);
     }
CJKFilterFactoryTests.java

@@ -17,23 +17,32 @@
  * under the License.
  */

-package org.elasticsearch.index.analysis;
+package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.elasticsearch.index.analysis.AnalysisTestsHelper;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.ESTokenStreamTestCase;
+import org.junit.Before;

 import java.io.IOException;
 import java.io.StringReader;

 public class CJKFilterFactoryTests extends ESTokenStreamTestCase {
-    private static final String RESOURCE = "/org/elasticsearch/index/analysis/cjk_analysis.json";
+    private static final String RESOURCE = "/org/elasticsearch/analysis/common/cjk_analysis.json";
+
+    private ESTestCase.TestAnalysis analysis;
+
+    @Before
+    public void setup() throws IOException {
+        analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE, new CommonAnalysisPlugin());
+    }

     public void testDefault() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_bigram");
         String source = "多くの学生が試験に落ちた。";
         String[] expected = new String[]{"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" };
@@ -43,7 +52,6 @@ public class CJKFilterFactoryTests extends ESTokenStreamTestCase {
     }

     public void testNoFlags() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_no_flags");
         String source = "多くの学生が試験に落ちた。";
         String[] expected = new String[]{"多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" };
@@ -53,7 +61,6 @@ public class CJKFilterFactoryTests extends ESTokenStreamTestCase {
     }

     public void testHanOnly() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_only");
         String source = "多くの学生が試験に落ちた。";
         String[] expected = new String[]{"多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" };
@@ -63,7 +70,6 @@ public class CJKFilterFactoryTests extends ESTokenStreamTestCase {
     }

     public void testHanUnigramOnly() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
         TokenFilterFactory tokenFilter = analysis.tokenFilter.get("cjk_han_unigram_only");
         String source = "多くの学生が試験に落ちた。";
         String[] expected = new String[]{"多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" };
@@ -73,7 +79,6 @@ public class CJKFilterFactoryTests extends ESTokenStreamTestCase {
     }

     public void testDisableGraph() throws IOException {
-        ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromClassPath(createTempDir(), RESOURCE);
         TokenFilterFactory allFlagsFactory = analysis.tokenFilter.get("cjk_all_flags");
         TokenFilterFactory hanOnlyFactory = analysis.tokenFilter.get("cjk_han_only");
CommonAnalysisFactoryTests.java

@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory;
 import org.apache.lucene.analysis.reverse.ReverseStringFilterFactory;
 import org.apache.lucene.analysis.snowball.SnowballPorterFilterFactory;
 import org.elasticsearch.index.analysis.HtmlStripCharFilterFactory;
+import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.SynonymTokenFilterFactory;
 import org.elasticsearch.indices.analysis.AnalysisFactoryTestCase;

@@ -106,6 +107,16 @@ public class CommonAnalysisFactoryTests extends AnalysisFactoryTestCase {
         filters.put("commongramsquery", CommonGramsTokenFilterFactory.class);
         filters.put("patternreplace", PatternReplaceTokenFilterFactory.class);
         filters.put("patterncapturegroup", PatternCaptureGroupTokenFilterFactory.class);
+        filters.put("arabicnormalization", ArabicNormalizationFilterFactory.class);
+        filters.put("germannormalization", GermanNormalizationFilterFactory.class);
+        filters.put("hindinormalization", HindiNormalizationFilterFactory.class);
+        filters.put("indicnormalization", IndicNormalizationFilterFactory.class);
+        filters.put("persiannormalization", PersianNormalizationFilterFactory.class);
+        filters.put("scandinaviannormalization", ScandinavianNormalizationFilterFactory.class);
+        filters.put("serbiannormalization", SerbianNormalizationFilterFactory.class);
+        filters.put("soraninormalization", SoraniNormalizationFilterFactory.class);
+        filters.put("cjkwidth", CJKWidthFilterFactory.class);
+        filters.put("cjkbigram", CJKBigramFilterFactory.class);
         return filters;
     }
analysis-common REST tests (YAML)

@@ -663,3 +663,333 @@
     - match: { tokens.1.token: a_b }
     - match: { tokens.2.token: b }
     - match: { tokens.3.token: c }
+
+---
+"arabic_normalization":
+    - do:
+        indices.create:
+            index: test
+            body:
+                settings:
+                    analysis:
+                        filter:
+                            my_arabic_normalization:
+                                type: arabic_normalization
+    - do:
+        indices.analyze:
+            index: test
+            body:
+                text: آجن
+                tokenizer: keyword
+                filter: [my_arabic_normalization]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: اجن }
+
+    # Test pre-configured token filter too:
+    - do:
+        indices.analyze:
+            body:
+                text: آجن
+                tokenizer: keyword
+                filter: [arabic_normalization]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: اجن }
+
+---
+"german_normalization":
+    - do:
+        indices.create:
+            index: test
+            body:
+                settings:
+                    analysis:
+                        filter:
+                            my_german_normalization:
+                                type: german_normalization
+    - do:
+        indices.analyze:
+            index: test
+            body:
+                text: weißbier
+                tokenizer: keyword
+                filter: [my_german_normalization]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: weissbier }
+
+    # Test pre-configured token filter too:
+    - do:
+        indices.analyze:
+            body:
+                text: weißbier
+                tokenizer: keyword
+                filter: [german_normalization]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: weissbier }
+
+---
+"hindi_normalization":
+    - do:
+        indices.create:
+            index: test
+            body:
+                settings:
+                    analysis:
+                        filter:
+                            my_hindi_normalization:
+                                type: hindi_normalization
+    - do:
+        indices.analyze:
+            index: test
+            body:
+                text: अँगरेज़ी
+                tokenizer: keyword
+                filter: [my_hindi_normalization]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: अंगरेजि }
+
+    # Test pre-configured token filter too:
+    - do:
+        indices.analyze:
+            body:
+                text: अँगरेज़ी
+                tokenizer: keyword
+                filter: [hindi_normalization]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: अंगरेजि }
+
+---
+"indic_normalization":
+    - do:
+        indices.create:
+            index: test
+            body:
+                settings:
+                    analysis:
+                        filter:
+                            my_indic_normalization:
+                                type: indic_normalization
+    - do:
+        indices.analyze:
+            index: test
+            body:
+                text: ত্
+                tokenizer: keyword
+                filter: [my_indic_normalization]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: ৎ }
+
+    # Test pre-configured token filter too:
+    - do:
+        indices.analyze:
+            body:
+                text: ত্
+                tokenizer: keyword
+                filter: [indic_normalization]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: ৎ }
+
+---
+"persian_normalization":
+    - do:
+        indices.create:
+            index: test
+            body:
+                settings:
+                    analysis:
+                        filter:
+                            my_persian_normalization:
+                                type: persian_normalization
+    - do:
+        indices.analyze:
+            index: test
+            body:
+                text: های
+                tokenizer: keyword
+                filter: [my_persian_normalization]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: هاي }
+
+    # Test pre-configured token filter too:
+    - do:
+        indices.analyze:
+            body:
+                text: های
+                tokenizer: keyword
+                filter: [persian_normalization]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: هاي }
+
+---
+"scandinavian_normalization":
+    - do:
+        indices.create:
+            index: test
+            body:
+                settings:
+                    analysis:
+                        filter:
+                            my_scandinavian_normalization:
+                                type: scandinavian_normalization
+    - do:
+        indices.analyze:
+            index: test
+            body:
+                text: ö
+                tokenizer: keyword
+                filter: [my_scandinavian_normalization]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: ø }
+
+    # Test pre-configured token filter too:
+    - do:
+        indices.analyze:
+            body:
+                text: ö
+                tokenizer: keyword
+                filter: [scandinavian_normalization]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: ø }
+
+---
+"serbian_normalization":
+    - do:
+        indices.create:
+            index: test
+            body:
+                settings:
+                    analysis:
+                        filter:
+                            my_serbian_normalization:
+                                type: serbian_normalization
+    - do:
+        indices.analyze:
+            index: test
+            body:
+                text: абвгдђежзијклљмнњопрстћуфхцчџш
+                tokenizer: keyword
+                filter: [my_serbian_normalization]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: abvgddjezzijklljmnnjoprstcufhccdzs }
+
+    # Test pre-configured token filter too:
+    - do:
+        indices.analyze:
+            body:
+                text: абвгдђежзијклљмнњопрстћуфхцчџш
+                tokenizer: keyword
+                filter: [serbian_normalization]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: abvgddjezzijklljmnnjoprstcufhccdzs }
+
+---
+"sorani_normalization":
+    - do:
+        indices.create:
+            index: test
+            body:
+                settings:
+                    analysis:
+                        filter:
+                            my_sorani_normalization:
+                                type: sorani_normalization
+    - do:
+        indices.analyze:
+            index: test
+            body:
+                text: ي
+                tokenizer: keyword
+                filter: [my_sorani_normalization]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: ی }
+
+    # Test pre-configured token filter too:
+    - do:
+        indices.analyze:
+            body:
+                text: ي
+                tokenizer: keyword
+                filter: [sorani_normalization]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: ی }
+
+---
+"cjk_width":
+    - do:
+        indices.create:
+            index: test
+            body:
+                settings:
+                    analysis:
+                        filter:
+                            my_cjk_width:
+                                type: cjk_width
+    - do:
+        indices.analyze:
+            index: test
+            body:
+                text: カタカナ
+                tokenizer: keyword
+                filter: [my_cjk_width]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: カタカナ }
+
+    # Test pre-configured token filter too:
+    - do:
+        indices.analyze:
+            body:
+                text: カタカナ
+                tokenizer: keyword
+                filter: [cjk_width]
+    - length: { tokens: 1 }
+    - match: { tokens.0.token: カタカナ }
+
+---
+"cjk_bigram":
+    - do:
+        indices.create:
+            index: test
+            body:
+                settings:
+                    analysis:
+                        filter:
+                            my_cjk_bigram:
+                                type: cjk_bigram
+    - do:
+        indices.analyze:
+            index: test
+            body:
+                text: 多くの学生が試験に落ちた
+                tokenizer: standard
+                filter: [my_cjk_bigram]
+    - length: { tokens: 11 }
+    - match: { tokens.0.token: 多く }
+    - match: { tokens.1.token: くの }
+    - match: { tokens.2.token: の学 }
+    - match: { tokens.3.token: 学生 }
+    - match: { tokens.4.token: 生が }
+    - match: { tokens.5.token: が試 }
+    - match: { tokens.6.token: 試験 }
+    - match: { tokens.7.token: 験に }
+    - match: { tokens.8.token: に落 }
+    - match: { tokens.9.token: 落ち }
+    - match: { tokens.10.token: ちた }
+
+    # Test pre-configured token filter too:
+    - do:
+        indices.analyze:
+            body:
+                text: 多くの学生が試験に落ちた
+                tokenizer: standard
+                filter: [cjk_bigram]
+    - length: { tokens: 11 }
+    - match: { tokens.0.token: 多く }
+    - match: { tokens.1.token: くの }
+    - match: { tokens.2.token: の学 }
+    - match: { tokens.3.token: 学生 }
+    - match: { tokens.4.token: 生が }
+    - match: { tokens.5.token: が試 }
+    - match: { tokens.6.token: 試験 }
+    - match: { tokens.7.token: 験に }
+    - match: { tokens.8.token: に落 }
+    - match: { tokens.9.token: 落ち }
+    - match: { tokens.10.token: ちた }
AnalysisTestsHelper.java

@@ -35,13 +35,15 @@ import java.util.Arrays;

 public class AnalysisTestsHelper {

-    public static ESTestCase.TestAnalysis createTestAnalysisFromClassPath(final Path baseDir, final String resource) throws IOException {
+    public static ESTestCase.TestAnalysis createTestAnalysisFromClassPath(final Path baseDir,
+                                                                          final String resource,
+                                                                          final AnalysisPlugin... plugins) throws IOException {
         final Settings settings = Settings.builder()
                 .loadFromStream(resource, AnalysisTestsHelper.class.getResourceAsStream(resource))
                 .put(Environment.PATH_HOME_SETTING.getKey(), baseDir.toString())
                 .build();

-        return createTestAnalysisFromSettings(settings);
+        return createTestAnalysisFromSettings(settings, plugins);
     }

     public static ESTestCase.TestAnalysis createTestAnalysisFromSettings(
AnalysisFactoryTestCase.java

@@ -24,22 +24,16 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
 import org.apache.lucene.analysis.util.TokenizerFactory;
 import org.elasticsearch.common.collect.MapBuilder;
 import org.elasticsearch.index.analysis.ApostropheFilterFactory;
-import org.elasticsearch.index.analysis.ArabicNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.ArabicStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.BrazilianStemTokenFilterFactory;
-import org.elasticsearch.index.analysis.CJKBigramFilterFactory;
-import org.elasticsearch.index.analysis.CJKWidthFilterFactory;
 import org.elasticsearch.index.analysis.ClassicFilterFactory;
 import org.elasticsearch.index.analysis.ClassicTokenizerFactory;
 import org.elasticsearch.index.analysis.CzechStemTokenFilterFactory;
 import org.elasticsearch.index.analysis.DecimalDigitFilterFactory;
 import org.elasticsearch.index.analysis.DelimitedPayloadTokenFilterFactory;
 import org.elasticsearch.index.analysis.EdgeNGramTokenizerFactory;
-import org.elasticsearch.index.analysis.GermanNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.GermanStemTokenFilterFactory;
-import org.elasticsearch.index.analysis.HindiNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.HunspellTokenFilterFactory;
-import org.elasticsearch.index.analysis.IndicNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.KeepTypesFilterFactory;
 import org.elasticsearch.index.analysis.KeepWordFilterFactory;
 import org.elasticsearch.index.analysis.KeywordTokenizerFactory;
@@ -50,15 +44,11 @@ import org.elasticsearch.index.analysis.MultiTermAwareComponent;
 import org.elasticsearch.index.analysis.NGramTokenizerFactory;
 import org.elasticsearch.index.analysis.PathHierarchyTokenizerFactory;
 import org.elasticsearch.index.analysis.PatternTokenizerFactory;
-import org.elasticsearch.index.analysis.PersianNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.PreConfiguredCharFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenFilter;
 import org.elasticsearch.index.analysis.PreConfiguredTokenizer;
 import org.elasticsearch.index.analysis.ScandinavianFoldingFilterFactory;
-import org.elasticsearch.index.analysis.ScandinavianNormalizationFilterFactory;
-import org.elasticsearch.index.analysis.SerbianNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
-import org.elasticsearch.index.analysis.SoraniNormalizationFilterFactory;
 import org.elasticsearch.index.analysis.StandardTokenFilterFactory;
 import org.elasticsearch.index.analysis.StandardTokenizerFactory;
 import org.elasticsearch.index.analysis.StopTokenFilterFactory;
@@ -131,13 +121,13 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
     static final Map<String,Class<?>> KNOWN_TOKENFILTERS = new MapBuilder<String,Class<?>>()
         // exposed in ES
         .put("apostrophe", ApostropheFilterFactory.class)
-        .put("arabicnormalization", ArabicNormalizationFilterFactory.class)
+        .put("arabicnormalization", MovedToAnalysisCommon.class)
         .put("arabicstem", ArabicStemTokenFilterFactory.class)
         .put("asciifolding", MovedToAnalysisCommon.class)
        .put("brazilianstem", BrazilianStemTokenFilterFactory.class)
         .put("bulgarianstem", MovedToAnalysisCommon.class)
-        .put("cjkbigram", CJKBigramFilterFactory.class)
-        .put("cjkwidth", CJKWidthFilterFactory.class)
+        .put("cjkbigram", MovedToAnalysisCommon.class)
+        .put("cjkwidth", MovedToAnalysisCommon.class)
         .put("classic", ClassicFilterFactory.class)
         .put("commongrams", MovedToAnalysisCommon.class)
         .put("commongramsquery", MovedToAnalysisCommon.class)
@@ -157,15 +147,15 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         .put("germanstem", GermanStemTokenFilterFactory.class)
         .put("germanlightstem", MovedToAnalysisCommon.class)
         .put("germanminimalstem", MovedToAnalysisCommon.class)
-        .put("germannormalization", GermanNormalizationFilterFactory.class)
+        .put("germannormalization", MovedToAnalysisCommon.class)
         .put("greeklowercase", MovedToAnalysisCommon.class)
         .put("greekstem", MovedToAnalysisCommon.class)
-        .put("hindinormalization", HindiNormalizationFilterFactory.class)
+        .put("hindinormalization", MovedToAnalysisCommon.class)
         .put("hindistem", MovedToAnalysisCommon.class)
         .put("hungarianlightstem", MovedToAnalysisCommon.class)
         .put("hunspellstem", HunspellTokenFilterFactory.class)
         .put("hyphenationcompoundword", MovedToAnalysisCommon.class)
-        .put("indicnormalization", IndicNormalizationFilterFactory.class)
+        .put("indicnormalization", MovedToAnalysisCommon.class)
         .put("irishlowercase", MovedToAnalysisCommon.class)
         .put("indonesianstem", MovedToAnalysisCommon.class)
         .put("italianlightstem", MovedToAnalysisCommon.class)
@@ -181,7 +171,7 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         .put("norwegianminimalstem", MovedToAnalysisCommon.class)
         .put("patterncapturegroup", MovedToAnalysisCommon.class)
         .put("patternreplace", MovedToAnalysisCommon.class)
-        .put("persiannormalization", PersianNormalizationFilterFactory.class)
+        .put("persiannormalization", MovedToAnalysisCommon.class)
         .put("porterstem", MovedToAnalysisCommon.class)
         .put("portuguesestem", MovedToAnalysisCommon.class)
         .put("portugueselightstem", MovedToAnalysisCommon.class)
@@ -189,12 +179,12 @@ public abstract class AnalysisFactoryTestCase extends ESTestCase {
         .put("reversestring", MovedToAnalysisCommon.class)
         .put("russianlightstem", MovedToAnalysisCommon.class)
         .put("scandinavianfolding", ScandinavianFoldingFilterFactory.class)
-        .put("scandinaviannormalization", ScandinavianNormalizationFilterFactory.class)
-        .put("serbiannormalization", SerbianNormalizationFilterFactory.class)
+        .put("scandinaviannormalization", MovedToAnalysisCommon.class)
+        .put("serbiannormalization", MovedToAnalysisCommon.class)
         .put("shingle", ShingleTokenFilterFactory.class)
         .put("minhash", MinHashTokenFilterFactory.class)
         .put("snowballporter", MovedToAnalysisCommon.class)
-        .put("soraninormalization", SoraniNormalizationFilterFactory.class)
+        .put("soraninormalization", MovedToAnalysisCommon.class)
         .put("soranistem", MovedToAnalysisCommon.class)
         .put("spanishlightstem", MovedToAnalysisCommon.class)
         .put("standard", StandardTokenFilterFactory.class)