Fix settings processing in WordDelimiterTokenFilterFactory
commit 8277833f8d (parent 19295280d9)
@@ -61,23 +61,23 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
         }
         int flags = 0;
         // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
-        flags |= getFlag(GENERATE_WORD_PARTS ,settings, "generate_word_parts", true);
+        flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
         // If set, causes number subwords to be generated: "500-42" => "500" "42"
-        flags |= getFlag(GENERATE_WORD_PARTS, settings,"generate_number_parts", true);
+        flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
         // 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
-        flags |= getFlag(CATENATE_WORDS, settings,"catenate_words", false);
+        flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
         // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042"
-        flags |= getFlag(CATENATE_NUMBERS, settings,"catenate_numbers", false);
+        flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
         // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
-        flags |= getFlag(CATENATE_ALL,settings,"catenate_all", false);
+        flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
         // 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards)
-        flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings,"split_on_case_change", true);
+        flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
         // If set, includes original words in subwords: "500-42" => "500" "42" "500-42"
-        flags |= getFlag(PRESERVE_ORIGINAL, settings,"preserve_original", false);
+        flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
         // 1, causes "j2se" to be three tokens; "j" "2" "se"
-        flags |= getFlag(SPLIT_ON_NUMERICS, settings,"split_on_numerics", true);
+        flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
         // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
-        flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings,"stem_english_possessive", true);
+        flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
         // If not null is the set of tokens to protect from being delimited
         Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words", version);
         this.protoWords = protectedWords == null ? null : CharArraySet.copy(Lucene.VERSION, protectedWords);
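
Most of the changes in this first hunk are whitespace cleanups around the getFlag arguments, but the generate_number_parts line fixed a real copy-paste bug: the old code OR-ed GENERATE_WORD_PARTS into the bitmask a second time, so GENERATE_NUMBER_PARTS was never set. A minimal sketch of why that was a silent no-op; the bit values here are illustrative stand-ins, not copied from Lucene (which defines its WordDelimiterFilter flags as distinct bits):

public class FlagMaskSketch {
    // Stand-in bit values for illustration only.
    static final int GENERATE_WORD_PARTS = 1;
    static final int GENERATE_NUMBER_PARTS = 2;

    public static void main(String[] args) {
        int flags = 0;
        flags |= GENERATE_WORD_PARTS;   // "generate_word_parts"
        flags |= GENERATE_WORD_PARTS;   // old bug: same bit OR-ed again, a no-op
        System.out.println((flags & GENERATE_NUMBER_PARTS) != 0); // false: number parts never enabled
        flags |= GENERATE_NUMBER_PARTS; // the fix ORs the correct bit
        System.out.println((flags & GENERATE_NUMBER_PARTS) != 0); // true
    }
}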
@@ -91,12 +91,12 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
                 flags,
                 protoWords);
     }

     public int getFlag(int flag, Settings settings, String key, boolean defaultValue) {
-        if (settings.getAsBoolean(key, true)) {
-            return 0;
+        if (settings.getAsBoolean(key, defaultValue)) {
+            return flag;
         }
-        return flag;
+        return 0;
     }

     // source => type
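
The second hunk is the heart of the fix. The old getFlag had two defects: it ignored the caller's defaultValue (hard-coding true into getAsBoolean), and its branches were inverted, so an enabled setting returned 0 while a disabled one returned the flag. A standalone sketch of the two behaviors, modeling settings.getAsBoolean with a nullable Boolean; the helper names and bit value are illustrative, not the ES class itself:

public class GetFlagSketch {
    // Old, broken logic: default hard-coded to true, branches inverted.
    static int getFlagOld(int flag, Boolean setting) {
        boolean value = setting != null ? setting : true; // models getAsBoolean(key, true)
        if (value) {
            return 0;    // an enabled setting cleared its flag...
        }
        return flag;     // ...and a disabled one set it.
    }

    // Fixed logic, as in the commit: honor defaultValue, return the flag when enabled.
    static int getFlagFixed(int flag, Boolean setting, boolean defaultValue) {
        boolean value = setting != null ? setting : defaultValue; // models getAsBoolean(key, defaultValue)
        if (value) {
            return flag;
        }
        return 0;
    }

    public static void main(String[] args) {
        final int CATENATE_WORDS = 4; // stand-in bit value
        System.out.println(getFlagOld(CATENATE_WORDS, true));          // 0 -- enabled setting lost
        System.out.println(getFlagFixed(CATENATE_WORDS, true, false)); // 4 -- enabled setting kept
        System.out.println(getFlagFixed(CATENATE_WORDS, null, false)); // 0 -- default honored
    }
}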
@@ -0,0 +1,130 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.test.unit.index.analysis;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.util.Version;
+import org.elasticsearch.index.analysis.AnalysisService;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.testng.annotations.Test;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
+
+public class WordDelimiterTokenFilterFactoryTests {
+
+    @Test
+    public void testDefault() throws IOException {
+        AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settingsBuilder()
+                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
+                .build());
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_word_delimiter");
+        String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
+        String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"};
+        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_41, new StringReader(source));
+        AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+    }
+
+    @Test
+    public void testCatenateWords() throws IOException {
+        AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settingsBuilder()
+                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
+                .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
+                .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
+                .build());
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_word_delimiter");
+        String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
+        String[] expected = new String[]{"PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil"};
+        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_41, new StringReader(source));
+        AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+    }
+
+    @Test
+    public void testCatenateNumbers() throws IOException {
+        AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settingsBuilder()
+                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
+                .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
+                .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
+                .build());
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_word_delimiter");
+        String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
+        String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"};
+        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_41, new StringReader(source));
+        AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+    }
+
+    @Test
+    public void testCatenateAll() throws IOException {
+        AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settingsBuilder()
+                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
+                .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
+                .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
+                .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
+                .build());
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_word_delimiter");
+        String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
+        String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"};
+        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_41, new StringReader(source));
+        AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+    }
+
+    @Test
+    public void testSplitOnCaseChange() throws IOException {
+        AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settingsBuilder()
+                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
+                .put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false")
+                .build());
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_word_delimiter");
+        String source = "PowerShot";
+        String[] expected = new String[]{"PowerShot"};
+        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_41, new StringReader(source));
+        AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+    }
+
+    @Test
+    public void testPreserveOriginal() throws IOException {
+        AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settingsBuilder()
+                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
+                .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
+                .build());
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_word_delimiter");
+        String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
+        String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"};
+        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_41, new StringReader(source));
+        AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+    }
+
+    @Test
+    public void testStemEnglishPossessive() throws IOException {
+        AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settingsBuilder()
+                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
+                .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
+                .build());
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_word_delimiter");
+        String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
+        String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil", "s"};
+        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_41, new StringReader(source));
+        AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+    }
+
+}
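
The new tests above drive the factory purely through index settings, which is exactly the path the broken getFlag corrupted. For orientation, a sketch of how the same filter might be wired into a custom analyzer with this generation's settings builder; the analyzer and filter names are made up, and the putArray call is an assumption about this API generation rather than a documented recipe:

import org.elasticsearch.common.settings.Settings;

import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;

public class WordDelimiterSettingsSketch {
    public static void main(String[] args) {
        // Illustrative names only ("my_analyzer", "my_word_delimiter").
        Settings settings = settingsBuilder()
                .put("index.analysis.analyzer.my_analyzer.tokenizer", "whitespace")
                .putArray("index.analysis.analyzer.my_analyzer.filter", "my_word_delimiter")
                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
                .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
                .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
                .build();
        System.out.println(settings.getAsMap());
    }
}

With the fixed getFlag, catenate_all set to "true" actually ORs CATENATE_ALL into the flags; under the old logic an explicitly enabled setting yielded 0.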