Fix settings processing in WordDelimiterTokenFilterFactory

Igor Motov 2013-02-05 07:38:11 -05:00
parent 19295280d9
commit 8277833f8d
2 changed files with 143 additions and 13 deletions

src/main/java/org/elasticsearch/index/analysis/WordDelimiterTokenFilterFactory.java View File

@@ -61,23 +61,23 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
         }
         int flags = 0;
         // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
-        flags |= getFlag(GENERATE_WORD_PARTS ,settings, "generate_word_parts", true);
+        flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
         // If set, causes number subwords to be generated: "500-42" => "500" "42"
-        flags |= getFlag(GENERATE_WORD_PARTS, settings,"generate_number_parts", true);
+        flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
         // If set, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
-        flags |= getFlag(CATENATE_WORDS, settings,"catenate_words", false);
+        flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
         // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042"
-        flags |= getFlag(CATENATE_NUMBERS, settings,"catenate_numbers", false);
+        flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
         // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
-        flags |= getFlag(CATENATE_ALL,settings,"catenate_all", false);
+        flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
         // If set, causes "PowerShot" to become two tokens ("Power-Shot" remains two parts regardless)
-        flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings,"split_on_case_change", true);
+        flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
         // If set, includes original words in subwords: "500-42" => "500" "42" "500-42"
-        flags |= getFlag(PRESERVE_ORIGINAL, settings,"preserve_original", false);
+        flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
         // If set, causes "j2se" to become three tokens: "j" "2" "se"
-        flags |= getFlag(SPLIT_ON_NUMERICS, settings,"split_on_numerics", true);
+        flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
         // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
-        flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings,"stem_english_possessive", true);
+        flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
         // If not null, a set of tokens to protect from being delimited
         Set<?> protectedWords = Analysis.getWordSet(env, settings, "protected_words", version);
         this.protoWords = protectedWords == null ? null : CharArraySet.copy(Lucene.VERSION, protectedWords);
@@ -91,12 +91,12 @@ public class WordDelimiterTokenFilterFactory extends AbstractTokenFilterFactory
                 flags,
                 protoWords);
     }
 
     public int getFlag(int flag, Settings settings, String key, boolean defaultValue) {
-        if (settings.getAsBoolean(key, true)) {
-            return 0;
+        if (settings.getAsBoolean(key, defaultValue)) {
+            return flag;
         }
-        return flag;
+        return 0;
     }
 
     // source => type
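
The diff above fixes two problems: the "generate_number_parts" setting was applied to the GENERATE_WORD_PARTS flag, and getFlag both ignored its defaultValue argument and inverted its result, so a flag bit was cleared exactly when its setting was true. A minimal, self-contained sketch of the corrected logic — the class name and the plain Map standing in for the Elasticsearch Settings object are illustrative only; the flag constants mirror Lucene's WordDelimiterFilter bit values:

import java.util.HashMap;
import java.util.Map;

public class GetFlagSketch {
    // Bit-flag constants as in Lucene's WordDelimiterFilter.
    static final int GENERATE_WORD_PARTS = 1;
    static final int GENERATE_NUMBER_PARTS = 2;
    static final int CATENATE_WORDS = 4;

    // Corrected behavior: fall back to the per-flag default when the key
    // is absent, and return the flag bit only when the value is true.
    static int getFlag(int flag, Map<String, Boolean> settings, String key, boolean defaultValue) {
        Boolean value = settings.get(key);
        return (value != null ? value : defaultValue) ? flag : 0;
    }

    public static void main(String[] args) {
        Map<String, Boolean> settings = new HashMap<>();
        settings.put("catenate_words", true);

        int flags = 0;
        // Absent key: falls back to its default (true), so the bit is set.
        flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
        // Explicit true: bit is set; the pre-fix code returned 0 here.
        flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);

        System.out.println(flags); // prints 5 (GENERATE_WORD_PARTS | CATENATE_WORDS)
    }
}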

src/test/java/org/elasticsearch/test/unit/index/analysis/WordDelimiterTokenFilterFactoryTests.java View File

@@ -0,0 +1,130 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.test.unit.index.analysis;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.util.Version;
+import org.elasticsearch.index.analysis.AnalysisService;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.testng.annotations.Test;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
+
+public class WordDelimiterTokenFilterFactoryTests {
+
+    @Test
+    public void testDefault() throws IOException {
+        AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settingsBuilder()
+                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
+                .build());
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_word_delimiter");
+        String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
+        String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"};
+        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_41, new StringReader(source));
+        AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+    }
+
+    @Test
+    public void testCatenateWords() throws IOException {
+        AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settingsBuilder()
+                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
+                .put("index.analysis.filter.my_word_delimiter.catenate_words", "true")
+                .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
+                .build());
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_word_delimiter");
+        String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
+        String[] expected = new String[]{"PowerShot", "500", "42", "wifi", "wifi", "4000", "j", "2", "se", "ONeil"};
+        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_41, new StringReader(source));
+        AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+    }
+
+    @Test
+    public void testCatenateNumbers() throws IOException {
+        AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settingsBuilder()
+                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
+                .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
+                .put("index.analysis.filter.my_word_delimiter.catenate_numbers", "true")
+                .build());
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_word_delimiter");
+        String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
+        String[] expected = new String[]{"Power", "Shot", "50042", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil"};
+        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_41, new StringReader(source));
+        AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+    }
+
+    @Test
+    public void testCatenateAll() throws IOException {
+        AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settingsBuilder()
+                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
+                .put("index.analysis.filter.my_word_delimiter.generate_word_parts", "false")
+                .put("index.analysis.filter.my_word_delimiter.generate_number_parts", "false")
+                .put("index.analysis.filter.my_word_delimiter.catenate_all", "true")
+                .build());
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_word_delimiter");
+        String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
+        String[] expected = new String[]{"PowerShot", "50042", "wifi", "wifi4000", "j2se", "ONeil"};
+        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_41, new StringReader(source));
+        AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+    }
+
+    @Test
+    public void testSplitOnCaseChange() throws IOException {
+        AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settingsBuilder()
+                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
+                .put("index.analysis.filter.my_word_delimiter.split_on_case_change", "false")
+                .build());
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_word_delimiter");
+        String source = "PowerShot";
+        String[] expected = new String[]{"PowerShot"};
+        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_41, new StringReader(source));
+        AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+    }
+
+    @Test
+    public void testPreserveOriginal() throws IOException {
+        AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settingsBuilder()
+                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
+                .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
+                .build());
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_word_delimiter");
+        String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
+        String[] expected = new String[]{"PowerShot", "Power", "Shot", "500-42", "500", "42", "wi-fi", "wi", "fi", "wi-fi-4000", "wi", "fi", "4000", "j2se", "j", "2", "se", "O'Neil's", "O", "Neil"};
+        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_41, new StringReader(source));
+        AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+    }
+
+    @Test
+    public void testStemEnglishPossessive() throws IOException {
+        AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settingsBuilder()
+                .put("index.analysis.filter.my_word_delimiter.type", "word_delimiter")
+                .put("index.analysis.filter.my_word_delimiter.stem_english_possessive", "false")
+                .build());
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("my_word_delimiter");
+        String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
+        String[] expected = new String[]{"Power", "Shot", "500", "42", "wi", "fi", "wi", "fi", "4000", "j", "2", "se", "O", "Neil", "s"};
+        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_41, new StringReader(source));
+        AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+    }
+}
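
The tests lean on AnalysisTestsHelper, which is not part of this diff. A plausible sketch of what its assertSimpleTSOutput check amounts to — the class name and method body here are assumptions; only the Lucene 4.x TokenStream contract and the TestNG asserts are taken as given:

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;

public class AnalysisAssertSketch {
    // Drains the token stream and compares each emitted term, in order,
    // against the expected array; fails on extra or missing tokens.
    public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
        stream.reset();
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        int i = 0;
        while (stream.incrementToken()) {
            assertTrue(i < expected.length, "got more tokens than expected");
            assertEquals(termAttr.toString(), expected[i], "term " + i);
            i++;
        }
        assertEquals(i, expected.length, "not enough tokens produced");
        stream.end();
        stream.close();
    }
}

With the pre-fix getFlag, testDefault above could not pass: every default-true flag ended up cleared and every explicitly enabled one disabled, which is exactly the settings-processing bug this commit fixes.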