mirror of https://github.com/apache/lucene.git
LUCENE-9853: Use CJKWidthCharFilter as the default character width normalizer in JapaneseAnalyzer (#26)
parent 3ed87c867a
commit ea74ffb984
lucene/CHANGES.txt
@@ -95,6 +95,9 @@ API Changes
 * LUCENE-9796: SortedDocValues no longer extends BinaryDocValues, as binaryValue() was not performant.
   See MIGRATE.md for details. (Robert Muir)
 
+* LUCENE-9853: JapaneseAnalyzer should use CJKWidthCharFilter for full-width and half-width character normalization.
+  (Tomoko Uchida)
+
 Improvements
 
 * LUCENE-9687: Hunspell support improvements: add API for spell-checking and suggestions, support compound words,
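To make the behavioral change concrete, here is a minimal sketch of the analyzer's new behavior (the sample text and expected tokens are illustrative, not taken from this commit): width variants are now folded by a CharFilter before JapaneseTokenizer runs, so segmentation operates on the normalized text.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class WidthNormalizationDemo {
  public static void main(String[] args) throws Exception {
    // JapaneseAnalyzer now folds width variants in initReader(), before
    // tokenization, instead of with CJKWidthFilter after tokenization.
    try (Analyzer a = new JapaneseAnalyzer()) {
      // Full-width Latin "Ｌｕｃｅｎｅ" and half-width katakana "ｶﾀｶﾅ"
      TokenStream ts = a.tokenStream("f", "Ｌｕｃｅｎｅ ｶﾀｶﾅ");
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term); // e.g. "lucene", then "カタカナ"
      }
      ts.end();
      ts.close();
    }
  }
}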
lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseAnalyzer.java
@@ -17,6 +17,7 @@
 package org.apache.lucene.analysis.ja;
 
 import java.io.IOException;
+import java.io.Reader;
 import java.util.HashSet;
 import java.util.Set;
 import org.apache.lucene.analysis.CharArraySet;
@@ -25,7 +26,7 @@ import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.cjk.CJKWidthFilter;
+import org.apache.lucene.analysis.cjk.CJKWidthCharFilter;
 import org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode;
 import org.apache.lucene.analysis.ja.dict.UserDictionary;
 
@@ -95,7 +96,6 @@ public class JapaneseAnalyzer extends StopwordAnalyzerBase {
     Tokenizer tokenizer = new JapaneseTokenizer(userDict, true, true, mode);
     TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
     stream = new JapanesePartOfSpeechStopFilter(stream, stoptags);
-    stream = new CJKWidthFilter(stream);
     stream = new StopFilter(stream, stopwords);
     stream = new JapaneseKatakanaStemFilter(stream);
     stream = new LowerCaseFilter(stream);
@@ -104,8 +104,17 @@ public class JapaneseAnalyzer extends StopwordAnalyzerBase {
 
   @Override
   protected TokenStream normalize(String fieldName, TokenStream in) {
-    TokenStream result = new CJKWidthFilter(in);
-    result = new LowerCaseFilter(result);
+    TokenStream result = new LowerCaseFilter(in);
     return result;
   }
+
+  @Override
+  protected Reader initReader(String fieldName, Reader reader) {
+    return new CJKWidthCharFilter(reader);
+  }
+
+  @Override
+  protected Reader initReaderForNormalization(String fieldName, Reader reader) {
+    return new CJKWidthCharFilter(reader);
+  }
 }
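Two design points in this file are worth spelling out. Replacing CJKWidthFilter (a TokenFilter) with CJKWidthCharFilter (a CharFilter) moves width folding ahead of tokenization, so JapaneseTokenizer segments the folded text while the CharFilter's offset correction keeps highlighting anchored to the original input. Overriding initReaderForNormalization() applies the same folding to query-time term normalization, so the index-time and query-time paths agree. The same pattern outside JapaneseAnalyzer looks like the following sketch (MyWidthFoldingAnalyzer is hypothetical; the Lucene classes are real):

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKWidthCharFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

public class MyWidthFoldingAnalyzer extends Analyzer {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new StandardTokenizer();
    return new TokenStreamComponents(source, new LowerCaseFilter(source));
  }

  @Override
  protected TokenStream normalize(String fieldName, TokenStream in) {
    // Width folding no longer happens here; it lives in the Reader layer.
    return new LowerCaseFilter(in);
  }

  @Override
  protected Reader initReader(String fieldName, Reader reader) {
    // Fold width variants before the tokenizer sees the text; the
    // CharFilter corrects offsets back to the original input.
    return new CJKWidthCharFilter(reader);
  }

  @Override
  protected Reader initReaderForNormalization(String fieldName, Reader reader) {
    // Apply the same folding when Lucene normalizes query terms.
    return new CJKWidthCharFilter(reader);
  }
}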
lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestJapaneseAnalyzer.java
@@ -146,6 +146,22 @@ public class TestJapaneseAnalyzer extends BaseTokenStreamTestCase {
     a.close();
   }
 
+  public void testCharWidthNormalization() throws Exception {
+    final Analyzer a =
+        new JapaneseAnalyzer(
+            TestJapaneseTokenizer.readDict(),
+            Mode.SEARCH,
+            JapaneseAnalyzer.getDefaultStopSet(),
+            JapaneseAnalyzer.getDefaultStopTags());
+    assertTokenStreamContents(
+        a.tokenStream("foo", "新橋６－２０－１"),
+        new String[] {"新橋", "6", "20", "1"},
+        new int[] {0, 2, 4, 7},
+        new int[] {2, 3, 6, 8},
+        8);
+    a.close();
+  }
+
   // LUCENE-3897: this string (found by running all jawiki
   // XML through JapaneseAnalyzer) caused AIOOBE
   public void testCuriousString() throws Exception {
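A note on the new test: the expected terms are the folded half-width digits, while the expected offsets index into the original full-width string, which is exactly what the CharFilter's offset correction guarantees. For a standalone view of the folding itself, a minimal sketch (the sample string is illustrative):

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.cjk.CJKWidthCharFilter;

public class CJKWidthCharFilterDemo {
  public static void main(String[] args) throws Exception {
    // Full-width ASCII variants fold to half-width; half-width katakana
    // (including voiced-mark pairs like ｶ+ﾞ) fold to full-width.
    Reader in = new CJKWidthCharFilter(new StringReader("６－２０－１ ｶﾞｷﾞｸﾞ"));
    StringBuilder out = new StringBuilder();
    char[] buf = new char[64];
    for (int n = in.read(buf); n != -1; n = in.read(buf)) {
      out.append(buf, 0, n);
    }
    System.out.println(out); // e.g. "6-20-1 ガギグ"
  }
}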