mirror of https://github.com/apache/lucene.git
LUCENE-2399: Add support for ICU's Normalizer2
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@935186 13f79535-47bb-0310-9956-ffa450edef68
parent e18484fa4c
commit 12a7360fef
@@ -119,6 +119,11 @@ New features
  the ability to override any stemmer with a custom dictionary map.
  (Robert Muir, Uwe Schindler, Simon Willnauer)

* LUCENE-2399: Add ICUNormalizer2Filter, which normalizes tokens with ICU's
  Normalizer2. This allows for efficient combinations of normalization and custom
  mappings in addition to standard normalization, and normalization combined
  with unicode case folding. (Robert Muir)

Build

* LUCENE-2124: Moved the JDK-based collation support from contrib/collation

@@ -135,6 +140,8 @@ Build
  core were changed without ant clean. This fix also optimizes the
  dependency management between contribs by a new ANT macro.
  (Uwe Schindler, Shai Erera)

* LUCENE-2399: Upgrade contrib/icu's ICU jar file to ICU 4.4. (Robert Muir)

Optimizations
@@ -0,0 +1,2 @@
AnyObjectId[bd1ef881718dca394dd563279eb015e154b01f90] was removed in git history.
Apache SVN contains full history.

@@ -1,2 +0,0 @@
AnyObjectId[1159f04caacfda235b66d73dc65198fa40153812] was removed in git history.
Apache SVN contains full history.
@@ -0,0 +1,92 @@
package org.apache.lucene.analysis.icu;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.Normalizer2;

/**
 * Normalize token text with ICU's {@link com.ibm.icu.text.Normalizer2}
 * <p>
 * With this filter, you can normalize text in the following ways:
 * <ul>
 *  <li> NFKC Normalization, Case Folding, and removing Ignorables (the default)
 *  <li> Using a standard Normalization mode (NFC, NFD, NFKC, NFKD)
 *  <li> Based on rules from a custom normalization mapping.
 * </ul>
 * <p>
 * If you use the defaults, this filter is a simple way to standardize Unicode text
 * in a language-independent way for search:
 * <ul>
 *  <li> The case folding that it does can be seen as a replacement for
 *  LowerCaseFilter: For example, it handles cases such as the Greek sigma, so that
 *  "Μάϊος" and "ΜΆΪΟΣ" will match correctly.
 *  <li> The normalization standardizes different forms of the same
 *  character in Unicode. For example, CJK full-width numbers will be standardized
 *  to their ASCII forms.
 *  <li> Ignorables such as Zero-Width Joiner and Variation Selectors are removed.
 *  These are typically modifier characters that affect display.
 * </ul>
 *
 * @see com.ibm.icu.text.Normalizer2
 * @see com.ibm.icu.text.FilteredNormalizer2
 */
public class ICUNormalizer2Filter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final Normalizer2 normalizer;
  private final StringBuilder buffer = new StringBuilder();

  /**
   * Create a new Normalizer2Filter that combines NFKC normalization, Case
   * Folding, and removes Default Ignorables (NFKC_Casefold)
   */
  public ICUNormalizer2Filter(TokenStream input) {
    this(input, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
  }

  /**
   * Create a new Normalizer2Filter with the specified Normalizer2
   * @param input stream
   * @param normalizer normalizer to use
   */
  public ICUNormalizer2Filter(TokenStream input, Normalizer2 normalizer) {
    super(input);
    this.normalizer = normalizer;
  }

  @Override
  public final boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      if (normalizer.quickCheck(termAtt) != Normalizer.YES) {
        buffer.setLength(0);
        buffer.append(termAtt.buffer(), 0, termAtt.length());
        termAtt.setEmpty();
        normalizer.normalize(buffer, termAtt);
      }
      return true;
    } else {
      return false;
    }
  }
}
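As context for the new filter (and not part of the commit itself), a minimal usage sketch following the same Analyzer/WhitespaceTokenizer wiring that the test below uses. The class name NFKCOnlyAnalyzer and the use of Version.LUCENE_CURRENT are illustrative assumptions; passing an "nfkc" Normalizer2 instance to the two-argument constructor requests plain NFKC normalization without the default case folding.

// Illustrative sketch, not part of this commit.
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
import org.apache.lucene.util.Version;

import com.ibm.icu.text.Normalizer2;

public class NFKCOnlyAnalyzer extends Analyzer {
  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    // "nfkc" in COMPOSE mode performs standard NFKC normalization; unlike the
    // filter's default ("nfkc_cf"), it does not case-fold or strip ignorables.
    return new ICUNormalizer2Filter(
        new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader),
        Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE));
  }
}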
@@ -0,0 +1,78 @@
package org.apache.lucene.analysis.icu;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

import com.ibm.icu.text.Normalizer2;

/**
 * Tests the ICUNormalizer2Filter
 */
public class TestICUNormalizer2Filter extends BaseTokenStreamTestCase {

  public void testDefaults() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new ICUNormalizer2Filter(
            new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader));
      }
    };

    // case folding
    assertAnalyzesTo(a, "This is a test", new String[] { "this", "is", "a", "test" });

    // case folding
    assertAnalyzesTo(a, "Ruß", new String[] { "russ" });

    // case folding
    assertAnalyzesTo(a, "ΜΆΪΟΣ", new String[] { "μάϊοσ" });
    assertAnalyzesTo(a, "Μάϊος", new String[] { "μάϊοσ" });

    // supplementary case folding
    assertAnalyzesTo(a, "𐐖", new String[] { "𐐾" });

    // normalization
    assertAnalyzesTo(a, "ﴳﴺﰧ", new String[] { "طمطمطم" });

    // removal of default ignorables
    assertAnalyzesTo(a, "क्ष", new String[] { "क्ष" });
  }

  public void testAlternate() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new ICUNormalizer2Filter(
            new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader),
            /* specify nfc with decompose to get nfd */
            Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE));
      }
    };

    // decompose EAcute into E + combining Acute
    assertAnalyzesTo(a, "\u00E9", new String[] { "\u0065\u0301" });
  }
}
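The filter's Javadoc also points at com.ibm.icu.text.FilteredNormalizer2, which these tests do not exercise. Below is a hedged sketch of that combination, assuming ICU's FilteredNormalizer2 and UnicodeSet APIs; the class name and the "[:age=3.2:]" property set are only examples. The idea is to normalize just the characters inside a UnicodeSet and pass everything else through untouched.

// Illustrative sketch, not part of this commit: FilteredNormalizer2 limits
// normalization to the characters contained in a UnicodeSet.
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
import org.apache.lucene.util.Version;

import com.ibm.icu.text.FilteredNormalizer2;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UnicodeSet;

public class FilteredNFKCCasefoldAnalyzer extends Analyzer {
  // Only characters in this set are normalized; anything outside it is left as-is.
  // The Unicode 3.2 age restriction is just an example property set.
  private static final UnicodeSet FILTER = new UnicodeSet("[:age=3.2:]").freeze();

  private static final Normalizer2 NORMALIZER = new FilteredNormalizer2(
      Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE), FILTER);

  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    return new ICUNormalizer2Filter(
        new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader), NORMALIZER);
  }
}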