LUCENE-2399: Add support for ICU's Normalizer2

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@935186 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2010-04-17 15:07:34 +00:00
parent e18484fa4c
commit 12a7360fef
5 changed files with 179 additions and 2 deletions


@@ -119,6 +119,11 @@ New features
  the ability to override any stemmer with a custom dictionary map.
  (Robert Muir, Uwe Schindler, Simon Willnauer)

* LUCENE-2399: Add ICUNormalizer2Filter, which normalizes tokens with ICU's
  Normalizer2. In addition to the standard normalization forms, this allows
  efficient combinations of normalization with custom mappings, and of
  normalization with Unicode case folding. (Robert Muir)
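
A rough sketch, not part of the patch, of the "normalization plus custom mappings" combination this entry describes, using ICU4J 4.4's Normalizer2 API. The analyzer class name, the custom.nrm resource (a binary mapping that would be compiled offline with ICU's gennorm2 tool), and the Version constant are illustrative assumptions:

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
import org.apache.lucene.util.Version;
import com.ibm.icu.text.Normalizer2;

/** Illustrative analyzer: custom mappings and standard composition in one Normalizer2 pass. */
public class CustomNormalizationAnalyzer extends Analyzer {
  // "custom.nrm" is a hypothetical binary mapping, compiled offline with ICU's gennorm2 tool
  private static final Normalizer2 CUSTOM = Normalizer2.getInstance(
      CustomNormalizationAnalyzer.class.getResourceAsStream("custom.nrm"),
      "custom", Normalizer2.Mode.COMPOSE);

  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    // a single pass applies both the custom mappings and composition
    return new ICUNormalizer2Filter(
        new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader), CUSTOM);
  }
}
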
Build
* LUCENE-2124: Moved the JDK-based collation support from contrib/collation
@@ -135,6 +140,8 @@ Build
  core were changed without ant clean. This fix also optimizes the
  dependency management between contribs by a new ANT macro.
  (Uwe Schindler, Shai Erera)

* LUCENE-2399: Upgrade contrib/icu's ICU jar file to ICU 4.4. (Robert Muir)

Optimizations


@@ -0,0 +1,2 @@
AnyObjectId[bd1ef881718dca394dd563279eb015e154b01f90] was removed in git history.
Apache SVN contains full history.


@@ -1,2 +0,0 @@
AnyObjectId[1159f04caacfda235b66d73dc65198fa40153812] was removed in git history.
Apache SVN contains full history.


@@ -0,0 +1,92 @@
package org.apache.lucene.analysis.icu;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.Normalizer2;
/**
* Normalize token text with ICU's {@link com.ibm.icu.text.Normalizer2}
* <p>
* With this filter, you can normalize text in the following ways:
* <ul>
* <li> NFKC Normalization, Case Folding, and removing Ignorables (the default)
* <li> Using a standard Normalization mode (NFC, NFD, NFKC, NFKD)
* <li> Based on rules from a custom normalization mapping.
* </ul>
* <p>
* If you use the defaults, this filter is a simple way to standardize Unicode text
* in a language-independent way for search:
* <ul>
* <li> The case folding that it does can be seen as a replacement for
* LowerCaseFilter: For example, it handles cases such as the Greek sigma, so that
* "Μάϊος" and "ΜΆΪΟΣ" will match correctly.
* <li> The normalization will standardizes different forms of the same
* character in Unicode. For example, CJK full-width numbers will be standardized
* to their ASCII forms.
* <li> Ignorables such as Zero-Width Joiner and Variation Selectors are removed.
* These are typically modifier characters that affect display.
* </ul>
*
* @see com.ibm.icu.text.Normalizer2
* @see com.ibm.icu.text.FilteredNormalizer2
*/
public class ICUNormalizer2Filter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final Normalizer2 normalizer;
  private final StringBuilder buffer = new StringBuilder();

  /**
   * Create a new Normalizer2Filter that combines NFKC normalization, Case
   * Folding, and removes Default Ignorables (NFKC_Casefold)
   */
  public ICUNormalizer2Filter(TokenStream input) {
    this(input, Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
  }

  /**
   * Create a new Normalizer2Filter with the specified Normalizer2
   * @param input stream
   * @param normalizer normalizer to use
   */
  public ICUNormalizer2Filter(TokenStream input, Normalizer2 normalizer) {
    super(input);
    this.normalizer = normalizer;
  }

  @Override
  public final boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      // quickCheck lets us skip the copy and normalization entirely when the
      // term text is already in the normalizer's target form
      if (normalizer.quickCheck(termAtt) != Normalizer.YES) {
        buffer.setLength(0);
        buffer.append(termAtt.buffer(), 0, termAtt.length());
        termAtt.setEmpty();
        normalizer.normalize(buffer, termAtt);
      }
      return true;
    } else {
      return false;
    }
  }
}
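
The @see reference to FilteredNormalizer2 above suggests one more combination: restricting normalization to a subset of code points. A minimal sketch assuming ICU4J 4.4's FilteredNormalizer2 and UnicodeSet; the analyzer class name, the set pattern, and the Version constant are illustrative, not part of the patch:

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
import org.apache.lucene.util.Version;
import com.ibm.icu.text.FilteredNormalizer2;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UnicodeSet;

/** Illustrative analyzer: NFKC_Casefold, applied only to the Unicode 3.2 repertoire. */
public class FilteredNormalizationAnalyzer extends Analyzer {
  // code points outside the set pass through the filter unchanged
  private static final Normalizer2 FILTERED = new FilteredNormalizer2(
      Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE),
      new UnicodeSet("[:age=3.2:]").freeze());

  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    return new ICUNormalizer2Filter(
        new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader), FILTERED);
  }
}
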


@@ -0,0 +1,78 @@
package org.apache.lucene.analysis.icu;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import com.ibm.icu.text.Normalizer2;
/**
* Tests the ICUNormalizer2Filter
*/
public class TestICUNormalizer2Filter extends BaseTokenStreamTestCase {

  public void testDefaults() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new ICUNormalizer2Filter(
            new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader));
      }
    };

    // case folding
    assertAnalyzesTo(a, "This is a test", new String[] { "this", "is", "a", "test" });

    // case folding
    assertAnalyzesTo(a, "Ruß", new String[] { "russ" });

    // case folding
    assertAnalyzesTo(a, "ΜΆΪΟΣ", new String[] { "μάϊοσ" });
    assertAnalyzesTo(a, "Μάϊος", new String[] { "μάϊοσ" });

    // supplementary case folding
    assertAnalyzesTo(a, "𐐖", new String[] { "𐐾" });

    // normalization
    assertAnalyzesTo(a, "ﴳﴺﰧ", new String[] { "طمطمطم" });

    // removal of default ignorables
    assertAnalyzesTo(a, "क्‍ष", new String[] { "क्ष" });
  }

  public void testAlternate() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new ICUNormalizer2Filter(
            new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader),
            /* specify nfc with decompose to get nfd */
            Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE));
      }
    };

    // decompose EAcute into E + combining Acute
    assertAnalyzesTo(a, "\u00E9", new String[] { "\u0065\u0301" });
  }
}
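
For completeness, a small stand-alone sketch of consuming the filter outside the test harness. The class name, sample text, and Version constant are illustrative; the expected terms mirror the assertions in testDefaults() above:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

/** Illustrative consumer: prints each token after NFKC_Casefold normalization. */
public class PrintNormalizedTerms {
  public static void main(String[] args) throws IOException {
    TokenStream ts = new ICUNormalizer2Filter(
        new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("This Ruß ΜΆΪΟΣ")));
    CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // prints "this", "russ", "μάϊοσ", matching the expectations in testDefaults()
      System.out.println(term.toString());
    }
    ts.end();
    ts.close();
  }
}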