mirror of https://github.com/apache/lucene.git
Add accent character normalizer filter contributed by Sven Duzont. Also created simple test case.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@160011 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f673bff9ef
commit
3be3e8ab5d
|
@ -0,0 +1,171 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
/**
|
||||
* Copyright 2005 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/**
|
||||
* A filter that replace accented characters in the ISO Latin 1 character set by
|
||||
* their unaccented equivalent. The case will not be altered.
|
||||
* <p>
|
||||
* For instance, 'à' will be replaced by 'a'.
|
||||
* <p>
|
||||
*/
|
||||
public class ISOLatin1AccentFilter extends TokenFilter {
|
||||
public ISOLatin1AccentFilter(TokenStream input) {
|
||||
super(input);
|
||||
}
|
||||
|
||||
/**
|
||||
* To replace accented characters by unaccented equivalents.
|
||||
*/
|
||||
public final Token next() throws java.io.IOException {
|
||||
final Token t = input.next();
|
||||
if (t == null)
|
||||
return null;
|
||||
// Return a token with filtered characters.
|
||||
return new Token(RemoveAccents(t.termText()), t.startOffset(), t.endOffset(), t.type());
|
||||
}
|
||||
|
||||
/**
|
||||
* To replace accented characters in a String by unaccented equivalents.
|
||||
*/
|
||||
public final static String RemoveAccents(String input) {
|
||||
final StringBuffer output = new StringBuffer();
|
||||
for (int i = 0; i < input.length(); i++) {
|
||||
switch (input.charAt(i)) {
|
||||
case '\u00C0' : // À
|
||||
case '\u00C1' : // Á
|
||||
case '\u00C2' : // Â
|
||||
case '\u00C3' : // Ã
|
||||
case '\u00C4' : // Ä
|
||||
case '\u00C5' : // Å
|
||||
output.append("A");
|
||||
break;
|
||||
case '\u00C6' : // Æ
|
||||
output.append("AE");
|
||||
break;
|
||||
case '\u00C7' : // Ç
|
||||
output.append("C");
|
||||
break;
|
||||
case '\u00C8' : // È
|
||||
case '\u00C9' : // É
|
||||
case '\u00CA' : // Ê
|
||||
case '\u00CB' : // Ë
|
||||
output.append("E");
|
||||
break;
|
||||
case '\u00CC' : // Ì
|
||||
case '\u00CD' : // Í
|
||||
case '\u00CE' : // Î
|
||||
case '\u00CF' : // Ï
|
||||
output.append("I");
|
||||
break;
|
||||
case '\u00D0' : // Ð
|
||||
output.append("D");
|
||||
break;
|
||||
case '\u00D1' : // Ñ
|
||||
output.append("N");
|
||||
break;
|
||||
case '\u00D2' : // Ò
|
||||
case '\u00D3' : // Ó
|
||||
case '\u00D4' : // Ô
|
||||
case '\u00D5' : // Õ
|
||||
case '\u00D6' : // Ö
|
||||
case '\u00D8' : // Ø
|
||||
output.append("O");
|
||||
break;
|
||||
case '\u0152' : // Œ
|
||||
output.append("OE");
|
||||
break;
|
||||
case '\u00DE' : // Þ
|
||||
output.append("TH");
|
||||
break;
|
||||
case '\u00D9' : // Ù
|
||||
case '\u00DA' : // Ú
|
||||
case '\u00DB' : // Û
|
||||
case '\u00DC' : // Ü
|
||||
output.append("U");
|
||||
break;
|
||||
case '\u00DD' : // Ý
|
||||
case '\u0178' : // Ÿ
|
||||
output.append("Y");
|
||||
break;
|
||||
case '\u00E0' : // à
|
||||
case '\u00E1' : // á
|
||||
case '\u00E2' : // â
|
||||
case '\u00E3' : // ã
|
||||
case '\u00E4' : // ä
|
||||
case '\u00E5' : // å
|
||||
output.append("a");
|
||||
break;
|
||||
case '\u00E6' : // æ
|
||||
output.append("ae");
|
||||
break;
|
||||
case '\u00E7' : // ç
|
||||
output.append("c");
|
||||
break;
|
||||
case '\u00E8' : // è
|
||||
case '\u00E9' : // é
|
||||
case '\u00EA' : // ê
|
||||
case '\u00EB' : // ë
|
||||
output.append("e");
|
||||
break;
|
||||
case '\u00EC' : // ì
|
||||
case '\u00ED' : // í
|
||||
case '\u00EE' : // î
|
||||
case '\u00EF' : // ï
|
||||
output.append("i");
|
||||
break;
|
||||
case '\u00F0' : // ð
|
||||
output.append("d");
|
||||
break;
|
||||
case '\u00F1' : // ñ
|
||||
output.append("n");
|
||||
break;
|
||||
case '\u00F2' : // ò
|
||||
case '\u00F3' : // ó
|
||||
case '\u00F4' : // ô
|
||||
case '\u00F5' : // õ
|
||||
case '\u00F6' : // ö
|
||||
case '\u00F8' : // ø
|
||||
output.append("o");
|
||||
break;
|
||||
case '\u0153' : // œ
|
||||
output.append("oe");
|
||||
break;
|
||||
case '\u00DF' : // ß
|
||||
output.append("ss");
|
||||
break;
|
||||
case '\u00FE' : // þ
|
||||
output.append("th");
|
||||
break;
|
||||
case '\u00F9' : // ù
|
||||
case '\u00FA' : // ú
|
||||
case '\u00FB' : // û
|
||||
case '\u00FC' : // Ÿ
|
||||
output.append("u");
|
||||
break;
|
||||
case '\u00FD' : // ý
|
||||
case '\u00FF' : // ÿ
|
||||
output.append("y");
|
||||
break;
|
||||
default :
|
||||
output.append(input.charAt(i));
|
||||
break;
|
||||
}
|
||||
}
|
||||
return output.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
package org.apache.lucene.analysis;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import java.io.StringReader;
|
||||
|
||||
public class TestISOLatin1AccentFilter extends TestCase {
|
||||
public void testU() throws Exception {
|
||||
TokenStream stream = new WhitespaceTokenizer(new StringReader("ü"));
|
||||
ISOLatin1AccentFilter filter = new ISOLatin1AccentFilter(stream);
|
||||
Token token = filter.next();
|
||||
assertEquals("u", token.termText);
|
||||
|
||||
assertNull(filter.next());
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue