LUCENE-1351: clean additional ligatures

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@682766 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2008-08-05 15:47:33 +00:00
parent f2838b450b
commit dd066edcf1
3 changed files with 48 additions and 3 deletions

View File

@ -147,6 +147,8 @@ Bug fixes
14. LUCENE-1310: Fixed SloppyPhraseScorer to work also for terms repeating more
than twice in the query. (Doron Cohen)
15. LUCENE-1351: ISOLatin1AccentFilter now cleans additional ligatures (Cedrik Lime via Grant Ingersoll)
New features
1. LUCENE-1137: Added Token.set/getFlags() accessors for passing more information about a Token through the analysis

View File

@ -41,7 +41,7 @@ public class ISOLatin1AccentFilter extends TokenFilter {
// just return token as-is:
for(int i=0;i<length;i++) {
final char c = buffer[i];
if (c >= '\u00c0' && c <= '\u0178') {
if (c >= '\u00c0' && c <= '\uFB06') {
removeAccents(buffer, length);
result.setTermBuffer(output, 0, outputPos);
break;
@ -76,7 +76,7 @@ public class ISOLatin1AccentFilter extends TokenFilter {
// Quick test: if it's not in range then just keep
// current character
if (c < '\u00c0')
if (c < '\u00c0' || c > '\uFB06')
output[outputPos++] = c;
else {
switch (c) {
@ -107,6 +107,10 @@ public class ISOLatin1AccentFilter extends TokenFilter {
case '\u00CF' : // Ï
output[outputPos++] = 'I';
break;
case '\u0132' : // IJ
output[outputPos++] = 'I';
output[outputPos++] = 'J';
break;
case '\u00D0' : // Ð
output[outputPos++] = 'D';
break;
@ -166,6 +170,10 @@ public class ISOLatin1AccentFilter extends TokenFilter {
case '\u00EF' : // ï
output[outputPos++] = 'i';
break;
case '\u0133' : // ij
output[outputPos++] = 'i';
output[outputPos++] = 'j';
break;
case '\u00F0' : // ð
output[outputPos++] = 'd';
break;
@ -202,6 +210,37 @@ public class ISOLatin1AccentFilter extends TokenFilter {
case '\u00FF' : // ÿ
output[outputPos++] = 'y';
break;
case '\uFB00': //
output[outputPos++] = 'f';
output[outputPos++] = 'f';
break;
case '\uFB01': //
output[outputPos++] = 'f';
output[outputPos++] = 'i';
break;
case '\uFB02': //
output[outputPos++] = 'f';
output[outputPos++] = 'l';
break;
// following 2 are commented as they can break the maxSizeNeeded (and doing *3 could be expensive)
// case '\uFB03': //
// output[outputPos++] = 'f';
// output[outputPos++] = 'f';
// output[outputPos++] = 'i';
// break;
// case '\uFB04': //
// output[outputPos++] = 'f';
// output[outputPos++] = 'f';
// output[outputPos++] = 'l';
// break;
case '\uFB05': //
output[outputPos++] = 'f';
output[outputPos++] = 't';
break;
case '\uFB06': //
output[outputPos++] = 's';
output[outputPos++] = 't';
break;
default :
output[outputPos++] = c;
break;

View File

@ -23,7 +23,7 @@ import java.io.StringReader;
public class TestISOLatin1AccentFilter extends LuceneTestCase {
public void testU() throws Exception {
TokenStream stream = new WhitespaceTokenizer(new StringReader("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ"));
TokenStream stream = new WhitespaceTokenizer(new StringReader("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï IJ Ð Ñ Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ij ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl"));
ISOLatin1AccentFilter filter = new ISOLatin1AccentFilter(stream);
assertEquals("Des", filter.next().termText());
assertEquals("mot", filter.next().termText());
@ -47,6 +47,7 @@ public class TestISOLatin1AccentFilter extends LuceneTestCase {
assertEquals("I", filter.next().termText());
assertEquals("I", filter.next().termText());
assertEquals("I", filter.next().termText());
assertEquals("IJ", filter.next().termText());
assertEquals("D", filter.next().termText());
assertEquals("N", filter.next().termText());
assertEquals("O", filter.next().termText());
@ -79,6 +80,7 @@ public class TestISOLatin1AccentFilter extends LuceneTestCase {
assertEquals("i", filter.next().termText());
assertEquals("i", filter.next().termText());
assertEquals("i", filter.next().termText());
assertEquals("ij", filter.next().termText());
assertEquals("d", filter.next().termText());
assertEquals("n", filter.next().termText());
assertEquals("o", filter.next().termText());
@ -96,6 +98,8 @@ public class TestISOLatin1AccentFilter extends LuceneTestCase {
assertEquals("u", filter.next().termText());
assertEquals("y", filter.next().termText());
assertEquals("y", filter.next().termText());
assertEquals("fi", filter.next().termText());
assertEquals("fl", filter.next().termText());
assertNull(filter.next());
}
}