LUCENE-1351: clean additional ligatures

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@682766 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2008-08-05 15:47:33 +00:00
parent f2838b450b
commit dd066edcf1
3 changed files with 48 additions and 3 deletions

View File

@ -147,6 +147,8 @@ Bug fixes
14. LUCENE-1310: Fixed SloppyPhraseScorer to work also for terms repeating more 14. LUCENE-1310: Fixed SloppyPhraseScorer to work also for terms repeating more
than twice in the query. (Doron Cohen) than twice in the query. (Doron Cohen)
15. LUCENE-1351: ISOLatin1AccentFilter now cleans additional ligatures (Cedrik Lime via Grant Ingersoll)
New features New features
1. LUCENE-1137: Added Token.set/getFlags() accessors for passing more information about a Token through the analysis 1. LUCENE-1137: Added Token.set/getFlags() accessors for passing more information about a Token through the analysis

View File

@ -41,7 +41,7 @@ public class ISOLatin1AccentFilter extends TokenFilter {
// just return token as-is: // just return token as-is:
for(int i=0;i<length;i++) { for(int i=0;i<length;i++) {
final char c = buffer[i]; final char c = buffer[i];
if (c >= '\u00c0' && c <= '\u0178') { if (c >= '\u00c0' && c <= '\uFB06') {
removeAccents(buffer, length); removeAccents(buffer, length);
result.setTermBuffer(output, 0, outputPos); result.setTermBuffer(output, 0, outputPos);
break; break;
@ -76,7 +76,7 @@ public class ISOLatin1AccentFilter extends TokenFilter {
// Quick test: if it's not in range then just keep // Quick test: if it's not in range then just keep
// current character // current character
if (c < '\u00c0') if (c < '\u00c0' || c > '\uFB06')
output[outputPos++] = c; output[outputPos++] = c;
else { else {
switch (c) { switch (c) {
@ -107,6 +107,10 @@ public class ISOLatin1AccentFilter extends TokenFilter {
case '\u00CF' : // Ï case '\u00CF' : // Ï
output[outputPos++] = 'I'; output[outputPos++] = 'I';
break; break;
case '\u0132' : // Ĳ (IJ ligature)
output[outputPos++] = 'I';
output[outputPos++] = 'J';
break;
case '\u00D0' : // Ð case '\u00D0' : // Ð
output[outputPos++] = 'D'; output[outputPos++] = 'D';
break; break;
@ -166,6 +170,10 @@ public class ISOLatin1AccentFilter extends TokenFilter {
case '\u00EF' : // ï case '\u00EF' : // ï
output[outputPos++] = 'i'; output[outputPos++] = 'i';
break; break;
case '\u0133' : // ĳ (ij ligature)
output[outputPos++] = 'i';
output[outputPos++] = 'j';
break;
case '\u00F0' : // ð case '\u00F0' : // ð
output[outputPos++] = 'd'; output[outputPos++] = 'd';
break; break;
@ -202,6 +210,37 @@ public class ISOLatin1AccentFilter extends TokenFilter {
case '\u00FF' : // ÿ case '\u00FF' : // ÿ
output[outputPos++] = 'y'; output[outputPos++] = 'y';
break; break;
case '\uFB00': // ﬀ (ff ligature)
output[outputPos++] = 'f';
output[outputPos++] = 'f';
break;
case '\uFB01': // ﬁ (fi ligature)
output[outputPos++] = 'f';
output[outputPos++] = 'i';
break;
case '\uFB02': // ﬂ (fl ligature)
output[outputPos++] = 'f';
output[outputPos++] = 'l';
break;
// The following 2 cases are commented out because each expands one char into three, which can exceed maxSizeNeeded (and sizing the output buffer at *3 could be expensive)
// case '\uFB03': // ﬃ (ffi ligature)
// output[outputPos++] = 'f';
// output[outputPos++] = 'f';
// output[outputPos++] = 'i';
// break;
// case '\uFB04': // ﬄ (ffl ligature)
// output[outputPos++] = 'f';
// output[outputPos++] = 'f';
// output[outputPos++] = 'l';
// break;
case '\uFB05': // ﬅ (long s + t ligature)
output[outputPos++] = 'f';
output[outputPos++] = 't';
break;
case '\uFB06': // ﬆ (st ligature)
output[outputPos++] = 's';
output[outputPos++] = 't';
break;
default : default :
output[outputPos++] = c; output[outputPos++] = c;
break; break;

View File

@ -23,7 +23,7 @@ import java.io.StringReader;
public class TestISOLatin1AccentFilter extends LuceneTestCase { public class TestISOLatin1AccentFilter extends LuceneTestCase {
public void testU() throws Exception { public void testU() throws Exception {
TokenStream stream = new WhitespaceTokenizer(new StringReader("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ")); TokenStream stream = new WhitespaceTokenizer(new StringReader("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï IJ Ð Ñ Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ij ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl"));
ISOLatin1AccentFilter filter = new ISOLatin1AccentFilter(stream); ISOLatin1AccentFilter filter = new ISOLatin1AccentFilter(stream);
assertEquals("Des", filter.next().termText()); assertEquals("Des", filter.next().termText());
assertEquals("mot", filter.next().termText()); assertEquals("mot", filter.next().termText());
@ -47,6 +47,7 @@ public class TestISOLatin1AccentFilter extends LuceneTestCase {
assertEquals("I", filter.next().termText()); assertEquals("I", filter.next().termText());
assertEquals("I", filter.next().termText()); assertEquals("I", filter.next().termText());
assertEquals("I", filter.next().termText()); assertEquals("I", filter.next().termText());
assertEquals("IJ", filter.next().termText());
assertEquals("D", filter.next().termText()); assertEquals("D", filter.next().termText());
assertEquals("N", filter.next().termText()); assertEquals("N", filter.next().termText());
assertEquals("O", filter.next().termText()); assertEquals("O", filter.next().termText());
@ -79,6 +80,7 @@ public class TestISOLatin1AccentFilter extends LuceneTestCase {
assertEquals("i", filter.next().termText()); assertEquals("i", filter.next().termText());
assertEquals("i", filter.next().termText()); assertEquals("i", filter.next().termText());
assertEquals("i", filter.next().termText()); assertEquals("i", filter.next().termText());
assertEquals("ij", filter.next().termText());
assertEquals("d", filter.next().termText()); assertEquals("d", filter.next().termText());
assertEquals("n", filter.next().termText()); assertEquals("n", filter.next().termText());
assertEquals("o", filter.next().termText()); assertEquals("o", filter.next().termText());
@ -96,6 +98,8 @@ public class TestISOLatin1AccentFilter extends LuceneTestCase {
assertEquals("u", filter.next().termText()); assertEquals("u", filter.next().termText());
assertEquals("y", filter.next().termText()); assertEquals("y", filter.next().termText());
assertEquals("y", filter.next().termText()); assertEquals("y", filter.next().termText());
assertEquals("fi", filter.next().termText());
assertEquals("fl", filter.next().termText());
assertNull(filter.next()); assertNull(filter.next());
} }
} }