diff --git a/CHANGES.txt b/CHANGES.txt index f106bdb2924..ff232eef3e3 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -147,6 +147,8 @@ Bug fixes 14. LUCENE-1310: Fixed SloppyPhraseScorer to work also for terms repeating more than twice in the query. (Doron Cohen) +15. LUCENE-1351: ISOLatin1AccentFilter now cleans additional ligatures (Cedrik Lime via Grant Ingersoll) + New features 1. LUCENE-1137: Added Token.set/getFlags() accessors for passing more information about a Token through the analysis diff --git a/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java b/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java index 32edc7bc210..f1ca40e45c3 100644 --- a/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java +++ b/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java @@ -41,7 +41,7 @@ public class ISOLatin1AccentFilter extends TokenFilter { // just return token as-is: for(int i=0;i= '\u00c0' && c <= '\u0178') { + if (c >= '\u00c0' && c <= '\uFB06') { removeAccents(buffer, length); result.setTermBuffer(output, 0, outputPos); break; @@ -76,7 +76,7 @@ public class ISOLatin1AccentFilter extends TokenFilter { // Quick test: if it's not in range then just keep // current character - if (c < '\u00c0') + if (c < '\u00c0' || c > '\uFB06') output[outputPos++] = c; else { switch (c) { @@ -107,6 +107,10 @@ public class ISOLatin1AccentFilter extends TokenFilter { case '\u00CF' : // Ï output[outputPos++] = 'I'; break; + case '\u0132' : // IJ + output[outputPos++] = 'I'; + output[outputPos++] = 'J'; + break; case '\u00D0' : // Ð output[outputPos++] = 'D'; break; @@ -166,6 +170,10 @@ public class ISOLatin1AccentFilter extends TokenFilter { case '\u00EF' : // ï output[outputPos++] = 'i'; break; + case '\u0133' : // ij + output[outputPos++] = 'i'; + output[outputPos++] = 'j'; + break; case '\u00F0' : // ð output[outputPos++] = 'd'; break; @@ -202,6 +210,37 @@ public class ISOLatin1AccentFilter extends TokenFilter { case '\u00FF' : // ÿ output[outputPos++] = 'y'; break; + case '\uFB00': // ff + output[outputPos++] = 'f'; + output[outputPos++] = 'f'; + break; + case '\uFB01': // fi + output[outputPos++] = 'f'; + output[outputPos++] = 'i'; + break; + case '\uFB02': // fl + output[outputPos++] = 'f'; + output[outputPos++] = 'l'; + break; + // following 2 are commented as they can break the maxSizeNeeded (and doing *3 could be expensive) +// case '\uFB03': // ffi +// output[outputPos++] = 'f'; +// output[outputPos++] = 'f'; +// output[outputPos++] = 'i'; +// break; +// case '\uFB04': // ffl +// output[outputPos++] = 'f'; +// output[outputPos++] = 'f'; +// output[outputPos++] = 'l'; +// break; + case '\uFB05': // ſt + output[outputPos++] = 'f'; + output[outputPos++] = 't'; + break; + case '\uFB06': // st + output[outputPos++] = 's'; + output[outputPos++] = 't'; + break; default : output[outputPos++] = c; break; diff --git a/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java b/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java index 7eaea81d15e..de0c57a25f8 100644 --- a/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java +++ b/src/test/org/apache/lucene/analysis/TestISOLatin1AccentFilter.java @@ -23,7 +23,7 @@ import java.io.StringReader; public class TestISOLatin1AccentFilter extends LuceneTestCase { public void testU() throws Exception { - TokenStream stream = new WhitespaceTokenizer(new StringReader("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ")); + TokenStream stream = new WhitespaceTokenizer(new StringReader("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï IJ Ð Ñ Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ij ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl")); ISOLatin1AccentFilter filter = new ISOLatin1AccentFilter(stream); assertEquals("Des", filter.next().termText()); assertEquals("mot", filter.next().termText()); @@ -47,6 +47,7 @@ public class TestISOLatin1AccentFilter extends LuceneTestCase { assertEquals("I", filter.next().termText()); assertEquals("I", filter.next().termText()); assertEquals("I", filter.next().termText()); + assertEquals("IJ", filter.next().termText()); assertEquals("D", filter.next().termText()); assertEquals("N", filter.next().termText()); assertEquals("O", filter.next().termText()); @@ -79,6 +80,7 @@ public class TestISOLatin1AccentFilter extends LuceneTestCase { assertEquals("i", filter.next().termText()); assertEquals("i", filter.next().termText()); assertEquals("i", filter.next().termText()); + assertEquals("ij", filter.next().termText()); assertEquals("d", filter.next().termText()); assertEquals("n", filter.next().termText()); assertEquals("o", filter.next().termText()); @@ -96,6 +98,8 @@ public class TestISOLatin1AccentFilter extends LuceneTestCase { assertEquals("u", filter.next().termText()); assertEquals("y", filter.next().termText()); assertEquals("y", filter.next().termText()); + assertEquals("fi", filter.next().termText()); + assertEquals("fl", filter.next().termText()); assertNull(filter.next()); } }