mirror of https://github.com/apache/lucene.git
LUCENE-1351: clean additional ligatures
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@682766 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f2838b450b
commit
dd066edcf1
|
@ -147,6 +147,8 @@ Bug fixes
|
|||
14. LUCENE-1310: Fixed SloppyPhraseScorer to work also for terms repeating more
|
||||
than twice in the query. (Doron Cohen)
|
||||
|
||||
15. LUCENE-1351: ISOLatin1AccentFilter now cleans additional ligatures (Cedrik Lime via Grant Ingersoll)
|
||||
|
||||
New features
|
||||
|
||||
1. LUCENE-1137: Added Token.set/getFlags() accessors for passing more information about a Token through the analysis
|
||||
|
|
|
@ -41,7 +41,7 @@ public class ISOLatin1AccentFilter extends TokenFilter {
|
|||
// just return token as-is:
|
||||
for(int i=0;i<length;i++) {
|
||||
final char c = buffer[i];
|
||||
if (c >= '\u00c0' && c <= '\u0178') {
|
||||
if (c >= '\u00c0' && c <= '\uFB06') {
|
||||
removeAccents(buffer, length);
|
||||
result.setTermBuffer(output, 0, outputPos);
|
||||
break;
|
||||
|
@ -76,7 +76,7 @@ public class ISOLatin1AccentFilter extends TokenFilter {
|
|||
|
||||
// Quick test: if it's not in range then just keep
|
||||
// current character
|
||||
if (c < '\u00c0')
|
||||
if (c < '\u00c0' || c > '\uFB06')
|
||||
output[outputPos++] = c;
|
||||
else {
|
||||
switch (c) {
|
||||
|
@ -107,6 +107,10 @@ public class ISOLatin1AccentFilter extends TokenFilter {
|
|||
case '\u00CF' : // Ï
|
||||
output[outputPos++] = 'I';
|
||||
break;
|
||||
case '\u0132' : // IJ
|
||||
output[outputPos++] = 'I';
|
||||
output[outputPos++] = 'J';
|
||||
break;
|
||||
case '\u00D0' : // Ð
|
||||
output[outputPos++] = 'D';
|
||||
break;
|
||||
|
@ -166,6 +170,10 @@ public class ISOLatin1AccentFilter extends TokenFilter {
|
|||
case '\u00EF' : // ï
|
||||
output[outputPos++] = 'i';
|
||||
break;
|
||||
case '\u0133' : // ij
|
||||
output[outputPos++] = 'i';
|
||||
output[outputPos++] = 'j';
|
||||
break;
|
||||
case '\u00F0' : // ð
|
||||
output[outputPos++] = 'd';
|
||||
break;
|
||||
|
@ -202,6 +210,37 @@ public class ISOLatin1AccentFilter extends TokenFilter {
|
|||
case '\u00FF' : // ÿ
|
||||
output[outputPos++] = 'y';
|
||||
break;
|
||||
case '\uFB00': // ff
|
||||
output[outputPos++] = 'f';
|
||||
output[outputPos++] = 'f';
|
||||
break;
|
||||
case '\uFB01': // fi
|
||||
output[outputPos++] = 'f';
|
||||
output[outputPos++] = 'i';
|
||||
break;
|
||||
case '\uFB02': // fl
|
||||
output[outputPos++] = 'f';
|
||||
output[outputPos++] = 'l';
|
||||
break;
|
||||
// The following 2 cases are commented out because they can break the maxSizeNeeded limit (and sizing the buffer at *3 could be expensive)
|
||||
// case '\uFB03': // ffi
|
||||
// output[outputPos++] = 'f';
|
||||
// output[outputPos++] = 'f';
|
||||
// output[outputPos++] = 'i';
|
||||
// break;
|
||||
// case '\uFB04': // ffl
|
||||
// output[outputPos++] = 'f';
|
||||
// output[outputPos++] = 'f';
|
||||
// output[outputPos++] = 'l';
|
||||
// break;
|
||||
case '\uFB05': // ſt (LATIN SMALL LIGATURE LONG S T): the long s is a form of 's', so fold to "st", not "ft"
|
||||
output[outputPos++] = 's';
|
||||
output[outputPos++] = 't';
|
||||
break;
|
||||
case '\uFB06': // st
|
||||
output[outputPos++] = 's';
|
||||
output[outputPos++] = 't';
|
||||
break;
|
||||
default :
|
||||
output[outputPos++] = c;
|
||||
break;
|
||||
|
|
|
@ -23,7 +23,7 @@ import java.io.StringReader;
|
|||
|
||||
public class TestISOLatin1AccentFilter extends LuceneTestCase {
|
||||
public void testU() throws Exception {
|
||||
TokenStream stream = new WhitespaceTokenizer(new StringReader("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ"));
|
||||
TokenStream stream = new WhitespaceTokenizer(new StringReader("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï IJ Ð Ñ Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ij ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl"));
|
||||
ISOLatin1AccentFilter filter = new ISOLatin1AccentFilter(stream);
|
||||
assertEquals("Des", filter.next().termText());
|
||||
assertEquals("mot", filter.next().termText());
|
||||
|
@ -47,6 +47,7 @@ public class TestISOLatin1AccentFilter extends LuceneTestCase {
|
|||
assertEquals("I", filter.next().termText());
|
||||
assertEquals("I", filter.next().termText());
|
||||
assertEquals("I", filter.next().termText());
|
||||
assertEquals("IJ", filter.next().termText());
|
||||
assertEquals("D", filter.next().termText());
|
||||
assertEquals("N", filter.next().termText());
|
||||
assertEquals("O", filter.next().termText());
|
||||
|
@ -79,6 +80,7 @@ public class TestISOLatin1AccentFilter extends LuceneTestCase {
|
|||
assertEquals("i", filter.next().termText());
|
||||
assertEquals("i", filter.next().termText());
|
||||
assertEquals("i", filter.next().termText());
|
||||
assertEquals("ij", filter.next().termText());
|
||||
assertEquals("d", filter.next().termText());
|
||||
assertEquals("n", filter.next().termText());
|
||||
assertEquals("o", filter.next().termText());
|
||||
|
@ -96,6 +98,8 @@ public class TestISOLatin1AccentFilter extends LuceneTestCase {
|
|||
assertEquals("u", filter.next().termText());
|
||||
assertEquals("y", filter.next().termText());
|
||||
assertEquals("y", filter.next().termText());
|
||||
assertEquals("fi", filter.next().termText());
|
||||
assertEquals("fl", filter.next().termText());
|
||||
assertNull(filter.next());
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue