mirror of https://github.com/apache/lucene.git
LUCENE-1351: clean additional ligatures
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@682766 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f2838b450b
commit
dd066edcf1
|
@ -147,6 +147,8 @@ Bug fixes
|
||||||
14. LUCENE-1310: Fixed SloppyPhraseScorer to work also for terms repeating more
|
14. LUCENE-1310: Fixed SloppyPhraseScorer to work also for terms repeating more
|
||||||
than twice in the query. (Doron Cohen)
|
than twice in the query. (Doron Cohen)
|
||||||
|
|
||||||
|
15. LUCENE-1351: ISOLatin1AccentFilter now cleans additional ligatures (Cedrik Lime via Grant Ingersoll)
|
||||||
|
|
||||||
New features
|
New features
|
||||||
|
|
||||||
1. LUCENE-1137: Added Token.set/getFlags() accessors for passing more information about a Token through the analysis
|
1. LUCENE-1137: Added Token.set/getFlags() accessors for passing more information about a Token through the analysis
|
||||||
|
|
|
@ -41,7 +41,7 @@ public class ISOLatin1AccentFilter extends TokenFilter {
|
||||||
// just return token as-is:
|
// just return token as-is:
|
||||||
for(int i=0;i<length;i++) {
|
for(int i=0;i<length;i++) {
|
||||||
final char c = buffer[i];
|
final char c = buffer[i];
|
||||||
if (c >= '\u00c0' && c <= '\u0178') {
|
if (c >= '\u00c0' && c <= '\uFB06') {
|
||||||
removeAccents(buffer, length);
|
removeAccents(buffer, length);
|
||||||
result.setTermBuffer(output, 0, outputPos);
|
result.setTermBuffer(output, 0, outputPos);
|
||||||
break;
|
break;
|
||||||
|
@ -76,7 +76,7 @@ public class ISOLatin1AccentFilter extends TokenFilter {
|
||||||
|
|
||||||
// Quick test: if it's not in range then just keep
|
// Quick test: if it's not in range then just keep
|
||||||
// current character
|
// current character
|
||||||
if (c < '\u00c0')
|
if (c < '\u00c0' || c > '\uFB06')
|
||||||
output[outputPos++] = c;
|
output[outputPos++] = c;
|
||||||
else {
|
else {
|
||||||
switch (c) {
|
switch (c) {
|
||||||
|
@ -107,6 +107,10 @@ public class ISOLatin1AccentFilter extends TokenFilter {
|
||||||
case '\u00CF' : // Ï
|
case '\u00CF' : // Ï
|
||||||
output[outputPos++] = 'I';
|
output[outputPos++] = 'I';
|
||||||
break;
|
break;
|
||||||
|
case '\u0132' : // Ĳ (LATIN CAPITAL LIGATURE IJ)
|
||||||
|
output[outputPos++] = 'I';
|
||||||
|
output[outputPos++] = 'J';
|
||||||
|
break;
|
||||||
case '\u00D0' : // Ð
|
case '\u00D0' : // Ð
|
||||||
output[outputPos++] = 'D';
|
output[outputPos++] = 'D';
|
||||||
break;
|
break;
|
||||||
|
@ -166,6 +170,10 @@ public class ISOLatin1AccentFilter extends TokenFilter {
|
||||||
case '\u00EF' : // ï
|
case '\u00EF' : // ï
|
||||||
output[outputPos++] = 'i';
|
output[outputPos++] = 'i';
|
||||||
break;
|
break;
|
||||||
|
case '\u0133' : // ĳ (LATIN SMALL LIGATURE IJ)
|
||||||
|
output[outputPos++] = 'i';
|
||||||
|
output[outputPos++] = 'j';
|
||||||
|
break;
|
||||||
case '\u00F0' : // ð
|
case '\u00F0' : // ð
|
||||||
output[outputPos++] = 'd';
|
output[outputPos++] = 'd';
|
||||||
break;
|
break;
|
||||||
|
@ -202,6 +210,37 @@ public class ISOLatin1AccentFilter extends TokenFilter {
|
||||||
case '\u00FF' : // ÿ
|
case '\u00FF' : // ÿ
|
||||||
output[outputPos++] = 'y';
|
output[outputPos++] = 'y';
|
||||||
break;
|
break;
|
||||||
|
case '\uFB00': // ﬀ (LATIN SMALL LIGATURE FF)
|
||||||
|
output[outputPos++] = 'f';
|
||||||
|
output[outputPos++] = 'f';
|
||||||
|
break;
|
||||||
|
case '\uFB01': // ﬁ (LATIN SMALL LIGATURE FI)
|
||||||
|
output[outputPos++] = 'f';
|
||||||
|
output[outputPos++] = 'i';
|
||||||
|
break;
|
||||||
|
case '\uFB02': // ﬂ (LATIN SMALL LIGATURE FL)
|
||||||
|
output[outputPos++] = 'f';
|
||||||
|
output[outputPos++] = 'l';
|
||||||
|
break;
|
||||||
|
// The following 2 cases are commented out because they can exceed maxSizeNeeded (and sizing for *3 could be expensive)
|
||||||
|
// case '\uFB03': // ﬃ (LATIN SMALL LIGATURE FFI)
|
||||||
|
// output[outputPos++] = 'f';
|
||||||
|
// output[outputPos++] = 'f';
|
||||||
|
// output[outputPos++] = 'i';
|
||||||
|
// break;
|
||||||
|
// case '\uFB04': // ﬄ (LATIN SMALL LIGATURE FFL)
|
||||||
|
// output[outputPos++] = 'f';
|
||||||
|
// output[outputPos++] = 'f';
|
||||||
|
// output[outputPos++] = 'l';
|
||||||
|
// break;
|
||||||
|
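case '\uFB05': // ſt (LATIN SMALL LIGATURE LONG S T) -- NOTE(review): emitted as "ft" below, but long s is a form of 's'; confirm whether "st" was intended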
|
||||||
|
output[outputPos++] = 'f';
|
||||||
|
output[outputPos++] = 't';
|
||||||
|
break;
|
||||||
|
case '\uFB06': // ﬆ (LATIN SMALL LIGATURE ST)
|
||||||
|
output[outputPos++] = 's';
|
||||||
|
output[outputPos++] = 't';
|
||||||
|
break;
|
||||||
default :
|
default :
|
||||||
output[outputPos++] = c;
|
output[outputPos++] = c;
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -23,7 +23,7 @@ import java.io.StringReader;
|
||||||
|
|
||||||
public class TestISOLatin1AccentFilter extends LuceneTestCase {
|
public class TestISOLatin1AccentFilter extends LuceneTestCase {
|
||||||
public void testU() throws Exception {
|
public void testU() throws Exception {
|
||||||
TokenStream stream = new WhitespaceTokenizer(new StringReader("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ"));
|
TokenStream stream = new WhitespaceTokenizer(new StringReader("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ĳ Ð Ñ Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ĳ ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ ﬁ ﬂ"));
|
||||||
ISOLatin1AccentFilter filter = new ISOLatin1AccentFilter(stream);
|
ISOLatin1AccentFilter filter = new ISOLatin1AccentFilter(stream);
|
||||||
assertEquals("Des", filter.next().termText());
|
assertEquals("Des", filter.next().termText());
|
||||||
assertEquals("mot", filter.next().termText());
|
assertEquals("mot", filter.next().termText());
|
||||||
|
@ -47,6 +47,7 @@ public class TestISOLatin1AccentFilter extends LuceneTestCase {
|
||||||
assertEquals("I", filter.next().termText());
|
assertEquals("I", filter.next().termText());
|
||||||
assertEquals("I", filter.next().termText());
|
assertEquals("I", filter.next().termText());
|
||||||
assertEquals("I", filter.next().termText());
|
assertEquals("I", filter.next().termText());
|
||||||
|
assertEquals("IJ", filter.next().termText());
|
||||||
assertEquals("D", filter.next().termText());
|
assertEquals("D", filter.next().termText());
|
||||||
assertEquals("N", filter.next().termText());
|
assertEquals("N", filter.next().termText());
|
||||||
assertEquals("O", filter.next().termText());
|
assertEquals("O", filter.next().termText());
|
||||||
|
@ -79,6 +80,7 @@ public class TestISOLatin1AccentFilter extends LuceneTestCase {
|
||||||
assertEquals("i", filter.next().termText());
|
assertEquals("i", filter.next().termText());
|
||||||
assertEquals("i", filter.next().termText());
|
assertEquals("i", filter.next().termText());
|
||||||
assertEquals("i", filter.next().termText());
|
assertEquals("i", filter.next().termText());
|
||||||
|
assertEquals("ij", filter.next().termText());
|
||||||
assertEquals("d", filter.next().termText());
|
assertEquals("d", filter.next().termText());
|
||||||
assertEquals("n", filter.next().termText());
|
assertEquals("n", filter.next().termText());
|
||||||
assertEquals("o", filter.next().termText());
|
assertEquals("o", filter.next().termText());
|
||||||
|
@ -96,6 +98,8 @@ public class TestISOLatin1AccentFilter extends LuceneTestCase {
|
||||||
assertEquals("u", filter.next().termText());
|
assertEquals("u", filter.next().termText());
|
||||||
assertEquals("y", filter.next().termText());
|
assertEquals("y", filter.next().termText());
|
||||||
assertEquals("y", filter.next().termText());
|
assertEquals("y", filter.next().termText());
|
||||||
|
assertEquals("fi", filter.next().termText());
|
||||||
|
assertEquals("fl", filter.next().termText());
|
||||||
assertNull(filter.next());
|
assertNull(filter.next());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue