LUCENE-1003: Don't let RussianAnalyzer drop numbers.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@656111 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Otis Gospodnetic 2008-05-14 05:37:45 +00:00
parent b182881092
commit aa0074f5db
3 changed files with 57 additions and 4 deletions

View File

@ -103,7 +103,10 @@ Bug fixes
This is needed when you want to update an index as part of a
transaction involving external resources (eg a database). Also
deprecated abort(), renaming it to rollback(). (Mike McCandless)
10. LUCENE-1003: Stop RussianAnalyzer from removing numbers.
(TUSUR OpenTeam, Dmitry Lihachev via Otis Gospodnetic)
New features
1. LUCENE-1137: Added Token.set/getFlags() accessors for passing more information about a Token through the analysis

View File

@ -94,7 +94,18 @@ public class RussianCharsets
'\u042C',
'\u042D',
'\u042E',
'\u042F'
'\u042F',
// numbers
'0',
'1',
'2',
'3',
'4',
'5',
'6',
'7',
'8',
'9'
};
// KOI8 charset
@ -163,7 +174,18 @@ public class RussianCharsets
0xf8,
0xfc,
0xe0,
0xf1
0xf1,
// numbers
'0',
'1',
'2',
'3',
'4',
'5',
'6',
'7',
'8',
'9'
};
// CP1251 charset
@ -232,7 +254,18 @@ public class RussianCharsets
0xDC,
0xDD,
0xDE,
0xDF
0xDF,
// numbers
'0',
'1',
'2',
'3',
'4',
'5',
'6',
'7',
'8',
'9'
};
public static char toLowerCase(char letter, char[] charset)

View File

@ -168,4 +168,21 @@ public class TestRussianAnalyzer extends TestCase
inWords1251.close();
sample1251.close();
}
/**
 * Regression test for LUCENE-1003: analyzes the input "text 1000" with a
 * default-constructed RussianAnalyzer and checks that a second token is
 * produced after "text", i.e. that the numeric part of the input is not
 * dropped by the analysis chain.
 */
public void testDigitsInRussianCharset()
{
    Reader reader = new StringReader("text 1000");
    RussianAnalyzer ra = new RussianAnalyzer();
    TokenStream stream = ra.tokenStream("", reader);

    try
    {
        // The first token is the plain word preceding the number.
        assertEquals("text", stream.next().termText());
        // A null here means the stream ended early and "1000" was skipped.
        assertNotNull("RussianAnalyzer's tokenizer skips numbers from input text", stream.next());
    }
    catch (IOException e)
    {
        // Report the underlying exception so the failure is diagnosable
        // instead of collapsing it into a fixed, detail-free message.
        fail("unexpected IOException: " + e);
    }
}
}