LUCENE-8596: Treat hash mark as comment only at beginning of line in kuromoji

user dictionary. Via Masaru Hasegawa and Satoshi Kato
This commit is contained in:
Michael Sokolov 2019-12-21 13:25:09 -05:00
commit 93309e9728
4 changed files with 21 additions and 1 deletions

View File

@ -58,6 +58,10 @@ Improvements
* LUCENE-8984: MoreLikeThis MLT is biased for uncommon fields (Andy Hind via Anshum Gupta) * LUCENE-8984: MoreLikeThis MLT is biased for uncommon fields (Andy Hind via Anshum Gupta)
* LUCENE-8596: Kuromoji user dictionary now accepts entries containing a hash mark (#), which was
previously treated as the start of a line-ending comment (Satoshi Kato and Masaru Hasegawa via
Michael Sokolov)
Bug fixes Bug fixes
* LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while * LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while

View File

@ -65,7 +65,7 @@ public final class UserDictionary implements Dictionary {
// text, segmentation, readings, POS // text, segmentation, readings, POS
while ((line = br.readLine()) != null) { while ((line = br.readLine()) != null) {
// Remove comments // Remove comments
line = line.replaceAll("#.*$", ""); line = line.replaceAll("^#.*$", "");
// Skip empty lines or comment lines // Skip empty lines or comment lines
if (line.trim().length() == 0) { if (line.trim().length() == 0) {

View File

@ -99,4 +99,16 @@ public class UserDictionaryTest extends LuceneTestCase {
assertTrue(e.getMessage().contains("does not match the surface form")); assertTrue(e.getMessage().contains("does not match the surface form"));
} }
/**
 * Checks that user dictionary entries whose surface form contains a hash mark (#) can be
 * looked up and report the expected part of speech.
 *
 * <p>NOTE(review): presumably {@code TestJapaneseTokenizer.readDict()} loads the test user
 * dictionary that contains the "sharp test" entries - confirm against that helper.
 *
 * @throws IOException declared for the dictionary load
 */
@Test
public void testSharp() throws IOException {
  String[] inputs = {"テスト#", "テスト#テスト"};
  UserDictionary dictionary = TestJapaneseTokenizer.readDict();
  for (String input : inputs) {
    int[][] result = dictionary.lookup(input.toCharArray(), 0, input.length());
    // Fail with a message naming the input instead of an ArrayIndexOutOfBoundsException
    // when the entry was not found at all.
    assertTrue("no user dictionary match for input: " + input,
        result.length > 0 && result[0].length > 0);
    assertEquals("カスタム名刺", dictionary.getPartOfSpeech(result[0][0]));
  }
}
} }

View File

@ -8,3 +8,7 @@
# Silly entry: # Silly entry:
abcd,a b cd,foo1 foo2 foo3,bar abcd,a b cd,foo1 foo2 foo3,bar
abcdefg,ab cd efg,foo1 foo2 foo4,bar abcdefg,ab cd efg,foo1 foo2 foo4,bar
# sharp test
test#テスト,test # テスト,test # テスト,カスタム名刺
テスト#,テスト #,テスト #,カスタム名刺