mirror of https://github.com/apache/lucene.git
LUCENE-8596: Treat hash mark as comment only at beginning of line in kuromoji
user dictionary. Via Masaru Hasegawa and Satoshi Kato
This commit is contained in:
commit
93309e9728
|
@ -58,6 +58,10 @@ Improvements
|
|||
|
||||
* LUCENE-8984: MoreLikeThis MLT is biased for uncommon fields (Andy Hind via Anshum Gupta)
|
||||
|
||||
* LUCENE-8596: Kuromoji user dictionary now accepts entries containing a hash mark (#), which was
|
||||
previously treated as beginning a line-ending comment (Satoshi Kato and Masaru Hasegawa via
|
||||
Michael Sokolov)
|
||||
|
||||
Bug fixes
|
||||
|
||||
* LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while
|
||||
|
|
|
@ -65,7 +65,7 @@ public final class UserDictionary implements Dictionary {
|
|||
// text, segmentation, readings, POS
|
||||
while ((line = br.readLine()) != null) {
|
||||
// Remove comments
|
||||
line = line.replaceAll("#.*$", "");
|
||||
line = line.replaceAll("^#.*$", "");
|
||||
|
||||
// Skip empty lines or comment lines
|
||||
if (line.trim().length() == 0) {
|
||||
|
|
|
@ -99,4 +99,16 @@ public class UserDictionaryTest extends LuceneTestCase {
|
|||
assertTrue(e.getMessage().contains("does not match the surface form"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSharp() throws IOException {
|
||||
String[] inputs = {"テスト#", "テスト#テスト"};
|
||||
UserDictionary dictionary = TestJapaneseTokenizer.readDict();
|
||||
|
||||
for (String input: inputs) {
|
||||
System.out.println(input);
|
||||
int[][] result = dictionary.lookup(input.toCharArray(), 0, input.length());
|
||||
assertEquals("カスタム名刺", dictionary.getPartOfSpeech(result[0][0]));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -8,3 +8,7 @@
|
|||
# Silly entry:
|
||||
abcd,a b cd,foo1 foo2 foo3,bar
|
||||
abcdefg,ab cd efg,foo1 foo2 foo4,bar
|
||||
|
||||
# sharp test
|
||||
test#テスト,test # テスト,test # テスト,カスタム名刺
|
||||
テスト#,テスト #,テスト #,カスタム名刺
|
||||
|
|
Loading…
Reference in New Issue