mirror of https://github.com/apache/lucene.git
LUCENE-8596: Treat a hash mark as a comment only at the beginning of a line in the Kuromoji
user dictionary. Via Masaru Hasegawa and Satoshi Kato
This commit is contained in:
commit
93309e9728
|
@ -58,6 +58,10 @@ Improvements
|
||||||
|
|
||||||
* LUCENE-8984: MoreLikeThis MLT is biased for uncommon fields (Andy Hind via Anshum Gupta)
|
* LUCENE-8984: MoreLikeThis MLT is biased for uncommon fields (Andy Hind via Anshum Gupta)
|
||||||
|
|
||||||
|
* LUCENE-8596: Kuromoji user dictionary now accepts entries containing a hash mark (#) that was
|
||||||
|
previously treated as beginning a line-ending comment (Satoshi Kato and Masaru Hasegawa via
|
||||||
|
Michael Sokolov)
|
||||||
|
|
||||||
Bug fixes
|
Bug fixes
|
||||||
|
|
||||||
* LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while
|
* LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while
|
||||||
|
|
|
@ -65,7 +65,7 @@ public final class UserDictionary implements Dictionary {
|
||||||
// text, segmentation, readings, POS
|
// text, segmentation, readings, POS
|
||||||
while ((line = br.readLine()) != null) {
|
while ((line = br.readLine()) != null) {
|
||||||
// Remove comments
|
// Remove comments
|
||||||
line = line.replaceAll("#.*$", "");
|
line = line.replaceAll("^#.*$", "");
|
||||||
|
|
||||||
// Skip empty lines or comment lines
|
// Skip empty lines or comment lines
|
||||||
if (line.trim().length() == 0) {
|
if (line.trim().length() == 0) {
|
||||||
|
|
|
@ -99,4 +99,16 @@ public class UserDictionaryTest extends LuceneTestCase {
|
||||||
assertTrue(e.getMessage().contains("does not match the surface form"));
|
assertTrue(e.getMessage().contains("does not match the surface form"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSharp() throws IOException {
|
||||||
|
String[] inputs = {"テスト#", "テスト#テスト"};
|
||||||
|
UserDictionary dictionary = TestJapaneseTokenizer.readDict();
|
||||||
|
|
||||||
|
for (String input: inputs) {
|
||||||
|
System.out.println(input);
|
||||||
|
int[][] result = dictionary.lookup(input.toCharArray(), 0, input.length());
|
||||||
|
assertEquals("カスタム名刺", dictionary.getPartOfSpeech(result[0][0]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,3 +8,7 @@
|
||||||
# Silly entry:
|
# Silly entry:
|
||||||
abcd,a b cd,foo1 foo2 foo3,bar
|
abcd,a b cd,foo1 foo2 foo3,bar
|
||||||
abcdefg,ab cd efg,foo1 foo2 foo4,bar
|
abcdefg,ab cd efg,foo1 foo2 foo4,bar
|
||||||
|
|
||||||
|
# sharp test
|
||||||
|
test#テスト,test # テスト,test # テスト,カスタム名刺
|
||||||
|
テスト#,テスト #,テスト #,カスタム名刺
|
||||||
|
|
Loading…
Reference in New Issue