diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index eb7ab8a4983..e3adc1ab310 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -58,6 +58,10 @@ Improvements
 
 * LUCENE-8984: MoreLikeThis MLT is biased for uncommon fields (Andy Hind via Anshum Gupta)
 
+* LUCENE-8596: Kuromoji user dictionary now accepts entries containing a hash mark (#) that were
+  previously treated as beginning a line-ending comment (Satoshi Kato and Masaru Hasegawa via
+  Michael Sokolov)
+
 Bug fixes
 
 * LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while
diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
index 120fbb580df..16b0721ca80 100644
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
@@ -65,7 +65,7 @@ public final class UserDictionary implements Dictionary {
     // text, segmentation, readings, POS
     while ((line = br.readLine()) != null) {
-      // Remove comments
-      line = line.replaceAll("#.*$", "");
+      // Remove whole-line comments only (optionally indented); a '#' after other text is entry data
+      line = line.replaceAll("^\\s*#.*$", "");
 
       // Skip empty lines or comment lines
       if (line.trim().length() == 0) {
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/UserDictionaryTest.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/UserDictionaryTest.java
index dae03d78e9b..2265a0f18f2 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/UserDictionaryTest.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/UserDictionaryTest.java
@@ -99,4 +99,17 @@ public class UserDictionaryTest extends LuceneTestCase {
     assertTrue(e.getMessage().contains("does not match the surface form"));
   }
 
+  /** Entries whose surface form contains '#' must be loaded, not cut off as a comment. */
+  @Test
+  public void testSharp() throws IOException {
+    String[] inputs = {"テスト#", "テスト#テスト"};
+    UserDictionary dictionary = TestJapaneseTokenizer.readDict();
+
+    for (String input: inputs) {
+      // Both inputs begin with the "テスト#" surface form added to userdict.txt
+      int[][] result = dictionary.lookup(input.toCharArray(), 0, input.length());
+      assertEquals("カスタム名刺", dictionary.getPartOfSpeech(result[0][0]));
+    }
+  }
+
 }
diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/userdict.txt b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/userdict.txt
index f9db02c22ca..8c4ab4e3f9e 100644
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/userdict.txt
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/userdict.txt
@@ -8,3 +8,7 @@
 # Silly entry:
 abcd,a b cd,foo1 foo2 foo3,bar
 abcdefg,ab cd efg,foo1 foo2 foo4,bar
+
+# sharp test
+test#テスト,test # テスト,test # テスト,カスタム名刺
+テスト#,テスト #,テスト #,カスタム名刺