LUCENE-8596: Treat hash mark as comment only at beginning of line in kuromoji

user dictionary. Via Masaru Hasegawa and Satoshi Kato
2019-12-21 13:25:09 -05:00 · 2019-12-21 13:25:09 -05:00 · 93309e9728
parent c4f68bdab9 3cfe250403
commit 93309e9728
4 changed files with 21 additions and 1 deletion
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -58,6 +58,10 @@ Improvements
  
 * LUCENE-8984: MoreLikeThis MLT is biased for uncommon fields (Andy Hind via Anshum Gupta)

+* LUCENE-8596: Kuromoji user dictionary now accepts entries containing hash mark (#) that were
+  previously treated as beginning a line-ending comment (Satoshi Kato and Masaru Hasegawa via
+  Michael Sokolov)
+
 Bug fixes

 * LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while 
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java
@@ -65,7 +65,7 @@ public final class UserDictionary implements Dictionary {
    // text, segmentation, readings, POS
    while ((line = br.readLine()) != null) {
      // Remove comments
-      line = line.replaceAll("#.*$", "");
+      line = line.replaceAll("^#.*$", "");

      // Skip empty lines or comment lines
      if (line.trim().length() == 0) {
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/UserDictionaryTest.java
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/UserDictionaryTest.java
@@ -99,4 +99,16 @@ public class UserDictionaryTest extends LuceneTestCase {
    assertTrue(e.getMessage().contains("does not match the surface form"));
  }

+  @Test
+  public void testSharp() throws IOException {
+    String[] inputs = {"テスト#", "テスト#テスト"};
+    UserDictionary dictionary = TestJapaneseTokenizer.readDict();
+
+    for (String input: inputs) {
+      System.out.println(input);
+      int[][] result = dictionary.lookup(input.toCharArray(), 0, input.length());
+      assertEquals("カスタム名刺", dictionary.getPartOfSpeech(result[0][0]));
+    }
+  }
+
 }
--- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/userdict.txt
+++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/userdict.txt
@@ -8,3 +8,7 @@
 # Silly entry:
 abcd,a b cd,foo1 foo2 foo3,bar
 abcdefg,ab cd efg,foo1 foo2 foo4,bar
+
+# sharp test
+test#テスト,test # テスト,test # テスト,カスタム名刺
+テスト#,テスト #,テスト #,カスタム名刺