LUCENE-8863: enhance Kuromoji DictionaryBuilder tool

added tests enabled ids up to 8191 support loading custom system dictionary from filesystem or classpath
2019-06-15 14:59:46 -04:00 · 2019-06-15 14:59:46 -04:00 · 4502065f03
parent df3011a539
commit 4502065f03
7 changed files with 185 additions and 41 deletions
--- a/lucene/analysis/kuromoji/build.xml
+++ b/lucene/analysis/kuromoji/build.xml
@ -136,8 +136,8 @@
     </compile>
  </target>
-  <target name="test-tools" depends="compile-tools-tests">
+  <target name="test-tools" depends="install-junit4-taskdef, compile-tools-tests">
-    <test-macro dataDir="src/tools/test" junit.classpath="tools.test.classpath"/>
+    <test-macro testsDir="${build.dir}/classes/tools-test" workDir="src/tools/test" junit.classpath="tools.test.classpath"/>
  </target>
  <target name="compile-test" depends="module-build.compile-test, compile-tools-tests"/>
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/BinaryDictionary.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/BinaryDictionary.java
@ -23,6 +23,8 @@ import java.io.IOException;
 import java.io.FileNotFoundException;
 import java.io.InputStream;
 import java.nio.ByteBuffer;
 import java.nio.file.Files;
 import java.nio.file.Paths;
 import java.nio.channels.Channels;
 import java.nio.channels.ReadableByteChannel;
@ -37,6 +39,10 @@ import org.apache.lucene.util.IOUtils;
 */
 public abstract class BinaryDictionary implements Dictionary {
  enum ResourceScheme {
    CLASSPATH, FILE
  }
  public static final String DICT_FILENAME_SUFFIX = "$buffer.dat";
  public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat";
  public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat";
@ -46,6 +52,8 @@ public abstract class BinaryDictionary implements Dictionary {
  public static final String POSDICT_HEADER = "kuromoji_dict_pos";
  public static final int VERSION = 1;
  private final ResourceScheme resourceScheme;
  private final String resourcePath;
  private final ByteBuffer buffer;
  private final int[] targetMapOffsets, targetMap;
  private final String[] posDict;
@ -53,6 +61,24 @@ public abstract class BinaryDictionary implements Dictionary {
  private final String[] inflFormDict;
  protected BinaryDictionary() throws IOException {
    this(ResourceScheme.CLASSPATH, null);
  }
  /**
   * @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
   * @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH scheme only, use
   * this class's name as the path.
   */
  protected BinaryDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
    this.resourceScheme = resourceScheme;
    if (resourcePath == null) {
      if (resourceScheme != ResourceScheme.CLASSPATH) {
        throw new IllegalArgumentException("resourcePath must be supplied with FILE resource scheme");
      }
      this.resourcePath = getClass().getName().replace('.', '/');
    } else {
      this.resourcePath = resourcePath;
    }
    InputStream mapIS = null, dictIS = null, posIS = null;
    int[] targetMapOffsets = null, targetMap = null;
    String[] posDict = null;
@ -78,7 +104,9 @@ public abstract class BinaryDictionary implements Dictionary {
        targetMap[ofs] = accum;
      }
      if (sourceId + 1 != targetMapOffsets.length)
-        throw new IOException("targetMap file format broken");
+        throw new IOException("targetMap file format broken; targetMap.length=" + targetMap.length
                              + ", targetMapOffsets.length=" + targetMapOffsets.length
                              + ", sourceId=" + sourceId);
      targetMapOffsets[sourceId] = targetMap.length;
      mapIS.close(); mapIS = null;
@ -135,17 +163,33 @@ public abstract class BinaryDictionary implements Dictionary {
  }
  protected final InputStream getResource(String suffix) throws IOException {
-    return getClassResource(getClass(), suffix);
+    switch(resourceScheme) {
      case CLASSPATH:
        return getClassResource(resourcePath + suffix);
      case FILE:
        return Files.newInputStream(Paths.get(resourcePath + suffix));
      default:
        throw new IllegalStateException("unknown resource scheme " + resourceScheme);
    }
  }
  // util, reused by ConnectionCosts and CharacterDefinition
  public static final InputStream getClassResource(Class<?> clazz, String suffix) throws IOException {
    final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix);
-    if (is == null)
+    if (is == null) {
      throw new FileNotFoundException("Not in classpath: " + clazz.getName().replace('.','/') + suffix);
    }
    return is;
  }
  private InputStream getClassResource(String path) throws IOException {
    final InputStream is = BinaryDictionary.class.getClassLoader().getResourceAsStream(path);
    if (is == null) {
      throw new FileNotFoundException("Not in classpath: " + path);
    }
    return is;
  }
  public void lookupWordIds(int sourceId, IntsRef ref) {
    ref.ints = targetMap;
    ref.offset = targetMapOffsets[sourceId];
@ -155,12 +199,12 @@ public abstract class BinaryDictionary implements Dictionary {
  @Override
  public int getLeftId(int wordId) {
-    return buffer.getShort(wordId) >>> 3;
+    return (buffer.getShort(wordId) & 0xffff) >>> 3;
  }
  @Override
  public int getRightId(int wordId) {
-    return buffer.getShort(wordId) >>> 3;
+    return (buffer.getShort(wordId) & 0xffff) >>> 3;
  }
  @Override
--- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java
+++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionary.java
@ -36,10 +36,15 @@ public final class TokenInfoDictionary extends BinaryDictionary {
  private final TokenInfoFST fst;
-  private TokenInfoDictionary() throws IOException {
+  /**
-    super();
+   * @param resourceScheme - scheme for loading resources (FILE or CLASSPATH).
   * @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH scheme only, use
   * this class's name as the path.
   */
  TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException {
    super(resourceScheme, resourcePath);
    InputStream is = null;
-    FST<Long> fst = null;
+    FST<Long> fst;
    boolean success = false;
    try {
      is = getResource(FST_FILENAME_SUFFIX);
@ -56,6 +61,10 @@ public final class TokenInfoDictionary extends BinaryDictionary {
    // TODO: some way to configure?
    this.fst = new TokenInfoFST(fst, true);
  }
  private TokenInfoDictionary() throws IOException {
    this(ResourceScheme.CLASSPATH, null);
  }
  public TokenInfoFST getFST() {
    return fst;
--- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/BinaryDictionaryWriter.java
+++ b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/BinaryDictionaryWriter.java
@ -35,6 +35,8 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.analysis.ja.dict.BinaryDictionary;
 public abstract class BinaryDictionaryWriter {
  private final static int ID_LIMIT = 8192;
  protected final Class<? extends BinaryDictionary> implClazz;
  protected ByteBuffer buffer;
  private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1;
@ -71,7 +73,9 @@ public abstract class BinaryDictionaryWriter {
    }
    String posData = sb.toString();
-    
+    if (posData.isEmpty()) {
        throw new IllegalArgumentException("POS fields are empty");
    }
    sb.setLength(0);
    sb.append(CSVUtil.quoteEscape(posData));
    sb.append(',');
@ -100,6 +104,9 @@ public abstract class BinaryDictionaryWriter {
    }
    int flags = 0;
    if (baseForm.isEmpty()) {
        throw new IllegalArgumentException("base form is empty");
    }
    if (!("*".equals(baseForm) || baseForm.equals(entry[0]))) {
      flags |= BinaryDictionary.HAS_BASEFORM;
    }
@ -110,8 +117,12 @@ public abstract class BinaryDictionaryWriter {
      flags |= BinaryDictionary.HAS_PRONUNCIATION;
    }
-    assert leftId == rightId;
+    if (leftId != rightId) {
-    assert leftId < 4096; // there are still unused bits
+        throw new IllegalArgumentException("rightId != leftId: " + rightId + " " +leftId);
    }
    if (leftId >= ID_LIMIT) {
        throw new IllegalArgumentException("leftId >= " + ID_LIMIT + ": " + leftId);
    }
    // add pos mapping
    int toFill = 1+leftId - posDict.size();
    for (int i = 0; i < toFill; i++) {
@ -119,14 +130,19 @@ public abstract class BinaryDictionaryWriter {
    }
    String existing = posDict.get(leftId);
-    assert existing == null || existing.equals(fullPOSData);
+    if (existing != null && existing.equals(fullPOSData) == false) {
        // TODO: test me
        throw new IllegalArgumentException("Multiple entries found for leftID=" + leftId);
    }
    posDict.set(leftId, fullPOSData);
    buffer.putShort((short)(leftId << 3 | flags));
    buffer.putShort(wordCost);
    if ((flags & BinaryDictionary.HAS_BASEFORM) != 0) {
-      assert baseForm.length() < 16;
+      if (baseForm.length() >= 16) {
        throw new IllegalArgumentException("Length of base form " + baseForm + " is >= 16");
      }
      int shared = sharedPrefix(entry[0], baseForm);
      int suffix = baseForm.length() - shared;
      buffer.put((byte) (shared << 4 | suffix));
@ -204,16 +220,17 @@ public abstract class BinaryDictionaryWriter {
  }
  public void addMapping(int sourceId, int wordId) {
-    assert wordId > lastWordId : "words out of order: " + wordId + " vs lastID: " + lastWordId;
+    if (wordId <= lastWordId) {
      throw new IllegalStateException("words out of order: " + wordId + " vs lastID: " + lastWordId);
    }
    if (sourceId > lastSourceId) {
      assert sourceId > lastSourceId : "source ids out of order: lastSourceId=" + lastSourceId + " vs sourceId=" + sourceId;
      targetMapOffsets = ArrayUtil.grow(targetMapOffsets, sourceId + 1);
      for (int i = lastSourceId + 1; i <= sourceId; i++) {
        targetMapOffsets[i] = targetMapEndOffset;
      }
-    } else {
+    } else if (sourceId != lastSourceId) {
-      assert sourceId == lastSourceId;
+      throw new IllegalStateException("source ids not in increasing order: lastSourceId=" + lastSourceId + " vs sourceId=" + sourceId);
    }
    targetMap = ArrayUtil.grow(targetMap, targetMapEndOffset + 1);
@ -265,7 +282,9 @@ public abstract class BinaryDictionaryWriter {
        }
        prev += delta;
      }
-      assert sourceId == numSourceIds : "sourceId:"+sourceId+" != numSourceIds:"+numSourceIds;
+      if (sourceId != numSourceIds) {
        throw new IllegalStateException("sourceId:" + sourceId + " != numSourceIds:" + numSourceIds);
      }
    } finally {
      os.close();
    }
@ -286,7 +305,9 @@ public abstract class BinaryDictionaryWriter {
          out.writeByte((byte)0);
        } else {
          String data[] = CSVUtil.parse(s);
-          assert data.length == 3 : "malformed pos/inflection: " + s;
+          if (data.length != 3) {
            throw new IllegalArgumentException("Malformed pos/inflection: " + s + "; expected 3 characters");
          }
          out.writeString(data[0]);
          out.writeString(data[1]);
          out.writeString(data[2]);
--- a/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
+++ b/lucene/analysis/kuromoji/src/tools/java/org/apache/lucene/analysis/ja/util/TokenInfoDictionaryBuilder.java
@ -96,8 +96,7 @@ public class TokenInfoDictionaryBuilder {
        String[] entry = CSVUtil.parse(line);
        if(entry.length < 13) {
-          System.out.println("Entry in CSV is not valid: " + line);
+          throw new IllegalArgumentException("Entry in CSV is not valid (13 field values expected): " + line);
          continue;
        }
        String[] formatted = formatEntry(entry);
--- a/lucene/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryTest.java
+++ b/lucene/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryTest.java
@ -0,0 +1,85 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.analysis.ja.dict;
 import java.io.OutputStream;
 import java.io.OutputStreamWriter;
 import java.io.PrintWriter;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
 import org.apache.lucene.analysis.ja.util.TokenInfoDictionaryBuilder;
 import org.apache.lucene.analysis.ja.util.TokenInfoDictionaryWriter;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.IntsRefBuilder;
 import org.apache.lucene.util.LuceneTestCase;
 import static java.io.File.separatorChar;
 import static org.apache.lucene.analysis.ja.dict.BinaryDictionary.ResourceScheme;
 /**
 * Tests of TokenInfoDictionary build tools; run using ant test-tools
 */
 public class TokenInfoDictionaryTest extends LuceneTestCase {
  public void testPut() throws Exception {
    TokenInfoDictionary dict = newDictionary("名詞,1,1,2,名詞,一般,*,*,*,*,*,*,*",
                                               // "large" id
                                               "一般,5000,5000,3,名詞,一般,*,*,*,*,*,*,*");
    IntsRef wordIdRef = new IntsRefBuilder().get();
    dict.lookupWordIds(0, wordIdRef);
    int wordId = wordIdRef.ints[wordIdRef.offset];
    assertEquals(5000, dict.getLeftId(wordId));
    assertEquals(5000, dict.getRightId(wordId));
    assertEquals(3, dict.getWordCost(wordId));
    dict.lookupWordIds(1, wordIdRef);
    wordId = wordIdRef.ints[wordIdRef.offset];
    assertEquals(1, dict.getLeftId(wordId));
    assertEquals(1, dict.getRightId(wordId));
    assertEquals(2, dict.getWordCost(wordId));
  }
  private TokenInfoDictionary newDictionary(String... entries) throws Exception {
    Path dir = createTempDir();
    try (OutputStream out = Files.newOutputStream(dir.resolve("test.csv"));
         PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, "utf-8"))) {
      for (String entry : entries) {
        printer.println(entry);
      }
    }
    TokenInfoDictionaryBuilder builder = new TokenInfoDictionaryBuilder(DictionaryFormat.IPADIC, "utf-8", true);
    TokenInfoDictionaryWriter writer = builder.build(dir.toString());
    writer.write(dir.toString());
    String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', separatorChar);
    // We must also load the other files (in BinaryDictionary) from the correct path
    return new TokenInfoDictionary(ResourceScheme.FILE, dir.resolve(dictionaryPath).toString());
  }
  public void testPutException() throws Exception {
    // too few columns
    expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1,1,1,名詞,一般,*,*,*,*,*"));
    // left id != right id
    expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1285,1,1,名詞,一般,*,*,*,*,*,*,*"));
    // left id != right id
    expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,1285,1,1,名詞,一般,*,*,*,*,*,*,*"));
    // id too large
    expectThrows(IllegalArgumentException.class, () -> newDictionary("KANJI,8192,8192,1,名詞,一般,*,*,*,*,*,*,*"));
  }
 }
--- a/lucene/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/ja/dict/UnknownDictionaryTest.java
+++ b/lucene/analysis/kuromoji/src/tools/test/org/apache/lucene/analysis/ja/dict/UnknownDictionaryTest.java
@ -29,19 +29,9 @@ public class UnknownDictionaryTest extends LuceneTestCase {
  public void testPutCharacterCategory() {
    UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);
-    try{
+    expectThrows(Exception.class, () -> unkDic.putCharacterCategory(0, "DUMMY_NAME"));
      unkDic.putCharacterCategory(0, "DUMMY_NAME");
      fail();
    } catch(Exception e) {
    }
-    try{
+    expectThrows(Exception.class, () -> unkDic.putCharacterCategory(-1, "KATAKANA"));
      unkDic.putCharacterCategory(-1, "KATAKANA");
      fail();
    } catch(Exception e) {
    }
    unkDic.putCharacterCategory(0, "DEFAULT");
    unkDic.putCharacterCategory(1, "GREEK");
@ -53,12 +43,8 @@ public class UnknownDictionaryTest extends LuceneTestCase {
  @Test
  public void testPut() {
    UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024);
-    try{
+    expectThrows(NumberFormatException.class, () ->
-      unkDic.put(CSVUtil.parse("KANJI,1285,11426,名詞,一般,*,*,*,*,*,*,*"));
+                 unkDic.put(CSVUtil.parse("KANJI,1285,11426,名詞,一般,*,*,*,*,*,*,*")));
      fail();
    } catch(Exception e){
    }
    String entry1 = "ALPHA,1285,1285,13398,名詞,一般,*,*,*,*,*,*,*";
    String entry2 = "HIRAGANA,1285,1285,13069,名詞,一般,*,*,*,*,*,*,*";