diff --git a/lucene/analysis/nori/build.xml b/lucene/analysis/nori/build.xml index dacf3a9436a..0938de2a5f3 100644 --- a/lucene/analysis/nori/build.xml +++ b/lucene/analysis/nori/build.xml @@ -123,8 +123,8 @@ - - + + diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/BinaryDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/BinaryDictionary.java index b7a36120211..8649837b8cd 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/BinaryDictionary.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/BinaryDictionary.java @@ -24,6 +24,8 @@ import java.io.InputStream; import java.nio.ByteBuffer; import java.nio.channels.Channels; import java.nio.channels.ReadableByteChannel; +import java.nio.file.Files; +import java.nio.file.Paths; import org.apache.lucene.analysis.ko.POS; import org.apache.lucene.codecs.CodecUtil; @@ -36,6 +38,14 @@ import org.apache.lucene.util.IntsRef; * Base class for a binary-encoded in-memory dictionary. */ public abstract class BinaryDictionary implements Dictionary { + + /** + * Used to specify where (dictionary) resources get loaded from. 
+ */ + public enum ResourceScheme { + CLASSPATH, FILE + } + public static final String TARGETMAP_FILENAME_SUFFIX = "$targetMap.dat"; public static final String DICT_FILENAME_SUFFIX = "$buffer.dat"; public static final String POSDICT_FILENAME_SUFFIX = "$posDict.dat"; @@ -45,11 +55,31 @@ public abstract class BinaryDictionary implements Dictionary { public static final String POSDICT_HEADER = "ko_dict_pos"; public static final int VERSION = 1; + private final ResourceScheme resourceScheme; + private final String resourcePath; private final ByteBuffer buffer; private final int[] targetMapOffsets, targetMap; private final POS.Tag[] posDict; protected BinaryDictionary() throws IOException { + this(ResourceScheme.CLASSPATH, null); + } + + /** + * @param resourceScheme - scheme for loading resources (FILE or CLASSPATH). + * @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH scheme only, use + * this class's name as the path. + */ + protected BinaryDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException { + this.resourceScheme = resourceScheme; + if (resourcePath == null) { + if (resourceScheme != ResourceScheme.CLASSPATH) { + throw new IllegalArgumentException("resourcePath must be supplied with FILE resource scheme"); + } + this.resourcePath = getClass().getName().replace('.', '/'); + } else { + this.resourcePath = resourcePath; + } InputStream mapIS = null, dictIS = null, posIS = null; int[] targetMapOffsets = null, targetMap = null; ByteBuffer buffer = null; @@ -72,7 +102,9 @@ public abstract class BinaryDictionary implements Dictionary { targetMap[ofs] = accum; } if (sourceId + 1 != targetMapOffsets.length) - throw new IOException("targetMap file format broken"); + throw new IOException("targetMap file format broken; targetMap.length=" + targetMap.length + + ", targetMapOffsets.length=" + targetMapOffsets.length + + ", sourceId=" + sourceId); targetMapOffsets[sourceId] = targetMap.length; mapIS.close(); 
mapIS = null; @@ -103,9 +135,9 @@ public abstract class BinaryDictionary implements Dictionary { success = true; } finally { if (success) { - IOUtils.close(mapIS, dictIS); + IOUtils.close(mapIS, posIS, dictIS); } else { - IOUtils.closeWhileHandlingException(mapIS, dictIS); + IOUtils.closeWhileHandlingException(mapIS, posIS, dictIS); } } @@ -115,14 +147,30 @@ public abstract class BinaryDictionary implements Dictionary { } protected final InputStream getResource(String suffix) throws IOException { - return getClassResource(getClass(), suffix); + switch(resourceScheme) { + case CLASSPATH: + return getClassResource(resourcePath + suffix); + case FILE: + return Files.newInputStream(Paths.get(resourcePath + suffix)); + default: + throw new IllegalStateException("unknown resource scheme " + resourceScheme); + } } // util, reused by ConnectionCosts and CharacterDefinition public static final InputStream getClassResource(Class clazz, String suffix) throws IOException { final InputStream is = clazz.getResourceAsStream(clazz.getSimpleName() + suffix); - if (is == null) - throw new FileNotFoundException("Not in classpath: " + clazz.getName().replace('.','/') + suffix); + if (is == null) { + throw new FileNotFoundException("Not in classpath: " + clazz.getName().replace('.', '/') + suffix); + } + return is; + } + + private InputStream getClassResource(String path) throws IOException { + final InputStream is = BinaryDictionary.class.getClassLoader().getResourceAsStream(path); + if (is == null) { + throw new FileNotFoundException("Not in classpath: " + path); + } return is; } diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java index 94408c7fe0e..7c645a5cc11 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java +++ 
b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionary.java @@ -34,11 +34,20 @@ public final class TokenInfoDictionary extends BinaryDictionary { public static final String FST_FILENAME_SUFFIX = "$fst.dat"; private final TokenInfoFST fst; - + private TokenInfoDictionary() throws IOException { - super(); + this(ResourceScheme.CLASSPATH, null); + } + + /** + * @param resourceScheme - scheme for loading resources (FILE or CLASSPATH). + * @param resourcePath - where to load resources (dictionaries) from. If null, with CLASSPATH scheme only, use + * this class's name as the path. + */ + TokenInfoDictionary(ResourceScheme resourceScheme, String resourcePath) throws IOException { + super(resourceScheme, resourcePath); InputStream is = null; - FST fst = null; + FST fst; boolean success = false; try { is = getResource(FST_FILENAME_SUFFIX); diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java index b77d1baa4c1..db57d4fd66f 100644 --- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java +++ b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/BinaryDictionaryWriter.java @@ -38,6 +38,8 @@ import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.analysis.ko.dict.BinaryDictionary; public abstract class BinaryDictionaryWriter { + private final static int ID_LIMIT = 8192; + protected final Class implClazz; protected ByteBuffer buffer; private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1; @@ -116,7 +118,7 @@ public abstract class BinaryDictionaryWriter { if (posType != POS.Type.MORPHEME && expression.length() > 0) { String[] exprTokens = expression.split("\\+"); for (int i = 0; i < exprTokens.length; i++) { - String[] tokenSplit = exprTokens[i].split("\\/"); + String[] tokenSplit = 
exprTokens[i].split("/"); assert tokenSplit.length == 3; String surfaceForm = tokenSplit[0].trim(); if (surfaceForm.isEmpty() == false) { @@ -137,8 +139,12 @@ public abstract class BinaryDictionaryWriter { flags |= BinaryDictionary.HAS_READING; } - assert leftId < 8192; // there are still unused bits - assert posType.ordinal() < 4; + if (leftId >= ID_LIMIT) { + throw new IllegalArgumentException("leftId >= " + ID_LIMIT + ": " + leftId); + } + if (posType.ordinal() >= 4) { + throw new IllegalArgumentException("posType.ordinal() >= " + 4 + ": " + posType.name()); + } buffer.putShort((short)(leftId << 2 | posType.ordinal())); buffer.putShort((short) (rightId << 2 | flags)); buffer.putShort(wordCost); @@ -178,16 +184,17 @@ public abstract class BinaryDictionaryWriter { } public void addMapping(int sourceId, int wordId) { - assert wordId > lastWordId : "words out of order: " + wordId + " vs lastID: " + lastWordId; + if (wordId <= lastWordId) { + throw new IllegalStateException("words out of order: " + wordId + " vs lastID: " + lastWordId); + } if (sourceId > lastSourceId) { - assert sourceId > lastSourceId : "source ids out of order: lastSourceId=" + lastSourceId + " vs sourceId=" + sourceId; targetMapOffsets = ArrayUtil.grow(targetMapOffsets, sourceId + 1); for (int i = lastSourceId + 1; i <= sourceId; i++) { targetMapOffsets[i] = targetMapEndOffset; } - } else { - assert sourceId == lastSourceId; + } else if (sourceId != lastSourceId) { + throw new IllegalStateException("source ids not in increasing order: lastSourceId=" + lastSourceId + " vs sourceId=" + sourceId); } targetMap = ArrayUtil.grow(targetMap, targetMapEndOffset + 1); @@ -236,7 +243,9 @@ public abstract class BinaryDictionaryWriter { } prev += delta; } - assert sourceId == numSourceIds : "sourceId:"+sourceId+" != numSourceIds:"+numSourceIds; + if (sourceId != numSourceIds) { + throw new IllegalStateException("sourceId:" + sourceId + " != numSourceIds:" + numSourceIds); + } } finally { os.close(); } @@ 
-254,8 +263,10 @@ public abstract class BinaryDictionaryWriter { if (s == null) { out.writeByte((byte) POS.Tag.UNKNOWN.ordinal()); } else { - String data[] = CSVUtil.parse(s); - assert data.length == 2 : "malformed pos/semanticClass: " + s; + String[] data = CSVUtil.parse(s); + if (data.length != 2) { + throw new IllegalArgumentException("Malformed pos/inflection: " + s + "; expected 2 fields"); + } out.writeByte((byte) POS.Tag.valueOf(data[0]).ordinal()); } } diff --git a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java index d5fb73f4c94..6609f50ed54 100644 --- a/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java +++ b/lucene/analysis/nori/src/tools/java/org/apache/lucene/analysis/ko/util/TokenInfoDictionaryBuilder.java @@ -84,8 +84,7 @@ public class TokenInfoDictionaryBuilder { String[] entry = CSVUtil.parse(line); if(entry.length < 12) { - System.out.println("Entry in CSV is not valid: " + line); - continue; + throw new IllegalArgumentException("Entry in CSV is not valid (12 field values expected): " + line); } // NFKC normalize dictionary entry diff --git a/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java b/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java new file mode 100644 index 00000000000..492abea664b --- /dev/null +++ b/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryTest.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.ko.dict; + +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.apache.lucene.analysis.ko.util.TokenInfoDictionaryBuilder; +import org.apache.lucene.analysis.ko.util.TokenInfoDictionaryWriter; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.IntsRefBuilder; +import org.apache.lucene.util.LuceneTestCase; + +import static java.io.File.separatorChar; +import static org.apache.lucene.analysis.ko.dict.BinaryDictionary.ResourceScheme; + +/** + * Tests of TokenInfoDictionary build tools; run using ant test-tools + */ +public class TokenInfoDictionaryTest extends LuceneTestCase { + + public void testPut() throws Exception { + TokenInfoDictionary dict = newDictionary("명사,1,1,2,NNG,*,*,*,*,*,*,*", + // "large" id + "일반,5000,5000,3,NNG,*,*,*,*,*,*,*"); + IntsRef wordIdRef = new IntsRefBuilder().get(); + + dict.lookupWordIds(0, wordIdRef); + int wordId = wordIdRef.ints[wordIdRef.offset]; + assertEquals(1, dict.getLeftId(wordId)); + assertEquals(1, dict.getRightId(wordId)); + assertEquals(2, dict.getWordCost(wordId)); + + dict.lookupWordIds(1, wordIdRef); + wordId = wordIdRef.ints[wordIdRef.offset]; + assertEquals(5000, dict.getLeftId(wordId)); + assertEquals(5000, dict.getRightId(wordId)); + assertEquals(3, 
dict.getWordCost(wordId)); + } + + private TokenInfoDictionary newDictionary(String... entries) throws Exception { + Path dir = createTempDir(); + try (OutputStream out = Files.newOutputStream(dir.resolve("test.csv")); + PrintWriter printer = new PrintWriter(new OutputStreamWriter(out, "utf-8"))) { + for (String entry : entries) { + printer.println(entry); + } + } + TokenInfoDictionaryBuilder builder = new TokenInfoDictionaryBuilder("utf-8", true); + TokenInfoDictionaryWriter writer = builder.build(dir.toString()); + writer.write(dir.toString()); + String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', separatorChar); + // We must also load the other files (in BinaryDictionary) from the correct path + return new TokenInfoDictionary(ResourceScheme.FILE, dir.resolve(dictionaryPath).toString()); + } + + public void testPutException() throws Exception { + // too few columns + expectThrows(IllegalArgumentException.class, () -> newDictionary("HANGUL,1,1,1,NNG,*,*,*,*,*")); + // id too large + expectThrows(IllegalArgumentException.class, () -> newDictionary("HANGUL,8192,8192,1,NNG,*,*,*,*,*,*,*")); + } +} diff --git a/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/UnknownDictionaryTest.java b/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/UnknownDictionaryTest.java index 2bba7149567..cf5d6b7ee1d 100644 --- a/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/UnknownDictionaryTest.java +++ b/lucene/analysis/nori/src/tools/test/org/apache/lucene/analysis/ko/dict/UnknownDictionaryTest.java @@ -27,20 +27,10 @@ public class UnknownDictionaryTest extends LuceneTestCase { @Test public void testPutCharacterCategory() { UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024); - - try{ - unkDic.putCharacterCategory(0, "DUMMY_NAME"); - fail(); - } catch(Exception e) { - - } - - try{ - unkDic.putCharacterCategory(-1, "HANGUL"); - fail(); - } catch(Exception e) { - - } + + 
expectThrows(Exception.class, () -> unkDic.putCharacterCategory(0, "DUMMY_NAME")); + + expectThrows(Exception.class, () -> unkDic.putCharacterCategory(-1, "HANGUL")); unkDic.putCharacterCategory(0, "DEFAULT"); unkDic.putCharacterCategory(1, "GREEK"); @@ -52,12 +42,8 @@ public class UnknownDictionaryTest extends LuceneTestCase { @Test public void testPut() { UnknownDictionaryWriter unkDic = new UnknownDictionaryWriter(10 * 1024 * 1024); - try{ - unkDic.put(CSVUtil.parse("HANGUL,1800,3562,UNKNOWN,*,*,*,*,*,*,*")); - fail(); - } catch(Exception e){ - - } + expectThrows(NumberFormatException.class, () -> + unkDic.put(CSVUtil.parse("HANGUL,1800,3562,UNKNOWN,*,*,*,*,*,*,*"))); String entry1 = "ALPHA,1793,3533,795,SL,*,*,*,*,*,*,*"; String entry2 = "HANGUL,1800,3562,10247,UNKNOWN,*,*,*,*,*,*,*";