LUCENE-10475: Merge o.a.l.a.[ja|ko].util into o.a.l.a.[ja|ko].dict (#772)

This commit is contained in:
Tomoko Uchida 2022-03-29 21:09:26 +09:00 committed by GitHub
parent ac6c36d406
commit 2a3e5ca07f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
44 changed files with 42 additions and 106 deletions

View File

@ -19,7 +19,7 @@
def recompileDictionary(project, dictionaryName, Closure closure) {
project.javaexec {
main = "org.apache.lucene.analysis.ja.util.DictionaryBuilder"
main = "org.apache.lucene.analysis.ja.dict.DictionaryBuilder"
classpath = project.sourceSets.main.runtimeClasspath
jvmArgs '-Xmx1G'

View File

@ -19,7 +19,7 @@
def recompileDictionary(project, dictionaryName, Closure closure) {
project.javaexec {
main = "org.apache.lucene.analysis.ko.util.DictionaryBuilder"
main = "org.apache.lucene.analysis.ko.dict.DictionaryBuilder"
classpath = project.sourceSets.main.runtimeClasspath
jvmArgs '-Xmx1G'

View File

@ -52,6 +52,10 @@ Other
* LUCENE-10393: Unify binary dictionary and dictionary writer in Kuromoji and Nori.
(Tomoko Uchida, Robert Muir)
* LUCENE-10475: Merge dictionary builders in `util` package into `dict` package in Kuromoji and Nori.
All classes in `org.apache.lucene.analysis.[ja|ko].util` were moved to `org.apache.lucene.analysis.[ja|ko].dict`.
(Tomoko Uchida)
======================= Lucene 9.2.0 =======================
API Changes
---------------------

View File

@ -24,7 +24,6 @@ module org.apache.lucene.analysis.kuromoji {
exports org.apache.lucene.analysis.ja.completion;
exports org.apache.lucene.analysis.ja.dict;
exports org.apache.lucene.analysis.ja.tokenattributes;
exports org.apache.lucene.analysis.ja.util;
opens org.apache.lucene.analysis.ja to
org.apache.lucene.core;

View File

@ -19,8 +19,8 @@ package org.apache.lucene.analysis.ja;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.dict.ToStringUtil;
import org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute;
import org.apache.lucene.analysis.ja.util.ToStringUtil;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**

View File

@ -265,7 +265,7 @@ public final class JapaneseTokenizer extends Tokenizer {
* Create a new JapaneseTokenizer, supplying a custom system dictionary and unknown dictionary.
* This constructor provides an entry point for users that want to construct custom language
* models that can be used as input to {@link
* org.apache.lucene.analysis.ja.util.DictionaryBuilder}.
* org.apache.lucene.analysis.ja.dict.DictionaryBuilder}.
*
* @param factory the AttributeFactory to use
* @param systemDictionary a custom known token dictionary

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.util;
package org.apache.lucene.analysis.ja.dict;
import java.util.ArrayList;
import java.util.regex.Matcher;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.util;
package org.apache.lucene.analysis.ja.dict;
import java.io.IOException;
import java.io.LineNumberReader;
@ -22,7 +22,6 @@ import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.ja.dict.ConnectionCosts;
import org.apache.lucene.analysis.morph.ConnectionCostsWriter;
class ConnectionCostsBuilder {

View File

@ -14,13 +14,12 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.util;
package org.apache.lucene.analysis.ja.dict;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Locale;
import org.apache.lucene.analysis.ja.dict.DictionaryConstants;
/**
* Tool to build dictionaries. Usage:

View File

@ -17,7 +17,7 @@
package org.apache.lucene.analysis.ja.dict;
/** Dictionary constants */
public final class DictionaryConstants {
final class DictionaryConstants {
/** Codec header of the dictionary file. */
public static final String DICT_HEADER = "kuromoji_dict";
/** Codec header of the dictionary mapping file. */

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.util;
package org.apache.lucene.analysis.ja.dict;
import java.io.IOException;
import java.util.HashMap;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.util;
package org.apache.lucene.analysis.ja.dict;
import java.io.BufferedReader;
import java.io.IOException;
@ -28,7 +28,6 @@ import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
@ -39,7 +38,7 @@ class TokenInfoDictionaryBuilder {
private final String encoding;
private final Normalizer.Form normalForm;
private final DictionaryFormat format;
private final DictionaryBuilder.DictionaryFormat format;
/**
* Internal word id - incrementally assigned as entries are read and added. This will be byte
@ -48,7 +47,7 @@ class TokenInfoDictionaryBuilder {
private int offset = 0;
public TokenInfoDictionaryBuilder(
DictionaryFormat format, String encoding, boolean normalizeEntries) {
DictionaryBuilder.DictionaryFormat format, String encoding, boolean normalizeEntries) {
this.format = format;
this.encoding = encoding;
normalForm = normalizeEntries ? Normalizer.Form.NFKC : null;
@ -159,7 +158,7 @@ class TokenInfoDictionaryBuilder {
*/
private String[] formatEntry(String[] features) {
if (this.format == DictionaryFormat.IPADIC) {
if (this.format == DictionaryBuilder.DictionaryFormat.IPADIC) {
return features;
} else {
String[] features2 = new String[13];

View File

@ -14,12 +14,11 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.util;
package org.apache.lucene.analysis.ja.dict;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import org.apache.lucene.analysis.ja.dict.TokenInfoMorphData;
import org.apache.lucene.analysis.morph.DictionaryEntryWriter;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.ArrayUtil;

View File

@ -14,14 +14,12 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.util;
package org.apache.lucene.analysis.ja.dict;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Objects;
import org.apache.lucene.analysis.ja.dict.DictionaryConstants;
import org.apache.lucene.analysis.ja.dict.TokenInfoDictionary;
import org.apache.lucene.util.fst.FST;
class TokenInfoDictionaryWriter

View File

@ -26,7 +26,7 @@ import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IOSupplier;
/** Morphological information for system dictionary. */
public class TokenInfoMorphData implements JaMorphData {
class TokenInfoMorphData implements JaMorphData {
private final ByteBuffer buffer;
private final String[] posDict;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.util;
package org.apache.lucene.analysis.ja.dict;
import java.io.IOException;
import java.io.LineNumberReader;
@ -25,7 +25,6 @@ import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
class UnknownDictionaryBuilder {
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";

View File

@ -14,13 +14,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.util;
package org.apache.lucene.analysis.ja.dict;
import java.io.IOException;
import java.nio.file.Path;
import org.apache.lucene.analysis.ja.dict.CharacterDefinition;
import org.apache.lucene.analysis.ja.dict.DictionaryConstants;
import org.apache.lucene.analysis.ja.dict.UnknownDictionary;
import org.apache.lucene.analysis.morph.BinaryDictionaryWriter;
import org.apache.lucene.analysis.morph.CharacterDefinitionWriter;

View File

@ -25,7 +25,6 @@ import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.lucene.analysis.ja.util.CSVUtil;
import org.apache.lucene.analysis.morph.Dictionary;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;

View File

@ -19,8 +19,6 @@ package org.apache.lucene.analysis.ja.dict;
import static org.apache.lucene.analysis.ja.dict.UserDictionary.CUSTOM_DICTIONARY_WORD_ID_OFFSET;
import static org.apache.lucene.analysis.ja.dict.UserDictionary.INTERNAL_SEPARATOR;
import org.apache.lucene.analysis.ja.util.CSVUtil;
/** Morphological information for user dictionary. */
final class UserMorphData implements JaMorphData {
public static final int WORD_COST = -100000;

View File

@ -17,7 +17,7 @@
package org.apache.lucene.analysis.ja.tokenattributes;
import org.apache.lucene.analysis.ja.Token;
import org.apache.lucene.analysis.ja.util.ToStringUtil;
import org.apache.lucene.analysis.ja.dict.ToStringUtil;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;

View File

@ -17,7 +17,7 @@
package org.apache.lucene.analysis.ja.tokenattributes;
import org.apache.lucene.analysis.ja.Token;
import org.apache.lucene.analysis.ja.util.ToStringUtil;
import org.apache.lucene.analysis.ja.dict.ToStringUtil;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;

View File

@ -17,7 +17,7 @@
package org.apache.lucene.analysis.ja.tokenattributes;
import org.apache.lucene.analysis.ja.Token;
import org.apache.lucene.analysis.ja.util.ToStringUtil;
import org.apache.lucene.analysis.ja.dict.ToStringUtil;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;

View File

@ -1,19 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** Kuromoji utility classes. */
package org.apache.lucene.analysis.ja.util;

View File

@ -17,7 +17,7 @@
package org.apache.lucene.analysis.ja;
import java.io.IOException;
import org.apache.lucene.analysis.ja.util.CSVUtil;
import org.apache.lucene.analysis.ja.dict.CSVUtil;
import org.apache.lucene.tests.util.LuceneTestCase;
/*

View File

@ -25,7 +25,6 @@ import java.io.BufferedWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.ja.util.DictionaryBuilder;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.junit.Before;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.util;
package org.apache.lucene.analysis.ja.dict;
import java.util.HashMap;
import java.util.Map;

View File

@ -27,9 +27,6 @@ import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.ja.util.DictionaryBuilder;
import org.apache.lucene.analysis.ja.util.DictionaryBuilder.DictionaryFormat;
import org.apache.lucene.analysis.ja.util.ToStringUtil;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
@ -77,7 +74,7 @@ public class TestTokenInfoDictionary extends LuceneTestCase {
new PrintWriter(new OutputStreamWriter(out, StandardCharsets.UTF_8))) {
printer.println("1 1");
}
DictionaryBuilder.build(DictionaryFormat.IPADIC, dir, dir, "utf-8", true);
DictionaryBuilder.build(DictionaryBuilder.DictionaryFormat.IPADIC, dir, dir, "utf-8", true);
String dictionaryPath = TokenInfoDictionary.class.getName().replace('.', '/');
// We must also load the other files (in BinaryDictionary) from the correct path
return new TokenInfoDictionary(

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ja.util;
package org.apache.lucene.analysis.ja.dict;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.junit.Test;

View File

@ -23,7 +23,6 @@ module org.apache.lucene.analysis.nori {
exports org.apache.lucene.analysis.ko;
exports org.apache.lucene.analysis.ko.dict;
exports org.apache.lucene.analysis.ko.tokenattributes;
exports org.apache.lucene.analysis.ko.util;
provides org.apache.lucene.analysis.TokenizerFactory with
org.apache.lucene.analysis.ko.KoreanTokenizerFactory;

View File

@ -26,6 +26,7 @@ import java.util.List;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
import org.apache.lucene.analysis.ko.dict.ConnectionCosts;
import org.apache.lucene.analysis.ko.dict.DictionaryBuilder;
import org.apache.lucene.analysis.ko.dict.KoMorphData;
import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.ko.dict.TokenInfoFST;
@ -197,7 +198,7 @@ public final class KoreanTokenizer extends Tokenizer {
/**
* Create a new KoreanTokenizer supplying a custom system dictionary and unknown dictionary. This
* constructor provides an entry point for users that want to construct custom language models
* that can be used as input to {@link org.apache.lucene.analysis.ko.util.DictionaryBuilder}.
* that can be used as input to {@link DictionaryBuilder}.
*
* @param factory the AttributeFactory to use
* @param systemDictionary a custom known token dictionary

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ko.util;
package org.apache.lucene.analysis.ko.dict;
import java.util.ArrayList;
import java.util.regex.Matcher;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ko.util;
package org.apache.lucene.analysis.ko.dict;
import java.io.IOException;
import java.io.LineNumberReader;
@ -22,7 +22,6 @@ import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.ko.dict.ConnectionCosts;
import org.apache.lucene.analysis.morph.ConnectionCostsWriter;
class ConnectionCostsBuilder {

View File

@ -14,12 +14,11 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ko.util;
package org.apache.lucene.analysis.ko.dict;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.analysis.ko.dict.DictionaryConstants;
/** Tool to build dictionaries. */
public class DictionaryBuilder {

View File

@ -17,7 +17,7 @@
package org.apache.lucene.analysis.ko.dict;
/** Dictionary constants */
public final class DictionaryConstants {
final class DictionaryConstants {
/** Codec header of the dictionary file. */
public static final String DICT_HEADER = "ko_dict";
/** Codec header of the dictionary mapping file. */

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ko.util;
package org.apache.lucene.analysis.ko.dict;
import java.io.BufferedReader;
import java.io.IOException;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ko.util;
package org.apache.lucene.analysis.ko.dict;
import java.io.IOException;
import java.io.OutputStream;
@ -23,8 +23,6 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.analysis.ko.POS;
import org.apache.lucene.analysis.ko.dict.KoMorphData;
import org.apache.lucene.analysis.ko.dict.TokenInfoMorphData;
import org.apache.lucene.analysis.morph.DictionaryEntryWriter;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.ArrayUtil;

View File

@ -14,14 +14,12 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ko.util;
package org.apache.lucene.analysis.ko.dict;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Objects;
import org.apache.lucene.analysis.ko.dict.DictionaryConstants;
import org.apache.lucene.analysis.ko.dict.TokenInfoDictionary;
import org.apache.lucene.analysis.morph.BinaryDictionaryWriter;
import org.apache.lucene.util.fst.FST;

View File

@ -27,7 +27,7 @@ import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.util.IOSupplier;
/** Morphological information for system dictionary. */
public class TokenInfoMorphData implements KoMorphData {
class TokenInfoMorphData implements KoMorphData {
private final ByteBuffer buffer;
private final POS.Tag[] posDict;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ko.util;
package org.apache.lucene.analysis.ko.dict;
import java.io.IOException;
import java.io.LineNumberReader;
@ -25,7 +25,6 @@ import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
class UnknownDictionaryBuilder {
private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,1801,3559,3677,SY,*,*,*,*,*,*,*";

View File

@ -14,13 +14,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ko.util;
package org.apache.lucene.analysis.ko.dict;
import java.io.IOException;
import java.nio.file.Path;
import org.apache.lucene.analysis.ko.dict.CharacterDefinition;
import org.apache.lucene.analysis.ko.dict.DictionaryConstants;
import org.apache.lucene.analysis.ko.dict.UnknownDictionary;
import org.apache.lucene.analysis.morph.BinaryDictionaryWriter;
import org.apache.lucene.analysis.morph.CharacterDefinitionWriter;

View File

@ -1,19 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** Nori utility classes. */
package org.apache.lucene.analysis.ko.util;

View File

@ -25,7 +25,6 @@ import java.io.BufferedWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.ko.util.DictionaryBuilder;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.junit.Before;

View File

@ -28,7 +28,6 @@ import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.lucene.analysis.ko.POS;
import org.apache.lucene.analysis.ko.util.DictionaryBuilder;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ko.util;
package org.apache.lucene.analysis.ko.dict;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.junit.Test;