LUCENE-8866: remove kuromoji/tools dependency on ICU

This commit is contained in:
Robert Muir 2019-06-20 21:20:17 -04:00
parent c8c2f2f25b
commit 91331d1a89
2 changed files with 7 additions and 22 deletions

View File

@@ -69,13 +69,8 @@
originalfile="${dict.src.dir}/Noun.proper.csv"/> originalfile="${dict.src.dir}/Noun.proper.csv"/>
</target> </target>
<path id="tools.dependencies">
<fileset dir="../icu/lib"/>
</path>
<path id="tools.classpath"> <path id="tools.classpath">
<path refid="classpath"/> <path refid="classpath"/>
<path refid="tools.dependencies"/>
<pathelement location="${build.dir}/classes/java"/> <pathelement location="${build.dir}/classes/java"/>
<pathelement location="${build.dir}/classes/tools"/> <pathelement location="${build.dir}/classes/tools"/>
</path> </path>
@@ -108,14 +103,7 @@
</sequential> </sequential>
</target> </target>
<!-- we don't actually need to compile this thing, we just want its lib --> <target name="compile-tools" depends="compile-core, common.compile-tools">
<target name="resolve-icu">
<ant dir="../icu/" target="resolve" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<target name="compile-tools" depends="resolve-icu, compile-core, common.compile-tools">
<compile <compile
srcdir="src/tools/java" srcdir="src/tools/java"
destdir="${build.dir}/classes/tools"> destdir="${build.dir}/classes/tools">

View File

@@ -26,6 +26,7 @@ import java.io.InputStreamReader;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction; import java.nio.charset.CodingErrorAction;
import java.text.Normalizer;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
@@ -38,8 +39,6 @@ import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.PositiveIntOutputs;
import com.ibm.icu.text.Normalizer2;
/** /**
*/ */
public class TokenInfoDictionaryBuilder { public class TokenInfoDictionaryBuilder {
@@ -49,16 +48,14 @@ public class TokenInfoDictionaryBuilder {
private String encoding = "euc-jp"; private String encoding = "euc-jp";
private boolean normalizeEntries = false; private Normalizer.Form normalForm;
private Normalizer2 normalizer;
private DictionaryFormat format = DictionaryFormat.IPADIC; private DictionaryFormat format = DictionaryFormat.IPADIC;
public TokenInfoDictionaryBuilder(DictionaryFormat format, String encoding, boolean normalizeEntries) { public TokenInfoDictionaryBuilder(DictionaryFormat format, String encoding, boolean normalizeEntries) {
this.format = format; this.format = format;
this.encoding = encoding; this.encoding = encoding;
this.normalizeEntries = normalizeEntries; this.normalForm = normalizeEntries ? Normalizer.Form.NFKC : null;
this.normalizer = normalizeEntries ? Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE) : null;
} }
public TokenInfoDictionaryWriter build(String dirname) throws IOException { public TokenInfoDictionaryWriter build(String dirname) throws IOException {
@@ -103,13 +100,13 @@ public class TokenInfoDictionaryBuilder {
lines.add(formatted); lines.add(formatted);
// NFKC normalize dictionary entry // NFKC normalize dictionary entry
if (normalizeEntries) { if (normalForm != null) {
if (normalizer.isNormalized(entry[0])){ if (Normalizer.isNormalized(entry[0], normalForm)){
continue; continue;
} }
String[] normalizedEntry = new String[entry.length]; String[] normalizedEntry = new String[entry.length];
for (int i = 0; i < entry.length; i++) { for (int i = 0; i < entry.length; i++) {
normalizedEntry[i] = normalizer.normalize(entry[i]); normalizedEntry[i] = Normalizer.normalize(entry[i], normalForm);
} }
formatted = formatEntry(normalizedEntry); formatted = formatEntry(normalizedEntry);