mirror of https://github.com/apache/lucene.git
LUCENE-8866: remove kuromoji/tools dependency on ICU
This commit is contained in:
parent
c8c2f2f25b
commit
91331d1a89
|
@ -69,13 +69,8 @@
|
||||||
originalfile="${dict.src.dir}/Noun.proper.csv"/>
|
originalfile="${dict.src.dir}/Noun.proper.csv"/>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<path id="tools.dependencies">
|
|
||||||
<fileset dir="../icu/lib"/>
|
|
||||||
</path>
|
|
||||||
|
|
||||||
<path id="tools.classpath">
|
<path id="tools.classpath">
|
||||||
<path refid="classpath"/>
|
<path refid="classpath"/>
|
||||||
<path refid="tools.dependencies"/>
|
|
||||||
<pathelement location="${build.dir}/classes/java"/>
|
<pathelement location="${build.dir}/classes/java"/>
|
||||||
<pathelement location="${build.dir}/classes/tools"/>
|
<pathelement location="${build.dir}/classes/tools"/>
|
||||||
</path>
|
</path>
|
||||||
|
@ -108,14 +103,7 @@
|
||||||
</sequential>
|
</sequential>
|
||||||
</target>
|
</target>
|
||||||
|
|
||||||
<!-- we don't actually need to compile this thing, we just want its lib -->
|
<target name="compile-tools" depends="compile-core, common.compile-tools">
|
||||||
<target name="resolve-icu">
|
|
||||||
<ant dir="../icu/" target="resolve" inheritAll="false">
|
|
||||||
<propertyset refid="uptodate.and.compiled.properties"/>
|
|
||||||
</ant>
|
|
||||||
</target>
|
|
||||||
|
|
||||||
<target name="compile-tools" depends="resolve-icu, compile-core, common.compile-tools">
|
|
||||||
<compile
|
<compile
|
||||||
srcdir="src/tools/java"
|
srcdir="src/tools/java"
|
||||||
destdir="${build.dir}/classes/tools">
|
destdir="${build.dir}/classes/tools">
|
||||||
|
|
|
@ -26,6 +26,7 @@ import java.io.InputStreamReader;
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.nio.charset.CharsetDecoder;
|
import java.nio.charset.CharsetDecoder;
|
||||||
import java.nio.charset.CodingErrorAction;
|
import java.nio.charset.CodingErrorAction;
|
||||||
|
import java.text.Normalizer;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
@ -38,8 +39,6 @@ import org.apache.lucene.util.fst.Builder;
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
import org.apache.lucene.util.fst.PositiveIntOutputs;
|
||||||
|
|
||||||
import com.ibm.icu.text.Normalizer2;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*/
|
*/
|
||||||
public class TokenInfoDictionaryBuilder {
|
public class TokenInfoDictionaryBuilder {
|
||||||
|
@ -49,16 +48,14 @@ public class TokenInfoDictionaryBuilder {
|
||||||
|
|
||||||
private String encoding = "euc-jp";
|
private String encoding = "euc-jp";
|
||||||
|
|
||||||
private boolean normalizeEntries = false;
|
private Normalizer.Form normalForm;
|
||||||
private Normalizer2 normalizer;
|
|
||||||
|
|
||||||
private DictionaryFormat format = DictionaryFormat.IPADIC;
|
private DictionaryFormat format = DictionaryFormat.IPADIC;
|
||||||
|
|
||||||
public TokenInfoDictionaryBuilder(DictionaryFormat format, String encoding, boolean normalizeEntries) {
|
public TokenInfoDictionaryBuilder(DictionaryFormat format, String encoding, boolean normalizeEntries) {
|
||||||
this.format = format;
|
this.format = format;
|
||||||
this.encoding = encoding;
|
this.encoding = encoding;
|
||||||
this.normalizeEntries = normalizeEntries;
|
this.normalForm = normalizeEntries ? Normalizer.Form.NFKC : null;
|
||||||
this.normalizer = normalizeEntries ? Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE) : null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public TokenInfoDictionaryWriter build(String dirname) throws IOException {
|
public TokenInfoDictionaryWriter build(String dirname) throws IOException {
|
||||||
|
@ -103,13 +100,13 @@ public class TokenInfoDictionaryBuilder {
|
||||||
lines.add(formatted);
|
lines.add(formatted);
|
||||||
|
|
||||||
// NFKC normalize dictionary entry
|
// NFKC normalize dictionary entry
|
||||||
if (normalizeEntries) {
|
if (normalForm != null) {
|
||||||
if (normalizer.isNormalized(entry[0])){
|
if (Normalizer.isNormalized(entry[0], normalForm)){
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
String[] normalizedEntry = new String[entry.length];
|
String[] normalizedEntry = new String[entry.length];
|
||||||
for (int i = 0; i < entry.length; i++) {
|
for (int i = 0; i < entry.length; i++) {
|
||||||
normalizedEntry[i] = normalizer.normalize(entry[i]);
|
normalizedEntry[i] = Normalizer.normalize(entry[i], normalForm);
|
||||||
}
|
}
|
||||||
|
|
||||||
formatted = formatEntry(normalizedEntry);
|
formatted = formatEntry(normalizedEntry);
|
||||||
|
|
Loading…
Reference in New Issue