mirror of https://github.com/apache/lucene.git
LUCENE-9778: Hunspell: speed up input conversion (#2376)
This commit is contained in:
parent
2d53c6073b
commit
2ae45cc985
|
@ -0,0 +1,110 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.analysis.hunspell;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.TreeMap;
|
||||||
|
import org.apache.lucene.util.CharsRef;
|
||||||
|
import org.apache.lucene.util.FixedBitSet;
|
||||||
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
|
import org.apache.lucene.util.fst.CharSequenceOutputs;
|
||||||
|
import org.apache.lucene.util.fst.FST;
|
||||||
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
|
import org.apache.lucene.util.fst.Outputs;
|
||||||
|
import org.apache.lucene.util.fst.Util;
|
||||||
|
|
||||||
|
/** ICONV or OCONV replacement table */
|
||||||
|
class ConvTable {
|
||||||
|
private final FST<CharsRef> fst;
|
||||||
|
private final FixedBitSet firstCharHashes;
|
||||||
|
private final int mod;
|
||||||
|
|
||||||
|
ConvTable(TreeMap<String, String> mappings) {
|
||||||
|
mod = Math.max(256, Integer.highestOneBit(mappings.size()) << 1);
|
||||||
|
firstCharHashes = new FixedBitSet(mod);
|
||||||
|
|
||||||
|
try {
|
||||||
|
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
|
||||||
|
FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
|
||||||
|
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
||||||
|
for (Map.Entry<String, String> entry : mappings.entrySet()) {
|
||||||
|
String key = entry.getKey();
|
||||||
|
assert key.length() > 0;
|
||||||
|
firstCharHashes.set(key.charAt(0) % mod);
|
||||||
|
Util.toUTF16(key, scratchInts);
|
||||||
|
fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
|
||||||
|
}
|
||||||
|
|
||||||
|
fst = fstCompiler.compile();
|
||||||
|
} catch (IOException bogus) {
|
||||||
|
throw new RuntimeException(bogus);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void applyMappings(StringBuilder sb) {
|
||||||
|
FST.BytesReader bytesReader = null;
|
||||||
|
FST.Arc<CharsRef> firstArc = null;
|
||||||
|
FST.Arc<CharsRef> arc = null;
|
||||||
|
|
||||||
|
int longestMatch;
|
||||||
|
CharsRef longestOutput;
|
||||||
|
|
||||||
|
for (int i = 0; i < sb.length(); i++) {
|
||||||
|
if (!mightReplaceChar(sb.charAt(i))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (firstArc == null) {
|
||||||
|
firstArc = fst.getFirstArc(new FST.Arc<>());
|
||||||
|
bytesReader = fst.getBytesReader();
|
||||||
|
arc = new FST.Arc<>();
|
||||||
|
}
|
||||||
|
arc.copyFrom(firstArc);
|
||||||
|
CharsRef output = fst.outputs.getNoOutput();
|
||||||
|
longestMatch = -1;
|
||||||
|
longestOutput = null;
|
||||||
|
|
||||||
|
for (int j = i; j < sb.length(); j++) {
|
||||||
|
char ch = sb.charAt(j);
|
||||||
|
|
||||||
|
try {
|
||||||
|
if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
output = fst.outputs.add(output, arc.output());
|
||||||
|
} catch (IOException bogus) {
|
||||||
|
throw new RuntimeException(bogus);
|
||||||
|
}
|
||||||
|
if (arc.isFinal()) {
|
||||||
|
longestOutput = fst.outputs.add(output, arc.nextFinalOutput());
|
||||||
|
longestMatch = j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (longestMatch >= 0) {
|
||||||
|
sb.delete(i, longestMatch + 1);
|
||||||
|
sb.insert(i, longestOutput);
|
||||||
|
i += (longestOutput.length - 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean mightReplaceChar(char c) {
|
||||||
|
return firstCharHashes.get(c % mod);
|
||||||
|
}
|
||||||
|
}
|
|
@ -51,7 +51,6 @@ import org.apache.lucene.store.IOContext;
|
||||||
import org.apache.lucene.store.IndexOutput;
|
import org.apache.lucene.store.IndexOutput;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.CharsRef;
|
|
||||||
import org.apache.lucene.util.IOUtils;
|
import org.apache.lucene.util.IOUtils;
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
import org.apache.lucene.util.IntsRefBuilder;
|
||||||
|
@ -60,11 +59,9 @@ import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
|
||||||
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
|
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
|
||||||
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
|
||||||
import org.apache.lucene.util.automaton.RegExp;
|
import org.apache.lucene.util.automaton.RegExp;
|
||||||
import org.apache.lucene.util.fst.CharSequenceOutputs;
|
|
||||||
import org.apache.lucene.util.fst.FST;
|
import org.apache.lucene.util.fst.FST;
|
||||||
import org.apache.lucene.util.fst.FSTCompiler;
|
import org.apache.lucene.util.fst.FSTCompiler;
|
||||||
import org.apache.lucene.util.fst.IntSequenceOutputs;
|
import org.apache.lucene.util.fst.IntSequenceOutputs;
|
||||||
import org.apache.lucene.util.fst.Outputs;
|
|
||||||
import org.apache.lucene.util.fst.Util;
|
import org.apache.lucene.util.fst.Util;
|
||||||
|
|
||||||
/** In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary. */
|
/** In-memory structure for the dictionary (.dic) and affix (.aff) data of a hunspell dictionary. */
|
||||||
|
@ -172,13 +169,7 @@ public class Dictionary {
|
||||||
int maxNGramSuggestions = Integer.MAX_VALUE;
|
int maxNGramSuggestions = Integer.MAX_VALUE;
|
||||||
boolean onlyMaxDiff;
|
boolean onlyMaxDiff;
|
||||||
char noSuggest, subStandard;
|
char noSuggest, subStandard;
|
||||||
|
ConvTable iconv, oconv;
|
||||||
// FSTs used for ICONV/OCONV, output ord pointing to replacement text
|
|
||||||
FST<CharsRef> iconv;
|
|
||||||
FST<CharsRef> oconv;
|
|
||||||
|
|
||||||
boolean needsInputCleaning;
|
|
||||||
boolean needsOutputCleaning;
|
|
||||||
|
|
||||||
// true if we can strip suffixes "down to nothing"
|
// true if we can strip suffixes "down to nothing"
|
||||||
boolean fullStrip;
|
boolean fullStrip;
|
||||||
|
@ -224,8 +215,6 @@ public class Dictionary {
|
||||||
boolean ignoreCase)
|
boolean ignoreCase)
|
||||||
throws IOException, ParseException {
|
throws IOException, ParseException {
|
||||||
this.ignoreCase = ignoreCase;
|
this.ignoreCase = ignoreCase;
|
||||||
this.needsInputCleaning = ignoreCase;
|
|
||||||
this.needsOutputCleaning = false; // set if we have an OCONV
|
|
||||||
|
|
||||||
try (BufferedInputStream affixStream =
|
try (BufferedInputStream affixStream =
|
||||||
new BufferedInputStream(affix, MAX_PROLOGUE_SCAN_WINDOW) {
|
new BufferedInputStream(affix, MAX_PROLOGUE_SCAN_WINDOW) {
|
||||||
|
@ -379,16 +368,13 @@ public class Dictionary {
|
||||||
} else if ("IGNORE".equals(firstWord)) {
|
} else if ("IGNORE".equals(firstWord)) {
|
||||||
ignore = singleArgument(reader, line).toCharArray();
|
ignore = singleArgument(reader, line).toCharArray();
|
||||||
Arrays.sort(ignore);
|
Arrays.sort(ignore);
|
||||||
needsInputCleaning = true;
|
|
||||||
} else if ("ICONV".equals(firstWord) || "OCONV".equals(firstWord)) {
|
} else if ("ICONV".equals(firstWord) || "OCONV".equals(firstWord)) {
|
||||||
int num = parseNum(reader, line);
|
int num = parseNum(reader, line);
|
||||||
FST<CharsRef> res = parseConversions(reader, num);
|
ConvTable res = parseConversions(reader, num);
|
||||||
if (line.startsWith("I")) {
|
if (line.startsWith("I")) {
|
||||||
iconv = res;
|
iconv = res;
|
||||||
needsInputCleaning |= iconv != null;
|
|
||||||
} else {
|
} else {
|
||||||
oconv = res;
|
oconv = res;
|
||||||
needsOutputCleaning |= oconv != null;
|
|
||||||
}
|
}
|
||||||
} else if ("FULLSTRIP".equals(firstWord)) {
|
} else if ("FULLSTRIP".equals(firstWord)) {
|
||||||
fullStrip = true;
|
fullStrip = true;
|
||||||
|
@ -803,9 +789,8 @@ public class Dictionary {
|
||||||
affixData[dataStart + AFFIX_CONDITION] = (char) patternOrd;
|
affixData[dataStart + AFFIX_CONDITION] = (char) patternOrd;
|
||||||
affixData[dataStart + AFFIX_APPEND] = (char) appendFlagsOrd;
|
affixData[dataStart + AFFIX_APPEND] = (char) appendFlagsOrd;
|
||||||
|
|
||||||
if (needsInputCleaning) {
|
if (needsInputCleaning(affixArg)) {
|
||||||
CharSequence cleaned = cleanInput(affixArg, sb);
|
affixArg = cleanInput(affixArg, sb).toString();
|
||||||
affixArg = cleaned.toString();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isSuffix) {
|
if (isSuffix) {
|
||||||
|
@ -840,9 +825,9 @@ public class Dictionary {
|
||||||
return affixData(affix, AFFIX_CONDITION) >>> 1;
|
return affixData(affix, AFFIX_CONDITION) >>> 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
private FST<CharsRef> parseConversions(LineNumberReader reader, int num)
|
private ConvTable parseConversions(LineNumberReader reader, int num)
|
||||||
throws IOException, ParseException {
|
throws IOException, ParseException {
|
||||||
Map<String, String> mappings = new TreeMap<>();
|
TreeMap<String, String> mappings = new TreeMap<>();
|
||||||
|
|
||||||
for (int i = 0; i < num; i++) {
|
for (int i = 0; i < num; i++) {
|
||||||
String[] parts = splitBySpace(reader, reader.readLine(), 3);
|
String[] parts = splitBySpace(reader, reader.readLine(), 3);
|
||||||
|
@ -851,15 +836,7 @@ public class Dictionary {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
|
return new ConvTable(mappings);
|
||||||
FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
|
|
||||||
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
|
||||||
for (Map.Entry<String, String> entry : mappings.entrySet()) {
|
|
||||||
Util.toUTF16(entry.getKey(), scratchInts);
|
|
||||||
fstCompiler.add(scratchInts.get(), new CharsRef(entry.getValue()));
|
|
||||||
}
|
|
||||||
|
|
||||||
return fstCompiler.compile();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final byte[] BOM_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};
|
private static final byte[] BOM_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};
|
||||||
|
@ -1085,8 +1062,9 @@ public class Dictionary {
|
||||||
int sep = flagSep < 0 ? morphSep : flagSep;
|
int sep = flagSep < 0 ? morphSep : flagSep;
|
||||||
|
|
||||||
CharSequence toWrite;
|
CharSequence toWrite;
|
||||||
if (needsInputCleaning) {
|
String beforeSep = line.substring(0, sep);
|
||||||
cleanInput(line, sep, reuse);
|
if (needsInputCleaning(beforeSep)) {
|
||||||
|
cleanInput(beforeSep, reuse);
|
||||||
reuse.append(line, sep, line.length());
|
reuse.append(line, sep, line.length());
|
||||||
toWrite = reuse;
|
toWrite = reuse;
|
||||||
} else {
|
} else {
|
||||||
|
@ -1571,14 +1549,28 @@ public class Dictionary {
|
||||||
return flagLookup.hasFlag(entryId, flag);
|
return flagLookup.hasFlag(entryId, flag);
|
||||||
}
|
}
|
||||||
|
|
||||||
CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
|
boolean mayNeedInputCleaning() {
|
||||||
return cleanInput(input, input.length(), reuse);
|
return ignoreCase || ignore != null || iconv != null;
|
||||||
}
|
}
|
||||||
|
|
||||||
private CharSequence cleanInput(CharSequence input, int prefixLength, StringBuilder reuse) {
|
boolean needsInputCleaning(CharSequence input) {
|
||||||
|
if (mayNeedInputCleaning()) {
|
||||||
|
for (int i = 0; i < input.length(); i++) {
|
||||||
|
char ch = input.charAt(i);
|
||||||
|
if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0
|
||||||
|
|| ignoreCase && caseFold(ch) != ch
|
||||||
|
|| iconv != null && iconv.mightReplaceChar(ch)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
CharSequence cleanInput(CharSequence input, StringBuilder reuse) {
|
||||||
reuse.setLength(0);
|
reuse.setLength(0);
|
||||||
|
|
||||||
for (int i = 0; i < prefixLength; i++) {
|
for (int i = 0; i < input.length(); i++) {
|
||||||
char ch = input.charAt(i);
|
char ch = input.charAt(i);
|
||||||
|
|
||||||
if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) {
|
if (ignore != null && Arrays.binarySearch(ignore, ch) >= 0) {
|
||||||
|
@ -1594,11 +1586,7 @@ public class Dictionary {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (iconv != null) {
|
if (iconv != null) {
|
||||||
try {
|
iconv.applyMappings(reuse);
|
||||||
applyMappings(iconv, reuse);
|
|
||||||
} catch (IOException bogus) {
|
|
||||||
throw new RuntimeException(bogus);
|
|
||||||
}
|
|
||||||
if (ignoreCase) {
|
if (ignoreCase) {
|
||||||
for (int i = 0; i < reuse.length(); i++) {
|
for (int i = 0; i < reuse.length(); i++) {
|
||||||
reuse.setCharAt(i, caseFold(reuse.charAt(i)));
|
reuse.setCharAt(i, caseFold(reuse.charAt(i)));
|
||||||
|
@ -1624,44 +1612,6 @@ public class Dictionary {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: this could be more efficient!
|
|
||||||
static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
|
|
||||||
final FST.BytesReader bytesReader = fst.getBytesReader();
|
|
||||||
final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<>());
|
|
||||||
final CharsRef NO_OUTPUT = fst.outputs.getNoOutput();
|
|
||||||
|
|
||||||
// temporary stuff
|
|
||||||
final FST.Arc<CharsRef> arc = new FST.Arc<>();
|
|
||||||
int longestMatch;
|
|
||||||
CharsRef longestOutput;
|
|
||||||
|
|
||||||
for (int i = 0; i < sb.length(); i++) {
|
|
||||||
arc.copyFrom(firstArc);
|
|
||||||
CharsRef output = NO_OUTPUT;
|
|
||||||
longestMatch = -1;
|
|
||||||
longestOutput = null;
|
|
||||||
|
|
||||||
for (int j = i; j < sb.length(); j++) {
|
|
||||||
char ch = sb.charAt(j);
|
|
||||||
if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
output = fst.outputs.add(output, arc.output());
|
|
||||||
}
|
|
||||||
if (arc.isFinal()) {
|
|
||||||
longestOutput = fst.outputs.add(output, arc.nextFinalOutput());
|
|
||||||
longestMatch = j;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (longestMatch >= 0) {
|
|
||||||
sb.delete(i, longestMatch + 1);
|
|
||||||
sb.insert(i, longestOutput);
|
|
||||||
i += (longestOutput.length - 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Returns true if this dictionary was constructed with the {@code ignoreCase} option */
|
/** Returns true if this dictionary was constructed with the {@code ignoreCase} option */
|
||||||
public boolean getIgnoreCase() {
|
public boolean getIgnoreCase() {
|
||||||
return ignoreCase;
|
return ignoreCase;
|
||||||
|
|
|
@ -22,7 +22,6 @@ import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_END;
|
||||||
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
|
import static org.apache.lucene.analysis.hunspell.WordContext.COMPOUND_MIDDLE;
|
||||||
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
|
import static org.apache.lucene.analysis.hunspell.WordContext.SIMPLE_WORD;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.LinkedHashSet;
|
import java.util.LinkedHashSet;
|
||||||
|
@ -72,7 +71,7 @@ public class Hunspell {
|
||||||
checkCanceled.run();
|
checkCanceled.run();
|
||||||
if (word.isEmpty()) return true;
|
if (word.isEmpty()) return true;
|
||||||
|
|
||||||
if (dictionary.needsInputCleaning) {
|
if (dictionary.needsInputCleaning(word)) {
|
||||||
word = dictionary.cleanInput(word, new StringBuilder()).toString();
|
word = dictionary.cleanInput(word, new StringBuilder()).toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -479,7 +478,7 @@ public class Hunspell {
|
||||||
checkCanceled.run();
|
checkCanceled.run();
|
||||||
if (word.length() >= 100) return Collections.emptyList();
|
if (word.length() >= 100) return Collections.emptyList();
|
||||||
|
|
||||||
if (dictionary.needsInputCleaning) {
|
if (dictionary.needsInputCleaning(word)) {
|
||||||
word = dictionary.cleanInput(word, new StringBuilder()).toString();
|
word = dictionary.cleanInput(word, new StringBuilder()).toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -565,14 +564,10 @@ public class Hunspell {
|
||||||
}
|
}
|
||||||
|
|
||||||
private String cleanOutput(String s) {
|
private String cleanOutput(String s) {
|
||||||
if (!dictionary.needsOutputCleaning) return s;
|
if (dictionary.oconv == null) return s;
|
||||||
|
|
||||||
try {
|
StringBuilder sb = new StringBuilder(s);
|
||||||
StringBuilder sb = new StringBuilder(s);
|
dictionary.oconv.applyMappings(sb);
|
||||||
Dictionary.applyMappings(dictionary.oconv, sb);
|
return sb.toString();
|
||||||
return sb.toString();
|
|
||||||
} catch (IOException bogus) {
|
|
||||||
throw new RuntimeException(bogus);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -83,14 +83,16 @@ final class Stemmer {
|
||||||
*/
|
*/
|
||||||
public List<CharsRef> stem(char[] word, int length) {
|
public List<CharsRef> stem(char[] word, int length) {
|
||||||
|
|
||||||
if (dictionary.needsInputCleaning) {
|
if (dictionary.mayNeedInputCleaning()) {
|
||||||
scratchSegment.setLength(0);
|
scratchSegment.setLength(0);
|
||||||
scratchSegment.append(word, 0, length);
|
scratchSegment.append(word, 0, length);
|
||||||
CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment);
|
if (dictionary.needsInputCleaning(scratchSegment)) {
|
||||||
scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length());
|
CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment);
|
||||||
length = segment.length();
|
scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length());
|
||||||
segment.getChars(0, length, scratchBuffer, 0);
|
length = segment.length();
|
||||||
word = scratchBuffer;
|
segment.getChars(0, length, scratchBuffer, 0);
|
||||||
|
word = scratchBuffer;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
List<CharsRef> list = new ArrayList<>();
|
List<CharsRef> list = new ArrayList<>();
|
||||||
|
@ -365,18 +367,14 @@ final class Stemmer {
|
||||||
private CharsRef newStem(CharsRef stem, int morphDataId) {
|
private CharsRef newStem(CharsRef stem, int morphDataId) {
|
||||||
String exception = stemException(morphDataId);
|
String exception = stemException(morphDataId);
|
||||||
|
|
||||||
if (dictionary.needsOutputCleaning) {
|
if (dictionary.oconv != null) {
|
||||||
scratchSegment.setLength(0);
|
scratchSegment.setLength(0);
|
||||||
if (exception != null) {
|
if (exception != null) {
|
||||||
scratchSegment.append(exception);
|
scratchSegment.append(exception);
|
||||||
} else {
|
} else {
|
||||||
scratchSegment.append(stem.chars, stem.offset, stem.length);
|
scratchSegment.append(stem.chars, stem.offset, stem.length);
|
||||||
}
|
}
|
||||||
try {
|
dictionary.oconv.applyMappings(scratchSegment);
|
||||||
Dictionary.applyMappings(dictionary.oconv, scratchSegment);
|
|
||||||
} catch (IOException bogus) {
|
|
||||||
throw new RuntimeException(bogus);
|
|
||||||
}
|
|
||||||
char[] cleaned = new char[scratchSegment.length()];
|
char[] cleaned = new char[scratchSegment.length()];
|
||||||
scratchSegment.getChars(0, cleaned.length, cleaned, 0);
|
scratchSegment.getChars(0, cleaned.length, cleaned, 0);
|
||||||
return new CharsRef(cleaned, 0, cleaned.length);
|
return new CharsRef(cleaned, 0, cleaned.length);
|
||||||
|
|
|
@ -24,19 +24,13 @@ import java.nio.charset.StandardCharsets;
|
||||||
import java.text.ParseException;
|
import java.text.ParseException;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
import java.util.TreeMap;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.IntStream;
|
import java.util.stream.IntStream;
|
||||||
import org.apache.lucene.store.ByteBuffersDirectory;
|
import org.apache.lucene.store.ByteBuffersDirectory;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.util.CharsRef;
|
|
||||||
import org.apache.lucene.util.IntsRef;
|
import org.apache.lucene.util.IntsRef;
|
||||||
import org.apache.lucene.util.IntsRefBuilder;
|
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
import org.apache.lucene.util.fst.CharSequenceOutputs;
|
|
||||||
import org.apache.lucene.util.fst.FST;
|
|
||||||
import org.apache.lucene.util.fst.FSTCompiler;
|
|
||||||
import org.apache.lucene.util.fst.Outputs;
|
|
||||||
import org.apache.lucene.util.fst.Util;
|
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
public class TestDictionary extends LuceneTestCase {
|
public class TestDictionary extends LuceneTestCase {
|
||||||
|
@ -166,51 +160,36 @@ public class TestDictionary extends LuceneTestCase {
|
||||||
assertTrue(dictStream.isClosed());
|
assertTrue(dictStream.isClosed());
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testReplacements() throws Exception {
|
public void testReplacements() {
|
||||||
Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
|
TreeMap<String, String> map = new TreeMap<>();
|
||||||
FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
|
map.put("a", "b");
|
||||||
IntsRefBuilder scratchInts = new IntsRefBuilder();
|
map.put("ab", "c");
|
||||||
|
map.put("c", "de");
|
||||||
// a -> b
|
map.put("def", "gh");
|
||||||
Util.toUTF16("a", scratchInts);
|
ConvTable table = new ConvTable(map);
|
||||||
fstCompiler.add(scratchInts.get(), new CharsRef("b"));
|
|
||||||
|
|
||||||
// ab -> c
|
|
||||||
Util.toUTF16("ab", scratchInts);
|
|
||||||
fstCompiler.add(scratchInts.get(), new CharsRef("c"));
|
|
||||||
|
|
||||||
// c -> de
|
|
||||||
Util.toUTF16("c", scratchInts);
|
|
||||||
fstCompiler.add(scratchInts.get(), new CharsRef("de"));
|
|
||||||
|
|
||||||
// def -> gh
|
|
||||||
Util.toUTF16("def", scratchInts);
|
|
||||||
fstCompiler.add(scratchInts.get(), new CharsRef("gh"));
|
|
||||||
|
|
||||||
FST<CharsRef> fst = fstCompiler.compile();
|
|
||||||
|
|
||||||
StringBuilder sb = new StringBuilder("atestanother");
|
StringBuilder sb = new StringBuilder("atestanother");
|
||||||
Dictionary.applyMappings(fst, sb);
|
table.applyMappings(sb);
|
||||||
assertEquals("btestbnother", sb.toString());
|
assertEquals("btestbnother", sb.toString());
|
||||||
|
|
||||||
sb = new StringBuilder("abtestanother");
|
sb = new StringBuilder("abtestanother");
|
||||||
Dictionary.applyMappings(fst, sb);
|
table.applyMappings(sb);
|
||||||
assertEquals("ctestbnother", sb.toString());
|
assertEquals("ctestbnother", sb.toString());
|
||||||
|
|
||||||
sb = new StringBuilder("atestabnother");
|
sb = new StringBuilder("atestabnother");
|
||||||
Dictionary.applyMappings(fst, sb);
|
table.applyMappings(sb);
|
||||||
assertEquals("btestcnother", sb.toString());
|
assertEquals("btestcnother", sb.toString());
|
||||||
|
|
||||||
sb = new StringBuilder("abtestabnother");
|
sb = new StringBuilder("abtestabnother");
|
||||||
Dictionary.applyMappings(fst, sb);
|
table.applyMappings(sb);
|
||||||
assertEquals("ctestcnother", sb.toString());
|
assertEquals("ctestcnother", sb.toString());
|
||||||
|
|
||||||
sb = new StringBuilder("abtestabcnother");
|
sb = new StringBuilder("abtestabcnother");
|
||||||
Dictionary.applyMappings(fst, sb);
|
table.applyMappings(sb);
|
||||||
assertEquals("ctestcdenother", sb.toString());
|
assertEquals("ctestcdenother", sb.toString());
|
||||||
|
|
||||||
sb = new StringBuilder("defdefdefc");
|
sb = new StringBuilder("defdefdefc");
|
||||||
Dictionary.applyMappings(fst, sb);
|
table.applyMappings(sb);
|
||||||
assertEquals("ghghghde", sb.toString());
|
assertEquals("ghghghde", sb.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue