mirror of https://github.com/apache/lucene.git
LUCENE-9664: Hunspell support: fix most IntelliJ warnings, cleanup (#2202)
This commit is contained in:
parent
90131a605a
commit
82f6f161ae
|
@ -140,8 +140,6 @@ public class Dictionary {
|
|||
// when set, some words have exceptional stems, and the last entry is a pointer to stemExceptions
|
||||
boolean hasStemExceptions;
|
||||
|
||||
private final Path tempPath = getDefaultTempDir(); // TODO: make this configurable?
|
||||
|
||||
boolean ignoreCase;
|
||||
boolean complexPrefixes;
|
||||
// if no affixes have continuation classes, no need to do 2-level affix stripping
|
||||
|
@ -210,6 +208,7 @@ public class Dictionary {
|
|||
this.needsOutputCleaning = false; // set if we have an OCONV
|
||||
flagLookup.add(new BytesRef()); // no flags -> ord 0
|
||||
|
||||
Path tempPath = getDefaultTempDir(); // TODO: make this configurable?
|
||||
Path aff = Files.createTempFile(tempPath, "affix", "aff");
|
||||
OutputStream out = new BufferedOutputStream(Files.newOutputStream(aff));
|
||||
InputStream aff1 = null;
|
||||
|
@ -252,33 +251,33 @@ public class Dictionary {
|
|||
}
|
||||
|
||||
/** Looks up Hunspell word forms from the dictionary */
|
||||
IntsRef lookupWord(char word[], int offset, int length) {
|
||||
IntsRef lookupWord(char[] word, int offset, int length) {
|
||||
return lookup(words, word, offset, length);
|
||||
}
|
||||
|
||||
// only for testing
|
||||
IntsRef lookupPrefix(char word[], int offset, int length) {
|
||||
return lookup(prefixes, word, offset, length);
|
||||
IntsRef lookupPrefix(char[] word) {
|
||||
return lookup(prefixes, word, 0, word.length);
|
||||
}
|
||||
|
||||
// only for testing
|
||||
IntsRef lookupSuffix(char word[], int offset, int length) {
|
||||
return lookup(suffixes, word, offset, length);
|
||||
IntsRef lookupSuffix(char[] word) {
|
||||
return lookup(suffixes, word, 0, word.length);
|
||||
}
|
||||
|
||||
IntsRef lookup(FST<IntsRef> fst, char word[], int offset, int length) {
|
||||
IntsRef lookup(FST<IntsRef> fst, char[] word, int offset, int length) {
|
||||
if (fst == null) {
|
||||
return null;
|
||||
}
|
||||
final FST.BytesReader bytesReader = fst.getBytesReader();
|
||||
final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<IntsRef>());
|
||||
final FST.Arc<IntsRef> arc = fst.getFirstArc(new FST.Arc<>());
|
||||
// Accumulate output as we go
|
||||
final IntsRef NO_OUTPUT = fst.outputs.getNoOutput();
|
||||
IntsRef output = NO_OUTPUT;
|
||||
|
||||
int l = offset + length;
|
||||
try {
|
||||
for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) {
|
||||
for (int i = offset, cp; i < l; i += Character.charCount(cp)) {
|
||||
cp = Character.codePointAt(word, i, l);
|
||||
if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) {
|
||||
return null;
|
||||
|
@ -320,7 +319,7 @@ public class Dictionary {
|
|||
seenStrips.put("", 0);
|
||||
|
||||
LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
|
||||
String line = null;
|
||||
String line;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
// ignore any BOM marker on first line
|
||||
if (reader.getLineNumber() == 1 && line.startsWith("\uFEFF")) {
|
||||
|
@ -344,31 +343,31 @@ public class Dictionary {
|
|||
complexPrefixes =
|
||||
true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
|
||||
} else if (line.startsWith(CIRCUMFIX_KEY)) {
|
||||
String parts[] = line.split("\\s+");
|
||||
String[] parts = line.split("\\s+");
|
||||
if (parts.length != 2) {
|
||||
throw new ParseException("Illegal CIRCUMFIX declaration", reader.getLineNumber());
|
||||
}
|
||||
circumfix = flagParsingStrategy.parseFlag(parts[1]);
|
||||
} else if (line.startsWith(KEEPCASE_KEY)) {
|
||||
String parts[] = line.split("\\s+");
|
||||
String[] parts = line.split("\\s+");
|
||||
if (parts.length != 2) {
|
||||
throw new ParseException("Illegal KEEPCASE declaration", reader.getLineNumber());
|
||||
}
|
||||
keepcase = flagParsingStrategy.parseFlag(parts[1]);
|
||||
} else if (line.startsWith(NEEDAFFIX_KEY) || line.startsWith(PSEUDOROOT_KEY)) {
|
||||
String parts[] = line.split("\\s+");
|
||||
String[] parts = line.split("\\s+");
|
||||
if (parts.length != 2) {
|
||||
throw new ParseException("Illegal NEEDAFFIX declaration", reader.getLineNumber());
|
||||
}
|
||||
needaffix = flagParsingStrategy.parseFlag(parts[1]);
|
||||
} else if (line.startsWith(ONLYINCOMPOUND_KEY)) {
|
||||
String parts[] = line.split("\\s+");
|
||||
String[] parts = line.split("\\s+");
|
||||
if (parts.length != 2) {
|
||||
throw new ParseException("Illegal ONLYINCOMPOUND declaration", reader.getLineNumber());
|
||||
}
|
||||
onlyincompound = flagParsingStrategy.parseFlag(parts[1]);
|
||||
} else if (line.startsWith(IGNORE_KEY)) {
|
||||
String parts[] = line.split("\\s+");
|
||||
String[] parts = line.split("\\s+");
|
||||
if (parts.length != 2) {
|
||||
throw new ParseException("Illegal IGNORE declaration", reader.getLineNumber());
|
||||
}
|
||||
|
@ -376,7 +375,7 @@ public class Dictionary {
|
|||
Arrays.sort(ignore);
|
||||
needsInputCleaning = true;
|
||||
} else if (line.startsWith(ICONV_KEY) || line.startsWith(OCONV_KEY)) {
|
||||
String parts[] = line.split("\\s+");
|
||||
String[] parts = line.split("\\s+");
|
||||
String type = parts[0];
|
||||
if (parts.length != 2) {
|
||||
throw new ParseException("Illegal " + type + " declaration", reader.getLineNumber());
|
||||
|
@ -475,10 +474,10 @@ public class Dictionary {
|
|||
|
||||
BytesRefBuilder scratch = new BytesRefBuilder();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
String args[] = header.split("\\s+");
|
||||
String[] args = header.split("\\s+");
|
||||
|
||||
boolean crossProduct = args[2].equals("Y");
|
||||
boolean isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN;
|
||||
boolean isSuffix = conditionPattern.equals(SUFFIX_CONDITION_REGEX_PATTERN);
|
||||
|
||||
int numLines = Integer.parseInt(args[3]);
|
||||
affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3));
|
||||
|
@ -488,7 +487,7 @@ public class Dictionary {
|
|||
for (int i = 0; i < numLines; i++) {
|
||||
assert affixWriter.getPosition() == currentAffix << 3;
|
||||
String line = reader.readLine();
|
||||
String ruleArgs[] = line.split("\\s+");
|
||||
String[] ruleArgs = line.split("\\s+");
|
||||
|
||||
// from the manpage: PFX flag stripping prefix [condition [morphological_fields...]]
|
||||
// condition is optional
|
||||
|
@ -501,7 +500,7 @@ public class Dictionary {
|
|||
char flag = flagParsingStrategy.parseFlag(ruleArgs[1]);
|
||||
String strip = ruleArgs[2].equals("0") ? "" : ruleArgs[2];
|
||||
String affixArg = ruleArgs[3];
|
||||
char appendFlags[] = null;
|
||||
char[] appendFlags = null;
|
||||
|
||||
// first: parse continuation classes out of affix
|
||||
int flagSep = affixArg.lastIndexOf('/');
|
||||
|
@ -585,7 +584,7 @@ public class Dictionary {
|
|||
affixWriter.writeShort((short) flag);
|
||||
affixWriter.writeShort((short) stripOrd.intValue());
|
||||
// encode crossProduct into patternIndex
|
||||
int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 1 : 0);
|
||||
int patternOrd = patternIndex << 1 | (crossProduct ? 1 : 0);
|
||||
affixWriter.writeShort((short) patternOrd);
|
||||
affixWriter.writeShort((short) appendFlagsOrd);
|
||||
|
||||
|
@ -598,12 +597,7 @@ public class Dictionary {
|
|||
affixArg = new StringBuilder(affixArg).reverse().toString();
|
||||
}
|
||||
|
||||
List<Integer> list = affixes.get(affixArg);
|
||||
if (list == null) {
|
||||
list = new ArrayList<>();
|
||||
affixes.put(affixArg, list);
|
||||
}
|
||||
list.add(currentAffix);
|
||||
affixes.computeIfAbsent(affixArg, __ -> new ArrayList<>()).add(currentAffix);
|
||||
currentAffix++;
|
||||
}
|
||||
}
|
||||
|
@ -614,7 +608,7 @@ public class Dictionary {
|
|||
|
||||
for (int i = 0; i < num; i++) {
|
||||
String line = reader.readLine();
|
||||
String parts[] = line.split("\\s+");
|
||||
String[] parts = line.split("\\s+");
|
||||
if (parts.length != 3) {
|
||||
throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
|
||||
}
|
||||
|
@ -707,7 +701,7 @@ public class Dictionary {
|
|||
* definition
|
||||
*/
|
||||
static FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
|
||||
String parts[] = flagLine.split("\\s+");
|
||||
String[] parts = flagLine.split("\\s+");
|
||||
if (parts.length != 2) {
|
||||
throw new IllegalArgumentException("Illegal FLAG specification: " + flagLine);
|
||||
}
|
||||
|
@ -724,11 +718,11 @@ public class Dictionary {
|
|||
throw new IllegalArgumentException("Unknown flag type: " + flagType);
|
||||
}
|
||||
|
||||
final char FLAG_SEPARATOR = 0x1f; // flag separator after escaping
|
||||
final char MORPH_SEPARATOR =
|
||||
private static final char FLAG_SEPARATOR = 0x1f; // flag separator after escaping
|
||||
private static final char MORPH_SEPARATOR =
|
||||
0x1e; // separator for boundary of entry (may be followed by morph data)
|
||||
|
||||
String unescapeEntry(String entry) {
|
||||
private String unescapeEntry(String entry) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
int end = morphBoundary(entry);
|
||||
for (int i = 0; i < end; i++) {
|
||||
|
@ -738,9 +732,7 @@ public class Dictionary {
|
|||
i++;
|
||||
} else if (ch == '/') {
|
||||
sb.append(FLAG_SEPARATOR);
|
||||
} else if (ch == MORPH_SEPARATOR || ch == FLAG_SEPARATOR) {
|
||||
// BINARY EXECUTABLES EMBEDDED IN ZULU DICTIONARIES!!!!!!!
|
||||
} else {
|
||||
} else if (!shouldSkipEscapedChar(ch)) {
|
||||
sb.append(ch);
|
||||
}
|
||||
}
|
||||
|
@ -748,9 +740,7 @@ public class Dictionary {
|
|||
if (end < entry.length()) {
|
||||
for (int i = end; i < entry.length(); i++) {
|
||||
char c = entry.charAt(i);
|
||||
if (c == FLAG_SEPARATOR || c == MORPH_SEPARATOR) {
|
||||
// BINARY EXECUTABLES EMBEDDED IN ZULU DICTIONARIES!!!!!!!
|
||||
} else {
|
||||
if (!shouldSkipEscapedChar(c)) {
|
||||
sb.append(c);
|
||||
}
|
||||
}
|
||||
|
@ -758,6 +748,11 @@ public class Dictionary {
|
|||
return sb.toString();
|
||||
}
|
||||
|
||||
private static boolean shouldSkipEscapedChar(char ch) {
|
||||
return ch == FLAG_SEPARATOR
|
||||
|| ch == MORPH_SEPARATOR; // BINARY EXECUTABLES EMBEDDED IN ZULU DICTIONARIES!!!!!!!
|
||||
}
|
||||
|
||||
static int morphBoundary(String line) {
|
||||
int end = indexOfSpaceOrTab(line, 0);
|
||||
if (end == -1) {
|
||||
|
@ -812,9 +807,9 @@ public class Dictionary {
|
|||
try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
|
||||
for (InputStream dictionary : dictionaries) {
|
||||
BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
|
||||
String line =
|
||||
lines.readLine(); // first line is number of entries (approximately, sometimes)
|
||||
|
||||
String line;
|
||||
while ((line = lines.readLine()) != null) {
|
||||
// wild and unpredictable code comment rules
|
||||
if (line.isEmpty()
|
||||
|
@ -825,7 +820,7 @@ public class Dictionary {
|
|||
}
|
||||
line = unescapeEntry(line);
|
||||
// if we haven't seen any stem exceptions, try to parse one
|
||||
if (hasStemExceptions == false) {
|
||||
if (!hasStemExceptions) {
|
||||
int morphStart = line.indexOf(MORPH_SEPARATOR);
|
||||
if (morphStart >= 0 && morphStart < line.length()) {
|
||||
hasStemExceptions = parseStemException(line.substring(morphStart + 1)) != null;
|
||||
|
@ -861,35 +856,28 @@ public class Dictionary {
|
|||
new OfflineSorter(
|
||||
tempDir,
|
||||
tempFileNamePrefix,
|
||||
new Comparator<BytesRef>() {
|
||||
BytesRef scratch1 = new BytesRef();
|
||||
BytesRef scratch2 = new BytesRef();
|
||||
new Comparator<>() {
|
||||
final BytesRef scratch1 = new BytesRef();
|
||||
final BytesRef scratch2 = new BytesRef();
|
||||
|
||||
private void initScratch(BytesRef o, BytesRef scratch) {
|
||||
scratch.bytes = o.bytes;
|
||||
scratch.offset = o.offset;
|
||||
scratch.length = o.length;
|
||||
|
||||
for (int i = scratch.length - 1; i >= 0; i--) {
|
||||
if (scratch.bytes[scratch.offset + i] == FLAG_SEPARATOR
|
||||
|| scratch.bytes[scratch.offset + i] == MORPH_SEPARATOR) {
|
||||
scratch.length = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compare(BytesRef o1, BytesRef o2) {
|
||||
scratch1.bytes = o1.bytes;
|
||||
scratch1.offset = o1.offset;
|
||||
scratch1.length = o1.length;
|
||||
|
||||
for (int i = scratch1.length - 1; i >= 0; i--) {
|
||||
if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR
|
||||
|| scratch1.bytes[scratch1.offset + i] == MORPH_SEPARATOR) {
|
||||
scratch1.length = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
scratch2.bytes = o2.bytes;
|
||||
scratch2.offset = o2.offset;
|
||||
scratch2.length = o2.length;
|
||||
|
||||
for (int i = scratch2.length - 1; i >= 0; i--) {
|
||||
if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR
|
||||
|| scratch2.bytes[scratch2.offset + i] == MORPH_SEPARATOR) {
|
||||
scratch2.length = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
initScratch(o1, scratch1);
|
||||
initScratch(o2, scratch2);
|
||||
|
||||
int cmp = scratch1.compareTo(scratch2);
|
||||
if (cmp == 0) {
|
||||
|
@ -933,7 +921,7 @@ public class Dictionary {
|
|||
|
||||
String line = scratch.utf8ToString();
|
||||
String entry;
|
||||
char wordForm[];
|
||||
char[] wordForm;
|
||||
int end;
|
||||
|
||||
int flagSep = line.indexOf(FLAG_SEPARATOR);
|
||||
|
@ -980,7 +968,7 @@ public class Dictionary {
|
|||
words.add(scratchInts.get(), currentOrds.get());
|
||||
}
|
||||
// swap current
|
||||
if (cmp > 0 || currentEntry == null) {
|
||||
if (cmp > 0) {
|
||||
currentEntry = entry;
|
||||
currentOrds = new IntsRefBuilder(); // must be this way
|
||||
}
|
||||
|
@ -994,6 +982,7 @@ public class Dictionary {
|
|||
}
|
||||
|
||||
// finalize last entry
|
||||
assert currentEntry != null;
|
||||
Util.toUTF32(currentEntry, scratchInts);
|
||||
words.add(scratchInts.get(), currentOrds.get());
|
||||
success2 = true;
|
||||
|
@ -1011,7 +1000,7 @@ public class Dictionary {
|
|||
return CharsRef.EMPTY_CHARS;
|
||||
}
|
||||
int len = b.length >>> 1;
|
||||
char flags[] = new char[len];
|
||||
char[] flags = new char[len];
|
||||
int upto = 0;
|
||||
int end = b.offset + b.length;
|
||||
for (int i = b.offset; i < end; i += 2) {
|
||||
|
@ -1020,19 +1009,18 @@ public class Dictionary {
|
|||
return flags;
|
||||
}
|
||||
|
||||
static void encodeFlags(BytesRefBuilder b, char flags[]) {
|
||||
private static void encodeFlags(BytesRefBuilder b, char[] flags) {
|
||||
int len = flags.length << 1;
|
||||
b.grow(len);
|
||||
b.clear();
|
||||
for (int i = 0; i < flags.length; i++) {
|
||||
int flag = flags[i];
|
||||
for (int flag : flags) {
|
||||
b.append((byte) ((flag >> 8) & 0xff));
|
||||
b.append((byte) (flag & 0xff));
|
||||
}
|
||||
}
|
||||
|
||||
private void parseAlias(String line) {
|
||||
String ruleArgs[] = line.split("\\s+");
|
||||
String[] ruleArgs = line.split("\\s+");
|
||||
if (aliases == null) {
|
||||
// first line should be the aliases count
|
||||
final int count = Integer.parseInt(ruleArgs[1]);
|
||||
|
@ -1102,7 +1090,7 @@ public class Dictionary {
|
|||
* @return Parsed flag
|
||||
*/
|
||||
char parseFlag(String rawFlag) {
|
||||
char flags[] = parseFlags(rawFlag);
|
||||
char[] flags = parseFlags(rawFlag);
|
||||
if (flags.length != 1) {
|
||||
throw new IllegalArgumentException("expected only one flag, got: " + rawFlag);
|
||||
}
|
||||
|
@ -1140,9 +1128,9 @@ public class Dictionary {
|
|||
char[] flags = new char[rawFlagParts.length];
|
||||
int upto = 0;
|
||||
|
||||
for (int i = 0; i < rawFlagParts.length; i++) {
|
||||
for (String rawFlagPart : rawFlagParts) {
|
||||
// note, removing the trailing X/leading I for nepali... what is the rule here?!
|
||||
String replacement = rawFlagParts[i].replaceAll("[^0-9]", "");
|
||||
String replacement = rawFlagPart.replaceAll("[^0-9]", "");
|
||||
// note, ignoring empty flags (this happens in danish, for example)
|
||||
if (replacement.isEmpty()) {
|
||||
continue;
|
||||
|
@ -1185,13 +1173,13 @@ public class Dictionary {
|
|||
builder.append(combined);
|
||||
}
|
||||
|
||||
char flags[] = new char[builder.length()];
|
||||
char[] flags = new char[builder.length()];
|
||||
builder.getChars(0, builder.length(), flags, 0);
|
||||
return flags;
|
||||
}
|
||||
}
|
||||
|
||||
static boolean hasFlag(char flags[], char flag) {
|
||||
static boolean hasFlag(char[] flags, char flag) {
|
||||
return Arrays.binarySearch(flags, flag) >= 0;
|
||||
}
|
||||
|
||||
|
@ -1247,7 +1235,7 @@ public class Dictionary {
|
|||
// TODO: this could be more efficient!
|
||||
static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
|
||||
final FST.BytesReader bytesReader = fst.getBytesReader();
|
||||
final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<CharsRef>());
|
||||
final FST.Arc<CharsRef> firstArc = fst.getFirstArc(new FST.Arc<>());
|
||||
final CharsRef NO_OUTPUT = fst.outputs.getNoOutput();
|
||||
|
||||
// temporary stuff
|
||||
|
@ -1290,6 +1278,7 @@ public class Dictionary {
|
|||
private static Path DEFAULT_TEMP_DIR;
|
||||
|
||||
/** Used by test framework */
|
||||
@SuppressWarnings("unused")
|
||||
public static void setDefaultTempDir(Path tempDir) {
|
||||
DEFAULT_TEMP_DIR = tempDir;
|
||||
}
|
||||
|
@ -1306,7 +1295,7 @@ public class Dictionary {
|
|||
throw new IOException("Java has no temporary folder property (java.io.tmpdir)?");
|
||||
}
|
||||
Path tempDirectory = Paths.get(tempDirPath);
|
||||
if (Files.isWritable(tempDirectory) == false) {
|
||||
if (!Files.isWritable(tempDirectory)) {
|
||||
throw new IOException(
|
||||
"Java's temporary folder not present or writeable?: " + tempDirectory.toAbsolutePath());
|
||||
}
|
||||
|
|
|
@ -43,7 +43,7 @@ final class Stemmer {
|
|||
|
||||
// used for normalization
|
||||
private final StringBuilder scratchSegment = new StringBuilder();
|
||||
private char scratchBuffer[] = new char[32];
|
||||
private char[] scratchBuffer = new char[32];
|
||||
|
||||
// it's '1' if we have no stem exceptions, otherwise every other form
|
||||
// is really an ID pointing to the exception table
|
||||
|
@ -86,7 +86,7 @@ final class Stemmer {
|
|||
* @param word Word to find the stems for
|
||||
* @return List of stems for the word
|
||||
*/
|
||||
public List<CharsRef> stem(char word[], int length) {
|
||||
public List<CharsRef> stem(char[] word, int length) {
|
||||
|
||||
if (dictionary.needsInputCleaning) {
|
||||
scratchSegment.setLength(0);
|
||||
|
@ -128,7 +128,7 @@ final class Stemmer {
|
|||
private static final int UPPER_CASE = 2;
|
||||
|
||||
/** returns EXACT_CASE, TITLE_CASE, or UPPER_CASE type for the word */
|
||||
private int caseOf(char word[], int length) {
|
||||
private int caseOf(char[] word, int length) {
|
||||
if (dictionary.ignoreCase || length == 0 || !Character.isUpperCase(word[0])) {
|
||||
return EXACT_CASE;
|
||||
}
|
||||
|
@ -152,7 +152,7 @@ final class Stemmer {
|
|||
}
|
||||
|
||||
/** folds titlecase variant of word to titleBuffer */
|
||||
private void caseFoldTitle(char word[], int length) {
|
||||
private void caseFoldTitle(char[] word, int length) {
|
||||
titleBuffer = ArrayUtil.grow(titleBuffer, length);
|
||||
System.arraycopy(word, 0, titleBuffer, 0, length);
|
||||
for (int i = 1; i < length; i++) {
|
||||
|
@ -161,13 +161,13 @@ final class Stemmer {
|
|||
}
|
||||
|
||||
/** folds lowercase variant of word (title cased) to lowerBuffer */
|
||||
private void caseFoldLower(char word[], int length) {
|
||||
private void caseFoldLower(char[] word, int length) {
|
||||
lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
|
||||
System.arraycopy(word, 0, lowerBuffer, 0, length);
|
||||
lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
|
||||
}
|
||||
|
||||
private List<CharsRef> doStem(char word[], int length, boolean caseVariant) {
|
||||
private List<CharsRef> doStem(char[] word, int length, boolean caseVariant) {
|
||||
List<CharsRef> stems = new ArrayList<>();
|
||||
IntsRef forms = dictionary.lookupWord(word, 0, length);
|
||||
if (forms != null) {
|
||||
|
@ -177,7 +177,7 @@ final class Stemmer {
|
|||
boolean checkOnlyInCompound = dictionary.onlyincompound != -1;
|
||||
if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) {
|
||||
dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch);
|
||||
char wordFlags[] = Dictionary.decodeFlags(scratch);
|
||||
char[] wordFlags = Dictionary.decodeFlags(scratch);
|
||||
// we are looking for a case variant, but this word does not allow it
|
||||
if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char) dictionary.keepcase)) {
|
||||
continue;
|
||||
|
@ -196,7 +196,6 @@ final class Stemmer {
|
|||
}
|
||||
}
|
||||
try {
|
||||
boolean v =
|
||||
stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false, caseVariant));
|
||||
} catch (IOException bogus) {
|
||||
throw new RuntimeException(bogus);
|
||||
|
@ -210,7 +209,7 @@ final class Stemmer {
|
|||
* @param word Word to find the stems for
|
||||
* @return List of stems for the word
|
||||
*/
|
||||
public List<CharsRef> uniqueStems(char word[], int length) {
|
||||
public List<CharsRef> uniqueStems(char[] word, int length) {
|
||||
List<CharsRef> stems = stem(word, length);
|
||||
if (stems.size() < 2) {
|
||||
return stems;
|
||||
|
@ -226,7 +225,7 @@ final class Stemmer {
|
|||
return deduped;
|
||||
}
|
||||
|
||||
private CharsRef newStem(char buffer[], int length, IntsRef forms, int formID) {
|
||||
private CharsRef newStem(char[] buffer, int length, IntsRef forms, int formID) {
|
||||
final String exception;
|
||||
if (dictionary.hasStemExceptions) {
|
||||
int exceptionID = forms.ints[forms.offset + formID + 1];
|
||||
|
@ -251,7 +250,7 @@ final class Stemmer {
|
|||
} catch (IOException bogus) {
|
||||
throw new RuntimeException(bogus);
|
||||
}
|
||||
char cleaned[] = new char[scratchSegment.length()];
|
||||
char[] cleaned = new char[scratchSegment.length()];
|
||||
scratchSegment.getChars(0, cleaned.length, cleaned, 0);
|
||||
return new CharsRef(cleaned, 0, cleaned.length);
|
||||
} else {
|
||||
|
@ -264,15 +263,15 @@ final class Stemmer {
|
|||
}
|
||||
|
||||
// some state for traversing FSTs
|
||||
final FST.BytesReader prefixReaders[] = new FST.BytesReader[3];
|
||||
private final FST.BytesReader[] prefixReaders = new FST.BytesReader[3];
|
||||
|
||||
@SuppressWarnings({"unchecked", "rawtypes"})
|
||||
final FST.Arc<IntsRef> prefixArcs[] = new FST.Arc[3];
|
||||
private final FST.Arc<IntsRef>[] prefixArcs = new FST.Arc[3];
|
||||
|
||||
final FST.BytesReader suffixReaders[] = new FST.BytesReader[3];
|
||||
private final FST.BytesReader[] suffixReaders = new FST.BytesReader[3];
|
||||
|
||||
@SuppressWarnings({"unchecked", "rawtypes"})
|
||||
final FST.Arc<IntsRef> suffixArcs[] = new FST.Arc[3];
|
||||
private final FST.Arc<IntsRef>[] suffixArcs = new FST.Arc[3];
|
||||
|
||||
/**
|
||||
* Generates a list of stems for the provided word
|
||||
|
@ -296,7 +295,7 @@ final class Stemmer {
|
|||
* @return List of stems, or empty list if no stems are found
|
||||
*/
|
||||
private List<CharsRef> stem(
|
||||
char word[],
|
||||
char[] word,
|
||||
int length,
|
||||
int previous,
|
||||
int prevFlag,
|
||||
|
@ -330,12 +329,10 @@ final class Stemmer {
|
|||
output = fst.outputs.add(output, arc.output());
|
||||
}
|
||||
}
|
||||
IntsRef prefixes = null;
|
||||
if (!arc.isFinal()) {
|
||||
continue;
|
||||
} else {
|
||||
prefixes = fst.outputs.add(output, arc.nextFinalOutput());
|
||||
}
|
||||
IntsRef prefixes = fst.outputs.add(output, arc.nextFinalOutput());
|
||||
|
||||
for (int j = 0; j < prefixes.length; j++) {
|
||||
int prefix = prefixes.ints[prefixes.offset + j];
|
||||
|
@ -357,13 +354,13 @@ final class Stemmer {
|
|||
} else {
|
||||
// check if affix is allowed in a non-compound word
|
||||
dictionary.flagLookup.get(append, scratch);
|
||||
char appendFlags[] = Dictionary.decodeFlags(scratch);
|
||||
char[] appendFlags = Dictionary.decodeFlags(scratch);
|
||||
compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
|
||||
}
|
||||
} else if (crossProduct) {
|
||||
// cross check incoming continuation class (flag of previous affix) against list.
|
||||
dictionary.flagLookup.get(append, scratch);
|
||||
char appendFlags[] = Dictionary.decodeFlags(scratch);
|
||||
char[] appendFlags = Dictionary.decodeFlags(scratch);
|
||||
assert prevFlag >= 0;
|
||||
boolean allowed =
|
||||
dictionary.onlyincompound == -1
|
||||
|
@ -374,8 +371,7 @@ final class Stemmer {
|
|||
}
|
||||
|
||||
if (compatible) {
|
||||
int deAffixedStart = i;
|
||||
int deAffixedLength = length - deAffixedStart;
|
||||
int deAffixedLength = length - i;
|
||||
|
||||
int stripStart = dictionary.stripOffsets[stripOrd];
|
||||
int stripEnd = dictionary.stripOffsets[stripOrd + 1];
|
||||
|
@ -387,14 +383,14 @@ final class Stemmer {
|
|||
stripStart,
|
||||
stripLength,
|
||||
word,
|
||||
deAffixedStart,
|
||||
i,
|
||||
deAffixedLength)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
char strippedWord[] = new char[stripLength + deAffixedLength];
|
||||
char[] strippedWord = new char[stripLength + deAffixedLength];
|
||||
System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
|
||||
System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);
|
||||
System.arraycopy(word, i, strippedWord, stripLength, deAffixedLength);
|
||||
|
||||
List<CharsRef> stemList =
|
||||
applyAffix(
|
||||
|
@ -431,12 +427,10 @@ final class Stemmer {
|
|||
output = fst.outputs.add(output, arc.output());
|
||||
}
|
||||
}
|
||||
IntsRef suffixes = null;
|
||||
if (!arc.isFinal()) {
|
||||
continue;
|
||||
} else {
|
||||
suffixes = fst.outputs.add(output, arc.nextFinalOutput());
|
||||
}
|
||||
IntsRef suffixes = fst.outputs.add(output, arc.nextFinalOutput());
|
||||
|
||||
for (int j = 0; j < suffixes.length; j++) {
|
||||
int suffix = suffixes.ints[suffixes.offset + j];
|
||||
|
@ -458,13 +452,13 @@ final class Stemmer {
|
|||
} else {
|
||||
// check if affix is allowed in a non-compound word
|
||||
dictionary.flagLookup.get(append, scratch);
|
||||
char appendFlags[] = Dictionary.decodeFlags(scratch);
|
||||
char[] appendFlags = Dictionary.decodeFlags(scratch);
|
||||
compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
|
||||
}
|
||||
} else if (crossProduct) {
|
||||
// cross check incoming continuation class (flag of previous affix) against list.
|
||||
dictionary.flagLookup.get(append, scratch);
|
||||
char appendFlags[] = Dictionary.decodeFlags(scratch);
|
||||
char[] appendFlags = Dictionary.decodeFlags(scratch);
|
||||
assert prevFlag >= 0;
|
||||
boolean allowed =
|
||||
dictionary.onlyincompound == -1
|
||||
|
@ -494,7 +488,7 @@ final class Stemmer {
|
|||
continue;
|
||||
}
|
||||
|
||||
char strippedWord[] = new char[stripLength + deAffixedLength];
|
||||
char[] strippedWord = new char[stripLength + deAffixedLength];
|
||||
System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
|
||||
System.arraycopy(
|
||||
dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
|
||||
|
@ -524,7 +518,7 @@ final class Stemmer {
|
|||
// just check the stem
|
||||
// but this is a little bit more complicated.
|
||||
private boolean checkCondition(
|
||||
int condition, char c1[], int c1off, int c1len, char c2[], int c2off, int c2len) {
|
||||
int condition, char[] c1, int c1off, int c1len, char[] c2, int c2off, int c2len) {
|
||||
if (condition != 0) {
|
||||
CharacterRunAutomaton pattern = dictionary.patterns.get(condition);
|
||||
int state = 0;
|
||||
|
@ -559,7 +553,7 @@ final class Stemmer {
|
|||
* @return List of stems for the word, or an empty list if none are found
|
||||
*/
|
||||
List<CharsRef> applyAffix(
|
||||
char strippedWord[],
|
||||
char[] strippedWord,
|
||||
int length,
|
||||
int affix,
|
||||
int prefixFlag,
|
||||
|
@ -572,9 +566,7 @@ final class Stemmer {
|
|||
affixReader.setPosition(8 * affix);
|
||||
char flag = (char) (affixReader.readShort() & 0xffff);
|
||||
affixReader.skipBytes(2); // strip
|
||||
int condition = (char) (affixReader.readShort() & 0xffff);
|
||||
boolean crossProduct = (condition & 1) == 1;
|
||||
condition >>>= 1;
|
||||
boolean crossProduct = ((int) (char) (affixReader.readShort() & 0xffff) & 1) == 1;
|
||||
char append = (char) (affixReader.readShort() & 0xffff);
|
||||
|
||||
List<CharsRef> stems = new ArrayList<>();
|
||||
|
@ -583,18 +575,18 @@ final class Stemmer {
|
|||
if (forms != null) {
|
||||
for (int i = 0; i < forms.length; i += formStep) {
|
||||
dictionary.flagLookup.get(forms.ints[forms.offset + i], scratch);
|
||||
char wordFlags[] = Dictionary.decodeFlags(scratch);
|
||||
char[] wordFlags = Dictionary.decodeFlags(scratch);
|
||||
if (Dictionary.hasFlag(wordFlags, flag)) {
|
||||
// confusing: in this one exception, we already chained the first prefix against the
|
||||
// second,
|
||||
// so it doesn't need to be checked against the word
|
||||
boolean chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
|
||||
if (chainedPrefix == false
|
||||
if (!chainedPrefix
|
||||
&& prefixFlag >= 0
|
||||
&& !Dictionary.hasFlag(wordFlags, (char) prefixFlag)) {
|
||||
// see if we can chain prefix through the suffix continuation class (only if it has any!)
|
||||
dictionary.flagLookup.get(append, scratch);
|
||||
char appendFlags[] = Dictionary.decodeFlags(scratch);
|
||||
char[] appendFlags = Dictionary.decodeFlags(scratch);
|
||||
if (!hasCrossCheckedFlag((char) prefixFlag, appendFlags, false)) {
|
||||
continue;
|
||||
}
|
||||
|
@ -604,7 +596,7 @@ final class Stemmer {
|
|||
// to ensure it has it, and vice versa
|
||||
if (dictionary.circumfix != -1) {
|
||||
dictionary.flagLookup.get(append, scratch);
|
||||
char appendFlags[] = Dictionary.decodeFlags(scratch);
|
||||
char[] appendFlags = Dictionary.decodeFlags(scratch);
|
||||
boolean suffixCircumfix = Dictionary.hasFlag(appendFlags, (char) dictionary.circumfix);
|
||||
if (circumfix != suffixCircumfix) {
|
||||
continue;
|
||||
|
@ -631,7 +623,7 @@ final class Stemmer {
|
|||
// have that flag
|
||||
if (dictionary.circumfix != -1 && !circumfix && prefix) {
|
||||
dictionary.flagLookup.get(append, scratch);
|
||||
char appendFlags[] = Dictionary.decodeFlags(scratch);
|
||||
char[] appendFlags = Dictionary.decodeFlags(scratch);
|
||||
circumfix = Dictionary.hasFlag(appendFlags, (char) dictionary.circumfix);
|
||||
}
|
||||
|
||||
|
@ -654,7 +646,7 @@ final class Stemmer {
|
|||
true,
|
||||
circumfix,
|
||||
caseVariant));
|
||||
} else if (dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
|
||||
} else if (!dictionary.complexPrefixes && dictionary.twoStageAffix) {
|
||||
// we took away a suffix.
|
||||
// COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
|
||||
// COMPLEXPREFIXES = false: combine with another suffix
|
||||
|
@ -688,9 +680,7 @@ final class Stemmer {
|
|||
true,
|
||||
circumfix,
|
||||
caseVariant));
|
||||
} else if (prefix == false
|
||||
&& dictionary.complexPrefixes == false
|
||||
&& dictionary.twoStageAffix) {
|
||||
} else if (!prefix && !dictionary.complexPrefixes && dictionary.twoStageAffix) {
|
||||
// we took away a prefix, then a suffix: go look for another suffix
|
||||
stems.addAll(
|
||||
stem(
|
||||
|
|
|
@ -42,18 +42,19 @@ public class TestDictionary extends LuceneTestCase {
|
|||
Directory tempDir = getDirectory();
|
||||
|
||||
Dictionary dictionary = new Dictionary(tempDir, "dictionary", affixStream, dictStream);
|
||||
assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}, 0, 1).length);
|
||||
assertEquals(1, dictionary.lookupPrefix(new char[] {'s'}, 0, 1).length);
|
||||
assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}).length);
|
||||
assertEquals(1, dictionary.lookupPrefix(new char[] {'s'}).length);
|
||||
IntsRef ordList = dictionary.lookupWord(new char[] {'o', 'l', 'r'}, 0, 3);
|
||||
assertNotNull(ordList);
|
||||
assertEquals(1, ordList.length);
|
||||
|
||||
BytesRef ref = new BytesRef();
|
||||
dictionary.flagLookup.get(ordList.ints[0], ref);
|
||||
char flags[] = Dictionary.decodeFlags(ref);
|
||||
char[] flags = Dictionary.decodeFlags(ref);
|
||||
assertEquals(1, flags.length);
|
||||
|
||||
ordList = dictionary.lookupWord(new char[] {'l', 'u', 'c', 'e', 'n'}, 0, 5);
|
||||
int offset = random().nextInt(10);
|
||||
ordList = dictionary.lookupWord((" ".repeat(offset) + "lucen").toCharArray(), offset, 5);
|
||||
assertNotNull(ordList);
|
||||
assertEquals(1, ordList.length);
|
||||
dictionary.flagLookup.get(ordList.ints[0], ref);
|
||||
|
@ -71,12 +72,12 @@ public class TestDictionary extends LuceneTestCase {
|
|||
|
||||
Directory tempDir = getDirectory();
|
||||
Dictionary dictionary = new Dictionary(tempDir, "dictionary", affixStream, dictStream);
|
||||
assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}, 0, 1).length);
|
||||
assertEquals(1, dictionary.lookupPrefix(new char[] {'s'}, 0, 1).length);
|
||||
assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}).length);
|
||||
assertEquals(1, dictionary.lookupPrefix(new char[] {'s'}).length);
|
||||
IntsRef ordList = dictionary.lookupWord(new char[] {'o', 'l', 'r'}, 0, 3);
|
||||
BytesRef ref = new BytesRef();
|
||||
dictionary.flagLookup.get(ordList.ints[0], ref);
|
||||
char flags[] = Dictionary.decodeFlags(ref);
|
||||
char[] flags = Dictionary.decodeFlags(ref);
|
||||
assertEquals(1, flags.length);
|
||||
|
||||
affixStream.close();
|
||||
|
@ -90,12 +91,12 @@ public class TestDictionary extends LuceneTestCase {
|
|||
Directory tempDir = getDirectory();
|
||||
|
||||
Dictionary dictionary = new Dictionary(tempDir, "dictionary", affixStream, dictStream);
|
||||
assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}, 0, 1).length);
|
||||
assertEquals(1, dictionary.lookupPrefix(new char[] {'s'}, 0, 1).length);
|
||||
assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}).length);
|
||||
assertEquals(1, dictionary.lookupPrefix(new char[] {'s'}).length);
|
||||
IntsRef ordList = dictionary.lookupWord(new char[] {'o', 'l', 'r'}, 0, 3);
|
||||
BytesRef ref = new BytesRef();
|
||||
dictionary.flagLookup.get(ordList.ints[0], ref);
|
||||
char flags[] = Dictionary.decodeFlags(ref);
|
||||
char[] flags = Dictionary.decodeFlags(ref);
|
||||
assertEquals(1, flags.length);
|
||||
|
||||
affixStream.close();
|
||||
|
@ -109,12 +110,12 @@ public class TestDictionary extends LuceneTestCase {
|
|||
Directory tempDir = getDirectory();
|
||||
|
||||
Dictionary dictionary = new Dictionary(tempDir, "dictionary", affixStream, dictStream);
|
||||
assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}, 0, 1).length);
|
||||
assertEquals(1, dictionary.lookupPrefix(new char[] {'s'}, 0, 1).length);
|
||||
assertEquals(3, dictionary.lookupSuffix(new char[] {'e'}).length);
|
||||
assertEquals(1, dictionary.lookupPrefix(new char[] {'s'}).length);
|
||||
IntsRef ordList = dictionary.lookupWord(new char[] {'o', 'l', 'r'}, 0, 3);
|
||||
BytesRef ref = new BytesRef();
|
||||
dictionary.flagLookup.get(ordList.ints[0], ref);
|
||||
char flags[] = Dictionary.decodeFlags(ref);
|
||||
char[] flags = Dictionary.decodeFlags(ref);
|
||||
assertEquals(1, flags.length);
|
||||
|
||||
affixStream.close();
|
||||
|
@ -131,9 +132,7 @@ public class TestDictionary extends LuceneTestCase {
|
|||
ParseException expected =
|
||||
expectThrows(
|
||||
ParseException.class,
|
||||
() -> {
|
||||
new Dictionary(tempDir, "dictionary", affixStream, dictStream);
|
||||
});
|
||||
() -> new Dictionary(tempDir, "dictionary", affixStream, dictStream));
|
||||
assertTrue(
|
||||
expected
|
||||
.getMessage()
|
||||
|
@ -153,10 +152,7 @@ public class TestDictionary extends LuceneTestCase {
|
|||
|
||||
Exception expected =
|
||||
expectThrows(
|
||||
Exception.class,
|
||||
() -> {
|
||||
new Dictionary(tempDir, "dictionary", affixStream, dictStream);
|
||||
});
|
||||
Exception.class, () -> new Dictionary(tempDir, "dictionary", affixStream, dictStream));
|
||||
assertTrue(expected.getMessage().startsWith("expected only one flag"));
|
||||
|
||||
affixStream.close();
|
||||
|
@ -272,7 +268,7 @@ public class TestDictionary extends LuceneTestCase {
|
|||
Dictionary.getDictionaryEncoding(new ByteArrayInputStream(new byte[0])));
|
||||
}
|
||||
|
||||
public void testFlagWithCrazyWhitespace() throws Exception {
|
||||
public void testFlagWithCrazyWhitespace() {
|
||||
assertNotNull(Dictionary.getFlagParsingStrategy("FLAG\tUTF-8"));
|
||||
assertNotNull(Dictionary.getFlagParsingStrategy("FLAG UTF-8"));
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue