mirror of https://github.com/apache/lucene.git
LUCENE-9733: Hunspell: exception when loading dictionaries with mixed-case words and aliased flags (#2305)
This commit is contained in:
parent
b5c1ed7129
commit
16764f1601
|
@ -951,7 +951,7 @@ public class Dictionary {
|
|||
reuse.append(caseFold(word.charAt(i)));
|
||||
}
|
||||
reuse.append(FLAG_SEPARATOR);
|
||||
flagParsingStrategy.appendFlag(HIDDEN_FLAG, reuse);
|
||||
reuse.append(HIDDEN_FLAG);
|
||||
reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length());
|
||||
writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8));
|
||||
}
|
||||
|
@ -1039,12 +1039,17 @@ public class Dictionary {
|
|||
entry = line.substring(0, end);
|
||||
} else {
|
||||
end = line.indexOf(MORPH_SEPARATOR);
|
||||
String flagPart = line.substring(flagSep + 1, end);
|
||||
if (aliasCount > 0) {
|
||||
boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG;
|
||||
String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end);
|
||||
if (aliasCount > 0 && !flagPart.isEmpty()) {
|
||||
flagPart = getAliasValue(Integer.parseInt(flagPart));
|
||||
}
|
||||
|
||||
wordForm = flagParsingStrategy.parseFlags(flagPart);
|
||||
if (hidden) {
|
||||
wordForm = ArrayUtil.growExact(wordForm, wordForm.length + 1);
|
||||
wordForm[wordForm.length - 1] = HIDDEN_FLAG;
|
||||
}
|
||||
Arrays.sort(wordForm);
|
||||
entry = line.substring(0, flagSep);
|
||||
}
|
||||
|
@ -1275,8 +1280,6 @@ public class Dictionary {
|
|||
* @return Parsed flags
|
||||
*/
|
||||
abstract char[] parseFlags(String rawFlags);
|
||||
|
||||
/**
 * Appends a serialized form of a single flag to the given builder.
 *
 * <p>Each strategy writes the flag in its own textual format; the text produced by successive
 * calls is meant to be readable again by the same strategy's {@code parseFlags}.
 *
 * @param flag the flag to serialize
 * @param to the builder that receives the serialized flag
 */
abstract void appendFlag(char flag, StringBuilder to);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1288,11 +1291,6 @@ public class Dictionary {
|
|||
public char[] parseFlags(String rawFlags) {
|
||||
return rawFlags.toCharArray();
|
||||
}
|
||||
|
||||
@Override
|
||||
void appendFlag(char flag, StringBuilder to) {
|
||||
to.append(flag);
|
||||
}
|
||||
}
|
||||
|
||||
/** Used to read flags as UTF-8 even if the rest of the file is in the default (8-bit) encoding */
|
||||
|
@ -1301,11 +1299,6 @@ public class Dictionary {
|
|||
public char[] parseFlags(String rawFlags) {
|
||||
return new String(rawFlags.getBytes(DEFAULT_CHARSET), StandardCharsets.UTF_8).toCharArray();
|
||||
}
|
||||
|
||||
@Override
|
||||
void appendFlag(char flag, StringBuilder to) {
|
||||
to.append(new String(String.valueOf(flag).getBytes(StandardCharsets.UTF_8), DEFAULT_CHARSET));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1339,12 +1332,6 @@ public class Dictionary {
|
|||
}
|
||||
return flags;
|
||||
}
|
||||
|
||||
@Override
|
||||
void appendFlag(char flag, StringBuilder to) {
|
||||
to.append((int) flag);
|
||||
to.append(",");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1379,12 +1366,6 @@ public class Dictionary {
|
|||
builder.getChars(0, builder.length(), flags, 0);
|
||||
return flags;
|
||||
}
|
||||
|
||||
@Override
|
||||
void appendFlag(char flag, StringBuilder to) {
|
||||
to.append((char) (flag >> 8));
|
||||
to.append((char) (flag & 0xff));
|
||||
}
|
||||
}
|
||||
|
||||
boolean hasFlag(int entryId, char flag, BytesRef scratch) {
|
||||
|
|
|
@ -0,0 +1,40 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
public class TestCompressed extends StemmerTestBase {
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
init("compressed.aff", "compressed.dic");
|
||||
}
|
||||
|
||||
public void test() {
|
||||
assertStemsTo("apach", "apach");
|
||||
assertStemsTo("apache", "apach");
|
||||
assertStemsTo("apachee");
|
||||
|
||||
assertStemsTo("XYZ", "XYZ", "Xyz");
|
||||
assertStemsTo("XYZs", "XYZ");
|
||||
assertStemsTo("XYZS", "Xyz");
|
||||
assertStemsTo("xyz");
|
||||
|
||||
assertStemsTo("mixedCase", "mixedCase");
|
||||
assertStemsTo("MIXEDCASE", "Mixedcase");
|
||||
}
|
||||
}
|
|
@ -20,10 +20,8 @@ import java.io.ByteArrayInputStream;
|
|||
import java.io.FilterInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.ParseException;
|
||||
import java.util.Random;
|
||||
import org.apache.lucene.store.ByteBuffersDirectory;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
@ -293,31 +291,6 @@ public class TestDictionary extends LuceneTestCase {
|
|||
assertEquals(src, new String(strategy.parseFlags(asAscii)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFlagSerialization() {
|
||||
Random r = random();
|
||||
char[] flags = new char[r.nextInt(10)];
|
||||
for (int i = 0; i < flags.length; i++) {
|
||||
flags[i] = (char) r.nextInt(Character.MIN_SURROGATE);
|
||||
}
|
||||
|
||||
String[] flagLines = {"FLAG long", "FLAG UTF-8", "FLAG num"};
|
||||
Charset[] charsets = {StandardCharsets.UTF_8, Dictionary.DEFAULT_CHARSET};
|
||||
for (String flagLine : flagLines) {
|
||||
for (Charset charset : charsets) {
|
||||
Dictionary.FlagParsingStrategy strategy =
|
||||
Dictionary.getFlagParsingStrategy(flagLine, charset);
|
||||
StringBuilder serialized = new StringBuilder();
|
||||
for (char flag : flags) {
|
||||
strategy.appendFlag(flag, serialized);
|
||||
}
|
||||
|
||||
char[] deserialized = strategy.parseFlags(serialized.toString());
|
||||
assertEquals(new String(flags), new String(deserialized));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Directory getDirectory() {
|
||||
return newDirectory();
|
||||
}
|
||||
|
|
|
@ -20,7 +20,7 @@ SFX CC 0 d/3 c
|
|||
SFX CC 0 c b
|
||||
|
||||
SFX DD Y 1
|
||||
SFX DD 0 s o
|
||||
SFX DD 0 s .
|
||||
|
||||
SFX EE Y 1
|
||||
SFX EE 0 d o
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
6
|
||||
10
|
||||
ab/3
|
||||
apach/1
|
||||
foo/4
|
||||
|
@ -7,3 +7,5 @@ lucen/1
|
|||
lucene
|
||||
mahout/1
|
||||
olr/2
|
||||
XYZ/4
|
||||
mixedCase
|
Loading…
Reference in New Issue