mirror of https://github.com/apache/lucene.git
LUCENE-9691: Hunspell: support trailing comments on aff option lines (#2236)
plus cleanup & deduplicate parsing
This commit is contained in:
parent
c7e1079da9
commit
f64e7cbbda
|
@ -78,31 +78,6 @@ public class Dictionary {
|
||||||
|
|
||||||
private static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell
|
private static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell
|
||||||
|
|
||||||
private static final String ALIAS_KEY = "AF";
|
|
||||||
private static final String MORPH_ALIAS_KEY = "AM";
|
|
||||||
private static final String PREFIX_KEY = "PFX";
|
|
||||||
private static final String SUFFIX_KEY = "SFX";
|
|
||||||
private static final String FLAG_KEY = "FLAG";
|
|
||||||
private static final String COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES";
|
|
||||||
private static final String CIRCUMFIX_KEY = "CIRCUMFIX";
|
|
||||||
private static final String IGNORE_KEY = "IGNORE";
|
|
||||||
private static final String ICONV_KEY = "ICONV";
|
|
||||||
private static final String OCONV_KEY = "OCONV";
|
|
||||||
private static final String FULLSTRIP_KEY = "FULLSTRIP";
|
|
||||||
private static final String LANG_KEY = "LANG";
|
|
||||||
private static final String BREAK_KEY = "BREAK";
|
|
||||||
private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD";
|
|
||||||
private static final String COMPOUNDMIN_KEY = "COMPOUNDMIN";
|
|
||||||
private static final String COMPOUNDRULE_KEY = "COMPOUNDRULE";
|
|
||||||
private static final String KEEPCASE_KEY = "KEEPCASE";
|
|
||||||
private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
|
|
||||||
private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
|
|
||||||
private static final String ONLYINCOMPOUND_KEY = "ONLYINCOMPOUND";
|
|
||||||
|
|
||||||
private static final String NUM_FLAG_TYPE = "num";
|
|
||||||
private static final String UTF8_FLAG_TYPE = "UTF-8";
|
|
||||||
private static final String LONG_FLAG_TYPE = "long";
|
|
||||||
|
|
||||||
// TODO: really for suffixes we should reverse the automaton and run them backwards
|
// TODO: really for suffixes we should reverse the automaton and run them backwards
|
||||||
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
|
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
|
||||||
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
|
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
|
||||||
|
@ -346,95 +321,62 @@ public class Dictionary {
|
||||||
if (reader.getLineNumber() == 1 && line.startsWith("\uFEFF")) {
|
if (reader.getLineNumber() == 1 && line.startsWith("\uFEFF")) {
|
||||||
line = line.substring(1);
|
line = line.substring(1);
|
||||||
}
|
}
|
||||||
if (line.startsWith(ALIAS_KEY)) {
|
line = line.trim();
|
||||||
|
if (line.isEmpty()) continue;
|
||||||
|
|
||||||
|
String firstWord = line.split("\\s")[0];
|
||||||
|
if ("AF".equals(firstWord)) {
|
||||||
parseAlias(line);
|
parseAlias(line);
|
||||||
} else if (line.startsWith(MORPH_ALIAS_KEY)) {
|
} else if ("AM".equals(firstWord)) {
|
||||||
parseMorphAlias(line);
|
parseMorphAlias(line);
|
||||||
} else if (line.startsWith(PREFIX_KEY)) {
|
} else if ("PFX".equals(firstWord)) {
|
||||||
parseAffix(
|
parseAffix(
|
||||||
prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
|
prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
|
||||||
} else if (line.startsWith(SUFFIX_KEY)) {
|
} else if ("SFX".equals(firstWord)) {
|
||||||
parseAffix(
|
parseAffix(
|
||||||
suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
|
suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
|
||||||
} else if (line.startsWith(FLAG_KEY)) {
|
} else if ("FLAG".equals(firstWord)) {
|
||||||
// Assume that the FLAG line comes before any prefix or suffixes
|
// Assume that the FLAG line comes before any prefix or suffixes
|
||||||
// Store the strategy so it can be used when parsing the dic file
|
// Store the strategy so it can be used when parsing the dic file
|
||||||
flagParsingStrategy = getFlagParsingStrategy(line);
|
flagParsingStrategy = getFlagParsingStrategy(line);
|
||||||
} else if (line.equals(COMPLEXPREFIXES_KEY)) {
|
} else if (line.equals("COMPLEXPREFIXES")) {
|
||||||
complexPrefixes =
|
complexPrefixes =
|
||||||
true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
|
true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
|
||||||
} else if (line.startsWith(CIRCUMFIX_KEY)) {
|
} else if ("CIRCUMFIX".equals(firstWord)) {
|
||||||
String[] parts = line.split("\\s+");
|
circumfix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||||
if (parts.length != 2) {
|
} else if ("KEEPCASE".equals(firstWord)) {
|
||||||
throw new ParseException("Illegal CIRCUMFIX declaration", reader.getLineNumber());
|
keepcase = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||||
}
|
} else if ("NEEDAFFIX".equals(firstWord) || "PSEUDOROOT".equals(firstWord)) {
|
||||||
circumfix = flagParsingStrategy.parseFlag(parts[1]);
|
needaffix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||||
} else if (line.startsWith(KEEPCASE_KEY)) {
|
} else if ("ONLYINCOMPOUND".equals(firstWord)) {
|
||||||
String[] parts = line.split("\\s+");
|
onlyincompound = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||||
if (parts.length != 2) {
|
} else if ("IGNORE".equals(firstWord)) {
|
||||||
throw new ParseException("Illegal KEEPCASE declaration", reader.getLineNumber());
|
ignore = singleArgument(reader, line).toCharArray();
|
||||||
}
|
|
||||||
keepcase = flagParsingStrategy.parseFlag(parts[1]);
|
|
||||||
} else if (line.startsWith(NEEDAFFIX_KEY) || line.startsWith(PSEUDOROOT_KEY)) {
|
|
||||||
String[] parts = line.split("\\s+");
|
|
||||||
if (parts.length != 2) {
|
|
||||||
throw new ParseException("Illegal NEEDAFFIX declaration", reader.getLineNumber());
|
|
||||||
}
|
|
||||||
needaffix = flagParsingStrategy.parseFlag(parts[1]);
|
|
||||||
} else if (line.startsWith(ONLYINCOMPOUND_KEY)) {
|
|
||||||
String[] parts = line.split("\\s+");
|
|
||||||
if (parts.length != 2) {
|
|
||||||
throw new ParseException("Illegal ONLYINCOMPOUND declaration", reader.getLineNumber());
|
|
||||||
}
|
|
||||||
onlyincompound = flagParsingStrategy.parseFlag(parts[1]);
|
|
||||||
} else if (line.startsWith(IGNORE_KEY)) {
|
|
||||||
String[] parts = line.split("\\s+");
|
|
||||||
if (parts.length != 2) {
|
|
||||||
throw new ParseException("Illegal IGNORE declaration", reader.getLineNumber());
|
|
||||||
}
|
|
||||||
ignore = parts[1].toCharArray();
|
|
||||||
Arrays.sort(ignore);
|
Arrays.sort(ignore);
|
||||||
needsInputCleaning = true;
|
needsInputCleaning = true;
|
||||||
} else if (line.startsWith(ICONV_KEY) || line.startsWith(OCONV_KEY)) {
|
} else if ("ICONV".equals(firstWord) || "OCONV".equals(firstWord)) {
|
||||||
String[] parts = line.split("\\s+");
|
int num = Integer.parseInt(singleArgument(reader, line));
|
||||||
String type = parts[0];
|
|
||||||
if (parts.length != 2) {
|
|
||||||
throw new ParseException("Illegal " + type + " declaration", reader.getLineNumber());
|
|
||||||
}
|
|
||||||
int num = Integer.parseInt(parts[1]);
|
|
||||||
FST<CharsRef> res = parseConversions(reader, num);
|
FST<CharsRef> res = parseConversions(reader, num);
|
||||||
if (type.equals("ICONV")) {
|
if (line.startsWith("I")) {
|
||||||
iconv = res;
|
iconv = res;
|
||||||
needsInputCleaning |= iconv != null;
|
needsInputCleaning |= iconv != null;
|
||||||
} else {
|
} else {
|
||||||
oconv = res;
|
oconv = res;
|
||||||
needsOutputCleaning |= oconv != null;
|
needsOutputCleaning |= oconv != null;
|
||||||
}
|
}
|
||||||
} else if (line.startsWith(FULLSTRIP_KEY)) {
|
} else if ("FULLSTRIP".equals(firstWord)) {
|
||||||
fullStrip = true;
|
fullStrip = true;
|
||||||
} else if (line.startsWith(LANG_KEY)) {
|
} else if ("LANG".equals(firstWord)) {
|
||||||
language = line.substring(LANG_KEY.length()).trim();
|
language = singleArgument(reader, line);
|
||||||
alternateCasing = "tr_TR".equals(language) || "az_AZ".equals(language);
|
alternateCasing = "tr_TR".equals(language) || "az_AZ".equals(language);
|
||||||
} else if (line.startsWith(BREAK_KEY)) {
|
} else if ("BREAK".equals(firstWord)) {
|
||||||
breaks = parseBreaks(reader, line);
|
breaks = parseBreaks(reader, line);
|
||||||
} else if (line.startsWith(FORBIDDENWORD_KEY)) {
|
} else if ("FORBIDDENWORD".equals(firstWord)) {
|
||||||
String[] parts = line.split("\\s+");
|
forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||||
if (parts.length != 2) {
|
} else if ("COMPOUNDMIN".equals(firstWord)) {
|
||||||
throw new ParseException("Illegal FORBIDDENWORD declaration", reader.getLineNumber());
|
compoundMin = Math.max(1, Integer.parseInt(singleArgument(reader, line)));
|
||||||
}
|
} else if ("COMPOUNDRULE".equals(firstWord)) {
|
||||||
forbiddenword = flagParsingStrategy.parseFlag(parts[1]);
|
compoundRules = parseCompoundRules(reader, Integer.parseInt(singleArgument(reader, line)));
|
||||||
} else if (line.startsWith(COMPOUNDMIN_KEY)) {
|
|
||||||
String[] parts = line.split("\\s+");
|
|
||||||
if (parts.length != 2) {
|
|
||||||
throw new ParseException("Illegal COMPOUNDMIN declaration", reader.getLineNumber());
|
|
||||||
}
|
|
||||||
compoundMin = Math.max(1, Integer.parseInt(parts[1]));
|
|
||||||
} else if (line.startsWith(COMPOUNDRULE_KEY)) {
|
|
||||||
String[] parts = line.split("\\s+");
|
|
||||||
if (parts.length != 2) {
|
|
||||||
throw new ParseException("Illegal COMPOUNDRULE header", reader.getLineNumber());
|
|
||||||
}
|
|
||||||
this.compoundRules = parseCompoundRules(reader, Integer.parseInt(parts[1]));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -458,17 +400,25 @@ public class Dictionary {
|
||||||
stripOffsets[currentIndex] = currentOffset;
|
stripOffsets[currentIndex] = currentOffset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String singleArgument(LineNumberReader reader, String line) throws ParseException {
|
||||||
|
return splitBySpace(reader, line, 2)[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
private String[] splitBySpace(LineNumberReader reader, String line, int expectedParts)
|
||||||
|
throws ParseException {
|
||||||
|
String[] parts = line.split("\\s+");
|
||||||
|
if (parts.length < expectedParts
|
||||||
|
|| parts.length > expectedParts && !parts[expectedParts].startsWith("#")) {
|
||||||
|
throw new ParseException("Invalid syntax", reader.getLineNumber());
|
||||||
|
}
|
||||||
|
return parts;
|
||||||
|
}
|
||||||
|
|
||||||
private List<CompoundRule> parseCompoundRules(LineNumberReader reader, int num)
|
private List<CompoundRule> parseCompoundRules(LineNumberReader reader, int num)
|
||||||
throws IOException, ParseException {
|
throws IOException, ParseException {
|
||||||
String line;
|
|
||||||
List<CompoundRule> compoundRules = new ArrayList<>();
|
List<CompoundRule> compoundRules = new ArrayList<>();
|
||||||
for (int i = 0; i < num; i++) {
|
for (int i = 0; i < num; i++) {
|
||||||
line = reader.readLine();
|
compoundRules.add(new CompoundRule(singleArgument(reader, reader.readLine()), this));
|
||||||
String[] parts = line.split("\\s+");
|
|
||||||
if (!line.startsWith(COMPOUNDRULE_KEY) || parts.length != 2) {
|
|
||||||
throw new ParseException("COMPOUNDRULE rule expected", reader.getLineNumber());
|
|
||||||
}
|
|
||||||
compoundRules.add(new CompoundRule(parts[1], this));
|
|
||||||
}
|
}
|
||||||
return compoundRules;
|
return compoundRules;
|
||||||
}
|
}
|
||||||
|
@ -478,14 +428,9 @@ public class Dictionary {
|
||||||
Set<String> starting = new LinkedHashSet<>();
|
Set<String> starting = new LinkedHashSet<>();
|
||||||
Set<String> ending = new LinkedHashSet<>();
|
Set<String> ending = new LinkedHashSet<>();
|
||||||
Set<String> middle = new LinkedHashSet<>();
|
Set<String> middle = new LinkedHashSet<>();
|
||||||
int num = Integer.parseInt(line.substring(BREAK_KEY.length()).trim());
|
int num = Integer.parseInt(singleArgument(reader, line));
|
||||||
for (int i = 0; i < num; i++) {
|
for (int i = 0; i < num; i++) {
|
||||||
line = reader.readLine();
|
String breakStr = singleArgument(reader, reader.readLine());
|
||||||
String[] parts = line.split("\\s+");
|
|
||||||
if (!line.startsWith(BREAK_KEY) || parts.length != 2) {
|
|
||||||
throw new ParseException("BREAK chars expected", reader.getLineNumber());
|
|
||||||
}
|
|
||||||
String breakStr = parts[1];
|
|
||||||
if (breakStr.startsWith("^")) {
|
if (breakStr.startsWith("^")) {
|
||||||
starting.add(breakStr.substring(1));
|
starting.add(breakStr.substring(1));
|
||||||
} else if (breakStr.endsWith("$")) {
|
} else if (breakStr.endsWith("$")) {
|
||||||
|
@ -689,11 +634,7 @@ public class Dictionary {
|
||||||
Map<String, String> mappings = new TreeMap<>();
|
Map<String, String> mappings = new TreeMap<>();
|
||||||
|
|
||||||
for (int i = 0; i < num; i++) {
|
for (int i = 0; i < num; i++) {
|
||||||
String line = reader.readLine();
|
String[] parts = splitBySpace(reader, reader.readLine(), 3);
|
||||||
String[] parts = line.split("\\s+");
|
|
||||||
if (parts.length != 3) {
|
|
||||||
throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
|
|
||||||
}
|
|
||||||
if (mappings.put(parts[1], parts[2]) != null) {
|
if (mappings.put(parts[1], parts[2]) != null) {
|
||||||
throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
|
throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
|
||||||
}
|
}
|
||||||
|
@ -789,11 +730,11 @@ public class Dictionary {
|
||||||
}
|
}
|
||||||
String flagType = parts[1];
|
String flagType = parts[1];
|
||||||
|
|
||||||
if (NUM_FLAG_TYPE.equals(flagType)) {
|
if ("num".equals(flagType)) {
|
||||||
return new NumFlagParsingStrategy();
|
return new NumFlagParsingStrategy();
|
||||||
} else if (UTF8_FLAG_TYPE.equals(flagType)) {
|
} else if ("UTF-8".equals(flagType)) {
|
||||||
return new SimpleFlagParsingStrategy();
|
return new SimpleFlagParsingStrategy();
|
||||||
} else if (LONG_FLAG_TYPE.equals(flagType)) {
|
} else if ("long".equals(flagType)) {
|
||||||
return new DoubleASCIIFlagParsingStrategy();
|
return new DoubleASCIIFlagParsingStrategy();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -6,9 +6,10 @@ ICONV B b
|
||||||
ICONV C c
|
ICONV C c
|
||||||
ICONV I i
|
ICONV I i
|
||||||
|
|
||||||
OCONV 4
|
# Testing also whitespace and comments.
|
||||||
OCONV a A
|
OCONV 4 # space, space
|
||||||
OCONV b B
|
OCONV a A # tab, space, space
|
||||||
|
OCONV b B # tab, tab, space
|
||||||
OCONV c C
|
OCONV c C
|
||||||
OCONV i I
|
OCONV i I
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue