LUCENE-9691: Hunspell: support trailing comments on aff option lines (#2236)

Also cleans up and deduplicates the option-parsing code.
This commit is contained in:
Peter Gromov 2021-01-25 09:08:57 +01:00 committed by GitHub
parent c7e1079da9
commit f64e7cbbda
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 58 additions and 116 deletions

View File

@ -78,31 +78,6 @@ public class Dictionary {
private static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell
private static final String ALIAS_KEY = "AF";
private static final String MORPH_ALIAS_KEY = "AM";
private static final String PREFIX_KEY = "PFX";
private static final String SUFFIX_KEY = "SFX";
private static final String FLAG_KEY = "FLAG";
private static final String COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES";
private static final String CIRCUMFIX_KEY = "CIRCUMFIX";
private static final String IGNORE_KEY = "IGNORE";
private static final String ICONV_KEY = "ICONV";
private static final String OCONV_KEY = "OCONV";
private static final String FULLSTRIP_KEY = "FULLSTRIP";
private static final String LANG_KEY = "LANG";
private static final String BREAK_KEY = "BREAK";
private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD";
private static final String COMPOUNDMIN_KEY = "COMPOUNDMIN";
private static final String COMPOUNDRULE_KEY = "COMPOUNDRULE";
private static final String KEEPCASE_KEY = "KEEPCASE";
private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
private static final String ONLYINCOMPOUND_KEY = "ONLYINCOMPOUND";
private static final String NUM_FLAG_TYPE = "num";
private static final String UTF8_FLAG_TYPE = "UTF-8";
private static final String LONG_FLAG_TYPE = "long";
// TODO: really for suffixes we should reverse the automaton and run them backwards
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
@ -346,95 +321,62 @@ public class Dictionary {
if (reader.getLineNumber() == 1 && line.startsWith("\uFEFF")) {
line = line.substring(1);
}
if (line.startsWith(ALIAS_KEY)) {
line = line.trim();
if (line.isEmpty()) continue;
String firstWord = line.split("\\s")[0];
if ("AF".equals(firstWord)) {
parseAlias(line);
} else if (line.startsWith(MORPH_ALIAS_KEY)) {
} else if ("AM".equals(firstWord)) {
parseMorphAlias(line);
} else if (line.startsWith(PREFIX_KEY)) {
} else if ("PFX".equals(firstWord)) {
parseAffix(
prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
} else if (line.startsWith(SUFFIX_KEY)) {
} else if ("SFX".equals(firstWord)) {
parseAffix(
suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
} else if (line.startsWith(FLAG_KEY)) {
} else if ("FLAG".equals(firstWord)) {
// Assume that the FLAG line comes before any prefix or suffixes
// Store the strategy so it can be used when parsing the dic file
flagParsingStrategy = getFlagParsingStrategy(line);
} else if (line.equals(COMPLEXPREFIXES_KEY)) {
} else if (line.equals("COMPLEXPREFIXES")) {
complexPrefixes =
true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
} else if (line.startsWith(CIRCUMFIX_KEY)) {
String[] parts = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal CIRCUMFIX declaration", reader.getLineNumber());
}
circumfix = flagParsingStrategy.parseFlag(parts[1]);
} else if (line.startsWith(KEEPCASE_KEY)) {
String[] parts = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal KEEPCASE declaration", reader.getLineNumber());
}
keepcase = flagParsingStrategy.parseFlag(parts[1]);
} else if (line.startsWith(NEEDAFFIX_KEY) || line.startsWith(PSEUDOROOT_KEY)) {
String[] parts = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal NEEDAFFIX declaration", reader.getLineNumber());
}
needaffix = flagParsingStrategy.parseFlag(parts[1]);
} else if (line.startsWith(ONLYINCOMPOUND_KEY)) {
String[] parts = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal ONLYINCOMPOUND declaration", reader.getLineNumber());
}
onlyincompound = flagParsingStrategy.parseFlag(parts[1]);
} else if (line.startsWith(IGNORE_KEY)) {
String[] parts = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal IGNORE declaration", reader.getLineNumber());
}
ignore = parts[1].toCharArray();
} else if ("CIRCUMFIX".equals(firstWord)) {
circumfix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("KEEPCASE".equals(firstWord)) {
keepcase = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("NEEDAFFIX".equals(firstWord) || "PSEUDOROOT".equals(firstWord)) {
needaffix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("ONLYINCOMPOUND".equals(firstWord)) {
onlyincompound = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("IGNORE".equals(firstWord)) {
ignore = singleArgument(reader, line).toCharArray();
Arrays.sort(ignore);
needsInputCleaning = true;
} else if (line.startsWith(ICONV_KEY) || line.startsWith(OCONV_KEY)) {
String[] parts = line.split("\\s+");
String type = parts[0];
if (parts.length != 2) {
throw new ParseException("Illegal " + type + " declaration", reader.getLineNumber());
}
int num = Integer.parseInt(parts[1]);
} else if ("ICONV".equals(firstWord) || "OCONV".equals(firstWord)) {
int num = Integer.parseInt(singleArgument(reader, line));
FST<CharsRef> res = parseConversions(reader, num);
if (type.equals("ICONV")) {
if (line.startsWith("I")) {
iconv = res;
needsInputCleaning |= iconv != null;
} else {
oconv = res;
needsOutputCleaning |= oconv != null;
}
} else if (line.startsWith(FULLSTRIP_KEY)) {
} else if ("FULLSTRIP".equals(firstWord)) {
fullStrip = true;
} else if (line.startsWith(LANG_KEY)) {
language = line.substring(LANG_KEY.length()).trim();
} else if ("LANG".equals(firstWord)) {
language = singleArgument(reader, line);
alternateCasing = "tr_TR".equals(language) || "az_AZ".equals(language);
} else if (line.startsWith(BREAK_KEY)) {
} else if ("BREAK".equals(firstWord)) {
breaks = parseBreaks(reader, line);
} else if (line.startsWith(FORBIDDENWORD_KEY)) {
String[] parts = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal FORBIDDENWORD declaration", reader.getLineNumber());
}
forbiddenword = flagParsingStrategy.parseFlag(parts[1]);
} else if (line.startsWith(COMPOUNDMIN_KEY)) {
String[] parts = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal COMPOUNDMIN declaration", reader.getLineNumber());
}
compoundMin = Math.max(1, Integer.parseInt(parts[1]));
} else if (line.startsWith(COMPOUNDRULE_KEY)) {
String[] parts = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal COMPOUNDRULE header", reader.getLineNumber());
}
this.compoundRules = parseCompoundRules(reader, Integer.parseInt(parts[1]));
} else if ("FORBIDDENWORD".equals(firstWord)) {
forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line));
} else if ("COMPOUNDMIN".equals(firstWord)) {
compoundMin = Math.max(1, Integer.parseInt(singleArgument(reader, line)));
} else if ("COMPOUNDRULE".equals(firstWord)) {
compoundRules = parseCompoundRules(reader, Integer.parseInt(singleArgument(reader, line)));
}
}
@ -458,17 +400,25 @@ public class Dictionary {
stripOffsets[currentIndex] = currentOffset;
}
/**
 * Returns the sole argument of an option line of the form {@code KEY value}.
 *
 * <p>Validation of the line shape is delegated to {@code splitBySpace} with two expected parts.
 *
 * @param reader reader the line came from; passed through for error reporting
 * @param line the option line to take the argument from
 * @return the second whitespace-separated token of {@code line}
 * @throws ParseException if {@code splitBySpace} rejects the line
 */
private String singleArgument(LineNumberReader reader, String line) throws ParseException {
  // Index 0 holds the option keyword, index 1 its argument.
  String[] keywordAndValue = splitBySpace(reader, line, 2);
  return keywordAndValue[1];
}
/**
 * Splits a line on runs of whitespace and checks that it has exactly {@code expectedParts}
 * fields, optionally followed by a trailing comment.
 *
 * <p>Tokens beyond {@code expectedParts} are tolerated only when the first extra token starts
 * with {@code #}; anything else is rejected.
 *
 * @param reader reader the line came from; its current line number is used in error reports
 * @param line the line to split; {@code null} (end of input reached while more lines were
 *     expected) is reported as a parse error rather than failing with a NullPointerException
 * @param expectedParts the number of leading whitespace-separated fields the line must have
 * @return every whitespace-separated token of the line, including any trailing comment tokens
 * @throws ParseException if the line is missing, has too few fields, or has extra tokens that do
 *     not start a comment
 */
private String[] splitBySpace(LineNumberReader reader, String line, int expectedParts)
    throws ParseException {
  if (line == null) {
    // Callers pass reader.readLine() directly, so null means the file ended prematurely.
    throw new ParseException("Unexpected end of file", reader.getLineNumber());
  }
  String[] parts = line.split("\\s+");
  boolean tooFewParts = parts.length < expectedParts;
  // Only inspect the first extra token when there is one, so the index is always in range.
  boolean extraNonCommentParts =
      parts.length > expectedParts && !parts[expectedParts].startsWith("#");
  if (tooFewParts || extraNonCommentParts) {
    throw new ParseException("Invalid syntax: " + line, reader.getLineNumber());
  }
  return parts;
}
/**
 * Reads {@code num} lines from {@code reader} and builds a {@link CompoundRule} from each,
 * returning them in the order they were read.
 *
 * <p>NOTE(review): the loop body below shows both the older explicit validation
 * ({@code line.split} plus the {@code COMPOUNDRULE_KEY} check) and the newer
 * {@code singleArgument} call, each of which adds a rule; confirm which of the two is meant to
 * remain.
 *
 * @param reader source of the rule lines; also supplies line numbers for error reports
 * @param num number of rule lines to read
 * @return the parsed compound rules
 * @throws IOException if reading from {@code reader} fails
 * @throws ParseException if a line is not a well-formed rule entry
 */
private List<CompoundRule> parseCompoundRules(LineNumberReader reader, int num)
throws IOException, ParseException {
String line;
List<CompoundRule> compoundRules = new ArrayList<>();
for (int i = 0; i < num; i++) {
line = reader.readLine();
String[] parts = line.split("\\s+");
if (!line.startsWith(COMPOUNDRULE_KEY) || parts.length != 2) {
throw new ParseException("COMPOUNDRULE rule expected", reader.getLineNumber());
}
compoundRules.add(new CompoundRule(parts[1], this));
compoundRules.add(new CompoundRule(singleArgument(reader, reader.readLine()), this));
}
return compoundRules;
}
@ -478,14 +428,9 @@ public class Dictionary {
Set<String> starting = new LinkedHashSet<>();
Set<String> ending = new LinkedHashSet<>();
Set<String> middle = new LinkedHashSet<>();
int num = Integer.parseInt(line.substring(BREAK_KEY.length()).trim());
int num = Integer.parseInt(singleArgument(reader, line));
for (int i = 0; i < num; i++) {
line = reader.readLine();
String[] parts = line.split("\\s+");
if (!line.startsWith(BREAK_KEY) || parts.length != 2) {
throw new ParseException("BREAK chars expected", reader.getLineNumber());
}
String breakStr = parts[1];
String breakStr = singleArgument(reader, reader.readLine());
if (breakStr.startsWith("^")) {
starting.add(breakStr.substring(1));
} else if (breakStr.endsWith("$")) {
@ -689,11 +634,7 @@ public class Dictionary {
Map<String, String> mappings = new TreeMap<>();
for (int i = 0; i < num; i++) {
String line = reader.readLine();
String[] parts = line.split("\\s+");
if (parts.length != 3) {
throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
}
String[] parts = splitBySpace(reader, reader.readLine(), 3);
if (mappings.put(parts[1], parts[2]) != null) {
throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
}
@ -789,11 +730,11 @@ public class Dictionary {
}
String flagType = parts[1];
if (NUM_FLAG_TYPE.equals(flagType)) {
if ("num".equals(flagType)) {
return new NumFlagParsingStrategy();
} else if (UTF8_FLAG_TYPE.equals(flagType)) {
} else if ("UTF-8".equals(flagType)) {
return new SimpleFlagParsingStrategy();
} else if (LONG_FLAG_TYPE.equals(flagType)) {
} else if ("long".equals(flagType)) {
return new DoubleASCIIFlagParsingStrategy();
}

View File

@ -6,9 +6,10 @@ ICONV B b
ICONV C c
ICONV I i
OCONV 4
OCONV a A
OCONV b B
# Also test whitespace variations and trailing comments.
OCONV 4 # space, space
OCONV a A # tab, space, space
OCONV b B # tab, tab, space
OCONV c C
OCONV i I