mirror of https://github.com/apache/lucene.git
LUCENE-9691: Hunspell: support trailing comments on aff option lines (#2236)
plus cleanup & deduplicate parsing
This commit is contained in:
parent
c7e1079da9
commit
f64e7cbbda
|
@ -78,31 +78,6 @@ public class Dictionary {
|
|||
|
||||
private static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell
|
||||
|
||||
private static final String ALIAS_KEY = "AF";
|
||||
private static final String MORPH_ALIAS_KEY = "AM";
|
||||
private static final String PREFIX_KEY = "PFX";
|
||||
private static final String SUFFIX_KEY = "SFX";
|
||||
private static final String FLAG_KEY = "FLAG";
|
||||
private static final String COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES";
|
||||
private static final String CIRCUMFIX_KEY = "CIRCUMFIX";
|
||||
private static final String IGNORE_KEY = "IGNORE";
|
||||
private static final String ICONV_KEY = "ICONV";
|
||||
private static final String OCONV_KEY = "OCONV";
|
||||
private static final String FULLSTRIP_KEY = "FULLSTRIP";
|
||||
private static final String LANG_KEY = "LANG";
|
||||
private static final String BREAK_KEY = "BREAK";
|
||||
private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD";
|
||||
private static final String COMPOUNDMIN_KEY = "COMPOUNDMIN";
|
||||
private static final String COMPOUNDRULE_KEY = "COMPOUNDRULE";
|
||||
private static final String KEEPCASE_KEY = "KEEPCASE";
|
||||
private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
|
||||
private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
|
||||
private static final String ONLYINCOMPOUND_KEY = "ONLYINCOMPOUND";
|
||||
|
||||
private static final String NUM_FLAG_TYPE = "num";
|
||||
private static final String UTF8_FLAG_TYPE = "UTF-8";
|
||||
private static final String LONG_FLAG_TYPE = "long";
|
||||
|
||||
// TODO: really for suffixes we should reverse the automaton and run them backwards
|
||||
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
|
||||
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
|
||||
|
@ -346,95 +321,62 @@ public class Dictionary {
|
|||
if (reader.getLineNumber() == 1 && line.startsWith("\uFEFF")) {
|
||||
line = line.substring(1);
|
||||
}
|
||||
if (line.startsWith(ALIAS_KEY)) {
|
||||
line = line.trim();
|
||||
if (line.isEmpty()) continue;
|
||||
|
||||
String firstWord = line.split("\\s")[0];
|
||||
if ("AF".equals(firstWord)) {
|
||||
parseAlias(line);
|
||||
} else if (line.startsWith(MORPH_ALIAS_KEY)) {
|
||||
} else if ("AM".equals(firstWord)) {
|
||||
parseMorphAlias(line);
|
||||
} else if (line.startsWith(PREFIX_KEY)) {
|
||||
} else if ("PFX".equals(firstWord)) {
|
||||
parseAffix(
|
||||
prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
|
||||
} else if (line.startsWith(SUFFIX_KEY)) {
|
||||
} else if ("SFX".equals(firstWord)) {
|
||||
parseAffix(
|
||||
suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
|
||||
} else if (line.startsWith(FLAG_KEY)) {
|
||||
} else if ("FLAG".equals(firstWord)) {
|
||||
// Assume that the FLAG line comes before any prefixes or suffixes
|
||||
// Store the strategy so it can be used when parsing the dic file
|
||||
flagParsingStrategy = getFlagParsingStrategy(line);
|
||||
} else if (line.equals(COMPLEXPREFIXES_KEY)) {
|
||||
} else if (line.equals("COMPLEXPREFIXES")) {
|
||||
complexPrefixes =
|
||||
true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
|
||||
} else if (line.startsWith(CIRCUMFIX_KEY)) {
|
||||
String[] parts = line.split("\\s+");
|
||||
if (parts.length != 2) {
|
||||
throw new ParseException("Illegal CIRCUMFIX declaration", reader.getLineNumber());
|
||||
}
|
||||
circumfix = flagParsingStrategy.parseFlag(parts[1]);
|
||||
} else if (line.startsWith(KEEPCASE_KEY)) {
|
||||
String[] parts = line.split("\\s+");
|
||||
if (parts.length != 2) {
|
||||
throw new ParseException("Illegal KEEPCASE declaration", reader.getLineNumber());
|
||||
}
|
||||
keepcase = flagParsingStrategy.parseFlag(parts[1]);
|
||||
} else if (line.startsWith(NEEDAFFIX_KEY) || line.startsWith(PSEUDOROOT_KEY)) {
|
||||
String[] parts = line.split("\\s+");
|
||||
if (parts.length != 2) {
|
||||
throw new ParseException("Illegal NEEDAFFIX declaration", reader.getLineNumber());
|
||||
}
|
||||
needaffix = flagParsingStrategy.parseFlag(parts[1]);
|
||||
} else if (line.startsWith(ONLYINCOMPOUND_KEY)) {
|
||||
String[] parts = line.split("\\s+");
|
||||
if (parts.length != 2) {
|
||||
throw new ParseException("Illegal ONLYINCOMPOUND declaration", reader.getLineNumber());
|
||||
}
|
||||
onlyincompound = flagParsingStrategy.parseFlag(parts[1]);
|
||||
} else if (line.startsWith(IGNORE_KEY)) {
|
||||
String[] parts = line.split("\\s+");
|
||||
if (parts.length != 2) {
|
||||
throw new ParseException("Illegal IGNORE declaration", reader.getLineNumber());
|
||||
}
|
||||
ignore = parts[1].toCharArray();
|
||||
} else if ("CIRCUMFIX".equals(firstWord)) {
|
||||
circumfix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("KEEPCASE".equals(firstWord)) {
|
||||
keepcase = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("NEEDAFFIX".equals(firstWord) || "PSEUDOROOT".equals(firstWord)) {
|
||||
needaffix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("ONLYINCOMPOUND".equals(firstWord)) {
|
||||
onlyincompound = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("IGNORE".equals(firstWord)) {
|
||||
ignore = singleArgument(reader, line).toCharArray();
|
||||
Arrays.sort(ignore);
|
||||
needsInputCleaning = true;
|
||||
} else if (line.startsWith(ICONV_KEY) || line.startsWith(OCONV_KEY)) {
|
||||
String[] parts = line.split("\\s+");
|
||||
String type = parts[0];
|
||||
if (parts.length != 2) {
|
||||
throw new ParseException("Illegal " + type + " declaration", reader.getLineNumber());
|
||||
}
|
||||
int num = Integer.parseInt(parts[1]);
|
||||
} else if ("ICONV".equals(firstWord) || "OCONV".equals(firstWord)) {
|
||||
int num = Integer.parseInt(singleArgument(reader, line));
|
||||
FST<CharsRef> res = parseConversions(reader, num);
|
||||
if (type.equals("ICONV")) {
|
||||
if (line.startsWith("I")) {
|
||||
iconv = res;
|
||||
needsInputCleaning |= iconv != null;
|
||||
} else {
|
||||
oconv = res;
|
||||
needsOutputCleaning |= oconv != null;
|
||||
}
|
||||
} else if (line.startsWith(FULLSTRIP_KEY)) {
|
||||
} else if ("FULLSTRIP".equals(firstWord)) {
|
||||
fullStrip = true;
|
||||
} else if (line.startsWith(LANG_KEY)) {
|
||||
language = line.substring(LANG_KEY.length()).trim();
|
||||
} else if ("LANG".equals(firstWord)) {
|
||||
language = singleArgument(reader, line);
|
||||
alternateCasing = "tr_TR".equals(language) || "az_AZ".equals(language);
|
||||
} else if (line.startsWith(BREAK_KEY)) {
|
||||
} else if ("BREAK".equals(firstWord)) {
|
||||
breaks = parseBreaks(reader, line);
|
||||
} else if (line.startsWith(FORBIDDENWORD_KEY)) {
|
||||
String[] parts = line.split("\\s+");
|
||||
if (parts.length != 2) {
|
||||
throw new ParseException("Illegal FORBIDDENWORD declaration", reader.getLineNumber());
|
||||
}
|
||||
forbiddenword = flagParsingStrategy.parseFlag(parts[1]);
|
||||
} else if (line.startsWith(COMPOUNDMIN_KEY)) {
|
||||
String[] parts = line.split("\\s+");
|
||||
if (parts.length != 2) {
|
||||
throw new ParseException("Illegal COMPOUNDMIN declaration", reader.getLineNumber());
|
||||
}
|
||||
compoundMin = Math.max(1, Integer.parseInt(parts[1]));
|
||||
} else if (line.startsWith(COMPOUNDRULE_KEY)) {
|
||||
String[] parts = line.split("\\s+");
|
||||
if (parts.length != 2) {
|
||||
throw new ParseException("Illegal COMPOUNDRULE header", reader.getLineNumber());
|
||||
}
|
||||
this.compoundRules = parseCompoundRules(reader, Integer.parseInt(parts[1]));
|
||||
} else if ("FORBIDDENWORD".equals(firstWord)) {
|
||||
forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line));
|
||||
} else if ("COMPOUNDMIN".equals(firstWord)) {
|
||||
compoundMin = Math.max(1, Integer.parseInt(singleArgument(reader, line)));
|
||||
} else if ("COMPOUNDRULE".equals(firstWord)) {
|
||||
compoundRules = parseCompoundRules(reader, Integer.parseInt(singleArgument(reader, line)));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -458,17 +400,25 @@ public class Dictionary {
|
|||
stripOffsets[currentIndex] = currentOffset;
|
||||
}
|
||||
|
||||
private String singleArgument(LineNumberReader reader, String line) throws ParseException {
|
||||
return splitBySpace(reader, line, 2)[1];
|
||||
}
|
||||
|
||||
private String[] splitBySpace(LineNumberReader reader, String line, int expectedParts)
|
||||
throws ParseException {
|
||||
String[] parts = line.split("\\s+");
|
||||
if (parts.length < expectedParts
|
||||
|| parts.length > expectedParts && !parts[expectedParts].startsWith("#")) {
|
||||
throw new ParseException("Invalid syntax", reader.getLineNumber());
|
||||
}
|
||||
return parts;
|
||||
}
|
||||
|
||||
private List<CompoundRule> parseCompoundRules(LineNumberReader reader, int num)
|
||||
throws IOException, ParseException {
|
||||
String line;
|
||||
List<CompoundRule> compoundRules = new ArrayList<>();
|
||||
for (int i = 0; i < num; i++) {
|
||||
line = reader.readLine();
|
||||
String[] parts = line.split("\\s+");
|
||||
if (!line.startsWith(COMPOUNDRULE_KEY) || parts.length != 2) {
|
||||
throw new ParseException("COMPOUNDRULE rule expected", reader.getLineNumber());
|
||||
}
|
||||
compoundRules.add(new CompoundRule(parts[1], this));
|
||||
compoundRules.add(new CompoundRule(singleArgument(reader, reader.readLine()), this));
|
||||
}
|
||||
return compoundRules;
|
||||
}
|
||||
|
@ -478,14 +428,9 @@ public class Dictionary {
|
|||
Set<String> starting = new LinkedHashSet<>();
|
||||
Set<String> ending = new LinkedHashSet<>();
|
||||
Set<String> middle = new LinkedHashSet<>();
|
||||
int num = Integer.parseInt(line.substring(BREAK_KEY.length()).trim());
|
||||
int num = Integer.parseInt(singleArgument(reader, line));
|
||||
for (int i = 0; i < num; i++) {
|
||||
line = reader.readLine();
|
||||
String[] parts = line.split("\\s+");
|
||||
if (!line.startsWith(BREAK_KEY) || parts.length != 2) {
|
||||
throw new ParseException("BREAK chars expected", reader.getLineNumber());
|
||||
}
|
||||
String breakStr = parts[1];
|
||||
String breakStr = singleArgument(reader, reader.readLine());
|
||||
if (breakStr.startsWith("^")) {
|
||||
starting.add(breakStr.substring(1));
|
||||
} else if (breakStr.endsWith("$")) {
|
||||
|
@ -689,11 +634,7 @@ public class Dictionary {
|
|||
Map<String, String> mappings = new TreeMap<>();
|
||||
|
||||
for (int i = 0; i < num; i++) {
|
||||
String line = reader.readLine();
|
||||
String[] parts = line.split("\\s+");
|
||||
if (parts.length != 3) {
|
||||
throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
|
||||
}
|
||||
String[] parts = splitBySpace(reader, reader.readLine(), 3);
|
||||
if (mappings.put(parts[1], parts[2]) != null) {
|
||||
throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
|
||||
}
|
||||
|
@ -789,11 +730,11 @@ public class Dictionary {
|
|||
}
|
||||
String flagType = parts[1];
|
||||
|
||||
if (NUM_FLAG_TYPE.equals(flagType)) {
|
||||
if ("num".equals(flagType)) {
|
||||
return new NumFlagParsingStrategy();
|
||||
} else if (UTF8_FLAG_TYPE.equals(flagType)) {
|
||||
} else if ("UTF-8".equals(flagType)) {
|
||||
return new SimpleFlagParsingStrategy();
|
||||
} else if (LONG_FLAG_TYPE.equals(flagType)) {
|
||||
} else if ("long".equals(flagType)) {
|
||||
return new DoubleASCIIFlagParsingStrategy();
|
||||
}
|
||||
|
||||
|
|
|
@ -6,9 +6,10 @@ ICONV B b
|
|||
ICONV C c
|
||||
ICONV I i
|
||||
|
||||
OCONV 4
|
||||
OCONV a A
|
||||
OCONV b B
|
||||
# Testing also whitespace and comments.
|
||||
OCONV 4 # space, space
|
||||
OCONV a A # tab, space, space
|
||||
OCONV b B # tab, tab, space
|
||||
OCONV c C
|
||||
OCONV i I
|
||||
|
||||
|
|
Loading…
Reference in New Issue