LUCENE-9691: Hunspell: support trailing comments on aff option lines (#2236)

Also clean up and deduplicate the parsing code.
2021-01-25 09:08:57 +01:00 · 2021-01-25 09:08:57 +01:00 · f64e7cbbda
parent c7e1079da9
commit f64e7cbbda
2 changed files with 58 additions and 116 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -78,31 +78,6 @@ public class Dictionary {
  private static final char HIDDEN_FLAG = (char) 65511; // called 'ONLYUPCASEFLAG' in Hunspell
  private static final String ALIAS_KEY = "AF";
  private static final String MORPH_ALIAS_KEY = "AM";
  private static final String PREFIX_KEY = "PFX";
  private static final String SUFFIX_KEY = "SFX";
  private static final String FLAG_KEY = "FLAG";
  private static final String COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES";
  private static final String CIRCUMFIX_KEY = "CIRCUMFIX";
  private static final String IGNORE_KEY = "IGNORE";
  private static final String ICONV_KEY = "ICONV";
  private static final String OCONV_KEY = "OCONV";
  private static final String FULLSTRIP_KEY = "FULLSTRIP";
  private static final String LANG_KEY = "LANG";
  private static final String BREAK_KEY = "BREAK";
  private static final String FORBIDDENWORD_KEY = "FORBIDDENWORD";
  private static final String COMPOUNDMIN_KEY = "COMPOUNDMIN";
  private static final String COMPOUNDRULE_KEY = "COMPOUNDRULE";
  private static final String KEEPCASE_KEY = "KEEPCASE";
  private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
  private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
  private static final String ONLYINCOMPOUND_KEY = "ONLYINCOMPOUND";
  private static final String NUM_FLAG_TYPE = "num";
  private static final String UTF8_FLAG_TYPE = "UTF-8";
  private static final String LONG_FLAG_TYPE = "long";
  // TODO: really for suffixes we should reverse the automaton and run them backwards
  private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
@ -346,95 +321,62 @@ public class Dictionary {
      if (reader.getLineNumber() == 1 && line.startsWith("\uFEFF")) {
        line = line.substring(1);
      }
-      if (line.startsWith(ALIAS_KEY)) {
+      line = line.trim();
      if (line.isEmpty()) continue;
      String firstWord = line.split("\\s")[0];
      if ("AF".equals(firstWord)) {
        parseAlias(line);
-      } else if (line.startsWith(MORPH_ALIAS_KEY)) {
+      } else if ("AM".equals(firstWord)) {
        parseMorphAlias(line);
-      } else if (line.startsWith(PREFIX_KEY)) {
+      } else if ("PFX".equals(firstWord)) {
        parseAffix(
            prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
-      } else if (line.startsWith(SUFFIX_KEY)) {
+      } else if ("SFX".equals(firstWord)) {
        parseAffix(
            suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
-      } else if (line.startsWith(FLAG_KEY)) {
+      } else if ("FLAG".equals(firstWord)) {
        // Assume that the FLAG line comes before any prefix or suffixes
        // Store the strategy so it can be used when parsing the dic file
        flagParsingStrategy = getFlagParsingStrategy(line);
-      } else if (line.equals(COMPLEXPREFIXES_KEY)) {
+      } else if (line.equals("COMPLEXPREFIXES")) {
        complexPrefixes =
            true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
-      } else if (line.startsWith(CIRCUMFIX_KEY)) {
+      } else if ("CIRCUMFIX".equals(firstWord)) {
-        String[] parts = line.split("\\s+");
+        circumfix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
-        if (parts.length != 2) {
+      } else if ("KEEPCASE".equals(firstWord)) {
-          throw new ParseException("Illegal CIRCUMFIX declaration", reader.getLineNumber());
+        keepcase = flagParsingStrategy.parseFlag(singleArgument(reader, line));
-        }
+      } else if ("NEEDAFFIX".equals(firstWord) || "PSEUDOROOT".equals(firstWord)) {
-        circumfix = flagParsingStrategy.parseFlag(parts[1]);
+        needaffix = flagParsingStrategy.parseFlag(singleArgument(reader, line));
-      } else if (line.startsWith(KEEPCASE_KEY)) {
+      } else if ("ONLYINCOMPOUND".equals(firstWord)) {
-        String[] parts = line.split("\\s+");
+        onlyincompound = flagParsingStrategy.parseFlag(singleArgument(reader, line));
-        if (parts.length != 2) {
+      } else if ("IGNORE".equals(firstWord)) {
-          throw new ParseException("Illegal KEEPCASE declaration", reader.getLineNumber());
+        ignore = singleArgument(reader, line).toCharArray();
        }
        keepcase = flagParsingStrategy.parseFlag(parts[1]);
      } else if (line.startsWith(NEEDAFFIX_KEY) || line.startsWith(PSEUDOROOT_KEY)) {
        String[] parts = line.split("\\s+");
        if (parts.length != 2) {
          throw new ParseException("Illegal NEEDAFFIX declaration", reader.getLineNumber());
        }
        needaffix = flagParsingStrategy.parseFlag(parts[1]);
      } else if (line.startsWith(ONLYINCOMPOUND_KEY)) {
        String[] parts = line.split("\\s+");
        if (parts.length != 2) {
          throw new ParseException("Illegal ONLYINCOMPOUND declaration", reader.getLineNumber());
        }
        onlyincompound = flagParsingStrategy.parseFlag(parts[1]);
      } else if (line.startsWith(IGNORE_KEY)) {
        String[] parts = line.split("\\s+");
        if (parts.length != 2) {
          throw new ParseException("Illegal IGNORE declaration", reader.getLineNumber());
        }
        ignore = parts[1].toCharArray();
        Arrays.sort(ignore);
        needsInputCleaning = true;
-      } else if (line.startsWith(ICONV_KEY) || line.startsWith(OCONV_KEY)) {
+      } else if ("ICONV".equals(firstWord) || "OCONV".equals(firstWord)) {
-        String[] parts = line.split("\\s+");
+        int num = Integer.parseInt(singleArgument(reader, line));
        String type = parts[0];
        if (parts.length != 2) {
          throw new ParseException("Illegal " + type + " declaration", reader.getLineNumber());
        }
        int num = Integer.parseInt(parts[1]);
        FST<CharsRef> res = parseConversions(reader, num);
-        if (type.equals("ICONV")) {
+        if (line.startsWith("I")) {
          iconv = res;
          needsInputCleaning |= iconv != null;
        } else {
          oconv = res;
          needsOutputCleaning |= oconv != null;
        }
-      } else if (line.startsWith(FULLSTRIP_KEY)) {
+      } else if ("FULLSTRIP".equals(firstWord)) {
        fullStrip = true;
-      } else if (line.startsWith(LANG_KEY)) {
+      } else if ("LANG".equals(firstWord)) {
-        language = line.substring(LANG_KEY.length()).trim();
+        language = singleArgument(reader, line);
        alternateCasing = "tr_TR".equals(language) || "az_AZ".equals(language);
-      } else if (line.startsWith(BREAK_KEY)) {
+      } else if ("BREAK".equals(firstWord)) {
        breaks = parseBreaks(reader, line);
-      } else if (line.startsWith(FORBIDDENWORD_KEY)) {
+      } else if ("FORBIDDENWORD".equals(firstWord)) {
-        String[] parts = line.split("\\s+");
+        forbiddenword = flagParsingStrategy.parseFlag(singleArgument(reader, line));
-        if (parts.length != 2) {
+      } else if ("COMPOUNDMIN".equals(firstWord)) {
-          throw new ParseException("Illegal FORBIDDENWORD declaration", reader.getLineNumber());
+        compoundMin = Math.max(1, Integer.parseInt(singleArgument(reader, line)));
-        }
+      } else if ("COMPOUNDRULE".equals(firstWord)) {
-        forbiddenword = flagParsingStrategy.parseFlag(parts[1]);
+        compoundRules = parseCompoundRules(reader, Integer.parseInt(singleArgument(reader, line)));
      } else if (line.startsWith(COMPOUNDMIN_KEY)) {
        String[] parts = line.split("\\s+");
        if (parts.length != 2) {
          throw new ParseException("Illegal COMPOUNDMIN declaration", reader.getLineNumber());
        }
        compoundMin = Math.max(1, Integer.parseInt(parts[1]));
      } else if (line.startsWith(COMPOUNDRULE_KEY)) {
        String[] parts = line.split("\\s+");
        if (parts.length != 2) {
          throw new ParseException("Illegal COMPOUNDRULE header", reader.getLineNumber());
        }
        this.compoundRules = parseCompoundRules(reader, Integer.parseInt(parts[1]));
      }
    }
@ -458,17 +400,25 @@ public class Dictionary {
    stripOffsets[currentIndex] = currentOffset;
  }
  /**
   * Returns the second whitespace-separated field of {@code line}, i.e. the single argument that
   * follows the first field.
   *
   * @param reader passed through to {@code splitBySpace}, which uses it for error reporting
   * @param line the line to take the argument from
   * @return the field at index 1 of the split line
   * @throws ParseException if {@code splitBySpace} rejects the line for not having exactly two
   *     fields (extra fields are accepted there only when they begin a {@code #} comment)
   */
  private String singleArgument(LineNumberReader reader, String line) throws ParseException {
    final String[] fields = splitBySpace(reader, line, 2);
    return fields[1];
  }
  /**
   * Splits a line on runs of whitespace and verifies the number of resulting fields.
   *
   * <p>Fields beyond {@code expectedParts} are tolerated only when the first extra field starts
   * with {@code #}, meaning the remainder of the line is a trailing comment.
   *
   * @param reader only used to obtain the current line number for error reporting
   * @param line the line to split; {@code null} (end of input) is rejected with a {@link
   *     ParseException}
   * @param expectedParts required number of whitespace-separated fields, counting the first one
   * @return every field of the line, including any trailing comment fields
   * @throws ParseException if the line is {@code null}, has fewer fields than expected, or has
   *     extra fields that do not start a comment
   */
  private String[] splitBySpace(LineNumberReader reader, String line, int expectedParts)
      throws ParseException {
    if (line == null) {
      // Callers pass reader.readLine() directly, which yields null at end of input; report a
      // parse error rather than letting a NullPointerException escape.
      throw new ParseException("Invalid syntax: unexpected end of file", reader.getLineNumber());
    }
    String[] parts = line.split("\\s+");
    boolean tooFewParts = parts.length < expectedParts;
    boolean extraNonComment =
        parts.length > expectedParts && !parts[expectedParts].startsWith("#");
    if (tooFewParts || extraNonComment) {
      throw new ParseException("Invalid syntax: " + line, reader.getLineNumber());
    }
    return parts;
  }
  private List<CompoundRule> parseCompoundRules(LineNumberReader reader, int num)
      throws IOException, ParseException {
    String line;
    List<CompoundRule> compoundRules = new ArrayList<>();
    for (int i = 0; i < num; i++) {
-      line = reader.readLine();
+      compoundRules.add(new CompoundRule(singleArgument(reader, reader.readLine()), this));
      String[] parts = line.split("\\s+");
      if (!line.startsWith(COMPOUNDRULE_KEY) || parts.length != 2) {
        throw new ParseException("COMPOUNDRULE rule expected", reader.getLineNumber());
      }
      compoundRules.add(new CompoundRule(parts[1], this));
    }
    return compoundRules;
  }
@ -478,14 +428,9 @@ public class Dictionary {
    Set<String> starting = new LinkedHashSet<>();
    Set<String> ending = new LinkedHashSet<>();
    Set<String> middle = new LinkedHashSet<>();
-    int num = Integer.parseInt(line.substring(BREAK_KEY.length()).trim());
+    int num = Integer.parseInt(singleArgument(reader, line));
    for (int i = 0; i < num; i++) {
-      line = reader.readLine();
+      String breakStr = singleArgument(reader, reader.readLine());
      String[] parts = line.split("\\s+");
      if (!line.startsWith(BREAK_KEY) || parts.length != 2) {
        throw new ParseException("BREAK chars expected", reader.getLineNumber());
      }
      String breakStr = parts[1];
      if (breakStr.startsWith("^")) {
        starting.add(breakStr.substring(1));
      } else if (breakStr.endsWith("$")) {
@ -689,11 +634,7 @@ public class Dictionary {
    Map<String, String> mappings = new TreeMap<>();
    for (int i = 0; i < num; i++) {
-      String line = reader.readLine();
+      String[] parts = splitBySpace(reader, reader.readLine(), 3);
      String[] parts = line.split("\\s+");
      if (parts.length != 3) {
        throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
      }
      if (mappings.put(parts[1], parts[2]) != null) {
        throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
      }
@ -789,11 +730,11 @@ public class Dictionary {
    }
    String flagType = parts[1];
-    if (NUM_FLAG_TYPE.equals(flagType)) {
+    if ("num".equals(flagType)) {
      return new NumFlagParsingStrategy();
-    } else if (UTF8_FLAG_TYPE.equals(flagType)) {
+    } else if ("UTF-8".equals(flagType)) {
      return new SimpleFlagParsingStrategy();
-    } else if (LONG_FLAG_TYPE.equals(flagType)) {
+    } else if ("long".equals(flagType)) {
      return new DoubleASCIIFlagParsingStrategy();
    }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/conv.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/conv.aff
@ -6,9 +6,10 @@ ICONV B b
 ICONV C c
 ICONV I i
-OCONV 4
+# Testing also whitespace and comments.
-OCONV a A
+OCONV 4 # space, space
-OCONV b B
+OCONV	a A # tab, space, space
 OCONV	b	B # tab, tab, space
 OCONV c C
 OCONV i I