LUCENE-9665: Hunspell: support default encoding (#2203, Peter Gromov via Dawid Weiss)

2021-01-15 09:35:25 +01:00 · 2021-01-15 09:35:25 +01:00 · 90131a605a
parent f285f02c89
commit 90131a605a
3 changed files with 9 additions and 4 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -84,6 +84,8 @@ API Changes

 Improvements

+* LUCENE-9665: Hunspell: support default encoding (Peter Gromov)
+
 * LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
  (Dawid Weiss)

--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -98,6 +98,7 @@ public class Dictionary {
  // TODO: really for suffixes we should reverse the automaton and run them backwards
  private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
+  static final String DEFAULT_ENCODING = StandardCharsets.ISO_8859_1.name();

  FST<IntsRef> prefixes;
  FST<IntsRef> suffixes;
@@ -642,10 +643,8 @@ public class Dictionary {
   * @param affix InputStream for reading the affix file
   * @return Encoding specified in the affix file
   * @throws IOException Can be thrown while reading from the InputStream
-   * @throws ParseException Thrown if the first non-empty non-comment line read from the file does
-   *     not adhere to the format {@code SET <encoding>}
   */
-  static String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
+  static String getDictionaryEncoding(InputStream affix) throws IOException {
    final StringBuilder encoding = new StringBuilder();
    for (; ; ) {
      encoding.setLength(0);
@@ -664,7 +663,7 @@ public class Dictionary {
          // this test only at the end as ineffective but would allow lines only containing spaces:
          encoding.toString().trim().length() == 0) {
        if (ch < 0) {
-          throw new ParseException("Unexpected end of affix file.", 0);
+          return DEFAULT_ENCODING;
        }
        continue;
      }
@@ -673,6 +672,7 @@ public class Dictionary {
        int last = matcher.end();
        return encoding.substring(last).trim();
      }
+      return DEFAULT_ENCODING;
    }
  }

--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@@ -267,6 +267,9 @@ public class TestDictionary extends LuceneTestCase {
        "UTF-8",
        Dictionary.getDictionaryEncoding(
            new ByteArrayInputStream("\uFEFFSET\tUTF-8\r\n".getBytes(StandardCharsets.UTF_8))));
+    assertEquals(
+        Dictionary.DEFAULT_ENCODING,
+        Dictionary.getDictionaryEncoding(new ByteArrayInputStream(new byte[0])));
  }

  public void testFlagWithCrazyWhitespace() throws Exception {