LUCENE-9716: Hunspell: support flag usage before its format is even specified (#2277)

2021-02-02 21:25:56 +01:00 · 2021-02-02 21:25:56 +01:00 · 8f75933f3d
parent 47e3d06ce0
commit 8f75933f3d
2 changed files with 83 additions and 74 deletions
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@ -17,7 +17,6 @@
 package org.apache.lucene.analysis.hunspell;
 import java.io.BufferedInputStream;
 import java.io.BufferedOutputStream;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
@ -45,8 +44,6 @@ import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
@ -84,6 +81,7 @@ public class Dictionary {
  private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
  static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1;
  CharsetDecoder decoder = replacingDecoder(DEFAULT_CHARSET);
  FST<IntsRef> prefixes;
  FST<IntsRef> suffixes;
@ -212,25 +210,21 @@ public class Dictionary {
    Path tempPath = getDefaultTempDir(); // TODO: make this configurable?
    Path aff = Files.createTempFile(tempPath, "affix", "aff");
-    OutputStream out = new BufferedOutputStream(Files.newOutputStream(aff));
+
-    InputStream aff1 = null;
+    BufferedInputStream aff1 = null;
    InputStream aff2 = null;
    boolean success = false;
    try {
-      // copy contents of affix stream to temp file
+      // Copy contents of the affix stream to a temp file.
-      final byte[] buffer = new byte[1024 * 8];
+      try (OutputStream os = Files.newOutputStream(aff)) {
-      int len;
+        affix.transferTo(os);
      while ((len = affix.read(buffer)) > 0) {
        out.write(buffer, 0, len);
      }
      out.close();
-      // pass 1: get encoding
+      // pass 1: get encoding & flag
      aff1 = new BufferedInputStream(Files.newInputStream(aff));
-      String encoding = getDictionaryEncoding(aff1);
+      readConfig(aff1);
      // pass 2: parse affixes
      CharsetDecoder decoder = getJavaEncoding(encoding);
      aff2 = new BufferedInputStream(Files.newInputStream(aff));
      readAffixFile(aff2, decoder);
@ -242,7 +236,7 @@ public class Dictionary {
      morphAliases = null; // no longer needed
      success = true;
    } finally {
-      IOUtils.closeWhileHandlingException(out, aff1, aff2);
+      IOUtils.closeWhileHandlingException(aff1, aff2);
      if (success) {
        Files.delete(aff);
      } else {
@ -344,10 +338,6 @@ public class Dictionary {
      } else if ("SFX".equals(firstWord)) {
        parseAffix(
            suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips);
      } else if ("FLAG".equals(firstWord)) {
        // Assume that the FLAG line comes before any prefix or suffixes
        // Store the strategy so it can be used when parsing the dic file
        flagParsingStrategy = getFlagParsingStrategy(line, decoder.charset());
      } else if (line.equals("COMPLEXPREFIXES")) {
        complexPrefixes =
            true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
@ -696,46 +686,51 @@ public class Dictionary {
    return fstCompiler.compile();
  }
-  /** pattern accepts optional BOM + SET + any whitespace */
+  private static final byte[] BOM_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};
-  static final Pattern ENCODING_PATTERN = Pattern.compile("^(\u00EF\u00BB\u00BF)?SET\\s+");
+
  /**
   * Parses the encoding and flag format specified in the provided InputStream.
   *
   * <p>Every line of the stream is scanned: a {@code SET} line replaces {@code decoder} and a
   * {@code FLAG} line replaces {@code flagParsingStrategy}. All other lines are ignored here.
   *
   * @param stream affix data; a leading UTF-8 BOM, if present, is consumed before the lines are
   *     read
   * @throws IOException if reading from the stream fails
   * @throws ParseException propagated from the helpers that parse the SET and FLAG lines
   */
  private void readConfig(BufferedInputStream stream) throws IOException, ParseException {
    // I assume we don't support other BOMs (utf16, etc.)? We trivially could,
    // by adding maybeConsume() with a proper bom... but I don't see hunspell repo to have
    // any such exotic examples.
    Charset streamCharset;
    if (maybeConsume(stream, BOM_UTF8)) {
      streamCharset = StandardCharsets.UTF_8;
    } else {
      streamCharset = DEFAULT_CHARSET;
    }
    // TODO: can these flags change throughout the file? If not then we can abort sooner. And
    // then we wouldn't even need to create a temp file for the affix stream - a large enough
    // leading buffer (BufferedInputStream) would be sufficient?
    LineNumberReader reader = new LineNumberReader(new InputStreamReader(stream, streamCharset));
    String line;
    while ((line = reader.readLine()) != null) {
      // A positive limit keeps trailing empty strings, so a whitespace-only line yields "" as its
      // first word. Without the limit such a line splits into an empty array and indexing [0]
      // would throw ArrayIndexOutOfBoundsException.
      String firstWord = line.split("\\s", 2)[0];
      if ("SET".equals(firstWord)) {
        decoder = getDecoder(singleArgument(reader, line));
      } else if ("FLAG".equals(firstWord)) {
        // Uses the charset of the decoder currently in effect (the default one unless a SET line
        // has already been seen).
        flagParsingStrategy = getFlagParsingStrategy(line, decoder.charset());
      }
    }
  }
  /**
-   * Parses the encoding specified in the affix file readable through the provided InputStream
+   * Consume the provided byte sequence in full, if present. Otherwise leave the input stream
   * intact.
   *
-   * @param affix InputStream for reading the affix file
+   * @return {@code true} if the sequence matched and has been consumed.
   * @return Encoding specified in the affix file
   * @throws IOException Can be thrown while reading from the InputStream
   */
-  static String getDictionaryEncoding(InputStream affix) throws IOException {
+  private static boolean maybeConsume(BufferedInputStream stream, byte[] bytes) throws IOException {
-    final StringBuilder encoding = new StringBuilder();
+    stream.mark(bytes.length);
-    for (; ; ) {
+    for (int i = 0; i < bytes.length; i++) {
-      encoding.setLength(0);
+      int nextByte = stream.read();
-      int ch;
+      if (nextByte != (bytes[i] & 0xff)) { // covers EOF (-1) as well.
-      while ((ch = affix.read()) >= 0) {
+        stream.reset();
-        if (ch == '\n') {
+        return false;
          break;
        }
        if (ch != '\r') {
          encoding.append((char) ch);
        }
      }
      if (encoding.length() == 0
          || encoding.charAt(0) == '#'
          ||
          // this test only at the end as ineffective but would allow lines only containing spaces:
          encoding.toString().trim().length() == 0) {
        if (ch < 0) {
          return DEFAULT_CHARSET.name();
        }
        continue;
      }
      Matcher matcher = ENCODING_PATTERN.matcher(encoding);
      if (matcher.find()) {
        int last = matcher.end();
        return encoding.substring(last).trim();
      }
      return DEFAULT_CHARSET.name();
    }
    return true;
  }
  static final Map<String, String> CHARSET_ALIASES =
@ -748,7 +743,7 @@ public class Dictionary {
   * @param encoding Encoding to retrieve the CharsetDecoder for
   * @return CharSetDecoder for the given encoding
   */
-  private CharsetDecoder getJavaEncoding(String encoding) {
+  private CharsetDecoder getDecoder(String encoding) {
    if ("ISO8859-14".equals(encoding)) {
      return new ISO8859_14Decoder();
    }
@ -756,7 +751,10 @@ public class Dictionary {
    if (canon != null) {
      encoding = canon;
    }
-    Charset charset = Charset.forName(encoding);
+    return replacingDecoder(Charset.forName(encoding));
  }
  /**
   * Creates a new decoder for {@code charset} whose malformed-input action is {@link
   * CodingErrorAction#REPLACE}.
   */
  private static CharsetDecoder replacingDecoder(Charset charset) {
    CharsetDecoder lenient = charset.newDecoder();
    // onMalformedInput returns the same decoder instance, so configuring in place is equivalent.
    lenient.onMalformedInput(CodingErrorAction.REPLACE);
    return lenient;
  }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@ -24,6 +24,7 @@ import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.text.ParseException;
 import java.util.Random;
 import org.apache.lucene.store.ByteBuffersDirectory;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
@ -142,6 +143,20 @@ public class TestDictionary extends LuceneTestCase {
    tempDir.close();
  }
  /**
   * The affix data references flag {@code 42} on its KEEPCASE line before the {@code FLAG num}
   * line appears; the resulting dictionary is expected to report {@code keepcase == 42}.
   */
  public void testUsingFlagsBeforeFlagDirective() throws IOException, ParseException {
    ByteArrayInputStream affixStream =
        new ByteArrayInputStream("KEEPCASE 42\nFLAG num".getBytes(StandardCharsets.UTF_8));
    ByteArrayInputStream dictStream =
        new ByteArrayInputStream("1\nfoo/42".getBytes(StandardCharsets.UTF_8));

    Dictionary dictionary = new Dictionary(new ByteBuffersDirectory(), "", affixStream, dictStream);

    assertEquals(42, dictionary.keepcase);
  }
  // malformed flags causes ParseException
  public void testInvalidFlags() throws Exception {
    InputStream affixStream = getClass().getResourceAsStream("broken-flags.aff");
@ -245,25 +260,21 @@ public class TestDictionary extends LuceneTestCase {
  }
  public void testSetWithCrazyWhitespaceAndBOMs() throws Exception {
-    assertEquals(
+    assertEquals("UTF-8", getDictionaryEncoding("SET\tUTF-8\n"));
-        "UTF-8",
+    assertEquals("UTF-8", getDictionaryEncoding("SET\t UTF-8\n"));
-        Dictionary.getDictionaryEncoding(
+    assertEquals("UTF-8", getDictionaryEncoding("\uFEFFSET\tUTF-8\n"));
-            new ByteArrayInputStream("SET\tUTF-8\n".getBytes(StandardCharsets.UTF_8))));
+    assertEquals("UTF-8", getDictionaryEncoding("\uFEFFSET\tUTF-8\r\n"));
-    assertEquals(
+    assertEquals(Dictionary.DEFAULT_CHARSET.name(), getDictionaryEncoding(""));
-        "UTF-8",
+  }
-        Dictionary.getDictionaryEncoding(
+
-            new ByteArrayInputStream("SET\t UTF-8\n".getBytes(StandardCharsets.UTF_8))));
+  private static String getDictionaryEncoding(String affFile) throws IOException, ParseException {
-    assertEquals(
+    Dictionary dictionary =
-        "UTF-8",
+        new Dictionary(
-        Dictionary.getDictionaryEncoding(
+            new ByteBuffersDirectory(),
-            new ByteArrayInputStream("\uFEFFSET\tUTF-8\n".getBytes(StandardCharsets.UTF_8))));
+            "",
-    assertEquals(
+            new ByteArrayInputStream(affFile.getBytes(StandardCharsets.UTF_8)),
-        "UTF-8",
+            new ByteArrayInputStream("1\nmock".getBytes(StandardCharsets.UTF_8)));
-        Dictionary.getDictionaryEncoding(
+    return dictionary.decoder.charset().name();
            new ByteArrayInputStream("\uFEFFSET\tUTF-8\r\n".getBytes(StandardCharsets.UTF_8))));
    assertEquals(
        Dictionary.DEFAULT_CHARSET.name(),
        Dictionary.getDictionaryEncoding(new ByteArrayInputStream(new byte[0])));
  }
  public void testFlagWithCrazyWhitespace() {