diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 58b6f220f32..10340b8acdb 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -17,7 +17,6 @@ package org.apache.lucene.analysis.hunspell; import java.io.BufferedInputStream; -import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; @@ -45,8 +44,6 @@ import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.TreeMap; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; @@ -84,6 +81,7 @@ public class Dictionary { private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*"; private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s"; static final Charset DEFAULT_CHARSET = StandardCharsets.ISO_8859_1; + CharsetDecoder decoder = replacingDecoder(DEFAULT_CHARSET); FST prefixes; FST suffixes; @@ -212,25 +210,21 @@ public class Dictionary { Path tempPath = getDefaultTempDir(); // TODO: make this configurable? Path aff = Files.createTempFile(tempPath, "affix", "aff"); - OutputStream out = new BufferedOutputStream(Files.newOutputStream(aff)); - InputStream aff1 = null; + + BufferedInputStream aff1 = null; InputStream aff2 = null; boolean success = false; try { - // copy contents of affix stream to temp file - final byte[] buffer = new byte[1024 * 8]; - int len; - while ((len = affix.read(buffer)) > 0) { - out.write(buffer, 0, len); + // Copy contents of the affix stream to a temp file. + try (OutputStream os = Files.newOutputStream(aff)) { + affix.transferTo(os); } - out.close(); - // pass 1: get encoding + // pass 1: get encoding & flag aff1 = new BufferedInputStream(Files.newInputStream(aff)); - String encoding = getDictionaryEncoding(aff1); + readConfig(aff1); // pass 2: parse affixes - CharsetDecoder decoder = getJavaEncoding(encoding); aff2 = new BufferedInputStream(Files.newInputStream(aff)); readAffixFile(aff2, decoder); @@ -242,7 +236,7 @@ public class Dictionary { morphAliases = null; // no longer needed success = true; } finally { - IOUtils.closeWhileHandlingException(out, aff1, aff2); + IOUtils.closeWhileHandlingException(aff1, aff2); if (success) { Files.delete(aff); } else { @@ -344,10 +338,6 @@ public class Dictionary { } else if ("SFX".equals(firstWord)) { parseAffix( suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips); - } else if ("FLAG".equals(firstWord)) { - // Assume that the FLAG line comes before any prefix or suffixes - // Store the strategy so it can be used when parsing the dic file - flagParsingStrategy = getFlagParsingStrategy(line, decoder.charset()); } else if (line.equals("COMPLEXPREFIXES")) { complexPrefixes = true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix @@ -696,46 +686,51 @@ public class Dictionary { return fstCompiler.compile(); } - /** pattern accepts optional BOM + SET + any whitespace */ - static final Pattern ENCODING_PATTERN = Pattern.compile("^(\u00EF\u00BB\u00BF)?SET\\s+"); + private static final byte[] BOM_UTF8 = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf}; + + /** Parses the encoding and flag format specified in the provided InputStream */ + private void readConfig(BufferedInputStream stream) throws IOException, ParseException { + // I assume we don't support other BOMs (utf16, etc.)? We trivially could, + // by adding maybeConsume() with a proper bom... but I don't see hunspell repo to have + // any such exotic examples. + Charset streamCharset; + if (maybeConsume(stream, BOM_UTF8)) { + streamCharset = StandardCharsets.UTF_8; + } else { + streamCharset = DEFAULT_CHARSET; + } + + // TODO: can these flags change throughout the file? If not then we can abort sooner. And + // then we wouldn't even need to create a temp file for the affix stream - a large enough + // leading buffer (BufferedInputStream) would be sufficient? + LineNumberReader reader = new LineNumberReader(new InputStreamReader(stream, streamCharset)); + String line; + while ((line = reader.readLine()) != null) { + String firstWord = line.split("\\s")[0]; + if ("SET".equals(firstWord)) { + decoder = getDecoder(singleArgument(reader, line)); + } else if ("FLAG".equals(firstWord)) { + flagParsingStrategy = getFlagParsingStrategy(line, decoder.charset()); + } + } + } /** - * Parses the encoding specified in the affix file readable through the provided InputStream + * Consume the provided byte sequence in full, if present. Otherwise leave the input stream + * intact. * - * @param affix InputStream for reading the affix file - * @return Encoding specified in the affix file - * @throws IOException Can be thrown while reading from the InputStream + * @return {@code true} if the sequence matched and has been consumed. */ - static String getDictionaryEncoding(InputStream affix) throws IOException { - final StringBuilder encoding = new StringBuilder(); - for (; ; ) { - encoding.setLength(0); - int ch; - while ((ch = affix.read()) >= 0) { - if (ch == '\n') { - break; - } - if (ch != '\r') { - encoding.append((char) ch); - } + private static boolean maybeConsume(BufferedInputStream stream, byte[] bytes) throws IOException { + stream.mark(bytes.length); + for (int i = 0; i < bytes.length; i++) { + int nextByte = stream.read(); + if (nextByte != (bytes[i] & 0xff)) { // covers EOF (-1) as well. + stream.reset(); + return false; } - if (encoding.length() == 0 - || encoding.charAt(0) == '#' - || - // this test only at the end as ineffective but would allow lines only containing spaces: - encoding.toString().trim().length() == 0) { - if (ch < 0) { - return DEFAULT_CHARSET.name(); - } - continue; - } - Matcher matcher = ENCODING_PATTERN.matcher(encoding); - if (matcher.find()) { - int last = matcher.end(); - return encoding.substring(last).trim(); - } - return DEFAULT_CHARSET.name(); } + return true; } static final Map CHARSET_ALIASES = @@ -748,7 +743,7 @@ public class Dictionary { * @param encoding Encoding to retrieve the CharsetDecoder for * @return CharSetDecoder for the given encoding */ - private CharsetDecoder getJavaEncoding(String encoding) { + private CharsetDecoder getDecoder(String encoding) { if ("ISO8859-14".equals(encoding)) { return new ISO8859_14Decoder(); } @@ -756,7 +751,10 @@ public class Dictionary { if (canon != null) { encoding = canon; } - Charset charset = Charset.forName(encoding); + return replacingDecoder(Charset.forName(encoding)); + } + + private static CharsetDecoder replacingDecoder(Charset charset) { return charset.newDecoder().onMalformedInput(CodingErrorAction.REPLACE); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java index faee6f06347..110f487c29b 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java @@ -24,6 +24,7 @@ import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.text.ParseException; import java.util.Random; +import org.apache.lucene.store.ByteBuffersDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; @@ -142,6 +143,20 @@ public class TestDictionary extends LuceneTestCase { tempDir.close(); } + public void testUsingFlagsBeforeFlagDirective() throws IOException, ParseException { + byte[] aff = "KEEPCASE 42\nFLAG num".getBytes(StandardCharsets.UTF_8); + byte[] dic = "1\nfoo/42".getBytes(StandardCharsets.UTF_8); + + Dictionary dictionary = + new Dictionary( + new ByteBuffersDirectory(), + "", + new ByteArrayInputStream(aff), + new ByteArrayInputStream(dic)); + + assertEquals(42, dictionary.keepcase); + } + // malformed flags causes ParseException public void testInvalidFlags() throws Exception { InputStream affixStream = getClass().getResourceAsStream("broken-flags.aff"); @@ -245,25 +260,21 @@ public class TestDictionary extends LuceneTestCase { } public void testSetWithCrazyWhitespaceAndBOMs() throws Exception { - assertEquals( - "UTF-8", - Dictionary.getDictionaryEncoding( - new ByteArrayInputStream("SET\tUTF-8\n".getBytes(StandardCharsets.UTF_8)))); - assertEquals( - "UTF-8", - Dictionary.getDictionaryEncoding( - new ByteArrayInputStream("SET\t UTF-8\n".getBytes(StandardCharsets.UTF_8)))); - assertEquals( - "UTF-8", - Dictionary.getDictionaryEncoding( - new ByteArrayInputStream("\uFEFFSET\tUTF-8\n".getBytes(StandardCharsets.UTF_8)))); - assertEquals( - "UTF-8", - Dictionary.getDictionaryEncoding( - new ByteArrayInputStream("\uFEFFSET\tUTF-8\r\n".getBytes(StandardCharsets.UTF_8)))); - assertEquals( - Dictionary.DEFAULT_CHARSET.name(), - Dictionary.getDictionaryEncoding(new ByteArrayInputStream(new byte[0]))); + assertEquals("UTF-8", getDictionaryEncoding("SET\tUTF-8\n")); + assertEquals("UTF-8", getDictionaryEncoding("SET\t UTF-8\n")); + assertEquals("UTF-8", getDictionaryEncoding("\uFEFFSET\tUTF-8\n")); + assertEquals("UTF-8", getDictionaryEncoding("\uFEFFSET\tUTF-8\r\n")); + assertEquals(Dictionary.DEFAULT_CHARSET.name(), getDictionaryEncoding("")); + } + + private static String getDictionaryEncoding(String affFile) throws IOException, ParseException { + Dictionary dictionary = + new Dictionary( + new ByteBuffersDirectory(), + "", + new ByteArrayInputStream(affFile.getBytes(StandardCharsets.UTF_8)), + new ByteArrayInputStream("1\nmock".getBytes(StandardCharsets.UTF_8))); + return dictionary.decoder.charset().name(); } public void testFlagWithCrazyWhitespace() {