LUCENE-9665: Hunspell: support default encoding (#2203, Peter Gromov via Dawid Weiss)

This commit is contained in:
Peter Gromov 2021-01-15 09:35:25 +01:00 committed by GitHub
parent f285f02c89
commit 90131a605a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 9 additions and 4 deletions

View File

@ -84,6 +84,8 @@ API Changes
Improvements
* LUCENE-9665: Hunspell: support default encoding (Peter Gromov)
* LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
(Dawid Weiss)

View File

@ -98,6 +98,7 @@ public class Dictionary {
// TODO: really for suffixes we should reverse the automaton and run them backwards
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
static final String DEFAULT_ENCODING = StandardCharsets.ISO_8859_1.name();
FST<IntsRef> prefixes;
FST<IntsRef> suffixes;
@ -642,10 +643,8 @@ public class Dictionary {
* @param affix InputStream for reading the affix file
* @return Encoding specified in the affix file, or {@link #DEFAULT_ENCODING} if none is specified
* @throws IOException Can be thrown while reading from the InputStream
* @throws ParseException Thrown if the first non-empty non-comment line read from the file does
* not adhere to the format {@code SET <encoding>}
*/
static String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
static String getDictionaryEncoding(InputStream affix) throws IOException {
final StringBuilder encoding = new StringBuilder();
for (; ; ) {
encoding.setLength(0);
@ -664,7 +663,7 @@ public class Dictionary {
// this test is done last because it is inefficient, but it allows lines containing only spaces:
encoding.toString().trim().length() == 0) {
if (ch < 0) {
throw new ParseException("Unexpected end of affix file.", 0);
return DEFAULT_ENCODING;
}
continue;
}
@ -673,6 +672,7 @@ public class Dictionary {
int last = matcher.end();
return encoding.substring(last).trim();
}
return DEFAULT_ENCODING;
}
}

View File

@ -267,6 +267,9 @@ public class TestDictionary extends LuceneTestCase {
"UTF-8",
Dictionary.getDictionaryEncoding(
new ByteArrayInputStream("\uFEFFSET\tUTF-8\r\n".getBytes(StandardCharsets.UTF_8))));
assertEquals(
Dictionary.DEFAULT_ENCODING,
Dictionary.getDictionaryEncoding(new ByteArrayInputStream(new byte[0])));
}
public void testFlagWithCrazyWhitespace() throws Exception {