From 0088308d7cbab526a9b3a791b9753bb5678c5534 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Wed, 18 Dec 2024 09:39:49 -0500 Subject: [PATCH] hunspell: tolerate REP rule count mismatches (#14079) Similar to support for tolerating PFX/SFX count mismatches, add the ability to tolerate REP count mismatches. The issue arises in recent updates to the LibreOffice Mongolian dictionary and is currently failing all PRs that change the analyzers: https://bugs.documentfoundation.org/show_bug.cgi?id=164366 --- lucene/CHANGES.txt | 4 +++- .../lucene/analysis/hunspell/Dictionary.java | 24 +++++++++++++++---- .../hunspell/TestAllDictionaries.java | 5 ++++ .../analysis/hunspell/TestDictionary.java | 5 ++++ .../analysis/hunspell/forgivable-errors.aff | 4 ++-- 5 files changed, 35 insertions(+), 7 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 661cd041568..5401062e06c 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -44,7 +44,9 @@ New Features Improvements --------------------- -(No changes) + +* GITHUB#14079: Hunspell Dictionary now supports an option to tolerate REP rule count mismatches. 
+ (Robert Muir) Optimizations --------------------- diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 525e39dc389..fc5353560b5 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -405,10 +405,18 @@ public class Dictionary { } else if ("TRY".equals(firstWord)) { tryChars = firstArgument(reader, line); } else if ("REP".equals(firstWord)) { - int count = parseNum(reader, line); - for (int i = 0; i < count; i++) { - String[] parts = splitBySpace(reader, reader.readLine(), 3, Integer.MAX_VALUE); - repTable.add(new RepEntry(parts[1], parts[2])); + if (tolerateRepRuleCountMismatches()) { + String[] parts = splitBySpace(reader, line, 2, Integer.MAX_VALUE); + // ignore REP N, as actual N may be incorrect + if (parts.length >= 3) { + repTable.add(new RepEntry(parts[1], parts[2])); + } + } else { + int count = parseNum(reader, line); + for (int i = 0; i < count; i++) { + String[] parts = splitBySpace(reader, reader.readLine(), 3, Integer.MAX_VALUE); + repTable.add(new RepEntry(parts[1], parts[2])); + } } } else if ("MAP".equals(firstWord)) { int count = parseNum(reader, line); @@ -1168,6 +1176,14 @@ public class Dictionary { return false; } + /** + * Whether incorrect REP rule counts will be silently ignored. False by default: a {@link + * ParseException} will happen. + */ + protected boolean tolerateRepRuleCountMismatches() { + return false; + } + /** * Whether duplicate ICONV/OCONV lines should be silently ignored. False by default: an {@link * IllegalStateException} will happen. 
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java index 9b23a81b32b..cdede3092b1 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java @@ -77,6 +77,11 @@ public class TestAllDictionaries extends LuceneTestCase { protected boolean tolerateAffixRuleCountMismatches() { return true; } + + @Override + protected boolean tolerateRepRuleCountMismatches() { + return true; + } }; } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java index 28cfb1330ed..1aab0164121 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java @@ -207,6 +207,11 @@ public class TestDictionary extends LuceneTestCase { return true; } + @Override + protected boolean tolerateRepRuleCountMismatches() { + return true; + } + @Override protected boolean tolerateDuplicateConversionMappings() { return true; diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forgivable-errors.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forgivable-errors.aff index cb093496691..0c5d22c26bb 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forgivable-errors.aff +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/forgivable-errors.aff @@ -1,4 +1,4 @@ -REP 1 +REP 0 REP foo bar goo doo zoo COMPOUNDWORDMAX 2 y @@ -16,4 +16,4 @@ SFX A b c d ICONV 2 ICONV x y -ICONV x y \ No newline at end of file +ICONV x y