diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 2143d7545e9..79280017b68 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -887,6 +887,11 @@ New features generates suggestions by combining two or more terms and/or breaking terms into multiple words. See Javadocs for usage. (James Dyer) +* LUCENE-4019: Added improved parsing of Hunspell Dictionaries so those + rules missing the required number of parameters either ignored or + cause a ParseException (depending on whether strict parsing is enabled). + (Luca Cavanna via Chris Male) + Optimizations * LUCENE-2588: Don't store unnecessary suffixes when writing the terms diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java index f4a0eccfec7..1fa1ca714e0 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java @@ -50,6 +50,7 @@ public class HunspellDictionary { private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s"; private static final boolean IGNORE_CASE_DEFAULT = false; + private static final boolean STRICT_AFFIX_PARSING_DEFAULT = true; private CharArrayMap> words; private CharArrayMap> prefixes; @@ -104,11 +105,27 @@ public class HunspellDictionary { * @throws ParseException Can be thrown if the content of the files does not meet expected formats */ public HunspellDictionary(InputStream affix, List dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException { + this(affix, dictionaries, version, ignoreCase, STRICT_AFFIX_PARSING_DEFAULT); + } + + /** + * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix + * and dictionary files + * + * @param affix InputStream for reading the hunspell affix file + * @param dictionaries InputStreams for reading the hunspell dictionary file + * @param version Lucene Version + * @param ignoreCase If true, dictionary matching will be case insensitive + * @param strictAffixParsing Affix strict parsing enabled or not (an error while reading a rule causes exception or is ignored) + * @throws IOException Can be thrown while reading from the InputStreams + * @throws ParseException Can be thrown if the content of the files does not meet expected formats + */ + public HunspellDictionary(InputStream affix, List dictionaries, Version version, boolean ignoreCase, boolean strictAffixParsing) throws IOException, ParseException { this.version = version; this.ignoreCase = ignoreCase; String encoding = getDictionaryEncoding(affix); CharsetDecoder decoder = getJavaEncoding(encoding); - readAffixFile(affix, decoder); + readAffixFile(affix, decoder, strictAffixParsing); words = new CharArrayMap>(version, 65535 /* guess */, this.ignoreCase); for (InputStream dictionary : dictionaries) { readDictionaryFile(dictionary, decoder); @@ -158,19 +175,19 @@ public class HunspellDictionary { * @param decoder CharsetDecoder to decode the content of the file * @throws IOException Can be thrown while reading from the InputStream */ - private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException { + private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, boolean strict) throws IOException, ParseException { prefixes = new CharArrayMap>(version, 8, ignoreCase); suffixes = new CharArrayMap>(version, 8, ignoreCase); - - BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder)); + + LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder)); String line = null; while ((line = reader.readLine()) != null) { if (line.startsWith(ALIAS_KEY)) { parseAlias(line); } else if (line.startsWith(PREFIX_KEY)) { - parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN); + parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, strict); } else if (line.startsWith(SUFFIX_KEY)) { - parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN); + parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, strict); } else if (line.startsWith(FLAG_KEY)) { // Assume that the FLAG line comes before any prefix or suffixes // Store the strategy so it can be used when parsing the dic file @@ -192,8 +209,9 @@ public class HunspellDictionary { */ private void parseAffix(CharArrayMap> affixes, String header, - BufferedReader reader, - String conditionPattern) throws IOException { + LineNumberReader reader, + String conditionPattern, + boolean strict) throws IOException, ParseException { String args[] = header.split("\\s+"); boolean crossProduct = args[2].equals("Y"); @@ -203,6 +221,13 @@ public class HunspellDictionary { String line = reader.readLine(); String ruleArgs[] = line.split("\\s+"); + if (ruleArgs.length < 5) { + if (strict) { + throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber()); + } + continue; + } + HunspellAffix affix = new HunspellAffix(); affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1])); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java index 687262a3d36..418b87ad5b4 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java @@ -19,11 +19,13 @@ package org.apache.lucene.analysis.hunspell; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.Version; +import org.junit.Assert; import org.junit.Test; import java.io.IOException; import java.io.InputStream; import java.text.ParseException; +import java.util.Arrays; import static junit.framework.Assert.assertEquals; @@ -56,4 +58,31 @@ public class HunspellDictionaryTest extends LuceneTestCase { affixStream.close(); dictStream.close(); } + + @Test + public void testHunspellDictionary_loadDicWrongAff() throws IOException, ParseException { + InputStream affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff"); + InputStream dictStream = getClass().getResourceAsStream("test.dic"); + + HunspellDictionary dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT, false, false); + assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size()); + assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size()); + assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size()); + //strict parsing disabled: malformed rule is not loaded + assertNull(dictionary.lookupPrefix(new char[]{'a'}, 0, 1)); + + affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff"); + dictStream = getClass().getResourceAsStream("test.dic"); + //strict parsing enabled: malformed rule causes ParseException + try { + dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT, false, true); + Assert.fail(); + } catch(ParseException e) { + Assert.assertEquals("The affix file contains a rule with less than five elements", e.getMessage()); + Assert.assertEquals(23, e.getErrorOffset()); + } + + affixStream.close(); + dictStream.close(); + } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff new file mode 100644 index 00000000000..3b780cd1d7b --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff @@ -0,0 +1,24 @@ +SET UTF-8 +TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ + +SFX A Y 3 +SFX A 0 e n +SFX A 0 e t +SFX A 0 e h + +SFX C Y 2 +SFX C 0 d/C c +SFX C 0 c b + +SFX D Y 1 +SFX D 0 s o + +SFX E Y 1 +SFX E 0 d o + +PFX B Y 1 +PFX B 0 s o + +#wrong rule (only 4 elements) +PFX A0 Y 1 +PFX A0 0 a \ No newline at end of file diff --git a/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java b/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java index 004c3c18b15..08220862018 100644 --- a/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java +++ b/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java @@ -40,7 +40,10 @@ import org.apache.lucene.analysis.util.TokenFilterFactory; * Both parameters dictionary and affix are mandatory. *
* The parameter ignoreCase (true/false) controls whether matching is case sensitive or not. Default false. - *
+ *
+ * The parameter strictAffixParsing (true/false) controls whether the affix parsing is strict or not. Default true. + * If strict an error while reading an affix rule causes a ParseException, otherwise is ignored. + *
* Dictionaries for many languages are available through the OpenOffice project. * * See http://wiki.apache.org/solr/Hunspell @@ -50,6 +53,7 @@ public class HunspellStemFilterFactory extends TokenFilterFactory implements Res private static final String PARAM_DICTIONARY = "dictionary"; private static final String PARAM_AFFIX = "affix"; private static final String PARAM_IGNORE_CASE = "ignoreCase"; + private static final String PARAM_STRICT_AFFIX_PARSING = "strictAffixParsing"; private static final String TRUE = "true"; private static final String FALSE = "false"; @@ -72,12 +76,21 @@ public class HunspellStemFilterFactory extends TokenFilterFactory implements Res else throw new InitializationException("Unknown value for " + PARAM_IGNORE_CASE + ": " + pic + ". Must be true or false"); } + + String strictAffixParsingParam = args.get(PARAM_STRICT_AFFIX_PARSING); + boolean strictAffixParsing = true; + if(strictAffixParsingParam != null) { + if(strictAffixParsingParam.equalsIgnoreCase(FALSE)) strictAffixParsing = false; + else if(strictAffixParsingParam.equalsIgnoreCase(TRUE)) strictAffixParsing = true; + else throw new InitializationException("Unknown value for " + PARAM_STRICT_AFFIX_PARSING + ": " + strictAffixParsingParam + ". Must be true or false"); + } + try { List dictionaries = new ArrayList(); for (String file : dictionaryFiles) { dictionaries.add(loader.openResource(file)); } - this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion, ignoreCase); + this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion, ignoreCase, strictAffixParsing); } catch (Exception e) { throw new InitializationException("Unable to load hunspell data! [dictionary=" + args.get("dictionary") + ",affix=" + affixFile + "]", e); }