LUCENE-4019: Added support for handling rules with incorrect number of arguments

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1344987 13f79535-47bb-0310-9956-ffa450edef68
2012-06-01 05:13:12 +00:00 · 2012-06-01 05:13:12 +00:00 · 7d87c740ee
parent cf9686ed9a
commit 7d87c740ee
5 changed files with 106 additions and 10 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -887,6 +887,11 @@ New features
    generates suggestions by combining two or more terms and/or 
    breaking terms into multiple words.  See Javadocs for usage. (James Dyer)

+* LUCENE-4019: Added improved parsing of Hunspell Dictionaries so those
+  rules missing the required number of parameters either ignored or 
+  cause a ParseException (depending on whether strict parsing is enabled).
+  (Luca Cavanna via Chris Male) 
+
 Optimizations

 * LUCENE-2588: Don't store unnecessary suffixes when writing the terms
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
@ -50,6 +50,7 @@ public class HunspellDictionary {
  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";

  private static final boolean IGNORE_CASE_DEFAULT = false;
+  private static final boolean STRICT_AFFIX_PARSING_DEFAULT = true;

  private CharArrayMap<List<HunspellWord>> words;
  private CharArrayMap<List<HunspellAffix>> prefixes;
@ -104,11 +105,27 @@ public class HunspellDictionary {
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException {
+    this(affix, dictionaries, version, ignoreCase, STRICT_AFFIX_PARSING_DEFAULT);
+  }
+
+  /**
+   * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
+   * and dictionary files
+   *
+   * @param affix InputStream for reading the hunspell affix file
+   * @param dictionaries InputStreams for reading the hunspell dictionary file
+   * @param version Lucene Version
+   * @param ignoreCase If true, dictionary matching will be case insensitive
+   * @param strictAffixParsing Affix strict parsing enabled or not (an error while reading a rule causes exception or is ignored)
+   * @throws IOException Can be thrown while reading from the InputStreams
+   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
+   */
+  public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase, boolean strictAffixParsing) throws IOException, ParseException {
    this.version = version;
    this.ignoreCase = ignoreCase;
    String encoding = getDictionaryEncoding(affix);
    CharsetDecoder decoder = getJavaEncoding(encoding);
-    readAffixFile(affix, decoder);
+    readAffixFile(affix, decoder, strictAffixParsing);
    words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, this.ignoreCase);
    for (InputStream dictionary : dictionaries) {
      readDictionaryFile(dictionary, decoder);
@ -158,19 +175,19 @@ public class HunspellDictionary {
   * @param decoder CharsetDecoder to decode the content of the file
   * @throws IOException Can be thrown while reading from the InputStream
   */
-  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException {
+  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, boolean strict) throws IOException, ParseException {
    prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
    suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
-    
-    BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
+
+    LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
    String line = null;
    while ((line = reader.readLine()) != null) {
      if (line.startsWith(ALIAS_KEY)) {
        parseAlias(line);
      } else if (line.startsWith(PREFIX_KEY)) {
-        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
+        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, strict);
      } else if (line.startsWith(SUFFIX_KEY)) {
-        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
+        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, strict);
      } else if (line.startsWith(FLAG_KEY)) {
        // Assume that the FLAG line comes before any prefix or suffixes
        // Store the strategy so it can be used when parsing the dic file
@ -192,8 +209,9 @@ public class HunspellDictionary {
   */
  private void parseAffix(CharArrayMap<List<HunspellAffix>> affixes,
                          String header,
-                          BufferedReader reader,
-                          String conditionPattern) throws IOException {
+                          LineNumberReader reader,
+                          String conditionPattern,
+                          boolean strict) throws IOException, ParseException {
    String args[] = header.split("\\s+");

    boolean crossProduct = args[2].equals("Y");
@ -203,6 +221,13 @@ public class HunspellDictionary {
      String line = reader.readLine();
      String ruleArgs[] = line.split("\\s+");

+      if (ruleArgs.length < 5) {
+        if (strict) {
+          throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber());
+        }
+        continue;
+      }
+
      HunspellAffix affix = new HunspellAffix();
      
      affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1]));
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
@ -19,11 +19,13 @@ package org.apache.lucene.analysis.hunspell;

 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.Version;
+import org.junit.Assert;
 import org.junit.Test;

 import java.io.IOException;
 import java.io.InputStream;
 import java.text.ParseException;
+import java.util.Arrays;

 import static junit.framework.Assert.assertEquals;

@ -56,4 +58,31 @@ public class HunspellDictionaryTest extends LuceneTestCase {
    affixStream.close();
    dictStream.close();
  }
+
+  @Test
+  public void testHunspellDictionary_loadDicWrongAff() throws IOException, ParseException {
+    InputStream affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff");
+    InputStream dictStream = getClass().getResourceAsStream("test.dic");
+
+    HunspellDictionary dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT, false, false);
+    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
+    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
+    assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
+    //strict parsing disabled: malformed rule is not loaded
+    assertNull(dictionary.lookupPrefix(new char[]{'a'}, 0, 1));
+
+    affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff");
+    dictStream = getClass().getResourceAsStream("test.dic");
+    //strict parsing enabled: malformed rule causes ParseException
+    try {
+      dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT, false, true);
+      Assert.fail();
+    } catch(ParseException e) {
+      Assert.assertEquals("The affix file contains a rule with less than five elements", e.getMessage());
+      Assert.assertEquals(23, e.getErrorOffset());
+    }
+
+    affixStream.close();
+    dictStream.close();
+  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff
@ -0,0 +1,24 @@
+SET UTF-8
+TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
+
+SFX A Y 3
+SFX A   0     e         n
+SFX A   0     e         t
+SFX A   0     e         h
+
+SFX C Y 2
+SFX C   0     d/C       c
+SFX C   0     c         b
+
+SFX D Y 1
+SFX D   0     s         o
+
+SFX E Y 1
+SFX E   0     d         o
+
+PFX B Y 1
+PFX B   0     s         o
+
+#wrong rule (only 4 elements)
+PFX A0 Y 1
+PFX A0 0 a
--- a/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java
+++ b/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java
@ -40,7 +40,10 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
 * Both parameters dictionary and affix are mandatory.
 * <br/>
 * The parameter ignoreCase (true/false) controls whether matching is case sensitive or not. Default false.
- * <br/> 
+ * <br/>
+ * The parameter strictAffixParsing (true/false) controls whether the affix parsing is strict or not. Default true.
+ * If strict an error while reading an affix rule causes a ParseException, otherwise is ignored.
+ * <br/>
 * Dictionaries for many languages are available through the OpenOffice project.
 * 
 * See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
@ -50,6 +53,7 @@ public class HunspellStemFilterFactory extends TokenFilterFactory implements Res
  private static final String PARAM_DICTIONARY = "dictionary";
  private static final String PARAM_AFFIX = "affix";
  private static final String PARAM_IGNORE_CASE = "ignoreCase";
+  private static final String PARAM_STRICT_AFFIX_PARSING = "strictAffixParsing";
  private static final String TRUE = "true";
  private static final String FALSE = "false";
  
@ -72,12 +76,21 @@ public class HunspellStemFilterFactory extends TokenFilterFactory implements Res
      else throw new InitializationException("Unknown value for " + PARAM_IGNORE_CASE + ": " + pic + ". Must be true or false");
    }

+
+    String strictAffixParsingParam = args.get(PARAM_STRICT_AFFIX_PARSING);
+    boolean strictAffixParsing = true;
+    if(strictAffixParsingParam != null) {
+      if(strictAffixParsingParam.equalsIgnoreCase(FALSE)) strictAffixParsing = false;
+      else if(strictAffixParsingParam.equalsIgnoreCase(TRUE)) strictAffixParsing = true;
+      else throw new InitializationException("Unknown value for " + PARAM_STRICT_AFFIX_PARSING + ": " + strictAffixParsingParam + ". Must be true or false");
+    }
+
    try {
      List<InputStream> dictionaries = new ArrayList<InputStream>();
      for (String file : dictionaryFiles) {
        dictionaries.add(loader.openResource(file));
      }
-      this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion, ignoreCase);
+      this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion, ignoreCase, strictAffixParsing);
    } catch (Exception e) {
      throw new InitializationException("Unable to load hunspell data! [dictionary=" + args.get("dictionary") + ",affix=" + affixFile + "]", e);
    }