LUCENE-4019: Added support for handling rules with incorrect number of arguments

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1344987 13f79535-47bb-0310-9956-ffa450edef68
2025-02-08 02:58:58 +00:00 · 2012-06-01 05:13:12 +00:00 · 2012-06-01 05:13:12 +00:00 · 7d87c740ee
commit 7d87c740ee
parent cf9686ed9a
5 changed files with 106 additions and 10 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -887,6 +887,11 @@ New features
    generates suggestions by combining two or more terms and/or 
    breaking terms into multiple words.  See Javadocs for usage. (James Dyer)
 * LUCENE-4019: Added improved parsing of Hunspell Dictionaries so those
  rules missing the required number of parameters are either ignored or 
  cause a ParseException (depending on whether strict parsing is enabled).
  (Luca Cavanna via Chris Male) 
 Optimizations
 * LUCENE-2588: Don't store unnecessary suffixes when writing the terms
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
@ -50,6 +50,7 @@ public class HunspellDictionary {
  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
  private static final boolean IGNORE_CASE_DEFAULT = false;
  private static final boolean STRICT_AFFIX_PARSING_DEFAULT = true;
  private CharArrayMap<List<HunspellWord>> words;
  private CharArrayMap<List<HunspellAffix>> prefixes;
@ -104,11 +105,27 @@ public class HunspellDictionary {
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException {
    // Delegate to the five-argument constructor, passing STRICT_AFFIX_PARSING_DEFAULT
    // so callers of this overload get the default affix-parsing strictness.
    this(affix, dictionaries, version, ignoreCase, STRICT_AFFIX_PARSING_DEFAULT);
  }
  /**
   * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files
   *
   * @param affix InputStream for reading the hunspell affix file
   * @param dictionaries InputStreams for reading the hunspell dictionary file
   * @param version Lucene Version
   * @param ignoreCase If true, dictionary matching will be case insensitive
   * @param strictAffixParsing If true, a malformed affix rule causes a ParseException; if false, the malformed rule is skipped
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase, boolean strictAffixParsing) throws IOException, ParseException {
    this.version = version;
    this.ignoreCase = ignoreCase;
    String encoding = getDictionaryEncoding(affix);
    CharsetDecoder decoder = getJavaEncoding(encoding);
-    readAffixFile(affix, decoder);
+    readAffixFile(affix, decoder, strictAffixParsing);
    words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, this.ignoreCase);
    for (InputStream dictionary : dictionaries) {
      readDictionaryFile(dictionary, decoder);
@ -158,19 +175,19 @@ public class HunspellDictionary {
   * @param decoder CharsetDecoder to decode the content of the file
   * @throws IOException Can be thrown while reading from the InputStream
   */
-  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException {
+  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, boolean strict) throws IOException, ParseException {
    prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
    suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
-    
+
-    BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
+    LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
    String line = null;
    while ((line = reader.readLine()) != null) {
      if (line.startsWith(ALIAS_KEY)) {
        parseAlias(line);
      } else if (line.startsWith(PREFIX_KEY)) {
-        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
+        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, strict);
      } else if (line.startsWith(SUFFIX_KEY)) {
-        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
+        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, strict);
      } else if (line.startsWith(FLAG_KEY)) {
        // Assume that the FLAG line comes before any prefix or suffixes
        // Store the strategy so it can be used when parsing the dic file
@ -192,8 +209,9 @@ public class HunspellDictionary {
   */
  private void parseAffix(CharArrayMap<List<HunspellAffix>> affixes,
                          String header,
-                          BufferedReader reader,
+                          LineNumberReader reader,
-                          String conditionPattern) throws IOException {
+                          String conditionPattern,
                          boolean strict) throws IOException, ParseException {
    String args[] = header.split("\\s+");
    boolean crossProduct = args[2].equals("Y");
@ -203,6 +221,13 @@ public class HunspellDictionary {
      String line = reader.readLine();
      String ruleArgs[] = line.split("\\s+");
      if (ruleArgs.length < 5) {
        if (strict) {
          throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber());
        }
        continue;
      }
      HunspellAffix affix = new HunspellAffix();
      affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1]));
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
@ -19,11 +19,13 @@ package org.apache.lucene.analysis.hunspell;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.Version;
 import org.junit.Assert;
 import org.junit.Test;
 import java.io.IOException;
 import java.io.InputStream;
 import java.text.ParseException;
 import java.util.Arrays;
 import static junit.framework.Assert.assertEquals;
@ -56,4 +58,31 @@ public class HunspellDictionaryTest extends LuceneTestCase {
    affixStream.close();
    dictStream.close();
  }
  /**
   * Loads an affix file containing a rule with too few elements and checks both parsing modes:
   * with strict parsing disabled the malformed rule is skipped while the well-formed rules are
   * still loaded; with strict parsing enabled the constructor throws a ParseException.
   *
   * Each pair of streams is closed in a finally block so that they are released even when an
   * assertion fails or the constructor throws.
   */
  @Test
  public void testHunspellDictionary_loadDicWrongAff() throws IOException, ParseException {
    InputStream affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff");
    InputStream dictStream = getClass().getResourceAsStream("test.dic");

    try {
      HunspellDictionary dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT, false, false);
      assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
      assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
      assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
      //strict parsing disabled: malformed rule is not loaded
      assertNull(dictionary.lookupPrefix(new char[]{'a'}, 0, 1));
    } finally {
      // Release the first pair of streams before the variables are reused below
      affixStream.close();
      dictStream.close();
    }

    affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff");
    dictStream = getClass().getResourceAsStream("test.dic");

    //strict parsing enabled: malformed rule causes ParseException
    try {
      new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT, false, true);
      Assert.fail("Expected a ParseException for the malformed affix rule when strict parsing is enabled");
    } catch(ParseException e) {
      Assert.assertEquals("The affix file contains a rule with less than five elements", e.getMessage());
      // The expected offset is the position the parser reports for the malformed rule in the fixture
      Assert.assertEquals(23, e.getErrorOffset());
    } finally {
      affixStream.close();
      dictStream.close();
    }
  }
 }
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff
@ -0,0 +1,24 @@
 SET UTF-8
 TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
 SFX A Y 3
 SFX A   0     e         n
 SFX A   0     e         t
 SFX A   0     e         h
 SFX C Y 2
 SFX C   0     d/C       c
 SFX C   0     c         b
 SFX D Y 1
 SFX D   0     s         o
 SFX E Y 1
 SFX E   0     d         o
 PFX B Y 1
 PFX B   0     s         o
 #wrong rule (only 4 elements)
 PFX A0 Y 1
 PFX A0 0 a
--- a/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java
+++ b/solr/core/src/java/org/apache/solr/analysis/HunspellStemFilterFactory.java
@ -40,7 +40,10 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
 * Both parameters dictionary and affix are mandatory.
 * <br/>
 * The parameter ignoreCase (true/false) controls whether matching is case sensitive or not. Default false.
- * <br/> 
+ * <br/>
 * The parameter strictAffixParsing (true/false) controls whether the affix parsing is strict or not. Default true.
 * If strict, an error while reading an affix rule causes a ParseException; otherwise the malformed rule is ignored.
 * <br/>
 * Dictionaries for many languages are available through the OpenOffice project.
 * 
 * See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
@ -50,6 +53,7 @@ public class HunspellStemFilterFactory extends TokenFilterFactory implements Res
  private static final String PARAM_DICTIONARY = "dictionary";
  private static final String PARAM_AFFIX = "affix";
  private static final String PARAM_IGNORE_CASE = "ignoreCase";
  private static final String PARAM_STRICT_AFFIX_PARSING = "strictAffixParsing";
  private static final String TRUE = "true";
  private static final String FALSE = "false";
@ -72,12 +76,21 @@ public class HunspellStemFilterFactory extends TokenFilterFactory implements Res
      else throw new InitializationException("Unknown value for " + PARAM_IGNORE_CASE + ": " + pic + ". Must be true or false");
    }
    String strictAffixParsingParam = args.get(PARAM_STRICT_AFFIX_PARSING);
    boolean strictAffixParsing = true;
    if(strictAffixParsingParam != null) {
      if(strictAffixParsingParam.equalsIgnoreCase(FALSE)) strictAffixParsing = false;
      else if(strictAffixParsingParam.equalsIgnoreCase(TRUE)) strictAffixParsing = true;
      else throw new InitializationException("Unknown value for " + PARAM_STRICT_AFFIX_PARSING + ": " + strictAffixParsingParam + ". Must be true or false");
    }
    try {
      List<InputStream> dictionaries = new ArrayList<InputStream>();
      for (String file : dictionaryFiles) {
        dictionaries.add(loader.openResource(file));
      }
-      this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion, ignoreCase);
+      this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion, ignoreCase, strictAffixParsing);
    } catch (Exception e) {
      throw new InitializationException("Unable to load hunspell data! [dictionary=" + args.get("dictionary") + ",affix=" + affixFile + "]", e);
    }