mirror of
https://github.com/apache/lucene.git
synced 2025-02-08 02:58:58 +00:00
LUCENE-4019: Added support for handling rules with incorrect number of arguments
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1344987 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
cf9686ed9a
commit
7d87c740ee
@ -887,6 +887,11 @@ New features
|
|||||||
generates suggestions by combining two or more terms and/or
|
generates suggestions by combining two or more terms and/or
|
||||||
breaking terms into multiple words. See Javadocs for usage. (James Dyer)
|
breaking terms into multiple words. See Javadocs for usage. (James Dyer)
|
||||||
|
|
||||||
|
* LUCENE-4019: Added improved parsing of Hunspell Dictionaries so those
|
||||||
|
rules missing the required number of parameters either ignored or
|
||||||
|
cause a ParseException (depending on whether strict parsing is enabled).
|
||||||
|
(Luca Cavanna via Chris Male)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
* LUCENE-2588: Don't store unnecessary suffixes when writing the terms
|
* LUCENE-2588: Don't store unnecessary suffixes when writing the terms
|
||||||
|
@ -50,6 +50,7 @@ public class HunspellDictionary {
|
|||||||
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
|
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
|
||||||
|
|
||||||
private static final boolean IGNORE_CASE_DEFAULT = false;
|
private static final boolean IGNORE_CASE_DEFAULT = false;
|
||||||
|
private static final boolean STRICT_AFFIX_PARSING_DEFAULT = true;
|
||||||
|
|
||||||
private CharArrayMap<List<HunspellWord>> words;
|
private CharArrayMap<List<HunspellWord>> words;
|
||||||
private CharArrayMap<List<HunspellAffix>> prefixes;
|
private CharArrayMap<List<HunspellAffix>> prefixes;
|
||||||
@ -104,11 +105,27 @@ public class HunspellDictionary {
|
|||||||
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
|
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
|
||||||
*/
|
*/
|
||||||
public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException {
|
public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException {
|
||||||
|
this(affix, dictionaries, version, ignoreCase, STRICT_AFFIX_PARSING_DEFAULT);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
|
||||||
|
* and dictionary files
|
||||||
|
*
|
||||||
|
* @param affix InputStream for reading the hunspell affix file
|
||||||
|
* @param dictionaries InputStreams for reading the hunspell dictionary file
|
||||||
|
* @param version Lucene Version
|
||||||
|
* @param ignoreCase If true, dictionary matching will be case insensitive
|
||||||
|
* @param strictAffixParsing Affix strict parsing enabled or not (an error while reading a rule causes exception or is ignored)
|
||||||
|
* @throws IOException Can be thrown while reading from the InputStreams
|
||||||
|
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
|
||||||
|
*/
|
||||||
|
public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase, boolean strictAffixParsing) throws IOException, ParseException {
|
||||||
this.version = version;
|
this.version = version;
|
||||||
this.ignoreCase = ignoreCase;
|
this.ignoreCase = ignoreCase;
|
||||||
String encoding = getDictionaryEncoding(affix);
|
String encoding = getDictionaryEncoding(affix);
|
||||||
CharsetDecoder decoder = getJavaEncoding(encoding);
|
CharsetDecoder decoder = getJavaEncoding(encoding);
|
||||||
readAffixFile(affix, decoder);
|
readAffixFile(affix, decoder, strictAffixParsing);
|
||||||
words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, this.ignoreCase);
|
words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, this.ignoreCase);
|
||||||
for (InputStream dictionary : dictionaries) {
|
for (InputStream dictionary : dictionaries) {
|
||||||
readDictionaryFile(dictionary, decoder);
|
readDictionaryFile(dictionary, decoder);
|
||||||
@ -158,19 +175,19 @@ public class HunspellDictionary {
|
|||||||
* @param decoder CharsetDecoder to decode the content of the file
|
* @param decoder CharsetDecoder to decode the content of the file
|
||||||
* @throws IOException Can be thrown while reading from the InputStream
|
* @throws IOException Can be thrown while reading from the InputStream
|
||||||
*/
|
*/
|
||||||
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException {
|
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, boolean strict) throws IOException, ParseException {
|
||||||
prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
|
prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
|
||||||
suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
|
suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
|
||||||
|
|
||||||
BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
|
LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
|
||||||
String line = null;
|
String line = null;
|
||||||
while ((line = reader.readLine()) != null) {
|
while ((line = reader.readLine()) != null) {
|
||||||
if (line.startsWith(ALIAS_KEY)) {
|
if (line.startsWith(ALIAS_KEY)) {
|
||||||
parseAlias(line);
|
parseAlias(line);
|
||||||
} else if (line.startsWith(PREFIX_KEY)) {
|
} else if (line.startsWith(PREFIX_KEY)) {
|
||||||
parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
|
parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, strict);
|
||||||
} else if (line.startsWith(SUFFIX_KEY)) {
|
} else if (line.startsWith(SUFFIX_KEY)) {
|
||||||
parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
|
parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, strict);
|
||||||
} else if (line.startsWith(FLAG_KEY)) {
|
} else if (line.startsWith(FLAG_KEY)) {
|
||||||
// Assume that the FLAG line comes before any prefix or suffixes
|
// Assume that the FLAG line comes before any prefix or suffixes
|
||||||
// Store the strategy so it can be used when parsing the dic file
|
// Store the strategy so it can be used when parsing the dic file
|
||||||
@ -192,8 +209,9 @@ public class HunspellDictionary {
|
|||||||
*/
|
*/
|
||||||
private void parseAffix(CharArrayMap<List<HunspellAffix>> affixes,
|
private void parseAffix(CharArrayMap<List<HunspellAffix>> affixes,
|
||||||
String header,
|
String header,
|
||||||
BufferedReader reader,
|
LineNumberReader reader,
|
||||||
String conditionPattern) throws IOException {
|
String conditionPattern,
|
||||||
|
boolean strict) throws IOException, ParseException {
|
||||||
String args[] = header.split("\\s+");
|
String args[] = header.split("\\s+");
|
||||||
|
|
||||||
boolean crossProduct = args[2].equals("Y");
|
boolean crossProduct = args[2].equals("Y");
|
||||||
@ -203,6 +221,13 @@ public class HunspellDictionary {
|
|||||||
String line = reader.readLine();
|
String line = reader.readLine();
|
||||||
String ruleArgs[] = line.split("\\s+");
|
String ruleArgs[] = line.split("\\s+");
|
||||||
|
|
||||||
|
if (ruleArgs.length < 5) {
|
||||||
|
if (strict) {
|
||||||
|
throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber());
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
HunspellAffix affix = new HunspellAffix();
|
HunspellAffix affix = new HunspellAffix();
|
||||||
|
|
||||||
affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1]));
|
affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1]));
|
||||||
|
@ -19,11 +19,13 @@ package org.apache.lucene.analysis.hunspell;
|
|||||||
|
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
import org.junit.Assert;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.text.ParseException;
|
import java.text.ParseException;
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
import static junit.framework.Assert.assertEquals;
|
import static junit.framework.Assert.assertEquals;
|
||||||
|
|
||||||
@ -56,4 +58,31 @@ public class HunspellDictionaryTest extends LuceneTestCase {
|
|||||||
affixStream.close();
|
affixStream.close();
|
||||||
dictStream.close();
|
dictStream.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testHunspellDictionary_loadDicWrongAff() throws IOException, ParseException {
|
||||||
|
InputStream affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff");
|
||||||
|
InputStream dictStream = getClass().getResourceAsStream("test.dic");
|
||||||
|
|
||||||
|
HunspellDictionary dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT, false, false);
|
||||||
|
assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
|
||||||
|
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
|
||||||
|
assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
|
||||||
|
//strict parsing disabled: malformed rule is not loaded
|
||||||
|
assertNull(dictionary.lookupPrefix(new char[]{'a'}, 0, 1));
|
||||||
|
|
||||||
|
affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff");
|
||||||
|
dictStream = getClass().getResourceAsStream("test.dic");
|
||||||
|
//strict parsing enabled: malformed rule causes ParseException
|
||||||
|
try {
|
||||||
|
dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT, false, true);
|
||||||
|
Assert.fail();
|
||||||
|
} catch(ParseException e) {
|
||||||
|
Assert.assertEquals("The affix file contains a rule with less than five elements", e.getMessage());
|
||||||
|
Assert.assertEquals(23, e.getErrorOffset());
|
||||||
|
}
|
||||||
|
|
||||||
|
affixStream.close();
|
||||||
|
dictStream.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,24 @@
|
|||||||
|
SET UTF-8
|
||||||
|
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
|
||||||
|
|
||||||
|
SFX A Y 3
|
||||||
|
SFX A 0 e n
|
||||||
|
SFX A 0 e t
|
||||||
|
SFX A 0 e h
|
||||||
|
|
||||||
|
SFX C Y 2
|
||||||
|
SFX C 0 d/C c
|
||||||
|
SFX C 0 c b
|
||||||
|
|
||||||
|
SFX D Y 1
|
||||||
|
SFX D 0 s o
|
||||||
|
|
||||||
|
SFX E Y 1
|
||||||
|
SFX E 0 d o
|
||||||
|
|
||||||
|
PFX B Y 1
|
||||||
|
PFX B 0 s o
|
||||||
|
|
||||||
|
#wrong rule (only 4 elements)
|
||||||
|
PFX A0 Y 1
|
||||||
|
PFX A0 0 a
|
@ -40,7 +40,10 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
|
|||||||
* Both parameters dictionary and affix are mandatory.
|
* Both parameters dictionary and affix are mandatory.
|
||||||
* <br/>
|
* <br/>
|
||||||
* The parameter ignoreCase (true/false) controls whether matching is case sensitive or not. Default false.
|
* The parameter ignoreCase (true/false) controls whether matching is case sensitive or not. Default false.
|
||||||
* <br/>
|
* <br/>
|
||||||
|
* The parameter strictAffixParsing (true/false) controls whether the affix parsing is strict or not. Default true.
|
||||||
|
* If strict an error while reading an affix rule causes a ParseException, otherwise is ignored.
|
||||||
|
* <br/>
|
||||||
* Dictionaries for many languages are available through the OpenOffice project.
|
* Dictionaries for many languages are available through the OpenOffice project.
|
||||||
*
|
*
|
||||||
* See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
|
* See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
|
||||||
@ -50,6 +53,7 @@ public class HunspellStemFilterFactory extends TokenFilterFactory implements Res
|
|||||||
private static final String PARAM_DICTIONARY = "dictionary";
|
private static final String PARAM_DICTIONARY = "dictionary";
|
||||||
private static final String PARAM_AFFIX = "affix";
|
private static final String PARAM_AFFIX = "affix";
|
||||||
private static final String PARAM_IGNORE_CASE = "ignoreCase";
|
private static final String PARAM_IGNORE_CASE = "ignoreCase";
|
||||||
|
private static final String PARAM_STRICT_AFFIX_PARSING = "strictAffixParsing";
|
||||||
private static final String TRUE = "true";
|
private static final String TRUE = "true";
|
||||||
private static final String FALSE = "false";
|
private static final String FALSE = "false";
|
||||||
|
|
||||||
@ -72,12 +76,21 @@ public class HunspellStemFilterFactory extends TokenFilterFactory implements Res
|
|||||||
else throw new InitializationException("Unknown value for " + PARAM_IGNORE_CASE + ": " + pic + ". Must be true or false");
|
else throw new InitializationException("Unknown value for " + PARAM_IGNORE_CASE + ": " + pic + ". Must be true or false");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
String strictAffixParsingParam = args.get(PARAM_STRICT_AFFIX_PARSING);
|
||||||
|
boolean strictAffixParsing = true;
|
||||||
|
if(strictAffixParsingParam != null) {
|
||||||
|
if(strictAffixParsingParam.equalsIgnoreCase(FALSE)) strictAffixParsing = false;
|
||||||
|
else if(strictAffixParsingParam.equalsIgnoreCase(TRUE)) strictAffixParsing = true;
|
||||||
|
else throw new InitializationException("Unknown value for " + PARAM_STRICT_AFFIX_PARSING + ": " + strictAffixParsingParam + ". Must be true or false");
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
List<InputStream> dictionaries = new ArrayList<InputStream>();
|
List<InputStream> dictionaries = new ArrayList<InputStream>();
|
||||||
for (String file : dictionaryFiles) {
|
for (String file : dictionaryFiles) {
|
||||||
dictionaries.add(loader.openResource(file));
|
dictionaries.add(loader.openResource(file));
|
||||||
}
|
}
|
||||||
this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion, ignoreCase);
|
this.dictionary = new HunspellDictionary(loader.openResource(affixFile), dictionaries, luceneMatchVersion, ignoreCase, strictAffixParsing);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new InitializationException("Unable to load hunspell data! [dictionary=" + args.get("dictionary") + ",affix=" + affixFile + "]", e);
|
throw new InitializationException("Unable to load hunspell data! [dictionary=" + args.get("dictionary") + ",affix=" + affixFile + "]", e);
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user