LUCENE-4079: Added support for aliasing (AF rules) to Hunspell

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1344095 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Christopher John Male 2012-05-30 04:39:42 +00:00
parent 54be02eef2
commit b4fe59532e
5 changed files with 93 additions and 3 deletions

View File

@ -883,6 +883,9 @@ New features
* LUCENE-4043: Added scoring support via score mode for query time joining.
(Martijn van Groningen, Mike McCandless)
* LUCENE-4079: Added support for aliasing (AF rules) in Hunspell dictionaries
(Ludovic Boutros via Chris Male)
Optimizations
* LUCENE-2588: Don't store unnecessary suffixes when writing the terms

View File

@ -37,6 +37,7 @@ public class HunspellDictionary {
static final HunspellWord NOFLAGS = new HunspellWord();
private static final String ALIAS_KEY = "AF";
private static final String PREFIX_KEY = "PFX";
private static final String SUFFIX_KEY = "SFX";
private static final String FLAG_KEY = "FLAG";
@ -59,6 +60,9 @@ public class HunspellDictionary {
private final Version version;
private String[] aliases;
private int aliasCount = 0;
/**
* Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
* and dictionary files
@ -161,7 +165,9 @@ public class HunspellDictionary {
BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
String line = null;
while ((line = reader.readLine()) != null) {
if (line.startsWith(PREFIX_KEY)) {
if (line.startsWith(ALIAS_KEY)) {
parseAlias(line);
} else if (line.startsWith(PREFIX_KEY)) {
parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
} else if (line.startsWith(SUFFIX_KEY)) {
parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
@ -206,7 +212,13 @@ public class HunspellDictionary {
int flagSep = affixArg.lastIndexOf('/');
if (flagSep != -1) {
char appendFlags[] = flagParsingStrategy.parseFlags(affixArg.substring(flagSep + 1));
String flagPart = affixArg.substring(flagSep + 1);
if (aliasCount > 0) {
flagPart = getAliasValue(Integer.parseInt(flagPart));
}
char appendFlags[] = flagParsingStrategy.parseFlags(flagPart);
Arrays.sort(appendFlags);
affix.setAppendFlags(appendFlags);
affix.setAppend(affixArg.substring(0, flagSep));
@ -330,8 +342,12 @@ public class HunspellDictionary {
if (end == -1)
end = line.length();
String flagPart = line.substring(flagSep + 1, end);
if (aliasCount > 0) {
flagPart = getAliasValue(Integer.parseInt(flagPart));
}
wordForm = new HunspellWord(flagParsingStrategy.parseFlags(line.substring(flagSep + 1, end)));
wordForm = new HunspellWord(flagParsingStrategy.parseFlags(flagPart));
Arrays.sort(wordForm.getFlags());
entry = line.substring(0, flagSep);
if(ignoreCase) {
@ -352,6 +368,25 @@ public class HunspellDictionary {
return version;
}
/**
 * Consumes a single AF line from the affix file. The first AF line seen sizes the alias table
 * from its numeric argument; each later AF line stores its argument in the next free slot.
 *
 * @param line AF line read from the affix file
 */
private void parseAlias(String line) {
  final String[] tokens = line.split("\\s+");
  if (aliases != null) {
    // Table already sized: record this alias and advance the fill position.
    aliases[aliasCount++] = tokens[1];
    return;
  }
  // No table yet, so this is the leading AF line carrying the number of aliases.
  aliases = new String[Integer.parseInt(tokens[1])];
}
/**
 * Resolves a 1-based alias number to the flag string recorded for that alias.
 *
 * @param id 1-based alias number
 * @return the flag string stored for the alias
 * @throws IllegalArgumentException if {@code id} does not refer to an alias that has been read
 */
private String getAliasValue(int id) {
  // Validate up front instead of relying on a caught IndexOutOfBoundsException. Only the first
  // aliasCount slots have been filled; slots past that are still null even when the declared
  // table is larger, so they are rejected here rather than handed back as null.
  if (id < 1 || id > aliasCount) {
    throw new IllegalArgumentException("Bad flag alias number:" + id);
  }
  return aliases[id - 1];
}
/**
* Abstraction of the process of parsing flags taken from the affix and dic files
*/

View File

@ -42,4 +42,18 @@ public class HunspellDictionaryTest extends LuceneTestCase {
affixStream.close();
dictStream.close();
}
/**
 * Loads the "testCompressed" affix/dictionary resource pair and checks how many entries come
 * back from one suffix lookup, one prefix lookup and one word lookup.
 */
@Test
public void testCompressedHunspellDictionary_loadDicAff() throws IOException, ParseException {
  final InputStream affixes = getClass().getResourceAsStream("testCompressed.aff");
  final InputStream words = getClass().getResourceAsStream("testCompressed.dic");
  final HunspellDictionary compressed = new HunspellDictionary(affixes, words, TEST_VERSION_CURRENT);

  final char[] suffixKey = "e".toCharArray();
  final char[] prefixKey = "s".toCharArray();
  final char[] wordKey = "olr".toCharArray();

  assertEquals(3, compressed.lookupSuffix(suffixKey, 0, suffixKey.length).size());
  assertEquals(1, compressed.lookupPrefix(prefixKey, 0, prefixKey.length).size());
  assertEquals(1, compressed.lookupWord(wordKey, 0, wordKey.length).size());

  affixes.close();
  words.close();
}
}

View File

@ -0,0 +1,29 @@
SET UTF-8
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
FLAG long
AF 5
AF AA
AF BB
AF CC
AF DD
AF EE
SFX AA Y 3
SFX AA 0 e n
SFX AA 0 e t
SFX AA 0 e h
SFX CC Y 2
SFX CC 0 d/3 c
SFX CC 0 c b
SFX DD Y 1
SFX DD 0 s o
SFX EE Y 1
SFX EE 0 d o
PFX BB Y 1
PFX BB 0 s o

View File

@ -0,0 +1,9 @@
6
lucen/1
lucene
mahout/1
olr/2
ab/3
Apach/1
foo/4
Foo/5