mirror of https://github.com/apache/lucene.git
LUCENE-4079: Added support for aliasing (AF rules) to Hunspell
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1344095 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
54be02eef2
commit
b4fe59532e
|
@ -883,6 +883,9 @@ New features
|
|||
* LUCENE-4043: Added scoring support via score mode for query time joining.
|
||||
(Martijn van Groningen, Mike McCandless)
|
||||
|
||||
* LUCENE-4079: Added support for aliasing (AF rules) in Hunspell dictionaries
|
||||
(Ludovic Boutros via Chris Male)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-2588: Don't store unnecessary suffixes when writing the terms
|
||||
|
|
|
@ -37,6 +37,7 @@ public class HunspellDictionary {
|
|||
|
||||
static final HunspellWord NOFLAGS = new HunspellWord();
|
||||
|
||||
private static final String ALIAS_KEY = "AF";
|
||||
private static final String PREFIX_KEY = "PFX";
|
||||
private static final String SUFFIX_KEY = "SFX";
|
||||
private static final String FLAG_KEY = "FLAG";
|
||||
|
@ -59,6 +60,9 @@ public class HunspellDictionary {
|
|||
|
||||
private final Version version;
|
||||
|
||||
private String[] aliases;
|
||||
private int aliasCount = 0;
|
||||
|
||||
/**
|
||||
* Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
|
||||
* and dictionary files
|
||||
|
@ -161,7 +165,9 @@ public class HunspellDictionary {
|
|||
BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
|
||||
String line = null;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
if (line.startsWith(PREFIX_KEY)) {
|
||||
if (line.startsWith(ALIAS_KEY)) {
|
||||
parseAlias(line);
|
||||
} else if (line.startsWith(PREFIX_KEY)) {
|
||||
parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
|
||||
} else if (line.startsWith(SUFFIX_KEY)) {
|
||||
parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
|
||||
|
@ -206,7 +212,13 @@ public class HunspellDictionary {
|
|||
|
||||
int flagSep = affixArg.lastIndexOf('/');
|
||||
if (flagSep != -1) {
|
||||
char appendFlags[] = flagParsingStrategy.parseFlags(affixArg.substring(flagSep + 1));
|
||||
String flagPart = affixArg.substring(flagSep + 1);
|
||||
|
||||
if (aliasCount > 0) {
|
||||
flagPart = getAliasValue(Integer.parseInt(flagPart));
|
||||
}
|
||||
|
||||
char appendFlags[] = flagParsingStrategy.parseFlags(flagPart);
|
||||
Arrays.sort(appendFlags);
|
||||
affix.setAppendFlags(appendFlags);
|
||||
affix.setAppend(affixArg.substring(0, flagSep));
|
||||
|
@ -330,8 +342,12 @@ public class HunspellDictionary {
|
|||
if (end == -1)
|
||||
end = line.length();
|
||||
|
||||
String flagPart = line.substring(flagSep + 1, end);
|
||||
if (aliasCount > 0) {
|
||||
flagPart = getAliasValue(Integer.parseInt(flagPart));
|
||||
}
|
||||
|
||||
wordForm = new HunspellWord(flagParsingStrategy.parseFlags(line.substring(flagSep + 1, end)));
|
||||
wordForm = new HunspellWord(flagParsingStrategy.parseFlags(flagPart));
|
||||
Arrays.sort(wordForm.getFlags());
|
||||
entry = line.substring(0, flagSep);
|
||||
if(ignoreCase) {
|
||||
|
@ -352,6 +368,25 @@ public class HunspellDictionary {
|
|||
return version;
|
||||
}
|
||||
|
||||
private void parseAlias(String line) {
|
||||
String ruleArgs[] = line.split("\\s+");
|
||||
if (aliases == null) {
|
||||
//first line should be the aliases count
|
||||
final int count = Integer.parseInt(ruleArgs[1]);
|
||||
aliases = new String[count];
|
||||
} else {
|
||||
aliases[aliasCount++] = ruleArgs[1];
|
||||
}
|
||||
}
|
||||
|
||||
private String getAliasValue(int id) {
|
||||
try {
|
||||
return aliases[id - 1];
|
||||
} catch (IndexOutOfBoundsException ex) {
|
||||
throw new IllegalArgumentException("Bad flag alias number:" + id, ex);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Abstraction of the process of parsing flags taken from the affix and dic files
|
||||
*/
|
||||
|
|
|
@ -42,4 +42,18 @@ public class HunspellDictionaryTest extends LuceneTestCase {
|
|||
affixStream.close();
|
||||
dictStream.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCompressedHunspellDictionary_loadDicAff() throws IOException, ParseException {
|
||||
InputStream affixStream = getClass().getResourceAsStream("testCompressed.aff");
|
||||
InputStream dictStream = getClass().getResourceAsStream("testCompressed.dic");
|
||||
|
||||
HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
|
||||
assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
|
||||
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
|
||||
assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
|
||||
|
||||
affixStream.close();
|
||||
dictStream.close();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
SET UTF-8
|
||||
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
|
||||
|
||||
FLAG long
|
||||
|
||||
AF 5
|
||||
AF AA
|
||||
AF BB
|
||||
AF CC
|
||||
AF DD
|
||||
AF EE
|
||||
|
||||
SFX AA Y 3
|
||||
SFX AA 0 e n
|
||||
SFX AA 0 e t
|
||||
SFX AA 0 e h
|
||||
|
||||
SFX CC Y 2
|
||||
SFX CC 0 d/3 c
|
||||
SFX CC 0 c b
|
||||
|
||||
SFX DD Y 1
|
||||
SFX DD 0 s o
|
||||
|
||||
SFX EE Y 1
|
||||
SFX EE 0 d o
|
||||
|
||||
PFX BB Y 1
|
||||
PFX BB 0 s o
|
|
@ -0,0 +1,9 @@
|
|||
6
|
||||
lucen/1
|
||||
lucene
|
||||
mahout/1
|
||||
olr/2
|
||||
ab/3
|
||||
Apach/1
|
||||
foo/4
|
||||
Foo/5
|
Loading…
Reference in New Issue