mirror of https://github.com/apache/lucene.git
LUCENE-4079: Added support for aliasing (AF rules) to Hunspell
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1344095 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
54be02eef2
commit
b4fe59532e
|
@ -883,6 +883,9 @@ New features
|
||||||
* LUCENE-4043: Added scoring support via score mode for query time joining.
|
* LUCENE-4043: Added scoring support via score mode for query time joining.
|
||||||
(Martijn van Groningen, Mike McCandless)
|
(Martijn van Groningen, Mike McCandless)
|
||||||
|
|
||||||
|
* LUCENE-4079: Added support for aliasing (AF rules) in Hunspell dictionaries
|
||||||
|
(Ludovic Boutros via Chris Male)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
* LUCENE-2588: Don't store unnecessary suffixes when writing the terms
|
* LUCENE-2588: Don't store unnecessary suffixes when writing the terms
|
||||||
|
|
|
@ -37,6 +37,7 @@ public class HunspellDictionary {
|
||||||
|
|
||||||
static final HunspellWord NOFLAGS = new HunspellWord();
|
static final HunspellWord NOFLAGS = new HunspellWord();
|
||||||
|
|
||||||
|
private static final String ALIAS_KEY = "AF";
|
||||||
private static final String PREFIX_KEY = "PFX";
|
private static final String PREFIX_KEY = "PFX";
|
||||||
private static final String SUFFIX_KEY = "SFX";
|
private static final String SUFFIX_KEY = "SFX";
|
||||||
private static final String FLAG_KEY = "FLAG";
|
private static final String FLAG_KEY = "FLAG";
|
||||||
|
@ -59,6 +60,9 @@ public class HunspellDictionary {
|
||||||
|
|
||||||
private final Version version;
|
private final Version version;
|
||||||
|
|
||||||
|
private String[] aliases;
|
||||||
|
private int aliasCount = 0;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
|
* Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
|
||||||
* and dictionary files
|
* and dictionary files
|
||||||
|
@ -161,7 +165,9 @@ public class HunspellDictionary {
|
||||||
BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
|
BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
|
||||||
String line = null;
|
String line = null;
|
||||||
while ((line = reader.readLine()) != null) {
|
while ((line = reader.readLine()) != null) {
|
||||||
if (line.startsWith(PREFIX_KEY)) {
|
if (line.startsWith(ALIAS_KEY)) {
|
||||||
|
parseAlias(line);
|
||||||
|
} else if (line.startsWith(PREFIX_KEY)) {
|
||||||
parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
|
parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
|
||||||
} else if (line.startsWith(SUFFIX_KEY)) {
|
} else if (line.startsWith(SUFFIX_KEY)) {
|
||||||
parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
|
parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
|
||||||
|
@ -206,7 +212,13 @@ public class HunspellDictionary {
|
||||||
|
|
||||||
int flagSep = affixArg.lastIndexOf('/');
|
int flagSep = affixArg.lastIndexOf('/');
|
||||||
if (flagSep != -1) {
|
if (flagSep != -1) {
|
||||||
char appendFlags[] = flagParsingStrategy.parseFlags(affixArg.substring(flagSep + 1));
|
String flagPart = affixArg.substring(flagSep + 1);
|
||||||
|
|
||||||
|
if (aliasCount > 0) {
|
||||||
|
flagPart = getAliasValue(Integer.parseInt(flagPart));
|
||||||
|
}
|
||||||
|
|
||||||
|
char appendFlags[] = flagParsingStrategy.parseFlags(flagPart);
|
||||||
Arrays.sort(appendFlags);
|
Arrays.sort(appendFlags);
|
||||||
affix.setAppendFlags(appendFlags);
|
affix.setAppendFlags(appendFlags);
|
||||||
affix.setAppend(affixArg.substring(0, flagSep));
|
affix.setAppend(affixArg.substring(0, flagSep));
|
||||||
|
@ -330,8 +342,12 @@ public class HunspellDictionary {
|
||||||
if (end == -1)
|
if (end == -1)
|
||||||
end = line.length();
|
end = line.length();
|
||||||
|
|
||||||
|
String flagPart = line.substring(flagSep + 1, end);
|
||||||
|
if (aliasCount > 0) {
|
||||||
|
flagPart = getAliasValue(Integer.parseInt(flagPart));
|
||||||
|
}
|
||||||
|
|
||||||
wordForm = new HunspellWord(flagParsingStrategy.parseFlags(line.substring(flagSep + 1, end)));
|
wordForm = new HunspellWord(flagParsingStrategy.parseFlags(flagPart));
|
||||||
Arrays.sort(wordForm.getFlags());
|
Arrays.sort(wordForm.getFlags());
|
||||||
entry = line.substring(0, flagSep);
|
entry = line.substring(0, flagSep);
|
||||||
if(ignoreCase) {
|
if(ignoreCase) {
|
||||||
|
@ -352,6 +368,25 @@ public class HunspellDictionary {
|
||||||
return version;
|
return version;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void parseAlias(String line) {
|
||||||
|
String ruleArgs[] = line.split("\\s+");
|
||||||
|
if (aliases == null) {
|
||||||
|
//first line should be the aliases count
|
||||||
|
final int count = Integer.parseInt(ruleArgs[1]);
|
||||||
|
aliases = new String[count];
|
||||||
|
} else {
|
||||||
|
aliases[aliasCount++] = ruleArgs[1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String getAliasValue(int id) {
|
||||||
|
try {
|
||||||
|
return aliases[id - 1];
|
||||||
|
} catch (IndexOutOfBoundsException ex) {
|
||||||
|
throw new IllegalArgumentException("Bad flag alias number:" + id, ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Abstraction of the process of parsing flags taken from the affix and dic files
|
* Abstraction of the process of parsing flags taken from the affix and dic files
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -42,4 +42,18 @@ public class HunspellDictionaryTest extends LuceneTestCase {
|
||||||
affixStream.close();
|
affixStream.close();
|
||||||
dictStream.close();
|
dictStream.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCompressedHunspellDictionary_loadDicAff() throws IOException, ParseException {
|
||||||
|
InputStream affixStream = getClass().getResourceAsStream("testCompressed.aff");
|
||||||
|
InputStream dictStream = getClass().getResourceAsStream("testCompressed.dic");
|
||||||
|
|
||||||
|
HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
|
||||||
|
assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
|
||||||
|
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
|
||||||
|
assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
|
||||||
|
|
||||||
|
affixStream.close();
|
||||||
|
dictStream.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,29 @@
|
||||||
|
SET UTF-8
|
||||||
|
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
|
||||||
|
|
||||||
|
FLAG long
|
||||||
|
|
||||||
|
AF 5
|
||||||
|
AF AA
|
||||||
|
AF BB
|
||||||
|
AF CC
|
||||||
|
AF DD
|
||||||
|
AF EE
|
||||||
|
|
||||||
|
SFX AA Y 3
|
||||||
|
SFX AA 0 e n
|
||||||
|
SFX AA 0 e t
|
||||||
|
SFX AA 0 e h
|
||||||
|
|
||||||
|
SFX CC Y 2
|
||||||
|
SFX CC 0 d/3 c
|
||||||
|
SFX CC 0 c b
|
||||||
|
|
||||||
|
SFX DD Y 1
|
||||||
|
SFX DD 0 s o
|
||||||
|
|
||||||
|
SFX EE Y 1
|
||||||
|
SFX EE 0 d o
|
||||||
|
|
||||||
|
PFX BB Y 1
|
||||||
|
PFX BB 0 s o
|
|
@ -0,0 +1,9 @@
|
||||||
|
6
|
||||||
|
lucen/1
|
||||||
|
lucene
|
||||||
|
mahout/1
|
||||||
|
olr/2
|
||||||
|
ab/3
|
||||||
|
Apach/1
|
||||||
|
foo/4
|
||||||
|
Foo/5
|
Loading…
Reference in New Issue