From b4fe59532ee46235100c43be95836878c418fa04 Mon Sep 17 00:00:00 2001 From: Christopher John Male Date: Wed, 30 May 2012 04:39:42 +0000 Subject: [PATCH] LUCENE-4079: Added support for aliasing (AF rules) to Hunspell git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1344095 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 3 ++ .../analysis/hunspell/HunspellDictionary.java | 41 +++++++++++++++++-- .../hunspell/HunspellDictionaryTest.java | 14 +++++++ .../analysis/hunspell/testCompressed.aff | 29 +++++++++++++ .../analysis/hunspell/testCompressed.dic | 9 ++++ 5 files changed, 93 insertions(+), 3 deletions(-) create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.dic diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 8d8437d6fc2..404e7db5092 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -883,6 +883,9 @@ New features * LUCENE-4043: Added scoring support via score mode for query time joining. (Martijn van Groningen, Mike McCandless) +* LUCENE-4079: Added support for aliasing (AF rules) in Hunspell dictionaries + (Ludovic Boutros via Chris Male) + Optimizations * LUCENE-2588: Don't store unnecessary suffixes when writing the terms diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java index e66389bd274..f4a0eccfec7 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java @@ -37,6 +37,7 @@ public class HunspellDictionary { static final HunspellWord NOFLAGS = new HunspellWord(); + private static final String ALIAS_KEY = "AF"; private static final String PREFIX_KEY = "PFX"; private static final String SUFFIX_KEY = "SFX"; private static final String FLAG_KEY = "FLAG"; @@ -59,6 +60,9 @@ public class HunspellDictionary { private final Version version; + private String[] aliases; + private int aliasCount = 0; + /** * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix * and dictionary files @@ -161,7 +165,9 @@ public class HunspellDictionary { BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder)); String line = null; while ((line = reader.readLine()) != null) { - if (line.startsWith(PREFIX_KEY)) { + if (line.startsWith(ALIAS_KEY)) { + parseAlias(line); + } else if (line.startsWith(PREFIX_KEY)) { parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN); } else if (line.startsWith(SUFFIX_KEY)) { parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN); @@ -206,7 +212,13 @@ public class HunspellDictionary { int flagSep = affixArg.lastIndexOf('/'); if (flagSep != -1) { - char appendFlags[] = flagParsingStrategy.parseFlags(affixArg.substring(flagSep + 1)); + String flagPart = affixArg.substring(flagSep + 1); + + if (aliasCount > 0) { + flagPart = getAliasValue(Integer.parseInt(flagPart)); + } + + char appendFlags[] = flagParsingStrategy.parseFlags(flagPart); Arrays.sort(appendFlags); affix.setAppendFlags(appendFlags); affix.setAppend(affixArg.substring(0, flagSep)); @@ -330,8 +342,12 @@ public class HunspellDictionary { if (end == -1) end = line.length(); + String flagPart = line.substring(flagSep + 1, end); + if (aliasCount > 0) { + flagPart = getAliasValue(Integer.parseInt(flagPart)); + } - wordForm = new HunspellWord(flagParsingStrategy.parseFlags(line.substring(flagSep + 1, end))); + wordForm = new HunspellWord(flagParsingStrategy.parseFlags(flagPart)); Arrays.sort(wordForm.getFlags()); entry = line.substring(0, flagSep); if(ignoreCase) { @@ -352,6 +368,25 @@ public class HunspellDictionary { return version; } + private void parseAlias(String line) { + String ruleArgs[] = line.split("\\s+"); + if (aliases == null) { + //first line should be the aliases count + final int count = Integer.parseInt(ruleArgs[1]); + aliases = new String[count]; + } else { + aliases[aliasCount++] = ruleArgs[1]; + } + } + + private String getAliasValue(int id) { + try { + return aliases[id - 1]; + } catch (IndexOutOfBoundsException ex) { + throw new IllegalArgumentException("Bad flag alias number:" + id, ex); + } + } + /** * Abstraction of the process of parsing flags taken from the affix and dic files */ diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java index dbd9ff5b95f..687262a3d36 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java @@ -42,4 +42,18 @@ public class HunspellDictionaryTest extends LuceneTestCase { affixStream.close(); dictStream.close(); } + + @Test + public void testCompressedHunspellDictionary_loadDicAff() throws IOException, ParseException { + InputStream affixStream = getClass().getResourceAsStream("testCompressed.aff"); + InputStream dictStream = getClass().getResourceAsStream("testCompressed.dic"); + + HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT); + assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size()); + assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size()); + assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size()); + + affixStream.close(); + dictStream.close(); + } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.aff new file mode 100644 index 00000000000..e4a1b37300f --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.aff @@ -0,0 +1,29 @@ +SET UTF-8 +TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ + +FLAG long + +AF 5 +AF AA +AF BB +AF CC +AF DD +AF EE + +SFX AA Y 3 +SFX AA 0 e n +SFX AA 0 e t +SFX AA 0 e h + +SFX CC Y 2 +SFX CC 0 d/3 c +SFX CC 0 c b + +SFX DD Y 1 +SFX DD 0 s o + +SFX EE Y 1 +SFX EE 0 d o + +PFX BB Y 1 +PFX BB 0 s o diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.dic new file mode 100644 index 00000000000..bf237662017 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.dic @@ -0,0 +1,9 @@ +6 +lucen/1 +lucene +mahout/1 +olr/2 +ab/3 +Apach/1 +foo/4 +Foo/5