From 10f548d205e9443872c919f7af0ac1b01c735ed3 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 25 Feb 2014 19:18:09 +0000 Subject: [PATCH] LUCENE-5468: deduplicate patterns used by affix condition check git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571788 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/hunspell2/Affix.java | 16 ++--------- .../lucene/analysis/hunspell2/Dictionary.java | 27 ++++++++++++++++--- .../hunspell2/TestAllDictionaries.java | 10 ++++--- 3 files changed, 32 insertions(+), 21 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java index 41c3553fb77..443c006c97d 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java @@ -28,7 +28,6 @@ final class Affix { private char appendFlags[]; // continuation class flags private String strip; - private String condition; private Pattern conditionPattern; private char flag; @@ -99,24 +98,13 @@ final class Affix { this.strip = strip; } - /** - * Returns the condition that must be met before the affix can be applied - * - * @return Condition that must be met before the affix can be applied - */ - public String getCondition() { - return condition; - } - /** * Sets the condition that must be met before the affix can be applied * - * @param condition Condition to be met before affix application * @param pattern Condition as a regular expression pattern */ - public void setCondition(String condition, String pattern) { - this.condition = condition; - this.conditionPattern = Pattern.compile(pattern); + public void setCondition(Pattern pattern) { + this.conditionPattern = pattern; } /** diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java 
b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java index 10baa403413..0456d9946d3 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java @@ -44,6 +44,7 @@ import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.regex.Pattern; /** * In-memory structure for the dictionary (.dic) and affix (.aff) @@ -68,6 +69,12 @@ public class Dictionary { public CharArrayMap<List<Affix>> prefixes; public CharArrayMap<List<Affix>> suffixes; + // all Patterns used by prefixes and suffixes. these are typically re-used across + // many affix stripping rules. so these are deduplicated, to save RAM. + // TODO: maybe don't use Pattern for the condition check... + // TODO: when we cut over Affix to FST, just store integer index to this. + public ArrayList<Pattern> patterns = new ArrayList<>(); + // the entries in the .dic file, mapping to their set of flags. 
// the fst output is the ordinal for flagLookup public FST words; @@ -184,6 +191,7 @@ public class Dictionary { private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException { prefixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false); suffixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false); + Map<String,Integer> seenPatterns = new HashMap<>(); LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder)); String line = null; @@ -191,9 +199,9 @@ public class Dictionary { if (line.startsWith(ALIAS_KEY)) { parseAlias(line); } else if (line.startsWith(PREFIX_KEY)) { - parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN); + parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns); } else if (line.startsWith(SUFFIX_KEY)) { - parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN); + parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns); } else if (line.startsWith(FLAG_KEY)) { // Assume that the FLAG line comes before any prefix or suffixes // Store the strategy so it can be used when parsing the dic file @@ -210,12 +218,14 @@ public class Dictionary { * @param reader BufferedReader to read the content of the rule from * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex * pattern + * @param seenPatterns map from condition -> index of patterns, for deduplication. 
* @throws IOException Can be thrown while reading the rule */ private void parseAffix(CharArrayMap<List<Affix>> affixes, String header, LineNumberReader reader, - String conditionPattern) throws IOException, ParseException { + String conditionPattern, + Map<String,Integer> seenPatterns) throws IOException, ParseException { String args[] = header.split("\\s+"); boolean crossProduct = args[2].equals("Y"); @@ -261,7 +271,16 @@ public class Dictionary { if (condition.indexOf('-') >= 0) { condition = condition.replace("-", "\\-"); } - affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition)); + // deduplicate patterns + String regex = String.format(Locale.ROOT, conditionPattern, condition); + Integer patternIndex = seenPatterns.get(regex); + if (patternIndex == null) { + patternIndex = patterns.size(); + seenPatterns.put(regex, patternIndex); + Pattern pattern = Pattern.compile(regex); + patterns.add(pattern); + } + affix.setCondition(patterns.get(patternIndex)); affix.setCrossProduct(crossProduct); List<Affix> list = affixes.get(affix.getAppend()); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java index ecb21b97a7c..9f9bce98236 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java @@ -33,12 +33,12 @@ import org.junit.Ignore; * wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/ * Note some of the files differ only in case. This may be a problem on your operating system! 
*/ -@Ignore("enable manually") +//@Ignore("enable manually") public class TestAllDictionaries extends LuceneTestCase { // set this to the location of where you downloaded all the files static final File DICTIONARY_HOME = - new File("/Users/rmuir/hunspell/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries"); + new File("/data/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries"); final String tests[] = { /* zip file */ /* dictionary */ /* affix */ @@ -176,7 +176,11 @@ public class TestAllDictionaries extends LuceneTestCase { try (InputStream dictionary = zip.getInputStream(dicEntry); InputStream affix = zip.getInputStream(affEntry)) { Dictionary dic = new Dictionary(affix, dictionary); - System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic)); + System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" + + "words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " + + "flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " + + "prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " + + "suffixes=" + RamUsageEstimator.humanSizeOf(dic.suffixes) + ")"); } } }