LUCENE-5468: deduplicate patterns used by affix condition check

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571788 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2014-02-25 19:18:09 +00:00
parent 803226ece4
commit 10f548d205
3 changed files with 32 additions and 21 deletions

View File

@ -28,7 +28,6 @@ final class Affix {
private char appendFlags[]; // continuation class flags private char appendFlags[]; // continuation class flags
private String strip; private String strip;
private String condition;
private Pattern conditionPattern; private Pattern conditionPattern;
private char flag; private char flag;
@ -99,24 +98,13 @@ final class Affix {
this.strip = strip; this.strip = strip;
} }
/**
* Returns the condition that must be met before the affix can be applied
*
* @return Condition that must be met before the affix can be applied
*/
public String getCondition() {
return condition;
}
/** /**
* Sets the condition that must be met before the affix can be applied * Sets the condition that must be met before the affix can be applied
* *
* @param condition Condition to be met before affix application
* @param pattern Condition as a regular expression pattern * @param pattern Condition as a regular expression pattern
*/ */
public void setCondition(String condition, String pattern) { public void setCondition(Pattern pattern) {
this.condition = condition; this.conditionPattern = pattern;
this.conditionPattern = Pattern.compile(pattern);
} }
/** /**

View File

@ -44,6 +44,7 @@ import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.regex.Pattern;
/** /**
* In-memory structure for the dictionary (.dic) and affix (.aff) * In-memory structure for the dictionary (.dic) and affix (.aff)
@ -68,6 +69,12 @@ public class Dictionary {
public CharArrayMap<List<Affix>> prefixes; public CharArrayMap<List<Affix>> prefixes;
public CharArrayMap<List<Affix>> suffixes; public CharArrayMap<List<Affix>> suffixes;
// all Patterns used by prefixes and suffixes. these are typically re-used across
// many affix stripping rules. so these are deduplicated, to save RAM.
// TODO: maybe don't use Pattern for the condition check...
// TODO: when we cut over Affix to FST, just store integer index to this.
public ArrayList<Pattern> patterns = new ArrayList<>();
// the entries in the .dic file, mapping to their set of flags. // the entries in the .dic file, mapping to their set of flags.
// the fst output is the ordinal for flagLookup // the fst output is the ordinal for flagLookup
public FST<Long> words; public FST<Long> words;
@ -184,6 +191,7 @@ public class Dictionary {
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException { private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
prefixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false); prefixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
suffixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false); suffixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
Map<String,Integer> seenPatterns = new HashMap<>();
LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder)); LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
String line = null; String line = null;
@ -191,9 +199,9 @@ public class Dictionary {
if (line.startsWith(ALIAS_KEY)) { if (line.startsWith(ALIAS_KEY)) {
parseAlias(line); parseAlias(line);
} else if (line.startsWith(PREFIX_KEY)) { } else if (line.startsWith(PREFIX_KEY)) {
parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN); parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns);
} else if (line.startsWith(SUFFIX_KEY)) { } else if (line.startsWith(SUFFIX_KEY)) {
parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN); parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns);
} else if (line.startsWith(FLAG_KEY)) { } else if (line.startsWith(FLAG_KEY)) {
// Assume that the FLAG line comes before any prefix or suffixes // Assume that the FLAG line comes before any prefix or suffixes
// Store the strategy so it can be used when parsing the dic file // Store the strategy so it can be used when parsing the dic file
@ -210,12 +218,14 @@ public class Dictionary {
* @param reader BufferedReader to read the content of the rule from * @param reader BufferedReader to read the content of the rule from
* @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
* pattern * pattern
* @param seenPatterns map from condition -> index of patterns, for deduplication.
* @throws IOException Can be thrown while reading the rule * @throws IOException Can be thrown while reading the rule
*/ */
private void parseAffix(CharArrayMap<List<Affix>> affixes, private void parseAffix(CharArrayMap<List<Affix>> affixes,
String header, String header,
LineNumberReader reader, LineNumberReader reader,
String conditionPattern) throws IOException, ParseException { String conditionPattern,
Map<String,Integer> seenPatterns) throws IOException, ParseException {
String args[] = header.split("\\s+"); String args[] = header.split("\\s+");
boolean crossProduct = args[2].equals("Y"); boolean crossProduct = args[2].equals("Y");
@ -261,7 +271,16 @@ public class Dictionary {
if (condition.indexOf('-') >= 0) { if (condition.indexOf('-') >= 0) {
condition = condition.replace("-", "\\-"); condition = condition.replace("-", "\\-");
} }
affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition)); // deduplicate patterns
String regex = String.format(Locale.ROOT, conditionPattern, condition);
Integer patternIndex = seenPatterns.get(regex);
if (patternIndex == null) {
patternIndex = patterns.size();
seenPatterns.put(regex, patternIndex);
Pattern pattern = Pattern.compile(regex);
patterns.add(pattern);
}
affix.setCondition(patterns.get(patternIndex));
affix.setCrossProduct(crossProduct); affix.setCrossProduct(crossProduct);
List<Affix> list = affixes.get(affix.getAppend()); List<Affix> list = affixes.get(affix.getAppend());

View File

@ -33,12 +33,12 @@ import org.junit.Ignore;
* wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/ * wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/
* Note some of the files differ only in case. This may be a problem on your operating system! * Note some of the files differ only in case. This may be a problem on your operating system!
*/ */
@Ignore("enable manually") //@Ignore("enable manually")
public class TestAllDictionaries extends LuceneTestCase { public class TestAllDictionaries extends LuceneTestCase {
// set this to the location of where you downloaded all the files // set this to the location of where you downloaded all the files
static final File DICTIONARY_HOME = static final File DICTIONARY_HOME =
new File("/Users/rmuir/hunspell/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries"); new File("/data/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries");
final String tests[] = { final String tests[] = {
/* zip file */ /* dictionary */ /* affix */ /* zip file */ /* dictionary */ /* affix */
@ -176,7 +176,11 @@ public class TestAllDictionaries extends LuceneTestCase {
try (InputStream dictionary = zip.getInputStream(dicEntry); try (InputStream dictionary = zip.getInputStream(dicEntry);
InputStream affix = zip.getInputStream(affEntry)) { InputStream affix = zip.getInputStream(affEntry)) {
Dictionary dic = new Dictionary(affix, dictionary); Dictionary dic = new Dictionary(affix, dictionary);
System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic)); System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
"words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
"flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
"prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +
"suffixes=" + RamUsageEstimator.humanSizeOf(dic.suffixes) + ")");
} }
} }
} }