mirror of https://github.com/apache/lucene.git
LUCENE-5468: deduplicate patterns used by affix condition check
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571788 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
803226ece4
commit
10f548d205
|
@ -28,7 +28,6 @@ final class Affix {
|
|||
private char appendFlags[]; // continuation class flags
|
||||
private String strip;
|
||||
|
||||
private String condition;
|
||||
private Pattern conditionPattern;
|
||||
|
||||
private char flag;
|
||||
|
@ -99,24 +98,13 @@ final class Affix {
|
|||
this.strip = strip;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the condition that must be met before the affix can be applied
|
||||
*
|
||||
* @return Condition that must be met before the affix can be applied
|
||||
*/
|
||||
public String getCondition() {
|
||||
return condition;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the condition that must be met before the affix can be applied
|
||||
*
|
||||
* @param condition Condition to be met before affix application
|
||||
* @param pattern Condition as a regular expression pattern
|
||||
*/
|
||||
public void setCondition(String condition, String pattern) {
|
||||
this.condition = condition;
|
||||
this.conditionPattern = Pattern.compile(pattern);
|
||||
public void setCondition(Pattern pattern) {
|
||||
this.conditionPattern = pattern;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -44,6 +44,7 @@ import java.util.HashMap;
|
|||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* In-memory structure for the dictionary (.dic) and affix (.aff)
|
||||
|
@ -68,6 +69,12 @@ public class Dictionary {
|
|||
public CharArrayMap<List<Affix>> prefixes;
|
||||
public CharArrayMap<List<Affix>> suffixes;
|
||||
|
||||
// all Patterns used by prefixes and suffixes. these are typically re-used across
|
||||
// many affix stripping rules. so these are deduplicated, to save RAM.
|
||||
// TODO: maybe don't use Pattern for the condition check...
|
||||
// TODO: when we cut over Affix to FST, just store integer index to this.
|
||||
public ArrayList<Pattern> patterns = new ArrayList<>();
|
||||
|
||||
// the entries in the .dic file, mapping to their set of flags.
|
||||
// the fst output is the ordinal for flagLookup
|
||||
public FST<Long> words;
|
||||
|
@ -184,6 +191,7 @@ public class Dictionary {
|
|||
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
|
||||
prefixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
|
||||
suffixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
|
||||
Map<String,Integer> seenPatterns = new HashMap<>();
|
||||
|
||||
LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
|
||||
String line = null;
|
||||
|
@ -191,9 +199,9 @@ public class Dictionary {
|
|||
if (line.startsWith(ALIAS_KEY)) {
|
||||
parseAlias(line);
|
||||
} else if (line.startsWith(PREFIX_KEY)) {
|
||||
parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
|
||||
parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns);
|
||||
} else if (line.startsWith(SUFFIX_KEY)) {
|
||||
parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
|
||||
parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns);
|
||||
} else if (line.startsWith(FLAG_KEY)) {
|
||||
// Assume that the FLAG line comes before any prefix or suffixes
|
||||
// Store the strategy so it can be used when parsing the dic file
|
||||
|
@ -210,12 +218,14 @@ public class Dictionary {
|
|||
* @param reader BufferedReader to read the content of the rule from
|
||||
* @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
|
||||
* pattern
|
||||
* @param seenPatterns map from condition -> index of patterns, for deduplication.
|
||||
* @throws IOException Can be thrown while reading the rule
|
||||
*/
|
||||
private void parseAffix(CharArrayMap<List<Affix>> affixes,
|
||||
String header,
|
||||
LineNumberReader reader,
|
||||
String conditionPattern) throws IOException, ParseException {
|
||||
String conditionPattern,
|
||||
Map<String,Integer> seenPatterns) throws IOException, ParseException {
|
||||
String args[] = header.split("\\s+");
|
||||
|
||||
boolean crossProduct = args[2].equals("Y");
|
||||
|
@ -261,7 +271,16 @@ public class Dictionary {
|
|||
if (condition.indexOf('-') >= 0) {
|
||||
condition = condition.replace("-", "\\-");
|
||||
}
|
||||
affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition));
|
||||
// deduplicate patterns
|
||||
String regex = String.format(Locale.ROOT, conditionPattern, condition);
|
||||
Integer patternIndex = seenPatterns.get(regex);
|
||||
if (patternIndex == null) {
|
||||
patternIndex = patterns.size();
|
||||
seenPatterns.put(regex, patternIndex);
|
||||
Pattern pattern = Pattern.compile(regex);
|
||||
patterns.add(pattern);
|
||||
}
|
||||
affix.setCondition(patterns.get(patternIndex));
|
||||
affix.setCrossProduct(crossProduct);
|
||||
|
||||
List<Affix> list = affixes.get(affix.getAppend());
|
||||
|
|
|
@ -33,12 +33,12 @@ import org.junit.Ignore;
|
|||
* wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/
|
||||
* Note some of the files differ only in case. This may be a problem on your operating system!
|
||||
*/
|
||||
@Ignore("enable manually")
|
||||
//@Ignore("enable manually")
|
||||
public class TestAllDictionaries extends LuceneTestCase {
|
||||
|
||||
// set this to the location of where you downloaded all the files
|
||||
static final File DICTIONARY_HOME =
|
||||
new File("/Users/rmuir/hunspell/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries");
|
||||
new File("/data/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries");
|
||||
|
||||
final String tests[] = {
|
||||
/* zip file */ /* dictionary */ /* affix */
|
||||
|
@ -176,7 +176,11 @@ public class TestAllDictionaries extends LuceneTestCase {
|
|||
try (InputStream dictionary = zip.getInputStream(dicEntry);
|
||||
InputStream affix = zip.getInputStream(affEntry)) {
|
||||
Dictionary dic = new Dictionary(affix, dictionary);
|
||||
System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic));
|
||||
System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
|
||||
"words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
|
||||
"flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
|
||||
"prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +
|
||||
"suffixes=" + RamUsageEstimator.humanSizeOf(dic.suffixes) + ")");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue