mirror of https://github.com/apache/lucene.git
LUCENE-5468: deduplicate patterns used by affix condition check
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571788 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
803226ece4
commit
10f548d205
|
@ -28,7 +28,6 @@ final class Affix {
|
||||||
private char appendFlags[]; // continuation class flags
|
private char appendFlags[]; // continuation class flags
|
||||||
private String strip;
|
private String strip;
|
||||||
|
|
||||||
private String condition;
|
|
||||||
private Pattern conditionPattern;
|
private Pattern conditionPattern;
|
||||||
|
|
||||||
private char flag;
|
private char flag;
|
||||||
|
@ -99,24 +98,13 @@ final class Affix {
|
||||||
this.strip = strip;
|
this.strip = strip;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the condition that must be met before the affix can be applied
|
|
||||||
*
|
|
||||||
* @return Condition that must be met before the affix can be applied
|
|
||||||
*/
|
|
||||||
public String getCondition() {
|
|
||||||
return condition;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Sets the condition that must be met before the affix can be applied
|
* Sets the condition that must be met before the affix can be applied
|
||||||
*
|
*
|
||||||
* @param condition Condition to be met before affix application
|
|
||||||
* @param pattern Condition as a regular expression pattern
|
* @param pattern Condition as a regular expression pattern
|
||||||
*/
|
*/
|
||||||
public void setCondition(String condition, String pattern) {
|
public void setCondition(Pattern pattern) {
|
||||||
this.condition = condition;
|
this.conditionPattern = pattern;
|
||||||
this.conditionPattern = Pattern.compile(pattern);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -44,6 +44,7 @@ import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* In-memory structure for the dictionary (.dic) and affix (.aff)
|
* In-memory structure for the dictionary (.dic) and affix (.aff)
|
||||||
|
@ -68,6 +69,12 @@ public class Dictionary {
|
||||||
public CharArrayMap<List<Affix>> prefixes;
|
public CharArrayMap<List<Affix>> prefixes;
|
||||||
public CharArrayMap<List<Affix>> suffixes;
|
public CharArrayMap<List<Affix>> suffixes;
|
||||||
|
|
||||||
|
// All Patterns used by prefixes and suffixes. These are typically reused across
|
||||||
|
// many affix-stripping rules, so they are deduplicated to save RAM.
|
||||||
|
// TODO: maybe don't use Pattern for the condition check...
|
||||||
|
// TODO: when we cut over Affix to FST, just store integer index to this.
|
||||||
|
public ArrayList<Pattern> patterns = new ArrayList<>();
|
||||||
|
|
||||||
// the entries in the .dic file, mapping to their set of flags.
|
// the entries in the .dic file, mapping to their set of flags.
|
||||||
// the fst output is the ordinal for flagLookup
|
// the fst output is the ordinal for flagLookup
|
||||||
public FST<Long> words;
|
public FST<Long> words;
|
||||||
|
@ -184,6 +191,7 @@ public class Dictionary {
|
||||||
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
|
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
|
||||||
prefixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
|
prefixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
|
||||||
suffixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
|
suffixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
|
||||||
|
Map<String,Integer> seenPatterns = new HashMap<>();
|
||||||
|
|
||||||
LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
|
LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
|
||||||
String line = null;
|
String line = null;
|
||||||
|
@ -191,9 +199,9 @@ public class Dictionary {
|
||||||
if (line.startsWith(ALIAS_KEY)) {
|
if (line.startsWith(ALIAS_KEY)) {
|
||||||
parseAlias(line);
|
parseAlias(line);
|
||||||
} else if (line.startsWith(PREFIX_KEY)) {
|
} else if (line.startsWith(PREFIX_KEY)) {
|
||||||
parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
|
parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns);
|
||||||
} else if (line.startsWith(SUFFIX_KEY)) {
|
} else if (line.startsWith(SUFFIX_KEY)) {
|
||||||
parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
|
parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns);
|
||||||
} else if (line.startsWith(FLAG_KEY)) {
|
} else if (line.startsWith(FLAG_KEY)) {
|
||||||
// Assume that the FLAG line comes before any prefix or suffixes
|
// Assume that the FLAG line comes before any prefix or suffixes
|
||||||
// Store the strategy so it can be used when parsing the dic file
|
// Store the strategy so it can be used when parsing the dic file
|
||||||
|
@ -210,12 +218,14 @@ public class Dictionary {
|
||||||
* @param reader LineNumberReader to read the content of the rule from
|
* @param reader LineNumberReader to read the content of the rule from
|
||||||
* @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
|
* @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
|
||||||
* pattern
|
* pattern
|
||||||
|
* @param seenPatterns map from the generated condition regex to its index in {@code patterns}, used for deduplication.
|
||||||
* @throws IOException Can be thrown while reading the rule
|
* @throws IOException Can be thrown while reading the rule
|
||||||
*/
|
*/
|
||||||
private void parseAffix(CharArrayMap<List<Affix>> affixes,
|
private void parseAffix(CharArrayMap<List<Affix>> affixes,
|
||||||
String header,
|
String header,
|
||||||
LineNumberReader reader,
|
LineNumberReader reader,
|
||||||
String conditionPattern) throws IOException, ParseException {
|
String conditionPattern,
|
||||||
|
Map<String,Integer> seenPatterns) throws IOException, ParseException {
|
||||||
String args[] = header.split("\\s+");
|
String args[] = header.split("\\s+");
|
||||||
|
|
||||||
boolean crossProduct = args[2].equals("Y");
|
boolean crossProduct = args[2].equals("Y");
|
||||||
|
@ -261,7 +271,16 @@ public class Dictionary {
|
||||||
if (condition.indexOf('-') >= 0) {
|
if (condition.indexOf('-') >= 0) {
|
||||||
condition = condition.replace("-", "\\-");
|
condition = condition.replace("-", "\\-");
|
||||||
}
|
}
|
||||||
affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition));
|
// deduplicate patterns
|
||||||
|
String regex = String.format(Locale.ROOT, conditionPattern, condition);
|
||||||
|
Integer patternIndex = seenPatterns.get(regex);
|
||||||
|
if (patternIndex == null) {
|
||||||
|
patternIndex = patterns.size();
|
||||||
|
seenPatterns.put(regex, patternIndex);
|
||||||
|
Pattern pattern = Pattern.compile(regex);
|
||||||
|
patterns.add(pattern);
|
||||||
|
}
|
||||||
|
affix.setCondition(patterns.get(patternIndex));
|
||||||
affix.setCrossProduct(crossProduct);
|
affix.setCrossProduct(crossProduct);
|
||||||
|
|
||||||
List<Affix> list = affixes.get(affix.getAppend());
|
List<Affix> list = affixes.get(affix.getAppend());
|
||||||
|
|
|
@ -33,12 +33,12 @@ import org.junit.Ignore;
|
||||||
* wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/
|
* wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/
|
||||||
* Note some of the files differ only in case. This may be a problem on your operating system!
|
* Note some of the files differ only in case. This may be a problem on your operating system!
|
||||||
*/
|
*/
|
||||||
@Ignore("enable manually")
|
//@Ignore("enable manually")
|
||||||
public class TestAllDictionaries extends LuceneTestCase {
|
public class TestAllDictionaries extends LuceneTestCase {
|
||||||
|
|
||||||
// set this to the location of where you downloaded all the files
|
// set this to the location of where you downloaded all the files
|
||||||
static final File DICTIONARY_HOME =
|
static final File DICTIONARY_HOME =
|
||||||
new File("/Users/rmuir/hunspell/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries");
|
new File("/data/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries");
|
||||||
|
|
||||||
final String tests[] = {
|
final String tests[] = {
|
||||||
/* zip file */ /* dictionary */ /* affix */
|
/* zip file */ /* dictionary */ /* affix */
|
||||||
|
@ -176,7 +176,11 @@ public class TestAllDictionaries extends LuceneTestCase {
|
||||||
try (InputStream dictionary = zip.getInputStream(dicEntry);
|
try (InputStream dictionary = zip.getInputStream(dicEntry);
|
||||||
InputStream affix = zip.getInputStream(affEntry)) {
|
InputStream affix = zip.getInputStream(affEntry)) {
|
||||||
Dictionary dic = new Dictionary(affix, dictionary);
|
Dictionary dic = new Dictionary(affix, dictionary);
|
||||||
System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic));
|
System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
|
||||||
|
"words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
|
||||||
|
"flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
|
||||||
|
"prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " +
|
||||||
|
"suffixes=" + RamUsageEstimator.humanSizeOf(dic.suffixes) + ")");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue