mirror of https://github.com/apache/lucene.git
LUCENE-5468: commit current state
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571137 13f79535-47bb-0310-9956-ffa450edef68
parent 6a4e1e3a92
commit 2e0fc562bc
@@ -0,0 +1,157 @@
package org.apache.lucene.analysis.hunspell2;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.regex.Pattern;

/**
 * Wrapper class representing a hunspell affix
 */
final class Affix {

  private String append; // the affix itself, what is appended
  private char appendFlags[]; // continuation class flags
  private String strip;

  private String condition;
  private Pattern conditionPattern;

  private char flag;

  private boolean crossProduct;

  /**
   * Checks whether the given text matches the conditional pattern on this affix
   *
   * @param text Text to check if it matches the affix's conditional pattern
   * @return {@code true} if the text meets the condition, {@code false} otherwise
   */
  public boolean checkCondition(CharSequence text) {
    return conditionPattern.matcher(text).matches();
  }

  /**
   * Returns the append defined for the affix
   *
   * @return Defined append
   */
  public String getAppend() {
    return append;
  }

  /**
   * Sets the append defined for the affix
   *
   * @param append Defined append for the affix
   */
  public void setAppend(String append) {
    this.append = append;
  }

  /**
   * Returns the flags defined for the affix append
   *
   * @return Flags defined for the affix append
   */
  public char[] getAppendFlags() {
    return appendFlags;
  }

  /**
   * Sets the flags defined for the affix append
   *
   * @param appendFlags Flags defined for the affix append
   */
  public void setAppendFlags(char[] appendFlags) {
    this.appendFlags = appendFlags;
  }

  /**
   * Returns the stripping characters defined for the affix
   *
   * @return Stripping characters defined for the affix
   */
  public String getStrip() {
    return strip;
  }

  /**
   * Sets the stripping characters defined for the affix
   *
   * @param strip Stripping characters defined for the affix
   */
  public void setStrip(String strip) {
    this.strip = strip;
  }

  /**
   * Returns the condition that must be met before the affix can be applied
   *
   * @return Condition that must be met before the affix can be applied
   */
  public String getCondition() {
    return condition;
  }

  /**
   * Sets the condition that must be met before the affix can be applied
   *
   * @param condition Condition to be met before affix application
   * @param pattern Condition as a regular expression pattern
   */
  public void setCondition(String condition, String pattern) {
    this.condition = condition;
    this.conditionPattern = Pattern.compile(pattern);
  }

  /**
   * Returns the affix flag
   *
   * @return Affix flag
   */
  public char getFlag() {
    return flag;
  }

  /**
   * Sets the affix flag
   *
   * @param flag Affix flag
   */
  public void setFlag(char flag) {
    this.flag = flag;
  }

  /**
   * Returns whether the affix is defined as cross product
   *
   * @return {@code true} if the affix is cross product, {@code false} otherwise
   */
  public boolean isCrossProduct() {
    return crossProduct;
  }

  /**
   * Sets whether the affix is defined as cross product
   *
   * @param crossProduct Whether the affix is defined as cross product
   */
  public void setCrossProduct(boolean crossProduct) {
    this.crossProduct = crossProduct;
  }
}
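A minimal sketch (not part of this commit) of how Dictionary.parseAffix below wires one of these up; it assumes same-package code since Affix is package-private, and the rule text ("SFX D 0 ed [^ey]") is only illustrative. The bare condition is anchored with the ".*%s" suffix template before compiling:

Affix suffix = new Affix();                    // for a rule line like "SFX D 0 ed [^ey]"
suffix.setFlag('D');
suffix.setStrip("");                           // "0" in the rule means nothing is stripped
suffix.setAppend("ed");
suffix.setCondition("[^ey]", String.format(java.util.Locale.ROOT, ".*%s", "[^ey]"));
suffix.setCrossProduct(true);

suffix.checkCondition("walk");                 // true: does not end in 'e' or 'y'
suffix.checkCondition("obey");                 // false: ends in 'y'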
@@ -0,0 +1,606 @@
package org.apache.lucene.analysis.hunspell2;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;

import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;

/**
 * In-memory structure for the dictionary (.dic) and affix (.aff)
 * data of a hunspell dictionary.
 */
public class Dictionary {

  static final char[] NOFLAGS = new char[0];

  private static final String ALIAS_KEY = "AF";
  private static final String PREFIX_KEY = "PFX";
  private static final String SUFFIX_KEY = "SFX";
  private static final String FLAG_KEY = "FLAG";

  private static final String NUM_FLAG_TYPE = "num";
  private static final String UTF8_FLAG_TYPE = "UTF-8";
  private static final String LONG_FLAG_TYPE = "long";

  private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";

  public CharArrayMap<List<Affix>> prefixes;
  public CharArrayMap<List<Affix>> suffixes;

  // the entries in the .dic file, mapping to their set of flags.
  // the fst output is the ordinal for flagLookup
  public FST<Long> words;
  // the list of unique flagsets (wordforms). theoretically huge, but practically
  // small (e.g. for polish this is 756), otherwise humans wouldn't be able to deal with it either.
  public BytesRefHash flagLookup = new BytesRefHash();

  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy

  private String[] aliases;
  private int aliasCount = 0;

  /**
   * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files.
   * You have to close the provided InputStreams yourself.
   *
   * @param affix InputStream for reading the hunspell affix file (won't be closed).
   * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public Dictionary(InputStream affix, InputStream dictionary) throws IOException, ParseException {
    BufferedInputStream buffered = new BufferedInputStream(affix, 8192);
    buffered.mark(8192);
    String encoding = getDictionaryEncoding(affix);
    buffered.reset();
    CharsetDecoder decoder = getJavaEncoding(encoding);
    readAffixFile(buffered, decoder);
    TreeMap<BytesRef,Integer> tempWords = new TreeMap<BytesRef,Integer>();
    flagLookup.add(new BytesRef()); // no flags -> ord 0
    readDictionaryFile(dictionary, decoder, tempWords);
    PositiveIntOutputs o = PositiveIntOutputs.getSingleton();
    Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE4, o); // nocommit: byte4
    IntsRef scratchInts = new IntsRef();
    for (Map.Entry<BytesRef,Integer> e : tempWords.entrySet()) {
      UnicodeUtil.UTF8toUTF32(e.getKey(), scratchInts);
      b.add(scratchInts, e.getValue().longValue());
    }
    words = b.finish();
  }

  /**
   * Looks up the word that matches the String created from the given char array, offset and length
   *
   * @param word Char array to generate the String from
   * @param offset Offset in the char array that the String starts at
   * @param length Length from the offset that the String is
   * @return The word's decoded flags, or {@code null} if the word is not in the dictionary
   */
  char[] lookupWord(char word[], int offset, int length, BytesRef scratch) {
    Integer ord = null;
    try {
      ord = lookupOrd(word, offset, length);
    } catch (IOException ex) { /* bogus */ }
    if (ord == null) {
      return null;
    }
    return decodeFlags(flagLookup.get(ord, scratch));
  }

  public Integer lookupOrd(char word[], int offset, int length) throws IOException {
    final FST.BytesReader bytesReader = words.getBytesReader();
    final FST.Arc<Long> arc = words.getFirstArc(new FST.Arc<Long>());
    // Accumulate output as we go
    final Long NO_OUTPUT = words.outputs.getNoOutput();
    Long output = NO_OUTPUT;

    int l = offset + length;
    for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) {
      cp = Character.codePointAt(word, i, l);
      if (words.findTargetArc(cp, arc, arc, bytesReader) == null) {
        return null;
      } else if (arc.output != NO_OUTPUT) {
        output = words.outputs.add(output, arc.output);
      }
    }
    if (words.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) {
      return null;
    } else if (arc.output != NO_OUTPUT) {
      return words.outputs.add(output, arc.output).intValue();
    } else {
      return output.intValue();
    }
  }

  /**
   * Looks up Affix prefixes that have an append that matches the String created from the given char array, offset and length
   *
   * @param word Char array to generate the String from
   * @param offset Offset in the char array that the String starts at
   * @param length Length from the offset that the String is
   * @return List of Affix prefixes with an append that matches the String, or {@code null} if none are found
   */
  public List<Affix> lookupPrefix(char word[], int offset, int length) {
    return prefixes.get(word, offset, length);
  }

  /**
   * Looks up Affix suffixes that have an append that matches the String created from the given char array, offset and length
   *
   * @param word Char array to generate the String from
   * @param offset Offset in the char array that the String starts at
   * @param length Length from the offset that the String is
   * @return List of Affix suffixes with an append that matches the String, or {@code null} if none are found
   */
  List<Affix> lookupSuffix(char word[], int offset, int length) {
    return suffixes.get(word, offset, length);
  }

  /**
   * Reads the affix file through the provided InputStream, building up the prefix and suffix maps
   *
   * @param affixStream InputStream to read the content of the affix file from
   * @param decoder CharsetDecoder to decode the content of the file
   * @throws IOException Can be thrown while reading from the InputStream
   */
  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
    prefixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
    suffixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);

    LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
    String line = null;
    while ((line = reader.readLine()) != null) {
      if (line.startsWith(ALIAS_KEY)) {
        parseAlias(line);
      } else if (line.startsWith(PREFIX_KEY)) {
        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
      } else if (line.startsWith(SUFFIX_KEY)) {
        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
      } else if (line.startsWith(FLAG_KEY)) {
        // Assume that the FLAG line comes before any prefix or suffixes
        // Store the strategy so it can be used when parsing the dic file
        flagParsingStrategy = getFlagParsingStrategy(line);
      }
    }
  }

  /**
   * Parses a specific affix rule, putting the result into the provided affix map
   *
   * @param affixes Map where the result of the parsing will be put
   * @param header Header line of the affix rule
   * @param reader LineNumberReader to read the content of the rule from
   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
   *                         pattern
   * @throws IOException Can be thrown while reading the rule
   */
  private void parseAffix(CharArrayMap<List<Affix>> affixes,
                          String header,
                          LineNumberReader reader,
                          String conditionPattern) throws IOException, ParseException {
    String args[] = header.split("\\s+");

    boolean crossProduct = args[2].equals("Y");

    int numLines = Integer.parseInt(args[3]);
    for (int i = 0; i < numLines; i++) {
      String line = reader.readLine();
      String ruleArgs[] = line.split("\\s+");

      if (ruleArgs.length < 5) {
        throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber());
      }

      Affix affix = new Affix();

      affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1]));
      affix.setStrip(ruleArgs[2].equals("0") ? "" : ruleArgs[2]);

      String affixArg = ruleArgs[3];

      int flagSep = affixArg.lastIndexOf('/');
      if (flagSep != -1) {
        String flagPart = affixArg.substring(flagSep + 1);

        if (aliasCount > 0) {
          flagPart = getAliasValue(Integer.parseInt(flagPart));
        }

        char appendFlags[] = flagParsingStrategy.parseFlags(flagPart);
        Arrays.sort(appendFlags);
        affix.setAppendFlags(appendFlags);
        affix.setAppend(affixArg.substring(0, flagSep));
      } else {
        affix.setAppend(affixArg);
      }

      String condition = ruleArgs[4];
      // at least the gascon affix file has this issue
      if (condition.startsWith("[") && !condition.endsWith("]")) {
        condition = condition + "]";
      }
      // "dash hasn't got special meaning" (we must escape it)
      if (condition.indexOf('-') >= 0) {
        condition = condition.replace("-", "\\-");
      }
      affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition));
      affix.setCrossProduct(crossProduct);

      List<Affix> list = affixes.get(affix.getAppend());
      if (list == null) {
        list = new ArrayList<Affix>();
        affixes.put(affix.getAppend(), list);
      }

      list.add(affix);
    }
  }

  /**
   * Parses the encoding specified in the affix file readable through the provided InputStream
   *
   * @param affix InputStream for reading the affix file
   * @return Encoding specified in the affix file
   * @throws IOException Can be thrown while reading from the InputStream
   * @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET <encoding>}
   */
  private String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
    final StringBuilder encoding = new StringBuilder();
    for (;;) {
      encoding.setLength(0);
      int ch;
      while ((ch = affix.read()) >= 0) {
        if (ch == '\n') {
          break;
        }
        if (ch != '\r') {
          encoding.append((char)ch);
        }
      }
      if (
          encoding.length() == 0 || encoding.charAt(0) == '#' ||
          // this (inefficient) test is done last; it also skips lines that contain only spaces:
          encoding.toString().trim().length() == 0
         ) {
        if (ch < 0) {
          throw new ParseException("Unexpected end of affix file.", 0);
        }
        continue;
      }
      if (encoding.length() > 4 && "SET ".equals(encoding.substring(0, 4))) {
        // cleanup the encoding string, too (whitespace)
        return encoding.substring(4).trim();
      }
    }
  }

  static final Map<String,String> CHARSET_ALIASES;
  static {
    Map<String,String> m = new HashMap<>();
    m.put("microsoft-cp1251", "windows-1251");
    m.put("TIS620-2533", "TIS-620");
    CHARSET_ALIASES = Collections.unmodifiableMap(m);
  }

  /**
   * Retrieves the CharsetDecoder for the given encoding. Note: this isn't perfect, as I think
   * ISCII-DEVANAGARI, MICROSOFT-CP1251, etc. are also allowed...
   *
   * @param encoding Encoding to retrieve the CharsetDecoder for
   * @return CharsetDecoder for the given encoding
   */
  private CharsetDecoder getJavaEncoding(String encoding) {
    if ("ISO8859-14".equals(encoding)) {
      return new ISO8859_14Decoder();
    }
    String canon = CHARSET_ALIASES.get(encoding);
    if (canon != null) {
      encoding = canon;
    }
    Charset charset = Charset.forName(encoding);
    return charset.newDecoder().onMalformedInput(CodingErrorAction.REPLACE);
  }

  /**
   * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definition line taken from the affix file
   *
   * @param flagLine Line containing the flag information
   * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition
   */
  private FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
    String flagType = flagLine.substring(5);

    if (NUM_FLAG_TYPE.equals(flagType)) {
      return new NumFlagParsingStrategy();
    } else if (UTF8_FLAG_TYPE.equals(flagType)) {
      return new SimpleFlagParsingStrategy();
    } else if (LONG_FLAG_TYPE.equals(flagType)) {
      return new DoubleASCIIFlagParsingStrategy();
    }

    throw new IllegalArgumentException("Unknown flag type: " + flagType);
  }

  /**
   * Reads the dictionary file through the provided InputStream, building up the words map
   *
   * @param dictionary InputStream to read the dictionary file through
   * @param decoder CharsetDecoder used to decode the contents of the file
   * @throws IOException Can be thrown while reading from the file
   */
  private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder, TreeMap<BytesRef,Integer> words) throws IOException {
    BytesRef flagsScratch = new BytesRef();
    BytesRef flagsScratch2 = new BytesRef();

    BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder));
    // TODO: don't create millions of strings.
    String line = reader.readLine(); // first line is number of entries
    // sometimes the number of entries has a comment/copyright after it
    line = line.replaceFirst("\\s*\\#.*$", "");
    int numEntries = Integer.parseInt(line);

    // TODO: the flags themselves can be double-chars (long) or also numeric
    // either way the trick is to encode them as char... but they must be parsed differently
    while ((line = reader.readLine()) != null) {
      String entry;
      char wordForm[];

      int flagSep = line.lastIndexOf('/');
      if (flagSep == -1) {
        wordForm = NOFLAGS;
        entry = line;
      } else {
        // note, there can be comments (morph description) after a flag.
        // we should really look for any whitespace
        int end = line.indexOf('\t', flagSep);
        if (end == -1)
          end = line.length();

        String flagPart = line.substring(flagSep + 1, end);
        if (aliasCount > 0) {
          flagPart = getAliasValue(Integer.parseInt(flagPart));
        }

        wordForm = flagParsingStrategy.parseFlags(flagPart);
        Arrays.sort(wordForm);
        entry = line.substring(0, flagSep);
      }

      BytesRef scratch = new BytesRef(entry);
      Integer existingOrd = words.get(scratch);
      final char mergedEntries[];
      if (existingOrd == null || existingOrd == 0) {
        mergedEntries = wordForm;
      } else {
        // the word was already seen with other flags: merge the two sorted flag sets
        flagLookup.get(existingOrd, flagsScratch2);
        mergedEntries = merge(decodeFlags(flagsScratch2), wordForm);
      }

      final int hashCode = encodeFlagsWithHash(flagsScratch, mergedEntries);
      int ord = flagLookup.add(flagsScratch, hashCode);
      if (ord < 0) {
        // already exists in our hash
        ord = (-ord)-1;
      }

      words.put(scratch, ord);
    }
  }

  // decodes a flag set that was written by encodeFlagsWithHash: two big-endian bytes per char
  static char[] decodeFlags(BytesRef b) {
    int len = b.length >>> 1;
    char flags[] = new char[len];
    int upto = 0;
    int end = b.offset + b.length;
    for (int i = b.offset; i < end; i += 2) {
      flags[upto++] = (char)((b.bytes[i] << 8) | (b.bytes[i+1] & 0xff));
    }
    return flags;
  }

  // encodes a flag set into the BytesRef (two bytes per char) and returns its hash for flagLookup
  static int encodeFlagsWithHash(BytesRef b, char flags[]) {
    int hash = 0;
    int len = flags.length << 1;
    b.grow(len);
    b.length = len;
    int upto = b.offset;
    for (int i = 0; i < flags.length; i++) {
      int flag = flags[i];
      hash = 31*hash + (b.bytes[upto++] = (byte) ((flag >> 8) & 0xff));
      hash = 31*hash + (b.bytes[upto++] = (byte) (flag & 0xff));
    }
    return hash;
  }

  private void parseAlias(String line) {
    String ruleArgs[] = line.split("\\s+");
    if (aliases == null) {
      //first line should be the aliases count
      final int count = Integer.parseInt(ruleArgs[1]);
      aliases = new String[count];
    } else {
      aliases[aliasCount++] = ruleArgs[1];
    }
  }

  private String getAliasValue(int id) {
    try {
      return aliases[id - 1];
    } catch (IndexOutOfBoundsException ex) {
      throw new IllegalArgumentException("Bad flag alias number: " + id, ex);
    }
  }

  /**
   * Abstraction of the process of parsing flags taken from the affix and dic files
   */
  private static abstract class FlagParsingStrategy {

    /**
     * Parses the given String into a single flag
     *
     * @param rawFlag String to parse into a flag
     * @return Parsed flag
     */
    char parseFlag(String rawFlag) {
      return parseFlags(rawFlag)[0];
    }

    /**
     * Parses the given String into multiple flags
     *
     * @param rawFlags String to parse into flags
     * @return Parsed flags
     */
    abstract char[] parseFlags(String rawFlags);
  }

  /**
   * Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as individual flags.
   * Can be used with both the ASCII and UTF-8 flag types.
   */
  private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {
    @Override
    public char[] parseFlags(String rawFlags) {
      return rawFlags.toCharArray();
    }
  }

  /**
   * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form. In the case
   * of multiple flags, each number is separated by a comma.
   */
  private static class NumFlagParsingStrategy extends FlagParsingStrategy {
    @Override
    public char[] parseFlags(String rawFlags) {
      String[] rawFlagParts = rawFlags.trim().split(",");
      char[] flags = new char[rawFlagParts.length];
      int upto = 0;

      for (int i = 0; i < rawFlagParts.length; i++) {
        // note, removing the trailing X/leading I for nepali... what is the rule here?!
        String replacement = rawFlagParts[i].replaceAll("[^0-9]", "");
        // note, ignoring empty flags (this happens in danish, for example)
        if (replacement.isEmpty()) {
          continue;
        }
        flags[upto++] = (char) Integer.parseInt(replacement);
      }

      if (upto < flags.length) {
        flags = Arrays.copyOf(flags, upto);
      }
      return flags;
    }
  }

  /**
   * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes
   * must be combined into a single character.
   *
   * TODO (rmuir) test
   */
  private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy {

    @Override
    public char[] parseFlags(String rawFlags) {
      if (rawFlags.length() == 0) {
        return new char[0];
      }

      StringBuilder builder = new StringBuilder();
      for (int i = 0; i < rawFlags.length(); i+=2) {
        char cookedFlag = (char) ((int) rawFlags.charAt(i) + (int) rawFlags.charAt(i + 1));
        builder.append(cookedFlag);
      }

      char flags[] = new char[builder.length()];
      builder.getChars(0, builder.length(), flags, 0);
      return flags;
    }
  }

  static boolean hasFlag(char flags[], char flag) {
    return Arrays.binarySearch(flags, flag) >= 0;
  }

  // merges two sorted flag arrays into one sorted array without duplicates
  static char[] merge(char[] flags1, char[] flags2) {
    char merged[] = new char[flags1.length + flags2.length];
    int i1 = 0, i2 = 0;
    int last = -1;
    int upto = 0;

    while (i1 < flags1.length && i2 < flags2.length) {
      final char next;
      if (flags1[i1] <= flags2[i2]) {
        next = flags1[i1++];
      } else {
        next = flags2[i2++];
      }
      if (next != last) {
        merged[upto++] = next;
        last = next;
      }
    }

    while (i1 < flags1.length) {
      char next = flags1[i1++];
      if (next != last) {
        merged[upto++] = next;
        last = next;
      }
    }

    while (i2 < flags2.length) {
      char next = flags2[i2++];
      if (next != last) {
        merged[upto++] = next;
        last = next;
      }
    }

    if (merged.length != upto) {
      merged = Arrays.copyOf(merged, upto);
    }

    return merged;
  }
}
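A usage sketch (not part of this commit; file names are hypothetical). It assumes code in the same package, since lookupWord is package-private; the try-with-resources closes the streams because Dictionary deliberately does not:

static boolean isKnownWord(String word) throws IOException, ParseException {
  try (InputStream affix = new FileInputStream("en_US.aff");
       InputStream dic = new FileInputStream("en_US.dic")) {
    Dictionary dictionary = new Dictionary(affix, dic);
    char[] flags = dictionary.lookupWord(word.toCharArray(), 0, word.length(), new BytesRef());
    return flags != null; // null means the word is not in the .dic file
  }
}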
@@ -0,0 +1,139 @@
package org.apache.lucene.analysis.hunspell2;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.List;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.hunspell2.Stemmer.Stem;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/**
 * TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple
 * stems, this filter can emit multiple tokens for each consumed token
 *
 * <p>
 * Note: This filter is aware of the {@link KeywordAttribute}. To prevent
 * certain terms from being passed to the stemmer
 * {@link KeywordAttribute#isKeyword()} should be set to <code>true</code>
 * in a previous {@link TokenStream}.
 *
 * Note: For including the original term as well as the stemmed version, see
 * {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory}
 * </p>
 *
 * @lucene.experimental
 */
public final class Hunspell2StemFilter extends TokenFilter {

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
  private final Stemmer stemmer;

  private List<Stem> buffer;
  private State savedState;

  private final boolean dedup;

  /** Create a {@link Hunspell2StemFilter} which deduplicates stems and has a maximum
   *  recursion level of 2.
   *  @see #Hunspell2StemFilter(TokenStream, Dictionary, int) */
  public Hunspell2StemFilter(TokenStream input, Dictionary dictionary) {
    this(input, dictionary, 2);
  }

  /**
   * Creates a new Hunspell2StemFilter that will stem tokens from the given TokenStream using affix rules in the provided
   * Dictionary
   *
   * @param input TokenStream whose tokens will be stemmed
   * @param dictionary Dictionary containing the affix rules and words that will be used to stem the tokens
   * @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
   */
  public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, int recursionCap) {
    this(input, dictionary, true, recursionCap);
  }

  /** Create a {@link Hunspell2StemFilter} which has a maximum recursion level of 2.
   *  @see #Hunspell2StemFilter(TokenStream, Dictionary, boolean, int) */
  public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup) {
    this(input, dictionary, dedup, 2);
  }

  /**
   * Creates a new Hunspell2StemFilter that will stem tokens from the given TokenStream using affix rules in the provided
   * Dictionary
   *
   * @param input TokenStream whose tokens will be stemmed
   * @param dictionary Dictionary containing the affix rules and words that will be used to stem the tokens
   * @param dedup true if only unique terms should be output.
   * @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
   */
  public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap) {
    super(input);
    this.dedup = dedup;
    this.stemmer = new Stemmer(dictionary, recursionCap);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (buffer != null && !buffer.isEmpty()) {
      Stem nextStem = buffer.remove(0);
      restoreState(savedState);
      posIncAtt.setPositionIncrement(0);
      termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength());
      termAtt.setLength(nextStem.getStemLength());
      return true;
    }

    if (!input.incrementToken()) {
      return false;
    }

    if (keywordAtt.isKeyword()) {
      return true;
    }

    buffer = dedup ? stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length());

    if (buffer.isEmpty()) { // we do not know this word, return it unchanged
      return true;
    }

    Stem stem = buffer.remove(0);
    termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength());
    termAtt.setLength(stem.getStemLength());

    if (!buffer.isEmpty()) {
      savedState = captureState();
    }

    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    buffer = null;
  }
}
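A sketch of plugging the filter into a Lucene 4.x analysis chain (not part of this commit; WhitespaceTokenizer is only a stand-in tokenizer, and imports of Analyzer, Tokenizer, Reader, and Version are elided):

static Analyzer hunspellAnalyzer(final Dictionary dictionary) {
  return new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader);
      // the two-arg constructor defaults to dedup=true and a recursion cap of 2
      return new TokenStreamComponents(source, new Hunspell2StemFilter(source, dictionary));
    }
  };
}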
@@ -0,0 +1,80 @@
package org.apache.lucene.analysis.hunspell2;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * TokenFilterFactory that creates instances of {@link Hunspell2StemFilter}.
 * Example config for British English:
 * <pre class="prettyprint">
 * &lt;filter class="solr.Hunspell2StemFilterFactory"
 *         dictionary="en_GB.dic"
 *         affix="en_GB.aff" /&gt;</pre>
 * Both parameters dictionary and affix are mandatory.
 * Dictionaries for many languages are available through the OpenOffice project.
 *
 * See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
 * @lucene.experimental
 */
public class Hunspell2StemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
  private static final String PARAM_DICTIONARY = "dictionary";
  private static final String PARAM_AFFIX = "affix";
  private static final String PARAM_RECURSION_CAP = "recursionCap";

  private final String dictionaryFile;
  private final String affixFile;
  private Dictionary dictionary;
  private int recursionCap;

  /** Creates a new Hunspell2StemFilterFactory */
  public Hunspell2StemFilterFactory(Map<String,String> args) {
    super(args);
    dictionaryFile = require(args, PARAM_DICTIONARY);
    affixFile = get(args, PARAM_AFFIX);
    recursionCap = getInt(args, PARAM_RECURSION_CAP, 2);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public void inform(ResourceLoader loader) throws IOException {
    try (InputStream affix = loader.openResource(affixFile);
         InputStream dictionary = loader.openResource(dictionaryFile)) {
      try {
        this.dictionary = new Dictionary(affix, dictionary);
      } catch (ParseException e) {
        throw new RuntimeException(e);
      }
    }
  }

  @Override
  public TokenStream create(TokenStream tokenStream) {
    return new Hunspell2StemFilter(tokenStream, dictionary, recursionCap);
  }
}
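A sketch of driving the factory programmatically (not part of this commit; the resource names are hypothetical and are resolved through the ResourceLoader). Note that the constructor consumes entries from the args map, so it must be mutable:

static TokenStream stemmed(TokenStream input) throws IOException {
  Map<String,String> args = new HashMap<>();
  args.put("dictionary", "en_GB.dic");
  args.put("affix", "en_GB.aff");
  Hunspell2StemFilterFactory factory = new Hunspell2StemFilterFactory(args);
  factory.inform(new ClasspathResourceLoader(Hunspell2StemFilterFactory.class));
  return factory.create(input);
}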
@@ -0,0 +1,60 @@
package org.apache.lucene.analysis.hunspell2;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;

import org.apache.lucene.util.IOUtils;

// many hunspell dictionaries use this encoding, yet java does not have it?!?!
final class ISO8859_14Decoder extends CharsetDecoder {

  static final char TABLE[] = new char[] {
    0x00A0, 0x1E02, 0x1E03, 0x00A3, 0x010A, 0x010B, 0x1E0A, 0x00A7,
    0x1E80, 0x00A9, 0x1E82, 0x1E0B, 0x1EF2, 0x00AD, 0x00AE, 0x0178,
    0x1E1E, 0x1E1F, 0x0120, 0x0121, 0x1E40, 0x1E41, 0x00B6, 0x1E56,
    0x1E81, 0x1E57, 0x1E83, 0x1E60, 0x1EF3, 0x1E84, 0x1E85, 0x1E61,
    0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
    0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
    0x0174, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x1E6A,
    0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x0176, 0x00DF,
    0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
    0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
    0x0175, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x1E6B,
    0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x0177, 0x00FF
  };

  ISO8859_14Decoder() {
    // the superclass requires a Charset argument; UTF-8 is only a placeholder
    // here, since the actual decoding is done byte-by-byte via TABLE below
    super(IOUtils.CHARSET_UTF_8, 1f, 1f);
  }

  @Override
  protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
    while (in.hasRemaining() && out.hasRemaining()) {
      char ch = (char) (in.get() & 0xff);
      if (ch >= 0xA0) {
        ch = TABLE[ch - 0xA0];
      }
      out.put(ch);
    }
    return in.hasRemaining() ? CoderResult.OVERFLOW : CoderResult.UNDERFLOW;
  }
}
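A decoding sketch (not part of this commit; CharacterCodingException handling elided, and same-package access assumed): bytes below 0xA0 pass through as Latin-1, anything above is remapped through TABLE.

CharsetDecoder decoder = new ISO8859_14Decoder();
CharBuffer decoded = decoder.decode(ByteBuffer.wrap(new byte[] { 0x41, (byte) 0xA1 }));
// decoded is "A\u1E02": 0x41 passes through, 0xA1 maps to TABLE[1] (LATIN CAPITAL LETTER B WITH DOT ABOVE)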
@@ -0,0 +1,288 @@
|
||||||
|
package org.apache.lucene.analysis.hunspell2;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word. It
|
||||||
|
* conforms to the algorithm in the original hunspell algorithm, including recursive suffix stripping.
|
||||||
|
*/
|
||||||
|
final class Stemmer {
|
||||||
|
private final int recursionCap;
|
||||||
|
private final Dictionary dictionary;
|
||||||
|
private BytesRef scratch = new BytesRef();
|
||||||
|
private final StringBuilder segment = new StringBuilder();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructs a new Stemmer which will use the provided Dictionary to create its stems. Uses the
|
||||||
|
* default recursion cap of <code>2</code> (based on Hunspell documentation).
|
||||||
|
*
|
||||||
|
* @param dictionary Dictionary that will be used to create the stems
|
||||||
|
*/
|
||||||
|
public Stemmer(Dictionary dictionary) {
|
||||||
|
this(dictionary, 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Constructs a new Stemmer which will use the provided Dictionary to create its stems.
|
||||||
|
*
|
||||||
|
* @param dictionary Dictionary that will be used to create the stems
|
||||||
|
* @param recursionCap maximum level of recursion stemmer can go into
|
||||||
|
*/
|
||||||
|
public Stemmer(Dictionary dictionary, int recursionCap) {
|
||||||
|
this.dictionary = dictionary;
|
||||||
|
this.recursionCap = recursionCap;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find the stem(s) of the provided word.
|
||||||
|
*
|
||||||
|
* @param word Word to find the stems for
|
||||||
|
* @return List of stems for the word
|
||||||
|
*/
|
||||||
|
public List<Stem> stem(String word) {
|
||||||
|
return stem(word.toCharArray(), word.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find the stem(s) of the provided word
|
||||||
|
*
|
||||||
|
* @param word Word to find the stems for
|
||||||
|
* @return List of stems for the word
|
||||||
|
*/
|
||||||
|
public List<Stem> stem(char word[], int length) {
|
||||||
|
List<Stem> stems = new ArrayList<Stem>();
|
||||||
|
if (dictionary.lookupWord(word, 0, length, scratch) != null) {
|
||||||
|
stems.add(new Stem(word, length));
|
||||||
|
}
|
||||||
|
stems.addAll(stem(word, length, null, 0));
|
||||||
|
return stems;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find the unique stem(s) of the provided word
|
||||||
|
*
|
||||||
|
* @param word Word to find the stems for
|
||||||
|
* @return List of stems for the word
|
||||||
|
*/
|
||||||
|
public List<Stem> uniqueStems(char word[], int length) {
|
||||||
|
List<Stem> stems = new ArrayList<Stem>();
|
||||||
|
CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, false);
|
||||||
|
if (dictionary.lookupWord(word, 0, length, scratch) != null) {
|
||||||
|
stems.add(new Stem(word, length));
|
||||||
|
terms.add(word);
|
||||||
|
}
|
||||||
|
List<Stem> otherStems = stem(word, length, null, 0);
|
||||||
|
for (Stem s : otherStems) {
|
||||||
|
if (!terms.contains(s.stem)) {
|
||||||
|
stems.add(s);
|
||||||
|
terms.add(s.stem);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return stems;
|
||||||
|
}
|
||||||
|
|
||||||
|
// ================================================= Helper Methods ================================================
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generates a list of stems for the provided word
|
||||||
|
*
|
||||||
|
* @param word Word to generate the stems for
|
||||||
|
* @param flags Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step
|
||||||
|
* @param recursionDepth Level of recursion this stemming step is at
|
||||||
|
* @return List of stems, or empty list if no stems are found
|
||||||
|
*/
|
||||||
|
private List<Stem> stem(char word[], int length, char[] flags, int recursionDepth) {
|
||||||
|
List<Stem> stems = new ArrayList<Stem>();
|
||||||
|
|
||||||
|
for (int i = 0; i < length; i++) {
|
||||||
|
List<Affix> suffixes = dictionary.lookupSuffix(word, i, length - i);
|
||||||
|
if (suffixes == null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (Affix suffix : suffixes) {
|
||||||
|
if (hasCrossCheckedFlag(suffix.getFlag(), flags)) {
|
||||||
|
int deAffixedLength = length - suffix.getAppend().length();
|
||||||
|
// TODO: can we do this in-place?
|
||||||
|
String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString();
|
||||||
|
|
||||||
|
List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
|
||||||
|
for (Stem stem : stemList) {
|
||||||
|
stem.addSuffix(suffix);
|
||||||
|
}
|
||||||
|
|
||||||
|
stems.addAll(stemList);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = length - 1; i >= 0; i--) {
|
||||||
|
List<Affix> prefixes = dictionary.lookupPrefix(word, 0, i);
|
||||||
|
if (prefixes == null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (Affix prefix : prefixes) {
|
||||||
|
if (hasCrossCheckedFlag(prefix.getFlag(), flags)) {
|
||||||
|
int deAffixedStart = prefix.getAppend().length();
|
||||||
|
int deAffixedLength = length - deAffixedStart;
|
||||||
|
|
||||||
|
String strippedWord = new StringBuilder().append(prefix.getStrip())
|
||||||
|
.append(word, deAffixedStart, deAffixedLength)
|
||||||
|
.toString();
|
||||||
|
|
||||||
|
List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
|
||||||
|
for (Stem stem : stemList) {
|
||||||
|
stem.addPrefix(prefix);
|
||||||
|
}
|
||||||
|
|
||||||
|
stems.addAll(stemList);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return stems;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Applies the affix rule to the given word, producing a list of stems if any are found
|
||||||
|
*
|
||||||
|
* @param strippedWord Word the affix has been removed and the strip added
|
||||||
|
* @param affix HunspellAffix representing the affix rule itself
|
||||||
|
* @param recursionDepth Level of recursion this stemming step is at
|
||||||
|
* @return List of stems for the word, or an empty list if none are found
|
||||||
|
*/
|
||||||
|
public List<Stem> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) {
|
||||||
|
segment.setLength(0);
|
||||||
|
segment.append(strippedWord, 0, length);
|
||||||
|
if (!affix.checkCondition(segment)) {
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
|
List<Stem> stems = new ArrayList<Stem>();
|
||||||
|
|
||||||
|
char wordFlags[] = dictionary.lookupWord(strippedWord, 0, length, scratch);
|
||||||
|
if (wordFlags != null && Dictionary.hasFlag(wordFlags, affix.getFlag())) {
|
||||||
|
stems.add(new Stem(strippedWord, length));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (affix.isCrossProduct() && recursionDepth < recursionCap) {
|
||||||
|
      stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), ++recursionDepth));
    }

    return stems;
  }

  /**
   * Checks if the given flag cross checks with the given array of flags
   *
   * @param flag Flag to cross check with the array of flags
   * @param flags Array of flags to cross check against. Can be {@code null}
   * @return {@code true} if the flag is found in the array or the array is {@code null}, {@code false} otherwise
   */
  private boolean hasCrossCheckedFlag(char flag, char[] flags) {
    return flags == null || Arrays.binarySearch(flags, flag) >= 0;
  }

  /**
   * Stem represents all information known about a stem of a word. This includes the stem, and the prefixes and suffixes
   * that were used to change the word into the stem.
   */
  public static class Stem {

    private final List<Affix> prefixes = new ArrayList<Affix>();
    private final List<Affix> suffixes = new ArrayList<Affix>();
    private final char stem[];
    private final int stemLength;

    /**
     * Creates a new Stem wrapping the given word stem
     *
     * @param stem Stem of a word
     * @param stemLength Valid length of the stem in the {@code stem} array
     */
    public Stem(char stem[], int stemLength) {
      this.stem = stem;
      this.stemLength = stemLength;
    }

    /**
     * Adds a prefix to the list of prefixes used to generate this stem. Because it is assumed that prefixes are added
     * depth first, the prefix is added to the front of the list
     *
     * @param prefix Prefix to add to the list of prefixes for this stem
     */
    public void addPrefix(Affix prefix) {
      prefixes.add(0, prefix);
    }

    /**
     * Adds a suffix to the list of suffixes used to generate this stem. Because it is assumed that suffixes are added
     * depth first, the suffix is added to the end of the list
     *
     * @param suffix Suffix to add to the list of suffixes for this stem
     */
    public void addSuffix(Affix suffix) {
      suffixes.add(suffix);
    }

    /**
     * Returns the list of prefixes used to generate the stem
     *
     * @return List of prefixes used to generate the stem or an empty list if no prefixes were required
     */
    public List<Affix> getPrefixes() {
      return prefixes;
    }

    /**
     * Returns the list of suffixes used to generate the stem
     *
     * @return List of suffixes used to generate the stem or an empty list if no suffixes were required
     */
    public List<Affix> getSuffixes() {
      return suffixes;
    }

    /**
     * Returns the text of the word's stem.
     *
     * @see #getStemLength()
     */
    public char[] getStem() {
      return stem;
    }

    /** Returns the valid length of the text in {@link #getStem()} */
    public int getStemLength() {
      return stemLength;
    }

    /** Only use this if you really need a string (e.g. for testing) */
    public String getStemString() {
      return new String(stem, 0, stemLength);
    }
  }
}
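To make the Stem bookkeeping above concrete, here is a minimal sketch (placed in the same package since visibility is package-scoped in places; Dictionary, Stemmer, and the fixture names simple.aff/simple.dic are all taken from the tests later in this commit) that stems one word and inspects each result:

package org.apache.lucene.analysis.hunspell2;

import java.io.InputStream;

public class StemSketch {
  public static void main(String[] args) throws Exception {
    try (InputStream affix = StemSketch.class.getResourceAsStream("simple.aff");
         InputStream words = StemSketch.class.getResourceAsStream("simple.dic")) {
      Stemmer stemmer = new Stemmer(new Dictionary(affix, words));
      // "lucene" matches both as a bare dictionary entry and via an 'A' suffix rule,
      // so two stems come back: "lucene" and "lucen"
      for (Stemmer.Stem stem : stemmer.stem("lucene")) {
        System.out.println(stem.getStemString()
            + " (prefixes=" + stem.getPrefixes().size()
            + ", suffixes=" + stem.getSuffixes().size() + ")");
      }
    }
  }
}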
@ -0,0 +1,26 @@
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html>
<body>
Stemming TokenFilter using a Java implementation of the
<a href="http://www.ldc.upenn.edu/Catalog/docs/LDC2008T01/acta04.pdf">Hunspell stemming algorithm</a>.
<p>
Dictionaries can be found on
<a href="http://wiki.services.openoffice.org/wiki/Dictionaries">OpenOffice's wiki</a>.
</p>
</body>
</html>
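A minimal sketch of wiring the filter into an Analyzer, shaped after the anonymous Analyzer in TestHunspell2StemFilter later in this commit (MockTokenizer is a test-framework tokenizer standing in for a production one; the recursion cap is the third constructor argument, which the tests draw from the range [1,3]):

package org.apache.lucene.analysis.hunspell2;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;

public class HunspellAnalyzerSketch extends Analyzer {
  private final Dictionary dictionary;

  public HunspellAnalyzerSketch(Dictionary dictionary) {
    this.dictionary = dictionary;
  }

  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    // 2 caps how many times affix rules may be applied recursively to one token
    return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, 2));
  }
}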
@ -51,6 +51,7 @@ org.apache.lucene.analysis.hi.HindiNormalizationFilterFactory
org.apache.lucene.analysis.hi.HindiStemFilterFactory
org.apache.lucene.analysis.hu.HungarianLightStemFilterFactory
org.apache.lucene.analysis.hunspell.HunspellStemFilterFactory
org.apache.lucene.analysis.hunspell2.Hunspell2StemFilterFactory
org.apache.lucene.analysis.id.IndonesianStemFilterFactory
org.apache.lucene.analysis.in.IndicNormalizationFilterFactory
org.apache.lucene.analysis.it.ItalianLightStemFilterFactory
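Because of the SPI registration above, the factory can be resolved by its short name "Hunspell2Stem"; a sketch assuming the Lucene 4.x-era TokenFilterFactory.forName(String, Map) lookup (parameter names match the factory test later in this commit):

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.util.TokenFilterFactory;

public class FactoryLookupSketch {
  public static void main(String[] args) {
    Map<String,String> params = new HashMap<String,String>();
    params.put("dictionary", "simple.dic");
    params.put("affix", "simple.aff");
    // resolves to Hunspell2StemFilterFactory via the META-INF/services entry above
    TokenFilterFactory factory = TokenFilterFactory.forName("Hunspell2Stem", params);
    System.out.println(factory.getClass().getName());
  }
}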
@ -0,0 +1,205 @@
package org.apache.lucene.analysis.hunspell2;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.InputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import org.apache.lucene.analysis.hunspell.HunspellDictionary;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.RamUsageEstimator;

/**
 * Can be retrieved via:
 * wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/
 * Note some of the files differ only in case. This may be a problem on your operating system!
 */
//@Ignore("enable manually")
public class TestAllDictionaries extends LuceneTestCase {

  // set this to the location of where you downloaded all the files
  static final File DICTIONARY_HOME =
      new File("/Users/rmuir/hunspell/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries");

  final String tests[] = {
    /* zip file */ /* dictionary */ /* affix */
    "af_ZA.zip", "af_ZA.dic", "af_ZA.aff",
    "ak_GH.zip", "ak_GH.dic", "ak_GH.aff",
    "bg_BG.zip", "bg_BG.dic", "bg_BG.aff",
    "ca_ANY.zip", "catalan.dic", "catalan.aff",
    "ca_ES.zip", "ca_ES.dic", "ca_ES.aff",
    "cop_EG.zip", "cop_EG.dic", "cop_EG.aff",
    "cs_CZ.zip", "cs_CZ.dic", "cs_CZ.aff",
    "cy_GB.zip", "cy_GB.dic", "cy_GB.aff",
    "da_DK.zip", "da_DK.dic", "da_DK.aff",
    "de_AT.zip", "de_AT.dic", "de_AT.aff",
    "de_CH.zip", "de_CH.dic", "de_CH.aff",
    "de_DE.zip", "de_DE.dic", "de_DE.aff",
    "de_DE_comb.zip", "de_DE_comb.dic", "de_DE_comb.aff",
    "de_DE_frami.zip", "de_DE_frami.dic", "de_DE_frami.aff",
    "de_DE_neu.zip", "de_DE_neu.dic", "de_DE_neu.aff",
    "el_GR.zip", "el_GR.dic", "el_GR.aff",
    "en_AU.zip", "en_AU.dic", "en_AU.aff",
    "en_CA.zip", "en_CA.dic", "en_CA.aff",
    "en_GB-oed.zip", "en_GB-oed.dic", "en_GB-oed.aff",
    "en_GB.zip", "en_GB.dic", "en_GB.aff",
    "en_NZ.zip", "en_NZ.dic", "en_NZ.aff",
    "eo.zip", "eo_l3.dic", "eo_l3.aff",
    "eo_EO.zip", "eo_EO.dic", "eo_EO.aff",
    "es_AR.zip", "es_AR.dic", "es_AR.aff",
    "es_BO.zip", "es_BO.dic", "es_BO.aff",
    "es_CL.zip", "es_CL.dic", "es_CL.aff",
    "es_CO.zip", "es_CO.dic", "es_CO.aff",
    "es_CR.zip", "es_CR.dic", "es_CR.aff",
    "es_CU.zip", "es_CU.dic", "es_CU.aff",
    "es_DO.zip", "es_DO.dic", "es_DO.aff",
    "es_EC.zip", "es_EC.dic", "es_EC.aff",
    "es_ES.zip", "es_ES.dic", "es_ES.aff",
    "es_GT.zip", "es_GT.dic", "es_GT.aff",
    "es_HN.zip", "es_HN.dic", "es_HN.aff",
    "es_MX.zip", "es_MX.dic", "es_MX.aff",
    "es_NEW.zip", "es_NEW.dic", "es_NEW.aff",
    "es_NI.zip", "es_NI.dic", "es_NI.aff",
    "es_PA.zip", "es_PA.dic", "es_PA.aff",
    "es_PE.zip", "es_PE.dic", "es_PE.aff",
    "es_PR.zip", "es_PR.dic", "es_PR.aff",
    "es_PY.zip", "es_PY.dic", "es_PY.aff",
    "es_SV.zip", "es_SV.dic", "es_SV.aff",
    "es_UY.zip", "es_UY.dic", "es_UY.aff",
    "es_VE.zip", "es_VE.dic", "es_VE.aff",
    "et_EE.zip", "et_EE.dic", "et_EE.aff",
    "fo_FO.zip", "fo_FO.dic", "fo_FO.aff",
    "fr_FR-1990_1-3-2.zip", "fr_FR-1990.dic", "fr_FR-1990.aff",
    "fr_FR-classique_1-3-2.zip", "fr_FR-classique.dic", "fr_FR-classique.aff",
    "fr_FR_1-3-2.zip", "fr_FR.dic", "fr_FR.aff",
    "fy_NL.zip", "fy_NL.dic", "fy_NL.aff",
    "ga_IE.zip", "ga_IE.dic", "ga_IE.aff",
    "gd_GB.zip", "gd_GB.dic", "gd_GB.aff",
    "gl_ES.zip", "gl_ES.dic", "gl_ES.aff",
    "gsc_FR.zip", "gsc_FR.dic", "gsc_FR.aff",
    "gu_IN.zip", "gu_IN.dic", "gu_IN.aff",
    "he_IL.zip", "he_IL.dic", "he_IL.aff",
    "hi_IN.zip", "hi_IN.dic", "hi_IN.aff",
    "hil_PH.zip", "hil_PH.dic", "hil_PH.aff",
    "hr_HR.zip", "hr_HR.dic", "hr_HR.aff",
    "hu_HU.zip", "hu_HU.dic", "hu_HU.aff",
    "hu_HU_comb.zip", "hu_HU.dic", "hu_HU.aff",
    "ia.zip", "ia.dic", "ia.aff",
    "id_ID.zip", "id_ID.dic", "id_ID.aff",
    "it_IT.zip", "it_IT.dic", "it_IT.aff",
    "ku_TR.zip", "ku_TR.dic", "ku_TR.aff",
    "la.zip", "la.dic", "la.aff",
    "lt_LT.zip", "lt_LT.dic", "lt_LT.aff",
    "lv_LV.zip", "lv_LV.dic", "lv_LV.aff",
    "mg_MG.zip", "mg_MG.dic", "mg_MG.aff",
    "mi_NZ.zip", "mi_NZ.dic", "mi_NZ.aff",
    "mk_MK.zip", "mk_MK.dic", "mk_MK.aff",
    "mos_BF.zip", "mos_BF.dic", "mos_BF.aff",
    "mr_IN.zip", "mr_IN.dic", "mr_IN.aff",
    "ms_MY.zip", "ms_MY.dic", "ms_MY.aff",
    "nb_NO.zip", "nb_NO.dic", "nb_NO.aff",
    "ne_NP.zip", "ne_NP.dic", "ne_NP.aff",
    "nl_NL.zip", "nl_NL.dic", "nl_NL.aff",
    "nl_med.zip", "nl_med.dic", "nl_med.aff",
    "nn_NO.zip", "nn_NO.dic", "nn_NO.aff",
    "nr_ZA.zip", "nr_ZA.dic", "nr_ZA.aff",
    "ns_ZA.zip", "ns_ZA.dic", "ns_ZA.aff",
    "ny_MW.zip", "ny_MW.dic", "ny_MW.aff",
    "oc_FR.zip", "oc_FR.dic", "oc_FR.aff",
    "pl_PL.zip", "pl_PL.dic", "pl_PL.aff",
    "pt_BR.zip", "pt_BR.dic", "pt_BR.aff",
    "pt_PT.zip", "pt_PT.dic", "pt_PT.aff",
    "ro_RO.zip", "ro_RO.dic", "ro_RO.aff",
    "ru_RU.zip", "ru_RU.dic", "ru_RU.aff",
    "ru_RU_ye.zip", "ru_RU_ie.dic", "ru_RU_ie.aff",
    "ru_RU_yo.zip", "ru_RU_yo.dic", "ru_RU_yo.aff",
    "rw_RW.zip", "rw_RW.dic", "rw_RW.aff",
    "sk_SK.zip", "sk_SK.dic", "sk_SK.aff",
    "sl_SI.zip", "sl_SI.dic", "sl_SI.aff",
    "sq_AL.zip", "sq_AL.dic", "sq_AL.aff",
    "ss_ZA.zip", "ss_ZA.dic", "ss_ZA.aff",
    "st_ZA.zip", "st_ZA.dic", "st_ZA.aff",
    "sv_SE.zip", "sv_SE.dic", "sv_SE.aff",
    "sw_KE.zip", "sw_KE.dic", "sw_KE.aff",
    "tet_ID.zip", "tet_ID.dic", "tet_ID.aff",
    "th_TH.zip", "th_TH.dic", "th_TH.aff",
    "tl_PH.zip", "tl_PH.dic", "tl_PH.aff",
    "tn_ZA.zip", "tn_ZA.dic", "tn_ZA.aff",
    "ts_ZA.zip", "ts_ZA.dic", "ts_ZA.aff",
    "uk_UA.zip", "uk_UA.dic", "uk_UA.aff",
    "ve_ZA.zip", "ve_ZA.dic", "ve_ZA.aff",
    "vi_VN.zip", "vi_VN.dic", "vi_VN.aff",
    "xh_ZA.zip", "xh_ZA.dic", "xh_ZA.aff",
    "zu_ZA.zip", "zu_ZA.dic", "zu_ZA.aff",
  };

  public void test() throws Exception {
    for (int i = 0; i < tests.length; i += 3) {
      File f = new File(DICTIONARY_HOME, tests[i]);
      assert f.exists();

      try (ZipFile zip = new ZipFile(f, IOUtils.CHARSET_UTF_8)) {
        ZipEntry dicEntry = zip.getEntry(tests[i+1]);
        assert dicEntry != null;
        ZipEntry affEntry = zip.getEntry(tests[i+2]);
        assert affEntry != null;

        // get ram from previous impl
        String oldRAM = "FAIL";
        try (InputStream dictionary = zip.getInputStream(dicEntry);
             InputStream affix = zip.getInputStream(affEntry)) {
          try {
            HunspellDictionary dic = new HunspellDictionary(affix, dictionary, TEST_VERSION_CURRENT);
            oldRAM = RamUsageEstimator.humanSizeOf(dic);
          } catch (Throwable t) {}
        }

        try (InputStream dictionary = zip.getInputStream(dicEntry);
             InputStream affix = zip.getInputStream(affEntry)) {
          Dictionary dic = new Dictionary(affix, dictionary);
          System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic));
        }
      }
    }
  }

  public void testOneDictionary() throws Exception {
    String toTest = "hu_HU.zip";
    for (int i = 0; i < tests.length; i++) {
      if (tests[i].equals(toTest)) {
        File f = new File(DICTIONARY_HOME, tests[i]);
        assert f.exists();

        try (ZipFile zip = new ZipFile(f, IOUtils.CHARSET_UTF_8)) {
          ZipEntry dicEntry = zip.getEntry(tests[i+1]);
          assert dicEntry != null;
          ZipEntry affEntry = zip.getEntry(tests[i+2]);
          assert affEntry != null;

          try (InputStream dictionary = zip.getInputStream(dicEntry);
               InputStream affix = zip.getInputStream(affEntry)) {
            Dictionary dic = new Dictionary(affix, dictionary);
          }
        }
      }
    }
  }
}
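The test above prints one line per dictionary comparing the old and new implementations' heap footprints; RamUsageEstimator.humanSizeOf (already used there) works on any object, so the measurement can be exercised standalone:

import org.apache.lucene.util.RamUsageEstimator;

public class SizeSketch {
  public static void main(String[] args) {
    // prints a human-readable estimate of retained heap, e.g. "8.1 KB"
    System.out.println(RamUsageEstimator.humanSizeOf(new long[1024]));
  }
}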
@ -0,0 +1,109 @@
package org.apache.lucene.analysis.hunspell2;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;

public class TestDictionary extends LuceneTestCase {

  public void testSimpleDictionary() throws Exception {
    InputStream affixStream = getClass().getResourceAsStream("simple.aff");
    InputStream dictStream = getClass().getResourceAsStream("simple.dic");

    Dictionary dictionary = new Dictionary(affixStream, dictStream);
    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
    char flags[] = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef());
    assertNotNull(flags);
    assertEquals(1, flags.length);
    assertEquals("Wrong number of flags for lucen", 1, dictionary.lookupWord(new char[]{'l', 'u', 'c', 'e', 'n'}, 0, 5, new BytesRef()).length);

    affixStream.close();
    dictStream.close();
  }

  public void testCompressedDictionary() throws Exception {
    InputStream affixStream = getClass().getResourceAsStream("compressed.aff");
    InputStream dictStream = getClass().getResourceAsStream("compressed.dic");

    Dictionary dictionary = new Dictionary(affixStream, dictStream);
    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
    assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef()).length);

    affixStream.close();
    dictStream.close();
  }

  // malformed rule causes ParseException
  public void testInvalidData() throws Exception {
    InputStream affixStream = getClass().getResourceAsStream("broken.aff");
    InputStream dictStream = getClass().getResourceAsStream("simple.dic");

    try {
      new Dictionary(affixStream, dictStream);
      fail("didn't get expected exception");
    } catch (ParseException expected) {
      assertEquals("The affix file contains a rule with less than five elements", expected.getMessage());
      assertEquals(23, expected.getErrorOffset());
    }

    affixStream.close();
    dictStream.close();
  }

  private class CloseCheckInputStream extends FilterInputStream {
    private boolean closed = false;

    public CloseCheckInputStream(InputStream delegate) {
      super(delegate);
    }

    @Override
    public void close() throws IOException {
      this.closed = true;
      super.close();
    }

    public boolean isClosed() {
      return this.closed;
    }
  }

  public void testResourceCleanup() throws Exception {
    CloseCheckInputStream affixStream = new CloseCheckInputStream(getClass().getResourceAsStream("compressed.aff"));
    CloseCheckInputStream dictStream = new CloseCheckInputStream(getClass().getResourceAsStream("compressed.dic"));

    new Dictionary(affixStream, dictStream);

    assertFalse(affixStream.isClosed());
    assertFalse(dictStream.isClosed());

    affixStream.close();
    dictStream.close();

    assertTrue(affixStream.isClosed());
    assertTrue(dictStream.isClosed());
  }
}
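As testInvalidData shows, a malformed affix rule surfaces as java.text.ParseException carrying an error offset; a caller-side sketch (same package as the Dictionary under test, so the package-level constructor is reachable):

package org.apache.lucene.analysis.hunspell2;

import java.io.InputStream;
import java.text.ParseException;

public class ParseErrorSketch {
  static void load(InputStream affix, InputStream words) throws Exception {
    try {
      new Dictionary(affix, words);
    } catch (ParseException e) {
      // getErrorOffset() identifies the offending input (23 in the test above)
      System.err.println("bad affix data at offset " + e.getErrorOffset() + ": " + e.getMessage());
    }
  }
}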
@ -0,0 +1,87 @@
package org.apache.lucene.analysis.hunspell2;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;

public class TestHunspell2StemFilter extends BaseTokenStreamTestCase {
  private static Dictionary dictionary;

  @BeforeClass
  public static void beforeClass() throws Exception {
    try (InputStream affixStream = TestStemmer.class.getResourceAsStream("simple.aff");
         InputStream dictStream = TestStemmer.class.getResourceAsStream("simple.dic")) {
      dictionary = new Dictionary(affixStream, dictStream);
    }
  }

  @AfterClass
  public static void afterClass() {
    dictionary = null;
  }

  /** Simple test for KeywordAttribute */
  public void testKeywordAttribute() throws IOException {
    MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome");
    tokenizer.setEnableChecks(true);
    Hunspell2StemFilter filter = new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3));
    assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1});

    // assert with keyword marker
    tokenizer = whitespaceMockTokenizer("lucene is awesome");
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true);
    filter = new Hunspell2StemFilter(new SetKeywordMarkerFilter(tokenizer, set), dictionary, TestUtil.nextInt(random(), 1, 3));
    assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
      }
    };
    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
  }

  public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new KeywordTokenizer();
        return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
      }
    };
    checkOneTerm(a, "", "");
  }
}
@ -0,0 +1,50 @@
package org.apache.lucene.analysis.hunspell2;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;

/**
 * Simple tests to ensure the Hunspell stemmer loads from factory
 */
public class TestHunspell2StemFilterFactory extends BaseTokenStreamFactoryTestCase {
  public void testStemming() throws Exception {
    Reader reader = new StringReader("abc");
    TokenStream stream = whitespaceMockTokenizer(reader);
    stream = tokenFilterFactory("Hunspell2Stem",
        "dictionary", "simple.dic",
        "affix", "simple.aff").create(stream);
    assertTokenStreamContents(stream, new String[] { "ab" });
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    try {
      tokenFilterFactory("Hunspell2Stem",
          "dictionary", "simple.dic",
          "bogusArg", "bogusValue");
      fail();
    } catch (IllegalArgumentException expected) {
      assertTrue(expected.getMessage().contains("Unknown parameters"));
    }
  }
}
@ -0,0 +1,105 @@
package org.apache.lucene.analysis.hunspell2;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.hunspell2.Stemmer.Stem;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;

import java.io.InputStream;
import java.util.Arrays;
import java.util.List;

public class TestStemmer extends LuceneTestCase {
  private static Stemmer stemmer;

  @BeforeClass
  public static void beforeClass() throws Exception {
    try (InputStream affixStream = TestStemmer.class.getResourceAsStream("simple.aff");
         InputStream dictStream = TestStemmer.class.getResourceAsStream("simple.dic")) {
      Dictionary dictionary = new Dictionary(affixStream, dictStream);
      stemmer = new Stemmer(dictionary);
    }
  }

  @AfterClass
  public static void afterClass() {
    stemmer = null;
  }

  public void testSimpleSuffix() {
    assertStemsTo("lucene", "lucene", "lucen");
    assertStemsTo("mahoute", "mahout");
  }

  public void testSimplePrefix() {
    assertStemsTo("solr", "olr");
  }

  public void testRecursiveSuffix() {
    assertStemsTo("abcd", "ab");
  }

  // all forms unmunched from dictionary
  public void testAllStems() {
    assertStemsTo("ab", "ab");
    assertStemsTo("abc", "ab");
    assertStemsTo("apach", "apach");
    assertStemsTo("apache", "apach");
    assertStemsTo("foo", "foo");
    assertStemsTo("food", "foo");
    assertStemsTo("foos", "foo");
    assertStemsTo("lucen", "lucen");
    assertStemsTo("lucene", "lucen", "lucene");
    assertStemsTo("mahout", "mahout");
    assertStemsTo("mahoute", "mahout");
    assertStemsTo("moo", "moo");
    assertStemsTo("mood", "moo");
    assertStemsTo("olr", "olr");
    assertStemsTo("solr", "olr");
  }

  // some bogus stuff that should not stem (empty lists)!
  public void testBogusStems() {
    assertStemsTo("abs");
    assertStemsTo("abe");
    assertStemsTo("sab");
    assertStemsTo("sapach");
    assertStemsTo("sapache");
    assertStemsTo("apachee");
    assertStemsTo("sfoo");
    assertStemsTo("sfoos");
    assertStemsTo("fooss");
    assertStemsTo("lucenee");
    assertStemsTo("solre");
  }

  private void assertStemsTo(String s, String... expected) {
    Arrays.sort(expected);

    List<Stem> stems = stemmer.stem(s);
    String actual[] = new String[stems.size()];
    for (int i = 0; i < actual.length; i++) {
      actual[i] = stems.get(i).getStemString();
    }
    Arrays.sort(actual);

    assertArrayEquals(expected, actual);
  }
}
@ -0,0 +1,24 @@
SET UTF-8
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

SFX A Y 3
SFX A 0 e n
SFX A 0 e t
SFX A 0 e h

SFX C Y 2
SFX C 0 d/C c
SFX C 0 c b

SFX D Y 1
SFX D 0 s o

SFX E Y 1
SFX E 0 d o

PFX B Y 1
PFX B 0 s o

#wrong rule (only 4 elements)
PFX A0 Y 1
PFX A0 0 a
@ -0,0 +1,29 @@
SET UTF-8
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

FLAG long

AF 5
AF AA
AF BB
AF CC
AF DD
AF EE

SFX AA Y 3
SFX AA 0 e n
SFX AA 0 e t
SFX AA 0 e h

SFX CC Y 2
SFX CC 0 d/3 c
SFX CC 0 c b

SFX DD Y 1
SFX DD 0 s o

SFX EE Y 1
SFX EE 0 d o

PFX BB Y 1
PFX BB 0 s o
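A gloss of the compression features this fixture exercises (my reading of the hunspell .aff format, not part of the file): FLAG long switches to two-character flags (AA, BB, ...); the AF block defines numbered aliases in declaration order, so alias 3 is CC; and a trailing /3 in a rule or .dic entry therefore attaches flag CC. For example, "SFX CC 0 d/3 c" appends "d" and re-attaches class CC, producing the recursive suffix chain the tests rely on, and "ab/3" in compressed.dic below is equivalent to "ab/C" in simple.dic.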
@ -0,0 +1,9 @@
6
ab/3
apach/1
foo/4
foo/5
lucen/1
lucene
mahout/1
olr/2
@ -0,0 +1,20 @@
SET UTF-8
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

SFX A Y 3
SFX A 0 e n
SFX A 0 e t
SFX A 0 e h

SFX C Y 2
SFX C 0 d/C c
SFX C 0 c b

SFX D Y 1
SFX D 0 s o

SFX E Y 1
SFX E 0 d o

PFX B Y 1
PFX B 0 s o
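A gloss of the rule syntax (my reading of the hunspell .aff format, not part of the fixture): "SFX A Y 3" opens suffix class A, allows cross-products with prefixes (Y), and announces 3 rules; "SFX A 0 e n" then reads strip nothing (0), append "e", condition: the stem ends in "n". Hence the .dic entry "lucen/A" below also covers the surface form "lucene", which is exactly what TestStemmer asserts above.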
@ -0,0 +1,10 @@
9
ab/C
apach/A
foo/D
foo/E
lucen/A
lucene
mahout/A
moo/E
olr/B