LUCENE-3414: Added Hunspell for Lucene

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1167467 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Christopher John Male 2011-09-10 06:00:39 +00:00
parent 397d68e080
commit e3172b9239
11 changed files with 1280 additions and 0 deletions

View File

@ -98,6 +98,9 @@ New Features
* SOLR-1057: Add PathHierarchyTokenizer that represents file path hierarchies as synonyms of
/something, /something/something, /something/something/else. (Ryan McKinley, Koji Sekiguchi)
* LUCENE-3414: Added HunspellStemFilter which uses a provided pure Java implementation of the
Hunspell algorithm. (Chris Male)
Build
* LUCENE-2413: All analyzers in contrib/analyzers and contrib/icu were moved to the

View File

@ -0,0 +1,157 @@
package org.apache.lucene.analysis.hunspell;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.regex.Pattern;
/**
* Wrapper class representing a hunspell affix
*/
public class HunspellAffix {
private String append; // the affix itself, what is appended
private char appendFlags[]; // continuation class flags
private String strip;
private String condition;
private Pattern conditionPattern;
private char flag;
private boolean crossProduct;
/**
* Checks whether the given text matches the conditional pattern on this affix
*
* @param text Text to check if it matches the affix's conditional pattern
* @return {@code true} if the text meets the condition, {@code false} otherwise
*/
public boolean checkCondition(CharSequence text) {
return conditionPattern.matcher(text).matches();
}
/**
* Returns the append defined for the affix
*
* @return Defined append
*/
public String getAppend() {
return append;
}
/**
* Sets the append defined for the affix
*
* @param append Defined append for the affix
*/
public void setAppend(String append) {
this.append = append;
}
/**
* Returns the flags defined for the affix append
*
* @return Flags defined for the affix append
*/
public char[] getAppendFlags() {
return appendFlags;
}
/**
* Sets the flags defined for the affix append
*
* @param appendFlags Flags defined for the affix append
*/
public void setAppendFlags(char[] appendFlags) {
this.appendFlags = appendFlags;
}
/**
* Returns the stripping characters defined for the affix
*
* @return Stripping characters defined for the affix
*/
public String getStrip() {
return strip;
}
/**
* Sets the stripping characters defined for the affix
*
* @param strip Stripping characters defined for the affix
*/
public void setStrip(String strip) {
this.strip = strip;
}
/**
* Returns the condition that must be met before the affix can be applied
*
* @return Condition that must be met before the affix can be applied
*/
public String getCondition() {
return condition;
}
/**
* Sets the condition that must be met before the affix can be applied
*
* @param condition Condition to be met before affix application
* @param pattern Condition as a regular expression pattern
*/
public void setCondition(String condition, String pattern) {
this.condition = condition;
this.conditionPattern = Pattern.compile(pattern);
}
/**
* Returns the affix flag
*
* @return Affix flag
*/
public char getFlag() {
return flag;
}
/**
* Sets the affix flag
*
* @param flag Affix flag
*/
public void setFlag(char flag) {
this.flag = flag;
}
/**
* Returns whether the affix is defined as cross product
*
* @return {@code true} if the affix is cross product, {@code false} otherwise
*/
public boolean isCrossProduct() {
return crossProduct;
}
/**
* Sets whether the affix is defined as cross product
*
* @param crossProduct Whether the affix is defined as cross product
*/
public void setCrossProduct(boolean crossProduct) {
this.crossProduct = crossProduct;
}
}

View File

@ -0,0 +1,411 @@
package org.apache.lucene.analysis.hunspell;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.util.Version;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * In-memory representation of a hunspell dictionary: the word list plus the
 * prefix and suffix rules parsed from a hunspell affix (.aff) file and one or
 * more dictionary (.dic) files.
 */
public class HunspellDictionary {

  // Shared sentinel for dictionary entries that carry no flags
  static final HunspellWord NOFLAGS = new HunspellWord();

  private static final String PREFIX_KEY = "PFX";
  private static final String SUFFIX_KEY = "SFX";
  private static final String FLAG_KEY = "FLAG";

  private static final String NUM_FLAG_TYPE = "num";
  private static final String UTF8_FLAG_TYPE = "UTF-8";
  private static final String LONG_FLAG_TYPE = "long";

  // String.format templates turning an affix condition into an anchored regex
  private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";

  private CharArrayMap<List<HunspellWord>> words;
  private CharArrayMap<List<HunspellAffix>> prefixes;
  private CharArrayMap<List<HunspellAffix>> suffixes;

  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
  private final Version version;

  /**
   * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files
   *
   * @param affix InputStream for reading the hunspell affix file
   * @param dictionary InputStream for reading the hunspell dictionary file
   * @param version Lucene Version
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException {
    this(affix, Arrays.asList(dictionary), version);
  }

  /**
   * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files
   *
   * @param affix InputStream for reading the hunspell affix file
   * @param dictionaries InputStreams for reading the hunspell dictionary file
   * @param version Lucene Version
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version) throws IOException, ParseException {
    this.version = version;
    // The encoding is declared at the top of the affix file and applies to the
    // affix file itself and to every dictionary file
    String encoding = getDictionaryEncoding(affix);
    CharsetDecoder decoder = getJavaEncoding(encoding);
    readAffixFile(affix, decoder);
    words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, false);
    for (InputStream dictionary : dictionaries) {
      readDictionaryFile(dictionary, decoder);
    }
  }

  /**
   * Looks up HunspellWords that match the String created from the given char array, offset and length
   *
   * @param word Char array to generate the String from
   * @param offset Offset in the char array that the String starts at
   * @param length Length from the offset that the String is
   * @return List of HunspellWords that match the generated String, or {@code null} if none are found
   */
  public List<HunspellWord> lookupWord(char word[], int offset, int length) {
    return words.get(word, offset, length);
  }

  /**
   * Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length
   *
   * @param word Char array to generate the String from
   * @param offset Offset in the char array that the String starts at
   * @param length Length from the offset that the String is
   * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
   */
  public List<HunspellAffix> lookupPrefix(char word[], int offset, int length) {
    return prefixes.get(word, offset, length);
  }

  /**
   * Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length
   *
   * @param word Char array to generate the String from
   * @param offset Offset in the char array that the String starts at
   * @param length Length from the offset that the String is
   * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
   */
  public List<HunspellAffix> lookupSuffix(char word[], int offset, int length) {
    return suffixes.get(word, offset, length);
  }

  /**
   * Reads the affix file through the provided InputStream, building up the prefix and suffix maps
   *
   * @param affixStream InputStream to read the content of the affix file from
   * @param decoder CharsetDecoder to decode the content of the file
   * @throws IOException Can be thrown while reading from the InputStream
   */
  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException {
    prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, false);
    suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, false);

    BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
    try {
      String line = null;
      while ((line = reader.readLine()) != null) {
        if (line.startsWith(PREFIX_KEY)) {
          parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
        } else if (line.startsWith(SUFFIX_KEY)) {
          parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
        } else if (line.startsWith(FLAG_KEY)) {
          // Assume that the FLAG line comes before any prefix or suffixes
          // Store the strategy so it can be used when parsing the dic file
          flagParsingStrategy = getFlagParsingStrategy(line);
        }
      }
    } finally {
      // close in finally so a malformed rule mid-file does not leak the stream
      reader.close();
    }
  }

  /**
   * Parses a specific affix rule putting the result into the provided affix map
   *
   * @param affixes Map where the result of the parsing will be put
   * @param header Header line of the affix rule
   * @param reader BufferedReader to read the content of the rule from
   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
   *                         pattern
   * @throws IOException Can be thrown while reading the rule
   */
  private void parseAffix(CharArrayMap<List<HunspellAffix>> affixes,
                          String header,
                          BufferedReader reader,
                          String conditionPattern) throws IOException {
    // header format: PFX/SFX flag cross-product-(Y|N) number-of-rule-lines
    String args[] = header.split("\\s+");

    boolean crossProduct = args[2].equals("Y");
    int numLines = Integer.parseInt(args[3]);

    for (int i = 0; i < numLines; i++) {
      String line = reader.readLine();
      // rule format: PFX/SFX flag strip affix[/flags] condition
      String ruleArgs[] = line.split("\\s+");

      HunspellAffix affix = new HunspellAffix();

      affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1]));
      // "0" is hunspell's marker for "strip nothing"
      affix.setStrip(ruleArgs[2].equals("0") ? "" : ruleArgs[2]);

      String affixArg = ruleArgs[3];

      int flagSep = affixArg.lastIndexOf('/');
      if (flagSep != -1) {
        // continuation flags after '/'; sorted so they can be binary-searched
        char appendFlags[] = flagParsingStrategy.parseFlags(affixArg.substring(flagSep + 1));
        Arrays.sort(appendFlags);
        affix.setAppendFlags(appendFlags);
        affix.setAppend(affixArg.substring(0, flagSep));
      } else {
        affix.setAppend(affixArg);
      }

      String condition = ruleArgs[4];
      affix.setCondition(condition, String.format(conditionPattern, condition));
      affix.setCrossProduct(crossProduct);

      List<HunspellAffix> list = affixes.get(affix.getAppend());
      if (list == null) {
        list = new ArrayList<HunspellAffix>();
        affixes.put(affix.getAppend(), list);
      }
      list.add(affix);
    }
  }

  /**
   * Parses the encoding specified in the affix file readable through the provided InputStream
   *
   * @param affix InputStream for reading the affix file
   * @return Encoding specified in the affix file
   * @throws IOException Can be thrown while reading from the InputStream
   * @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET <encoding>}
   */
  private String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
    final StringBuilder encoding = new StringBuilder();
    for (;;) {
      encoding.setLength(0);
      int ch;
      // read unbuffered, byte by byte, so the rest of the stream is left
      // exactly at the line after the SET declaration for the affix parser
      while ((ch = affix.read()) >= 0) {
        if (ch == '\n') {
          break;
        }
        if (ch != '\r') {
          encoding.append((char)ch);
        }
      }
      if (
          encoding.length() == 0 || encoding.charAt(0) == '#' ||
          // this test only at the end as ineffective but would allow lines only containing spaces:
          encoding.toString().trim().length() == 0
      ) {
        if (ch < 0) {
          throw new ParseException("Unexpected end of affix file.", 0);
        }
        continue;
      }
      // length guard: a short garbage line (e.g. "AB") must fall through to the
      // ParseException below, not throw StringIndexOutOfBoundsException
      if (encoding.length() >= 4 && "SET ".equals(encoding.substring(0, 4))) {
        // cleanup the encoding string, too (whitespace)
        return encoding.substring(4).trim();
      }
      throw new ParseException("The first non-comment line in the affix file must "+
          "be a 'SET charset', was: '" + encoding +"'", 0);
    }
  }

  /**
   * Retrieves the CharsetDecoder for the given encoding. Note, This isn't perfect as I think ISCII-DEVANAGARI and
   * MICROSOFT-CP1251 etc are allowed...
   *
   * @param encoding Encoding to retrieve the CharsetDecoder for
   * @return CharSetDecoder for the given encoding
   */
  private CharsetDecoder getJavaEncoding(String encoding) {
    Charset charset = Charset.forName(encoding);
    return charset.newDecoder();
  }

  /**
   * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definition line taken from the affix file
   *
   * @param flagLine Line containing the flag information
   * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition
   */
  private FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
    // skip past "FLAG " to the declared flag type
    String flagType = flagLine.substring(5);

    if (NUM_FLAG_TYPE.equals(flagType)) {
      return new NumFlagParsingStrategy();
    } else if (UTF8_FLAG_TYPE.equals(flagType)) {
      return new SimpleFlagParsingStrategy();
    } else if (LONG_FLAG_TYPE.equals(flagType)) {
      return new DoubleASCIIFlagParsingStrategy();
    }

    throw new IllegalArgumentException("Unknown flag type: " + flagType);
  }

  /**
   * Reads the dictionary file through the provided InputStream, building up the words map
   *
   * @param dictionary InputStream to read the dictionary file through
   * @param decoder CharsetDecoder used to decode the contents of the file
   * @throws IOException Can be thrown while reading from the file
   */
  private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder) throws IOException {
    BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder));
    // TODO: don't create millions of strings.
    String line = reader.readLine(); // first line is number of entries
    int numEntries = Integer.parseInt(line); // parsed to validate the header; the map grows on its own

    // TODO: the flags themselves can be double-chars (long) or also numeric
    // either way the trick is to encode them as char... but they must be parsed differently
    while ((line = reader.readLine()) != null) {
      String entry;
      HunspellWord wordForm;

      int flagSep = line.lastIndexOf('/');
      if (flagSep == -1) {
        wordForm = NOFLAGS;
        entry = line;
      } else {
        // note, there can be comments (morph description) after a flag.
        // we should really look for any whitespace
        int end = line.indexOf('\t', flagSep);
        if (end == -1)
          end = line.length();

        // flags sorted so lookups can use binary search
        wordForm = new HunspellWord(flagParsingStrategy.parseFlags(line.substring(flagSep + 1, end)));
        Arrays.sort(wordForm.getFlags());
        entry = line.substring(0, flagSep);
      }

      List<HunspellWord> entries = words.get(entry);
      if (entries == null) {
        entries = new ArrayList<HunspellWord>();
        words.put(entry, entries);
      }
      entries.add(wordForm);
    }
  }

  public Version getVersion() {
    return version;
  }

  /**
   * Abstraction of the process of parsing flags taken from the affix and dic files
   */
  private static abstract class FlagParsingStrategy {

    /**
     * Parses the given String into a single flag
     *
     * @param rawFlag String to parse into a flag
     * @return Parsed flag
     */
    char parseFlag(String rawFlag) {
      return parseFlags(rawFlag)[0];
    }

    /**
     * Parses the given String into multiple flags
     *
     * @param rawFlags String to parse into flags
     * @return Parsed flags
     */
    abstract char[] parseFlags(String rawFlags);
  }

  /**
   * Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as a individual flags.
   * Can be used with both the ASCII and UTF-8 flag types.
   */
  private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {
    /**
     * {@inheritDoc}
     */
    public char[] parseFlags(String rawFlags) {
      return rawFlags.toCharArray();
    }
  }

  /**
   * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form. In the case
   * of multiple flags, each number is separated by a comma.
   */
  private static class NumFlagParsingStrategy extends FlagParsingStrategy {
    /**
     * {@inheritDoc}
     */
    public char[] parseFlags(String rawFlags) {
      String[] rawFlagParts = rawFlags.trim().split(",");
      char[] flags = new char[rawFlagParts.length];

      for (int i = 0; i < rawFlagParts.length; i++) {
        // note, removing the trailing X/leading I for nepali... what is the rule here?!
        flags[i] = (char) Integer.parseInt(rawFlagParts[i].replaceAll("[^0-9]", ""));
      }

      return flags;
    }
  }

  /**
   * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes
   * must be combined into a single character.
   *
   * TODO (rmuir) test
   */
  private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy {
    /**
     * {@inheritDoc}
     */
    public char[] parseFlags(String rawFlags) {
      if (rawFlags.length() == 0) {
        return new char[0];
      }

      StringBuilder builder = new StringBuilder();
      // NOTE(review): assumes rawFlags has even length; an odd length throws
      // StringIndexOutOfBoundsException — confirm against real .aff inputs
      for (int i = 0; i < rawFlags.length(); i+=2) {
        char cookedFlag = (char) ((int) rawFlags.charAt(i) + (int) rawFlags.charAt(i + 1));
        builder.append(cookedFlag);
      }

      char flags[] = new char[builder.length()];
      builder.getChars(0, builder.length(), flags, 0);
      return flags;
    }
  }
}

View File

@ -0,0 +1,112 @@
package org.apache.lucene.analysis.hunspell;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.List;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.hunspell.HunspellStemmer.Stem;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
* TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple
* stems, this filter can emit multiple tokens for each consumed token
*/
/**
 * TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple
 * stems, this filter can emit multiple tokens for each consumed token
 */
public final class HunspellStemFilter extends TokenFilter {

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final HunspellStemmer stemmer;

  // Remaining stems of the current input token, emitted one per incrementToken() call
  private List<Stem> buffer;
  // Attribute state captured from the originating token, restored for each extra stem
  private State savedState;

  // When true, duplicate stems of a token are collapsed via uniqueStems()
  private final boolean dedup;

  /**
   * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
   * HunspellDictionary
   *
   * @param input TokenStream whose tokens will be stemmed
   * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
   */
  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary) {
    this(input, dictionary, true);
  }

  /**
   * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
   * HunspellDictionary
   *
   * @param input TokenStream whose tokens will be stemmed
   * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
   * @param dedup true if only unique terms should be output.
   */
  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup) {
    super(input);
    this.dedup = dedup;
    this.stemmer = new HunspellStemmer(dictionary);
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public boolean incrementToken() throws IOException {
    // First drain any stems left over from the previous input token; each is
    // emitted at the same position as the original token (position increment 0)
    if (buffer != null && !buffer.isEmpty()) {
      Stem nextStem = buffer.remove(0);

      // Restore the original token's full attribute state before overwriting
      // the term, so offsets/types stay consistent across the extra stems
      restoreState(savedState);
      posIncAtt.setPositionIncrement(0);
      termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength());
      termAtt.setLength(nextStem.getStemLength());
      return true;
    }

    if (!input.incrementToken()) {
      return false;
    }

    buffer = dedup ? stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length());

    if (buffer.isEmpty()) { // we do not know this word, return it unchanged
      return true;
    }

    // Emit the first stem now, replacing the token's term text
    Stem stem = buffer.remove(0);
    termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength());
    termAtt.setLength(stem.getStemLength());

    // Only capture state if more stems must be emitted on later calls
    if (!buffer.isEmpty()) {
      savedState = captureState();
    }

    return true;
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    // Drop any pending stems so a reused stream does not emit stale tokens
    buffer = null;
  }
}

View File

@ -0,0 +1,372 @@
package org.apache.lucene.analysis.hunspell;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.util.*;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/**
* HunspellStemmer uses the affix rules declared in the HunspellDictionary to generate one or more stems for a word. It
* conforms to the algorithm in the original hunspell algorithm, including recursive suffix stripping.
*/
public class HunspellStemmer {
private static final int RECURSION_CAP = 2;
private final HunspellDictionary dictionary;
private final StringBuilder segment = new StringBuilder();
/**
* Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
*
* @param dictionary HunspellDictionary that will be used to create the stems
*/
public HunspellStemmer(HunspellDictionary dictionary) {
this.dictionary = dictionary;
}
/**
* Find the stem(s) of the provided word
*
* @param word Word to find the stems for
* @return List of stems for the word
*/
public List<Stem> stem(String word) {
return stem(word.toCharArray(), word.length());
}
/**
* Find the stem(s) of the provided word
*
* @param word Word to find the stems for
* @return List of stems for the word
*/
public List<Stem> stem(char word[], int length) {
List<Stem> stems = new ArrayList<Stem>();
if (dictionary.lookupWord(word, 0, length) != null) {
stems.add(new Stem(word, length));
}
stems.addAll(stem(word, length, null, 0));
return stems;
}
/**
* Find the unique stem(s) of the provided word
*
* @param word Word to find the stems for
* @return List of stems for the word
*/
public List<Stem> uniqueStems(char word[], int length) {
List<Stem> stems = new ArrayList<Stem>();
CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, false);
if (dictionary.lookupWord(word, 0, length) != null) {
stems.add(new Stem(word, length));
terms.add(word);
}
List<Stem> otherStems = stem(word, length, null, 0);
for (Stem s : otherStems) {
if (!terms.contains(s.stem)) {
stems.add(s);
terms.add(s.stem);
}
}
return stems;
}
// ================================================= Helper Methods ================================================
/**
* Generates a list of stems for the provided word
*
* @param word Word to generate the stems for
* @param flags Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step
* @param recursionDepth Level of recursion this stemming step is at
* @return List of stems, pr an empty if no stems are found
*/
private List<Stem> stem(char word[], int length, char[] flags, int recursionDepth) {
List<Stem> stems = new ArrayList<Stem>();
for (int i = 0; i < length; i++) {
List<HunspellAffix> suffixes = dictionary.lookupSuffix(word, i, length - i);
if (suffixes == null) {
continue;
}
for (HunspellAffix suffix : suffixes) {
if (hasCrossCheckedFlag(suffix.getFlag(), flags)) {
int deAffixedLength = length - suffix.getAppend().length();
// TODO: can we do this in-place?
String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString();
List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
for (Stem stem : stemList) {
stem.addSuffix(suffix);
}
stems.addAll(stemList);
}
}
}
for (int i = length - 1; i >= 0; i--) {
List<HunspellAffix> prefixes = dictionary.lookupPrefix(word, 0, i);
if (prefixes == null) {
continue;
}
for (HunspellAffix prefix : prefixes) {
if (hasCrossCheckedFlag(prefix.getFlag(), flags)) {
int deAffixedStart = prefix.getAppend().length();
int deAffixedLength = length - deAffixedStart;
String strippedWord = new StringBuilder().append(prefix.getStrip())
.append(word, deAffixedStart, deAffixedLength)
.toString();
List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
for (Stem stem : stemList) {
stem.addPrefix(prefix);
}
stems.addAll(stemList);
}
}
}
return stems;
}
/**
* Applies the affix rule to the given word, producing a list of stems if any are found
*
* @param strippedWord Word the affix has been removed and the strip added
* @param affix HunspellAffix representing the affix rule itself
* @param recursionDepth Level of recursion this stemming step is at
* @return List of stems for the word, or an empty list if none are found
*/
@SuppressWarnings("unchecked")
public List<Stem> applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) {
segment.setLength(0);
segment.append(strippedWord, 0, length);
if (!affix.checkCondition(segment)) {
return Collections.EMPTY_LIST;
}
List<Stem> stems = new ArrayList<Stem>();
List<HunspellWord> words = dictionary.lookupWord(strippedWord, 0, length);
if (words != null) {
for (HunspellWord hunspellWord : words) {
if (hunspellWord.hasFlag(affix.getFlag())) {
stems.add(new Stem(strippedWord, length));
}
}
}
if (affix.isCrossProduct() && recursionDepth < RECURSION_CAP) {
stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), ++recursionDepth));
}
return stems;
}
/**
* Checks if the given flag cross checks with the given array of flags
*
* @param flag Flag to cross check with the array of flags
* @param flags Array of flags to cross check against. Can be {@code null}
* @return {@code true} if the flag is found in the array or the array is {@code null}, {@code false} otherwise
*/
private boolean hasCrossCheckedFlag(char flag, char[] flags) {
return flags == null || Arrays.binarySearch(flags, flag) >= 0;
}
/**
* Stem represents all information known about a stem of a word. This includes the stem, and the prefixes and suffixes
* that were used to change the word into the stem.
*/
public static class Stem {
private final List<HunspellAffix> prefixes = new ArrayList<HunspellAffix>();
private final List<HunspellAffix> suffixes = new ArrayList<HunspellAffix>();
private final char stem[];
private final int stemLength;
/**
* Creates a new Stem wrapping the given word stem
*
* @param stem Stem of a word
*/
public Stem(char stem[], int stemLength) {
this.stem = stem;
this.stemLength = stemLength;
}
/**
* Adds a prefix to the list of prefixes used to generate this stem. Because it is assumed that prefixes are added
* depth first, the prefix is added to the front of the list
*
* @param prefix Prefix to add to the list of prefixes for this stem
*/
public void addPrefix(HunspellAffix prefix) {
prefixes.add(0, prefix);
}
/**
* Adds a suffix to the list of suffixes used to generate this stem. Because it is assumed that suffixes are added
* depth first, the suffix is added to the end of the list
*
* @param suffix Suffix to add to the list of suffixes for this stem
*/
public void addSuffix(HunspellAffix suffix) {
suffixes.add(suffix);
}
/**
* Returns the list of prefixes used to generate the stem
*
* @return List of prefixes used to generate the stem or an empty list if no prefixes were required
*/
public List<HunspellAffix> getPrefixes() {
return prefixes;
}
/**
* Returns the list of suffixes used to generate the stem
*
* @return List of suffixes used to generate the stem or an empty list if no suffixes were required
*/
public List<HunspellAffix> getSuffixes() {
return suffixes;
}
/**
* Returns the actual word stem itself
*
* @return Word stem itself
*/
public char[] getStem() {
return stem;
}
/**
* @return the stemLength
*/
public int getStemLength() {
return stemLength;
}
public String getStemString() {
return new String(stem, 0, stemLength);
}
}
// ================================================= Entry Point ===================================================
/**
* HunspellStemmer entry point. Accepts two arguments: location of affix file and location of dic file
*
* @param args Program arguments. Should contain location of affix file and location of dic file
* @throws IOException Can be thrown while reading from the files
* @throws ParseException Can be thrown while parsing the files
*/
public static void main(String[] args) throws IOException, ParseException {
if (args.length != 2) {
System.out.println("usage: HunspellStemmer <affix location> <dic location>");
System.exit(1);
}
InputStream affixInputStream = new FileInputStream(args[0]);
InputStream dicInputStream = new FileInputStream(args[1]);
HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_40);
affixInputStream.close();
dicInputStream.close();
HunspellStemmer stemmer = new HunspellStemmer(dictionary);
Scanner scanner = new Scanner(System.in);
System.out.print("> ");
while (scanner.hasNextLine()) {
String word = scanner.nextLine();
if ("exit".equals(word)) {
break;
}
printStemResults(word, stemmer.stem(word.toCharArray(), word.length()));
System.out.print("> ");
}
}
/**
 * Prints the results of the stemming of a word
 *
 * @param originalWord Word that has been stemmed
 * @param stems Stems of the word
 */
private static void printStemResults(String originalWord, List<Stem> stems) {
  StringBuilder sb = new StringBuilder();
  sb.append("stem(").append(originalWord).append(")").append("\n");

  for (Stem candidate : stems) {
    sb.append("- ").append(candidate.getStem()).append(": ");

    // show each prefix that was peeled off, with any stripped characters it restored
    for (HunspellAffix prefix : candidate.getPrefixes()) {
      sb.append(prefix.getAppend()).append("+");
      if (hasText(prefix.getStrip())) {
        sb.append(prefix.getStrip()).append("-");
      }
    }

    sb.append(candidate.getStem());

    // show each suffix that was peeled off, mirrored on the right-hand side
    for (HunspellAffix suffix : candidate.getSuffixes()) {
      if (hasText(suffix.getStrip())) {
        sb.append("-").append(suffix.getStrip());
      }
      sb.append("+").append(suffix.getAppend());
    }
    sb.append("\n");
  }

  System.out.println(sb);
}
/**
 * Simple utility to check if the given String has any text
 *
 * @param str String to check if it has any text
 * @return {@code true} if the String has text, {@code false} otherwise
 */
private static boolean hasText(String str) {
  if (str == null) {
    return false;
  }
  return !str.isEmpty();
}
}

View File

@ -0,0 +1,60 @@
package org.apache.lucene.analysis.hunspell;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Arrays;
/**
 * A dictionary entry's set of Hunspell flags.  The flag array, when present, must already be
 * sorted in ascending order, because flag membership checks use binary search.
 */
public class HunspellWord {

  // Sorted flag characters, or null when the word carries no flags
  private final char[] flags;

  /**
   * Creates a new HunspellWord with no associated flags
   */
  public HunspellWord() {
    this(null);
  }

  /**
   * Constructs a new HunspellWord with the given flags
   *
   * @param flags Flags to associate with the word; expected to be sorted ascending
   */
  public HunspellWord(char[] flags) {
    this.flags = flags;
  }

  /**
   * Checks whether the word has the given flag associated with it
   *
   * @param flag Flag to check whether it is associated with the word
   * @return {@code true} if the flag is associated, {@code false} otherwise
   */
  public boolean hasFlag(char flag) {
    if (flags == null) {
      return false;
    }
    return Arrays.binarySearch(flags, flag) >= 0;
  }

  /**
   * Returns the flags associated with the word
   *
   * @return Flags associated with the word, or {@code null} if there are none
   */
  public char[] getFlags() {
    return flags;
  }
}

View File

@ -0,0 +1,26 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
Stemming TokenFilter using a Java implementation of the <a href="http://www.ldc.upenn.edu/Catalog/docs/LDC2008T01/acta04.pdf">
Hunspell stemming algorithm</a>.
<p>
Dictionaries can be found on <a href="http://wiki.services.openoffice.org/wiki/Dictionaries">
OpenOffice's wiki</a>
</p>
</body>
</html>

View File

@ -0,0 +1,44 @@
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.Version;
import org.junit.Test;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import static junit.framework.Assert.assertEquals;
public class HunspellDictionaryTest {

  /**
   * Loads the test affix and dic files and verifies that the expected affixes and words
   * were indexed by the dictionary.
   */
  @Test
  public void testHunspellDictionary_loadDicAff() throws IOException, ParseException {
    InputStream affixStream = getClass().getResourceAsStream("test.aff");
    InputStream dictStream = getClass().getResourceAsStream("test.dic");
    try {
      HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40);

      assertEquals(2, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
      assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
      assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
    } finally {
      // close the streams even when construction or an assertion fails
      affixStream.close();
      dictStream.close();
    }
  }
}

View File

@ -0,0 +1,76 @@
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.Version;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.util.List;
import static junit.framework.Assert.assertEquals;
public class HunspellStemmerTest {

  // Shared stemmer built once from the test affix/dic fixtures
  private static HunspellStemmer stemmer;

  /**
   * Builds the shared stemmer from the test.aff / test.dic resources.
   */
  @BeforeClass
  public static void beforeClass() throws IOException, ParseException {
    InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
    InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
    try {
      HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, Version.LUCENE_40);
      stemmer = new HunspellStemmer(dictionary);
    } finally {
      // close the streams even when dictionary construction fails
      affixStream.close();
      dictStream.close();
    }
  }

  /**
   * Verifies suffix stripping: "lucene" matches both as-is and with the 'e' suffix removed.
   */
  @Test
  public void testStem_simpleSuffix() {
    List<HunspellStemmer.Stem> stems = stemmer.stem("lucene");

    assertEquals(2, stems.size());
    assertEquals("lucene", stems.get(0).getStemString());
    assertEquals("lucen", stems.get(1).getStemString());

    stems = stemmer.stem("mahoute");
    assertEquals(1, stems.size());
    assertEquals("mahout", stems.get(0).getStemString());
  }

  /**
   * Verifies prefix stripping: "solr" stems to "olr" via the 's' prefix rule.
   */
  @Test
  public void testStem_simplePrefix() {
    List<HunspellStemmer.Stem> stems = stemmer.stem("solr");

    assertEquals(1, stems.size());
    assertEquals("olr", stems.get(0).getStemString());
  }

  /**
   * Verifies recursive suffix application via the continuation class on the 'C' rules.
   */
  @Test
  public void testStem_recursiveSuffix() {
    List<HunspellStemmer.Stem> stems = stemmer.stem("abcd");

    assertEquals(1, stems.size());
    assertEquals("ab", stems.get(0).getStemString());
  }
}

View File

@ -0,0 +1,13 @@
SET UTF-8
TRY abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
SFX A Y 2
SFX A 0 e n
SFX A 0 e t
SFX C Y 2
SFX C 0 d/C c
SFX C 0 c b
PFX B Y 1
PFX B 0 s o

View File

@ -0,0 +1,6 @@
5
lucen/A
lucene
mahout/A
olr/B
ab/C