LUCENE-5468: commit current state

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571137 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2014-02-24 04:41:03 +00:00
parent 6a4e1e3a92
commit 2e0fc562bc
18 changed files with 2005 additions and 0 deletions

View File

@ -0,0 +1,157 @@
package org.apache.lucene.analysis.hunspell2;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.regex.Pattern;
/**
* Wrapper class representing a hunspell affix
*/
final class Affix {
private String append; // the affix itself, what is appended
private char appendFlags[]; // continuation class flags
private String strip;
private String condition;
private Pattern conditionPattern;
private char flag;
private boolean crossProduct;
/**
* Checks whether the given text matches the conditional pattern on this affix
*
* @param text Text to check if it matches the affix's conditional pattern
* @return {@code true} if the text meets the condition, {@code false} otherwise
*/
public boolean checkCondition(CharSequence text) {
return conditionPattern.matcher(text).matches();
}
/**
* Returns the append defined for the affix
*
* @return Defined append
*/
public String getAppend() {
return append;
}
/**
* Sets the append defined for the affix
*
* @param append Defined append for the affix
*/
public void setAppend(String append) {
this.append = append;
}
/**
* Returns the flags defined for the affix append
*
* @return Flags defined for the affix append
*/
public char[] getAppendFlags() {
return appendFlags;
}
/**
* Sets the flags defined for the affix append
*
* @param appendFlags Flags defined for the affix append
*/
public void setAppendFlags(char[] appendFlags) {
this.appendFlags = appendFlags;
}
/**
* Returns the stripping characters defined for the affix
*
* @return Stripping characters defined for the affix
*/
public String getStrip() {
return strip;
}
/**
* Sets the stripping characters defined for the affix
*
* @param strip Stripping characters defined for the affix
*/
public void setStrip(String strip) {
this.strip = strip;
}
/**
* Returns the condition that must be met before the affix can be applied
*
* @return Condition that must be met before the affix can be applied
*/
public String getCondition() {
return condition;
}
/**
* Sets the condition that must be met before the affix can be applied
*
* @param condition Condition to be met before affix application
* @param pattern Condition as a regular expression pattern
*/
public void setCondition(String condition, String pattern) {
this.condition = condition;
this.conditionPattern = Pattern.compile(pattern);
}
/**
* Returns the affix flag
*
* @return Affix flag
*/
public char getFlag() {
return flag;
}
/**
* Sets the affix flag
*
* @param flag Affix flag
*/
public void setFlag(char flag) {
this.flag = flag;
}
/**
* Returns whether the affix is defined as cross product
*
* @return {@code true} if the affix is cross product, {@code false} otherwise
*/
public boolean isCrossProduct() {
return crossProduct;
}
/**
* Sets whether the affix is defined as cross product
*
* @param crossProduct Whether the affix is defined as cross product
*/
public void setCrossProduct(boolean crossProduct) {
this.crossProduct = crossProduct;
}
}
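
A minimal sketch (from the same package) of how an Affix is populated and queried, mirroring what Dictionary.parseAffix does with one parsed rule line; the rule values here are hypothetical:

Affix suffix = new Affix();
suffix.setFlag('D');
suffix.setStrip("");                 // "0" in the .aff file means nothing is stripped
suffix.setAppend("d");
suffix.setCondition("e", ".*e");     // suffix conditions use the ".*%s" pattern
suffix.setCrossProduct(true);
boolean applies = suffix.checkCondition("agree"); // true: "agree" ends with "e"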

View File

@ -0,0 +1,606 @@
package org.apache.lucene.analysis.hunspell2;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
/**
* In-memory structure for the dictionary (.dic) and affix (.aff)
* data of a hunspell dictionary.
*/
public class Dictionary {
static final char[] NOFLAGS = new char[0];
private static final String ALIAS_KEY = "AF";
private static final String PREFIX_KEY = "PFX";
private static final String SUFFIX_KEY = "SFX";
private static final String FLAG_KEY = "FLAG";
private static final String NUM_FLAG_TYPE = "num";
private static final String UTF8_FLAG_TYPE = "UTF-8";
private static final String LONG_FLAG_TYPE = "long";
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
public CharArrayMap<List<Affix>> prefixes;
public CharArrayMap<List<Affix>> suffixes;
// the entries in the .dic file, mapping to their set of flags.
// the fst output is the ordinal for flagLookup
public FST<Long> words;
// the list of unique flagsets (wordforms). theoretically huge, but practically
// small (e.g. for Polish this is 756), otherwise humans wouldn't be able to deal with it either.
public BytesRefHash flagLookup = new BytesRefHash();
private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
private String[] aliases;
private int aliasCount = 0;
/**
* Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
* and dictionary files.
* You have to close the provided InputStreams yourself.
*
* @param affix InputStream for reading the hunspell affix file (won't be closed).
* @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
* @throws IOException Can be thrown while reading from the InputStreams
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
*/
public Dictionary(InputStream affix, InputStream dictionary) throws IOException, ParseException {
BufferedInputStream buffered = new BufferedInputStream(affix, 8192);
buffered.mark(8192);
String encoding = getDictionaryEncoding(buffered); // read through the buffered stream so reset() below rewinds the bytes consumed here
buffered.reset();
CharsetDecoder decoder = getJavaEncoding(encoding);
readAffixFile(buffered, decoder);
TreeMap<BytesRef,Integer> tempWords = new TreeMap<BytesRef,Integer>();
flagLookup.add(new BytesRef()); // no flags -> ord 0
readDictionaryFile(dictionary, decoder, tempWords);
PositiveIntOutputs o = PositiveIntOutputs.getSingleton();
Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE4, o); // nocommit: byte4
IntsRef scratchInts = new IntsRef();
for (Map.Entry<BytesRef,Integer> e : tempWords.entrySet()) {
UnicodeUtil.UTF8toUTF32(e.getKey(), scratchInts);
b.add(scratchInts, e.getValue().longValue());
}
words = b.finish();
}
/**
* Looks up the flags associated with the word formed from the given char array, offset and length
*
* @param word Char array holding the word
* @param offset Offset in the char array that the word starts at
* @param length Length of the word from the offset
* @param scratch Scratch BytesRef used when fetching the flags from flagLookup
* @return Flags associated with the word, or {@code null} if the word is not in the dictionary
*/
char[] lookupWord(char word[], int offset, int length, BytesRef scratch) {
Integer ord = null;
try {
ord = lookupOrd(word, offset, length);
} catch (IOException ex) { /* cannot happen: the FST is entirely in memory */ }
if (ord == null) {
return null;
}
return decodeFlags(flagLookup.get(ord, scratch));
}
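// Walks the FST one codepoint at a time (it was built with INPUT_TYPE.BYTE4 in the
// constructor), summing arc outputs along the path; the final output is the ordinal
// into flagLookup.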
public Integer lookupOrd(char word[], int offset, int length) throws IOException {
final FST.BytesReader bytesReader = words.getBytesReader();
final FST.Arc<Long> arc = words.getFirstArc(new FST.Arc<Long>());
// Accumulate output as we go
final Long NO_OUTPUT = words.outputs.getNoOutput();
Long output = NO_OUTPUT;
int l = offset + length;
for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) {
cp = Character.codePointAt(word, i, l);
if (words.findTargetArc(cp, arc, arc, bytesReader) == null) {
return null;
} else if (arc.output != NO_OUTPUT) {
output = words.outputs.add(output, arc.output);
}
}
if (words.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) {
return null;
} else if (arc.output != NO_OUTPUT) {
return words.outputs.add(output, arc.output).intValue();
} else {
return output.intValue();
}
}
/**
* Looks up Affix prefixes that have an append matching the String formed from the given char array, offset and length
*
* @param word Char array to generate the String from
* @param offset Offset in the char array that the String starts at
* @param length Length of the String from the offset
* @return List of Affix prefixes with an append that matches the String, or {@code null} if none are found
*/
public List<Affix> lookupPrefix(char word[], int offset, int length) {
return prefixes.get(word, offset, length);
}
/**
* Looks up Affix suffixes that have an append matching the String formed from the given char array, offset and length
*
* @param word Char array to generate the String from
* @param offset Offset in the char array that the String starts at
* @param length Length of the String from the offset
* @return List of Affix suffixes with an append that matches the String, or {@code null} if none are found
*/
List<Affix> lookupSuffix(char word[], int offset, int length) {
return suffixes.get(word, offset, length);
}
/**
* Reads the affix file through the provided InputStream, building up the prefix and suffix maps
*
* @param affixStream InputStream to read the content of the affix file from
* @param decoder CharsetDecoder to decode the content of the file
* @throws IOException Can be thrown while reading from the InputStream
*/
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
prefixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
suffixes = new CharArrayMap<List<Affix>>(Version.LUCENE_CURRENT, 8, false);
LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
String line = null;
while ((line = reader.readLine()) != null) {
if (line.startsWith(ALIAS_KEY)) {
parseAlias(line);
} else if (line.startsWith(PREFIX_KEY)) {
parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
} else if (line.startsWith(SUFFIX_KEY)) {
parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
} else if (line.startsWith(FLAG_KEY)) {
// Assume that the FLAG line comes before any prefix or suffixes
// Store the strategy so it can be used when parsing the dic file
flagParsingStrategy = getFlagParsingStrategy(line);
}
}
}
/**
* Parses a specific affix rule putting the result into the provided affix map
*
* @param affixes Map where the result of the parsing will be put
* @param header Header line of the affix rule
* @param reader BufferedReader to read the content of the rule from
* @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
* pattern
* @throws IOException Can be thrown while reading the rule
*/
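// For example (from the common en_US affix file), the header names the flag, the
// cross-product setting and the number of rule lines, and each rule line holds
// [keyword, flag, strip, append, condition]:
//   SFX D Y 4
//   SFX D   0     d          e
//   SFX D   y     ied        [^aeiou]y
//   ...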
private void parseAffix(CharArrayMap<List<Affix>> affixes,
String header,
LineNumberReader reader,
String conditionPattern) throws IOException, ParseException {
String args[] = header.split("\\s+");
boolean crossProduct = args[2].equals("Y");
int numLines = Integer.parseInt(args[3]);
for (int i = 0; i < numLines; i++) {
String line = reader.readLine();
String ruleArgs[] = line.split("\\s+");
if (ruleArgs.length < 5) {
throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber());
}
Affix affix = new Affix();
affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1]));
affix.setStrip(ruleArgs[2].equals("0") ? "" : ruleArgs[2]);
String affixArg = ruleArgs[3];
int flagSep = affixArg.lastIndexOf('/');
if (flagSep != -1) {
String flagPart = affixArg.substring(flagSep + 1);
if (aliasCount > 0) {
flagPart = getAliasValue(Integer.parseInt(flagPart));
}
char appendFlags[] = flagParsingStrategy.parseFlags(flagPart);
Arrays.sort(appendFlags);
affix.setAppendFlags(appendFlags);
affix.setAppend(affixArg.substring(0, flagSep));
} else {
affix.setAppend(affixArg);
}
String condition = ruleArgs[4];
// at least the gascon affix file has this issue
if (condition.startsWith("[") && !condition.endsWith("]")) {
condition = condition + "]";
}
// "dash hasn't got special meaning" (we must escape it)
if (condition.indexOf('-') >= 0) {
condition = condition.replace("-", "\\-");
}
affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition));
affix.setCrossProduct(crossProduct);
List<Affix> list = affixes.get(affix.getAppend());
if (list == null) {
list = new ArrayList<Affix>();
affixes.put(affix.getAppend(), list);
}
list.add(affix);
}
}
/**
* Parses the encoding specified in the affix file readable through the provided InputStream
*
* @param affix InputStream for reading the affix file
* @return Encoding specified in the affix file
* @throws IOException Can be thrown while reading from the InputStream
* @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET <encoding>}
*/
private String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
final StringBuilder encoding = new StringBuilder();
for (;;) {
encoding.setLength(0);
int ch;
while ((ch = affix.read()) >= 0) {
if (ch == '\n') {
break;
}
if (ch != '\r') {
encoding.append((char)ch);
}
}
if (
encoding.length() == 0 || encoding.charAt(0) == '#' ||
// this check is done last since it is the least efficient; it also catches lines containing only spaces:
encoding.toString().trim().length() == 0
) {
if (ch < 0) {
throw new ParseException("Unexpected end of affix file.", 0);
}
continue;
}
if (encoding.length() > 4 && "SET ".equals(encoding.substring(0, 4))) {
// cleanup the encoding string, too (whitespace)
return encoding.substring(4).trim();
}
}
}
static final Map<String,String> CHARSET_ALIASES;
static {
Map<String,String> m = new HashMap<>();
m.put("microsoft-cp1251", "windows-1251");
m.put("TIS620-2533", "TIS-620");
CHARSET_ALIASES = Collections.unmodifiableMap(m);
}
/**
* Retrieves the CharsetDecoder for the given encoding. Note: this isn't perfect; the format also
* allows encodings such as ISCII-DEVANAGARI (and case variants like MICROSOFT-CP1251) that are
* not fully handled here.
*
* @param encoding Encoding to retrieve the CharsetDecoder for
* @return CharsetDecoder for the given encoding
*/
private CharsetDecoder getJavaEncoding(String encoding) {
if ("ISO8859-14".equals(encoding)) {
return new ISO8859_14Decoder();
}
String canon = CHARSET_ALIASES.get(encoding);
if (canon != null) {
encoding = canon;
}
Charset charset = Charset.forName(encoding);
return charset.newDecoder().onMalformedInput(CodingErrorAction.REPLACE);
}
/**
* Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definition line taken from the affix file
*
* @param flagLine Line containing the flag information
* @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition
*/
private FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
String flagType = flagLine.substring(5);
if (NUM_FLAG_TYPE.equals(flagType)) {
return new NumFlagParsingStrategy();
} else if (UTF8_FLAG_TYPE.equals(flagType)) {
return new SimpleFlagParsingStrategy();
} else if (LONG_FLAG_TYPE.equals(flagType)) {
return new DoubleASCIIFlagParsingStrategy();
}
throw new IllegalArgumentException("Unknown flag type: " + flagType);
}
/**
* Reads the dictionary file through the provided InputStream, building up the words map
*
* @param dictionary InputStream to read the dictionary file through
* @param decoder CharsetDecoder used to decode the contents of the file
* @throws IOException Can be thrown while reading from the file
*/
private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder, TreeMap<BytesRef,Integer> words) throws IOException {
BytesRef flagsScratch = new BytesRef();
BytesRef flagsScratch2 = new BytesRef();
BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder));
// TODO: don't create millions of strings.
String line = reader.readLine(); // first line is number of entries
// sometimes the number of entries has a comment/copyright after it
line = line.replaceFirst("\\s*\\#.*$", "");
int numEntries = Integer.parseInt(line);
// TODO: the flags themselves can be double-chars (long) or also numeric
// either way the trick is to encode them as char... but they must be parsed differently
while ((line = reader.readLine()) != null) {
String entry;
char wordForm[];
int flagSep = line.lastIndexOf('/');
if (flagSep == -1) {
wordForm = NOFLAGS;
entry = line;
} else {
// note, there can be comments (morph description) after a flag.
// we should really look for any whitespace
int end = line.indexOf('\t', flagSep);
if (end == -1)
end = line.length();
String flagPart = line.substring(flagSep + 1, end);
if (aliasCount > 0) {
flagPart = getAliasValue(Integer.parseInt(flagPart));
}
wordForm = flagParsingStrategy.parseFlags(flagPart);
Arrays.sort(wordForm);
entry = line.substring(0, flagSep);
}
BytesRef scratch = new BytesRef(entry);
Integer existingOrd = words.get(scratch);
final char mergedEntries[];
if (existingOrd == null || existingOrd == 0) {
mergedEntries = wordForm;
} else {
flagLookup.get(existingOrd, flagsScratch2);
mergedEntries = merge(decodeFlags(flagsScratch2), wordForm);
}
final int hashCode = encodeFlagsWithHash(flagsScratch, mergedEntries);
int ord = flagLookup.add(flagsScratch, hashCode);
if (ord < 0) {
// already exists in our hash
ord = (-ord)-1;
}
words.put(scratch, ord);
}
}
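// Flags are serialized big-endian, two bytes per char: decodeFlags unpacks a BytesRef
// of length 2*n into n chars, and encodeFlagsWithHash writes the same layout while
// accumulating the same 31-based hash that BytesRef.hashCode() would produce, for
// passing to flagLookup.add.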
static char[] decodeFlags(BytesRef b) {
int len = b.length >>> 1;
char flags[] = new char[len];
int upto = 0;
int end = b.offset + b.length;
for (int i = b.offset; i < end; i += 2) {
flags[upto++] = (char)((b.bytes[i] << 8) | (b.bytes[i+1] & 0xff));
}
return flags;
}
static int encodeFlagsWithHash(BytesRef b, char flags[]) {
int hash = 0;
int len = flags.length << 1;
b.grow(len);
b.length = len;
int upto = b.offset;
for (int i = 0; i < flags.length; i++) {
int flag = flags[i];
hash = 31*hash + (b.bytes[upto++] = (byte) ((flag >> 8) & 0xff));
hash = 31*hash + (b.bytes[upto++] = (byte) (flag & 0xff));
}
return hash;
}
private void parseAlias(String line) {
String ruleArgs[] = line.split("\\s+");
if (aliases == null) {
//first line should be the aliases count
final int count = Integer.parseInt(ruleArgs[1]);
aliases = new String[count];
} else {
aliases[aliasCount++] = ruleArgs[1];
}
}
private String getAliasValue(int id) {
try {
return aliases[id - 1];
} catch (IndexOutOfBoundsException ex) {
throw new IllegalArgumentException("Bad flag alias number:" + id, ex);
}
}
/**
* Abstraction of the process of parsing flags taken from the affix and dic files
*/
private static abstract class FlagParsingStrategy {
/**
* Parses the given String into a single flag
*
* @param rawFlag String to parse into a flag
* @return Parsed flag
*/
char parseFlag(String rawFlag) {
return parseFlags(rawFlag)[0];
}
/**
* Parses the given String into multiple flags
*
* @param rawFlags String to parse into flags
* @return Parsed flags
*/
abstract char[] parseFlags(String rawFlags);
}
/**
* Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as individual flags.
* Can be used with both the ASCII and UTF-8 flag types.
*/
private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {
@Override
public char[] parseFlags(String rawFlags) {
return rawFlags.toCharArray();
}
}
/**
* Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form. In the case
* of multiple flags, each number is separated by a comma.
*/
private static class NumFlagParsingStrategy extends FlagParsingStrategy {
@Override
public char[] parseFlags(String rawFlags) {
String[] rawFlagParts = rawFlags.trim().split(",");
char[] flags = new char[rawFlagParts.length];
int upto = 0;
for (int i = 0; i < rawFlagParts.length; i++) {
// note, removing the trailing X/leading I for nepali... what is the rule here?!
String replacement = rawFlagParts[i].replaceAll("[^0-9]", "");
// note, ignoring empty flags (this happens in danish, for example)
if (replacement.isEmpty()) {
continue;
}
flags[upto++] = (char) Integer.parseInt(replacement);
}
if (upto < flags.length) {
flags = Arrays.copyOf(flags, upto);
}
return flags;
}
}
/**
* Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes
* must be combined into a single character.
*
* TODO (rmuir) test
*/
private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy {
@Override
public char[] parseFlags(String rawFlags) {
if (rawFlags.length() == 0) {
return new char[0];
}
StringBuilder builder = new StringBuilder();
for (int i = 0; i < rawFlags.length(); i+=2) {
// pack the two codes into one char; plain addition would make distinct flags such as "AB" and "BA" collide
char cookedFlag = (char) ((rawFlags.charAt(i) << 8) | (rawFlags.charAt(i + 1) & 0xff));
builder.append(cookedFlag);
}
char flags[] = new char[builder.length()];
builder.getChars(0, builder.length(), flags, 0);
return flags;
}
}
static boolean hasFlag(char flags[], char flag) {
return Arrays.binarySearch(flags, flag) >= 0;
}
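// Merges two sorted flag arrays into one sorted array without duplicates
// (both inputs were sorted with Arrays.sort at parse time).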
static char[] merge(char[] flags1, char[] flags2) {
char merged[] = new char[flags1.length + flags2.length];
int i1 = 0, i2 = 0;
int last = -1;
int upto = 0;
while (i1 < flags1.length && i2 < flags2.length) {
final char next;
if (flags1[i1] <= flags2[i2]) {
next = flags1[i1++];
} else {
next = flags2[i2++];
}
if (next != last) {
merged[upto++] = next;
last = next;
}
}
while (i1 < flags1.length) {
char next = flags1[i1++];
if (next != last) {
merged[upto++] = next;
last = next;
}
}
while (i2 < flags2.length) {
char next = flags2[i2++];
if (next != last) {
merged[upto++] = next;
last = next;
}
}
if (merged.length != upto) {
merged = Arrays.copyOf(merged, upto);
}
return merged;
}
}
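
A minimal loading-and-lookup sketch, assuming a caller in this package (lookupWord is package-private); the file paths are placeholders, and the caller keeps responsibility for closing the streams per the constructor contract:

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.apache.lucene.util.BytesRef;

class DictionaryUsageSketch {
  static void demo() throws Exception {
    try (InputStream affix = Files.newInputStream(Paths.get("en_GB.aff"));
         InputStream words = Files.newInputStream(Paths.get("en_GB.dic"))) {
      Dictionary dic = new Dictionary(affix, words);
      char target[] = "lucene".toCharArray();
      // returns the sorted flags for the word, or null if it is not in the dictionary
      char flags[] = dic.lookupWord(target, 0, target.length, new BytesRef());
      System.out.println(flags != null ? "known word" : "unknown word");
    }
  }
}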

View File

@ -0,0 +1,139 @@
package org.apache.lucene.analysis.hunspell2;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.List;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.hunspell2.Stemmer.Stem;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
* TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple
* stems, this filter can emit multiple tokens for each consumed token.
*
* <p>
* Note: This filter is aware of the {@link KeywordAttribute}. To prevent
* certain terms from being passed to the stemmer
* {@link KeywordAttribute#isKeyword()} should be set to <code>true</code>
* in a previous {@link TokenStream}.
*
* Note: For including the original term as well as the stemmed version, see
* {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory}
* </p>
*
* @lucene.experimental
*/
public final class Hunspell2StemFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
private final Stemmer stemmer;
private List<Stem> buffer;
private State savedState;
private final boolean dedup;
/** Create a {@link Hunspell2StemFilter} which deduplicates stems and has a maximum
* recursion level of 2.
* @see #Hunspell2StemFilter(TokenStream, Dictionary, int) */
public Hunspell2StemFilter(TokenStream input, Dictionary dictionary) {
this(input, dictionary, 2);
}
/**
* Creates a new Hunspell2StemFilter that will stem tokens from the given TokenStream using affix rules in the provided
* Dictionary
*
* @param input TokenStream whose tokens will be stemmed
* @param dictionary Dictionary containing the affix rules and words that will be used to stem the tokens
* @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
*/
public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, int recursionCap) {
this(input, dictionary, true, recursionCap);
}
/** Create a {@link Hunspell2StemFilter} which has a maximum recursion level of 2.
* @see #Hunspell2StemFilter(TokenStream, Dictionary, boolean, int) */
public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup) {
this(input, dictionary, dedup, 2);
}
/**
* Creates a new Hunspell2StemFilter that will stem tokens from the given TokenStream using affix rules in the provided
* Dictionary
*
* @param input TokenStream whose tokens will be stemmed
* @param dictionary Dictionary containing the affix rules and words that will be used to stem the tokens
* @param dedup true if only unique terms should be output.
* @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
*/
public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap) {
super(input);
this.dedup = dedup;
this.stemmer = new Stemmer(dictionary, recursionCap);
}
@Override
public boolean incrementToken() throws IOException {
if (buffer != null && !buffer.isEmpty()) {
Stem nextStem = buffer.remove(0);
restoreState(savedState);
posIncAtt.setPositionIncrement(0);
termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength());
termAtt.setLength(nextStem.getStemLength());
return true;
}
if (!input.incrementToken()) {
return false;
}
if (keywordAtt.isKeyword()) {
return true;
}
buffer = dedup ? stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length());
if (buffer.isEmpty()) { // we do not know this word, return it unchanged
return true;
}
Stem stem = buffer.remove(0);
termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength());
termAtt.setLength(stem.getStemLength());
if (!buffer.isEmpty()) {
savedState = captureState();
}
return true;
}
@Override
public void reset() throws IOException {
super.reset();
buffer = null;
}
}
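
A minimal sketch of wiring the filter into an analysis chain and draining it, assuming a loaded Dictionary and the 4.x WhitespaceTokenizer(Version, Reader) constructor; the expected output assumes the simple test dictionary used in the tests below:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.hunspell2.Dictionary;
import org.apache.lucene.analysis.hunspell2.Hunspell2StemFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

class StemFilterUsageSketch {
  static void demo(Dictionary dictionary) throws Exception {
    WhitespaceTokenizer tokenizer =
        new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("lucene is awesome"));
    TokenStream stream = new Hunspell2StemFilter(tokenizer, dictionary); // dedup, recursion cap 2
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(termAtt.toString()); // with simple.aff/simple.dic: lucene, lucen, is, awesome
    }
    stream.end();
    stream.close();
  }
}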

View File

@ -0,0 +1,80 @@
package org.apache.lucene.analysis.hunspell2;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* TokenFilterFactory that creates instances of {@link Hunspell2StemFilter}.
* Example config for British English:
* <pre class="prettyprint">
* &lt;filter class=&quot;solr.Hunspell2StemFilterFactory&quot;
* dictionary=&quot;en_GB.dic&quot;
* affix=&quot;en_GB.aff&quot; /&gt;</pre>
* Both parameters dictionary and affix are mandatory.
* Dictionaries for many languages are available through the OpenOffice project.
*
* See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
* @lucene.experimental
*/
public class Hunspell2StemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
private static final String PARAM_DICTIONARY = "dictionary";
private static final String PARAM_AFFIX = "affix";
private static final String PARAM_RECURSION_CAP = "recursionCap";
private final String dictionaryFile;
private final String affixFile;
private Dictionary dictionary;
private int recursionCap;
/** Creates a new Hunspell2StemFilterFactory */
public Hunspell2StemFilterFactory(Map<String,String> args) {
super(args);
dictionaryFile = require(args, PARAM_DICTIONARY);
affixFile = require(args, PARAM_AFFIX); // documented as mandatory, so enforce it like dictionary
recursionCap = getInt(args, PARAM_RECURSION_CAP, 2);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
@Override
public void inform(ResourceLoader loader) throws IOException {
try (InputStream affix = loader.openResource(affixFile);
InputStream dictionary = loader.openResource(dictionaryFile)) {
try {
this.dictionary = new Dictionary(affix, dictionary);
} catch (ParseException e) {
throw new RuntimeException(e);
}
}
}
@Override
public TokenStream create(TokenStream tokenStream) {
return new Hunspell2StemFilter(tokenStream, dictionary, recursionCap);
}
}
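
A sketch of programmatic factory use, assuming the dictionary and affix files are available as classpath resources (the resource names here are placeholders):

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;

class FactoryUsageSketch {
  static Hunspell2StemFilterFactory build() throws Exception {
    Map<String,String> args = new HashMap<>();
    args.put("dictionary", "en_GB.dic"); // placeholder resource names
    args.put("affix", "en_GB.aff");
    Hunspell2StemFilterFactory factory = new Hunspell2StemFilterFactory(args);
    factory.inform(new ClasspathResourceLoader(FactoryUsageSketch.class)); // loads and parses both files
    return factory;
  }
}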

View File

@ -0,0 +1,60 @@
package org.apache.lucene.analysis.hunspell2;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import org.apache.lucene.util.IOUtils;
// many hunspell dictionaries use this encoding, yet java does not have it?!?!
final class ISO8859_14Decoder extends CharsetDecoder {
static final char TABLE[] = new char[] {
0x00A0, 0x1E02, 0x1E03, 0x00A3, 0x010A, 0x010B, 0x1E0A, 0x00A7,
0x1E80, 0x00A9, 0x1E82, 0x1E0B, 0x1EF2, 0x00AD, 0x00AE, 0x0178,
0x1E1E, 0x1E1F, 0x0120, 0x0121, 0x1E40, 0x1E41, 0x00B6, 0x1E56,
0x1E81, 0x1E57, 0x1E83, 0x1E60, 0x1EF3, 0x1E84, 0x1E85, 0x1E61,
0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7,
0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
0x0174, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x1E6A,
0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x0176, 0x00DF,
0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7,
0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
0x0175, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x1E6B,
0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x0177, 0x00FF
};
ISO8859_14Decoder() {
super(IOUtils.CHARSET_UTF_8, 1f, 1f);
}
@Override
protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
while (in.hasRemaining() && out.hasRemaining()) {
char ch = (char) (in.get() & 0xff);
if (ch >= 0xA0) {
ch = TABLE[ch - 0xA0];
}
out.put(ch);
}
return in.hasRemaining() ? CoderResult.OVERFLOW : CoderResult.UNDERFLOW;
}
}
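
A small decoding sketch (from the same package, since the class is package-private); bytes below 0xA0 pass through unchanged and the rest are remapped through TABLE:

import java.nio.ByteBuffer;
import java.nio.CharBuffer;

class DecoderUsageSketch {
  static void demo() throws Exception {
    CharBuffer out = new ISO8859_14Decoder().decode(ByteBuffer.wrap(new byte[] { 'W', (byte) 0xD0 }));
    // 0xD0 - 0xA0 = 48, and TABLE[48] = U+0174 (LATIN CAPITAL LETTER W WITH CIRCUMFLEX)
    System.out.println(out); // prints "W" followed by U+0174
  }
}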

View File

@ -0,0 +1,288 @@
package org.apache.lucene.analysis.hunspell2;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
/**
* Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word. It
* conforms to the algorithm of the original hunspell tool, including recursive suffix stripping.
*/
final class Stemmer {
private final int recursionCap;
private final Dictionary dictionary;
private BytesRef scratch = new BytesRef();
private final StringBuilder segment = new StringBuilder();
/**
* Constructs a new Stemmer which will use the provided Dictionary to create its stems. Uses the
* default recursion cap of <code>2</code> (based on Hunspell documentation).
*
* @param dictionary Dictionary that will be used to create the stems
*/
public Stemmer(Dictionary dictionary) {
this(dictionary, 2);
}
/**
* Constructs a new Stemmer which will use the provided Dictionary to create its stems.
*
* @param dictionary Dictionary that will be used to create the stems
* @param recursionCap maximum level of recursion stemmer can go into
*/
public Stemmer(Dictionary dictionary, int recursionCap) {
this.dictionary = dictionary;
this.recursionCap = recursionCap;
}
/**
* Find the stem(s) of the provided word.
*
* @param word Word to find the stems for
* @return List of stems for the word
*/
public List<Stem> stem(String word) {
return stem(word.toCharArray(), word.length());
}
/**
* Find the stem(s) of the provided word
*
* @param word Word to find the stems for
* @return List of stems for the word
*/
public List<Stem> stem(char word[], int length) {
List<Stem> stems = new ArrayList<Stem>();
if (dictionary.lookupWord(word, 0, length, scratch) != null) {
stems.add(new Stem(word, length));
}
stems.addAll(stem(word, length, null, 0));
return stems;
}
/**
* Find the unique stem(s) of the provided word
*
* @param word Word to find the stems for
* @return List of stems for the word
*/
public List<Stem> uniqueStems(char word[], int length) {
List<Stem> stems = new ArrayList<Stem>();
CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, false);
if (dictionary.lookupWord(word, 0, length, scratch) != null) {
stems.add(new Stem(word, length));
terms.add(word);
}
List<Stem> otherStems = stem(word, length, null, 0);
for (Stem s : otherStems) {
if (!terms.contains(s.stem)) {
stems.add(s);
terms.add(s.stem);
}
}
return stems;
}
// ================================================= Helper Methods ================================================
/**
* Generates a list of stems for the provided word
*
* @param word Word to generate the stems for
* @param flags Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step
* @param recursionDepth Level of recursion this stemming step is at
* @return List of stems, or empty list if no stems are found
*/
private List<Stem> stem(char word[], int length, char[] flags, int recursionDepth) {
List<Stem> stems = new ArrayList<Stem>();
for (int i = 0; i < length; i++) {
List<Affix> suffixes = dictionary.lookupSuffix(word, i, length - i);
if (suffixes == null) {
continue;
}
for (Affix suffix : suffixes) {
if (hasCrossCheckedFlag(suffix.getFlag(), flags)) {
int deAffixedLength = length - suffix.getAppend().length();
// TODO: can we do this in-place?
String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString();
List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
for (Stem stem : stemList) {
stem.addSuffix(suffix);
}
stems.addAll(stemList);
}
}
}
for (int i = length - 1; i >= 0; i--) {
List<Affix> prefixes = dictionary.lookupPrefix(word, 0, i);
if (prefixes == null) {
continue;
}
for (Affix prefix : prefixes) {
if (hasCrossCheckedFlag(prefix.getFlag(), flags)) {
int deAffixedStart = prefix.getAppend().length();
int deAffixedLength = length - deAffixedStart;
String strippedWord = new StringBuilder().append(prefix.getStrip())
.append(word, deAffixedStart, deAffixedLength)
.toString();
List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
for (Stem stem : stemList) {
stem.addPrefix(prefix);
}
stems.addAll(stemList);
}
}
}
return stems;
}
/**
* Applies the affix rule to the given word, producing a list of stems if any are found
*
* @param strippedWord Word with the affix removed and the strip characters added
* @param length Valid length of text in {@code strippedWord}
* @param affix Affix representing the affix rule itself
* @param recursionDepth Level of recursion this stemming step is at
* @return List of stems for the word, or an empty list if none are found
*/
public List<Stem> applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) {
segment.setLength(0);
segment.append(strippedWord, 0, length);
if (!affix.checkCondition(segment)) {
return Collections.emptyList();
}
List<Stem> stems = new ArrayList<Stem>();
char wordFlags[] = dictionary.lookupWord(strippedWord, 0, length, scratch);
if (wordFlags != null && Dictionary.hasFlag(wordFlags, affix.getFlag())) {
stems.add(new Stem(strippedWord, length));
}
if (affix.isCrossProduct() && recursionDepth < recursionCap) {
stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), recursionDepth + 1));
}
return stems;
}
/**
* Checks if the given flag cross checks with the given array of flags
*
* @param flag Flag to cross check with the array of flags
* @param flags Array of flags to cross check against. Can be {@code null}
* @return {@code true} if the flag is found in the array or the array is {@code null}, {@code false} otherwise
*/
private boolean hasCrossCheckedFlag(char flag, char[] flags) {
return flags == null || Arrays.binarySearch(flags, flag) >= 0;
}
/**
* Stem represents all information known about a stem of a word. This includes the stem, and the prefixes and suffixes
* that were used to change the word into the stem.
*/
public static class Stem {
private final List<Affix> prefixes = new ArrayList<Affix>();
private final List<Affix> suffixes = new ArrayList<Affix>();
private final char stem[];
private final int stemLength;
/**
* Creates a new Stem wrapping the given word stem
*
* @param stem Stem of a word
*/
public Stem(char stem[], int stemLength) {
this.stem = stem;
this.stemLength = stemLength;
}
/**
* Adds a prefix to the list of prefixes used to generate this stem. Because it is assumed that prefixes are added
* depth first, the prefix is added to the front of the list
*
* @param prefix Prefix to add to the list of prefixes for this stem
*/
public void addPrefix(Affix prefix) {
prefixes.add(0, prefix);
}
/**
* Adds a suffix to the list of suffixes used to generate this stem. Because it is assumed that suffixes are added
* depth first, the suffix is added to the end of the list
*
* @param suffix Suffix to add to the list of suffixes for this stem
*/
public void addSuffix(Affix suffix) {
suffixes.add(suffix);
}
/**
* Returns the list of prefixes used to generate the stem
*
* @return List of prefixes used to generate the stem or an empty list if no prefixes were required
*/
public List<Affix> getPrefixes() {
return prefixes;
}
/**
* Returns the list of suffixes used to generate the stem
*
* @return List of suffixes used to generate the stem or an empty list if no suffixes were required
*/
public List<Affix> getSuffixes() {
return suffixes;
}
/**
* Returns the text of the word's stem.
* @see #getStemLength()
*/
public char[] getStem() {
return stem;
}
/** Returns the valid length of the text in {@link #getStem()} */
public int getStemLength() {
return stemLength;
}
/** Only use this if you really need a string (e.g. for testing) */
public String getStemString() {
return new String(stem, 0, stemLength);
}
}
}
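
A minimal fragment of direct Stemmer use (the class is package-private), mirroring what Hunspell2StemFilter does per token; the output shown assumes the simple test dictionary used in the tests, where "lucene" also stems to "lucen":

Stemmer stemmer = new Stemmer(dictionary); // default recursion cap of 2
for (Stemmer.Stem stem : stemmer.stem("lucene")) {
  System.out.println(stem.getStemString()); // "lucene", "lucen"
}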

View File

@ -0,0 +1,26 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
Stemming TokenFilter using a Java implementation of the <a href="http://www.ldc.upenn.edu/Catalog/docs/LDC2008T01/acta04.pdf">
Hunspell stemming algorithm</a>.
<p>
Dictionaries can be found on <a href="http://wiki.services.openoffice.org/wiki/Dictionaries">
OpenOffice's wiki</a>.
</p>
</body>
</html>

View File

@ -51,6 +51,7 @@ org.apache.lucene.analysis.hi.HindiNormalizationFilterFactory
org.apache.lucene.analysis.hi.HindiStemFilterFactory
org.apache.lucene.analysis.hu.HungarianLightStemFilterFactory
org.apache.lucene.analysis.hunspell.HunspellStemFilterFactory
org.apache.lucene.analysis.hunspell2.Hunspell2StemFilterFactory
org.apache.lucene.analysis.id.IndonesianStemFilterFactory
org.apache.lucene.analysis.in.IndicNormalizationFilterFactory
org.apache.lucene.analysis.it.ItalianLightStemFilterFactory

View File

@ -0,0 +1,205 @@
package org.apache.lucene.analysis.hunspell2;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.InputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import org.apache.lucene.analysis.hunspell.HunspellDictionary;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.RamUsageEstimator;
/**
* Loads every dictionary in the OpenOffice archive with both the old and new implementations
* and reports their memory use. The archive can be retrieved via:
* wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/
* Note that some of the files differ only in case; this may be a problem on your operating system!
*/
//@Ignore("enable manually")
public class TestAllDictionaries extends LuceneTestCase {
// set this to the location of where you downloaded all the files
static final File DICTIONARY_HOME =
new File("/Users/rmuir/hunspell/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries");
final String tests[] = {
/* zip file */ /* dictionary */ /* affix */
"af_ZA.zip", "af_ZA.dic", "af_ZA.aff",
"ak_GH.zip", "ak_GH.dic", "ak_GH.aff",
"bg_BG.zip", "bg_BG.dic", "bg_BG.aff",
"ca_ANY.zip", "catalan.dic", "catalan.aff",
"ca_ES.zip", "ca_ES.dic", "ca_ES.aff",
"cop_EG.zip", "cop_EG.dic", "cop_EG.aff",
"cs_CZ.zip", "cs_CZ.dic", "cs_CZ.aff",
"cy_GB.zip", "cy_GB.dic", "cy_GB.aff",
"da_DK.zip", "da_DK.dic", "da_DK.aff",
"de_AT.zip", "de_AT.dic", "de_AT.aff",
"de_CH.zip", "de_CH.dic", "de_CH.aff",
"de_DE.zip", "de_DE.dic", "de_DE.aff",
"de_DE_comb.zip", "de_DE_comb.dic", "de_DE_comb.aff",
"de_DE_frami.zip", "de_DE_frami.dic", "de_DE_frami.aff",
"de_DE_neu.zip", "de_DE_neu.dic", "de_DE_neu.aff",
"el_GR.zip", "el_GR.dic", "el_GR.aff",
"en_AU.zip", "en_AU.dic", "en_AU.aff",
"en_CA.zip", "en_CA.dic", "en_CA.aff",
"en_GB-oed.zip", "en_GB-oed.dic", "en_GB-oed.aff",
"en_GB.zip", "en_GB.dic", "en_GB.aff",
"en_NZ.zip", "en_NZ.dic", "en_NZ.aff",
"eo.zip", "eo_l3.dic", "eo_l3.aff",
"eo_EO.zip", "eo_EO.dic", "eo_EO.aff",
"es_AR.zip", "es_AR.dic", "es_AR.aff",
"es_BO.zip", "es_BO.dic", "es_BO.aff",
"es_CL.zip", "es_CL.dic", "es_CL.aff",
"es_CO.zip", "es_CO.dic", "es_CO.aff",
"es_CR.zip", "es_CR.dic", "es_CR.aff",
"es_CU.zip", "es_CU.dic", "es_CU.aff",
"es_DO.zip", "es_DO.dic", "es_DO.aff",
"es_EC.zip", "es_EC.dic", "es_EC.aff",
"es_ES.zip", "es_ES.dic", "es_ES.aff",
"es_GT.zip", "es_GT.dic", "es_GT.aff",
"es_HN.zip", "es_HN.dic", "es_HN.aff",
"es_MX.zip", "es_MX.dic", "es_MX.aff",
"es_NEW.zip", "es_NEW.dic", "es_NEW.aff",
"es_NI.zip", "es_NI.dic", "es_NI.aff",
"es_PA.zip", "es_PA.dic", "es_PA.aff",
"es_PE.zip", "es_PE.dic", "es_PE.aff",
"es_PR.zip", "es_PR.dic", "es_PR.aff",
"es_PY.zip", "es_PY.dic", "es_PY.aff",
"es_SV.zip", "es_SV.dic", "es_SV.aff",
"es_UY.zip", "es_UY.dic", "es_UY.aff",
"es_VE.zip", "es_VE.dic", "es_VE.aff",
"et_EE.zip", "et_EE.dic", "et_EE.aff",
"fo_FO.zip", "fo_FO.dic", "fo_FO.aff",
"fr_FR-1990_1-3-2.zip", "fr_FR-1990.dic", "fr_FR-1990.aff",
"fr_FR-classique_1-3-2.zip", "fr_FR-classique.dic", "fr_FR-classique.aff",
"fr_FR_1-3-2.zip", "fr_FR.dic", "fr_FR.aff",
"fy_NL.zip", "fy_NL.dic", "fy_NL.aff",
"ga_IE.zip", "ga_IE.dic", "ga_IE.aff",
"gd_GB.zip", "gd_GB.dic", "gd_GB.aff",
"gl_ES.zip", "gl_ES.dic", "gl_ES.aff",
"gsc_FR.zip", "gsc_FR.dic", "gsc_FR.aff",
"gu_IN.zip", "gu_IN.dic", "gu_IN.aff",
"he_IL.zip", "he_IL.dic", "he_IL.aff",
"hi_IN.zip", "hi_IN.dic", "hi_IN.aff",
"hil_PH.zip", "hil_PH.dic", "hil_PH.aff",
"hr_HR.zip", "hr_HR.dic", "hr_HR.aff",
"hu_HU.zip", "hu_HU.dic", "hu_HU.aff",
"hu_HU_comb.zip", "hu_HU.dic", "hu_HU.aff",
"ia.zip", "ia.dic", "ia.aff",
"id_ID.zip", "id_ID.dic", "id_ID.aff",
"it_IT.zip", "it_IT.dic", "it_IT.aff",
"ku_TR.zip", "ku_TR.dic", "ku_TR.aff",
"la.zip", "la.dic", "la.aff",
"lt_LT.zip", "lt_LT.dic", "lt_LT.aff",
"lv_LV.zip", "lv_LV.dic", "lv_LV.aff",
"mg_MG.zip", "mg_MG.dic", "mg_MG.aff",
"mi_NZ.zip", "mi_NZ.dic", "mi_NZ.aff",
"mk_MK.zip", "mk_MK.dic", "mk_MK.aff",
"mos_BF.zip", "mos_BF.dic", "mos_BF.aff",
"mr_IN.zip", "mr_IN.dic", "mr_IN.aff",
"ms_MY.zip", "ms_MY.dic", "ms_MY.aff",
"nb_NO.zip", "nb_NO.dic", "nb_NO.aff",
"ne_NP.zip", "ne_NP.dic", "ne_NP.aff",
"nl_NL.zip", "nl_NL.dic", "nl_NL.aff",
"nl_med.zip", "nl_med.dic", "nl_med.aff",
"nn_NO.zip", "nn_NO.dic", "nn_NO.aff",
"nr_ZA.zip", "nr_ZA.dic", "nr_ZA.aff",
"ns_ZA.zip", "ns_ZA.dic", "ns_ZA.aff",
"ny_MW.zip", "ny_MW.dic", "ny_MW.aff",
"oc_FR.zip", "oc_FR.dic", "oc_FR.aff",
"pl_PL.zip", "pl_PL.dic", "pl_PL.aff",
"pt_BR.zip", "pt_BR.dic", "pt_BR.aff",
"pt_PT.zip", "pt_PT.dic", "pt_PT.aff",
"ro_RO.zip", "ro_RO.dic", "ro_RO.aff",
"ru_RU.zip", "ru_RU.dic", "ru_RU.aff",
"ru_RU_ye.zip", "ru_RU_ie.dic", "ru_RU_ie.aff",
"ru_RU_yo.zip", "ru_RU_yo.dic", "ru_RU_yo.aff",
"rw_RW.zip", "rw_RW.dic", "rw_RW.aff",
"sk_SK.zip", "sk_SK.dic", "sk_SK.aff",
"sl_SI.zip", "sl_SI.dic", "sl_SI.aff",
"sq_AL.zip", "sq_AL.dic", "sq_AL.aff",
"ss_ZA.zip", "ss_ZA.dic", "ss_ZA.aff",
"st_ZA.zip", "st_ZA.dic", "st_ZA.aff",
"sv_SE.zip", "sv_SE.dic", "sv_SE.aff",
"sw_KE.zip", "sw_KE.dic", "sw_KE.aff",
"tet_ID.zip", "tet_ID.dic", "tet_ID.aff",
"th_TH.zip", "th_TH.dic", "th_TH.aff",
"tl_PH.zip", "tl_PH.dic", "tl_PH.aff",
"tn_ZA.zip", "tn_ZA.dic", "tn_ZA.aff",
"ts_ZA.zip", "ts_ZA.dic", "ts_ZA.aff",
"uk_UA.zip", "uk_UA.dic", "uk_UA.aff",
"ve_ZA.zip", "ve_ZA.dic", "ve_ZA.aff",
"vi_VN.zip", "vi_VN.dic", "vi_VN.aff",
"xh_ZA.zip", "xh_ZA.dic", "xh_ZA.aff",
"zu_ZA.zip", "zu_ZA.dic", "zu_ZA.aff",
};
public void test() throws Exception {
for (int i = 0; i < tests.length; i += 3) {
File f = new File(DICTIONARY_HOME, tests[i]);
assert f.exists();
try (ZipFile zip = new ZipFile(f, IOUtils.CHARSET_UTF_8)) {
ZipEntry dicEntry = zip.getEntry(tests[i+1]);
assert dicEntry != null;
ZipEntry affEntry = zip.getEntry(tests[i+2]);
assert affEntry != null;
// get ram from previous impl
String oldRAM = "FAIL";
try (InputStream dictionary = zip.getInputStream(dicEntry);
InputStream affix = zip.getInputStream(affEntry)) {
try {
HunspellDictionary dic = new HunspellDictionary(affix, dictionary, TEST_VERSION_CURRENT);
oldRAM = RamUsageEstimator.humanSizeOf(dic);
} catch (Throwable t) {}
}
try (InputStream dictionary = zip.getInputStream(dicEntry);
InputStream affix = zip.getInputStream(affEntry)) {
Dictionary dic = new Dictionary(affix, dictionary);
System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic));
}
}
}
}
public void testOneDictionary() throws Exception {
String toTest = "hu_HU.zip";
for (int i = 0; i < tests.length; i++) {
if (tests[i].equals(toTest)) {
File f = new File(DICTIONARY_HOME, tests[i]);
assert f.exists();
try (ZipFile zip = new ZipFile(f, IOUtils.CHARSET_UTF_8)) {
ZipEntry dicEntry = zip.getEntry(tests[i+1]);
assert dicEntry != null;
ZipEntry affEntry = zip.getEntry(tests[i+2]);
assert affEntry != null;
try (InputStream dictionary = zip.getInputStream(dicEntry);
InputStream affix = zip.getInputStream(affEntry)) {
Dictionary dic = new Dictionary(affix, dictionary);
}
}
}
}
}
}

View File

@ -0,0 +1,109 @@
package org.apache.lucene.analysis.hunspell2;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
public class TestDictionary extends LuceneTestCase {
public void testSimpleDictionary() throws Exception {
InputStream affixStream = getClass().getResourceAsStream("simple.aff");
InputStream dictStream = getClass().getResourceAsStream("simple.dic");
Dictionary dictionary = new Dictionary(affixStream, dictStream);
assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
char flags[] = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef());
assertNotNull(flags);
assertEquals(1, flags.length);
assertEquals("Wrong number of flags for lucen", 1, dictionary.lookupWord(new char[]{'l', 'u', 'c', 'e', 'n'}, 0, 5, new BytesRef()).length);
affixStream.close();
dictStream.close();
}
public void testCompressedDictionary() throws Exception {
InputStream affixStream = getClass().getResourceAsStream("compressed.aff");
InputStream dictStream = getClass().getResourceAsStream("compressed.dic");
Dictionary dictionary = new Dictionary(affixStream, dictStream);
assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef()).length);
affixStream.close();
dictStream.close();
}
// malformed rule causes ParseException
public void testInvalidData() throws Exception {
InputStream affixStream = getClass().getResourceAsStream("broken.aff");
InputStream dictStream = getClass().getResourceAsStream("simple.dic");
try {
new Dictionary(affixStream, dictStream);
fail("didn't get expected exception");
} catch (ParseException expected) {
assertEquals("The affix file contains a rule with less than five elements", expected.getMessage());
assertEquals(23, expected.getErrorOffset());
}
affixStream.close();
dictStream.close();
}
private class CloseCheckInputStream extends FilterInputStream {
private boolean closed = false;
public CloseCheckInputStream(InputStream delegate) {
super(delegate);
}
@Override
public void close() throws IOException {
this.closed = true;
super.close();
}
public boolean isClosed() {
return this.closed;
}
}
public void testResourceCleanup() throws Exception {
CloseCheckInputStream affixStream = new CloseCheckInputStream(getClass().getResourceAsStream("compressed.aff"));
CloseCheckInputStream dictStream = new CloseCheckInputStream(getClass().getResourceAsStream("compressed.dic"));
new Dictionary(affixStream, dictStream);
assertFalse(affixStream.isClosed());
assertFalse(dictStream.isClosed());
affixStream.close();
dictStream.close();
assertTrue(affixStream.isClosed());
assertTrue(dictStream.isClosed());
}
}

View File

@ -0,0 +1,87 @@
package org.apache.lucene.analysis.hunspell2;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
public class TestHunspell2StemFilter extends BaseTokenStreamTestCase {
private static Dictionary dictionary;
@BeforeClass
public static void beforeClass() throws Exception {
try (InputStream affixStream = TestStemmer.class.getResourceAsStream("simple.aff");
InputStream dictStream = TestStemmer.class.getResourceAsStream("simple.dic")) {
dictionary = new Dictionary(affixStream, dictStream);
}
}
@AfterClass
public static void afterClass() {
dictionary = null;
}
/** Simple test for KeywordAttribute */
public void testKeywordAttribute() throws IOException {
MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome");
tokenizer.setEnableChecks(true);
Hunspell2StemFilter filter = new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3));
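// "lucen" is emitted as a second stem of "lucene" at the same position, hence the position increment of 0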
assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1});
// with "Lucene" marked as a keyword, it must pass through unstemmed
tokenizer = whitespaceMockTokenizer("lucene is awesome");
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true);
filter = new Hunspell2StemFilter(new SetKeywordMarkerFilter(tokenizer, set), dictionary, TestUtil.nextInt(random(), 1, 3));
assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
}
};
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
}
public void testEmptyTerm() throws IOException {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
}
};
checkOneTerm(a, "", "");
}
}

View File

@ -0,0 +1,50 @@
package org.apache.lucene.analysis.hunspell2;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
/**
* Simple tests to ensure the Hunspell2 stem filter loads from its factory
*/
public class TestHunspell2StemFilterFactory extends BaseTokenStreamFactoryTestCase {
public void testStemming() throws Exception {
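// simple.dic contains ab/C, and suffix flag C appends 'c', so "abc" must stem back to "ab"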
Reader reader = new StringReader("abc");
TokenStream stream = whitespaceMockTokenizer(reader);
stream = tokenFilterFactory("Hunspell2Stem",
"dictionary", "simple.dic",
"affix", "simple.aff").create(stream);
assertTokenStreamContents(stream, new String[] { "ab" });
}
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
try {
tokenFilterFactory("Hunspell2Stem",
"dictionary", "simple.dic",
"bogusArg", "bogusValue");
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
}
}

View File

@ -0,0 +1,105 @@
package org.apache.lucene.analysis.hunspell2;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.hunspell2.Stemmer.Stem;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
public class TestStemmer extends LuceneTestCase {
private static Stemmer stemmer;
@BeforeClass
public static void beforeClass() throws Exception {
try (InputStream affixStream = TestStemmer.class.getResourceAsStream("simple.aff");
InputStream dictStream = TestStemmer.class.getResourceAsStream("simple.dic")) {
Dictionary dictionary = new Dictionary(affixStream, dictStream);
stemmer = new Stemmer(dictionary);
}
}
@AfterClass
public static void afterClass() {
stemmer = null;
}
public void testSimpleSuffix() {
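// "lucene" is both a dictionary entry in its own right and lucen/A with suffix 'e' appended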
assertStemsTo("lucene", "lucene", "lucen");
assertStemsTo("mahoute", "mahout");
}
public void testSimplePrefix() {
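// prefix flag B prepends 's', so "solr" strips back to the dictionary entry olr/B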
assertStemsTo("solr", "olr");
}
public void testRecursiveSuffix() {
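// suffix flag C carries a continuation class (d/C), so stripping is recursive: "abcd" -> "abc" -> ab/C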
assertStemsTo("abcd", "ab");
}
// every surface form produced by unmunching (expanding) the dictionary entries with their affix rules
public void testAllStems() {
assertStemsTo("ab", "ab");
assertStemsTo("abc", "ab");
assertStemsTo("apach", "apach");
assertStemsTo("apache", "apach");
assertStemsTo("foo", "foo");
assertStemsTo("food", "foo");
assertStemsTo("foos", "foo");
assertStemsTo("lucen", "lucen");
assertStemsTo("lucene", "lucen", "lucene");
assertStemsTo("mahout", "mahout");
assertStemsTo("mahoute", "mahout");
assertStemsTo("moo", "moo");
assertStemsTo("mood", "moo");
assertStemsTo("olr", "olr");
assertStemsTo("solr", "olr");
}
// some bogus stuff that should not stem (empty lists)!
public void testBogusStems() {
assertStemsTo("abs");
assertStemsTo("abe");
assertStemsTo("sab");
assertStemsTo("sapach");
assertStemsTo("sapache");
assertStemsTo("apachee");
assertStemsTo("sfoo");
assertStemsTo("sfoos");
assertStemsTo("fooss");
assertStemsTo("lucenee");
assertStemsTo("solre");
}
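/** asserts that the given word stems to exactly the expected stems, in any order */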
private void assertStemsTo(String s, String... expected) {
Arrays.sort(expected);
List<Stem> stems = stemmer.stem(s);
String actual[] = new String[stems.size()];
for (int i = 0; i < actual.length; i++) {
actual[i] = stems.get(i).getStemString();
}
Arrays.sort(actual);
assertArrayEquals(expected, actual);
}
}

View File

@ -0,0 +1,24 @@
SET UTF-8
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
SFX A Y 3
SFX A 0 e n
SFX A 0 e t
SFX A 0 e h
SFX C Y 2
SFX C 0 d/C c
SFX C 0 c b
SFX D Y 1
SFX D 0 s o
SFX E Y 1
SFX E 0 d o
PFX B Y 1
PFX B 0 s o
#wrong rule: a complete affix rule has five elements (type flag strip append condition); this one has only four
PFX A0 Y 1
PFX A0 0 a

View File

@ -0,0 +1,29 @@
SET UTF-8
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
FLAG long
AF 5
AF AA
AF BB
AF CC
AF DD
AF EE
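# FLAG long declares two-character flags; the AF lines above alias them by ordinal,
# so /3 below and in compressed.dic stands for the third alias, CC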
SFX AA Y 3
SFX AA 0 e n
SFX AA 0 e t
SFX AA 0 e h
SFX CC Y 2
SFX CC 0 d/3 c
SFX CC 0 c b
SFX DD Y 1
SFX DD 0 s o
SFX EE Y 1
SFX EE 0 d o
PFX BB Y 1
PFX BB 0 s o

View File

@ -0,0 +1,9 @@
6
ab/3
apach/1
foo/4
foo/5
lucen/1
lucene
mahout/1
olr/2

View File

@ -0,0 +1,20 @@
SET UTF-8
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
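# rule header: SFX/PFX flag cross_product rule_count, followed by rule_count lines of
# SFX/PFX flag strip append condition (e.g. flag A appends 'e' to words ending in n, t, or h)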
SFX A Y 3
SFX A 0 e n
SFX A 0 e t
SFX A 0 e h
SFX C Y 2
SFX C 0 d/C c
SFX C 0 c b
SFX D Y 1
SFX D 0 s o
SFX E Y 1
SFX E 0 d o
PFX B Y 1
PFX B 0 s o

View File

@ -0,0 +1,10 @@
9
ab/C
apach/A
foo/D
foo/E
lucen/A
lucene
mahout/A
moo/E
olr/B