LUCENE-5468: hunspell2 -> hunspell (with previous options and tests)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572718 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2014-02-27 20:19:27 +00:00
parent b2b86dd8ad
commit c4f4beb27e
36 changed files with 320 additions and 2076 deletions

View File

@@ -1,4 +1,4 @@
package org.apache.lucene.analysis.hunspell2;
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,7 +17,6 @@ package org.apache.lucene.analysis.hunspell2;
* limitations under the License.
*/
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
@@ -28,14 +27,19 @@ import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntSequenceOutputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import java.io.*;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
@@ -71,27 +75,27 @@ public class Dictionary {
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
public FST<IntsRef> prefixes;
public FST<IntsRef> suffixes;
FST<IntsRef> prefixes;
FST<IntsRef> suffixes;
// all Patterns used by prefixes and suffixes. these are typically re-used across
// many affix stripping rules. so these are deduplicated, to save RAM.
// TODO: maybe don't use Pattern for the condition check...
// TODO: when we cut over Affix to FST, just store integer index to this.
public ArrayList<Pattern> patterns = new ArrayList<>();
ArrayList<Pattern> patterns = new ArrayList<>();
// the entries in the .dic file, mapping to their set of flags.
// the fst output is the ordinal for flagLookup
public FST<Long> words;
FST<Long> words;
// the list of unique flagsets (wordforms). theoretically huge, but practically
small (e.g. for Polish this is 756), otherwise humans wouldn't be able to deal with it either.
public BytesRefHash flagLookup = new BytesRefHash();
BytesRefHash flagLookup = new BytesRefHash();
// the list of unique strip affixes.
public BytesRefHash stripLookup = new BytesRefHash();
BytesRefHash stripLookup = new BytesRefHash();
// 8 bytes per affix
public byte[] affixData = new byte[64];
byte[] affixData = new byte[64];
private int currentAffix = 0;
private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
@@ -100,7 +104,11 @@ public class Dictionary {
private int aliasCount = 0;
private final File tempDir = OfflineSorter.defaultTempDir(); // TODO: make this configurable?
public static final int IGNORE_CASE = 1;
boolean ignoreCase;
/**
* Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
* and dictionary files.
@@ -112,6 +120,21 @@ public class Dictionary {
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
*/
public Dictionary(InputStream affix, InputStream dictionary) throws IOException, ParseException {
this(affix, Collections.singletonList(dictionary), false);
}
/**
* Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
* and dictionary files.
* You have to close the provided InputStreams yourself.
*
* @param affix InputStream for reading the hunspell affix file (won't be closed).
* @param dictionaries InputStreams for reading the hunspell dictionary files (won't be closed).
* @throws IOException Can be thrown while reading from the InputStreams
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
*/
public Dictionary(InputStream affix, List<InputStream> dictionaries, boolean ignoreCase) throws IOException, ParseException {
this.ignoreCase = ignoreCase;
BufferedInputStream buffered = new BufferedInputStream(affix, 8192);
buffered.mark(8192);
String encoding = getDictionaryEncoding(affix);
@@ -122,7 +145,7 @@ public class Dictionary {
stripLookup.add(new BytesRef()); // no strip -> ord 0
PositiveIntOutputs o = PositiveIntOutputs.getSingleton();
Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE4, o);
readDictionaryFile(dictionary, decoder, b);
readDictionaryFiles(dictionaries, decoder, b);
words = b.finish();
}
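For context, a minimal usage sketch of the new multi-dictionary constructor. The file names are hypothetical, and the caller stays responsible for closing the streams, as the javadoc above states:

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.util.IOUtils;

public class DictionarySketch {
  public static void main(String[] args) throws IOException, ParseException {
    InputStream affix = new FileInputStream("en_GB.aff");    // hypothetical paths
    List<InputStream> dics = Arrays.<InputStream>asList(
        new FileInputStream("en_GB.dic"),
        new FileInputStream("my_custom.dic"));
    try {
      // ignoreCase=true lowercases dictionary roots via Locale.ROOT (see below)
      Dictionary dictionary = new Dictionary(affix, dics, true);
    } finally {
      IOUtils.closeWhileHandlingException(affix);            // constructor does not close
      IOUtils.closeWhileHandlingException(dics);
    }
  }
}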
@@ -145,7 +168,7 @@ public class Dictionary {
return decodeFlags(flagLookup.get(ord, scratch));
}
public Integer lookupOrd(char word[], int offset, int length) throws IOException {
Integer lookupOrd(char word[], int offset, int length) throws IOException {
final FST.BytesReader bytesReader = words.getBytesReader();
final FST.Arc<Long> arc = words.getFirstArc(new FST.Arc<Long>());
// Accumulate output as we go
@@ -269,7 +292,6 @@ public class Dictionary {
Util.toUTF32(entry.getKey(), scratch);
List<Character> entries = entry.getValue();
IntsRef output = new IntsRef(entries.size());
int upto = 0;
for (Character c : entries) {
output.ints[output.length++] = c;
}
@@ -480,23 +502,39 @@ public class Dictionary {
}
/**
* Reads the dictionary file through the provided InputStream, building up the words map
* Reads the dictionary files through the provided InputStreams, building up the words map
*
* @param dictionary InputStream to read the dictionary file through
* @param dictionaries InputStreams to read the dictionary files through
* @param decoder CharsetDecoder used to decode the contents of the file
* @throws IOException Can be thrown while reading from the file
*/
private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder, Builder<Long> words) throws IOException {
private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<Long> words) throws IOException {
BytesRef flagsScratch = new BytesRef();
IntsRef scratchInts = new IntsRef();
BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
String line = lines.readLine(); // first line is number of entries (approximately, sometimes)
File unsorted = File.createTempFile("unsorted", "dat", tempDir);
try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
while ((line = lines.readLine()) != null) {
writer.write(line.getBytes(IOUtils.CHARSET_UTF_8));
for (InputStream dictionary : dictionaries) {
BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
String line = lines.readLine(); // first line is number of entries (approximately, sometimes)
while ((line = lines.readLine()) != null) {
if (ignoreCase) {
int flagSep = line.lastIndexOf('/');
if (flagSep == -1) {
writer.write(line.toLowerCase(Locale.ROOT).getBytes(IOUtils.CHARSET_UTF_8));
} else {
StringBuilder sb = new StringBuilder();
sb.append(line.substring(0, flagSep).toLowerCase(Locale.ROOT));
if (flagSep < line.length()) {
sb.append(line.substring(flagSep, line.length()));
}
writer.write(sb.toString().getBytes(IOUtils.CHARSET_UTF_8));
}
} else {
writer.write(line.getBytes(IOUtils.CHARSET_UTF_8));
}
}
}
}
File sorted = File.createTempFile("sorted", "dat", tempDir);
@@ -544,6 +582,7 @@ public class Dictionary {
BytesRef currentEntry = new BytesRef();
char currentFlags[] = new char[0];
String line;
while (reader.read(scratchLine)) {
line = scratchLine.utf8ToString();
String entry;
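To make the ignoreCase handling in readDictionaryFiles concrete, here is a hedged standalone sketch of the same per-line normalization (the helper name is ours, not the commit's): only the root before the last '/' is lowercased, while the flag part is kept verbatim, since flags are case-significant.

import java.util.Locale;

public class DicLineSketch {
  // Mirrors the loop above: "Foo/AB" -> "foo/AB"; "Bar" -> "bar"
  static String normalize(String line) {
    int flagSep = line.lastIndexOf('/');
    if (flagSep == -1) {
      return line.toLowerCase(Locale.ROOT);
    }
    return line.substring(0, flagSep).toLowerCase(Locale.ROOT)
        + line.substring(flagSep); // includes '/' and the flags, unchanged
  }

  public static void main(String[] args) {
    System.out.println(normalize("Foo/AB")); // foo/AB
    System.out.println(normalize("Bar"));    // bar
  }
}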

View File

@@ -1,157 +0,0 @@
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.regex.Pattern;
/**
* Wrapper class representing a hunspell affix
*/
public class HunspellAffix {
private String append; // the affix itself, what is appended
private char appendFlags[]; // continuation class flags
private String strip;
private String condition;
private Pattern conditionPattern;
private char flag;
private boolean crossProduct;
/**
* Checks whether the given text matches the conditional pattern on this affix
*
* @param text Text to check if it matches the affix's conditional pattern
* @return {@code true} if the text meets the condition, {@code false} otherwise
*/
public boolean checkCondition(CharSequence text) {
return conditionPattern.matcher(text).matches();
}
/**
* Returns the append defined for the affix
*
* @return Defined append
*/
public String getAppend() {
return append;
}
/**
* Sets the append defined for the affix
*
* @param append Defined append for the affix
*/
public void setAppend(String append) {
this.append = append;
}
/**
* Returns the flags defined for the affix append
*
* @return Flags defined for the affix append
*/
public char[] getAppendFlags() {
return appendFlags;
}
/**
* Sets the flags defined for the affix append
*
* @param appendFlags Flags defined for the affix append
*/
public void setAppendFlags(char[] appendFlags) {
this.appendFlags = appendFlags;
}
/**
* Returns the stripping characters defined for the affix
*
* @return Stripping characters defined for the affix
*/
public String getStrip() {
return strip;
}
/**
* Sets the stripping characters defined for the affix
*
* @param strip Stripping characters defined for the affix
*/
public void setStrip(String strip) {
this.strip = strip;
}
/**
* Returns the condition that must be met before the affix can be applied
*
* @return Condition that must be met before the affix can be applied
*/
public String getCondition() {
return condition;
}
/**
* Sets the condition that must be met before the affix can be applied
*
* @param condition Condition to be met before affix application
* @param pattern Condition as a regular expression pattern
*/
public void setCondition(String condition, String pattern) {
this.condition = condition;
this.conditionPattern = Pattern.compile(pattern);
}
/**
* Returns the affix flag
*
* @return Affix flag
*/
public char getFlag() {
return flag;
}
/**
* Sets the affix flag
*
* @param flag Affix flag
*/
public void setFlag(char flag) {
this.flag = flag;
}
/**
* Returns whether the affix is defined as cross product
*
* @return {@code true} if the affix is cross product, {@code false} otherwise
*/
public boolean isCrossProduct() {
return crossProduct;
}
/**
* Sets whether the affix is defined as cross product
*
* @param crossProduct Whether the affix is defined as cross product
*/
public void setCrossProduct(boolean crossProduct) {
this.crossProduct = crossProduct;
}
}
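As a reminder of how these conditions are evaluated: the condition string from an affix rule is wrapped into an anchored regex by the caller, using the %s templates shown in the Dictionary diff above ("%s.*" for prefixes, ".*%s" for suffixes). A small illustrative sketch with an invented condition:

import java.util.Locale;
import java.util.regex.Pattern;

public class ConditionSketch {
  public static void main(String[] args) {
    // Suffix conditions use the ".*%s" template: the condition must match at the end.
    String condition = "[^aeiou]y"; // e.g. an English SFX rule's condition column
    Pattern p = Pattern.compile(String.format(Locale.ROOT, ".*%s", condition));
    System.out.println(p.matcher("fly").matches());  // true: consonant + y at the end
    System.out.println(p.matcher("play").matches()); // false: vowel before the y
  }
}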

View File

@@ -1,507 +0,0 @@
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.util.Version;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
/**
* In-memory structure for the dictionary (.dic) and affix (.aff)
* data of a hunspell dictionary.
*/
public class HunspellDictionary {
static final HunspellWord NOFLAGS = new HunspellWord();
private static final String ALIAS_KEY = "AF";
private static final String PREFIX_KEY = "PFX";
private static final String SUFFIX_KEY = "SFX";
private static final String FLAG_KEY = "FLAG";
private static final String NUM_FLAG_TYPE = "num";
private static final String UTF8_FLAG_TYPE = "UTF-8";
private static final String LONG_FLAG_TYPE = "long";
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
private static final boolean IGNORE_CASE_DEFAULT = false;
private static final boolean STRICT_AFFIX_PARSING_DEFAULT = true;
private CharArrayMap<List<HunspellWord>> words;
private CharArrayMap<List<HunspellAffix>> prefixes;
private CharArrayMap<List<HunspellAffix>> suffixes;
private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
private boolean ignoreCase = IGNORE_CASE_DEFAULT;
private final Version version;
private String[] aliases;
private int aliasCount = 0;
/**
* Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
* and dictionary files.
* You have to close the provided InputStreams yourself.
*
* @param affix InputStream for reading the hunspell affix file (won't be closed).
* @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
* @param version Lucene Version
* @throws IOException Can be thrown while reading from the InputStreams
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
*/
public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException {
this(affix, Arrays.asList(dictionary), version, IGNORE_CASE_DEFAULT);
}
/**
* Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
* and dictionary files.
* You have to close the provided InputStreams yourself.
*
* @param affix InputStream for reading the hunspell affix file (won't be closed).
* @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
* @param version Lucene Version
* @param ignoreCase If true, dictionary matching will be case insensitive
* @throws IOException Can be thrown while reading from the InputStreams
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
*/
public HunspellDictionary(InputStream affix, InputStream dictionary, Version version, boolean ignoreCase) throws IOException, ParseException {
this(affix, Arrays.asList(dictionary), version, ignoreCase);
}
/**
* Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
* and dictionary files.
* You have to close the provided InputStreams yourself.
*
* @param affix InputStream for reading the hunspell affix file (won't be closed).
* @param dictionaries InputStreams for reading the hunspell dictionary files (won't be closed).
* @param version Lucene Version
* @param ignoreCase If true, dictionary matching will be case insensitive
* @throws IOException Can be thrown while reading from the InputStreams
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
*/
public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException {
this(affix, dictionaries, version, ignoreCase, STRICT_AFFIX_PARSING_DEFAULT);
}
/**
* Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
* and dictionary files.
* You have to close the provided InputStreams yourself.
*
* @param affix InputStream for reading the hunspell affix file (won't be closed).
* @param dictionaries InputStreams for reading the hunspell dictionary files (won't be closed).
* @param version Lucene Version
* @param ignoreCase If true, dictionary matching will be case insensitive
* @param strictAffixParsing Affix strict parsing enabled or not (an error while reading a rule causes exception or is ignored)
* @throws IOException Can be thrown while reading from the InputStreams
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
*/
public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase, boolean strictAffixParsing) throws IOException, ParseException {
this.version = version;
this.ignoreCase = ignoreCase;
String encoding = getDictionaryEncoding(affix);
CharsetDecoder decoder = getJavaEncoding(encoding);
readAffixFile(affix, decoder, strictAffixParsing);
words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, this.ignoreCase);
for (InputStream dictionary : dictionaries) {
readDictionaryFile(dictionary, decoder);
}
}
/**
* Looks up HunspellWords that match the String created from the given char array, offset and length
*
* @param word Char array to generate the String from
* @param offset Offset in the char array that the String starts at
* @param length Length from the offset that the String is
* @return List of HunspellWords that match the generated String, or {@code null} if none are found
*/
public List<HunspellWord> lookupWord(char word[], int offset, int length) {
return words.get(word, offset, length);
}
/**
* Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length
*
* @param word Char array to generate the String from
* @param offset Offset in the char array that the String starts at
* @param length Length from the offset that the String is
* @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
*/
public List<HunspellAffix> lookupPrefix(char word[], int offset, int length) {
return prefixes.get(word, offset, length);
}
/**
* Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length
*
* @param word Char array to generate the String from
* @param offset Offset in the char array that the String starts at
* @param length Length from the offset that the String is
* @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
*/
public List<HunspellAffix> lookupSuffix(char word[], int offset, int length) {
return suffixes.get(word, offset, length);
}
/**
* Reads the affix file through the provided InputStream, building up the prefix and suffix maps
*
* @param affixStream InputStream to read the content of the affix file from
* @param decoder CharsetDecoder to decode the content of the file
* @throws IOException Can be thrown while reading from the InputStream
*/
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, boolean strict) throws IOException, ParseException {
prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
String line = null;
while ((line = reader.readLine()) != null) {
if (line.startsWith(ALIAS_KEY)) {
parseAlias(line);
} else if (line.startsWith(PREFIX_KEY)) {
parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, strict);
} else if (line.startsWith(SUFFIX_KEY)) {
parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, strict);
} else if (line.startsWith(FLAG_KEY)) {
// Assume that the FLAG line comes before any prefix or suffixes
// Store the strategy so it can be used when parsing the dic file
flagParsingStrategy = getFlagParsingStrategy(line);
}
}
}
/**
* Parses a specific affix rule putting the result into the provided affix map
*
* @param affixes Map where the result of the parsing will be put
* @param header Header line of the affix rule
* @param reader BufferedReader to read the content of the rule from
* @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
* pattern
* @throws IOException Can be thrown while reading the rule
*/
private void parseAffix(CharArrayMap<List<HunspellAffix>> affixes,
String header,
LineNumberReader reader,
String conditionPattern,
boolean strict) throws IOException, ParseException {
String args[] = header.split("\\s+");
boolean crossProduct = args[2].equals("Y");
int numLines = Integer.parseInt(args[3]);
for (int i = 0; i < numLines; i++) {
String line = reader.readLine();
String ruleArgs[] = line.split("\\s+");
if (ruleArgs.length < 5) {
if (strict) {
throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber());
}
continue;
}
HunspellAffix affix = new HunspellAffix();
affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1]));
affix.setStrip(ruleArgs[2].equals("0") ? "" : ruleArgs[2]);
String affixArg = ruleArgs[3];
int flagSep = affixArg.lastIndexOf('/');
if (flagSep != -1) {
String flagPart = affixArg.substring(flagSep + 1);
if (aliasCount > 0) {
flagPart = getAliasValue(Integer.parseInt(flagPart));
}
char appendFlags[] = flagParsingStrategy.parseFlags(flagPart);
Arrays.sort(appendFlags);
affix.setAppendFlags(appendFlags);
affix.setAppend(affixArg.substring(0, flagSep));
} else {
affix.setAppend(affixArg);
}
String condition = ruleArgs[4];
affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition));
affix.setCrossProduct(crossProduct);
List<HunspellAffix> list = affixes.get(affix.getAppend());
if (list == null) {
list = new ArrayList<HunspellAffix>();
affixes.put(affix.getAppend(), list);
}
list.add(affix);
}
}
/**
* Parses the encoding specified in the affix file readable through the provided InputStream
*
* @param affix InputStream for reading the affix file
* @return Encoding specified in the affix file
* @throws IOException Can be thrown while reading from the InputStream
* @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET <encoding>}
*/
private String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
final StringBuilder encoding = new StringBuilder();
for (;;) {
encoding.setLength(0);
int ch;
while ((ch = affix.read()) >= 0) {
if (ch == '\n') {
break;
}
if (ch != '\r') {
encoding.append((char)ch);
}
}
if (
encoding.length() == 0 || encoding.charAt(0) == '#' ||
// this test is done last since it is inefficient, but it catches lines containing only spaces:
encoding.toString().trim().length() == 0
) {
if (ch < 0) {
throw new ParseException("Unexpected end of affix file.", 0);
}
continue;
}
if ("SET ".equals(encoding.substring(0, 4))) {
// cleanup the encoding string, too (whitespace)
return encoding.substring(4).trim();
}
throw new ParseException("The first non-comment line in the affix file must "+
"be a 'SET charset', was: '" + encoding +"'", 0);
}
}
/**
* Retrieves the CharsetDecoder for the given encoding. Note: this isn't perfect, as I think ISCII-DEVANAGARI and
* MICROSOFT-CP1251 etc. are allowed...
*
* @param encoding Encoding to retrieve the CharsetDecoder for
* @return CharSetDecoder for the given encoding
*/
private CharsetDecoder getJavaEncoding(String encoding) {
Charset charset = Charset.forName(encoding);
return charset.newDecoder();
}
/**
* Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definition line taken from the affix file
*
* @param flagLine Line containing the flag information
* @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition
*/
private FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
String flagType = flagLine.substring(5);
if (NUM_FLAG_TYPE.equals(flagType)) {
return new NumFlagParsingStrategy();
} else if (UTF8_FLAG_TYPE.equals(flagType)) {
return new SimpleFlagParsingStrategy();
} else if (LONG_FLAG_TYPE.equals(flagType)) {
return new DoubleASCIIFlagParsingStrategy();
}
throw new IllegalArgumentException("Unknown flag type: " + flagType);
}
/**
* Reads the dictionary file through the provided InputStream, building up the words map
*
* @param dictionary InputStream to read the dictionary file through
* @param decoder CharsetDecoder used to decode the contents of the file
* @throws IOException Can be thrown while reading from the file
*/
private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder) throws IOException {
BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder));
// TODO: don't create millions of strings.
String line = reader.readLine(); // first line is number of entries
int numEntries = Integer.parseInt(line);
// TODO: the flags themselves can be double-chars (long) or also numeric
// either way the trick is to encode them as char... but they must be parsed differently
while ((line = reader.readLine()) != null) {
String entry;
HunspellWord wordForm;
int flagSep = line.lastIndexOf('/');
if (flagSep == -1) {
wordForm = NOFLAGS;
entry = line;
} else {
// note, there can be comments (morph description) after a flag.
// we should really look for any whitespace
int end = line.indexOf('\t', flagSep);
if (end == -1)
end = line.length();
String flagPart = line.substring(flagSep + 1, end);
if (aliasCount > 0) {
flagPart = getAliasValue(Integer.parseInt(flagPart));
}
wordForm = new HunspellWord(flagParsingStrategy.parseFlags(flagPart));
Arrays.sort(wordForm.getFlags());
entry = line.substring(0, flagSep);
}
if(ignoreCase) {
entry = entry.toLowerCase(Locale.ROOT);
}
List<HunspellWord> entries = new ArrayList<HunspellWord>();
entries.add(wordForm);
words.put(entry, entries);
}
}
public Version getVersion() {
return version;
}
private void parseAlias(String line) {
String ruleArgs[] = line.split("\\s+");
if (aliases == null) {
//first line should be the aliases count
final int count = Integer.parseInt(ruleArgs[1]);
aliases = new String[count];
} else {
aliases[aliasCount++] = ruleArgs[1];
}
}
private String getAliasValue(int id) {
try {
return aliases[id - 1];
} catch (IndexOutOfBoundsException ex) {
throw new IllegalArgumentException("Bad flag alias number:" + id, ex);
}
}
/**
* Abstraction of the process of parsing flags taken from the affix and dic files
*/
private static abstract class FlagParsingStrategy {
/**
* Parses the given String into a single flag
*
* @param rawFlag String to parse into a flag
* @return Parsed flag
*/
char parseFlag(String rawFlag) {
return parseFlags(rawFlag)[0];
}
/**
* Parses the given String into multiple flags
*
* @param rawFlags String to parse into flags
* @return Parsed flags
*/
abstract char[] parseFlags(String rawFlags);
}
/**
* Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as individual flags.
* Can be used with both the ASCII and UTF-8 flag types.
*/
private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {
/**
* {@inheritDoc}
*/
@Override
public char[] parseFlags(String rawFlags) {
return rawFlags.toCharArray();
}
}
/**
* Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form. In the case
* of multiple flags, each number is separated by a comma.
*/
private static class NumFlagParsingStrategy extends FlagParsingStrategy {
/**
* {@inheritDoc}
*/
@Override
public char[] parseFlags(String rawFlags) {
String[] rawFlagParts = rawFlags.trim().split(",");
char[] flags = new char[rawFlagParts.length];
for (int i = 0; i < rawFlagParts.length; i++) {
// note, removing the trailing X/leading I for Nepali... what is the rule here?!
flags[i] = (char) Integer.parseInt(rawFlagParts[i].replaceAll("[^0-9]", ""));
}
return flags;
}
}
/**
* Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes
* must be combined into a single character.
*
* TODO (rmuir) test
*/
private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy {
/**
* {@inheritDoc}
*/
@Override
public char[] parseFlags(String rawFlags) {
if (rawFlags.length() == 0) {
return new char[0];
}
StringBuilder builder = new StringBuilder();
for (int i = 0; i < rawFlags.length(); i+=2) {
char cookedFlag = (char) ((int) rawFlags.charAt(i) + (int) rawFlags.charAt(i + 1));
builder.append(cookedFlag);
}
char flags[] = new char[builder.length()];
builder.getChars(0, builder.length(), flags, 0);
return flags;
}
}
public boolean isIgnoreCase() {
return ignoreCase;
}
}
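The three FLAG encodings handled by the strategies above can be summarized with a short sketch (inputs invented for illustration, and the numeric case simplified; the 'long' case mirrors DoubleASCIIFlagParsingStrategy's char-code sum):

public class FlagEncodingSketch {
  public static void main(String[] args) {
    // Default / UTF-8 flags: each char is one flag.
    char[] simple = "AB".toCharArray(); // flags 'A' and 'B'

    // FLAG num: comma-separated numbers, each stored as a char.
    String[] parts = "65,66".split(",");
    char[] numeric = new char[parts.length];
    for (int i = 0; i < parts.length; i++) {
      numeric[i] = (char) Integer.parseInt(parts[i]); // 65 -> 'A', 66 -> 'B'
    }

    // FLAG long: two ASCII chars folded into one char by summing code points,
    // exactly as the strategy above does.
    char combined = (char) ('z' + 'x');

    System.out.println(simple[0] + " " + numeric[0] + " " + (int) combined);
  }
}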

View File

@@ -18,14 +18,16 @@ package org.apache.lucene.analysis.hunspell;
*/
import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.hunspell.HunspellStemmer.Stem;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.CharsRef;
/**
* TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple
@@ -41,71 +43,83 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
* {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory}
* </p>
*
*
* @lucene.experimental
*/
public final class HunspellStemFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
private final HunspellStemmer stemmer;
private final Stemmer stemmer;
private List<Stem> buffer;
private List<CharsRef> buffer;
private State savedState;
private final boolean dedup;
private final boolean longestOnly;
/** Create a {@link HunspellStemFilter} which deduplicates stems and has a maximum
* recursion level of 2.
* @see #HunspellStemFilter(TokenStream, HunspellDictionary, int) */
public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary) {
* @see #HunspellStemFilter(TokenStream, Dictionary, int) */
public HunspellStemFilter(TokenStream input, Dictionary dictionary) {
this(input, dictionary, 2);
}
/**
* Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
* HunspellDictionary
* Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
* Dictionary
*
* @param input TokenStream whose tokens will be stemmed
* @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
* @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
*/
public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, int recursionCap) {
public HunspellStemFilter(TokenStream input, Dictionary dictionary, int recursionCap) {
this(input, dictionary, true, recursionCap);
}
/** Create a {@link HunspellStemFilter} which has a maximum recursion level of 2.
* @see #HunspellStemFilter(TokenStream, HunspellDictionary, boolean, int) */
public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup) {
* @see #HunspellStemFilter(TokenStream, Dictionary, boolean, int) */
public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup) {
this(input, dictionary, dedup, 2);
}
/**
* Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
* HunspellDictionary
* Dictionary
*
* @param input TokenStream whose tokens will be stemmed
* @param dictionary Dictionary containing the affix rules and words that will be used to stem the tokens
* @param dedup true if only unique terms should be output.
* @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
*/
public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup, int recursionCap) {
super(input);
this.dedup = dedup;
this.stemmer = new HunspellStemmer(dictionary, recursionCap);
public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap) {
this(input, dictionary, dedup, recursionCap, false);
}
/**
* {@inheritDoc}
* Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
* Dictionary
*
* @param input TokenStream whose tokens will be stemmed
* @param dictionary Dictionary containing the affix rules and words that will be used to stem the tokens
* @param dedup true if only unique terms should be output.
* @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
* @param longestOnly true if only the longest term should be output.
*/
public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap, boolean longestOnly) {
super(input);
this.dedup = dedup && longestOnly == false; // don't waste time deduping if longestOnly is set
this.stemmer = new Stemmer(dictionary, recursionCap);
this.longestOnly = longestOnly;
}
@Override
public boolean incrementToken() throws IOException {
if (buffer != null && !buffer.isEmpty()) {
Stem nextStem = buffer.remove(0);
CharsRef nextStem = buffer.remove(0);
restoreState(savedState);
posIncAtt.setPositionIncrement(0);
termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength());
termAtt.setLength(nextStem.getStemLength());
termAtt.setEmpty().append(nextStem);
return true;
}
@@ -122,24 +136,41 @@ public final class HunspellStemFilter extends TokenFilter {
if (buffer.isEmpty()) { // we do not know this word, return it unchanged
return true;
}
if (longestOnly && buffer.size() > 1) {
Collections.sort(buffer, lengthComparator);
}
Stem stem = buffer.remove(0);
termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength());
termAtt.setLength(stem.getStemLength());
CharsRef stem = buffer.remove(0);
termAtt.setEmpty().append(stem);
if (!buffer.isEmpty()) {
savedState = captureState();
if (longestOnly) {
buffer.clear();
} else {
if (!buffer.isEmpty()) {
savedState = captureState();
}
}
return true;
}
/**
* {@inheritDoc}
*/
@Override
public void reset() throws IOException {
super.reset();
buffer = null;
}
static final Comparator<CharsRef> lengthComparator = new Comparator<CharsRef>() {
@Override
public int compare(CharsRef o1, CharsRef o2) {
int cmp = Integer.compare(o2.length, o1.length);
if (cmp == 0) {
// tie break on text
return o2.compareTo(o1);
} else {
return cmp;
}
}
};
}
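A hedged sanity check of the longestOnly ordering (the comparator is re-declared locally here because the field above is package-private): the longest stem sorts first, with ties broken by reversed text order for determinism.

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.util.CharsRef;

public class LongestOnlySketch {
  public static void main(String[] args) {
    Comparator<CharsRef> byLengthDesc = new Comparator<CharsRef>() {
      @Override
      public int compare(CharsRef o1, CharsRef o2) {
        int cmp = Integer.compare(o2.length, o1.length); // longer first
        return cmp == 0 ? o2.compareTo(o1) : cmp;        // deterministic tie-break
      }
    };
    List<CharsRef> stems = new ArrayList<CharsRef>();
    stems.add(new CharsRef("walk"));
    stems.add(new CharsRef("walking"));
    Collections.sort(stems, byLengthDesc);
    System.out.println(stems.get(0)); // walking -- the stem longestOnly keeps
  }
}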

View File

@@ -31,89 +31,75 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.util.IOUtils;
/**
* TokenFilterFactory that creates instances of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}.
* Example config for British English including a custom dictionary, case insensitive matching:
* TokenFilterFactory that creates instances of {@link HunspellStemFilter}.
* Example config for British English:
* <pre class="prettyprint">
* &lt;filter class=&quot;solr.HunspellStemFilterFactory&quot;
* dictionary=&quot;en_GB.dic,my_custom.dic&quot;
* affix=&quot;en_GB.aff&quot;
* ignoreCase=&quot;true&quot; /&gt;</pre>
* dictionary=&quot;en_GB.dic,my_custom.dic&quot;
* affix=&quot;en_GB.aff&quot;
* ignoreCase=&quot;false&quot;
* longestOnly=&quot;false&quot; /&gt;</pre>
* Both parameters dictionary and affix are mandatory.
* <br/>
* The parameter ignoreCase (true/false) controls whether matching is case sensitive or not. Default false.
* <br/>
* The parameter strictAffixParsing (true/false) controls whether the affix parsing is strict or not. Default true.
* If strict, an error while reading an affix rule causes a ParseException; otherwise it is ignored.
* <br/>
* Dictionaries for many languages are available through the OpenOffice project.
*
* See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
* @lucene.experimental
*/
public class HunspellStemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
private static final String PARAM_DICTIONARY = "dictionary";
private static final String PARAM_AFFIX = "affix";
private static final String PARAM_IGNORE_CASE = "ignoreCase";
private static final String PARAM_STRICT_AFFIX_PARSING = "strictAffixParsing";
private static final String PARAM_DICTIONARY = "dictionary";
private static final String PARAM_AFFIX = "affix";
private static final String PARAM_RECURSION_CAP = "recursionCap";
private static final String PARAM_IGNORE_CASE = "ignoreCase";
private static final String PARAM_LONGEST_ONLY = "longestOnly";
private final String dictionaryArg;
private final String dictionaryFiles;
private final String affixFile;
private final boolean ignoreCase;
private final boolean strictAffixParsing;
private HunspellDictionary dictionary;
private final boolean longestOnly;
private Dictionary dictionary;
private int recursionCap;
/** Creates a new HunspellStemFilterFactory */
public HunspellStemFilterFactory(Map<String,String> args) {
super(args);
assureMatchVersion();
dictionaryArg = require(args, PARAM_DICTIONARY);
dictionaryFiles = require(args, PARAM_DICTIONARY);
affixFile = get(args, PARAM_AFFIX);
ignoreCase = getBoolean(args, PARAM_IGNORE_CASE, false);
strictAffixParsing = getBoolean(args, PARAM_STRICT_AFFIX_PARSING, true);
recursionCap = getInt(args, PARAM_RECURSION_CAP, 2);
longestOnly = getBoolean(args, PARAM_LONGEST_ONLY, false);
// this isn't necessary: we properly load all dictionaries.
// but recognize and ignore for back compat
getBoolean(args, "strictAffixParsing", true);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
/**
* Loads the hunspell dictionary and affix files defined in the configuration
*
* @param loader ResourceLoader used to load the files
*/
@Override
public void inform(ResourceLoader loader) throws IOException {
String dictionaryFiles[] = dictionaryArg.split(",");
String dicts[] = dictionaryFiles.split(",");
InputStream affix = null;
List<InputStream> dictionaries = new ArrayList<InputStream>();
try {
dictionaries = new ArrayList<InputStream>();
for (String file : dictionaryFiles) {
for (String file : dicts) {
dictionaries.add(loader.openResource(file));
}
affix = loader.openResource(affixFile);
this.dictionary = new HunspellDictionary(affix, dictionaries, luceneMatchVersion, ignoreCase, strictAffixParsing);
this.dictionary = new Dictionary(affix, dictionaries, ignoreCase);
} catch (ParseException e) {
throw new IOException("Unable to load hunspell data! [dictionary=" + dictionaryArg + ",affix=" + affixFile + "]", e);
throw new IOException("Unable to load hunspell data! [dictionary=" + dictionaries + ",affix=" + affixFile + "]", e);
} finally {
IOUtils.closeWhileHandlingException(affix);
IOUtils.closeWhileHandlingException(dictionaries);
}
}
/**
* Creates an instance of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter} that will filter the given
* TokenStream
*
* @param tokenStream TokenStream that will be filtered
* @return HunspellStemFilter that filters the TokenStream
*/
@Override
public TokenStream create(TokenStream tokenStream) {
return new HunspellStemFilter(tokenStream, dictionary, recursionCap);
return new HunspellStemFilter(tokenStream, dictionary, true, recursionCap, longestOnly);
}
}
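For completeness, a hedged sketch of driving the updated factory programmatically (the resource names and the luceneMatchVersion value are assumptions; in Solr this is normally wired up through schema configuration instead):

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.hunspell.HunspellStemFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;

public class FactorySketch {
  public static TokenStream stem(TokenStream in) throws IOException {
    Map<String,String> args = new HashMap<String,String>();
    args.put("luceneMatchVersion", "4.7");             // assumed version string
    args.put("dictionary", "en_GB.dic,my_custom.dic"); // hypothetical resources
    args.put("affix", "en_GB.aff");
    args.put("longestOnly", "true");
    HunspellStemFilterFactory factory = new HunspellStemFilterFactory(args);
    factory.inform(new ClasspathResourceLoader(FactorySketch.class));
    return factory.create(in);
  }
}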

View File

@@ -1,392 +0,0 @@
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.util.Version;
/**
* HunspellStemmer uses the affix rules declared in the HunspellDictionary to generate one or more stems for a word. It
* conforms to the algorithm of the original hunspell implementation, including recursive suffix stripping.
*/
public class HunspellStemmer {
private final int recursionCap;
private final HunspellDictionary dictionary;
private final StringBuilder segment = new StringBuilder();
private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_CURRENT);
/**
* Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems. Uses the
* default recursion cap of <code>2</code> (based on Hunspell documentation).
*
* @param dictionary HunspellDictionary that will be used to create the stems
*/
public HunspellStemmer(HunspellDictionary dictionary) {
this(dictionary, 2);
}
/**
* Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
*
* @param dictionary HunspellDictionary that will be used to create the stems
* @param recursionCap maximum level of recursion stemmer can go into
*/
public HunspellStemmer(HunspellDictionary dictionary, int recursionCap) {
this.dictionary = dictionary;
this.recursionCap = recursionCap;
}
/**
* Find the stem(s) of the provided word
*
* @param word Word to find the stems for
* @return List of stems for the word
*/
public List<Stem> stem(String word) {
return stem(word.toCharArray(), word.length());
}
/**
* Find the stem(s) of the provided word
*
* @param word Word to find the stems for
* @return List of stems for the word
*/
public List<Stem> stem(char word[], int length) {
List<Stem> stems = new ArrayList<Stem>();
if (dictionary.lookupWord(word, 0, length) != null) {
stems.add(new Stem(word, length));
}
stems.addAll(stem(word, length, null, 0));
return stems;
}
/**
* Find the unique stem(s) of the provided word
*
* @param word Word to find the stems for
* @return List of stems for the word
*/
public List<Stem> uniqueStems(char word[], int length) {
List<Stem> stems = new ArrayList<Stem>();
CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, dictionary.isIgnoreCase());
if (dictionary.lookupWord(word, 0, length) != null) {
stems.add(new Stem(word, length));
terms.add(word);
}
List<Stem> otherStems = stem(word, length, null, 0);
for (Stem s : otherStems) {
if (!terms.contains(s.stem)) {
stems.add(s);
terms.add(s.stem);
}
}
return stems;
}
// ================================================= Helper Methods ================================================
/**
* Generates a list of stems for the provided word
*
* @param word Word to generate the stems for
* @param flags Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step
* @param recursionDepth Level of recursion this stemming step is at
* @return List of stems, or an empty list if no stems are found
*/
private List<Stem> stem(char word[], int length, char[] flags, int recursionDepth) {
List<Stem> stems = new ArrayList<Stem>();
for (int i = 0; i < length; i++) {
List<HunspellAffix> suffixes = dictionary.lookupSuffix(word, i, length - i);
if (suffixes == null) {
continue;
}
for (HunspellAffix suffix : suffixes) {
if (hasCrossCheckedFlag(suffix.getFlag(), flags)) {
int deAffixedLength = length - suffix.getAppend().length();
// TODO: can we do this in-place?
String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString();
List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
for (Stem stem : stemList) {
stem.addSuffix(suffix);
}
stems.addAll(stemList);
}
}
}
for (int i = length - 1; i >= 0; i--) {
List<HunspellAffix> prefixes = dictionary.lookupPrefix(word, 0, i);
if (prefixes == null) {
continue;
}
for (HunspellAffix prefix : prefixes) {
if (hasCrossCheckedFlag(prefix.getFlag(), flags)) {
int deAffixedStart = prefix.getAppend().length();
int deAffixedLength = length - deAffixedStart;
String strippedWord = new StringBuilder().append(prefix.getStrip())
.append(word, deAffixedStart, deAffixedLength)
.toString();
List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
for (Stem stem : stemList) {
stem.addPrefix(prefix);
}
stems.addAll(stemList);
}
}
}
return stems;
}
/**
* Applies the affix rule to the given word, producing a list of stems if any are found
*
* @param strippedWord Word with the affix removed and the strip added
* @param affix HunspellAffix representing the affix rule itself
* @param recursionDepth Level of recursion this stemming step is at
* @return List of stems for the word, or an empty list if none are found
*/
@SuppressWarnings("unchecked")
public List<Stem> applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) {
if(dictionary.isIgnoreCase()) {
charUtils.toLowerCase(strippedWord, 0, strippedWord.length);
}
segment.setLength(0);
segment.append(strippedWord, 0, length);
if (!affix.checkCondition(segment)) {
return Collections.EMPTY_LIST;
}
List<Stem> stems = new ArrayList<Stem>();
List<HunspellWord> words = dictionary.lookupWord(strippedWord, 0, length);
if (words != null) {
for (HunspellWord hunspellWord : words) {
if (hunspellWord.hasFlag(affix.getFlag())) {
stems.add(new Stem(strippedWord, length));
}
}
}
if (affix.isCrossProduct() && recursionDepth < recursionCap) {
stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), ++recursionDepth));
}
return stems;
}
/**
* Checks if the given flag cross checks with the given array of flags
*
* @param flag Flag to cross check with the array of flags
* @param flags Array of flags to cross check against. Can be {@code null}
* @return {@code true} if the flag is found in the array or the array is {@code null}, {@code false} otherwise
*/
private boolean hasCrossCheckedFlag(char flag, char[] flags) {
return flags == null || Arrays.binarySearch(flags, flag) >= 0;
}
/**
* Stem represents all information known about a stem of a word. This includes the stem, and the prefixes and suffixes
* that were used to change the word into the stem.
*/
public static class Stem {
private final List<HunspellAffix> prefixes = new ArrayList<HunspellAffix>();
private final List<HunspellAffix> suffixes = new ArrayList<HunspellAffix>();
private final char stem[];
private final int stemLength;
/**
* Creates a new Stem wrapping the given word stem
*
* @param stem Stem of a word
*/
public Stem(char stem[], int stemLength) {
this.stem = stem;
this.stemLength = stemLength;
}
/**
* Adds a prefix to the list of prefixes used to generate this stem. Because it is assumed that prefixes are added
* depth first, the prefix is added to the front of the list
*
* @param prefix Prefix to add to the list of prefixes for this stem
*/
public void addPrefix(HunspellAffix prefix) {
prefixes.add(0, prefix);
}
/**
* Adds a suffix to the list of suffixes used to generate this stem. Because it is assumed that suffixes are added
* depth first, the suffix is added to the end of the list
*
* @param suffix Suffix to add to the list of suffixes for this stem
*/
public void addSuffix(HunspellAffix suffix) {
suffixes.add(suffix);
}
/**
* Returns the list of prefixes used to generate the stem
*
* @return List of prefixes used to generate the stem or an empty list if no prefixes were required
*/
public List<HunspellAffix> getPrefixes() {
return prefixes;
}
/**
* Returns the list of suffixes used to generate the stem
*
* @return List of suffixes used to generate the stem or an empty list if no suffixes were required
*/
public List<HunspellAffix> getSuffixes() {
return suffixes;
}
/**
* Returns the actual word stem itself
*
* @return Word stem itself
*/
public char[] getStem() {
return stem;
}
/**
* @return the stemLength
*/
public int getStemLength() {
return stemLength;
}
public String getStemString() {
return new String(stem, 0, stemLength);
}
}
// ================================================= Entry Point ===================================================
/*
* HunspellStemmer entry point. Accepts two arguments: location of affix file and location of dic file
*
* @param args Program arguments. Should contain location of affix file and location of dic file
* @throws IOException Can be thrown while reading from the files
* @throws ParseException Can be thrown while parsing the files
public static void main(String[] args) throws IOException, ParseException {
boolean ignoreCase = false;
int offset = 0;
if (args.length < 2) {
System.out.println("usage: HunspellStemmer [-i] <affix location> <dic location>");
System.exit(1);
}
if(args[offset].equals("-i")) {
ignoreCase = true;
System.out.println("Ignoring case. All stems will be returned lowercased");
offset++;
}
InputStream affixInputStream = new FileInputStream(args[offset++]);
InputStream dicInputStream = new FileInputStream(args[offset++]);
// :Post-Release-Update-Version.LUCENE_XY:
HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_50, ignoreCase);
affixInputStream.close();
dicInputStream.close();
HunspellStemmer stemmer = new HunspellStemmer(dictionary);
Scanner scanner = new Scanner(System.in, Charset.defaultCharset().name());
System.out.print("> ");
while (scanner.hasNextLine()) {
String word = scanner.nextLine();
if ("exit".equals(word)) {
break;
}
printStemResults(word, stemmer.stem(word.toCharArray(), word.length()));
System.out.print("> ");
}
}
* Prints the results of the stemming of a word
*
* @param originalWord Word that has been stemmed
* @param stems Stems of the word
private static void printStemResults(String originalWord, List<Stem> stems) {
StringBuilder builder = new StringBuilder().append("stem(").append(originalWord).append(")").append("\n");
for (Stem stem : stems) {
builder.append("- ").append(stem.getStem()).append(": ");
for (HunspellAffix prefix : stem.getPrefixes()) {
builder.append(prefix.getAppend()).append("+");
if (hasText(prefix.getStrip())) {
builder.append(prefix.getStrip()).append("-");
}
}
builder.append(stem.getStem());
for (HunspellAffix suffix : stem.getSuffixes()) {
if (hasText(suffix.getStrip())) {
builder.append("-").append(suffix.getStrip());
}
builder.append("+").append(suffix.getAppend());
}
builder.append("\n");
}
System.out.println(builder);
}
* Simple utility to check if the given String has any text
*
* @param str String to check if it has any text
* @return {@code true} if the String has text, {@code false} otherwise
private static boolean hasText(String str) {
return str != null && str.length() > 0;
}
*/
}

View File

@@ -1,63 +0,0 @@
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Arrays;
/**
* A dictionary (.dic) entry with its associated flags.
*/
public class HunspellWord {
private final char flags[]; // sorted, can we represent more concisely?
/**
* Creates a new HunspellWord with no associated flags
*/
public HunspellWord() {
flags = null;
}
/**
* Constructs a new HunspellWord with the given flags
*
* @param flags Flags to associate with the word
*/
public HunspellWord(char[] flags) {
this.flags = flags;
}
/**
* Checks whether the word has the given flag associated with it
*
* @param flag Flag to check whether it is associated with the word
* @return {@code true} if the flag is associated, {@code false} otherwise
*/
public boolean hasFlag(char flag) {
return flags != null && Arrays.binarySearch(flags, flag) >= 0;
}
/**
* Returns the flags associated with the word
*
* @return Flags associated with the word
*/
public char[] getFlags() {
return flags;
}
}
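Worth noting: hasFlag relies on Arrays.binarySearch, so the flags handed to the constructor must already be sorted, which is why the parsing code above always calls Arrays.sort first. A quick sketch of that contract (class name ours):

import java.util.Arrays;

public class HunspellWordSketch {
  public static void main(String[] args) {
    char[] flags = "CAB".toCharArray();
    Arrays.sort(flags);                    // required: hasFlag uses binarySearch
    HunspellWord word = new HunspellWord(flags);
    System.out.println(word.hasFlag('B')); // true only because flags are sorted
  }
}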

View File

@@ -1,4 +1,4 @@
package org.apache.lucene.analysis.hunspell2;
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more

View File

@@ -1,4 +1,4 @@
package org.apache.lucene.analysis.hunspell2;
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -24,6 +24,7 @@ import java.util.List;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
@@ -37,9 +38,10 @@ import org.apache.lucene.util.Version;
final class Stemmer {
private final int recursionCap;
private final Dictionary dictionary;
private BytesRef scratch = new BytesRef();
private final BytesRef scratch = new BytesRef();
private final StringBuilder segment = new StringBuilder();
private final ByteArrayDataInput affixReader;
private final CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_CURRENT);
/**
* Constructs a new Stemmer which will use the provided Dictionary to create its stems. Uses the
@@ -80,6 +82,9 @@ final class Stemmer {
* @return List of stems for the word
*/
public List<CharsRef> stem(char word[], int length) {
if (dictionary.ignoreCase) {
charUtils.toLowerCase(word, 0, length);
}
List<CharsRef> stems = new ArrayList<CharsRef>();
if (dictionary.lookupWord(word, 0, length, scratch) != null) {
stems.add(new CharsRef(word, 0, length));
@@ -95,20 +100,19 @@
* @return List of stems for the word
*/
public List<CharsRef> uniqueStems(char word[], int length) {
List<CharsRef> stems = new ArrayList<CharsRef>();
CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, false);
if (dictionary.lookupWord(word, 0, length, scratch) != null) {
stems.add(new CharsRef(word, 0, length));
terms.add(word);
List<CharsRef> stems = stem(word, length);
if (stems.size() < 2) {
return stems;
}
List<CharsRef> otherStems = stem(word, length, Dictionary.NOFLAGS, 0);
for (CharsRef s : otherStems) {
CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, dictionary.ignoreCase);
List<CharsRef> deduped = new ArrayList<>();
for (CharsRef s : stems) {
if (!terms.contains(s)) {
stems.add(s);
deduped.add(s);
terms.add(s);
}
}
return stems;
return deduped;
}
// ================================================= Helper Methods ================================================
@@ -188,7 +192,7 @@
* @param recursionDepth Level of recursion this stemming step is at
* @return List of stems for the word, or an empty list if none are found
*/
public List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int recursionDepth) {
List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int recursionDepth) {
segment.setLength(0);
segment.append(strippedWord, 0, length);

View File

@ -1,137 +0,0 @@
package org.apache.lucene.analysis.hunspell2;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.List;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.CharsRef;
/**
* TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple
 * stems, this filter can emit multiple tokens for each consumed token.
*
* <p>
* Note: This filter is aware of the {@link KeywordAttribute}. To prevent
* certain terms from being passed to the stemmer
* {@link KeywordAttribute#isKeyword()} should be set to <code>true</code>
* in a previous {@link TokenStream}.
*
* Note: For including the original term as well as the stemmed version, see
* {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory}
* </p>
*
* @lucene.experimental
*/
public final class Hunspell2StemFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
private final Stemmer stemmer;
private List<CharsRef> buffer;
private State savedState;
private final boolean dedup;
/** Create a {@link Hunspell2StemFilter} which deduplicates stems and has a maximum
* recursion level of 2.
* @see #Hunspell2StemFilter(TokenStream, Dictionary, int) */
public Hunspell2StemFilter(TokenStream input, Dictionary dictionary) {
this(input, dictionary, 2);
}
/**
* Creates a new Hunspell2StemFilter that will stem tokens from the given TokenStream using affix rules in the provided
* Dictionary
*
* @param input TokenStream whose tokens will be stemmed
 * @param dictionary Dictionary containing the affix rules and words that will be used to stem the tokens
* @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
*/
public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, int recursionCap) {
this(input, dictionary, true, recursionCap);
}
/** Create a {@link Hunspell2StemFilter} which has a maximum recursion level of 2.
* @see #Hunspell2StemFilter(TokenStream, Dictionary, boolean, int) */
public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup) {
this(input, dictionary, dedup, 2);
}
/**
* Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
* Dictionary
*
* @param input TokenStream whose tokens will be stemmed
 * @param dictionary Dictionary containing the affix rules and words that will be used to stem the tokens
* @param dedup true if only unique terms should be output.
* @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
*/
public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap) {
super(input);
this.dedup = dedup;
this.stemmer = new Stemmer(dictionary, recursionCap);
}
@Override
public boolean incrementToken() throws IOException {
if (buffer != null && !buffer.isEmpty()) {
CharsRef nextStem = buffer.remove(0);
restoreState(savedState);
posIncAtt.setPositionIncrement(0);
termAtt.setEmpty().append(nextStem);
return true;
}
if (!input.incrementToken()) {
return false;
}
if (keywordAtt.isKeyword()) {
return true;
}
buffer = dedup ? stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length());
if (buffer.isEmpty()) { // we do not know this word, return it unchanged
return true;
}
CharsRef stem = buffer.remove(0);
termAtt.setEmpty().append(stem);
if (!buffer.isEmpty()) {
savedState = captureState();
}
return true;
}
@Override
public void reset() throws IOException {
super.reset();
buffer = null;
}
}
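To make the position handling concrete: stems after the first are replayed from the saved state with a position increment of 0, i.e. stacked on the original token. A hedged consumption sketch (tokenizer construction elided; any Tokenizer over the input works):
TokenStream ts = new Hunspell2StemFilter(tokenizer, dictionary);  // dedup on, recursionCap = 2
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  // for "lucene": prints "lucene" (posInc=1) then "lucen" (posInc=0)
  System.out.println(termAtt + " (posInc=" + posIncAtt.getPositionIncrement() + ")");
}
ts.end();
ts.close();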

View File

@ -1,80 +0,0 @@
package org.apache.lucene.analysis.hunspell2;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
/**
* TokenFilterFactory that creates instances of {@link Hunspell2StemFilter}.
* Example config for British English:
* <pre class="prettyprint">
* &lt;filter class=&quot;solr.Hunspell2StemFilterFactory&quot;
* dictionary=&quot;en_GB.dic&quot;
* affix=&quot;en_GB.aff&quot; /&gt;</pre>
* Both parameters dictionary and affix are mandatory.
* Dictionaries for many languages are available through the OpenOffice project.
*
* See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
* @lucene.experimental
*/
public class Hunspell2StemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
private static final String PARAM_DICTIONARY = "dictionary";
private static final String PARAM_AFFIX = "affix";
private static final String PARAM_RECURSION_CAP = "recursionCap";
private final String dictionaryFile;
private final String affixFile;
private Dictionary dictionary;
private int recursionCap;
/** Creates a new Hunspell2StemFilterFactory */
public Hunspell2StemFilterFactory(Map<String,String> args) {
super(args);
dictionaryFile = require(args, PARAM_DICTIONARY);
affixFile = require(args, PARAM_AFFIX); // documented as mandatory and always opened in inform()
recursionCap = getInt(args, PARAM_RECURSION_CAP, 2);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
@Override
public void inform(ResourceLoader loader) throws IOException {
try (InputStream affix = loader.openResource(affixFile);
InputStream dictionary = loader.openResource(dictionaryFile)) {
try {
this.dictionary = new Dictionary(affix, dictionary);
} catch (ParseException e) {
throw new RuntimeException(e);
}
}
}
@Override
public TokenStream create(TokenStream tokenStream) {
return new Hunspell2StemFilter(tokenStream, dictionary, recursionCap);
}
}
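Because the factory is ResourceLoaderAware, it runs in two phases: construct it with the argument map (which it validates and consumes), call inform() with a ResourceLoader so it can open the .dic/.aff resources, then create() wraps token streams. A hedged standalone sketch (ClasspathResourceLoader is the stock classpath loader in org.apache.lucene.analysis.util; resource names and SomeClass are assumptions):
Map<String,String> args = new HashMap<>();   // java.util.Map / java.util.HashMap
args.put("dictionary", "en_GB.dic");
args.put("affix", "en_GB.aff");
Hunspell2StemFilterFactory factory = new Hunspell2StemFilterFactory(args);
factory.inform(new ClasspathResourceLoader(SomeClass.class)); // any class whose loader sees the files
TokenStream stems = factory.create(tokenizer);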

View File

@ -1,26 +0,0 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<body>
Stemming TokenFilter using a Java implementation of the <a href="http://www.ldc.upenn.edu/Catalog/docs/LDC2008T01/acta04.pdf">
Hunspell stemming algorithm</a>.
<p>
Dictionaries can be found on <a href="http://wiki.services.openoffice.org/wiki/Dictionaries">
OpenOffice's wiki</a>.
</p>
</body>
</html>

View File

@ -51,7 +51,6 @@ org.apache.lucene.analysis.hi.HindiNormalizationFilterFactory
org.apache.lucene.analysis.hi.HindiStemFilterFactory
org.apache.lucene.analysis.hu.HungarianLightStemFilterFactory
org.apache.lucene.analysis.hunspell.HunspellStemFilterFactory
org.apache.lucene.analysis.hunspell2.Hunspell2StemFilterFactory
org.apache.lucene.analysis.id.IndonesianStemFilterFactory
org.apache.lucene.analysis.in.IndicNormalizationFilterFactory
org.apache.lucene.analysis.it.ItalianLightStemFilterFactory

View File

@ -62,8 +62,8 @@ import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.hunspell.HunspellDictionary;
import org.apache.lucene.analysis.hunspell.HunspellDictionaryTest;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.TestHunspellStemFilter;
import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter;
@ -406,13 +406,13 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
return new IdentityEncoder(); // the other encoders will throw exceptions if tokens aren't numbers?
}
});
put(HunspellDictionary.class, new ArgProducer() {
put(Dictionary.class, new ArgProducer() {
@Override public Object create(Random random) {
// TODO: make nastier
InputStream affixStream = HunspellDictionaryTest.class.getResourceAsStream("test.aff");
InputStream dictStream = HunspellDictionaryTest.class.getResourceAsStream("test.dic");
InputStream affixStream = TestHunspellStemFilter.class.getResourceAsStream("simple.aff");
InputStream dictStream = TestHunspellStemFilter.class.getResourceAsStream("simple.dic");
try {
return new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
return new Dictionary(affixStream, dictStream);
} catch (Exception ex) {
Rethrow.rethrow(ex);
return null; // unreachable code

View File

@ -1,201 +0,0 @@
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Assert;
import org.junit.Test;
public class HunspellDictionaryTest extends LuceneTestCase {
private class CloseCheckInputStream extends InputStream {
private InputStream delegate;
private boolean closed = false;
public CloseCheckInputStream(InputStream delegate) {
super();
this.delegate = delegate;
}
@Override
public int read() throws IOException {
return delegate.read();
}
@Override
public int hashCode() {
return delegate.hashCode();
}
@Override
public int read(byte[] b) throws IOException {
return delegate.read(b);
}
@Override
public boolean equals(Object obj) {
return delegate.equals(obj);
}
@Override
public int read(byte[] b, int off, int len) throws IOException {
return delegate.read(b, off, len);
}
@Override
public long skip(long n) throws IOException {
return delegate.skip(n);
}
@Override
public String toString() {
return delegate.toString();
}
@Override
public int available() throws IOException {
return delegate.available();
}
@Override
public void close() throws IOException {
this.closed = true;
delegate.close();
}
@Override
public void mark(int readlimit) {
delegate.mark(readlimit);
}
@Override
public void reset() throws IOException {
delegate.reset();
}
@Override
public boolean markSupported() {
return delegate.markSupported();
}
public boolean isClosed() {
return this.closed;
}
}
@Test
public void testResourceCleanup() throws IOException, ParseException {
CloseCheckInputStream affixStream = new CloseCheckInputStream(getClass().getResourceAsStream("testCompressed.aff"));
CloseCheckInputStream dictStream = new CloseCheckInputStream(getClass().getResourceAsStream("testCompressed.dic"));
new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
assertFalse(affixStream.isClosed());
assertFalse(dictStream.isClosed());
affixStream.close();
dictStream.close();
assertTrue(affixStream.isClosed());
assertTrue(dictStream.isClosed());
}
@Test
public void testHunspellDictionary_loadDicAff() throws IOException, ParseException {
InputStream affixStream = getClass().getResourceAsStream("test.aff");
InputStream dictStream = getClass().getResourceAsStream("test.dic");
HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
assertEquals("Wrong number of flags for lucen", 1, dictionary.lookupWord(new char[]{'l', 'u', 'c', 'e', 'n'}, 0, 5).get(0).getFlags().length);
affixStream.close();
dictStream.close();
}
@Test
public void testHunspellDictionary_multipleDictWithOverride() throws IOException, ParseException {
InputStream affixStream = getClass().getResourceAsStream("test.aff");
List<InputStream> dictStreams = new ArrayList<InputStream>();
dictStreams.add(getClass().getResourceAsStream("test.dic"));
dictStreams.add(getClass().getResourceAsStream("testOverride.dic"));
HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStreams, TEST_VERSION_CURRENT, false);
assertEquals("Wrong number of flags for lucen", 3, dictionary.lookupWord(new char[]{'l', 'u', 'c', 'e', 'n'}, 0, 5).get(0).getFlags().length);
assertEquals("Wrong number of flags for bar", 1, dictionary.lookupWord(new char[]{'b', 'a', 'r'}, 0, 3).get(0).getFlags().length);
affixStream.close();
for(InputStream dstream : dictStreams) {
dstream.close();
}
}
@Test
public void testCompressedHunspellDictionary_loadDicAff() throws IOException, ParseException {
InputStream affixStream = getClass().getResourceAsStream("testCompressed.aff");
InputStream dictStream = getClass().getResourceAsStream("testCompressed.dic");
HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
affixStream.close();
dictStream.close();
}
@Test
public void testHunspellDictionary_loadDicWrongAff() throws IOException, ParseException {
InputStream affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff");
InputStream dictStream = getClass().getResourceAsStream("test.dic");
HunspellDictionary dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT, false, false);
assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
//strict parsing disabled: malformed rule is not loaded
assertNull(dictionary.lookupPrefix(new char[]{'a'}, 0, 1));
affixStream.close();
dictStream.close();
affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff");
dictStream = getClass().getResourceAsStream("test.dic");
//strict parsing enabled: malformed rule causes ParseException
try {
dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT, false, true);
Assert.fail();
} catch(ParseException e) {
Assert.assertEquals("The affix file contains a rule with less than five elements", e.getMessage());
Assert.assertEquals(23, e.getErrorOffset());
}
affixStream.close();
dictStream.close();
}
}

View File

@ -1,92 +0,0 @@
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
public class HunspellStemFilterTest extends BaseTokenStreamTestCase {
private static HunspellDictionary DICTIONARY;
@BeforeClass
public static void beforeClass() throws IOException, ParseException {
DICTIONARY = createDict(true);
}
@AfterClass
public static void afterClass() {
DICTIONARY = null;
}
public static HunspellDictionary createDict(boolean ignoreCase) throws IOException, ParseException {
InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
return new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT, ignoreCase);
}
/**
* Simple test for KeywordAttribute
*/
public void testKeywordAttribute() throws IOException {
MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome");
tokenizer.setEnableChecks(true);
HunspellStemFilter filter = new HunspellStemFilter(tokenizer, DICTIONARY, TestUtil.nextInt(random(), 1, 3));
assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1});
// assert with keyword marker
tokenizer = whitespaceMockTokenizer("lucene is awesome");
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true);
filter = new HunspellStemFilter(new SetKeywordMarkerFilter(tokenizer, set), DICTIONARY, TestUtil.nextInt(random(), 1, 3));
assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, TestUtil.nextInt(random(), 1, 3)));
}
};
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
}
public void testEmptyTerm() throws IOException {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, TestUtil.nextInt(random(), 1, 3)));
}
};
checkOneTerm(a, "", "");
}
}

View File

@ -1,137 +0,0 @@
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Version;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.util.List;
import static junit.framework.Assert.assertEquals;
public class HunspellStemmerTest extends LuceneTestCase {
private static HunspellStemmer stemmer;
@BeforeClass
public static void beforeClass() throws IOException, ParseException {
createStemmer(true);
}
@AfterClass
public static void afterClass() {
stemmer = null;
}
@Test
public void testStem_simpleSuffix() {
List<HunspellStemmer.Stem> stems = stemmer.stem("lucene");
assertEquals(2, stems.size());
assertEquals("lucene", stems.get(0).getStemString());
assertEquals("lucen", stems.get(1).getStemString());
stems = stemmer.stem("mahoute");
assertEquals(1, stems.size());
assertEquals("mahout", stems.get(0).getStemString());
}
@Test
public void testStem_simplePrefix() {
List<HunspellStemmer.Stem> stems = stemmer.stem("solr");
assertEquals(1, stems.size());
assertEquals("olr", stems.get(0).getStemString());
}
@Test
public void testStem_recursiveSuffix() {
List<HunspellStemmer.Stem> stems = stemmer.stem("abcd");
assertEquals(1, stems.size());
assertEquals("ab", stems.get(0).getStemString());
}
@Test
public void testStem_ignoreCase() throws IOException, ParseException {
List<HunspellStemmer.Stem> stems;
createStemmer(true);
stems = stemmer.stem("apache");
assertEquals(1, stems.size());
assertEquals("apach", stems.get(0).getStemString());
stems = stemmer.stem("APACHE");
assertEquals(1, stems.size());
assertEquals("apach", stems.get(0).getStemString());
stems = stemmer.stem("Apache");
assertEquals(1, stems.size());
assertEquals("apach", stems.get(0).getStemString());
stems = stemmer.stem("foos");
assertEquals(1, stems.size());
assertEquals("foo", stems.get(0).getStemString());
stems = stemmer.stem("mood");
assertEquals(1, stems.size());
assertEquals("moo", stems.get(0).getStemString());
stems = stemmer.stem("Foos");
assertEquals(1, stems.size());
assertEquals("foo", stems.get(0).getStemString());
// The "Foo" rule gets overridden by the "foo" rule, and we don't merge
stems = stemmer.stem("Food");
assertEquals(0, stems.size());
stems = stemmer.stem("Mood");
assertEquals(1, stems.size());
assertEquals("moo", stems.get(0).getStemString());
}
@Test
public void testStem_caseSensitive() throws IOException, ParseException {
createStemmer(false);
List<HunspellStemmer.Stem> stems = stemmer.stem("apache");
assertEquals(0, stems.size());
stems = stemmer.stem("Apache");
assertEquals(1, stems.size());
assertEquals("Apach", stems.get(0).getStemString());
}
private static void createStemmer(boolean ignoreCase) throws IOException, ParseException {
InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT, ignoreCase);
stemmer = new HunspellStemmer(dictionary);
affixStream.close();
dictStream.close();
}
}

View File

@ -1,4 +1,4 @@
package org.apache.lucene.analysis.hunspell2;
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -22,7 +22,7 @@ import java.io.InputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import org.apache.lucene.analysis.hunspell.HunspellDictionary;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.RamUsageEstimator;
@ -33,7 +33,7 @@ import org.junit.Ignore;
* wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/
* Note some of the files differ only in case. This may be a problem on your operating system!
*/
//@Ignore("enable manually")
@Ignore("enable manually")
public class TestAllDictionaries extends LuceneTestCase {
// set this to the location of where you downloaded all the files
@ -162,21 +162,11 @@ public class TestAllDictionaries extends LuceneTestCase {
assert dicEntry != null;
ZipEntry affEntry = zip.getEntry(tests[i+2]);
assert affEntry != null;
// get ram from previous impl
String oldRAM = "FAIL";
try (InputStream dictionary = zip.getInputStream(dicEntry);
InputStream affix = zip.getInputStream(affEntry)) {
try {
HunspellDictionary dic = new HunspellDictionary(affix, dictionary, TEST_VERSION_CURRENT);
oldRAM = RamUsageEstimator.humanSizeOf(dic);
} catch (Throwable t) {}
}
try (InputStream dictionary = zip.getInputStream(dicEntry);
InputStream affix = zip.getInputStream(affEntry)) {
Dictionary dic = new Dictionary(affix, dictionary);
System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
System.out.println(tests[i] + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
"words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
"flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
"strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " +
@ -204,7 +194,7 @@ public class TestAllDictionaries extends LuceneTestCase {
try (InputStream dictionary = zip.getInputStream(dicEntry);
InputStream affix = zip.getInputStream(affEntry)) {
Dictionary dic = new Dictionary(affix, dictionary);
new Dictionary(affix, dictionary);
}
}
}

View File

@ -0,0 +1,110 @@
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.Stemmer;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
public class TestCaseInsensitive extends LuceneTestCase {
private static Stemmer stemmer;
@BeforeClass
public static void beforeClass() throws Exception {
try (InputStream affixStream = TestCaseInsensitive.class.getResourceAsStream("simple.aff");
InputStream dictStream = TestCaseInsensitive.class.getResourceAsStream("mixedcase.dic")) {
Dictionary dictionary = new Dictionary(affixStream, Collections.singletonList(dictStream), true);
stemmer = new Stemmer(dictionary);
}
}
@AfterClass
public static void afterClass() {
stemmer = null;
}
public void testCaseInsensitivity() {
assertStemsTo("lucene", "lucene", "lucen");
assertStemsTo("LuCeNe", "lucene", "lucen");
assertStemsTo("mahoute", "mahout");
assertStemsTo("MaHoUte", "mahout");
}
public void testSimplePrefix() {
assertStemsTo("solr", "olr");
}
public void testRecursiveSuffix() {
assertStemsTo("abcd", "ab");
}
// all forms unmunched from dictionary
public void testAllStems() {
assertStemsTo("ab", "ab");
assertStemsTo("abc", "ab");
assertStemsTo("apach", "apach");
assertStemsTo("apache", "apach");
assertStemsTo("foo", "foo");
assertStemsTo("food", "foo");
assertStemsTo("foos", "foo");
assertStemsTo("lucen", "lucen");
assertStemsTo("lucene", "lucen", "lucene");
assertStemsTo("mahout", "mahout");
assertStemsTo("mahoute", "mahout");
assertStemsTo("moo", "moo");
assertStemsTo("mood", "moo");
assertStemsTo("olr", "olr");
assertStemsTo("solr", "olr");
}
// some bogus stuff that should not stem (empty lists)!
public void testBogusStems() {
assertStemsTo("abs");
assertStemsTo("abe");
assertStemsTo("sab");
assertStemsTo("sapach");
assertStemsTo("sapache");
assertStemsTo("apachee");
assertStemsTo("sfoo");
assertStemsTo("sfoos");
assertStemsTo("fooss");
assertStemsTo("lucenee");
assertStemsTo("solre");
}
private void assertStemsTo(String s, String... expected) {
Arrays.sort(expected);
List<CharsRef> stems = stemmer.stem(s);
String actual[] = new String[stems.size()];
for (int i = 0; i < actual.length; i++) {
actual[i] = stems.get(i).toString();
}
Arrays.sort(actual);
assertArrayEquals(expected, actual);
}
}

View File

@ -1,4 +1,4 @@
package org.apache.lucene.analysis.hunspell2;
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -22,6 +22,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;

View File

@ -1,4 +1,4 @@
package org.apache.lucene.analysis.hunspell2;
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -26,13 +26,15 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
public class TestHunspell2StemFilter extends BaseTokenStreamTestCase {
public class TestHunspellStemFilter extends BaseTokenStreamTestCase {
private static Dictionary dictionary;
@BeforeClass
@ -52,13 +54,21 @@ public class TestHunspell2StemFilter extends BaseTokenStreamTestCase {
public void testKeywordAttribute() throws IOException {
MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome");
tokenizer.setEnableChecks(true);
Hunspell2StemFilter filter = new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3));
HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3));
assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1});
// assert with keyword marker
tokenizer = whitespaceMockTokenizer("lucene is awesome");
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true);
filter = new Hunspell2StemFilter(new SetKeywordMarkerFilter(tokenizer, set), dictionary, TestUtil.nextInt(random(), 1, 3));
filter = new HunspellStemFilter(new SetKeywordMarkerFilter(tokenizer, set), dictionary, TestUtil.nextInt(random(), 1, 3));
assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
}
/** simple test for longestOnly option */
public void testLongestOnly() throws IOException {
MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome");
tokenizer.setEnableChecks(true);
HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, true, TestUtil.nextInt(random(), 1, 3), true);
assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
}
@ -68,7 +78,7 @@ public class TestHunspell2StemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
}
};
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@ -79,7 +89,7 @@ public class TestHunspell2StemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
}
};
checkOneTerm(a, "", "");

View File

@ -20,7 +20,6 @@ package org.apache.lucene.analysis.hunspell;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
@ -31,17 +30,17 @@ public class TestHunspellStemFilterFactory extends BaseTokenStreamFactoryTestCas
public void testStemming() throws Exception {
Reader reader = new StringReader("abc");
TokenStream stream = whitespaceMockTokenizer(reader);
stream = tokenFilterFactory("HunspellStem",
"dictionary", "test.dic",
"affix", "test.aff").create(stream);
stream = tokenFilterFactory("Hunspell2Stem",
"dictionary", "simple.dic",
"affix", "simple.aff").create(stream);
assertTokenStreamContents(stream, new String[] { "ab" });
}
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
try {
tokenFilterFactory("HunspellStem",
"dictionary", "test.dic",
tokenFilterFactory("Hunspell2Stem",
"dictionary", "simple.dic",
"bogusArg", "bogusValue");
fail();
} catch (IllegalArgumentException expected) {

View File

@ -1,4 +1,4 @@
package org.apache.lucene.analysis.hunspell2;
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -17,6 +17,8 @@ package org.apache.lucene.analysis.hunspell2;
* limitations under the License.
*/
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.Stemmer;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;

View File

@ -0,0 +1,10 @@
9
Ab/C
apach/A
Foo/D
foo/E
Lucen/A
Lucene
mahout/A
Moo/E
olr/B

View File

@ -1,20 +0,0 @@
SET UTF-8
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
SFX A Y 3
SFX A 0 e n
SFX A 0 e t
SFX A 0 e h
SFX C Y 2
SFX C 0 d/C c
SFX C 0 c b
SFX D Y 1
SFX D 0 s o
SFX E Y 1
SFX E 0 d o
PFX B Y 1
PFX B 0 s o
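For readers skimming the deleted test data: these lines follow standard hunspell .aff syntax. An annotated reading (comments are mine):
# SFX A Y 3     -> suffix rules for flag A; Y = may combine (cross-product)
#                  with prefix rules; 3 rule lines follow
# SFX A 0 e n   -> flag A, strip nothing ("0"), append "e",
#                  condition: stem ends in "n" (so "lucen" -> "lucene")
# SFX C 0 d/C c -> the appended form carries flag C again ("/C"),
#                  which is what drives the recursive abcd -> ab stemming tests
# PFX B 0 s o   -> prefix rule: prepend "s" to words starting with "o" (olr -> solr)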

View File

@ -1,10 +0,0 @@
9
lucen/A
lucene
mahout/A
olr/B
ab/C
Apach/A
Foo/E
foo/D
Moo/E

View File

@ -1,29 +0,0 @@
SET UTF-8
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
FLAG long
AF 5
AF AA
AF BB
AF CC
AF DD
AF EE
SFX AA Y 3
SFX AA 0 e n
SFX AA 0 e t
SFX AA 0 e h
SFX CC Y 2
SFX CC 0 d/3 c
SFX CC 0 c b
SFX DD Y 1
SFX DD 0 s o
SFX EE Y 1
SFX EE 0 d o
PFX BB Y 1
PFX BB 0 s o

View File

@ -1,9 +0,0 @@
6
lucen/1
lucene
mahout/1
olr/2
ab/3
Apach/1
foo/4
Foo/5

View File

@ -1,24 +0,0 @@
SET UTF-8
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
SFX A Y 3
SFX A 0 e n
SFX A 0 e t
SFX A 0 e h
SFX C Y 2
SFX C 0 d/C c
SFX C 0 c b
SFX D Y 1
SFX D 0 s o
SFX E Y 1
SFX E 0 d o
PFX B Y 1
PFX B 0 s o
#wrong rule (only 4 elements)
PFX A0 Y 1
PFX A0 0 a

View File

@ -1,50 +0,0 @@
package org.apache.lucene.analysis.hunspell2;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
/**
* Simple tests to ensure the Hunspell stemmer loads from factory
*/
public class TestHunspell2StemFilterFactory extends BaseTokenStreamFactoryTestCase {
public void testStemming() throws Exception {
Reader reader = new StringReader("abc");
TokenStream stream = whitespaceMockTokenizer(reader);
stream = tokenFilterFactory("Hunspell2Stem",
"dictionary", "simple.dic",
"affix", "simple.aff").create(stream);
assertTokenStreamContents(stream, new String[] { "ab" });
}
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
try {
tokenFilterFactory("Hunspell2Stem",
"dictionary", "simple.dic",
"bogusArg", "bogusValue");
fail();
} catch (IllegalArgumentException expected) {
assertTrue(expected.getMessage().contains("Unknown parameters"));
}
}
}