mirror of https://github.com/apache/lucene.git
LUCENE-5468: hunspell2 -> hunspell (with previous options and tests)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572718 13f79535-47bb-0310-9956-ffa450edef68
parent b2b86dd8ad
commit c4f4beb27e
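In short: this commit moves the FST-based hunspell2 implementation over the old org.apache.lucene.analysis.hunspell package, keeping the previous options (ignoreCase, multiple .dic files, recursionCap, longestOnly) and tests. A minimal usage sketch of the API as it stands after this commit follows; the wrapper class, file names, and the incoming `tokens` stream are placeholders for illustration, not part of the commit:

import java.io.FileInputStream;
import java.io.InputStream;
import java.util.Collections;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.HunspellStemFilter;

public class HunspellUsageSketch {
  public static TokenStream stemmed(TokenStream tokens) throws Exception {
    // File names are placeholders; any hunspell .aff/.dic pair works.
    // The Dictionary constructor reads but does not close the streams.
    try (InputStream affix = new FileInputStream("en_GB.aff");
         InputStream dic = new FileInputStream("en_GB.dic")) {
      // Old API: new HunspellDictionary(affix, dic, matchVersion, ignoreCase)
      // New API: no Lucene Version parameter; ignoreCase moves into the constructor.
      Dictionary dictionary = new Dictionary(affix, Collections.singletonList(dic), false);
      // dedup=true, recursionCap=2 (the documented default), longestOnly=false
      return new HunspellStemFilter(tokens, dictionary, true, 2, false);
    }
  }
}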
@@ -1,4 +1,4 @@
package org.apache.lucene.analysis.hunspell2;
package org.apache.lucene.analysis.hunspell;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,7 +17,6 @@ package org.apache.lucene.analysis.hunspell2;
 * limitations under the License.
 */

import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
@@ -28,14 +27,19 @@ import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.Version;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntSequenceOutputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

import java.io.*;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
@@ -71,27 +75,27 @@ public class Dictionary {
  private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";

  public FST<IntsRef> prefixes;
  public FST<IntsRef> suffixes;
  FST<IntsRef> prefixes;
  FST<IntsRef> suffixes;

  // all Patterns used by prefixes and suffixes. these are typically re-used across
  // many affix stripping rules. so these are deduplicated, to save RAM.
  // TODO: maybe don't use Pattern for the condition check...
  // TODO: when we cut over Affix to FST, just store integer index to this.
  public ArrayList<Pattern> patterns = new ArrayList<>();
  ArrayList<Pattern> patterns = new ArrayList<>();

  // the entries in the .dic file, mapping to their set of flags.
  // the fst output is the ordinal for flagLookup
  public FST<Long> words;
  FST<Long> words;
  // the list of unique flagsets (wordforms). theoretically huge, but practically
  // small (e.g. for polish this is 756), otherwise humans wouldn't be able to deal with it either.
  public BytesRefHash flagLookup = new BytesRefHash();
  BytesRefHash flagLookup = new BytesRefHash();

  // the list of unique strip affixes.
  public BytesRefHash stripLookup = new BytesRefHash();
  BytesRefHash stripLookup = new BytesRefHash();

  // 8 bytes per affix
  public byte[] affixData = new byte[64];
  byte[] affixData = new byte[64];
  private int currentAffix = 0;

  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
@@ -100,7 +104,11 @@ public class Dictionary {
  private int aliasCount = 0;

  private final File tempDir = OfflineSorter.defaultTempDir(); // TODO: make this configurable?

  public static final int IGNORE_CASE = 1;

  boolean ignoreCase;

  /**
   * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files.
@@ -112,6 +120,21 @@ public class Dictionary {
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public Dictionary(InputStream affix, InputStream dictionary) throws IOException, ParseException {
    this(affix, Collections.singletonList(dictionary), false);
  }

  /**
   * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files.
   * You have to close the provided InputStreams yourself.
   *
   * @param affix InputStream for reading the hunspell affix file (won't be closed).
   * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public Dictionary(InputStream affix, List<InputStream> dictionaries, boolean ignoreCase) throws IOException, ParseException {
    this.ignoreCase = ignoreCase;
    BufferedInputStream buffered = new BufferedInputStream(affix, 8192);
    buffered.mark(8192);
    String encoding = getDictionaryEncoding(affix);
@@ -122,7 +145,7 @@ public class Dictionary {
    stripLookup.add(new BytesRef()); // no strip -> ord 0
    PositiveIntOutputs o = PositiveIntOutputs.getSingleton();
    Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE4, o);
    readDictionaryFile(dictionary, decoder, b);
    readDictionaryFiles(dictionaries, decoder, b);
    words = b.finish();
  }

@@ -145,7 +168,7 @@ public class Dictionary {
    return decodeFlags(flagLookup.get(ord, scratch));
  }

  public Integer lookupOrd(char word[], int offset, int length) throws IOException {
  Integer lookupOrd(char word[], int offset, int length) throws IOException {
    final FST.BytesReader bytesReader = words.getBytesReader();
    final FST.Arc<Long> arc = words.getFirstArc(new FST.Arc<Long>());
    // Accumulate output as we go
@@ -269,7 +292,6 @@ public class Dictionary {
      Util.toUTF32(entry.getKey(), scratch);
      List<Character> entries = entry.getValue();
      IntsRef output = new IntsRef(entries.size());
      int upto = 0;
      for (Character c : entries) {
        output.ints[output.length++] = c;
      }
@@ -480,23 +502,39 @@ public class Dictionary {
  }

  /**
   * Reads the dictionary file through the provided InputStream, building up the words map
   * Reads the dictionary file through the provided InputStreams, building up the words map
   *
   * @param dictionary InputStream to read the dictionary file through
   * @param dictionaries InputStreams to read the dictionary file through
   * @param decoder CharsetDecoder used to decode the contents of the file
   * @throws IOException Can be thrown while reading from the file
   */
  private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder, Builder<Long> words) throws IOException {
  private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<Long> words) throws IOException {
    BytesRef flagsScratch = new BytesRef();
    IntsRef scratchInts = new IntsRef();

    BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
    String line = lines.readLine(); // first line is number of entries (approximately, sometimes)

    File unsorted = File.createTempFile("unsorted", "dat", tempDir);
    try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
      while ((line = lines.readLine()) != null) {
        writer.write(line.getBytes(IOUtils.CHARSET_UTF_8));
      for (InputStream dictionary : dictionaries) {
        BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
        String line = lines.readLine(); // first line is number of entries (approximately, sometimes)

        while ((line = lines.readLine()) != null) {
          if (ignoreCase) {
            int flagSep = line.lastIndexOf('/');
            if (flagSep == -1) {
              writer.write(line.toLowerCase(Locale.ROOT).getBytes(IOUtils.CHARSET_UTF_8));
            } else {
              StringBuilder sb = new StringBuilder();
              sb.append(line.substring(0, flagSep).toLowerCase(Locale.ROOT));
              if (flagSep < line.length()) {
                sb.append(line.substring(flagSep, line.length()));
              }
              writer.write(sb.toString().getBytes(IOUtils.CHARSET_UTF_8));
            }
          } else {
            writer.write(line.getBytes(IOUtils.CHARSET_UTF_8));
          }
        }
      }
    }
    File sorted = File.createTempFile("sorted", "dat", tempDir);
@@ -544,6 +582,7 @@ public class Dictionary {
    BytesRef currentEntry = new BytesRef();
    char currentFlags[] = new char[0];

    String line;
    while (reader.read(scratchLine)) {
      line = scratchLine.utf8ToString();
      String entry;
@@ -1,157 +0,0 @@
package org.apache.lucene.analysis.hunspell;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.regex.Pattern;

/**
 * Wrapper class representing a hunspell affix
 */
public class HunspellAffix {

  private String append; // the affix itself, what is appended
  private char appendFlags[]; // continuation class flags
  private String strip;

  private String condition;
  private Pattern conditionPattern;

  private char flag;

  private boolean crossProduct;

  /**
   * Checks whether the given text matches the conditional pattern on this affix
   *
   * @param text Text to check if it matches the affix's conditional pattern
   * @return {@code true} if the text meets the condition, {@code false} otherwise
   */
  public boolean checkCondition(CharSequence text) {
    return conditionPattern.matcher(text).matches();
  }

  /**
   * Returns the append defined for the affix
   *
   * @return Defined append
   */
  public String getAppend() {
    return append;
  }

  /**
   * Sets the append defined for the affix
   *
   * @param append Defined append for the affix
   */
  public void setAppend(String append) {
    this.append = append;
  }

  /**
   * Returns the flags defined for the affix append
   *
   * @return Flags defined for the affix append
   */
  public char[] getAppendFlags() {
    return appendFlags;
  }

  /**
   * Sets the flags defined for the affix append
   *
   * @param appendFlags Flags defined for the affix append
   */
  public void setAppendFlags(char[] appendFlags) {
    this.appendFlags = appendFlags;
  }

  /**
   * Returns the stripping characters defined for the affix
   *
   * @return Stripping characters defined for the affix
   */
  public String getStrip() {
    return strip;
  }

  /**
   * Sets the stripping characters defined for the affix
   *
   * @param strip Stripping characters defined for the affix
   */
  public void setStrip(String strip) {
    this.strip = strip;
  }

  /**
   * Returns the condition that must be met before the affix can be applied
   *
   * @return Condition that must be met before the affix can be applied
   */
  public String getCondition() {
    return condition;
  }

  /**
   * Sets the condition that must be met before the affix can be applied
   *
   * @param condition Condition to be met before affix application
   * @param pattern Condition as a regular expression pattern
   */
  public void setCondition(String condition, String pattern) {
    this.condition = condition;
    this.conditionPattern = Pattern.compile(pattern);
  }

  /**
   * Returns the affix flag
   *
   * @return Affix flag
   */
  public char getFlag() {
    return flag;
  }

  /**
   * Sets the affix flag
   *
   * @param flag Affix flag
   */
  public void setFlag(char flag) {
    this.flag = flag;
  }

  /**
   * Returns whether the affix is defined as cross product
   *
   * @return {@code true} if the affix is cross product, {@code false} otherwise
   */
  public boolean isCrossProduct() {
    return crossProduct;
  }

  /**
   * Sets whether the affix is defined as cross product
   *
   * @param crossProduct Whether the affix is defined as cross product
   */
  public void setCrossProduct(boolean crossProduct) {
    this.crossProduct = crossProduct;
  }
}
@@ -1,507 +0,0 @@
package org.apache.lucene.analysis.hunspell;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.util.Version;

import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;

/**
 * In-memory structure for the dictionary (.dic) and affix (.aff)
 * data of a hunspell dictionary.
 */
public class HunspellDictionary {

  static final HunspellWord NOFLAGS = new HunspellWord();

  private static final String ALIAS_KEY = "AF";
  private static final String PREFIX_KEY = "PFX";
  private static final String SUFFIX_KEY = "SFX";
  private static final String FLAG_KEY = "FLAG";

  private static final String NUM_FLAG_TYPE = "num";
  private static final String UTF8_FLAG_TYPE = "UTF-8";
  private static final String LONG_FLAG_TYPE = "long";

  private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";

  private static final boolean IGNORE_CASE_DEFAULT = false;
  private static final boolean STRICT_AFFIX_PARSING_DEFAULT = true;

  private CharArrayMap<List<HunspellWord>> words;
  private CharArrayMap<List<HunspellAffix>> prefixes;
  private CharArrayMap<List<HunspellAffix>> suffixes;

  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
  private boolean ignoreCase = IGNORE_CASE_DEFAULT;

  private final Version version;

  private String[] aliases;
  private int aliasCount = 0;

  /**
   * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files.
   * You have to close the provided InputStreams yourself.
   *
   * @param affix InputStream for reading the hunspell affix file (won't be closed).
   * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
   * @param version Lucene Version
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException {
    this(affix, Arrays.asList(dictionary), version, IGNORE_CASE_DEFAULT);
  }

  /**
   * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files.
   * You have to close the provided InputStreams yourself.
   *
   * @param affix InputStream for reading the hunspell affix file (won't be closed).
   * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
   * @param version Lucene Version
   * @param ignoreCase If true, dictionary matching will be case insensitive
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public HunspellDictionary(InputStream affix, InputStream dictionary, Version version, boolean ignoreCase) throws IOException, ParseException {
    this(affix, Arrays.asList(dictionary), version, ignoreCase);
  }

  /**
   * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files.
   * You have to close the provided InputStreams yourself.
   *
   * @param affix InputStream for reading the hunspell affix file (won't be closed).
   * @param dictionaries InputStreams for reading the hunspell dictionary file (won't be closed).
   * @param version Lucene Version
   * @param ignoreCase If true, dictionary matching will be case insensitive
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException {
    this(affix, dictionaries, version, ignoreCase, STRICT_AFFIX_PARSING_DEFAULT);
  }

  /**
   * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
   * and dictionary files.
   * You have to close the provided InputStreams yourself.
   *
   * @param affix InputStream for reading the hunspell affix file (won't be closed).
   * @param dictionaries InputStreams for reading the hunspell dictionary file (won't be closed).
   * @param version Lucene Version
   * @param ignoreCase If true, dictionary matching will be case insensitive
   * @param strictAffixParsing Affix strict parsing enabled or not (an error while reading a rule causes exception or is ignored)
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase, boolean strictAffixParsing) throws IOException, ParseException {
    this.version = version;
    this.ignoreCase = ignoreCase;
    String encoding = getDictionaryEncoding(affix);
    CharsetDecoder decoder = getJavaEncoding(encoding);
    readAffixFile(affix, decoder, strictAffixParsing);
    words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, this.ignoreCase);
    for (InputStream dictionary : dictionaries) {
      readDictionaryFile(dictionary, decoder);
    }
  }

  /**
   * Looks up HunspellWords that match the String created from the given char array, offset and length
   *
   * @param word Char array to generate the String from
   * @param offset Offset in the char array that the String starts at
   * @param length Length from the offset that the String is
   * @return List of HunspellWords that match the generated String, or {@code null} if none are found
   */
  public List<HunspellWord> lookupWord(char word[], int offset, int length) {
    return words.get(word, offset, length);
  }

  /**
   * Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length
   *
   * @param word Char array to generate the String from
   * @param offset Offset in the char array that the String starts at
   * @param length Length from the offset that the String is
   * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
   */
  public List<HunspellAffix> lookupPrefix(char word[], int offset, int length) {
    return prefixes.get(word, offset, length);
  }

  /**
   * Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length
   *
   * @param word Char array to generate the String from
   * @param offset Offset in the char array that the String starts at
   * @param length Length from the offset that the String is
   * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
   */
  public List<HunspellAffix> lookupSuffix(char word[], int offset, int length) {
    return suffixes.get(word, offset, length);
  }

  /**
   * Reads the affix file through the provided InputStream, building up the prefix and suffix maps
   *
   * @param affixStream InputStream to read the content of the affix file from
   * @param decoder CharsetDecoder to decode the content of the file
   * @throws IOException Can be thrown while reading from the InputStream
   */
  private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, boolean strict) throws IOException, ParseException {
    prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
    suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);

    LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
    String line = null;
    while ((line = reader.readLine()) != null) {
      if (line.startsWith(ALIAS_KEY)) {
        parseAlias(line);
      } else if (line.startsWith(PREFIX_KEY)) {
        parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, strict);
      } else if (line.startsWith(SUFFIX_KEY)) {
        parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, strict);
      } else if (line.startsWith(FLAG_KEY)) {
        // Assume that the FLAG line comes before any prefix or suffixes
        // Store the strategy so it can be used when parsing the dic file
        flagParsingStrategy = getFlagParsingStrategy(line);
      }
    }
  }

  /**
   * Parses a specific affix rule putting the result into the provided affix map
   *
   * @param affixes Map where the result of the parsing will be put
   * @param header Header line of the affix rule
   * @param reader BufferedReader to read the content of the rule from
   * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
   *                         pattern
   * @throws IOException Can be thrown while reading the rule
   */
  private void parseAffix(CharArrayMap<List<HunspellAffix>> affixes,
                          String header,
                          LineNumberReader reader,
                          String conditionPattern,
                          boolean strict) throws IOException, ParseException {
    String args[] = header.split("\\s+");

    boolean crossProduct = args[2].equals("Y");

    int numLines = Integer.parseInt(args[3]);
    for (int i = 0; i < numLines; i++) {
      String line = reader.readLine();
      String ruleArgs[] = line.split("\\s+");

      if (ruleArgs.length < 5) {
        if (strict) {
          throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber());
        }
        continue;
      }

      HunspellAffix affix = new HunspellAffix();

      affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1]));
      affix.setStrip(ruleArgs[2].equals("0") ? "" : ruleArgs[2]);

      String affixArg = ruleArgs[3];

      int flagSep = affixArg.lastIndexOf('/');
      if (flagSep != -1) {
        String flagPart = affixArg.substring(flagSep + 1);

        if (aliasCount > 0) {
          flagPart = getAliasValue(Integer.parseInt(flagPart));
        }

        char appendFlags[] = flagParsingStrategy.parseFlags(flagPart);
        Arrays.sort(appendFlags);
        affix.setAppendFlags(appendFlags);
        affix.setAppend(affixArg.substring(0, flagSep));
      } else {
        affix.setAppend(affixArg);
      }

      String condition = ruleArgs[4];
      affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition));
      affix.setCrossProduct(crossProduct);

      List<HunspellAffix> list = affixes.get(affix.getAppend());
      if (list == null) {
        list = new ArrayList<HunspellAffix>();
        affixes.put(affix.getAppend(), list);
      }

      list.add(affix);
    }
  }

  /**
   * Parses the encoding specified in the affix file readable through the provided InputStream
   *
   * @param affix InputStream for reading the affix file
   * @return Encoding specified in the affix file
   * @throws IOException Can be thrown while reading from the InputStream
   * @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET <encoding>}
   */
  private String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
    final StringBuilder encoding = new StringBuilder();
    for (;;) {
      encoding.setLength(0);
      int ch;
      while ((ch = affix.read()) >= 0) {
        if (ch == '\n') {
          break;
        }
        if (ch != '\r') {
          encoding.append((char)ch);
        }
      }
      if (
          encoding.length() == 0 || encoding.charAt(0) == '#' ||
          // this test only at the end as ineffective but would allow lines only containing spaces:
          encoding.toString().trim().length() == 0
      ) {
        if (ch < 0) {
          throw new ParseException("Unexpected end of affix file.", 0);
        }
        continue;
      }
      if ("SET ".equals(encoding.substring(0, 4))) {
        // cleanup the encoding string, too (whitespace)
        return encoding.substring(4).trim();
      }
      throw new ParseException("The first non-comment line in the affix file must "+
          "be a 'SET charset', was: '" + encoding +"'", 0);
    }
  }

  /**
   * Retrieves the CharsetDecoder for the given encoding. Note, This isn't perfect as I think ISCII-DEVANAGARI and
   * MICROSOFT-CP1251 etc are allowed...
   *
   * @param encoding Encoding to retrieve the CharsetDecoder for
   * @return CharSetDecoder for the given encoding
   */
  private CharsetDecoder getJavaEncoding(String encoding) {
    Charset charset = Charset.forName(encoding);
    return charset.newDecoder();
  }

  /**
   * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definition line taken from the affix file
   *
   * @param flagLine Line containing the flag information
   * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition
   */
  private FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
    String flagType = flagLine.substring(5);

    if (NUM_FLAG_TYPE.equals(flagType)) {
      return new NumFlagParsingStrategy();
    } else if (UTF8_FLAG_TYPE.equals(flagType)) {
      return new SimpleFlagParsingStrategy();
    } else if (LONG_FLAG_TYPE.equals(flagType)) {
      return new DoubleASCIIFlagParsingStrategy();
    }

    throw new IllegalArgumentException("Unknown flag type: " + flagType);
  }

  /**
   * Reads the dictionary file through the provided InputStream, building up the words map
   *
   * @param dictionary InputStream to read the dictionary file through
   * @param decoder CharsetDecoder used to decode the contents of the file
   * @throws IOException Can be thrown while reading from the file
   */
  private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder) throws IOException {
    BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder));
    // TODO: don't create millions of strings.
    String line = reader.readLine(); // first line is number of entries
    int numEntries = Integer.parseInt(line);

    // TODO: the flags themselves can be double-chars (long) or also numeric
    // either way the trick is to encode them as char... but they must be parsed differently
    while ((line = reader.readLine()) != null) {
      String entry;
      HunspellWord wordForm;

      int flagSep = line.lastIndexOf('/');
      if (flagSep == -1) {
        wordForm = NOFLAGS;
        entry = line;
      } else {
        // note, there can be comments (morph description) after a flag.
        // we should really look for any whitespace
        int end = line.indexOf('\t', flagSep);
        if (end == -1)
          end = line.length();

        String flagPart = line.substring(flagSep + 1, end);
        if (aliasCount > 0) {
          flagPart = getAliasValue(Integer.parseInt(flagPart));
        }

        wordForm = new HunspellWord(flagParsingStrategy.parseFlags(flagPart));
        Arrays.sort(wordForm.getFlags());
        entry = line.substring(0, flagSep);
      }
      if(ignoreCase) {
        entry = entry.toLowerCase(Locale.ROOT);
      }

      List<HunspellWord> entries = new ArrayList<HunspellWord>();
      entries.add(wordForm);
      words.put(entry, entries);
    }
  }

  public Version getVersion() {
    return version;
  }

  private void parseAlias(String line) {
    String ruleArgs[] = line.split("\\s+");
    if (aliases == null) {
      //first line should be the aliases count
      final int count = Integer.parseInt(ruleArgs[1]);
      aliases = new String[count];
    } else {
      aliases[aliasCount++] = ruleArgs[1];
    }
  }

  private String getAliasValue(int id) {
    try {
      return aliases[id - 1];
    } catch (IndexOutOfBoundsException ex) {
      throw new IllegalArgumentException("Bad flag alias number:" + id, ex);
    }
  }

  /**
   * Abstraction of the process of parsing flags taken from the affix and dic files
   */
  private static abstract class FlagParsingStrategy {

    /**
     * Parses the given String into a single flag
     *
     * @param rawFlag String to parse into a flag
     * @return Parsed flag
     */
    char parseFlag(String rawFlag) {
      return parseFlags(rawFlag)[0];
    }

    /**
     * Parses the given String into multiple flags
     *
     * @param rawFlags String to parse into flags
     * @return Parsed flags
     */
    abstract char[] parseFlags(String rawFlags);
  }

  /**
   * Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as a individual flags.
   * Can be used with both the ASCII and UTF-8 flag types.
   */
  private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {
    /**
     * {@inheritDoc}
     */
    @Override
    public char[] parseFlags(String rawFlags) {
      return rawFlags.toCharArray();
    }
  }

  /**
   * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form. In the case
   * of multiple flags, each number is separated by a comma.
   */
  private static class NumFlagParsingStrategy extends FlagParsingStrategy {
    /**
     * {@inheritDoc}
     */
    @Override
    public char[] parseFlags(String rawFlags) {
      String[] rawFlagParts = rawFlags.trim().split(",");
      char[] flags = new char[rawFlagParts.length];

      for (int i = 0; i < rawFlagParts.length; i++) {
        // note, removing the trailing X/leading I for nepali... what is the rule here?!
        flags[i] = (char) Integer.parseInt(rawFlagParts[i].replaceAll("[^0-9]", ""));
      }

      return flags;
    }
  }

  /**
   * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes
   * must be combined into a single character.
   *
   * TODO (rmuir) test
   */
  private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy {

    /**
     * {@inheritDoc}
     */
    @Override
    public char[] parseFlags(String rawFlags) {
      if (rawFlags.length() == 0) {
        return new char[0];
      }

      StringBuilder builder = new StringBuilder();
      for (int i = 0; i < rawFlags.length(); i+=2) {
        char cookedFlag = (char) ((int) rawFlags.charAt(i) + (int) rawFlags.charAt(i + 1));
        builder.append(cookedFlag);
      }

      char flags[] = new char[builder.length()];
      builder.getChars(0, builder.length(), flags, 0);
      return flags;
    }
  }

  public boolean isIgnoreCase() {
    return ignoreCase;
  }
}
@@ -18,14 +18,16 @@ package org.apache.lucene.analysis.hunspell;
 */

import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.hunspell.HunspellStemmer.Stem;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.CharsRef;

/**
 * TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple
@@ -41,71 +43,83 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 * {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory}
 * </p>
 *
 *
 * @lucene.experimental
 */
public final class HunspellStemFilter extends TokenFilter {

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
  private final HunspellStemmer stemmer;
  private final Stemmer stemmer;

  private List<Stem> buffer;
  private List<CharsRef> buffer;
  private State savedState;

  private final boolean dedup;
  private final boolean longestOnly;

  /** Create a {@link HunspellStemFilter} which deduplicates stems and has a maximum
   *  recursion level of 2.
   *  @see #HunspellStemFilter(TokenStream, HunspellDictionary, int) */
  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary) {
   *  @see #HunspellStemFilter(TokenStream, Dictionary, int) */
  public HunspellStemFilter(TokenStream input, Dictionary dictionary) {
    this(input, dictionary, 2);
  }

  /**
   * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
   * HunspellDictionary
   * Creates a new Hunspell2StemFilter that will stem tokens from the given TokenStream using affix rules in the provided
   * Dictionary
   *
   * @param input TokenStream whose tokens will be stemmed
   * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
   * @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
   */
  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, int recursionCap) {
  public HunspellStemFilter(TokenStream input, Dictionary dictionary, int recursionCap) {
    this(input, dictionary, true, recursionCap);
  }

  /** Create a {@link HunspellStemFilter} which has a maximum recursion level of 2.
   *  @see #HunspellStemFilter(TokenStream, HunspellDictionary, boolean, int) */
  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup) {
   *  @see #HunspellStemFilter(TokenStream, Dictionary, boolean, int) */
  public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup) {
    this(input, dictionary, dedup, 2);
  }

  /**
   * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
   * HunspellDictionary
   * Dictionary
   *
   * @param input TokenStream whose tokens will be stemmed
   * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
   * @param dedup true if only unique terms should be output.
   * @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
   */
  public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup, int recursionCap) {
    super(input);
    this.dedup = dedup;
    this.stemmer = new HunspellStemmer(dictionary, recursionCap);
  public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap) {
    this(input, dictionary, dedup, recursionCap, false);
  }

  /**
   * {@inheritDoc}
   * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
   * Dictionary
   *
   * @param input TokenStream whose tokens will be stemmed
   * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
   * @param dedup true if only unique terms should be output.
   * @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
   * @param longestOnly true if only the longest term should be output.
   */
  public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap, boolean longestOnly) {
    super(input);
    this.dedup = dedup && longestOnly == false; // don't waste time deduping if longestOnly is set
    this.stemmer = new Stemmer(dictionary, recursionCap);
    this.longestOnly = longestOnly;
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (buffer != null && !buffer.isEmpty()) {
      Stem nextStem = buffer.remove(0);
      CharsRef nextStem = buffer.remove(0);
      restoreState(savedState);
      posIncAtt.setPositionIncrement(0);
      termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength());
      termAtt.setLength(nextStem.getStemLength());
      termAtt.setEmpty().append(nextStem);
      return true;
    }

@@ -122,24 +136,41 @@ public final class HunspellStemFilter extends TokenFilter {
    if (buffer.isEmpty()) { // we do not know this word, return it unchanged
      return true;
    }

    if (longestOnly && buffer.size() > 1) {
      Collections.sort(buffer, lengthComparator);
    }

    Stem stem = buffer.remove(0);
    termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength());
    termAtt.setLength(stem.getStemLength());
    CharsRef stem = buffer.remove(0);
    termAtt.setEmpty().append(stem);

    if (!buffer.isEmpty()) {
      savedState = captureState();
    if (longestOnly) {
      buffer.clear();
    } else {
      if (!buffer.isEmpty()) {
        savedState = captureState();
      }
    }

    return true;
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    buffer = null;
  }

  static final Comparator<CharsRef> lengthComparator = new Comparator<CharsRef>() {
    @Override
    public int compare(CharsRef o1, CharsRef o2) {
      int cmp = Integer.compare(o2.length, o1.length);
      if (cmp == 0) {
        // tie break on text
        return o2.compareTo(o1);
      } else {
        return cmp;
      }
    }
  };
}
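For reference, a small sketch of the ordering the new longestOnly option relies on (mirroring lengthComparator above): longer stems sort first, with a reversed tie-break on the text, so buffer.remove(0) takes the longest candidate and the rest are cleared. The demo class and the same-package access to the package-private lengthComparator are assumptions for illustration:

package org.apache.lucene.analysis.hunspell;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.lucene.util.CharsRef;

public class LengthComparatorDemo {
  public static void main(String[] args) {
    List<CharsRef> stems = new ArrayList<CharsRef>();
    stems.add(new CharsRef("walk"));
    stems.add(new CharsRef("walking"));
    // Sort exactly as incrementToken() does when longestOnly is set.
    Collections.sort(stems, HunspellStemFilter.lengthComparator);
    System.out.println(stems.get(0)); // prints "walking": the longest stem wins
  }
}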
@@ -31,89 +31,75 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.util.IOUtils;

/**
 * TokenFilterFactory that creates instances of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}.
 * Example config for British English including a custom dictionary, case insensitive matching:
 * TokenFilterFactory that creates instances of {@link HunspellStemFilter}.
 * Example config for British English:
 * <pre class="prettyprint">
 * <filter class="solr.HunspellStemFilterFactory"
 *    dictionary="en_GB.dic,my_custom.dic"
 *    affix="en_GB.aff"
 *    ignoreCase="true" /></pre>
 *    dictionary="en_GB.dic,my_custom.dic"
 *    affix="en_GB.aff"
 *    ignoreCase="false"
 *    longestOnly="false" /></pre>
 * Both parameters dictionary and affix are mandatory.
 * <br/>
 * The parameter ignoreCase (true/false) controls whether matching is case sensitive or not. Default false.
 * <br/>
 * The parameter strictAffixParsing (true/false) controls whether the affix parsing is strict or not. Default true.
 * If strict an error while reading an affix rule causes a ParseException, otherwise is ignored.
 * <br/>
 * Dictionaries for many languages are available through the OpenOffice project.
 *
 * See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
 * @lucene.experimental
 */
public class HunspellStemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
  private static final String PARAM_DICTIONARY = "dictionary";
  private static final String PARAM_AFFIX = "affix";
  private static final String PARAM_IGNORE_CASE = "ignoreCase";
  private static final String PARAM_STRICT_AFFIX_PARSING = "strictAffixParsing";
  private static final String PARAM_DICTIONARY = "dictionary";
  private static final String PARAM_AFFIX = "affix";
  private static final String PARAM_RECURSION_CAP = "recursionCap";
  private static final String PARAM_IGNORE_CASE = "ignoreCase";
  private static final String PARAM_LONGEST_ONLY = "longestOnly";

  private final String dictionaryArg;
  private final String dictionaryFiles;
  private final String affixFile;
  private final boolean ignoreCase;
  private final boolean strictAffixParsing;
  private HunspellDictionary dictionary;
  private final boolean longestOnly;
  private Dictionary dictionary;
  private int recursionCap;

  /** Creates a new HunspellStemFilterFactory */
  public HunspellStemFilterFactory(Map<String,String> args) {
    super(args);
    assureMatchVersion();
    dictionaryArg = require(args, PARAM_DICTIONARY);
    dictionaryFiles = require(args, PARAM_DICTIONARY);
    affixFile = get(args, PARAM_AFFIX);
    ignoreCase = getBoolean(args, PARAM_IGNORE_CASE, false);
    strictAffixParsing = getBoolean(args, PARAM_STRICT_AFFIX_PARSING, true);
    recursionCap = getInt(args, PARAM_RECURSION_CAP, 2);
    longestOnly = getBoolean(args, PARAM_LONGEST_ONLY, false);
    // this isnt necessary: we properly load all dictionaries.
    // but recognize and ignore for back compat
    getBoolean(args, "strictAffixParsing", true);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  /**
   * Loads the hunspell dictionary and affix files defined in the configuration
   *
   * @param loader ResourceLoader used to load the files
   */
  @Override
  public void inform(ResourceLoader loader) throws IOException {
    String dictionaryFiles[] = dictionaryArg.split(",");
    String dicts[] = dictionaryFiles.split(",");

    InputStream affix = null;
    List<InputStream> dictionaries = new ArrayList<InputStream>();

    try {
      dictionaries = new ArrayList<InputStream>();
      for (String file : dictionaryFiles) {
      for (String file : dicts) {
        dictionaries.add(loader.openResource(file));
      }
      affix = loader.openResource(affixFile);

      this.dictionary = new HunspellDictionary(affix, dictionaries, luceneMatchVersion, ignoreCase, strictAffixParsing);
      this.dictionary = new Dictionary(affix, dictionaries, ignoreCase);
    } catch (ParseException e) {
      throw new IOException("Unable to load hunspell data! [dictionary=" + dictionaryArg + ",affix=" + affixFile + "]", e);
      throw new IOException("Unable to load hunspell data! [dictionary=" + dictionaries + ",affix=" + affixFile + "]", e);
    } finally {
      IOUtils.closeWhileHandlingException(affix);
      IOUtils.closeWhileHandlingException(dictionaries);
    }
  }

  /**
   * Creates an instance of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter} that will filter the given
   * TokenStream
   *
   * @param tokenStream TokenStream that will be filtered
   * @return HunspellStemFilter that filters the TokenStream
   */
  @Override
  public TokenStream create(TokenStream tokenStream) {
    return new HunspellStemFilter(tokenStream, dictionary, recursionCap);
    return new HunspellStemFilter(tokenStream, dictionary, true, recursionCap, longestOnly);
  }
}
@ -1,392 +0,0 @@
|
|||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.CharacterUtils;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
/**
|
||||
* HunspellStemmer uses the affix rules declared in the HunspellDictionary to generate one or more stems for a word. It
|
||||
* conforms to the algorithm in the original hunspell algorithm, including recursive suffix stripping.
|
||||
*/
|
||||
public class HunspellStemmer {
|
||||
private final int recursionCap;
|
||||
private final HunspellDictionary dictionary;
|
||||
private final StringBuilder segment = new StringBuilder();
|
||||
private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_CURRENT);
|
||||
|
||||
/**
|
||||
* Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems. Uses the
|
||||
* default recursion cap of <code>2</code> (based on Hunspell documentation).
|
||||
*
|
||||
* @param dictionary HunspellDictionary that will be used to create the stems
|
||||
*/
|
||||
public HunspellStemmer(HunspellDictionary dictionary) {
|
||||
this(dictionary, 2);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
|
||||
*
|
||||
* @param dictionary HunspellDictionary that will be used to create the stems
|
||||
* @param recursionCap maximum level of recursion stemmer can go into
|
||||
*/
|
||||
public HunspellStemmer(HunspellDictionary dictionary, int recursionCap) {
|
||||
this.dictionary = dictionary;
|
||||
this.recursionCap = recursionCap;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find the stem(s) of the provided word
|
||||
*
|
||||
* @param word Word to find the stems for
|
||||
* @return List of stems for the word
|
||||
*/
|
||||
public List<Stem> stem(String word) {
|
||||
return stem(word.toCharArray(), word.length());
|
||||
}
|
||||
|
||||
/**
|
||||
* Find the stem(s) of the provided word
|
||||
*
|
||||
* @param word Word to find the stems for
|
||||
* @return List of stems for the word
|
||||
*/
|
||||
public List<Stem> stem(char word[], int length) {
|
||||
List<Stem> stems = new ArrayList<Stem>();
|
||||
if (dictionary.lookupWord(word, 0, length) != null) {
|
||||
stems.add(new Stem(word, length));
|
||||
}
|
||||
stems.addAll(stem(word, length, null, 0));
|
||||
return stems;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find the unique stem(s) of the provided word
|
||||
*
|
||||
* @param word Word to find the stems for
|
||||
* @return List of stems for the word
|
||||
*/
|
||||
public List<Stem> uniqueStems(char word[], int length) {
|
||||
List<Stem> stems = new ArrayList<Stem>();
|
||||
CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, dictionary.isIgnoreCase());
|
||||
if (dictionary.lookupWord(word, 0, length) != null) {
|
||||
stems.add(new Stem(word, length));
|
||||
terms.add(word);
|
||||
}
|
||||
List<Stem> otherStems = stem(word, length, null, 0);
|
||||
for (Stem s : otherStems) {
|
||||
if (!terms.contains(s.stem)) {
|
||||
stems.add(s);
|
||||
terms.add(s.stem);
|
||||
}
|
||||
}
|
||||
return stems;
|
||||
}
|
||||
|
||||
// ================================================= Helper Methods ================================================
|
||||
|
||||
/**
|
||||
* Generates a list of stems for the provided word
|
||||
*
|
||||
* @param word Word to generate the stems for
|
||||
* @param flags Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step
|
||||
* @param recursionDepth Level of recursion this stemming step is at
|
||||
* @return List of stems, pr an empty if no stems are found
|
||||
*/
|
||||
private List<Stem> stem(char word[], int length, char[] flags, int recursionDepth) {
|
||||
List<Stem> stems = new ArrayList<Stem>();
|
||||
|
||||
for (int i = 0; i < length; i++) {
|
||||
List<HunspellAffix> suffixes = dictionary.lookupSuffix(word, i, length - i);
|
||||
if (suffixes == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (HunspellAffix suffix : suffixes) {
|
||||
if (hasCrossCheckedFlag(suffix.getFlag(), flags)) {
|
||||
int deAffixedLength = length - suffix.getAppend().length();
|
||||
// TODO: can we do this in-place?
|
||||
String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString();
|
||||
|
||||
List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
|
||||
for (Stem stem : stemList) {
|
||||
stem.addSuffix(suffix);
|
||||
}
|
||||
|
||||
stems.addAll(stemList);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = length - 1; i >= 0; i--) {
|
||||
List<HunspellAffix> prefixes = dictionary.lookupPrefix(word, 0, i);
|
||||
if (prefixes == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (HunspellAffix prefix : prefixes) {
|
||||
if (hasCrossCheckedFlag(prefix.getFlag(), flags)) {
|
||||
int deAffixedStart = prefix.getAppend().length();
|
||||
int deAffixedLength = length - deAffixedStart;
|
||||
|
||||
String strippedWord = new StringBuilder().append(prefix.getStrip())
|
||||
.append(word, deAffixedStart, deAffixedLength)
|
||||
.toString();
|
||||
|
||||
List<Stem> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
|
||||
for (Stem stem : stemList) {
|
||||
stem.addPrefix(prefix);
|
||||
}
|
||||
|
||||
stems.addAll(stemList);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return stems;
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies the affix rule to the given word, producing a list of stems if any are found
|
||||
*
|
||||
* @param strippedWord Word the affix has been removed and the strip added
|
||||
* @param affix HunspellAffix representing the affix rule itself
|
||||
* @param recursionDepth Level of recursion this stemming step is at
|
||||
* @return List of stems for the word, or an empty list if none are found
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
public List<Stem> applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) {
|
||||
if(dictionary.isIgnoreCase()) {
|
||||
charUtils.toLowerCase(strippedWord, 0, strippedWord.length);
|
||||
}
|
||||
segment.setLength(0);
|
||||
segment.append(strippedWord, 0, length);
|
||||
if (!affix.checkCondition(segment)) {
|
||||
return Collections.EMPTY_LIST;
|
||||
}
|
||||
|
||||
List<Stem> stems = new ArrayList<Stem>();
|
||||
|
||||
List<HunspellWord> words = dictionary.lookupWord(strippedWord, 0, length);
|
||||
if (words != null) {
|
||||
for (HunspellWord hunspellWord : words) {
|
||||
if (hunspellWord.hasFlag(affix.getFlag())) {
|
||||
stems.add(new Stem(strippedWord, length));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (affix.isCrossProduct() && recursionDepth < recursionCap) {
|
||||
stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), ++recursionDepth));
|
||||
}
|
||||
|
||||
return stems;
|
||||
}
|
||||
|
||||
  /**
   * Checks if the given flag cross checks with the given array of flags
   *
   * @param flag Flag to cross check with the array of flags
   * @param flags Array of flags to cross check against. Can be {@code null}
   * @return {@code true} if the flag is found in the array or the array is {@code null}, {@code false} otherwise
   */
  private boolean hasCrossCheckedFlag(char flag, char[] flags) {
    return flags == null || Arrays.binarySearch(flags, flag) >= 0;
  }

  /**
   * Stem represents all information known about a stem of a word. This includes the stem, and the prefixes and suffixes
   * that were used to change the word into the stem.
   */
  public static class Stem {

    private final List<HunspellAffix> prefixes = new ArrayList<HunspellAffix>();
    private final List<HunspellAffix> suffixes = new ArrayList<HunspellAffix>();
    private final char stem[];
    private final int stemLength;

    /**
     * Creates a new Stem wrapping the given word stem
     *
     * @param stem Stem of a word
     */
    public Stem(char stem[], int stemLength) {
      this.stem = stem;
      this.stemLength = stemLength;
    }

    /**
     * Adds a prefix to the list of prefixes used to generate this stem. Because it is assumed that prefixes are added
     * depth first, the prefix is added to the front of the list
     *
     * @param prefix Prefix to add to the list of prefixes for this stem
     */
    public void addPrefix(HunspellAffix prefix) {
      prefixes.add(0, prefix);
    }

    /**
     * Adds a suffix to the list of suffixes used to generate this stem. Because it is assumed that suffixes are added
     * depth first, the suffix is added to the end of the list
     *
     * @param suffix Suffix to add to the list of suffixes for this stem
     */
    public void addSuffix(HunspellAffix suffix) {
      suffixes.add(suffix);
    }

    /**
     * Returns the list of prefixes used to generate the stem
     *
     * @return List of prefixes used to generate the stem or an empty list if no prefixes were required
     */
    public List<HunspellAffix> getPrefixes() {
      return prefixes;
    }

    /**
     * Returns the list of suffixes used to generate the stem
     *
     * @return List of suffixes used to generate the stem or an empty list if no suffixes were required
     */
    public List<HunspellAffix> getSuffixes() {
      return suffixes;
    }

    /**
     * Returns the actual word stem itself
     *
     * @return Word stem itself
     */
    public char[] getStem() {
      return stem;
    }

    /**
     * @return the stemLength
     */
    public int getStemLength() {
      return stemLength;
    }

    public String getStemString() {
      return new String(stem, 0, stemLength);
    }

  }


  // ================================================= Entry Point ===================================================

  /*
   * HunspellStemmer entry point. Accepts two arguments: location of affix file and location of dic file
   *
   * @param args Program arguments. Should contain location of affix file and location of dic file
   * @throws IOException Can be thrown while reading from the files
   * @throws ParseException Can be thrown while parsing the files
  public static void main(String[] args) throws IOException, ParseException {
    boolean ignoreCase = false;
    int offset = 0;

    if (args.length < 2) {
      System.out.println("usage: HunspellStemmer [-i] <affix location> <dic location>");
      System.exit(1);
    }

    if (args[offset].equals("-i")) {
      ignoreCase = true;
      System.out.println("Ignoring case. All stems will be returned lowercased");
      offset++;
    }

    InputStream affixInputStream = new FileInputStream(args[offset++]);
    InputStream dicInputStream = new FileInputStream(args[offset++]);

    // :Post-Release-Update-Version.LUCENE_XY:
    HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_50, ignoreCase);

    affixInputStream.close();
    dicInputStream.close();

    HunspellStemmer stemmer = new HunspellStemmer(dictionary);

    Scanner scanner = new Scanner(System.in, Charset.defaultCharset().name());

    System.out.print("> ");
    while (scanner.hasNextLine()) {
      String word = scanner.nextLine();

      if ("exit".equals(word)) {
        break;
      }

      printStemResults(word, stemmer.stem(word.toCharArray(), word.length()));

      System.out.print("> ");
    }
  }

   * Prints the results of the stemming of a word
   *
   * @param originalWord Word that has been stemmed
   * @param stems Stems of the word
  private static void printStemResults(String originalWord, List<Stem> stems) {
    StringBuilder builder = new StringBuilder().append("stem(").append(originalWord).append(")").append("\n");

    for (Stem stem : stems) {
      builder.append("- ").append(stem.getStem()).append(": ");

      for (HunspellAffix prefix : stem.getPrefixes()) {
        builder.append(prefix.getAppend()).append("+");

        if (hasText(prefix.getStrip())) {
          builder.append(prefix.getStrip()).append("-");
        }
      }

      builder.append(stem.getStem());

      for (HunspellAffix suffix : stem.getSuffixes()) {
        if (hasText(suffix.getStrip())) {
          builder.append("-").append(suffix.getStrip());
        }

        builder.append("+").append(suffix.getAppend());
      }
      builder.append("\n");
    }

    System.out.println(builder);
  }

   * Simple utility to check if the given String has any text
   *
   * @param str String to check if it has any text
   * @return {@code true} if the String has text, {@code false} otherwise
  private static boolean hasText(String str) {
    return str != null && str.length() > 0;
  }
  */
}
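
The suffix branch of stem() above is easiest to trace with a concrete rule. A minimal standalone sketch, using the rule "SFX A 0 e n" and the entry "lucen/A" from the test .aff/.dic files further down in this diff (the class and variable names here are illustrative only, not part of the commit):

public class SuffixStripSketch {
  public static void main(String[] args) {
    String word = "lucene";
    String append = "e"; // what the rule appends to the stem ("SFX A 0 e n")
    String strip = "";   // what it strips first ("0" means nothing)
    if (word.endsWith(append)) {
      int deAffixedLength = word.length() - append.length();
      String candidate = word.substring(0, deAffixedLength) + strip; // "lucen"
      // "lucen" ends in 'n', so the rule's condition holds; the dictionary
      // lookup then finds "lucen" carrying flag A and accepts it as a stem
      System.out.println(candidate);
    }
  }
}
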
@@ -1,63 +0,0 @@
package org.apache.lucene.analysis.hunspell;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Arrays;

/**
 * A dictionary (.dic) entry with its associated flags.
 */
public class HunspellWord {

  private final char flags[]; // sorted, can we represent more concisely?

  /**
   * Creates a new HunspellWord with no associated flags
   */
  public HunspellWord() {
    flags = null;
  }

  /**
   * Constructs a new HunspellWord with the given flags
   *
   * @param flags Flags to associate with the word
   */
  public HunspellWord(char[] flags) {
    this.flags = flags;
  }

  /**
   * Checks whether the word has the given flag associated with it
   *
   * @param flag Flag to check whether it is associated with the word
   * @return {@code true} if the flag is associated, {@code false} otherwise
   */
  public boolean hasFlag(char flag) {
    return flags != null && Arrays.binarySearch(flags, flag) >= 0;
  }

  /**
   * Returns the flags associated with the word
   *
   * @return Flags associated with the word
   */
  public char[] getFlags() {
    return flags;
  }
}

@@ -1,4 +1,4 @@
package org.apache.lucene.analysis.hunspell2;
package org.apache.lucene.analysis.hunspell;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more

@@ -1,4 +1,4 @@
package org.apache.lucene.analysis.hunspell2;
package org.apache.lucene.analysis.hunspell;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more

@@ -24,6 +24,7 @@ import java.util.List;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;

@@ -37,9 +38,10 @@ import org.apache.lucene.util.Version;
final class Stemmer {
  private final int recursionCap;
  private final Dictionary dictionary;
  private BytesRef scratch = new BytesRef();
  private final BytesRef scratch = new BytesRef();
  private final StringBuilder segment = new StringBuilder();
  private final ByteArrayDataInput affixReader;
  private final CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_CURRENT);

  /**
   * Constructs a new Stemmer which will use the provided Dictionary to create its stems. Uses the

@@ -80,6 +82,9 @@ final class Stemmer {
   * @return List of stems for the word
   */
  public List<CharsRef> stem(char word[], int length) {
    if (dictionary.ignoreCase) {
      charUtils.toLowerCase(word, 0, length);
    }
    List<CharsRef> stems = new ArrayList<CharsRef>();
    if (dictionary.lookupWord(word, 0, length, scratch) != null) {
      stems.add(new CharsRef(word, 0, length));

@@ -95,20 +100,19 @@ final class Stemmer {
   * @return List of stems for the word
   */
  public List<CharsRef> uniqueStems(char word[], int length) {
    List<CharsRef> stems = new ArrayList<CharsRef>();
    CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, false);
    if (dictionary.lookupWord(word, 0, length, scratch) != null) {
      stems.add(new CharsRef(word, 0, length));
      terms.add(word);
    List<CharsRef> stems = stem(word, length);
    if (stems.size() < 2) {
      return stems;
    }
    List<CharsRef> otherStems = stem(word, length, Dictionary.NOFLAGS, 0);
    for (CharsRef s : otherStems) {
    CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, dictionary.ignoreCase);
    List<CharsRef> deduped = new ArrayList<>();
    for (CharsRef s : stems) {
      if (!terms.contains(s)) {
        stems.add(s);
        deduped.add(s);
        terms.add(s);
      }
    }
    return stems;
    return deduped;
  }

  // ================================================= Helper Methods ================================================

@@ -188,7 +192,7 @@ final class Stemmer {
   * @param recursionDepth Level of recursion this stemming step is at
   * @return List of stems for the word, or an empty list if none are found
   */
  public List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int recursionDepth) {
  List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int recursionDepth) {
    segment.setLength(0);
    segment.append(strippedWord, 0, length);

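
The uniqueStems() rewrite above now gets all stems from stem() and deduplicates afterwards, honoring the dictionary's ignoreCase setting. A standalone sketch of just that dedup step, with java.util.HashSet standing in for Lucene's CharArraySet (names are illustrative, not part of the commit):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;

public class DedupSketch {
  // Keep the first occurrence of each stem; fold case when the dictionary ignores it.
  static List<String> dedup(List<String> stems, boolean ignoreCase) {
    Set<String> seen = new HashSet<>();
    List<String> deduped = new ArrayList<>();
    for (String s : stems) {
      String key = ignoreCase ? s.toLowerCase(Locale.ROOT) : s;
      if (seen.add(key)) { // add() returns false if the key was already present
        deduped.add(s);
      }
    }
    return deduped;
  }

  public static void main(String[] args) {
    // With ignoreCase=true, "Lucen" collapses onto "lucen": prints [lucen, lucene]
    System.out.println(dedup(Arrays.asList("lucen", "Lucen", "lucene"), true));
  }
}
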
@@ -1,137 +0,0 @@
package org.apache.lucene.analysis.hunspell2;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.List;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.CharsRef;

/**
 * TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple
 * stems, this filter can emit multiple tokens for each consumed token
 *
 * <p>
 * Note: This filter is aware of the {@link KeywordAttribute}. To prevent
 * certain terms from being passed to the stemmer
 * {@link KeywordAttribute#isKeyword()} should be set to <code>true</code>
 * in a previous {@link TokenStream}.
 *
 * Note: For including the original term as well as the stemmed version, see
 * {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory}
 * </p>
 *
 * @lucene.experimental
 */
public final class Hunspell2StemFilter extends TokenFilter {

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
  private final Stemmer stemmer;

  private List<CharsRef> buffer;
  private State savedState;

  private final boolean dedup;

  /** Create a {@link Hunspell2StemFilter} which deduplicates stems and has a maximum
   *  recursion level of 2.
   *  @see #Hunspell2StemFilter(TokenStream, Dictionary, int) */
  public Hunspell2StemFilter(TokenStream input, Dictionary dictionary) {
    this(input, dictionary, 2);
  }

  /**
   * Creates a new Hunspell2StemFilter that will stem tokens from the given TokenStream using affix rules in the provided
   * Dictionary
   *
   * @param input TokenStream whose tokens will be stemmed
   * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
   * @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
   */
  public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, int recursionCap) {
    this(input, dictionary, true, recursionCap);
  }

  /** Create a {@link Hunspell2StemFilter} which has a maximum recursion level of 2.
   *  @see #Hunspell2StemFilter(TokenStream, Dictionary, boolean, int) */
  public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup) {
    this(input, dictionary, dedup, 2);
  }

  /**
   * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
   * Dictionary
   *
   * @param input TokenStream whose tokens will be stemmed
   * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
   * @param dedup true if only unique terms should be output.
   * @param recursionCap maximum level of recursion stemmer can go into, defaults to <code>2</code>
   */
  public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap) {
    super(input);
    this.dedup = dedup;
    this.stemmer = new Stemmer(dictionary, recursionCap);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (buffer != null && !buffer.isEmpty()) {
      CharsRef nextStem = buffer.remove(0);
      restoreState(savedState);
      posIncAtt.setPositionIncrement(0);
      termAtt.setEmpty().append(nextStem);
      return true;
    }

    if (!input.incrementToken()) {
      return false;
    }

    if (keywordAtt.isKeyword()) {
      return true;
    }

    buffer = dedup ? stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length());

    if (buffer.isEmpty()) { // we do not know this word, return it unchanged
      return true;
    }

    CharsRef stem = buffer.remove(0);
    termAtt.setEmpty().append(stem);

    if (!buffer.isEmpty()) {
      savedState = captureState();
    }

    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    buffer = null;
  }
}

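
incrementToken() above emits the first stem in place of the consumed token and stacks any remaining stems on the same position by restoring the saved state and setting the position increment to 0. A self-contained sketch of that buffering scheme outside the TokenStream API (the token values mirror the tests later in this diff):

import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;

public class StackedEmitSketch {
  public static void main(String[] args) {
    Deque<String> buffer = new ArrayDeque<>(Arrays.asList("lucene", "lucen"));
    int position = 0;
    position += 1; // the first stem advances the position as usual
    System.out.println(position + " " + buffer.poll()); // 1 lucene
    while (!buffer.isEmpty()) {
      // increment 0: subsequent stems occupy the same position as the first
      System.out.println(position + " " + buffer.poll()); // 1 lucen
    }
  }
}
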
@@ -1,80 +0,0 @@
package org.apache.lucene.analysis.hunspell2;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * TokenFilterFactory that creates instances of {@link Hunspell2StemFilter}.
 * Example config for British English:
 * <pre class="prettyprint">
 * &lt;filter class="solr.Hunspell2StemFilterFactory"
 *          dictionary="en_GB.dic"
 *          affix="en_GB.aff" /&gt;</pre>
 * Both parameters dictionary and affix are mandatory.
 * Dictionaries for many languages are available through the OpenOffice project.
 *
 * See <a href="http://wiki.apache.org/solr/Hunspell">http://wiki.apache.org/solr/Hunspell</a>
 * @lucene.experimental
 */
public class Hunspell2StemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
  private static final String PARAM_DICTIONARY = "dictionary";
  private static final String PARAM_AFFIX = "affix";
  private static final String PARAM_RECURSION_CAP = "recursionCap";

  private final String dictionaryFile;
  private final String affixFile;
  private Dictionary dictionary;
  private int recursionCap;

  /** Creates a new Hunspell2StemFilterFactory */
  public Hunspell2StemFilterFactory(Map<String,String> args) {
    super(args);
    dictionaryFile = require(args, PARAM_DICTIONARY);
    affixFile = get(args, PARAM_AFFIX);
    recursionCap = getInt(args, PARAM_RECURSION_CAP, 2);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public void inform(ResourceLoader loader) throws IOException {
    try (InputStream affix = loader.openResource(affixFile);
         InputStream dictionary = loader.openResource(dictionaryFile)) {
      try {
        this.dictionary = new Dictionary(affix, dictionary);
      } catch (ParseException e) {
        throw new RuntimeException(e);
      }
    }
  }

  @Override
  public TokenStream create(TokenStream tokenStream) {
    return new Hunspell2StemFilter(tokenStream, dictionary, recursionCap);
  }
}

@@ -1,26 +0,0 @@
<!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements. See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
-->
<html>
<body>
Stemming TokenFilter using a Java implementation of the <a href="http://www.ldc.upenn.edu/Catalog/docs/LDC2008T01/acta04.pdf">
Hunspell stemming algorithm</a>.
<p>
Dictionaries can be found on <a href="http://wiki.services.openoffice.org/wiki/Dictionaries">
OpenOffice's wiki</a>
</p>
</body>
</html>

@@ -51,7 +51,6 @@ org.apache.lucene.analysis.hi.HindiNormalizationFilterFactory
org.apache.lucene.analysis.hi.HindiStemFilterFactory
org.apache.lucene.analysis.hu.HungarianLightStemFilterFactory
org.apache.lucene.analysis.hunspell.HunspellStemFilterFactory
org.apache.lucene.analysis.hunspell2.Hunspell2StemFilterFactory
org.apache.lucene.analysis.id.IndonesianStemFilterFactory
org.apache.lucene.analysis.in.IndicNormalizationFilterFactory
org.apache.lucene.analysis.it.ItalianLightStemFilterFactory

@@ -62,8 +62,8 @@ import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.hunspell.HunspellDictionary;
import org.apache.lucene.analysis.hunspell.HunspellDictionaryTest;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.TestHunspellStemFilter;
import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter;

@@ -406,13 +406,13 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
        return new IdentityEncoder(); // the other encoders will throw exceptions if tokens aren't numbers?
      }
    });
    put(HunspellDictionary.class, new ArgProducer() {
    put(Dictionary.class, new ArgProducer() {
      @Override public Object create(Random random) {
        // TODO: make nastier
        InputStream affixStream = HunspellDictionaryTest.class.getResourceAsStream("test.aff");
        InputStream dictStream = HunspellDictionaryTest.class.getResourceAsStream("test.dic");
        InputStream affixStream = TestHunspellStemFilter.class.getResourceAsStream("simple.aff");
        InputStream dictStream = TestHunspellStemFilter.class.getResourceAsStream("simple.dic");
        try {
          return new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
          return new Dictionary(affixStream, dictStream);
        } catch (Exception ex) {
          Rethrow.rethrow(ex);
          return null; // unreachable code

@@ -1,201 +0,0 @@
package org.apache.lucene.analysis.hunspell;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.lucene.util.LuceneTestCase;
import org.junit.Assert;
import org.junit.Test;

public class HunspellDictionaryTest extends LuceneTestCase {

  private class CloseCheckInputStream extends InputStream {
    private InputStream delegate;

    private boolean closed = false;

    public CloseCheckInputStream(InputStream delegate) {
      super();
      this.delegate = delegate;
    }

    @Override
    public int read() throws IOException {
      return delegate.read();
    }

    @Override
    public int hashCode() {
      return delegate.hashCode();
    }

    @Override
    public int read(byte[] b) throws IOException {
      return delegate.read(b);
    }

    @Override
    public boolean equals(Object obj) {
      return delegate.equals(obj);
    }

    @Override
    public int read(byte[] b, int off, int len) throws IOException {
      return delegate.read(b, off, len);
    }

    @Override
    public long skip(long n) throws IOException {
      return delegate.skip(n);
    }

    @Override
    public String toString() {
      return delegate.toString();
    }

    @Override
    public int available() throws IOException {
      return delegate.available();
    }

    @Override
    public void close() throws IOException {
      this.closed = true;
      delegate.close();
    }

    @Override
    public void mark(int readlimit) {
      delegate.mark(readlimit);
    }

    @Override
    public void reset() throws IOException {
      delegate.reset();
    }

    @Override
    public boolean markSupported() {
      return delegate.markSupported();
    }

    public boolean isClosed() {
      return this.closed;
    }

  }

  @Test
  public void testResourceCleanup() throws IOException, ParseException {
    CloseCheckInputStream affixStream = new CloseCheckInputStream(getClass().getResourceAsStream("testCompressed.aff"));
    CloseCheckInputStream dictStream = new CloseCheckInputStream(getClass().getResourceAsStream("testCompressed.dic"));

    new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);

    assertFalse(affixStream.isClosed());
    assertFalse(dictStream.isClosed());

    affixStream.close();
    dictStream.close();

    assertTrue(affixStream.isClosed());
    assertTrue(dictStream.isClosed());
  }

  @Test
  public void testHunspellDictionary_loadDicAff() throws IOException, ParseException {
    InputStream affixStream = getClass().getResourceAsStream("test.aff");
    InputStream dictStream = getClass().getResourceAsStream("test.dic");

    HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
    assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
    assertEquals("Wrong number of flags for lucen", 1, dictionary.lookupWord(new char[]{'l', 'u', 'c', 'e', 'n'}, 0, 5).get(0).getFlags().length);

    affixStream.close();
    dictStream.close();
  }

  @Test
  public void testHunspellDictionary_multipleDictWithOverride() throws IOException, ParseException {
    InputStream affixStream = getClass().getResourceAsStream("test.aff");
    List<InputStream> dictStreams = new ArrayList<InputStream>();
    dictStreams.add(getClass().getResourceAsStream("test.dic"));
    dictStreams.add(getClass().getResourceAsStream("testOverride.dic"));

    HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStreams, TEST_VERSION_CURRENT, false);
    assertEquals("Wrong number of flags for lucen", 3, dictionary.lookupWord(new char[]{'l', 'u', 'c', 'e', 'n'}, 0, 5).get(0).getFlags().length);
    assertEquals("Wrong number of flags for bar", 1, dictionary.lookupWord(new char[]{'b', 'a', 'r'}, 0, 3).get(0).getFlags().length);

    affixStream.close();
    for (InputStream dstream : dictStreams) {
      dstream.close();
    }
  }

  @Test
  public void testCompressedHunspellDictionary_loadDicAff() throws IOException, ParseException {
    InputStream affixStream = getClass().getResourceAsStream("testCompressed.aff");
    InputStream dictStream = getClass().getResourceAsStream("testCompressed.dic");

    HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
    assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());

    affixStream.close();
    dictStream.close();
  }

  @Test
  public void testHunspellDictionary_loadDicWrongAff() throws IOException, ParseException {
    InputStream affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff");
    InputStream dictStream = getClass().getResourceAsStream("test.dic");

    HunspellDictionary dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT, false, false);
    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
    assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
    //strict parsing disabled: malformed rule is not loaded
    assertNull(dictionary.lookupPrefix(new char[]{'a'}, 0, 1));
    affixStream.close();
    dictStream.close();

    affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff");
    dictStream = getClass().getResourceAsStream("test.dic");
    //strict parsing enabled: malformed rule causes ParseException
    try {
      dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT, false, true);
      Assert.fail();
    } catch (ParseException e) {
      Assert.assertEquals("The affix file contains a rule with less than five elements", e.getMessage());
      Assert.assertEquals(23, e.getErrorOffset());
    }

    affixStream.close();
    dictStream.close();
  }
}

@@ -1,92 +0,0 @@
package org.apache.lucene.analysis.hunspell;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.util.Arrays;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;

public class HunspellStemFilterTest extends BaseTokenStreamTestCase {

  private static HunspellDictionary DICTIONARY;
  @BeforeClass
  public static void beforeClass() throws IOException, ParseException {
    DICTIONARY = createDict(true);
  }
  @AfterClass
  public static void afterClass() {
    DICTIONARY = null;
  }
  public static HunspellDictionary createDict(boolean ignoreCase) throws IOException, ParseException {
    InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
    InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");

    return new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT, ignoreCase);
  }

  /**
   * Simple test for KeywordAttribute
   */
  public void testKeywordAttribute() throws IOException {
    MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome");
    tokenizer.setEnableChecks(true);
    HunspellStemFilter filter = new HunspellStemFilter(tokenizer, DICTIONARY, TestUtil.nextInt(random(), 1, 3));
    assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1});

    // assert with keyword marker
    tokenizer = whitespaceMockTokenizer("lucene is awesome");
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true);
    filter = new HunspellStemFilter(new SetKeywordMarkerFilter(tokenizer, set), DICTIONARY, TestUtil.nextInt(random(), 1, 3));
    assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer analyzer = new Analyzer() {

      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, TestUtil.nextInt(random(), 1, 3)));
      }
    };
    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
  }

  public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new KeywordTokenizer();
        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, TestUtil.nextInt(random(), 1, 3)));
      }
    };
    checkOneTerm(a, "", "");
  }
}

@@ -1,137 +0,0 @@
package org.apache.lucene.analysis.hunspell;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Version;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;

import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.util.List;

import static junit.framework.Assert.assertEquals;

public class HunspellStemmerTest extends LuceneTestCase {

  private static HunspellStemmer stemmer;

  @BeforeClass
  public static void beforeClass() throws IOException, ParseException {
    createStemmer(true);
  }

  @AfterClass
  public static void afterClass() {
    stemmer = null;
  }

  @Test
  public void testStem_simpleSuffix() {
    List<HunspellStemmer.Stem> stems = stemmer.stem("lucene");

    assertEquals(2, stems.size());
    assertEquals("lucene", stems.get(0).getStemString());
    assertEquals("lucen", stems.get(1).getStemString());

    stems = stemmer.stem("mahoute");
    assertEquals(1, stems.size());
    assertEquals("mahout", stems.get(0).getStemString());
  }

  @Test
  public void testStem_simplePrefix() {
    List<HunspellStemmer.Stem> stems = stemmer.stem("solr");

    assertEquals(1, stems.size());
    assertEquals("olr", stems.get(0).getStemString());
  }

  @Test
  public void testStem_recursiveSuffix() {
    List<HunspellStemmer.Stem> stems = stemmer.stem("abcd");

    assertEquals(1, stems.size());
    assertEquals("ab", stems.get(0).getStemString());
  }

  @Test
  public void testStem_ignoreCase() throws IOException, ParseException {
    List<HunspellStemmer.Stem> stems;
    createStemmer(true);

    stems = stemmer.stem("apache");
    assertEquals(1, stems.size());
    assertEquals("apach", stems.get(0).getStemString());

    stems = stemmer.stem("APACHE");
    assertEquals(1, stems.size());
    assertEquals("apach", stems.get(0).getStemString());

    stems = stemmer.stem("Apache");
    assertEquals(1, stems.size());
    assertEquals("apach", stems.get(0).getStemString());

    stems = stemmer.stem("foos");
    assertEquals(1, stems.size());
    assertEquals("foo", stems.get(0).getStemString());

    stems = stemmer.stem("mood");
    assertEquals(1, stems.size());
    assertEquals("moo", stems.get(0).getStemString());

    stems = stemmer.stem("Foos");
    assertEquals(1, stems.size());
    assertEquals("foo", stems.get(0).getStemString());

    // The "Foo" rule gets overridden by the "foo" rule, and we don't merge
    stems = stemmer.stem("Food");
    assertEquals(0, stems.size());

    stems = stemmer.stem("Mood");
    assertEquals(1, stems.size());
    assertEquals("moo", stems.get(0).getStemString());
  }

  @Test
  public void testStem_caseSensitive() throws IOException, ParseException {
    createStemmer(false);
    List<HunspellStemmer.Stem> stems = stemmer.stem("apache");
    assertEquals(0, stems.size());

    stems = stemmer.stem("Apache");
    assertEquals(1, stems.size());
    assertEquals("Apach", stems.get(0).getStemString());
  }


  private static void createStemmer(boolean ignoreCase) throws IOException, ParseException {
    InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
    InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");

    HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT, ignoreCase);
    stemmer = new HunspellStemmer(dictionary);

    affixStream.close();
    dictStream.close();
  }

}

@@ -1,4 +1,4 @@
package org.apache.lucene.analysis.hunspell2;
package org.apache.lucene.analysis.hunspell;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more

@@ -22,7 +22,7 @@ import java.io.InputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import org.apache.lucene.analysis.hunspell.HunspellDictionary;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.RamUsageEstimator;

@@ -33,7 +33,7 @@ import org.junit.Ignore;
 * wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/
 * Note some of the files differ only in case. This may be a problem on your operating system!
 */
//@Ignore("enable manually")
@Ignore("enable manually")
public class TestAllDictionaries extends LuceneTestCase {

  // set this to the location of where you downloaded all the files

@@ -162,21 +162,11 @@ public class TestAllDictionaries extends LuceneTestCase {
      assert dicEntry != null;
      ZipEntry affEntry = zip.getEntry(tests[i+2]);
      assert affEntry != null;

      // get ram from previous impl
      String oldRAM = "FAIL";
      try (InputStream dictionary = zip.getInputStream(dicEntry);
           InputStream affix = zip.getInputStream(affEntry)) {
        try {
          HunspellDictionary dic = new HunspellDictionary(affix, dictionary, TEST_VERSION_CURRENT);
          oldRAM = RamUsageEstimator.humanSizeOf(dic);
        } catch (Throwable t) {}
      }

      try (InputStream dictionary = zip.getInputStream(dicEntry);
           InputStream affix = zip.getInputStream(affEntry)) {
        Dictionary dic = new Dictionary(affix, dictionary);
        System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
        System.out.println(tests[i] + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
            "words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
            "flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
            "strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " +

@@ -204,7 +194,7 @@ public class TestAllDictionaries extends LuceneTestCase {

      try (InputStream dictionary = zip.getInputStream(dicEntry);
           InputStream affix = zip.getInputStream(affEntry)) {
        Dictionary dic = new Dictionary(affix, dictionary);
        new Dictionary(affix, dictionary);
      }
    }
  }

@@ -0,0 +1,110 @@
package org.apache.lucene.analysis.hunspell;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.Stemmer;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
import org.junit.BeforeClass;

import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class TestCaseInsensitive extends LuceneTestCase {
  private static Stemmer stemmer;

  @BeforeClass
  public static void beforeClass() throws Exception {
    try (InputStream affixStream = TestCaseInsensitive.class.getResourceAsStream("simple.aff");
         InputStream dictStream = TestCaseInsensitive.class.getResourceAsStream("mixedcase.dic")) {
      Dictionary dictionary = new Dictionary(affixStream, Collections.singletonList(dictStream), true);
      stemmer = new Stemmer(dictionary);
    }
  }

  @AfterClass
  public static void afterClass() {
    stemmer = null;
  }

  public void testCaseInsensitivity() {
    assertStemsTo("lucene", "lucene", "lucen");
    assertStemsTo("LuCeNe", "lucene", "lucen");
    assertStemsTo("mahoute", "mahout");
    assertStemsTo("MaHoUte", "mahout");
  }

  public void testSimplePrefix() {
    assertStemsTo("solr", "olr");
  }

  public void testRecursiveSuffix() {
    assertStemsTo("abcd", "ab");
  }

  // all forms unmunched from dictionary
  public void testAllStems() {
    assertStemsTo("ab", "ab");
    assertStemsTo("abc", "ab");
    assertStemsTo("apach", "apach");
    assertStemsTo("apache", "apach");
    assertStemsTo("foo", "foo");
    assertStemsTo("food", "foo");
    assertStemsTo("foos", "foo");
    assertStemsTo("lucen", "lucen");
    assertStemsTo("lucene", "lucen", "lucene");
    assertStemsTo("mahout", "mahout");
    assertStemsTo("mahoute", "mahout");
    assertStemsTo("moo", "moo");
    assertStemsTo("mood", "moo");
    assertStemsTo("olr", "olr");
    assertStemsTo("solr", "olr");
  }

  // some bogus stuff that should not stem (empty lists)!
  public void testBogusStems() {
    assertStemsTo("abs");
    assertStemsTo("abe");
    assertStemsTo("sab");
    assertStemsTo("sapach");
    assertStemsTo("sapache");
    assertStemsTo("apachee");
    assertStemsTo("sfoo");
    assertStemsTo("sfoos");
    assertStemsTo("fooss");
    assertStemsTo("lucenee");
    assertStemsTo("solre");
  }

  private void assertStemsTo(String s, String... expected) {
    Arrays.sort(expected);

    List<CharsRef> stems = stemmer.stem(s);
    String actual[] = new String[stems.size()];
    for (int i = 0; i < actual.length; i++) {
      actual[i] = stems.get(i).toString();
    }
    Arrays.sort(actual);

    assertArrayEquals(expected, actual);
  }
}

@@ -1,4 +1,4 @@
package org.apache.lucene.analysis.hunspell2;
package org.apache.lucene.analysis.hunspell;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more

@@ -22,6 +22,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;

import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;

@@ -1,4 +1,4 @@
package org.apache.lucene.analysis.hunspell2;
package org.apache.lucene.analysis.hunspell;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more

@@ -26,13 +26,15 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;

public class TestHunspell2StemFilter extends BaseTokenStreamTestCase {
public class TestHunspellStemFilter extends BaseTokenStreamTestCase {
  private static Dictionary dictionary;

  @BeforeClass

@@ -52,13 +54,21 @@ public class TestHunspell2StemFilter extends BaseTokenStreamTestCase {
  public void testKeywordAttribute() throws IOException {
    MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome");
    tokenizer.setEnableChecks(true);
    Hunspell2StemFilter filter = new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3));
    HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3));
    assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1});

    // assert with keyword marker
    tokenizer = whitespaceMockTokenizer("lucene is awesome");
    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true);
    filter = new Hunspell2StemFilter(new SetKeywordMarkerFilter(tokenizer, set), dictionary, TestUtil.nextInt(random(), 1, 3));
    filter = new HunspellStemFilter(new SetKeywordMarkerFilter(tokenizer, set), dictionary, TestUtil.nextInt(random(), 1, 3));
    assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
  }

  /** simple test for longestOnly option */
  public void testLongestOnly() throws IOException {
    MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome");
    tokenizer.setEnableChecks(true);
    HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, true, TestUtil.nextInt(random(), 1, 3), true);
    assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
  }

@@ -68,7 +78,7 @@ public class TestHunspell2StemFilter extends BaseTokenStreamTestCase {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
      }
    };
    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);

@@ -79,7 +89,7 @@ public class TestHunspell2StemFilter extends BaseTokenStreamTestCase {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new KeywordTokenizer();
        return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
        return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
      }
    };
    checkOneTerm(a, "", "");

@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.hunspell;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;

@@ -31,17 +30,17 @@ public class TestHunspellStemFilterFactory extends BaseTokenStreamFactoryTestCas
  public void testStemming() throws Exception {
    Reader reader = new StringReader("abc");
    TokenStream stream = whitespaceMockTokenizer(reader);
    stream = tokenFilterFactory("HunspellStem",
        "dictionary", "test.dic",
        "affix", "test.aff").create(stream);
    stream = tokenFilterFactory("Hunspell2Stem",
        "dictionary", "simple.dic",
        "affix", "simple.aff").create(stream);
    assertTokenStreamContents(stream, new String[] { "ab" });
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    try {
      tokenFilterFactory("HunspellStem",
          "dictionary", "test.dic",
      tokenFilterFactory("Hunspell2Stem",
          "dictionary", "simple.dic",
          "bogusArg", "bogusValue");
      fail();
    } catch (IllegalArgumentException expected) {

@@ -1,4 +1,4 @@
package org.apache.lucene.analysis.hunspell2;
package org.apache.lucene.analysis.hunspell;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more

@@ -17,6 +17,8 @@ package org.apache.lucene.analysis.hunspell2;
 * limitations under the License.
 */

import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.Stemmer;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;

@@ -0,0 +1,10 @@
9
Ab/C
apach/A
Foo/D
foo/E
Lucen/A
Lucene
mahout/A
Moo/E
olr/B

@@ -1,20 +0,0 @@
SET UTF-8
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

SFX A Y 3
SFX A 0 e n
SFX A 0 e t
SFX A 0 e h

SFX C Y 2
SFX C 0 d/C c
SFX C 0 c b

SFX D Y 1
SFX D 0 s o

SFX E Y 1
SFX E 0 d o

PFX B Y 1
PFX B 0 s o

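
For readers unfamiliar with the .aff syntax above, here is one rule group from that file annotated with standard Hunspell semantics (the # comments are mine, not part of the file):

SFX A Y 3      # suffix group for flag A; Y = may cross-product with prefixes; 3 rules follow
SFX A 0 e n    # strip nothing ("0"), append "e", condition: the stem ends in "n";
               # a .dic entry "lucen/A" therefore also matches the surface form "lucene"
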
@@ -1,10 +0,0 @@
9
lucen/A
lucene
mahout/A
olr/B
ab/C
Apach/A
Foo/E
foo/D
Moo/E

@@ -1,29 +0,0 @@
SET UTF-8
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

FLAG long

AF 5
AF AA
AF BB
AF CC
AF DD
AF EE

SFX AA Y 3
SFX AA 0 e n
SFX AA 0 e t
SFX AA 0 e h

SFX CC Y 2
SFX CC 0 d/3 c
SFX CC 0 c b

SFX DD Y 1
SFX DD 0 s o

SFX EE Y 1
SFX EE 0 d o

PFX BB Y 1
PFX BB 0 s o

@@ -1,9 +0,0 @@
6
lucen/1
lucene
mahout/1
olr/2
ab/3
Apach/1
foo/4
Foo/5

@@ -1,3 +0,0 @@
2
lucen/ABC
bar/A

@@ -1,24 +0,0 @@
SET UTF-8
TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ

SFX A Y 3
SFX A 0 e n
SFX A 0 e t
SFX A 0 e h

SFX C Y 2
SFX C 0 d/C c
SFX C 0 c b

SFX D Y 1
SFX D 0 s o

SFX E Y 1
SFX E 0 d o

PFX B Y 1
PFX B 0 s o

#wrong rule (only 4 elements)
PFX A0 Y 1
PFX A0 0 a

@@ -1,50 +0,0 @@
package org.apache.lucene.analysis.hunspell2;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;

/**
 * Simple tests to ensure the Hunspell stemmer loads from factory
 */
public class TestHunspell2StemFilterFactory extends BaseTokenStreamFactoryTestCase {
  public void testStemming() throws Exception {
    Reader reader = new StringReader("abc");
    TokenStream stream = whitespaceMockTokenizer(reader);
    stream = tokenFilterFactory("Hunspell2Stem",
        "dictionary", "simple.dic",
        "affix", "simple.aff").create(stream);
    assertTokenStreamContents(stream, new String[] { "ab" });
  }

  /** Test that bogus arguments result in exception */
  public void testBogusArguments() throws Exception {
    try {
      tokenFilterFactory("Hunspell2Stem",
          "dictionary", "simple.dic",
          "bogusArg", "bogusValue");
      fail();
    } catch (IllegalArgumentException expected) {
      assertTrue(expected.getMessage().contains("Unknown parameters"));
    }
  }
}