From 2e0fc562bc239ea897023796160a8870eddd2a48 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 24 Feb 2014 04:41:03 +0000 Subject: [PATCH] LUCENE-5468: commit current state git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571137 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/hunspell2/Affix.java | 157 +++++ .../lucene/analysis/hunspell2/Dictionary.java | 606 ++++++++++++++++++ .../hunspell2/Hunspell2StemFilter.java | 139 ++++ .../hunspell2/Hunspell2StemFilterFactory.java | 80 +++ .../analysis/hunspell2/ISO8859_14Decoder.java | 60 ++ .../lucene/analysis/hunspell2/Stemmer.java | 288 +++++++++ .../lucene/analysis/hunspell2/package.html | 26 + ...he.lucene.analysis.util.TokenFilterFactory | 1 + .../hunspell2/TestAllDictionaries.java | 205 ++++++ .../analysis/hunspell2/TestDictionary.java | 109 ++++ .../hunspell2/TestHunspell2StemFilter.java | 87 +++ .../TestHunspell2StemFilterFactory.java | 50 ++ .../analysis/hunspell2/TestStemmer.java | 105 +++ .../lucene/analysis/hunspell2/broken.aff | 24 + .../lucene/analysis/hunspell2/compressed.aff | 29 + .../lucene/analysis/hunspell2/compressed.dic | 9 + .../lucene/analysis/hunspell2/simple.aff | 20 + .../lucene/analysis/hunspell2/simple.dic | 10 + 18 files changed, 2005 insertions(+) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/broken.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.dic create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.dic diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java new file mode 100644 index 00000000000..41c3553fb77 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java @@ -0,0 +1,157 @@ +package org.apache.lucene.analysis.hunspell2; + +/* + * Licensed to the Apache Software Foundation 
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.regex.Pattern; + +/** + * Wrapper class representing a hunspell affix + */ +final class Affix { + + private String append; // the affix itself, what is appended + private char appendFlags[]; // continuation class flags + private String strip; + + private String condition; + private Pattern conditionPattern; + + private char flag; + + private boolean crossProduct; + + /** + * Checks whether the given text matches the conditional pattern on this affix + * + * @param text Text to check if it matches the affix's conditional pattern + * @return {@code true} if the text meets the condition, {@code false} otherwise + */ + public boolean checkCondition(CharSequence text) { + return conditionPattern.matcher(text).matches(); + } + + /** + * Returns the append defined for the affix + * + * @return Defined append + */ + public String getAppend() { + return append; + } + + /** + * Sets the append defined for the affix + * + * @param append Defined append for the affix + */ + public void setAppend(String append) { + this.append = append; + } + + /** + * Returns the flags defined for the affix append + * + * @return Flags defined for the affix append + */ + public char[] getAppendFlags() { + return appendFlags; + } + + /** + * Sets the flags defined for the affix append + * + * @param appendFlags Flags defined for the affix append + */ + public void setAppendFlags(char[] appendFlags) { + this.appendFlags = appendFlags; + } + + /** + * Returns the stripping characters defined for the affix + * + * @return Stripping characters defined for the affix + */ + public String getStrip() { + return strip; + } + + /** + * Sets the stripping characters defined for the affix + * + * @param strip Stripping characters defined for the affix + */ + public void setStrip(String strip) { + this.strip = strip; + } + + /** + * Returns the condition that must be met before the affix can be applied + * + * @return Condition that must be met before the affix can be applied + */ + public String getCondition() { + return condition; + } + + /** + * Sets the condition that must be met before the affix can be applied + * + * @param condition Condition to be met before affix application + * @param pattern Condition as a regular expression pattern + */ + public void setCondition(String condition, String pattern) { + this.condition = condition; + this.conditionPattern = Pattern.compile(pattern); + } + + /** + * Returns the affix flag + * + * @return Affix flag + */ + public char getFlag() { + return flag; + } + + /** + * Sets the affix flag + * + * @param flag Affix flag + */ + public void setFlag(char flag) { + this.flag = flag; + } + + /** + * Returns whether the affix is defined as cross product + * + * @return {@code true} if the affix is cross product, {@code false} 
otherwise + */ + public boolean isCrossProduct() { + return crossProduct; + } + + /** + * Sets whether the affix is defined as cross product + * + * @param crossProduct Whether the affix is defined as cross product + */ + public void setCrossProduct(boolean crossProduct) { + this.crossProduct = crossProduct; + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java new file mode 100644 index 00000000000..a7b9a58f080 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java @@ -0,0 +1,606 @@ +package org.apache.lucene.analysis.hunspell2; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.util.CharArrayMap; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.Version; +import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.PositiveIntOutputs; + +import java.io.*; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.TreeMap; + +/** + * In-memory structure for the dictionary (.dic) and affix (.aff) + * data of a hunspell dictionary. + */ +public class Dictionary { + + static final char[] NOFLAGS = new char[0]; + + private static final String ALIAS_KEY = "AF"; + private static final String PREFIX_KEY = "PFX"; + private static final String SUFFIX_KEY = "SFX"; + private static final String FLAG_KEY = "FLAG"; + + private static final String NUM_FLAG_TYPE = "num"; + private static final String UTF8_FLAG_TYPE = "UTF-8"; + private static final String LONG_FLAG_TYPE = "long"; + + private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*"; + private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s"; + + public CharArrayMap> prefixes; + public CharArrayMap> suffixes; + + // the entries in the .dic file, mapping to their set of flags. + // the fst output is the ordinal for flagLookup + public FST words; + // the list of unique flagsets (wordforms). theoretically huge, but practically + // small (e.g. for polish this is 756), otherwise humans wouldn't be able to deal with it either. 
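+  // e.g. two .dic entries "drink/RQ" and "eat/RQ" (hypothetical flags) share one
+  // flagset {R,Q}: it is stored once here, and both words' FST outputs point at its ord.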
+  public BytesRefHash flagLookup = new BytesRefHash();
+
+  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
+
+  private String[] aliases;
+  private int aliasCount = 0;
+
+  /**
+   * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
+   * and dictionary files.
+   * You have to close the provided InputStreams yourself.
+   *
+   * @param affix InputStream for reading the hunspell affix file (won't be closed).
+   * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
+   * @throws IOException Can be thrown while reading from the InputStreams
+   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
+   */
+  public Dictionary(InputStream affix, InputStream dictionary) throws IOException, ParseException {
+    BufferedInputStream buffered = new BufferedInputStream(affix, 8192);
+    buffered.mark(8192);
+    // read the encoding from the buffered stream so the mark/reset below actually rewinds it;
+    // reading the raw affix stream here would silently skip these bytes for readAffixFile
+    String encoding = getDictionaryEncoding(buffered);
+    buffered.reset();
+    CharsetDecoder decoder = getJavaEncoding(encoding);
+    readAffixFile(buffered, decoder);
+    TreeMap<BytesRef,Integer> tempWords = new TreeMap<BytesRef,Integer>();
+    flagLookup.add(new BytesRef()); // no flags -> ord 0
+    readDictionaryFile(dictionary, decoder, tempWords);
+    PositiveIntOutputs o = PositiveIntOutputs.getSingleton();
+    Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE4, o); // nocommit: byte4
+    IntsRef scratchInts = new IntsRef();
+    for (Map.Entry<BytesRef,Integer> e : tempWords.entrySet()) {
+      UnicodeUtil.UTF8toUTF32(e.getKey(), scratchInts);
+      b.add(scratchInts, e.getValue().longValue());
+    }
+    words = b.finish();
+  }
+
+  /**
+   * Looks up the flags for the word form created from the given char array, offset and length
+   *
+   * @param word Char array to generate the String from
+   * @param offset Offset in the char array that the String starts at
+   * @param length Length from the offset that the String is
+   * @return Flags of the word, or {@code null} if the word is not in the dictionary
+   */
+  char[] lookupWord(char word[], int offset, int length, BytesRef scratch) {
+    Integer ord = null;
+    try {
+      ord = lookupOrd(word, offset, length);
+    } catch (IOException ex) { /* bogus */ }
+    if (ord == null) {
+      return null;
+    }
+    return decodeFlags(flagLookup.get(ord, scratch));
+  }
+
+  public Integer lookupOrd(char word[], int offset, int length) throws IOException {
+    final FST.BytesReader bytesReader = words.getBytesReader();
+    final FST.Arc<Long> arc = words.getFirstArc(new FST.Arc<Long>());
+    // Accumulate output as we go
+    final Long NO_OUTPUT = words.outputs.getNoOutput();
+    Long output = NO_OUTPUT;
+
+    int l = offset + length;
+    for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) {
+      cp = Character.codePointAt(word, i, l);
+      if (words.findTargetArc(cp, arc, arc, bytesReader) == null) {
+        return null;
+      } else if (arc.output != NO_OUTPUT) {
+        output = words.outputs.add(output, arc.output);
+      }
+    }
+    if (words.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) {
+      return null;
+    } else if (arc.output != NO_OUTPUT) {
+      return words.outputs.add(output, arc.output).intValue();
+    } else {
+      return output.intValue();
+    }
+  }
+
+  /**
+   * Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length
+   *
+   * @param word Char array to generate the String from
+   * @param offset Offset in the char array that the String starts at
+   * @param length Length from the offset that the String is
+   * @return List of 
HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found + */ + public List lookupPrefix(char word[], int offset, int length) { + return prefixes.get(word, offset, length); + } + + /** + * Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length + * + * @param word Char array to generate the String from + * @param offset Offset in the char array that the String starts at + * @param length Length from the offset that the String is + * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found + */ + List lookupSuffix(char word[], int offset, int length) { + return suffixes.get(word, offset, length); + } + + /** + * Reads the affix file through the provided InputStream, building up the prefix and suffix maps + * + * @param affixStream InputStream to read the content of the affix file from + * @param decoder CharsetDecoder to decode the content of the file + * @throws IOException Can be thrown while reading from the InputStream + */ + private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException { + prefixes = new CharArrayMap>(Version.LUCENE_CURRENT, 8, false); + suffixes = new CharArrayMap>(Version.LUCENE_CURRENT, 8, false); + + LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder)); + String line = null; + while ((line = reader.readLine()) != null) { + if (line.startsWith(ALIAS_KEY)) { + parseAlias(line); + } else if (line.startsWith(PREFIX_KEY)) { + parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN); + } else if (line.startsWith(SUFFIX_KEY)) { + parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN); + } else if (line.startsWith(FLAG_KEY)) { + // Assume that the FLAG line comes before any prefix or suffixes + // Store the strategy so it can be used when parsing the dic file + flagParsingStrategy = getFlagParsingStrategy(line); + } + } + } + + /** + * Parses a specific affix rule putting the result into the provided affix map + * + * @param affixes Map where the result of the parsing will be put + * @param header Header line of the affix rule + * @param reader BufferedReader to read the content of the rule from + * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex + * pattern + * @throws IOException Can be thrown while reading the rule + */ + private void parseAffix(CharArrayMap> affixes, + String header, + LineNumberReader reader, + String conditionPattern) throws IOException, ParseException { + String args[] = header.split("\\s+"); + + boolean crossProduct = args[2].equals("Y"); + + int numLines = Integer.parseInt(args[3]); + for (int i = 0; i < numLines; i++) { + String line = reader.readLine(); + String ruleArgs[] = line.split("\\s+"); + + if (ruleArgs.length < 5) { + throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber()); + } + + Affix affix = new Affix(); + + affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1])); + affix.setStrip(ruleArgs[2].equals("0") ? 
"" : ruleArgs[2]); + + String affixArg = ruleArgs[3]; + + int flagSep = affixArg.lastIndexOf('/'); + if (flagSep != -1) { + String flagPart = affixArg.substring(flagSep + 1); + + if (aliasCount > 0) { + flagPart = getAliasValue(Integer.parseInt(flagPart)); + } + + char appendFlags[] = flagParsingStrategy.parseFlags(flagPart); + Arrays.sort(appendFlags); + affix.setAppendFlags(appendFlags); + affix.setAppend(affixArg.substring(0, flagSep)); + } else { + affix.setAppend(affixArg); + } + + String condition = ruleArgs[4]; + // at least the gascon affix file has this issue + if (condition.startsWith("[") && !condition.endsWith("]")) { + condition = condition + "]"; + } + // "dash hasn't got special meaning" (we must escape it) + if (condition.indexOf('-') >= 0) { + condition = condition.replace("-", "\\-"); + } + affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition)); + affix.setCrossProduct(crossProduct); + + List list = affixes.get(affix.getAppend()); + if (list == null) { + list = new ArrayList(); + affixes.put(affix.getAppend(), list); + } + + list.add(affix); + } + } + + /** + * Parses the encoding specified in the affix file readable through the provided InputStream + * + * @param affix InputStream for reading the affix file + * @return Encoding specified in the affix file + * @throws IOException Can be thrown while reading from the InputStream + * @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET } + */ + private String getDictionaryEncoding(InputStream affix) throws IOException, ParseException { + final StringBuilder encoding = new StringBuilder(); + for (;;) { + encoding.setLength(0); + int ch; + while ((ch = affix.read()) >= 0) { + if (ch == '\n') { + break; + } + if (ch != '\r') { + encoding.append((char)ch); + } + } + if ( + encoding.length() == 0 || encoding.charAt(0) == '#' || + // this test only at the end as ineffective but would allow lines only containing spaces: + encoding.toString().trim().length() == 0 + ) { + if (ch < 0) { + throw new ParseException("Unexpected end of affix file.", 0); + } + continue; + } + if (encoding.length() > 4 && "SET ".equals(encoding.substring(0, 4))) { + // cleanup the encoding string, too (whitespace) + return encoding.substring(4).trim(); + } + } + } + + static final Map CHARSET_ALIASES; + static { + Map m = new HashMap<>(); + m.put("microsoft-cp1251", "windows-1251"); + m.put("TIS620-2533", "TIS-620"); + CHARSET_ALIASES = Collections.unmodifiableMap(m); + } + + /** + * Retrieves the CharsetDecoder for the given encoding. Note, This isn't perfect as I think ISCII-DEVANAGARI and + * MICROSOFT-CP1251 etc are allowed... 
+ * + * @param encoding Encoding to retrieve the CharsetDecoder for + * @return CharSetDecoder for the given encoding + */ + private CharsetDecoder getJavaEncoding(String encoding) { + if ("ISO8859-14".equals(encoding)) { + return new ISO8859_14Decoder(); + } + String canon = CHARSET_ALIASES.get(encoding); + if (canon != null) { + encoding = canon; + } + Charset charset = Charset.forName(encoding); + return charset.newDecoder().onMalformedInput(CodingErrorAction.REPLACE); + } + + /** + * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definition line taken from the affix file + * + * @param flagLine Line containing the flag information + * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition + */ + private FlagParsingStrategy getFlagParsingStrategy(String flagLine) { + String flagType = flagLine.substring(5); + + if (NUM_FLAG_TYPE.equals(flagType)) { + return new NumFlagParsingStrategy(); + } else if (UTF8_FLAG_TYPE.equals(flagType)) { + return new SimpleFlagParsingStrategy(); + } else if (LONG_FLAG_TYPE.equals(flagType)) { + return new DoubleASCIIFlagParsingStrategy(); + } + + throw new IllegalArgumentException("Unknown flag type: " + flagType); + } + + /** + * Reads the dictionary file through the provided InputStream, building up the words map + * + * @param dictionary InputStream to read the dictionary file through + * @param decoder CharsetDecoder used to decode the contents of the file + * @throws IOException Can be thrown while reading from the file + */ + private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder, TreeMap words) throws IOException { + BytesRef flagsScratch = new BytesRef(); + BytesRef flagsScratch2 = new BytesRef(); + + BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder)); + // TODO: don't create millions of strings. + String line = reader.readLine(); // first line is number of entries + // sometimes the number of entries has a comment/copyright after it + line = line.replaceFirst("\\s*\\#.*$", ""); + int numEntries = Integer.parseInt(line); + + // TODO: the flags themselves can be double-chars (long) or also numeric + // either way the trick is to encode them as char... but they must be parsed differently + while ((line = reader.readLine()) != null) { + String entry; + char wordForm[]; + + int flagSep = line.lastIndexOf('/'); + if (flagSep == -1) { + wordForm = NOFLAGS; + entry = line; + } else { + // note, there can be comments (morph description) after a flag. 
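+        // e.g. a line like "word/FLAGS<tab>po:noun" (hypothetical) keeps morphological
+        // fields after the tab; only the "word/FLAGS" part matters here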
+ // we should really look for any whitespace + int end = line.indexOf('\t', flagSep); + if (end == -1) + end = line.length(); + + String flagPart = line.substring(flagSep + 1, end); + if (aliasCount > 0) { + flagPart = getAliasValue(Integer.parseInt(flagPart)); + } + + wordForm = flagParsingStrategy.parseFlags(flagPart); + Arrays.sort(wordForm); + entry = line.substring(0, flagSep); + } + + BytesRef scratch = new BytesRef(entry); + Integer existingOrd = words.get(scratch); + final char mergedEntries[]; + if (existingOrd == null || existingOrd == 0) { + mergedEntries = wordForm; + } else { + flagLookup.get(existingOrd, flagsScratch2); + mergedEntries = merge(decodeFlags(flagsScratch2), wordForm); + } + + final int hashCode = encodeFlagsWithHash(flagsScratch, mergedEntries); + int ord = flagLookup.add(flagsScratch, hashCode); + if (ord < 0) { + // already exists in our hash + ord = (-ord)-1; + } + + words.put(scratch, ord); + } + } + + static char[] decodeFlags(BytesRef b) { + int len = b.length >>> 1; + char flags[] = new char[len]; + int upto = 0; + int end = b.offset + b.length; + for (int i = b.offset; i < end; i += 2) { + flags[upto++] = (char)((b.bytes[i] << 8) | (b.bytes[i+1] & 0xff)); + } + return flags; + } + + static int encodeFlagsWithHash(BytesRef b, char flags[]) { + int hash = 0; + int len = flags.length << 1; + b.grow(len); + b.length = len; + int upto = b.offset; + for (int i = 0; i < flags.length; i++) { + int flag = flags[i]; + hash = 31*hash + (b.bytes[upto++] = (byte) ((flag >> 8) & 0xff)); + hash = 31*hash + (b.bytes[upto++] = (byte) (flag & 0xff)); + } + return hash; + } + + private void parseAlias(String line) { + String ruleArgs[] = line.split("\\s+"); + if (aliases == null) { + //first line should be the aliases count + final int count = Integer.parseInt(ruleArgs[1]); + aliases = new String[count]; + } else { + aliases[aliasCount++] = ruleArgs[1]; + } + } + + private String getAliasValue(int id) { + try { + return aliases[id - 1]; + } catch (IndexOutOfBoundsException ex) { + throw new IllegalArgumentException("Bad flag alias number:" + id, ex); + } + } + + /** + * Abstraction of the process of parsing flags taken from the affix and dic files + */ + private static abstract class FlagParsingStrategy { + + /** + * Parses the given String into a single flag + * + * @param rawFlag String to parse into a flag + * @return Parsed flag + */ + char parseFlag(String rawFlag) { + return parseFlags(rawFlag)[0]; + } + + /** + * Parses the given String into multiple flags + * + * @param rawFlags String to parse into flags + * @return Parsed flags + */ + abstract char[] parseFlags(String rawFlags); + } + + /** + * Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as a individual flags. + * Can be used with both the ASCII and UTF-8 flag types. + */ + private static class SimpleFlagParsingStrategy extends FlagParsingStrategy { + @Override + public char[] parseFlags(String rawFlags) { + return rawFlags.toCharArray(); + } + } + + /** + * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form. In the case + * of multiple flags, each number is separated by a comma. 
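+   * For example, {@code "65,222"} (hypothetical values) parses into the two flags
+   * {@code (char) 65} and {@code (char) 222}.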
+   */
+  private static class NumFlagParsingStrategy extends FlagParsingStrategy {
+    @Override
+    public char[] parseFlags(String rawFlags) {
+      String[] rawFlagParts = rawFlags.trim().split(",");
+      char[] flags = new char[rawFlagParts.length];
+      int upto = 0;
+
+      for (int i = 0; i < rawFlagParts.length; i++) {
+        // note, removing the trailing X/leading I for nepali... what is the rule here?!
+        String replacement = rawFlagParts[i].replaceAll("[^0-9]", "");
+        // note, ignoring empty flags (this happens in danish, for example)
+        if (replacement.isEmpty()) {
+          continue;
+        }
+        flags[upto++] = (char) Integer.parseInt(replacement);
+      }
+
+      if (upto < flags.length) {
+        flags = Arrays.copyOf(flags, upto);
+      }
+      return flags;
+    }
+  }
+
+  /**
+   * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes
+   * must be combined into a single character.
+   *
+   * TODO (rmuir) test
+   */
+  private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy {
+
+    @Override
+    public char[] parseFlags(String rawFlags) {
+      if (rawFlags.length() == 0) {
+        return new char[0];
+      }
+
+      StringBuilder builder = new StringBuilder();
+      for (int i = 0; i < rawFlags.length(); i+=2) {
+        // combine the two characters losslessly (high byte << 8 | low byte);
+        // simply adding the two codes would make distinct flag pairs collide
+        char cookedFlag = (char) (rawFlags.charAt(i) << 8 | rawFlags.charAt(i + 1));
+        builder.append(cookedFlag);
+      }
+
+      char flags[] = new char[builder.length()];
+      builder.getChars(0, builder.length(), flags, 0);
+      return flags;
+    }
+  }
+
+  static boolean hasFlag(char flags[], char flag) {
+    return Arrays.binarySearch(flags, flag) >= 0;
+  }
+
+  static char[] merge(char[] flags1, char[] flags2) {
+    char merged[] = new char[flags1.length + flags2.length];
+    int i1 = 0, i2 = 0;
+    int last = -1;
+    int upto = 0;
+
+    while (i1 < flags1.length && i2 < flags2.length) {
+      final char next;
+      if (flags1[i1] <= flags2[i2]) {
+        next = flags1[i1++];
+      } else {
+        next = flags2[i2++];
+      }
+      if (next != last) {
+        merged[upto++] = next;
+        last = next;
+      }
+    }
+
+    while (i1 < flags1.length) {
+      char next = flags1[i1++];
+      if (next != last) {
+        merged[upto++] = next;
+        last = next;
+      }
+    }
+
+    while (i2 < flags2.length) {
+      char next = flags2[i2++];
+      if (next != last) {
+        merged[upto++] = next;
+        last = next;
+      }
+    }
+
+    if (merged.length != upto) {
+      merged = Arrays.copyOf(merged, upto);
+    }
+
+    return merged;
+  }
+}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java
new file mode 100644
index 00000000000..f9dfb770ab2
--- /dev/null
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java
@@ -0,0 +1,139 @@
+package org.apache.lucene.analysis.hunspell2;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.hunspell2.Stemmer.Stem; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; + +/** + * TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple + * stems, this filter can emit multiple tokens for each consumed token + * + *
+ * <p>
+ * Note: This filter is aware of the {@link KeywordAttribute}. To prevent
+ * certain terms from being passed to the stemmer
+ * {@link KeywordAttribute#isKeyword()} should be set to <code>true</code>
+ * in a previous {@link TokenStream}.
+ *
+ * Note: For including the original term as well as the stemmed version, see
+ * {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory}
+ * </p>
+ * + * @lucene.experimental + */ +public final class Hunspell2StemFilter extends TokenFilter { + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class); + private final Stemmer stemmer; + + private List buffer; + private State savedState; + + private final boolean dedup; + + /** Create a {@link Hunspell2StemFilter} which deduplicates stems and has a maximum + * recursion level of 2. + * @see #Hunspell2StemFilter(TokenStream, Dictionary, int) */ + public Hunspell2StemFilter(TokenStream input, Dictionary dictionary) { + this(input, dictionary, 2); + } + + /** + * Creates a new Hunspell2StemFilter that will stem tokens from the given TokenStream using affix rules in the provided + * Dictionary + * + * @param input TokenStream whose tokens will be stemmed + * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens + * @param recursionCap maximum level of recursion stemmer can go into, defaults to 2 + */ + public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, int recursionCap) { + this(input, dictionary, true, recursionCap); + } + + /** Create a {@link Hunspell2StemFilter} which has a maximum recursion level of 2. + * @see #Hunspell2StemFilter(TokenStream, Dictionary, boolean, int) */ + public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup) { + this(input, dictionary, dedup, 2); + } + + /** + * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided + * Dictionary + * + * @param input TokenStream whose tokens will be stemmed + * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens + * @param dedup true if only unique terms should be output. + * @param recursionCap maximum level of recursion stemmer can go into, defaults to 2 + */ + public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap) { + super(input); + this.dedup = dedup; + this.stemmer = new Stemmer(dictionary, recursionCap); + } + + @Override + public boolean incrementToken() throws IOException { + if (buffer != null && !buffer.isEmpty()) { + Stem nextStem = buffer.remove(0); + restoreState(savedState); + posIncAtt.setPositionIncrement(0); + termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength()); + termAtt.setLength(nextStem.getStemLength()); + return true; + } + + if (!input.incrementToken()) { + return false; + } + + if (keywordAtt.isKeyword()) { + return true; + } + + buffer = dedup ? 
stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length()); + + if (buffer.isEmpty()) { // we do not know this word, return it unchanged + return true; + } + + Stem stem = buffer.remove(0); + termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength()); + termAtt.setLength(stem.getStemLength()); + + if (!buffer.isEmpty()) { + savedState = captureState(); + } + + return true; + } + + @Override + public void reset() throws IOException { + super.reset(); + buffer = null; + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java new file mode 100644 index 00000000000..6ce73698dfd --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java @@ -0,0 +1,80 @@ +package org.apache.lucene.analysis.hunspell2; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.InputStream; +import java.text.ParseException; +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.ResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoaderAware; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * TokenFilterFactory that creates instances of {@link Hunspell2StemFilter}. + * Example config for British English: + *
+ * <pre class="prettyprint">
+ * &lt;filter class=&quot;solr.Hunspell2StemFilterFactory&quot;
+ *         dictionary=&quot;en_GB.dic&quot;
+ *         affix=&quot;en_GB.aff&quot; /&gt;</pre>
+ * Both parameters dictionary and affix are mandatory. + * Dictionaries for many languages are available through the OpenOffice project. + * + * See http://wiki.apache.org/solr/Hunspell + * @lucene.experimental + */ +public class Hunspell2StemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { + private static final String PARAM_DICTIONARY = "dictionary"; + private static final String PARAM_AFFIX = "affix"; + private static final String PARAM_RECURSION_CAP = "recursionCap"; + + private final String dictionaryFile; + private final String affixFile; + private Dictionary dictionary; + private int recursionCap; + + /** Creates a new Hunspell2StemFilterFactory */ + public Hunspell2StemFilterFactory(Map args) { + super(args); + dictionaryFile = require(args, PARAM_DICTIONARY); + affixFile = get(args, PARAM_AFFIX); + recursionCap = getInt(args, PARAM_RECURSION_CAP, 2); + if (!args.isEmpty()) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + @Override + public void inform(ResourceLoader loader) throws IOException { + try (InputStream affix = loader.openResource(affixFile); + InputStream dictionary = loader.openResource(dictionaryFile)) { + try { + this.dictionary = new Dictionary(affix, dictionary); + } catch (ParseException e) { + throw new RuntimeException(e); + } + } + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new Hunspell2StemFilter(tokenStream, dictionary, recursionCap); + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java new file mode 100644 index 00000000000..4de0d4bc051 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java @@ -0,0 +1,60 @@ +package org.apache.lucene.analysis.hunspell2; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CoderResult; + +import org.apache.lucene.util.IOUtils; + +// many hunspell dictionaries use this encoding, yet java does not have it?!?! 
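+// (ISO-8859-14 is "Latin-8"/Celtic: bytes below 0xA0 decode to themselves,
+// bytes 0xA0-0xFF are remapped through TABLE below)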
+final class ISO8859_14Decoder extends CharsetDecoder { + + static final char TABLE[] = new char[] { + 0x00A0, 0x1E02, 0x1E03, 0x00A3, 0x010A, 0x010B, 0x1E0A, 0x00A7, + 0x1E80, 0x00A9, 0x1E82, 0x1E0B, 0x1EF2, 0x00AD, 0x00AE, 0x0178, + 0x1E1E, 0x1E1F, 0x0120, 0x0121, 0x1E40, 0x1E41, 0x00B6, 0x1E56, + 0x1E81, 0x1E57, 0x1E83, 0x1E60, 0x1EF3, 0x1E84, 0x1E85, 0x1E61, + 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, + 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, + 0x0174, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x1E6A, + 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x0176, 0x00DF, + 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, + 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, + 0x0175, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x1E6B, + 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x0177, 0x00FF + }; + + ISO8859_14Decoder() { + super(IOUtils.CHARSET_UTF_8, 1f, 1f); + } + + @Override + protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) { + while (in.hasRemaining() && out.hasRemaining()) { + char ch = (char) (in.get() & 0xff); + if (ch >= 0xA0) { + ch = TABLE[ch - 0xA0]; + } + out.put(ch); + } + return in.hasRemaining() ? CoderResult.OVERFLOW : CoderResult.UNDERFLOW; + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java new file mode 100644 index 00000000000..7d36c81e4ae --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java @@ -0,0 +1,288 @@ +package org.apache.lucene.analysis.hunspell2; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.Version; + +/** + * Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word. It + * conforms to the algorithm in the original hunspell algorithm, including recursive suffix stripping. + */ +final class Stemmer { + private final int recursionCap; + private final Dictionary dictionary; + private BytesRef scratch = new BytesRef(); + private final StringBuilder segment = new StringBuilder(); + + /** + * Constructs a new Stemmer which will use the provided Dictionary to create its stems. Uses the + * default recursion cap of 2 (based on Hunspell documentation). 
+ * + * @param dictionary Dictionary that will be used to create the stems + */ + public Stemmer(Dictionary dictionary) { + this(dictionary, 2); + } + + /** + * Constructs a new Stemmer which will use the provided Dictionary to create its stems. + * + * @param dictionary Dictionary that will be used to create the stems + * @param recursionCap maximum level of recursion stemmer can go into + */ + public Stemmer(Dictionary dictionary, int recursionCap) { + this.dictionary = dictionary; + this.recursionCap = recursionCap; + } + + /** + * Find the stem(s) of the provided word. + * + * @param word Word to find the stems for + * @return List of stems for the word + */ + public List stem(String word) { + return stem(word.toCharArray(), word.length()); + } + + /** + * Find the stem(s) of the provided word + * + * @param word Word to find the stems for + * @return List of stems for the word + */ + public List stem(char word[], int length) { + List stems = new ArrayList(); + if (dictionary.lookupWord(word, 0, length, scratch) != null) { + stems.add(new Stem(word, length)); + } + stems.addAll(stem(word, length, null, 0)); + return stems; + } + + /** + * Find the unique stem(s) of the provided word + * + * @param word Word to find the stems for + * @return List of stems for the word + */ + public List uniqueStems(char word[], int length) { + List stems = new ArrayList(); + CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, false); + if (dictionary.lookupWord(word, 0, length, scratch) != null) { + stems.add(new Stem(word, length)); + terms.add(word); + } + List otherStems = stem(word, length, null, 0); + for (Stem s : otherStems) { + if (!terms.contains(s.stem)) { + stems.add(s); + terms.add(s.stem); + } + } + return stems; + } + + // ================================================= Helper Methods ================================================ + + /** + * Generates a list of stems for the provided word + * + * @param word Word to generate the stems for + * @param flags Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step + * @param recursionDepth Level of recursion this stemming step is at + * @return List of stems, or empty list if no stems are found + */ + private List stem(char word[], int length, char[] flags, int recursionDepth) { + List stems = new ArrayList(); + + for (int i = 0; i < length; i++) { + List suffixes = dictionary.lookupSuffix(word, i, length - i); + if (suffixes == null) { + continue; + } + + for (Affix suffix : suffixes) { + if (hasCrossCheckedFlag(suffix.getFlag(), flags)) { + int deAffixedLength = length - suffix.getAppend().length(); + // TODO: can we do this in-place? 
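+          // e.g. (hypothetical rule) word "walked" with suffix append "ed" and strip "":
+          // deAffixedLength = 6 - 2 = 4, so strippedWord becomes "walk"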
+ String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString(); + + List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth); + for (Stem stem : stemList) { + stem.addSuffix(suffix); + } + + stems.addAll(stemList); + } + } + } + + for (int i = length - 1; i >= 0; i--) { + List prefixes = dictionary.lookupPrefix(word, 0, i); + if (prefixes == null) { + continue; + } + + for (Affix prefix : prefixes) { + if (hasCrossCheckedFlag(prefix.getFlag(), flags)) { + int deAffixedStart = prefix.getAppend().length(); + int deAffixedLength = length - deAffixedStart; + + String strippedWord = new StringBuilder().append(prefix.getStrip()) + .append(word, deAffixedStart, deAffixedLength) + .toString(); + + List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth); + for (Stem stem : stemList) { + stem.addPrefix(prefix); + } + + stems.addAll(stemList); + } + } + } + + return stems; + } + + /** + * Applies the affix rule to the given word, producing a list of stems if any are found + * + * @param strippedWord Word the affix has been removed and the strip added + * @param affix HunspellAffix representing the affix rule itself + * @param recursionDepth Level of recursion this stemming step is at + * @return List of stems for the word, or an empty list if none are found + */ + public List applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) { + segment.setLength(0); + segment.append(strippedWord, 0, length); + if (!affix.checkCondition(segment)) { + return Collections.emptyList(); + } + + List stems = new ArrayList(); + + char wordFlags[] = dictionary.lookupWord(strippedWord, 0, length, scratch); + if (wordFlags != null && Dictionary.hasFlag(wordFlags, affix.getFlag())) { + stems.add(new Stem(strippedWord, length)); + } + + if (affix.isCrossProduct() && recursionDepth < recursionCap) { + stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), ++recursionDepth)); + } + + return stems; + } + + /** + * Checks if the given flag cross checks with the given array of flags + * + * @param flag Flag to cross check with the array of flags + * @param flags Array of flags to cross check against. Can be {@code null} + * @return {@code true} if the flag is found in the array or the array is {@code null}, {@code false} otherwise + */ + private boolean hasCrossCheckedFlag(char flag, char[] flags) { + return flags == null || Arrays.binarySearch(flags, flag) >= 0; + } + + /** + * Stem represents all information known about a stem of a word. This includes the stem, and the prefixes and suffixes + * that were used to change the word into the stem. + */ + public static class Stem { + + private final List prefixes = new ArrayList(); + private final List suffixes = new ArrayList(); + private final char stem[]; + private final int stemLength; + + /** + * Creates a new Stem wrapping the given word stem + * + * @param stem Stem of a word + */ + public Stem(char stem[], int stemLength) { + this.stem = stem; + this.stemLength = stemLength; + } + + /** + * Adds a prefix to the list of prefixes used to generate this stem. Because it is assumed that prefixes are added + * depth first, the prefix is added to the front of the list + * + * @param prefix Prefix to add to the list of prefixes for this stem + */ + public void addPrefix(Affix prefix) { + prefixes.add(0, prefix); + } + + /** + * Adds a suffix to the list of suffixes used to generate this stem. 
Because it is assumed that suffixes are added + * depth first, the suffix is added to the end of the list + * + * @param suffix Suffix to add to the list of suffixes for this stem + */ + public void addSuffix(Affix suffix) { + suffixes.add(suffix); + } + + /** + * Returns the list of prefixes used to generate the stem + * + * @return List of prefixes used to generate the stem or an empty list if no prefixes were required + */ + public List getPrefixes() { + return prefixes; + } + + /** + * Returns the list of suffixes used to generate the stem + * + * @return List of suffixes used to generate the stem or an empty list if no suffixes were required + */ + public List getSuffixes() { + return suffixes; + } + + /** + * Returns the text of the word's stem. + * @see #getStemLength() + */ + public char[] getStem() { + return stem; + } + + /** Returns the valid length of the text in {@link #getStem()} */ + public int getStemLength() { + return stemLength; + } + + /** Only use this if you really need a string (e.g. for testing) */ + public String getStemString() { + return new String(stem, 0, stemLength); + } + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html new file mode 100644 index 00000000000..196591969e8 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html @@ -0,0 +1,26 @@ + + + +Stemming TokenFilter using a Java implementation of the +Hunspell stemming algorithm. +
+<p>
+Dictionaries can be found on +OpenOffice's wiki +
+</p>
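+<p>
+A minimal usage sketch (names and streams here are illustrative only):
+</p>
+<pre class="prettyprint">
+  InputStream affix = ...;   // e.g. en_GB.aff
+  InputStream dic = ...;     // e.g. en_GB.dic
+  Dictionary dictionary = new Dictionary(affix, dic); // close both streams yourself
+  TokenStream stream = new Hunspell2StemFilter(tokenizer, dictionary);
+</pre>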
+ + diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory index 04fc80cf59c..e4ca7c6802c 100644 --- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory +++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory @@ -51,6 +51,7 @@ org.apache.lucene.analysis.hi.HindiNormalizationFilterFactory org.apache.lucene.analysis.hi.HindiStemFilterFactory org.apache.lucene.analysis.hu.HungarianLightStemFilterFactory org.apache.lucene.analysis.hunspell.HunspellStemFilterFactory +org.apache.lucene.analysis.hunspell2.Hunspell2StemFilterFactory org.apache.lucene.analysis.id.IndonesianStemFilterFactory org.apache.lucene.analysis.in.IndicNormalizationFilterFactory org.apache.lucene.analysis.it.ItalianLightStemFilterFactory diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java new file mode 100644 index 00000000000..02ccedb9be7 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java @@ -0,0 +1,205 @@ +package org.apache.lucene.analysis.hunspell2; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; +import java.io.InputStream; +import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; + +import org.apache.lucene.analysis.hunspell.HunspellDictionary; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.RamUsageEstimator; + +/** + * Can be retrieved via: + * wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/ + * Note some of the files differ only in case. This may be a problem on your operating system! 
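+ * Each triple in {@code tests} below lists the zip file name, then the .dic and .aff entry names inside it.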
+ */ +//@Ignore("enable manually") +public class TestAllDictionaries extends LuceneTestCase { + + // set this to the location of where you downloaded all the files + static final File DICTIONARY_HOME = + new File("/Users/rmuir/hunspell/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries"); + + final String tests[] = { + /* zip file */ /* dictionary */ /* affix */ + "af_ZA.zip", "af_ZA.dic", "af_ZA.aff", + "ak_GH.zip", "ak_GH.dic", "ak_GH.aff", + "bg_BG.zip", "bg_BG.dic", "bg_BG.aff", + "ca_ANY.zip", "catalan.dic", "catalan.aff", + "ca_ES.zip", "ca_ES.dic", "ca_ES.aff", + "cop_EG.zip", "cop_EG.dic", "cop_EG.aff", + "cs_CZ.zip", "cs_CZ.dic", "cs_CZ.aff", + "cy_GB.zip", "cy_GB.dic", "cy_GB.aff", + "da_DK.zip", "da_DK.dic", "da_DK.aff", + "de_AT.zip", "de_AT.dic", "de_AT.aff", + "de_CH.zip", "de_CH.dic", "de_CH.aff", + "de_DE.zip", "de_DE.dic", "de_DE.aff", + "de_DE_comb.zip", "de_DE_comb.dic", "de_DE_comb.aff", + "de_DE_frami.zip", "de_DE_frami.dic", "de_DE_frami.aff", + "de_DE_neu.zip", "de_DE_neu.dic", "de_DE_neu.aff", + "el_GR.zip", "el_GR.dic", "el_GR.aff", + "en_AU.zip", "en_AU.dic", "en_AU.aff", + "en_CA.zip", "en_CA.dic", "en_CA.aff", + "en_GB-oed.zip", "en_GB-oed.dic", "en_GB-oed.aff", + "en_GB.zip", "en_GB.dic", "en_GB.aff", + "en_NZ.zip", "en_NZ.dic", "en_NZ.aff", + "eo.zip", "eo_l3.dic", "eo_l3.aff", + "eo_EO.zip", "eo_EO.dic", "eo_EO.aff", + "es_AR.zip", "es_AR.dic", "es_AR.aff", + "es_BO.zip", "es_BO.dic", "es_BO.aff", + "es_CL.zip", "es_CL.dic", "es_CL.aff", + "es_CO.zip", "es_CO.dic", "es_CO.aff", + "es_CR.zip", "es_CR.dic", "es_CR.aff", + "es_CU.zip", "es_CU.dic", "es_CU.aff", + "es_DO.zip", "es_DO.dic", "es_DO.aff", + "es_EC.zip", "es_EC.dic", "es_EC.aff", + "es_ES.zip", "es_ES.dic", "es_ES.aff", + "es_GT.zip", "es_GT.dic", "es_GT.aff", + "es_HN.zip", "es_HN.dic", "es_HN.aff", + "es_MX.zip", "es_MX.dic", "es_MX.aff", + "es_NEW.zip", "es_NEW.dic", "es_NEW.aff", + "es_NI.zip", "es_NI.dic", "es_NI.aff", + "es_PA.zip", "es_PA.dic", "es_PA.aff", + "es_PE.zip", "es_PE.dic", "es_PE.aff", + "es_PR.zip", "es_PR.dic", "es_PR.aff", + "es_PY.zip", "es_PY.dic", "es_PY.aff", + "es_SV.zip", "es_SV.dic", "es_SV.aff", + "es_UY.zip", "es_UY.dic", "es_UY.aff", + "es_VE.zip", "es_VE.dic", "es_VE.aff", + "et_EE.zip", "et_EE.dic", "et_EE.aff", + "fo_FO.zip", "fo_FO.dic", "fo_FO.aff", + "fr_FR-1990_1-3-2.zip", "fr_FR-1990.dic", "fr_FR-1990.aff", + "fr_FR-classique_1-3-2.zip", "fr_FR-classique.dic", "fr_FR-classique.aff", + "fr_FR_1-3-2.zip", "fr_FR.dic", "fr_FR.aff", + "fy_NL.zip", "fy_NL.dic", "fy_NL.aff", + "ga_IE.zip", "ga_IE.dic", "ga_IE.aff", + "gd_GB.zip", "gd_GB.dic", "gd_GB.aff", + "gl_ES.zip", "gl_ES.dic", "gl_ES.aff", + "gsc_FR.zip", "gsc_FR.dic", "gsc_FR.aff", + "gu_IN.zip", "gu_IN.dic", "gu_IN.aff", + "he_IL.zip", "he_IL.dic", "he_IL.aff", + "hi_IN.zip", "hi_IN.dic", "hi_IN.aff", + "hil_PH.zip", "hil_PH.dic", "hil_PH.aff", + "hr_HR.zip", "hr_HR.dic", "hr_HR.aff", + "hu_HU.zip", "hu_HU.dic", "hu_HU.aff", + "hu_HU_comb.zip", "hu_HU.dic", "hu_HU.aff", + "ia.zip", "ia.dic", "ia.aff", + "id_ID.zip", "id_ID.dic", "id_ID.aff", + "it_IT.zip", "it_IT.dic", "it_IT.aff", + "ku_TR.zip", "ku_TR.dic", "ku_TR.aff", + "la.zip", "la.dic", "la.aff", + "lt_LT.zip", "lt_LT.dic", "lt_LT.aff", + "lv_LV.zip", "lv_LV.dic", "lv_LV.aff", + "mg_MG.zip", "mg_MG.dic", "mg_MG.aff", + "mi_NZ.zip", "mi_NZ.dic", "mi_NZ.aff", + "mk_MK.zip", "mk_MK.dic", "mk_MK.aff", + "mos_BF.zip", "mos_BF.dic", "mos_BF.aff", + "mr_IN.zip", "mr_IN.dic", "mr_IN.aff", + "ms_MY.zip", "ms_MY.dic", "ms_MY.aff", + 
"nb_NO.zip", "nb_NO.dic", "nb_NO.aff", + "ne_NP.zip", "ne_NP.dic", "ne_NP.aff", + "nl_NL.zip", "nl_NL.dic", "nl_NL.aff", + "nl_med.zip", "nl_med.dic", "nl_med.aff", + "nn_NO.zip", "nn_NO.dic", "nn_NO.aff", + "nr_ZA.zip", "nr_ZA.dic", "nr_ZA.aff", + "ns_ZA.zip", "ns_ZA.dic", "ns_ZA.aff", + "ny_MW.zip", "ny_MW.dic", "ny_MW.aff", + "oc_FR.zip", "oc_FR.dic", "oc_FR.aff", + "pl_PL.zip", "pl_PL.dic", "pl_PL.aff", + "pt_BR.zip", "pt_BR.dic", "pt_BR.aff", + "pt_PT.zip", "pt_PT.dic", "pt_PT.aff", + "ro_RO.zip", "ro_RO.dic", "ro_RO.aff", + "ru_RU.zip", "ru_RU.dic", "ru_RU.aff", + "ru_RU_ye.zip", "ru_RU_ie.dic", "ru_RU_ie.aff", + "ru_RU_yo.zip", "ru_RU_yo.dic", "ru_RU_yo.aff", + "rw_RW.zip", "rw_RW.dic", "rw_RW.aff", + "sk_SK.zip", "sk_SK.dic", "sk_SK.aff", + "sl_SI.zip", "sl_SI.dic", "sl_SI.aff", + "sq_AL.zip", "sq_AL.dic", "sq_AL.aff", + "ss_ZA.zip", "ss_ZA.dic", "ss_ZA.aff", + "st_ZA.zip", "st_ZA.dic", "st_ZA.aff", + "sv_SE.zip", "sv_SE.dic", "sv_SE.aff", + "sw_KE.zip", "sw_KE.dic", "sw_KE.aff", + "tet_ID.zip", "tet_ID.dic", "tet_ID.aff", + "th_TH.zip", "th_TH.dic", "th_TH.aff", + "tl_PH.zip", "tl_PH.dic", "tl_PH.aff", + "tn_ZA.zip", "tn_ZA.dic", "tn_ZA.aff", + "ts_ZA.zip", "ts_ZA.dic", "ts_ZA.aff", + "uk_UA.zip", "uk_UA.dic", "uk_UA.aff", + "ve_ZA.zip", "ve_ZA.dic", "ve_ZA.aff", + "vi_VN.zip", "vi_VN.dic", "vi_VN.aff", + "xh_ZA.zip", "xh_ZA.dic", "xh_ZA.aff", + "zu_ZA.zip", "zu_ZA.dic", "zu_ZA.aff", + }; + + public void test() throws Exception { + for (int i = 0; i < tests.length; i += 3) { + File f = new File(DICTIONARY_HOME, tests[i]); + assert f.exists(); + + try (ZipFile zip = new ZipFile(f, IOUtils.CHARSET_UTF_8)) { + ZipEntry dicEntry = zip.getEntry(tests[i+1]); + assert dicEntry != null; + ZipEntry affEntry = zip.getEntry(tests[i+2]); + assert affEntry != null; + + // get ram from previous impl + String oldRAM = "FAIL"; + try (InputStream dictionary = zip.getInputStream(dicEntry); + InputStream affix = zip.getInputStream(affEntry)) { + try { + HunspellDictionary dic = new HunspellDictionary(affix, dictionary, TEST_VERSION_CURRENT); + oldRAM = RamUsageEstimator.humanSizeOf(dic); + } catch (Throwable t) {} + } + + try (InputStream dictionary = zip.getInputStream(dicEntry); + InputStream affix = zip.getInputStream(affEntry)) { + Dictionary dic = new Dictionary(affix, dictionary); + System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic)); + } + } + } + } + + public void testOneDictionary() throws Exception { + String toTest = "hu_HU.zip"; + for (int i = 0; i < tests.length; i++) { + if (tests[i].equals(toTest)) { + File f = new File(DICTIONARY_HOME, tests[i]); + assert f.exists(); + + try (ZipFile zip = new ZipFile(f, IOUtils.CHARSET_UTF_8)) { + ZipEntry dicEntry = zip.getEntry(tests[i+1]); + assert dicEntry != null; + ZipEntry affEntry = zip.getEntry(tests[i+2]); + assert affEntry != null; + + try (InputStream dictionary = zip.getInputStream(dicEntry); + InputStream affix = zip.getInputStream(affEntry)) { + Dictionary dic = new Dictionary(affix, dictionary); + } + } + } + } + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java new file mode 100644 index 00000000000..14c6e8967d0 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java @@ -0,0 +1,109 @@ +package org.apache.lucene.analysis.hunspell2; + +/* + * Licensed to the Apache Software Foundation (ASF) under one 
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java
new file mode 100644
index 00000000000..14c6e8967d0
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java
@@ -0,0 +1,109 @@
+package org.apache.lucene.analysis.hunspell2;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.FilterInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.ParseException;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestDictionary extends LuceneTestCase {
+
+  public void testSimpleDictionary() throws Exception {
+    InputStream affixStream = getClass().getResourceAsStream("simple.aff");
+    InputStream dictStream = getClass().getResourceAsStream("simple.dic");
+
+    Dictionary dictionary = new Dictionary(affixStream, dictStream);
+    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
+    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
+    char flags[] = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef());
+    assertNotNull(flags);
+    assertEquals(1, flags.length);
+    assertEquals("Wrong number of flags for lucen", 1, dictionary.lookupWord(new char[]{'l', 'u', 'c', 'e', 'n'}, 0, 5, new BytesRef()).length);
+
+    affixStream.close();
+    dictStream.close();
+  }
+
+  public void testCompressedDictionary() throws Exception {
+    InputStream affixStream = getClass().getResourceAsStream("compressed.aff");
+    InputStream dictStream = getClass().getResourceAsStream("compressed.dic");
+
+    Dictionary dictionary = new Dictionary(affixStream, dictStream);
+    assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
+    assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
+    assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef()).length);
+
+    affixStream.close();
+    dictStream.close();
+  }
+
+  // malformed rule causes ParseException
+  public void testInvalidData() throws Exception {
+    InputStream affixStream = getClass().getResourceAsStream("broken.aff");
+    InputStream dictStream = getClass().getResourceAsStream("simple.dic");
+
+    try {
+      new Dictionary(affixStream, dictStream);
+      fail("didn't get expected exception");
+    } catch (ParseException expected) {
+      assertEquals("The affix file contains a rule with less than five elements", expected.getMessage());
+      assertEquals(23, expected.getErrorOffset());
+    }
+
+    affixStream.close();
+    dictStream.close();
+  }
+
+  private class CloseCheckInputStream extends FilterInputStream {
+    private boolean closed = false;
+
+    public CloseCheckInputStream(InputStream delegate) {
+      super(delegate);
+    }
+
+    @Override
+    public void close() throws IOException {
+      this.closed = true;
+      super.close();
+    }
+
+    public boolean isClosed() {
+      return this.closed;
+    }
+  }
+
+  public void testResourceCleanup() throws Exception {
+    CloseCheckInputStream affixStream = new CloseCheckInputStream(getClass().getResourceAsStream("compressed.aff"));
+    CloseCheckInputStream dictStream = new CloseCheckInputStream(getClass().getResourceAsStream("compressed.dic"));
+
+    new Dictionary(affixStream, dictStream);
+
+    assertFalse(affixStream.isClosed());
+    assertFalse(dictStream.isClosed());
+
+    affixStream.close();
+    dictStream.close();
+
+    assertTrue(affixStream.isClosed());
+    assertTrue(dictStream.isClosed());
+  }
+}
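The lookup API exercised above is the low-level entry point into the new Dictionary. A minimal usage sketch, with a hypothetical demo class placed in the same package (Dictionary and its lookup methods are exercised package-locally by these tests) and assuming the simple.aff/simple.dic fixtures are on the classpath:

package org.apache.lucene.analysis.hunspell2;

import java.io.InputStream;

import org.apache.lucene.util.BytesRef;

public class DictionaryLookupDemo {
  public static void main(String[] args) throws Exception {
    try (InputStream affix = DictionaryLookupDemo.class.getResourceAsStream("simple.aff");
         InputStream words = DictionaryLookupDemo.class.getResourceAsStream("simple.dic")) {
      Dictionary dictionary = new Dictionary(affix, words);
      // look up the flags attached to the stem "olr" (flagged B in simple.dic)
      char flags[] = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef());
      System.out.println(flags == null ? "not in dictionary" : flags.length + " flag(s)");
    }
  }
}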
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java
new file mode 100644
index 00000000000..eafb1f272cf
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java
@@ -0,0 +1,87 @@
+package org.apache.lucene.analysis.hunspell2;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.util.TestUtil;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+
+public class TestHunspell2StemFilter extends BaseTokenStreamTestCase {
+  private static Dictionary dictionary;
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    try (InputStream affixStream = TestHunspell2StemFilter.class.getResourceAsStream("simple.aff");
+         InputStream dictStream = TestHunspell2StemFilter.class.getResourceAsStream("simple.dic")) {
+      dictionary = new Dictionary(affixStream, dictStream);
+    }
+  }
+
+  @AfterClass
+  public static void afterClass() {
+    dictionary = null;
+  }
+
+  /** Simple test for KeywordAttribute */
+  public void testKeywordAttribute() throws IOException {
+    MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome");
+    tokenizer.setEnableChecks(true);
+    Hunspell2StemFilter filter = new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3));
+    assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1});
+
+    // assert with keyword marker: terms in the set must pass through unstemmed
+    tokenizer = whitespaceMockTokenizer("lucene is awesome");
+    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true);
+    filter = new Hunspell2StemFilter(new SetKeywordMarkerFilter(tokenizer, set), dictionary, TestUtil.nextInt(random(), 1, 3));
+    assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
+  }
+
+  /** blast some random strings through the analyzer */
+  public void testRandomStrings() throws Exception {
+    Analyzer analyzer = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
+      }
+    };
+    checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
+  }
+
+  public void testEmptyTerm() throws IOException {
+    Analyzer a = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new KeywordTokenizer();
+        return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
+      }
+    };
+    checkOneTerm(a, "", "");
+  }
+}
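The token-stream behavior asserted in testKeywordAttribute (original term first, stems injected at the same position) can also be observed by consuming the filter directly. An illustrative fragment, not part of the patch: it reuses the test-only MockTokenizer for brevity (a real analyzer would use a production Tokenizer) and assumes a "dictionary" loaded from simple.aff/simple.dic as in beforeClass() above. Needed imports: org.apache.lucene.analysis.{Tokenizer, TokenStream, MockTokenizer}, org.apache.lucene.analysis.tokenattributes.CharTermAttribute, java.io.StringReader.

Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenizer.setReader(new StringReader("lucene is awesome"));
TokenStream stream = new Hunspell2StemFilter(tokenizer, dictionary, 1);
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
stream.reset();
while (stream.incrementToken()) {
  System.out.println(term); // prints original terms plus stems, e.g. "lucene" then "lucen"
}
stream.end();
stream.close();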
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java
new file mode 100644
index 00000000000..d95e2be04b6
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java
@@ -0,0 +1,50 @@
+package org.apache.lucene.analysis.hunspell2;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
+
+/**
+ * Simple tests to ensure the Hunspell2 stemmer loads from the factory
+ */
+public class TestHunspell2StemFilterFactory extends BaseTokenStreamFactoryTestCase {
+  public void testStemming() throws Exception {
+    Reader reader = new StringReader("abc");
+    TokenStream stream = whitespaceMockTokenizer(reader);
+    stream = tokenFilterFactory("Hunspell2Stem",
+        "dictionary", "simple.dic",
+        "affix", "simple.aff").create(stream);
+    assertTokenStreamContents(stream, new String[] { "ab" });
+  }
+
+  /** Test that bogus arguments result in exception */
+  public void testBogusArguments() throws Exception {
+    try {
+      tokenFilterFactory("Hunspell2Stem",
+          "dictionary", "simple.dic",
+          "bogusArg", "bogusValue");
+      fail();
+    } catch (IllegalArgumentException expected) {
+      assertTrue(expected.getMessage().contains("Unknown parameters"));
+    }
+  }
+}
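Outside the test harness, the factory would be wired up programmatically. The following fragment is a hypothetical sketch, not part of this patch: it assumes the standard Lucene 4.x factory contract (a Map&lt;String,String&gt; constructor plus ResourceLoaderAware.inform() for resolving the dictionary and affix resources), and reuses the fixtures from this patch. Needed imports: java.util.{Map, HashMap}, org.apache.lucene.analysis.TokenStream, org.apache.lucene.analysis.core.KeywordTokenizer, org.apache.lucene.analysis.util.ClasspathResourceLoader.

Map<String,String> args = new HashMap<>();
args.put("dictionary", "simple.dic");
args.put("affix", "simple.aff");
Hunspell2StemFilterFactory factory = new Hunspell2StemFilterFactory(args);
// resolve simple.dic/simple.aff relative to this package on the classpath
factory.inform(new ClasspathResourceLoader(TestHunspell2StemFilterFactory.class));
TokenStream stream = factory.create(new KeywordTokenizer());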
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java
new file mode 100644
index 00000000000..ea98f65256f
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java
@@ -0,0 +1,105 @@
+package org.apache.lucene.analysis.hunspell2;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.hunspell2.Stemmer.Stem;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+
+public class TestStemmer extends LuceneTestCase {
+  private static Stemmer stemmer;
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    try (InputStream affixStream = TestStemmer.class.getResourceAsStream("simple.aff");
+         InputStream dictStream = TestStemmer.class.getResourceAsStream("simple.dic")) {
+      Dictionary dictionary = new Dictionary(affixStream, dictStream);
+      stemmer = new Stemmer(dictionary);
+    }
+  }
+
+  @AfterClass
+  public static void afterClass() {
+    stemmer = null;
+  }
+
+  public void testSimpleSuffix() {
+    assertStemsTo("lucene", "lucene", "lucen");
+    assertStemsTo("mahoute", "mahout");
+  }
+
+  public void testSimplePrefix() {
+    assertStemsTo("solr", "olr");
+  }
+
+  public void testRecursiveSuffix() {
+    assertStemsTo("abcd", "ab");
+  }
+
+  // all forms unmunched from the dictionary
+  public void testAllStems() {
+    assertStemsTo("ab", "ab");
+    assertStemsTo("abc", "ab");
+    assertStemsTo("apach", "apach");
+    assertStemsTo("apache", "apach");
+    assertStemsTo("foo", "foo");
+    assertStemsTo("food", "foo");
+    assertStemsTo("foos", "foo");
+    assertStemsTo("lucen", "lucen");
+    assertStemsTo("lucene", "lucen", "lucene");
+    assertStemsTo("mahout", "mahout");
+    assertStemsTo("mahoute", "mahout");
+    assertStemsTo("moo", "moo");
+    assertStemsTo("mood", "moo");
+    assertStemsTo("olr", "olr");
+    assertStemsTo("solr", "olr");
+  }
+
+  // some bogus stuff that should not stem (empty lists)!
+  public void testBogusStems() {
+    assertStemsTo("abs");
+    assertStemsTo("abe");
+    assertStemsTo("sab");
+    assertStemsTo("sapach");
+    assertStemsTo("sapache");
+    assertStemsTo("apachee");
+    assertStemsTo("sfoo");
+    assertStemsTo("sfoos");
+    assertStemsTo("fooss");
+    assertStemsTo("lucenee");
+    assertStemsTo("solre");
+  }
+
+  private void assertStemsTo(String s, String... expected) {
+    Arrays.sort(expected);
+
+    List<Stem> stems = stemmer.stem(s);
+    String actual[] = new String[stems.size()];
+    for (int i = 0; i < actual.length; i++) {
+      actual[i] = stems.get(i).getStemString();
+    }
+    Arrays.sort(actual);
+
+    assertArrayEquals(expected, actual);
+  }
+}
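The Stemmer can also be driven directly, mirroring assertStemsTo above. An illustrative fragment (not part of the patch) that assumes a "dictionary" built from simple.aff/simple.dic as in beforeClass():

Stemmer stemmer = new Stemmer(dictionary);
for (Stemmer.Stem stem : stemmer.stem("foos")) {
  // prints "foo": rule "SFX D 0 s o" strips nothing and appends "s" after a final 'o'
  System.out.println(stem.getStemString());
}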
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/broken.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/broken.aff
new file mode 100644
index 00000000000..3b780cd1d7b
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/broken.aff
@@ -0,0 +1,24 @@
+SET UTF-8
+TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
+
+SFX A Y 3
+SFX A 0 e n
+SFX A 0 e t
+SFX A 0 e h
+
+SFX C Y 2
+SFX C 0 d/C c
+SFX C 0 c b
+
+SFX D Y 1
+SFX D 0 s o
+
+SFX E Y 1
+SFX E 0 d o
+
+PFX B Y 1
+PFX B 0 s o
+
+#wrong rule (only 4 elements)
+PFX A0 Y 1
+PFX A0 0 a
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.aff
new file mode 100644
index 00000000000..e4a1b37300f
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.aff
@@ -0,0 +1,29 @@
+SET UTF-8
+TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
+
+FLAG long
+
+AF 5
+AF AA
+AF BB
+AF CC
+AF DD
+AF EE
+
+SFX AA Y 3
+SFX AA 0 e n
+SFX AA 0 e t
+SFX AA 0 e h
+
+SFX CC Y 2
+SFX CC 0 d/3 c
+SFX CC 0 c b
+
+SFX DD Y 1
+SFX DD 0 s o
+
+SFX EE Y 1
+SFX EE 0 d o
+
+PFX BB Y 1
+PFX BB 0 s o
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.dic
new file mode 100644
index 00000000000..dd3890fae31
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.dic
@@ -0,0 +1,9 @@
+6
+ab/3
+apach/1
+foo/4
+foo/5
+lucen/1
+lucene
+mahout/1
+olr/2
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.aff
new file mode 100644
index 00000000000..db9423dcad1
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.aff
@@ -0,0 +1,20 @@
+SET UTF-8
+TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
+
+SFX A Y 3
+SFX A 0 e n
+SFX A 0 e t
+SFX A 0 e h
+
+SFX C Y 2
+SFX C 0 d/C c
+SFX C 0 c b
+
+SFX D Y 1
+SFX D 0 s o
+
+SFX E Y 1
+SFX E 0 d o
+
+PFX B Y 1
+PFX B 0 s o
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.dic
new file mode 100644
index 00000000000..f7bbab3ba67
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.dic
@@ -0,0 +1,10 @@
+9
+ab/C
+apach/A
+foo/D
+foo/E
+lucen/A
+lucene
+mahout/A
+moo/E
+olr/B
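A note on the fixture formats above, since the .dic/.aff conventions are easy to misread: the first line of a .dic file is the declared entry count, and each following line is a stem optionally followed by /FLAGS; the .aff file defines what those flags mean. An annotated excerpt (the # comments are added here for illustration; hunspell treats lines starting with # as comments):

# simple.dic: "olr/B" attaches flag B to the stem "olr"
# simple.aff: "PFX B Y 1" declares prefix class B (cross-product Y) with one rule
# "PFX B 0 s o" means: strip nothing (0), prepend "s", condition: word starts with "o"
# so "olr" generates "solr", and the stemmer maps "solr" back to "olr",
# exactly what TestStemmer.testSimplePrefix() asserts.
# compressed.aff additionally sets "FLAG long" (two-character flags) and uses
# AF lines to alias flag sets, so compressed.dic can write "ab/3" to mean the
# third alias (CC) instead of spelling the flags out, as simple.dic's "ab/C" does.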