From c4f4beb27e6cb636b0b151b4288f2230e350adc4 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 27 Feb 2014 20:19:27 +0000 Subject: [PATCH] LUCENE-5468: hunspell2 -> hunspell (with previous options and tests) git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572718 13f79535-47bb-0310-9956-ffa450edef68 --- .../{hunspell2 => hunspell}/Dictionary.java | 85 ++- .../analysis/hunspell/HunspellAffix.java | 157 ------ .../analysis/hunspell/HunspellDictionary.java | 507 ------------------ .../analysis/hunspell/HunspellStemFilter.java | 89 ++- .../hunspell/HunspellStemFilterFactory.java | 62 +-- .../analysis/hunspell/HunspellStemmer.java | 392 -------------- .../analysis/hunspell/HunspellWord.java | 63 --- .../ISO8859_14Decoder.java | 2 +- .../{hunspell2 => hunspell}/Stemmer.java | 28 +- .../hunspell2/Hunspell2StemFilter.java | 137 ----- .../hunspell2/Hunspell2StemFilterFactory.java | 80 --- .../lucene/analysis/hunspell2/package.html | 26 - ...he.lucene.analysis.util.TokenFilterFactory | 1 - .../analysis/core/TestRandomChains.java | 12 +- .../hunspell/HunspellDictionaryTest.java | 201 ------- .../hunspell/HunspellStemFilterTest.java | 92 ---- .../hunspell/HunspellStemmerTest.java | 137 ----- .../TestAllDictionaries.java | 20 +- .../hunspell/TestCaseInsensitive.java | 110 ++++ .../TestDictionary.java | 3 +- .../TestHunspellStemFilter.java} | 22 +- .../TestHunspellStemFilterFactory.java | 11 +- .../{hunspell2 => hunspell}/TestStemmer.java | 4 +- .../{hunspell2 => hunspell}/broken.aff | 0 .../{hunspell2 => hunspell}/compressed.aff | 0 .../{hunspell2 => hunspell}/compressed.dic | 0 .../lucene/analysis/hunspell/mixedcase.dic | 10 + .../{hunspell2 => hunspell}/simple.aff | 0 .../{hunspell2 => hunspell}/simple.dic | 0 .../apache/lucene/analysis/hunspell/test.aff | 20 - .../apache/lucene/analysis/hunspell/test.dic | 10 - .../analysis/hunspell/testCompressed.aff | 29 - .../analysis/hunspell/testCompressed.dic | 9 - 
.../lucene/analysis/hunspell/testOverride.dic | 3 - .../analysis/hunspell/testWrongAffixRule.aff | 24 - .../TestHunspell2StemFilterFactory.java | 50 -- 36 files changed, 320 insertions(+), 2076 deletions(-) rename lucene/analysis/common/src/java/org/apache/lucene/analysis/{hunspell2 => hunspell}/Dictionary.java (90%) delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java rename lucene/analysis/common/src/java/org/apache/lucene/analysis/{hunspell2 => hunspell}/ISO8859_14Decoder.java (98%) rename lucene/analysis/common/src/java/org/apache/lucene/analysis/{hunspell2 => hunspell}/Stemmer.java (92%) delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/TestAllDictionaries.java (93%) create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/TestDictionary.java 
(97%) rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2/TestHunspell2StemFilter.java => hunspell/TestHunspellStemFilter.java} (75%) rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/TestStemmer.java (95%) rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/broken.aff (100%) rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/compressed.aff (100%) rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/compressed.dic (100%) create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/mixedcase.dic rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/simple.aff (100%) rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/simple.dic (100%) delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.aff delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.dic delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testOverride.dic delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java similarity index 90% rename from lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java rename to 
lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index b9f9c82c2f5..7bbf27fb817 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis.hunspell2; +package org.apache.lucene.analysis.hunspell; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,7 +17,6 @@ package org.apache.lucene.analysis.hunspell2; * limitations under the License. */ -import org.apache.lucene.analysis.util.CharArrayMap; import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -28,14 +27,19 @@ import org.apache.lucene.util.OfflineSorter; import org.apache.lucene.util.OfflineSorter.ByteSequencesReader; import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter; import org.apache.lucene.util.UnicodeUtil; -import org.apache.lucene.util.Version; import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.IntSequenceOutputs; import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.Util; -import java.io.*; +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.LineNumberReader; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CodingErrorAction; @@ -71,27 +75,27 @@ public class Dictionary { private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*"; private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s"; - public FST prefixes; - public FST suffixes; + FST prefixes; + FST suffixes; // all Patterns used by prefixes and suffixes. 
these are typically re-used across // many affix stripping rules. so these are deduplicated, to save RAM. // TODO: maybe don't use Pattern for the condition check... // TODO: when we cut over Affix to FST, just store integer index to this. - public ArrayList patterns = new ArrayList<>(); + ArrayList patterns = new ArrayList<>(); // the entries in the .dic file, mapping to their set of flags. // the fst output is the ordinal for flagLookup - public FST words; + FST words; // the list of unique flagsets (wordforms). theoretically huge, but practically // small (e.g. for polish this is 756), otherwise humans wouldn't be able to deal with it either. - public BytesRefHash flagLookup = new BytesRefHash(); + BytesRefHash flagLookup = new BytesRefHash(); // the list of unique strip affixes. - public BytesRefHash stripLookup = new BytesRefHash(); + BytesRefHash stripLookup = new BytesRefHash(); // 8 bytes per affix - public byte[] affixData = new byte[64]; + byte[] affixData = new byte[64]; private int currentAffix = 0; private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy @@ -100,7 +104,11 @@ public class Dictionary { private int aliasCount = 0; private final File tempDir = OfflineSorter.defaultTempDir(); // TODO: make this configurable? - + + public static final int IGNORE_CASE = 1; + + boolean ignoreCase; + /** * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix * and dictionary files. @@ -112,6 +120,21 @@ public class Dictionary { * @throws ParseException Can be thrown if the content of the files does not meet expected formats */ public Dictionary(InputStream affix, InputStream dictionary) throws IOException, ParseException { + this(affix, Collections.singletonList(dictionary), false); + } + + /** + * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix + * and dictionary files. 
+ * You have to close the provided InputStreams yourself. + * + * @param affix InputStream for reading the hunspell affix file (won't be closed). + * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed). + * @throws IOException Can be thrown while reading from the InputStreams + * @throws ParseException Can be thrown if the content of the files does not meet expected formats + */ + public Dictionary(InputStream affix, List dictionaries, boolean ignoreCase) throws IOException, ParseException { + this.ignoreCase = ignoreCase; BufferedInputStream buffered = new BufferedInputStream(affix, 8192); buffered.mark(8192); String encoding = getDictionaryEncoding(affix); @@ -122,7 +145,7 @@ public class Dictionary { stripLookup.add(new BytesRef()); // no strip -> ord 0 PositiveIntOutputs o = PositiveIntOutputs.getSingleton(); Builder b = new Builder(FST.INPUT_TYPE.BYTE4, o); - readDictionaryFile(dictionary, decoder, b); + readDictionaryFiles(dictionaries, decoder, b); words = b.finish(); } @@ -145,7 +168,7 @@ public class Dictionary { return decodeFlags(flagLookup.get(ord, scratch)); } - public Integer lookupOrd(char word[], int offset, int length) throws IOException { + Integer lookupOrd(char word[], int offset, int length) throws IOException { final FST.BytesReader bytesReader = words.getBytesReader(); final FST.Arc arc = words.getFirstArc(new FST.Arc()); // Accumulate output as we go @@ -269,7 +292,6 @@ public class Dictionary { Util.toUTF32(entry.getKey(), scratch); List entries = entry.getValue(); IntsRef output = new IntsRef(entries.size()); - int upto = 0; for (Character c : entries) { output.ints[output.length++] = c; } @@ -480,23 +502,39 @@ public class Dictionary { } /** - * Reads the dictionary file through the provided InputStream, building up the words map + * Reads the dictionary file through the provided InputStreams, building up the words map * - * @param dictionary InputStream to read the dictionary file through + * 
@param dictionaries InputStreams to read the dictionary file through * @param decoder CharsetDecoder used to decode the contents of the file * @throws IOException Can be thrown while reading from the file */ - private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder, Builder words) throws IOException { + private void readDictionaryFiles(List dictionaries, CharsetDecoder decoder, Builder words) throws IOException { BytesRef flagsScratch = new BytesRef(); IntsRef scratchInts = new IntsRef(); - BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder)); - String line = lines.readLine(); // first line is number of entries (approximately, sometimes) - File unsorted = File.createTempFile("unsorted", "dat", tempDir); try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) { - while ((line = lines.readLine()) != null) { - writer.write(line.getBytes(IOUtils.CHARSET_UTF_8)); + for (InputStream dictionary : dictionaries) { + BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder)); + String line = lines.readLine(); // first line is number of entries (approximately, sometimes) + + while ((line = lines.readLine()) != null) { + if (ignoreCase) { + int flagSep = line.lastIndexOf('/'); + if (flagSep == -1) { + writer.write(line.toLowerCase(Locale.ROOT).getBytes(IOUtils.CHARSET_UTF_8)); + } else { + StringBuilder sb = new StringBuilder(); + sb.append(line.substring(0, flagSep).toLowerCase(Locale.ROOT)); + if (flagSep < line.length()) { + sb.append(line.substring(flagSep, line.length())); + } + writer.write(sb.toString().getBytes(IOUtils.CHARSET_UTF_8)); + } + } else { + writer.write(line.getBytes(IOUtils.CHARSET_UTF_8)); + } + } } } File sorted = File.createTempFile("sorted", "dat", tempDir); @@ -544,6 +582,7 @@ public class Dictionary { BytesRef currentEntry = new BytesRef(); char currentFlags[] = new char[0]; + String line; while (reader.read(scratchLine)) { line = 
scratchLine.utf8ToString(); String entry; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java deleted file mode 100644 index 97376c0b15e..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java +++ /dev/null @@ -1,157 +0,0 @@ -package org.apache.lucene.analysis.hunspell; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.util.regex.Pattern; - -/** - * Wrapper class representing a hunspell affix - */ -public class HunspellAffix { - - private String append; // the affix itself, what is appended - private char appendFlags[]; // continuation class flags - private String strip; - - private String condition; - private Pattern conditionPattern; - - private char flag; - - private boolean crossProduct; - - /** - * Checks whether the given text matches the conditional pattern on this affix - * - * @param text Text to check if it matches the affix's conditional pattern - * @return {@code true} if the text meets the condition, {@code false} otherwise - */ - public boolean checkCondition(CharSequence text) { - return conditionPattern.matcher(text).matches(); - } - - /** - * Returns the append defined for the affix - * - * @return Defined append - */ - public String getAppend() { - return append; - } - - /** - * Sets the append defined for the affix - * - * @param append Defined append for the affix - */ - public void setAppend(String append) { - this.append = append; - } - - /** - * Returns the flags defined for the affix append - * - * @return Flags defined for the affix append - */ - public char[] getAppendFlags() { - return appendFlags; - } - - /** - * Sets the flags defined for the affix append - * - * @param appendFlags Flags defined for the affix append - */ - public void setAppendFlags(char[] appendFlags) { - this.appendFlags = appendFlags; - } - - /** - * Returns the stripping characters defined for the affix - * - * @return Stripping characters defined for the affix - */ - public String getStrip() { - return strip; - } - - /** - * Sets the stripping characters defined for the affix - * - * @param strip Stripping characters defined for the affix - */ - public void setStrip(String strip) { - this.strip = strip; - } - - /** - * Returns the condition that must be met before the affix can be applied - * - * @return Condition that must be met before the affix can be 
applied - */ - public String getCondition() { - return condition; - } - - /** - * Sets the condition that must be met before the affix can be applied - * - * @param condition Condition to be met before affix application - * @param pattern Condition as a regular expression pattern - */ - public void setCondition(String condition, String pattern) { - this.condition = condition; - this.conditionPattern = Pattern.compile(pattern); - } - - /** - * Returns the affix flag - * - * @return Affix flag - */ - public char getFlag() { - return flag; - } - - /** - * Sets the affix flag - * - * @param flag Affix flag - */ - public void setFlag(char flag) { - this.flag = flag; - } - - /** - * Returns whether the affix is defined as cross product - * - * @return {@code true} if the affix is cross product, {@code false} otherwise - */ - public boolean isCrossProduct() { - return crossProduct; - } - - /** - * Sets whether the affix is defined as cross product - * - * @param crossProduct Whether the affix is defined as cross product - */ - public void setCrossProduct(boolean crossProduct) { - this.crossProduct = crossProduct; - } -} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java deleted file mode 100644 index ccb53f57d29..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java +++ /dev/null @@ -1,507 +0,0 @@ -package org.apache.lucene.analysis.hunspell; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.util.CharArrayMap; -import org.apache.lucene.util.Version; - -import java.io.*; -import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; -import java.text.ParseException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Locale; - -/** - * In-memory structure for the dictionary (.dic) and affix (.aff) - * data of a hunspell dictionary. - */ -public class HunspellDictionary { - - static final HunspellWord NOFLAGS = new HunspellWord(); - - private static final String ALIAS_KEY = "AF"; - private static final String PREFIX_KEY = "PFX"; - private static final String SUFFIX_KEY = "SFX"; - private static final String FLAG_KEY = "FLAG"; - - private static final String NUM_FLAG_TYPE = "num"; - private static final String UTF8_FLAG_TYPE = "UTF-8"; - private static final String LONG_FLAG_TYPE = "long"; - - private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*"; - private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s"; - - private static final boolean IGNORE_CASE_DEFAULT = false; - private static final boolean STRICT_AFFIX_PARSING_DEFAULT = true; - - private CharArrayMap> words; - private CharArrayMap> prefixes; - private CharArrayMap> suffixes; - - private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy - private boolean ignoreCase = IGNORE_CASE_DEFAULT; - - private final Version version; - - private String[] aliases; - private int aliasCount = 0; - - /** - * Creates a new 
HunspellDictionary containing the information read from the provided InputStreams to hunspell affix - * and dictionary files. - * You have to close the provided InputStreams yourself. - * - * @param affix InputStream for reading the hunspell affix file (won't be closed). - * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed). - * @param version Lucene Version - * @throws IOException Can be thrown while reading from the InputStreams - * @throws ParseException Can be thrown if the content of the files does not meet expected formats - */ - public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException { - this(affix, Arrays.asList(dictionary), version, IGNORE_CASE_DEFAULT); - } - - /** - * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix - * and dictionary files. - * You have to close the provided InputStreams yourself. - * - * @param affix InputStream for reading the hunspell affix file (won't be closed). - * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed). - * @param version Lucene Version - * @param ignoreCase If true, dictionary matching will be case insensitive - * @throws IOException Can be thrown while reading from the InputStreams - * @throws ParseException Can be thrown if the content of the files does not meet expected formats - */ - public HunspellDictionary(InputStream affix, InputStream dictionary, Version version, boolean ignoreCase) throws IOException, ParseException { - this(affix, Arrays.asList(dictionary), version, ignoreCase); - } - - /** - * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix - * and dictionary files. - * You have to close the provided InputStreams yourself. - * - * @param affix InputStream for reading the hunspell affix file (won't be closed). 
- * @param dictionaries InputStreams for reading the hunspell dictionary file (won't be closed). - * @param version Lucene Version - * @param ignoreCase If true, dictionary matching will be case insensitive - * @throws IOException Can be thrown while reading from the InputStreams - * @throws ParseException Can be thrown if the content of the files does not meet expected formats - */ - public HunspellDictionary(InputStream affix, List dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException { - this(affix, dictionaries, version, ignoreCase, STRICT_AFFIX_PARSING_DEFAULT); - } - - /** - * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix - * and dictionary files. - * You have to close the provided InputStreams yourself. - * - * @param affix InputStream for reading the hunspell affix file (won't be closed). - * @param dictionaries InputStreams for reading the hunspell dictionary file (won't be closed). 
- * @param version Lucene Version - * @param ignoreCase If true, dictionary matching will be case insensitive - * @param strictAffixParsing Affix strict parsing enabled or not (an error while reading a rule causes exception or is ignored) - * @throws IOException Can be thrown while reading from the InputStreams - * @throws ParseException Can be thrown if the content of the files does not meet expected formats - */ - public HunspellDictionary(InputStream affix, List dictionaries, Version version, boolean ignoreCase, boolean strictAffixParsing) throws IOException, ParseException { - this.version = version; - this.ignoreCase = ignoreCase; - String encoding = getDictionaryEncoding(affix); - CharsetDecoder decoder = getJavaEncoding(encoding); - readAffixFile(affix, decoder, strictAffixParsing); - words = new CharArrayMap>(version, 65535 /* guess */, this.ignoreCase); - for (InputStream dictionary : dictionaries) { - readDictionaryFile(dictionary, decoder); - } - } - - /** - * Looks up HunspellWords that match the String created from the given char array, offset and length - * - * @param word Char array to generate the String from - * @param offset Offset in the char array that the String starts at - * @param length Length from the offset that the String is - * @return List of HunspellWords that match the generated String, or {@code null} if none are found - */ - public List lookupWord(char word[], int offset, int length) { - return words.get(word, offset, length); - } - - /** - * Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length - * - * @param word Char array to generate the String from - * @param offset Offset in the char array that the String starts at - * @param length Length from the offset that the String is - * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found - */ - public List lookupPrefix(char word[], int offset, int 
length) { - return prefixes.get(word, offset, length); - } - - /** - * Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length - * - * @param word Char array to generate the String from - * @param offset Offset in the char array that the String starts at - * @param length Length from the offset that the String is - * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found - */ - public List lookupSuffix(char word[], int offset, int length) { - return suffixes.get(word, offset, length); - } - - /** - * Reads the affix file through the provided InputStream, building up the prefix and suffix maps - * - * @param affixStream InputStream to read the content of the affix file from - * @param decoder CharsetDecoder to decode the content of the file - * @throws IOException Can be thrown while reading from the InputStream - */ - private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, boolean strict) throws IOException, ParseException { - prefixes = new CharArrayMap>(version, 8, ignoreCase); - suffixes = new CharArrayMap>(version, 8, ignoreCase); - - LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder)); - String line = null; - while ((line = reader.readLine()) != null) { - if (line.startsWith(ALIAS_KEY)) { - parseAlias(line); - } else if (line.startsWith(PREFIX_KEY)) { - parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, strict); - } else if (line.startsWith(SUFFIX_KEY)) { - parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, strict); - } else if (line.startsWith(FLAG_KEY)) { - // Assume that the FLAG line comes before any prefix or suffixes - // Store the strategy so it can be used when parsing the dic file - flagParsingStrategy = getFlagParsingStrategy(line); - } - } - } - - /** - * Parses a specific affix rule putting the result into the provided affix map 
- * - * @param affixes Map where the result of the parsing will be put - * @param header Header line of the affix rule - * @param reader BufferedReader to read the content of the rule from - * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex - * pattern - * @throws IOException Can be thrown while reading the rule - */ - private void parseAffix(CharArrayMap> affixes, - String header, - LineNumberReader reader, - String conditionPattern, - boolean strict) throws IOException, ParseException { - String args[] = header.split("\\s+"); - - boolean crossProduct = args[2].equals("Y"); - - int numLines = Integer.parseInt(args[3]); - for (int i = 0; i < numLines; i++) { - String line = reader.readLine(); - String ruleArgs[] = line.split("\\s+"); - - if (ruleArgs.length < 5) { - if (strict) { - throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber()); - } - continue; - } - - HunspellAffix affix = new HunspellAffix(); - - affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1])); - affix.setStrip(ruleArgs[2].equals("0") ? 
"" : ruleArgs[2]); - - String affixArg = ruleArgs[3]; - - int flagSep = affixArg.lastIndexOf('/'); - if (flagSep != -1) { - String flagPart = affixArg.substring(flagSep + 1); - - if (aliasCount > 0) { - flagPart = getAliasValue(Integer.parseInt(flagPart)); - } - - char appendFlags[] = flagParsingStrategy.parseFlags(flagPart); - Arrays.sort(appendFlags); - affix.setAppendFlags(appendFlags); - affix.setAppend(affixArg.substring(0, flagSep)); - } else { - affix.setAppend(affixArg); - } - - String condition = ruleArgs[4]; - affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition)); - affix.setCrossProduct(crossProduct); - - List list = affixes.get(affix.getAppend()); - if (list == null) { - list = new ArrayList(); - affixes.put(affix.getAppend(), list); - } - - list.add(affix); - } - } - - /** - * Parses the encoding specified in the affix file readable through the provided InputStream - * - * @param affix InputStream for reading the affix file - * @return Encoding specified in the affix file - * @throws IOException Can be thrown while reading from the InputStream - * @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET } - */ - private String getDictionaryEncoding(InputStream affix) throws IOException, ParseException { - final StringBuilder encoding = new StringBuilder(); - for (;;) { - encoding.setLength(0); - int ch; - while ((ch = affix.read()) >= 0) { - if (ch == '\n') { - break; - } - if (ch != '\r') { - encoding.append((char)ch); - } - } - if ( - encoding.length() == 0 || encoding.charAt(0) == '#' || - // this test only at the end as ineffective but would allow lines only containing spaces: - encoding.toString().trim().length() == 0 - ) { - if (ch < 0) { - throw new ParseException("Unexpected end of affix file.", 0); - } - continue; - } - if ("SET ".equals(encoding.substring(0, 4))) { - // cleanup the encoding string, too (whitespace) - return 
encoding.substring(4).trim(); - } - throw new ParseException("The first non-comment line in the affix file must "+ - "be a 'SET charset', was: '" + encoding +"'", 0); - } - } - - /** - * Retrieves the CharsetDecoder for the given encoding. Note, This isn't perfect as I think ISCII-DEVANAGARI and - * MICROSOFT-CP1251 etc are allowed... - * - * @param encoding Encoding to retrieve the CharsetDecoder for - * @return CharSetDecoder for the given encoding - */ - private CharsetDecoder getJavaEncoding(String encoding) { - Charset charset = Charset.forName(encoding); - return charset.newDecoder(); - } - - /** - * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definition line taken from the affix file - * - * @param flagLine Line containing the flag information - * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition - */ - private FlagParsingStrategy getFlagParsingStrategy(String flagLine) { - String flagType = flagLine.substring(5); - - if (NUM_FLAG_TYPE.equals(flagType)) { - return new NumFlagParsingStrategy(); - } else if (UTF8_FLAG_TYPE.equals(flagType)) { - return new SimpleFlagParsingStrategy(); - } else if (LONG_FLAG_TYPE.equals(flagType)) { - return new DoubleASCIIFlagParsingStrategy(); - } - - throw new IllegalArgumentException("Unknown flag type: " + flagType); - } - - /** - * Reads the dictionary file through the provided InputStream, building up the words map - * - * @param dictionary InputStream to read the dictionary file through - * @param decoder CharsetDecoder used to decode the contents of the file - * @throws IOException Can be thrown while reading from the file - */ - private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder) throws IOException { - BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder)); - // TODO: don't create millions of strings. 
- String line = reader.readLine(); // first line is number of entries - int numEntries = Integer.parseInt(line); - - // TODO: the flags themselves can be double-chars (long) or also numeric - // either way the trick is to encode them as char... but they must be parsed differently - while ((line = reader.readLine()) != null) { - String entry; - HunspellWord wordForm; - - int flagSep = line.lastIndexOf('/'); - if (flagSep == -1) { - wordForm = NOFLAGS; - entry = line; - } else { - // note, there can be comments (morph description) after a flag. - // we should really look for any whitespace - int end = line.indexOf('\t', flagSep); - if (end == -1) - end = line.length(); - - String flagPart = line.substring(flagSep + 1, end); - if (aliasCount > 0) { - flagPart = getAliasValue(Integer.parseInt(flagPart)); - } - - wordForm = new HunspellWord(flagParsingStrategy.parseFlags(flagPart)); - Arrays.sort(wordForm.getFlags()); - entry = line.substring(0, flagSep); - } - if(ignoreCase) { - entry = entry.toLowerCase(Locale.ROOT); - } - - List entries = new ArrayList(); - entries.add(wordForm); - words.put(entry, entries); - } - } - - public Version getVersion() { - return version; - } - - private void parseAlias(String line) { - String ruleArgs[] = line.split("\\s+"); - if (aliases == null) { - //first line should be the aliases count - final int count = Integer.parseInt(ruleArgs[1]); - aliases = new String[count]; - } else { - aliases[aliasCount++] = ruleArgs[1]; - } - } - - private String getAliasValue(int id) { - try { - return aliases[id - 1]; - } catch (IndexOutOfBoundsException ex) { - throw new IllegalArgumentException("Bad flag alias number:" + id, ex); - } - } - - /** - * Abstraction of the process of parsing flags taken from the affix and dic files - */ - private static abstract class FlagParsingStrategy { - - /** - * Parses the given String into a single flag - * - * @param rawFlag String to parse into a flag - * @return Parsed flag - */ - char parseFlag(String rawFlag) 
{ - return parseFlags(rawFlag)[0]; - } - - /** - * Parses the given String into multiple flags - * - * @param rawFlags String to parse into flags - * @return Parsed flags - */ - abstract char[] parseFlags(String rawFlags); - } - - /** - * Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as a individual flags. - * Can be used with both the ASCII and UTF-8 flag types. - */ - private static class SimpleFlagParsingStrategy extends FlagParsingStrategy { - /** - * {@inheritDoc} - */ - @Override - public char[] parseFlags(String rawFlags) { - return rawFlags.toCharArray(); - } - } - - /** - * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form. In the case - * of multiple flags, each number is separated by a comma. - */ - private static class NumFlagParsingStrategy extends FlagParsingStrategy { - /** - * {@inheritDoc} - */ - @Override - public char[] parseFlags(String rawFlags) { - String[] rawFlagParts = rawFlags.trim().split(","); - char[] flags = new char[rawFlagParts.length]; - - for (int i = 0; i < rawFlagParts.length; i++) { - // note, removing the trailing X/leading I for nepali... what is the rule here?! - flags[i] = (char) Integer.parseInt(rawFlagParts[i].replaceAll("[^0-9]", "")); - } - - return flags; - } - } - - /** - * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes - * must be combined into a single character. 
- * - * TODO (rmuir) test - */ - private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy { - - /** - * {@inheritDoc} - */ - @Override - public char[] parseFlags(String rawFlags) { - if (rawFlags.length() == 0) { - return new char[0]; - } - - StringBuilder builder = new StringBuilder(); - for (int i = 0; i < rawFlags.length(); i+=2) { - char cookedFlag = (char) ((int) rawFlags.charAt(i) + (int) rawFlags.charAt(i + 1)); - builder.append(cookedFlag); - } - - char flags[] = new char[builder.length()]; - builder.getChars(0, builder.length(), flags, 0); - return flags; - } - } - - public boolean isIgnoreCase() { - return ignoreCase; - } -} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java index 4ff0a741ad8..a9b512b7bbd 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java @@ -18,14 +18,16 @@ package org.apache.lucene.analysis.hunspell; */ import java.io.IOException; +import java.util.Collections; +import java.util.Comparator; import java.util.List; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.hunspell.HunspellStemmer.Stem; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.util.CharsRef; /** * TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple @@ -41,71 +43,83 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; * {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory} *

* - * + * @lucene.experimental */ public final class HunspellStemFilter extends TokenFilter { private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class); - private final HunspellStemmer stemmer; + private final Stemmer stemmer; - private List buffer; + private List buffer; private State savedState; private final boolean dedup; + private final boolean longestOnly; /** Create a {@link HunspellStemFilter} which deduplicates stems and has a maximum * recursion level of 2. - * @see #HunspellStemFilter(TokenStream, HunspellDictionary, int) */ - public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary) { + * @see #HunspellStemFilter(TokenStream, Dictionary, int) */ + public HunspellStemFilter(TokenStream input, Dictionary dictionary) { this(input, dictionary, 2); } /** - * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided - * HunspellDictionary + * Creates a new Hunspell2StemFilter that will stem tokens from the given TokenStream using affix rules in the provided + * Dictionary * * @param input TokenStream whose tokens will be stemmed * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens * @param recursionCap maximum level of recursion stemmer can go into, defaults to 2 */ - public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, int recursionCap) { + public HunspellStemFilter(TokenStream input, Dictionary dictionary, int recursionCap) { this(input, dictionary, true, recursionCap); } /** Create a {@link HunspellStemFilter} which has a maximum recursion level of 2. 
- * @see #HunspellStemFilter(TokenStream, HunspellDictionary, boolean, int) */ - public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup) { + * @see #HunspellStemFilter(TokenStream, Dictionary, boolean, int) */ + public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup) { this(input, dictionary, dedup, 2); } - + /** * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided - * HunspellDictionary + * Dictionary * * @param input TokenStream whose tokens will be stemmed * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens * @param dedup true if only unique terms should be output. * @param recursionCap maximum level of recursion stemmer can go into, defaults to 2 */ - public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup, int recursionCap) { - super(input); - this.dedup = dedup; - this.stemmer = new HunspellStemmer(dictionary, recursionCap); + public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap) { + this(input, dictionary, dedup, recursionCap, false); } /** - * {@inheritDoc} + * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided + * Dictionary + * + * @param input TokenStream whose tokens will be stemmed + * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens + * @param dedup true if only unique terms should be output. + * @param recursionCap maximum level of recursion stemmer can go into, defaults to 2 + * @param longestOnly true if only the longest term should be output. 
*/ + public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap, boolean longestOnly) { + super(input); + this.dedup = dedup && longestOnly == false; // don't waste time deduping if longestOnly is set + this.stemmer = new Stemmer(dictionary, recursionCap); + this.longestOnly = longestOnly; + } + @Override public boolean incrementToken() throws IOException { if (buffer != null && !buffer.isEmpty()) { - Stem nextStem = buffer.remove(0); + CharsRef nextStem = buffer.remove(0); restoreState(savedState); posIncAtt.setPositionIncrement(0); - termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength()); - termAtt.setLength(nextStem.getStemLength()); + termAtt.setEmpty().append(nextStem); return true; } @@ -122,24 +136,41 @@ public final class HunspellStemFilter extends TokenFilter { if (buffer.isEmpty()) { // we do not know this word, return it unchanged return true; } + + if (longestOnly && buffer.size() > 1) { + Collections.sort(buffer, lengthComparator); + } - Stem stem = buffer.remove(0); - termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength()); - termAtt.setLength(stem.getStemLength()); + CharsRef stem = buffer.remove(0); + termAtt.setEmpty().append(stem); - if (!buffer.isEmpty()) { - savedState = captureState(); + if (longestOnly) { + buffer.clear(); + } else { + if (!buffer.isEmpty()) { + savedState = captureState(); + } } return true; } - /** - * {@inheritDoc} - */ @Override public void reset() throws IOException { super.reset(); buffer = null; } + + static final Comparator lengthComparator = new Comparator() { + @Override + public int compare(CharsRef o1, CharsRef o2) { + int cmp = Integer.compare(o2.length, o1.length); + if (cmp == 0) { + // tie break on text + return o2.compareTo(o1); + } else { + return cmp; + } + } + }; } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java 
b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java index 63e621c2ab9..e632b489d51 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java @@ -31,89 +31,75 @@ import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.util.IOUtils; /** - * TokenFilterFactory that creates instances of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}. - * Example config for British English including a custom dictionary, case insensitive matching: + * TokenFilterFactory that creates instances of {@link HunspellStemFilter}. + * Example config for British English: *
  * <filter class="solr.HunspellStemFilterFactory"
- *    dictionary="en_GB.dic,my_custom.dic"
- *    affix="en_GB.aff"
- *    ignoreCase="true" />
+ * dictionary="en_GB.dic,my_custom.dic" + * affix="en_GB.aff" + * ignoreCase="false" + * longestOnly="false" /> * Both parameters dictionary and affix are mandatory. - *
- * The parameter ignoreCase (true/false) controls whether matching is case sensitive or not. Default false. - *
- * The parameter strictAffixParsing (true/false) controls whether the affix parsing is strict or not. Default true. - * If strict an error while reading an affix rule causes a ParseException, otherwise is ignored. - *
* Dictionaries for many languages are available through the OpenOffice project. * * See http://wiki.apache.org/solr/Hunspell + * @lucene.experimental */ public class HunspellStemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - private static final String PARAM_DICTIONARY = "dictionary"; - private static final String PARAM_AFFIX = "affix"; - private static final String PARAM_IGNORE_CASE = "ignoreCase"; - private static final String PARAM_STRICT_AFFIX_PARSING = "strictAffixParsing"; + private static final String PARAM_DICTIONARY = "dictionary"; + private static final String PARAM_AFFIX = "affix"; private static final String PARAM_RECURSION_CAP = "recursionCap"; + private static final String PARAM_IGNORE_CASE = "ignoreCase"; + private static final String PARAM_LONGEST_ONLY = "longestOnly"; - private final String dictionaryArg; + private final String dictionaryFiles; private final String affixFile; private final boolean ignoreCase; - private final boolean strictAffixParsing; - private HunspellDictionary dictionary; + private final boolean longestOnly; + private Dictionary dictionary; private int recursionCap; /** Creates a new HunspellStemFilterFactory */ public HunspellStemFilterFactory(Map args) { super(args); - assureMatchVersion(); - dictionaryArg = require(args, PARAM_DICTIONARY); + dictionaryFiles = require(args, PARAM_DICTIONARY); affixFile = get(args, PARAM_AFFIX); ignoreCase = getBoolean(args, PARAM_IGNORE_CASE, false); - strictAffixParsing = getBoolean(args, PARAM_STRICT_AFFIX_PARSING, true); recursionCap = getInt(args, PARAM_RECURSION_CAP, 2); + longestOnly = getBoolean(args, PARAM_LONGEST_ONLY, false); + // this isnt necessary: we properly load all dictionaries. 
+ // but recognize and ignore for back compat + getBoolean(args, "strictAffixParsing", true); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } } - /** - * Loads the hunspell dictionary and affix files defined in the configuration - * - * @param loader ResourceLoader used to load the files - */ @Override public void inform(ResourceLoader loader) throws IOException { - String dictionaryFiles[] = dictionaryArg.split(","); + String dicts[] = dictionaryFiles.split(","); InputStream affix = null; List dictionaries = new ArrayList(); try { dictionaries = new ArrayList(); - for (String file : dictionaryFiles) { + for (String file : dicts) { dictionaries.add(loader.openResource(file)); } affix = loader.openResource(affixFile); - this.dictionary = new HunspellDictionary(affix, dictionaries, luceneMatchVersion, ignoreCase, strictAffixParsing); + this.dictionary = new Dictionary(affix, dictionaries, ignoreCase); } catch (ParseException e) { - throw new IOException("Unable to load hunspell data! [dictionary=" + dictionaryArg + ",affix=" + affixFile + "]", e); + throw new IOException("Unable to load hunspell data! 
[dictionary=" + dictionaries + ",affix=" + affixFile + "]", e); } finally { IOUtils.closeWhileHandlingException(affix); IOUtils.closeWhileHandlingException(dictionaries); } } - /** - * Creates an instance of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter} that will filter the given - * TokenStream - * - * @param tokenStream TokenStream that will be filtered - * @return HunspellStemFilter that filters the TokenStream - */ @Override public TokenStream create(TokenStream tokenStream) { - return new HunspellStemFilter(tokenStream, dictionary, recursionCap); + return new HunspellStemFilter(tokenStream, dictionary, true, recursionCap, longestOnly); } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java deleted file mode 100644 index ae2948284d6..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java +++ /dev/null @@ -1,392 +0,0 @@ -package org.apache.lucene.analysis.hunspell; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -import org.apache.lucene.analysis.util.CharArraySet; -import org.apache.lucene.analysis.util.CharacterUtils; -import org.apache.lucene.util.Version; - -/** - * HunspellStemmer uses the affix rules declared in the HunspellDictionary to generate one or more stems for a word. It - * conforms to the algorithm in the original hunspell algorithm, including recursive suffix stripping. - */ -public class HunspellStemmer { - private final int recursionCap; - private final HunspellDictionary dictionary; - private final StringBuilder segment = new StringBuilder(); - private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_CURRENT); - - /** - * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems. Uses the - * default recursion cap of 2 (based on Hunspell documentation). - * - * @param dictionary HunspellDictionary that will be used to create the stems - */ - public HunspellStemmer(HunspellDictionary dictionary) { - this(dictionary, 2); - } - - /** - * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems - * - * @param dictionary HunspellDictionary that will be used to create the stems - * @param recursionCap maximum level of recursion stemmer can go into - */ - public HunspellStemmer(HunspellDictionary dictionary, int recursionCap) { - this.dictionary = dictionary; - this.recursionCap = recursionCap; - } - - /** - * Find the stem(s) of the provided word - * - * @param word Word to find the stems for - * @return List of stems for the word - */ - public List stem(String word) { - return stem(word.toCharArray(), word.length()); - } - - /** - * Find the stem(s) of the provided word - * - * @param word Word to find the stems for - * @return List of stems for the word - */ - public List stem(char word[], int length) { - List stems = new ArrayList(); - 
if (dictionary.lookupWord(word, 0, length) != null) { - stems.add(new Stem(word, length)); - } - stems.addAll(stem(word, length, null, 0)); - return stems; - } - - /** - * Find the unique stem(s) of the provided word - * - * @param word Word to find the stems for - * @return List of stems for the word - */ - public List uniqueStems(char word[], int length) { - List stems = new ArrayList(); - CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, dictionary.isIgnoreCase()); - if (dictionary.lookupWord(word, 0, length) != null) { - stems.add(new Stem(word, length)); - terms.add(word); - } - List otherStems = stem(word, length, null, 0); - for (Stem s : otherStems) { - if (!terms.contains(s.stem)) { - stems.add(s); - terms.add(s.stem); - } - } - return stems; - } - - // ================================================= Helper Methods ================================================ - - /** - * Generates a list of stems for the provided word - * - * @param word Word to generate the stems for - * @param flags Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step - * @param recursionDepth Level of recursion this stemming step is at - * @return List of stems, pr an empty if no stems are found - */ - private List stem(char word[], int length, char[] flags, int recursionDepth) { - List stems = new ArrayList(); - - for (int i = 0; i < length; i++) { - List suffixes = dictionary.lookupSuffix(word, i, length - i); - if (suffixes == null) { - continue; - } - - for (HunspellAffix suffix : suffixes) { - if (hasCrossCheckedFlag(suffix.getFlag(), flags)) { - int deAffixedLength = length - suffix.getAppend().length(); - // TODO: can we do this in-place? 
- String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString(); - - List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth); - for (Stem stem : stemList) { - stem.addSuffix(suffix); - } - - stems.addAll(stemList); - } - } - } - - for (int i = length - 1; i >= 0; i--) { - List prefixes = dictionary.lookupPrefix(word, 0, i); - if (prefixes == null) { - continue; - } - - for (HunspellAffix prefix : prefixes) { - if (hasCrossCheckedFlag(prefix.getFlag(), flags)) { - int deAffixedStart = prefix.getAppend().length(); - int deAffixedLength = length - deAffixedStart; - - String strippedWord = new StringBuilder().append(prefix.getStrip()) - .append(word, deAffixedStart, deAffixedLength) - .toString(); - - List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth); - for (Stem stem : stemList) { - stem.addPrefix(prefix); - } - - stems.addAll(stemList); - } - } - } - - return stems; - } - - /** - * Applies the affix rule to the given word, producing a list of stems if any are found - * - * @param strippedWord Word the affix has been removed and the strip added - * @param affix HunspellAffix representing the affix rule itself - * @param recursionDepth Level of recursion this stemming step is at - * @return List of stems for the word, or an empty list if none are found - */ - @SuppressWarnings("unchecked") - public List applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) { - if(dictionary.isIgnoreCase()) { - charUtils.toLowerCase(strippedWord, 0, strippedWord.length); - } - segment.setLength(0); - segment.append(strippedWord, 0, length); - if (!affix.checkCondition(segment)) { - return Collections.EMPTY_LIST; - } - - List stems = new ArrayList(); - - List words = dictionary.lookupWord(strippedWord, 0, length); - if (words != null) { - for (HunspellWord hunspellWord : words) { - if 
(hunspellWord.hasFlag(affix.getFlag())) { - stems.add(new Stem(strippedWord, length)); - } - } - } - - if (affix.isCrossProduct() && recursionDepth < recursionCap) { - stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), ++recursionDepth)); - } - - return stems; - } - - /** - * Checks if the given flag cross checks with the given array of flags - * - * @param flag Flag to cross check with the array of flags - * @param flags Array of flags to cross check against. Can be {@code null} - * @return {@code true} if the flag is found in the array or the array is {@code null}, {@code false} otherwise - */ - private boolean hasCrossCheckedFlag(char flag, char[] flags) { - return flags == null || Arrays.binarySearch(flags, flag) >= 0; - } - - /** - * Stem represents all information known about a stem of a word. This includes the stem, and the prefixes and suffixes - * that were used to change the word into the stem. - */ - public static class Stem { - - private final List prefixes = new ArrayList(); - private final List suffixes = new ArrayList(); - private final char stem[]; - private final int stemLength; - - /** - * Creates a new Stem wrapping the given word stem - * - * @param stem Stem of a word - */ - public Stem(char stem[], int stemLength) { - this.stem = stem; - this.stemLength = stemLength; - } - - /** - * Adds a prefix to the list of prefixes used to generate this stem. Because it is assumed that prefixes are added - * depth first, the prefix is added to the front of the list - * - * @param prefix Prefix to add to the list of prefixes for this stem - */ - public void addPrefix(HunspellAffix prefix) { - prefixes.add(0, prefix); - } - - /** - * Adds a suffix to the list of suffixes used to generate this stem. 
Because it is assumed that suffixes are added - * depth first, the suffix is added to the end of the list - * - * @param suffix Suffix to add to the list of suffixes for this stem - */ - public void addSuffix(HunspellAffix suffix) { - suffixes.add(suffix); - } - - /** - * Returns the list of prefixes used to generate the stem - * - * @return List of prefixes used to generate the stem or an empty list if no prefixes were required - */ - public List getPrefixes() { - return prefixes; - } - - /** - * Returns the list of suffixes used to generate the stem - * - * @return List of suffixes used to generate the stem or an empty list if no suffixes were required - */ - public List getSuffixes() { - return suffixes; - } - - /** - * Returns the actual word stem itself - * - * @return Word stem itself - */ - public char[] getStem() { - return stem; - } - - /** - * @return the stemLength - */ - public int getStemLength() { - return stemLength; - } - - public String getStemString() { - return new String(stem, 0, stemLength); - } - - } - - - // ================================================= Entry Point =================================================== - - /* - * HunspellStemmer entry point. Accepts two arguments: location of affix file and location of dic file - * - * @param args Program arguments. Should contain location of affix file and location of dic file - * @throws IOException Can be thrown while reading from the files - * @throws ParseException Can be thrown while parsing the files - public static void main(String[] args) throws IOException, ParseException { - boolean ignoreCase = false; - int offset = 0; - - if (args.length < 2) { - System.out.println("usage: HunspellStemmer [-i] "); - System.exit(1); - } - - if(args[offset].equals("-i")) { - ignoreCase = true; - System.out.println("Ignoring case. 
All stems will be returned lowercased"); - offset++; - } - - InputStream affixInputStream = new FileInputStream(args[offset++]); - InputStream dicInputStream = new FileInputStream(args[offset++]); - - // :Post-Release-Update-Version.LUCENE_XY: - HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_50, ignoreCase); - - affixInputStream.close(); - dicInputStream.close(); - - HunspellStemmer stemmer = new HunspellStemmer(dictionary); - - Scanner scanner = new Scanner(System.in, Charset.defaultCharset().name()); - - System.out.print("> "); - while (scanner.hasNextLine()) { - String word = scanner.nextLine(); - - if ("exit".equals(word)) { - break; - } - - printStemResults(word, stemmer.stem(word.toCharArray(), word.length())); - - System.out.print("> "); - } - } - - * Prints the results of the stemming of a word - * - * @param originalWord Word that has been stemmed - * @param stems Stems of the word - private static void printStemResults(String originalWord, List stems) { - StringBuilder builder = new StringBuilder().append("stem(").append(originalWord).append(")").append("\n"); - - for (Stem stem : stems) { - builder.append("- ").append(stem.getStem()).append(": "); - - for (HunspellAffix prefix : stem.getPrefixes()) { - builder.append(prefix.getAppend()).append("+"); - - if (hasText(prefix.getStrip())) { - builder.append(prefix.getStrip()).append("-"); - } - } - - builder.append(stem.getStem()); - - for (HunspellAffix suffix : stem.getSuffixes()) { - if (hasText(suffix.getStrip())) { - builder.append("-").append(suffix.getStrip()); - } - - builder.append("+").append(suffix.getAppend()); - } - builder.append("\n"); - } - - System.out.println(builder); - } - - * Simple utility to check if the given String has any text - * - * @param str String to check if it has any text - * @return {@code true} if the String has text, {@code false} otherwise - private static boolean hasText(String str) { - return str != null && 
str.length() > 0; - } - */ -} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java deleted file mode 100644 index fe216d30dc8..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java +++ /dev/null @@ -1,63 +0,0 @@ -package org.apache.lucene.analysis.hunspell; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.Arrays; - -/** - * A dictionary (.dic) entry with its associated flags. - */ -public class HunspellWord { - - private final char flags[]; // sorted, can we represent more concisely? 
- - /** - * Creates a new HunspellWord with no associated flags - */ - public HunspellWord() { - flags = null; - } - - /** - * Constructs a new HunspellWord with the given flags - * - * @param flags Flags to associate with the word - */ - public HunspellWord(char[] flags) { - this.flags = flags; - } - - /** - * Checks whether the word has the given flag associated with it - * - * @param flag Flag to check whether it is associated with the word - * @return {@code true} if the flag is associated, {@code false} otherwise - */ - public boolean hasFlag(char flag) { - return flags != null && Arrays.binarySearch(flags, flag) >= 0; - } - - /** - * Returns the flags associated with the word - * - * @return Flags associated with the word - */ - public char[] getFlags() { - return flags; - } -} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ISO8859_14Decoder.java similarity index 98% rename from lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java rename to lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ISO8859_14Decoder.java index 4de0d4bc051..2d87947ab3d 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ISO8859_14Decoder.java @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis.hunspell2; +package org.apache.lucene.analysis.hunspell; /* * Licensed to the Apache Software Foundation (ASF) under one or more diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java similarity index 92% rename from lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java rename to 
lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java index d6b0133830a..18e6588ce7a 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis.hunspell2; +package org.apache.lucene.analysis.hunspell; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -24,6 +24,7 @@ import java.util.List; import java.util.regex.Pattern; import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.util.CharacterUtils; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; @@ -37,9 +38,10 @@ import org.apache.lucene.util.Version; final class Stemmer { private final int recursionCap; private final Dictionary dictionary; - private BytesRef scratch = new BytesRef(); + private final BytesRef scratch = new BytesRef(); private final StringBuilder segment = new StringBuilder(); private final ByteArrayDataInput affixReader; + private final CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_CURRENT); /** * Constructs a new Stemmer which will use the provided Dictionary to create its stems. 
Uses the @@ -80,6 +82,9 @@ final class Stemmer { * @return List of stems for the word */ public List stem(char word[], int length) { + if (dictionary.ignoreCase) { + charUtils.toLowerCase(word, 0, length); + } List stems = new ArrayList(); if (dictionary.lookupWord(word, 0, length, scratch) != null) { stems.add(new CharsRef(word, 0, length)); @@ -95,20 +100,19 @@ final class Stemmer { * @return List of stems for the word */ public List uniqueStems(char word[], int length) { - List stems = new ArrayList(); - CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, false); - if (dictionary.lookupWord(word, 0, length, scratch) != null) { - stems.add(new CharsRef(word, 0, length)); - terms.add(word); + List stems = stem(word, length); + if (stems.size() < 2) { + return stems; } - List otherStems = stem(word, length, Dictionary.NOFLAGS, 0); - for (CharsRef s : otherStems) { + CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, dictionary.ignoreCase); + List deduped = new ArrayList<>(); + for (CharsRef s : stems) { if (!terms.contains(s)) { - stems.add(s); + deduped.add(s); terms.add(s); } } - return stems; + return deduped; } // ================================================= Helper Methods ================================================ @@ -188,7 +192,7 @@ final class Stemmer { * @param recursionDepth Level of recursion this stemming step is at * @return List of stems for the word, or an empty list if none are found */ - public List applyAffix(char strippedWord[], int length, int affix, int recursionDepth) { + List applyAffix(char strippedWord[], int length, int affix, int recursionDepth) { segment.setLength(0); segment.append(strippedWord, 0, length); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java deleted file mode 100644 index 00ff88469be..00000000000 --- 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java +++ /dev/null @@ -1,137 +0,0 @@ -package org.apache.lucene.analysis.hunspell2; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.util.List; - -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.util.CharsRef; - -/** - * TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple - * stems, this filter can emit multiple tokens for each consumed token - * - *

- * Note: This filter is aware of the {@link KeywordAttribute}. To prevent - * certain terms from being passed to the stemmer - * {@link KeywordAttribute#isKeyword()} should be set to true - * in a previous {@link TokenStream}. - * - * Note: For including the original term as well as the stemmed version, see - * {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory} - *

- * - * @lucene.experimental - */ -public final class Hunspell2StemFilter extends TokenFilter { - - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); - private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class); - private final Stemmer stemmer; - - private List buffer; - private State savedState; - - private final boolean dedup; - - /** Create a {@link Hunspell2StemFilter} which deduplicates stems and has a maximum - * recursion level of 2. - * @see #Hunspell2StemFilter(TokenStream, Dictionary, int) */ - public Hunspell2StemFilter(TokenStream input, Dictionary dictionary) { - this(input, dictionary, 2); - } - - /** - * Creates a new Hunspell2StemFilter that will stem tokens from the given TokenStream using affix rules in the provided - * Dictionary - * - * @param input TokenStream whose tokens will be stemmed - * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens - * @param recursionCap maximum level of recursion stemmer can go into, defaults to 2 - */ - public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, int recursionCap) { - this(input, dictionary, true, recursionCap); - } - - /** Create a {@link Hunspell2StemFilter} which has a maximum recursion level of 2. - * @see #Hunspell2StemFilter(TokenStream, Dictionary, boolean, int) */ - public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup) { - this(input, dictionary, dedup, 2); - } - - /** - * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided - * Dictionary - * - * @param input TokenStream whose tokens will be stemmed - * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens - * @param dedup true if only unique terms should be output. 
- * @param recursionCap maximum level of recursion stemmer can go into, defaults to 2 - */ - public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap) { - super(input); - this.dedup = dedup; - this.stemmer = new Stemmer(dictionary, recursionCap); - } - - @Override - public boolean incrementToken() throws IOException { - if (buffer != null && !buffer.isEmpty()) { - CharsRef nextStem = buffer.remove(0); - restoreState(savedState); - posIncAtt.setPositionIncrement(0); - termAtt.setEmpty().append(nextStem); - return true; - } - - if (!input.incrementToken()) { - return false; - } - - if (keywordAtt.isKeyword()) { - return true; - } - - buffer = dedup ? stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length()); - - if (buffer.isEmpty()) { // we do not know this word, return it unchanged - return true; - } - - CharsRef stem = buffer.remove(0); - termAtt.setEmpty().append(stem); - - if (!buffer.isEmpty()) { - savedState = captureState(); - } - - return true; - } - - @Override - public void reset() throws IOException { - super.reset(); - buffer = null; - } -} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java deleted file mode 100644 index 6ce73698dfd..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java +++ /dev/null @@ -1,80 +0,0 @@ -package org.apache.lucene.analysis.hunspell2; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.InputStream; -import java.text.ParseException; -import java.util.Map; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.lucene.analysis.util.ResourceLoaderAware; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * TokenFilterFactory that creates instances of {@link Hunspell2StemFilter}. - * Example config for British English: - *
- * <filter class="solr.Hunspell2StemFilterFactory"
- *         dictionary="en_GB.dic"
- *         affix="en_GB.aff" />
- * Both parameters dictionary and affix are mandatory. - * Dictionaries for many languages are available through the OpenOffice project. - * - * See http://wiki.apache.org/solr/Hunspell - * @lucene.experimental - */ -public class Hunspell2StemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - private static final String PARAM_DICTIONARY = "dictionary"; - private static final String PARAM_AFFIX = "affix"; - private static final String PARAM_RECURSION_CAP = "recursionCap"; - - private final String dictionaryFile; - private final String affixFile; - private Dictionary dictionary; - private int recursionCap; - - /** Creates a new Hunspell2StemFilterFactory */ - public Hunspell2StemFilterFactory(Map args) { - super(args); - dictionaryFile = require(args, PARAM_DICTIONARY); - affixFile = get(args, PARAM_AFFIX); - recursionCap = getInt(args, PARAM_RECURSION_CAP, 2); - if (!args.isEmpty()) { - throw new IllegalArgumentException("Unknown parameters: " + args); - } - } - - @Override - public void inform(ResourceLoader loader) throws IOException { - try (InputStream affix = loader.openResource(affixFile); - InputStream dictionary = loader.openResource(dictionaryFile)) { - try { - this.dictionary = new Dictionary(affix, dictionary); - } catch (ParseException e) { - throw new RuntimeException(e); - } - } - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new Hunspell2StemFilter(tokenStream, dictionary, recursionCap); - } -} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html deleted file mode 100644 index 196591969e8..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html +++ /dev/null @@ -1,26 +0,0 @@ - - - -Stemming TokenFilter using a Java implementation of the -Hunspell stemming algorithm. -

-Dictionaries can be found on -OpenOffice's wiki -

- - diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory index e4ca7c6802c..04fc80cf59c 100644 --- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory +++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory @@ -51,7 +51,6 @@ org.apache.lucene.analysis.hi.HindiNormalizationFilterFactory org.apache.lucene.analysis.hi.HindiStemFilterFactory org.apache.lucene.analysis.hu.HungarianLightStemFilterFactory org.apache.lucene.analysis.hunspell.HunspellStemFilterFactory -org.apache.lucene.analysis.hunspell2.Hunspell2StemFilterFactory org.apache.lucene.analysis.id.IndonesianStemFilterFactory org.apache.lucene.analysis.in.IndicNormalizationFilterFactory org.apache.lucene.analysis.it.ItalianLightStemFilterFactory diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index bca5e1ede50..617e7523b69 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -62,8 +62,8 @@ import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter; import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter; import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter; import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; -import org.apache.lucene.analysis.hunspell.HunspellDictionary; -import org.apache.lucene.analysis.hunspell.HunspellDictionaryTest; +import org.apache.lucene.analysis.hunspell.Dictionary; +import org.apache.lucene.analysis.hunspell.TestHunspellStemFilter; import 
org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter; import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter; import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter; @@ -406,13 +406,13 @@ public class TestRandomChains extends BaseTokenStreamTestCase { return new IdentityEncoder(); // the other encoders will throw exceptions if tokens arent numbers? } }); - put(HunspellDictionary.class, new ArgProducer() { + put(Dictionary.class, new ArgProducer() { @Override public Object create(Random random) { // TODO: make nastier - InputStream affixStream = HunspellDictionaryTest.class.getResourceAsStream("test.aff"); - InputStream dictStream = HunspellDictionaryTest.class.getResourceAsStream("test.dic"); + InputStream affixStream = TestHunspellStemFilter.class.getResourceAsStream("simple.aff"); + InputStream dictStream = TestHunspellStemFilter.class.getResourceAsStream("simple.dic"); try { - return new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT); + return new Dictionary(affixStream, dictStream); } catch (Exception ex) { Rethrow.rethrow(ex); return null; // unreachable code diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java deleted file mode 100644 index fd8f9211727..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java +++ /dev/null @@ -1,201 +0,0 @@ -package org.apache.lucene.analysis.hunspell; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.InputStream; -import java.text.ParseException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import org.apache.lucene.util.LuceneTestCase; -import org.junit.Assert; -import org.junit.Test; - -public class HunspellDictionaryTest extends LuceneTestCase { - - private class CloseCheckInputStream extends InputStream { - private InputStream delegate; - - private boolean closed = false; - - public CloseCheckInputStream(InputStream delegate) { - super(); - this.delegate = delegate; - } - - @Override - public int read() throws IOException { - return delegate.read(); - } - - @Override - public int hashCode() { - return delegate.hashCode(); - } - - @Override - public int read(byte[] b) throws IOException { - return delegate.read(b); - } - - @Override - public boolean equals(Object obj) { - return delegate.equals(obj); - } - - @Override - public int read(byte[] b, int off, int len) throws IOException { - return delegate.read(b, off, len); - } - - @Override - public long skip(long n) throws IOException { - return delegate.skip(n); - } - - @Override - public String toString() { - return delegate.toString(); - } - - @Override - public int available() throws IOException { - return delegate.available(); - } - - @Override - public void close() throws IOException { - this.closed = true; - delegate.close(); - } - - @Override - public void mark(int readlimit) { - delegate.mark(readlimit); - } - - @Override - public void reset() throws IOException { - delegate.reset(); - } - - @Override 
- public boolean markSupported() { - return delegate.markSupported(); - } - - public boolean isClosed() { - return this.closed; - } - - } - - @Test - public void testResourceCleanup() throws IOException, ParseException { - CloseCheckInputStream affixStream = new CloseCheckInputStream(getClass().getResourceAsStream("testCompressed.aff")); - CloseCheckInputStream dictStream = new CloseCheckInputStream(getClass().getResourceAsStream("testCompressed.dic")); - - new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT); - - assertFalse(affixStream.isClosed()); - assertFalse(dictStream.isClosed()); - - affixStream.close(); - dictStream.close(); - - assertTrue(affixStream.isClosed()); - assertTrue(dictStream.isClosed()); - } - - @Test - public void testHunspellDictionary_loadDicAff() throws IOException, ParseException { - InputStream affixStream = getClass().getResourceAsStream("test.aff"); - InputStream dictStream = getClass().getResourceAsStream("test.dic"); - - HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT); - assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size()); - assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size()); - assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size()); - assertEquals("Wrong number of flags for lucen", 1, dictionary.lookupWord(new char[]{'l', 'u', 'c', 'e', 'n'}, 0, 5).get(0).getFlags().length); - - affixStream.close(); - dictStream.close(); - } - - @Test - public void testHunspellDictionary_multipleDictWithOverride() throws IOException, ParseException { - InputStream affixStream = getClass().getResourceAsStream("test.aff"); - List dictStreams = new ArrayList(); - dictStreams.add(getClass().getResourceAsStream("test.dic")); - dictStreams.add(getClass().getResourceAsStream("testOverride.dic")); - - HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStreams, TEST_VERSION_CURRENT, false); - 
assertEquals("Wrong number of flags for lucen", 3, dictionary.lookupWord(new char[]{'l', 'u', 'c', 'e', 'n'}, 0, 5).get(0).getFlags().length); - assertEquals("Wrong number of flags for bar", 1, dictionary.lookupWord(new char[]{'b', 'a', 'r'}, 0, 3).get(0).getFlags().length); - - affixStream.close(); - for(InputStream dstream : dictStreams) { - dstream.close(); - } - } - - @Test - public void testCompressedHunspellDictionary_loadDicAff() throws IOException, ParseException { - InputStream affixStream = getClass().getResourceAsStream("testCompressed.aff"); - InputStream dictStream = getClass().getResourceAsStream("testCompressed.dic"); - - HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT); - assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size()); - assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size()); - assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size()); - - affixStream.close(); - dictStream.close(); - } - - @Test - public void testHunspellDictionary_loadDicWrongAff() throws IOException, ParseException { - InputStream affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff"); - InputStream dictStream = getClass().getResourceAsStream("test.dic"); - - HunspellDictionary dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT, false, false); - assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size()); - assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size()); - assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size()); - //strict parsing disabled: malformed rule is not loaded - assertNull(dictionary.lookupPrefix(new char[]{'a'}, 0, 1)); - affixStream.close(); - dictStream.close(); - - affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff"); - dictStream = getClass().getResourceAsStream("test.dic"); - //strict parsing enabled: malformed rule 
causes ParseException - try { - dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT, false, true); - Assert.fail(); - } catch(ParseException e) { - Assert.assertEquals("The affix file contains a rule with less than five elements", e.getMessage()); - Assert.assertEquals(23, e.getErrorOffset()); - } - - affixStream.close(); - dictStream.close(); - } -} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java deleted file mode 100644 index dd273fa8dc5..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java +++ /dev/null @@ -1,92 +0,0 @@ -package org.apache.lucene.analysis.hunspell; -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import java.io.InputStream; -import java.text.ParseException; -import java.util.Arrays; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.KeywordTokenizer; -import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; -import org.apache.lucene.analysis.util.CharArraySet; -import org.apache.lucene.util.TestUtil; -import org.junit.AfterClass; -import org.junit.BeforeClass; - -public class HunspellStemFilterTest extends BaseTokenStreamTestCase { - - private static HunspellDictionary DICTIONARY; - @BeforeClass - public static void beforeClass() throws IOException, ParseException { - DICTIONARY = createDict(true); - } - @AfterClass - public static void afterClass() { - DICTIONARY = null; - } - public static HunspellDictionary createDict(boolean ignoreCase) throws IOException, ParseException { - InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff"); - InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic"); - - return new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT, ignoreCase); - } - - /** - * Simple test for KeywordAttribute - */ - public void testKeywordAttribute() throws IOException { - MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome"); - tokenizer.setEnableChecks(true); - HunspellStemFilter filter = new HunspellStemFilter(tokenizer, DICTIONARY, TestUtil.nextInt(random(), 1, 3)); - assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1}); - - // assert with keywork marker - tokenizer = whitespaceMockTokenizer("lucene is awesome"); - CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true); - filter = new HunspellStemFilter(new 
SetKeywordMarkerFilter(tokenizer, set), DICTIONARY, TestUtil.nextInt(random(), 1, 3)); - assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1}); - } - - /** blast some random strings through the analyzer */ - public void testRandomStrings() throws Exception { - Analyzer analyzer = new Analyzer() { - - @Override - protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); - return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, TestUtil.nextInt(random(), 1, 3))); - } - }; - checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER); - } - - public void testEmptyTerm() throws IOException { - Analyzer a = new Analyzer() { - @Override - protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer tokenizer = new KeywordTokenizer(); - return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, TestUtil.nextInt(random(), 1, 3))); - } - }; - checkOneTerm(a, "", ""); - } -} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java deleted file mode 100644 index 66a9410c27a..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java +++ /dev/null @@ -1,137 +0,0 @@ -package org.apache.lucene.analysis.hunspell; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.Version; -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.junit.Test; - -import java.io.IOException; -import java.io.InputStream; -import java.text.ParseException; -import java.util.List; - -import static junit.framework.Assert.assertEquals; - -public class HunspellStemmerTest extends LuceneTestCase { - - private static HunspellStemmer stemmer; - - @BeforeClass - public static void beforeClass() throws IOException, ParseException { - createStemmer(true); - } - - @AfterClass - public static void afterClass() { - stemmer = null; - } - - @Test - public void testStem_simpleSuffix() { - List stems = stemmer.stem("lucene"); - - assertEquals(2, stems.size()); - assertEquals("lucene", stems.get(0).getStemString()); - assertEquals("lucen", stems.get(1).getStemString()); - - stems = stemmer.stem("mahoute"); - assertEquals(1, stems.size()); - assertEquals("mahout", stems.get(0).getStemString()); - } - - @Test - public void testStem_simplePrefix() { - List stems = stemmer.stem("solr"); - - assertEquals(1, stems.size()); - assertEquals("olr", stems.get(0).getStemString()); - } - - @Test - public void testStem_recursiveSuffix() { - List stems = stemmer.stem("abcd"); - - assertEquals(1, stems.size()); - assertEquals("ab", stems.get(0).getStemString()); - } - - @Test - public void testStem_ignoreCase() throws IOException, ParseException { - List stems; - createStemmer(true); - - stems = stemmer.stem("apache"); - assertEquals(1, stems.size()); - 
assertEquals("apach", stems.get(0).getStemString()); - - stems = stemmer.stem("APACHE"); - assertEquals(1, stems.size()); - assertEquals("apach", stems.get(0).getStemString()); - - stems = stemmer.stem("Apache"); - assertEquals(1, stems.size()); - assertEquals("apach", stems.get(0).getStemString()); - - stems = stemmer.stem("foos"); - assertEquals(1, stems.size()); - assertEquals("foo", stems.get(0).getStemString()); - - stems = stemmer.stem("mood"); - assertEquals(1, stems.size()); - assertEquals("moo", stems.get(0).getStemString()); - - stems = stemmer.stem("Foos"); - assertEquals(1, stems.size()); - assertEquals("foo", stems.get(0).getStemString()); - - // The "Foo" rule gets overridden by the "foo" rule, and we don't merge - stems = stemmer.stem("Food"); - assertEquals(0, stems.size()); - - stems = stemmer.stem("Mood"); - assertEquals(1, stems.size()); - assertEquals("moo", stems.get(0).getStemString()); - } - - @Test - public void testStem_caseSensitive() throws IOException, ParseException { - createStemmer(false); - List stems = stemmer.stem("apache"); - assertEquals(0, stems.size()); - - stems = stemmer.stem("Apache"); - assertEquals(1, stems.size()); - assertEquals("Apach", stems.get(0).getStemString()); - } - - - private static void createStemmer(boolean ignoreCase) throws IOException, ParseException { - InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff"); - InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic"); - - HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT, ignoreCase); - stemmer = new HunspellStemmer(dictionary); - - affixStream.close(); - dictStream.close(); - } - -} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java similarity index 93% rename from 
lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java index d00fc634944..3322eb109a6 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis.hunspell2; +package org.apache.lucene.analysis.hunspell; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -22,7 +22,7 @@ import java.io.InputStream; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; -import org.apache.lucene.analysis.hunspell.HunspellDictionary; +import org.apache.lucene.analysis.hunspell.Dictionary; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.RamUsageEstimator; @@ -33,7 +33,7 @@ import org.junit.Ignore; * wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/ * Note some of the files differ only in case. This may be a problem on your operating system! 
*/ -//@Ignore("enable manually") +@Ignore("enable manually") public class TestAllDictionaries extends LuceneTestCase { // set this to the location of where you downloaded all the files @@ -162,21 +162,11 @@ public class TestAllDictionaries extends LuceneTestCase { assert dicEntry != null; ZipEntry affEntry = zip.getEntry(tests[i+2]); assert affEntry != null; - - // get ram from previous impl - String oldRAM = "FAIL"; - try (InputStream dictionary = zip.getInputStream(dicEntry); - InputStream affix = zip.getInputStream(affEntry)) { - try { - HunspellDictionary dic = new HunspellDictionary(affix, dictionary, TEST_VERSION_CURRENT); - oldRAM = RamUsageEstimator.humanSizeOf(dic); - } catch (Throwable t) {} - } try (InputStream dictionary = zip.getInputStream(dicEntry); InputStream affix = zip.getInputStream(affEntry)) { Dictionary dic = new Dictionary(affix, dictionary); - System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" + + System.out.println(tests[i] + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" + "words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " + "flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " + "strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " + @@ -204,7 +194,7 @@ public class TestAllDictionaries extends LuceneTestCase { try (InputStream dictionary = zip.getInputStream(dicEntry); InputStream affix = zip.getInputStream(affEntry)) { - Dictionary dic = new Dictionary(affix, dictionary); + new Dictionary(affix, dictionary); } } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java new file mode 100644 index 00000000000..64bdb41e8c7 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java @@ -0,0 +1,110 @@ +package org.apache.lucene.analysis.hunspell; + +/* + * Licensed to 
the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.hunspell.Dictionary; +import org.apache.lucene.analysis.hunspell.Stemmer; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.LuceneTestCase; +import org.junit.AfterClass; +import org.junit.BeforeClass; + +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class TestCaseInsensitive extends LuceneTestCase { + private static Stemmer stemmer; + + @BeforeClass + public static void beforeClass() throws Exception { + try (InputStream affixStream = TestCaseInsensitive.class.getResourceAsStream("simple.aff"); + InputStream dictStream = TestCaseInsensitive.class.getResourceAsStream("mixedcase.dic")) { + Dictionary dictionary = new Dictionary(affixStream, Collections.singletonList(dictStream), true); + stemmer = new Stemmer(dictionary); + } + } + + @AfterClass + public static void afterClass() { + stemmer = null; + } + + public void testCaseInsensitivity() { + assertStemsTo("lucene", "lucene", "lucen"); + assertStemsTo("LuCeNe", "lucene", "lucen"); + assertStemsTo("mahoute", "mahout"); + assertStemsTo("MaHoUte", "mahout"); + } + + public void testSimplePrefix() { + 
assertStemsTo("solr", "olr"); + } + + public void testRecursiveSuffix() { + assertStemsTo("abcd", "ab"); + } + + // all forms unmunched from dictionary + public void testAllStems() { + assertStemsTo("ab", "ab"); + assertStemsTo("abc", "ab"); + assertStemsTo("apach", "apach"); + assertStemsTo("apache", "apach"); + assertStemsTo("foo", "foo"); + assertStemsTo("food", "foo"); + assertStemsTo("foos", "foo"); + assertStemsTo("lucen", "lucen"); + assertStemsTo("lucene", "lucen", "lucene"); + assertStemsTo("mahout", "mahout"); + assertStemsTo("mahoute", "mahout"); + assertStemsTo("moo", "moo"); + assertStemsTo("mood", "moo"); + assertStemsTo("olr", "olr"); + assertStemsTo("solr", "olr"); + } + + // some bogus stuff that should not stem (empty lists)! + public void testBogusStems() { + assertStemsTo("abs"); + assertStemsTo("abe"); + assertStemsTo("sab"); + assertStemsTo("sapach"); + assertStemsTo("sapache"); + assertStemsTo("apachee"); + assertStemsTo("sfoo"); + assertStemsTo("sfoos"); + assertStemsTo("fooss"); + assertStemsTo("lucenee"); + assertStemsTo("solre"); + } + + private void assertStemsTo(String s, String... 
expected) { + Arrays.sort(expected); + + List stems = stemmer.stem(s); + String actual[] = new String[stems.size()]; + for (int i = 0; i < actual.length; i++) { + actual[i] = stems.get(i).toString(); + } + Arrays.sort(actual); + + assertArrayEquals(expected, actual); + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java similarity index 97% rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java index e8e0fd0d030..6cbe931d376 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis.hunspell2; +package org.apache.lucene.analysis.hunspell; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -22,6 +22,7 @@ import java.io.IOException; import java.io.InputStream; import java.text.ParseException; +import org.apache.lucene.analysis.hunspell.Dictionary; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java similarity index 75% rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java index eafb1f272cf..af48427d522 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java +++ 
b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis.hunspell2; +package org.apache.lucene.analysis.hunspell; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -26,13 +26,15 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.hunspell.Dictionary; +import org.apache.lucene.analysis.hunspell.HunspellStemFilter; import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.util.TestUtil; import org.junit.AfterClass; import org.junit.BeforeClass; -public class TestHunspell2StemFilter extends BaseTokenStreamTestCase { +public class TestHunspellStemFilter extends BaseTokenStreamTestCase { private static Dictionary dictionary; @BeforeClass @@ -52,13 +54,21 @@ public class TestHunspell2StemFilter extends BaseTokenStreamTestCase { public void testKeywordAttribute() throws IOException { MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome"); tokenizer.setEnableChecks(true); - Hunspell2StemFilter filter = new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)); + HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)); assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1}); // assert with keyword marker tokenizer = whitespaceMockTokenizer("lucene is awesome"); CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true); - filter = new Hunspell2StemFilter(new SetKeywordMarkerFilter(tokenizer, set), dictionary, TestUtil.nextInt(random(), 1, 3)); + filter = new HunspellStemFilter(new 
SetKeywordMarkerFilter(tokenizer, set), dictionary, TestUtil.nextInt(random(), 1, 3)); + assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1}); + } + + /** simple test for longestOnly option */ + public void testLongestOnly() throws IOException { + MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome"); + tokenizer.setEnableChecks(true); + HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, true, TestUtil.nextInt(random(), 1, 3), true); assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1}); } @@ -68,7 +78,7 @@ public class TestHunspell2StemFilter extends BaseTokenStreamTestCase { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); - return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3))); + return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3))); } }; checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER); @@ -79,7 +89,7 @@ public class TestHunspell2StemFilter extends BaseTokenStreamTestCase { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new KeywordTokenizer(); - return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3))); + return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3))); } }; checkOneTerm(a, "", ""); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java index e8e232ce60b..f4302035dbc 100644 --- 
a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java @@ -20,7 +20,6 @@ package org.apache.lucene.analysis.hunspell; import java.io.Reader; import java.io.StringReader; -import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase; @@ -31,17 +30,17 @@ public class TestHunspellStemFilterFactory extends BaseTokenStreamFactoryTestCas public void testStemming() throws Exception { Reader reader = new StringReader("abc"); TokenStream stream = whitespaceMockTokenizer(reader); - stream = tokenFilterFactory("HunspellStem", - "dictionary", "test.dic", - "affix", "test.aff").create(stream); + stream = tokenFilterFactory("HunspellStem", + "dictionary", "simple.dic", + "affix", "simple.aff").create(stream); assertTokenStreamContents(stream, new String[] { "ab" }); } /** Test that bogus arguments result in exception */ public void testBogusArguments() throws Exception { try { - tokenFilterFactory("HunspellStem", - "dictionary", "test.dic", + tokenFilterFactory("HunspellStem", + "dictionary", "simple.dic", "bogusArg", "bogusValue"); fail(); } catch (IllegalArgumentException expected) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java similarity index 95% rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java index 4dec107f314..dca9faa6b16 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java @@ -1,4 +1,4 @@ 
-package org.apache.lucene.analysis.hunspell2; +package org.apache.lucene.analysis.hunspell; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,6 +17,8 @@ package org.apache.lucene.analysis.hunspell2; * limitations under the License. */ +import org.apache.lucene.analysis.hunspell.Dictionary; +import org.apache.lucene.analysis.hunspell.Stemmer; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.LuceneTestCase; import org.junit.AfterClass; diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/broken.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/broken.aff similarity index 100% rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/broken.aff rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/broken.aff diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.aff similarity index 100% rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.aff rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.aff diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.dic similarity index 100% rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.dic rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.dic diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/mixedcase.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/mixedcase.dic new file mode 100644 index 00000000000..9fae253279e --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/mixedcase.dic @@ -0,0 +1,10 
@@ +9 +Ab/C +apach/A +Foo/D +foo/E +Lucen/A +Lucene +mahout/A +Moo/E +olr/B diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.aff similarity index 100% rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.aff rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.aff diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.dic similarity index 100% rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.dic rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.dic diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff deleted file mode 100644 index db9423dcad1..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff +++ /dev/null @@ -1,20 +0,0 @@ -SET UTF-8 -TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ - -SFX A Y 3 -SFX A 0 e n -SFX A 0 e t -SFX A 0 e h - -SFX C Y 2 -SFX C 0 d/C c -SFX C 0 c b - -SFX D Y 1 -SFX D 0 s o - -SFX E Y 1 -SFX E 0 d o - -PFX B Y 1 -PFX B 0 s o \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic deleted file mode 100644 index 12efd8fccb2..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic +++ /dev/null @@ -1,10 +0,0 @@ -9 -lucen/A -lucene -mahout/A -olr/B -ab/C -Apach/A -Foo/E -foo/D -Moo/E \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.aff 
b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.aff deleted file mode 100644 index e4a1b37300f..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.aff +++ /dev/null @@ -1,29 +0,0 @@ -SET UTF-8 -TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ - -FLAG long - -AF 5 -AF AA -AF BB -AF CC -AF DD -AF EE - -SFX AA Y 3 -SFX AA 0 e n -SFX AA 0 e t -SFX AA 0 e h - -SFX CC Y 2 -SFX CC 0 d/3 c -SFX CC 0 c b - -SFX DD Y 1 -SFX DD 0 s o - -SFX EE Y 1 -SFX EE 0 d o - -PFX BB Y 1 -PFX BB 0 s o diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.dic deleted file mode 100644 index bf237662017..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.dic +++ /dev/null @@ -1,9 +0,0 @@ -6 -lucen/1 -lucene -mahout/1 -olr/2 -ab/3 -Apach/1 -foo/4 -Foo/5 diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testOverride.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testOverride.dic deleted file mode 100644 index c1111ef562b..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testOverride.dic +++ /dev/null @@ -1,3 +0,0 @@ -2 -lucen/ABC -bar/A \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff deleted file mode 100644 index 3b780cd1d7b..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff +++ /dev/null @@ -1,24 +0,0 @@ -SET UTF-8 -TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ - -SFX A Y 3 -SFX A 0 e n -SFX A 0 e t -SFX A 0 e h - -SFX C Y 2 -SFX C 0 d/C c -SFX C 0 c b - -SFX D Y 1 -SFX D 0 s o - -SFX E 
Y 1 -SFX E 0 d o - -PFX B Y 1 -PFX B 0 s o - -#wrong rule (only 4 elements) -PFX A0 Y 1 -PFX A0 0 a \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java deleted file mode 100644 index d95e2be04b6..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java +++ /dev/null @@ -1,50 +0,0 @@ -package org.apache.lucene.analysis.hunspell2; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase; - -/** - * Simple tests to ensure the Hunspell stemmer loads from factory - */ -public class TestHunspell2StemFilterFactory extends BaseTokenStreamFactoryTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("abc"); - TokenStream stream = whitespaceMockTokenizer(reader); - stream = tokenFilterFactory("Hunspell2Stem", - "dictionary", "simple.dic", - "affix", "simple.aff").create(stream); - assertTokenStreamContents(stream, new String[] { "ab" }); - } - - /** Test that bogus arguments result in exception */ - public void testBogusArguments() throws Exception { - try { - tokenFilterFactory("Hunspell2Stem", - "dictionary", "simple.dic", - "bogusArg", "bogusValue"); - fail(); - } catch (IllegalArgumentException expected) { - assertTrue(expected.getMessage().contains("Unknown parameters")); - } - } -}