From c4f4beb27e6cb636b0b151b4288f2230e350adc4 Mon Sep 17 00:00:00 2001
From: Robert Muir
Date: Thu, 27 Feb 2014 20:19:27 +0000
Subject: [PATCH] LUCENE-5468: hunspell2 -> hunspell (with previous options and
tests)
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572718 13f79535-47bb-0310-9956-ffa450edef68
---
.../{hunspell2 => hunspell}/Dictionary.java | 85 ++-
.../analysis/hunspell/HunspellAffix.java | 157 ------
.../analysis/hunspell/HunspellDictionary.java | 507 ------------------
.../analysis/hunspell/HunspellStemFilter.java | 89 ++-
.../hunspell/HunspellStemFilterFactory.java | 62 +--
.../analysis/hunspell/HunspellStemmer.java | 392 --------------
.../analysis/hunspell/HunspellWord.java | 63 ---
.../ISO8859_14Decoder.java | 2 +-
.../{hunspell2 => hunspell}/Stemmer.java | 28 +-
.../hunspell2/Hunspell2StemFilter.java | 137 -----
.../hunspell2/Hunspell2StemFilterFactory.java | 80 ---
.../lucene/analysis/hunspell2/package.html | 26 -
...he.lucene.analysis.util.TokenFilterFactory | 1 -
.../analysis/core/TestRandomChains.java | 12 +-
.../hunspell/HunspellDictionaryTest.java | 201 -------
.../hunspell/HunspellStemFilterTest.java | 92 ----
.../hunspell/HunspellStemmerTest.java | 137 -----
.../TestAllDictionaries.java | 20 +-
.../hunspell/TestCaseInsensitive.java | 110 ++++
.../TestDictionary.java | 3 +-
.../TestHunspellStemFilter.java} | 22 +-
.../TestHunspellStemFilterFactory.java | 11 +-
.../{hunspell2 => hunspell}/TestStemmer.java | 4 +-
.../{hunspell2 => hunspell}/broken.aff | 0
.../{hunspell2 => hunspell}/compressed.aff | 0
.../{hunspell2 => hunspell}/compressed.dic | 0
.../lucene/analysis/hunspell/mixedcase.dic | 10 +
.../{hunspell2 => hunspell}/simple.aff | 0
.../{hunspell2 => hunspell}/simple.dic | 0
.../apache/lucene/analysis/hunspell/test.aff | 20 -
.../apache/lucene/analysis/hunspell/test.dic | 10 -
.../analysis/hunspell/testCompressed.aff | 29 -
.../analysis/hunspell/testCompressed.dic | 9 -
.../lucene/analysis/hunspell/testOverride.dic | 3 -
.../analysis/hunspell/testWrongAffixRule.aff | 24 -
.../TestHunspell2StemFilterFactory.java | 50 --
36 files changed, 320 insertions(+), 2076 deletions(-)
rename lucene/analysis/common/src/java/org/apache/lucene/analysis/{hunspell2 => hunspell}/Dictionary.java (90%)
delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java
delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java
rename lucene/analysis/common/src/java/org/apache/lucene/analysis/{hunspell2 => hunspell}/ISO8859_14Decoder.java (98%)
rename lucene/analysis/common/src/java/org/apache/lucene/analysis/{hunspell2 => hunspell}/Stemmer.java (92%)
delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java
delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java
delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html
delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java
delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java
rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/TestAllDictionaries.java (93%)
create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java
rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/TestDictionary.java (97%)
rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2/TestHunspell2StemFilter.java => hunspell/TestHunspellStemFilter.java} (75%)
rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/TestStemmer.java (95%)
rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/broken.aff (100%)
rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/compressed.aff (100%)
rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/compressed.dic (100%)
create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/mixedcase.dic
rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/simple.aff (100%)
rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/simple.dic (100%)
delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff
delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic
delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.aff
delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.dic
delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testOverride.dic
delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff
delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
similarity index 90%
rename from lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
rename to lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
index b9f9c82c2f5..7bbf27fb817 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis.hunspell2;
+package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,7 +17,6 @@ package org.apache.lucene.analysis.hunspell2;
* limitations under the License.
*/
-import org.apache.lucene.analysis.util.CharArrayMap;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
@@ -28,14 +27,19 @@ import org.apache.lucene.util.OfflineSorter;
import org.apache.lucene.util.OfflineSorter.ByteSequencesReader;
import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter;
import org.apache.lucene.util.UnicodeUtil;
-import org.apache.lucene.util.Version;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntSequenceOutputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
-import java.io.*;
+import java.io.BufferedInputStream;
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.LineNumberReader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
@@ -71,27 +75,27 @@ public class Dictionary {
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
- public FST prefixes;
- public FST suffixes;
+ FST prefixes;
+ FST suffixes;
// all Patterns used by prefixes and suffixes. these are typically re-used across
// many affix stripping rules. so these are deduplicated, to save RAM.
// TODO: maybe don't use Pattern for the condition check...
// TODO: when we cut over Affix to FST, just store integer index to this.
- public ArrayList patterns = new ArrayList<>();
+ ArrayList patterns = new ArrayList<>();
// the entries in the .dic file, mapping to their set of flags.
// the fst output is the ordinal for flagLookup
- public FST words;
+ FST words;
// the list of unique flagsets (wordforms). theoretically huge, but practically
// small (e.g. for polish this is 756), otherwise humans wouldn't be able to deal with it either.
- public BytesRefHash flagLookup = new BytesRefHash();
+ BytesRefHash flagLookup = new BytesRefHash();
// the list of unique strip affixes.
- public BytesRefHash stripLookup = new BytesRefHash();
+ BytesRefHash stripLookup = new BytesRefHash();
// 8 bytes per affix
- public byte[] affixData = new byte[64];
+ byte[] affixData = new byte[64];
private int currentAffix = 0;
private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
@@ -100,7 +104,11 @@ public class Dictionary {
private int aliasCount = 0;
private final File tempDir = OfflineSorter.defaultTempDir(); // TODO: make this configurable?
-
+
+ public static final int IGNORE_CASE = 1;
+
+ boolean ignoreCase;
+
/**
* Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
* and dictionary files.
@@ -112,6 +120,21 @@ public class Dictionary {
* @throws ParseException Can be thrown if the content of the files does not meet expected formats
*/
public Dictionary(InputStream affix, InputStream dictionary) throws IOException, ParseException {
+ this(affix, Collections.singletonList(dictionary), false);
+ }
+
+ /**
+ * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
+ * and dictionary files.
+ * You have to close the provided InputStreams yourself.
+ *
+ * @param affix InputStream for reading the hunspell affix file (won't be closed).
+ * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed).
+ * @throws IOException Can be thrown while reading from the InputStreams
+ * @throws ParseException Can be thrown if the content of the files does not meet expected formats
+ */
+ public Dictionary(InputStream affix, List dictionaries, boolean ignoreCase) throws IOException, ParseException {
+ this.ignoreCase = ignoreCase;
BufferedInputStream buffered = new BufferedInputStream(affix, 8192);
buffered.mark(8192);
String encoding = getDictionaryEncoding(affix);
@@ -122,7 +145,7 @@ public class Dictionary {
stripLookup.add(new BytesRef()); // no strip -> ord 0
PositiveIntOutputs o = PositiveIntOutputs.getSingleton();
Builder b = new Builder(FST.INPUT_TYPE.BYTE4, o);
- readDictionaryFile(dictionary, decoder, b);
+ readDictionaryFiles(dictionaries, decoder, b);
words = b.finish();
}
@@ -145,7 +168,7 @@ public class Dictionary {
return decodeFlags(flagLookup.get(ord, scratch));
}
- public Integer lookupOrd(char word[], int offset, int length) throws IOException {
+ Integer lookupOrd(char word[], int offset, int length) throws IOException {
final FST.BytesReader bytesReader = words.getBytesReader();
final FST.Arc arc = words.getFirstArc(new FST.Arc());
// Accumulate output as we go
@@ -269,7 +292,6 @@ public class Dictionary {
Util.toUTF32(entry.getKey(), scratch);
List entries = entry.getValue();
IntsRef output = new IntsRef(entries.size());
- int upto = 0;
for (Character c : entries) {
output.ints[output.length++] = c;
}
@@ -480,23 +502,39 @@ public class Dictionary {
}
/**
- * Reads the dictionary file through the provided InputStream, building up the words map
+ * Reads the dictionary file through the provided InputStreams, building up the words map
*
- * @param dictionary InputStream to read the dictionary file through
+ * @param dictionaries InputStreams to read the dictionary file through
* @param decoder CharsetDecoder used to decode the contents of the file
* @throws IOException Can be thrown while reading from the file
*/
- private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder, Builder words) throws IOException {
+ private void readDictionaryFiles(List dictionaries, CharsetDecoder decoder, Builder words) throws IOException {
BytesRef flagsScratch = new BytesRef();
IntsRef scratchInts = new IntsRef();
- BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
- String line = lines.readLine(); // first line is number of entries (approximately, sometimes)
-
File unsorted = File.createTempFile("unsorted", "dat", tempDir);
try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) {
- while ((line = lines.readLine()) != null) {
- writer.write(line.getBytes(IOUtils.CHARSET_UTF_8));
+ for (InputStream dictionary : dictionaries) {
+ BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
+ String line = lines.readLine(); // first line is number of entries (approximately, sometimes)
+
+ while ((line = lines.readLine()) != null) {
+ if (ignoreCase) {
+ int flagSep = line.lastIndexOf('/');
+ if (flagSep == -1) {
+ writer.write(line.toLowerCase(Locale.ROOT).getBytes(IOUtils.CHARSET_UTF_8));
+ } else {
+ StringBuilder sb = new StringBuilder();
+ sb.append(line.substring(0, flagSep).toLowerCase(Locale.ROOT));
+ if (flagSep < line.length()) {
+ sb.append(line.substring(flagSep, line.length()));
+ }
+ writer.write(sb.toString().getBytes(IOUtils.CHARSET_UTF_8));
+ }
+ } else {
+ writer.write(line.getBytes(IOUtils.CHARSET_UTF_8));
+ }
+ }
}
}
File sorted = File.createTempFile("sorted", "dat", tempDir);
@@ -544,6 +582,7 @@ public class Dictionary {
BytesRef currentEntry = new BytesRef();
char currentFlags[] = new char[0];
+ String line;
while (reader.read(scratchLine)) {
line = scratchLine.utf8ToString();
String entry;
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java
deleted file mode 100644
index 97376c0b15e..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java
+++ /dev/null
@@ -1,157 +0,0 @@
-package org.apache.lucene.analysis.hunspell;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.regex.Pattern;
-
-/**
- * Wrapper class representing a hunspell affix
- */
-public class HunspellAffix {
-
- private String append; // the affix itself, what is appended
- private char appendFlags[]; // continuation class flags
- private String strip;
-
- private String condition;
- private Pattern conditionPattern;
-
- private char flag;
-
- private boolean crossProduct;
-
- /**
- * Checks whether the given text matches the conditional pattern on this affix
- *
- * @param text Text to check if it matches the affix's conditional pattern
- * @return {@code true} if the text meets the condition, {@code false} otherwise
- */
- public boolean checkCondition(CharSequence text) {
- return conditionPattern.matcher(text).matches();
- }
-
- /**
- * Returns the append defined for the affix
- *
- * @return Defined append
- */
- public String getAppend() {
- return append;
- }
-
- /**
- * Sets the append defined for the affix
- *
- * @param append Defined append for the affix
- */
- public void setAppend(String append) {
- this.append = append;
- }
-
- /**
- * Returns the flags defined for the affix append
- *
- * @return Flags defined for the affix append
- */
- public char[] getAppendFlags() {
- return appendFlags;
- }
-
- /**
- * Sets the flags defined for the affix append
- *
- * @param appendFlags Flags defined for the affix append
- */
- public void setAppendFlags(char[] appendFlags) {
- this.appendFlags = appendFlags;
- }
-
- /**
- * Returns the stripping characters defined for the affix
- *
- * @return Stripping characters defined for the affix
- */
- public String getStrip() {
- return strip;
- }
-
- /**
- * Sets the stripping characters defined for the affix
- *
- * @param strip Stripping characters defined for the affix
- */
- public void setStrip(String strip) {
- this.strip = strip;
- }
-
- /**
- * Returns the condition that must be met before the affix can be applied
- *
- * @return Condition that must be met before the affix can be applied
- */
- public String getCondition() {
- return condition;
- }
-
- /**
- * Sets the condition that must be met before the affix can be applied
- *
- * @param condition Condition to be met before affix application
- * @param pattern Condition as a regular expression pattern
- */
- public void setCondition(String condition, String pattern) {
- this.condition = condition;
- this.conditionPattern = Pattern.compile(pattern);
- }
-
- /**
- * Returns the affix flag
- *
- * @return Affix flag
- */
- public char getFlag() {
- return flag;
- }
-
- /**
- * Sets the affix flag
- *
- * @param flag Affix flag
- */
- public void setFlag(char flag) {
- this.flag = flag;
- }
-
- /**
- * Returns whether the affix is defined as cross product
- *
- * @return {@code true} if the affix is cross product, {@code false} otherwise
- */
- public boolean isCrossProduct() {
- return crossProduct;
- }
-
- /**
- * Sets whether the affix is defined as cross product
- *
- * @param crossProduct Whether the affix is defined as cross product
- */
- public void setCrossProduct(boolean crossProduct) {
- this.crossProduct = crossProduct;
- }
-}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
deleted file mode 100644
index ccb53f57d29..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java
+++ /dev/null
@@ -1,507 +0,0 @@
-package org.apache.lucene.analysis.hunspell;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.analysis.util.CharArrayMap;
-import org.apache.lucene.util.Version;
-
-import java.io.*;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-import java.text.ParseException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Locale;
-
-/**
- * In-memory structure for the dictionary (.dic) and affix (.aff)
- * data of a hunspell dictionary.
- */
-public class HunspellDictionary {
-
- static final HunspellWord NOFLAGS = new HunspellWord();
-
- private static final String ALIAS_KEY = "AF";
- private static final String PREFIX_KEY = "PFX";
- private static final String SUFFIX_KEY = "SFX";
- private static final String FLAG_KEY = "FLAG";
-
- private static final String NUM_FLAG_TYPE = "num";
- private static final String UTF8_FLAG_TYPE = "UTF-8";
- private static final String LONG_FLAG_TYPE = "long";
-
- private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
- private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
-
- private static final boolean IGNORE_CASE_DEFAULT = false;
- private static final boolean STRICT_AFFIX_PARSING_DEFAULT = true;
-
- private CharArrayMap> words;
- private CharArrayMap> prefixes;
- private CharArrayMap> suffixes;
-
- private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
- private boolean ignoreCase = IGNORE_CASE_DEFAULT;
-
- private final Version version;
-
- private String[] aliases;
- private int aliasCount = 0;
-
- /**
- * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
- * and dictionary files.
- * You have to close the provided InputStreams yourself.
- *
- * @param affix InputStream for reading the hunspell affix file (won't be closed).
- * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
- * @param version Lucene Version
- * @throws IOException Can be thrown while reading from the InputStreams
- * @throws ParseException Can be thrown if the content of the files does not meet expected formats
- */
- public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException {
- this(affix, Arrays.asList(dictionary), version, IGNORE_CASE_DEFAULT);
- }
-
- /**
- * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
- * and dictionary files.
- * You have to close the provided InputStreams yourself.
- *
- * @param affix InputStream for reading the hunspell affix file (won't be closed).
- * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed).
- * @param version Lucene Version
- * @param ignoreCase If true, dictionary matching will be case insensitive
- * @throws IOException Can be thrown while reading from the InputStreams
- * @throws ParseException Can be thrown if the content of the files does not meet expected formats
- */
- public HunspellDictionary(InputStream affix, InputStream dictionary, Version version, boolean ignoreCase) throws IOException, ParseException {
- this(affix, Arrays.asList(dictionary), version, ignoreCase);
- }
-
- /**
- * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
- * and dictionary files.
- * You have to close the provided InputStreams yourself.
- *
- * @param affix InputStream for reading the hunspell affix file (won't be closed).
- * @param dictionaries InputStreams for reading the hunspell dictionary file (won't be closed).
- * @param version Lucene Version
- * @param ignoreCase If true, dictionary matching will be case insensitive
- * @throws IOException Can be thrown while reading from the InputStreams
- * @throws ParseException Can be thrown if the content of the files does not meet expected formats
- */
- public HunspellDictionary(InputStream affix, List dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException {
- this(affix, dictionaries, version, ignoreCase, STRICT_AFFIX_PARSING_DEFAULT);
- }
-
- /**
- * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
- * and dictionary files.
- * You have to close the provided InputStreams yourself.
- *
- * @param affix InputStream for reading the hunspell affix file (won't be closed).
- * @param dictionaries InputStreams for reading the hunspell dictionary file (won't be closed).
- * @param version Lucene Version
- * @param ignoreCase If true, dictionary matching will be case insensitive
- * @param strictAffixParsing Affix strict parsing enabled or not (an error while reading a rule causes exception or is ignored)
- * @throws IOException Can be thrown while reading from the InputStreams
- * @throws ParseException Can be thrown if the content of the files does not meet expected formats
- */
- public HunspellDictionary(InputStream affix, List dictionaries, Version version, boolean ignoreCase, boolean strictAffixParsing) throws IOException, ParseException {
- this.version = version;
- this.ignoreCase = ignoreCase;
- String encoding = getDictionaryEncoding(affix);
- CharsetDecoder decoder = getJavaEncoding(encoding);
- readAffixFile(affix, decoder, strictAffixParsing);
- words = new CharArrayMap>(version, 65535 /* guess */, this.ignoreCase);
- for (InputStream dictionary : dictionaries) {
- readDictionaryFile(dictionary, decoder);
- }
- }
-
- /**
- * Looks up HunspellWords that match the String created from the given char array, offset and length
- *
- * @param word Char array to generate the String from
- * @param offset Offset in the char array that the String starts at
- * @param length Length from the offset that the String is
- * @return List of HunspellWords that match the generated String, or {@code null} if none are found
- */
- public List lookupWord(char word[], int offset, int length) {
- return words.get(word, offset, length);
- }
-
- /**
- * Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length
- *
- * @param word Char array to generate the String from
- * @param offset Offset in the char array that the String starts at
- * @param length Length from the offset that the String is
- * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
- */
- public List lookupPrefix(char word[], int offset, int length) {
- return prefixes.get(word, offset, length);
- }
-
- /**
- * Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length
- *
- * @param word Char array to generate the String from
- * @param offset Offset in the char array that the String starts at
- * @param length Length from the offset that the String is
- * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
- */
- public List lookupSuffix(char word[], int offset, int length) {
- return suffixes.get(word, offset, length);
- }
-
- /**
- * Reads the affix file through the provided InputStream, building up the prefix and suffix maps
- *
- * @param affixStream InputStream to read the content of the affix file from
- * @param decoder CharsetDecoder to decode the content of the file
- * @throws IOException Can be thrown while reading from the InputStream
- */
- private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, boolean strict) throws IOException, ParseException {
- prefixes = new CharArrayMap>(version, 8, ignoreCase);
- suffixes = new CharArrayMap>(version, 8, ignoreCase);
-
- LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder));
- String line = null;
- while ((line = reader.readLine()) != null) {
- if (line.startsWith(ALIAS_KEY)) {
- parseAlias(line);
- } else if (line.startsWith(PREFIX_KEY)) {
- parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, strict);
- } else if (line.startsWith(SUFFIX_KEY)) {
- parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, strict);
- } else if (line.startsWith(FLAG_KEY)) {
- // Assume that the FLAG line comes before any prefix or suffixes
- // Store the strategy so it can be used when parsing the dic file
- flagParsingStrategy = getFlagParsingStrategy(line);
- }
- }
- }
-
- /**
- * Parses a specific affix rule putting the result into the provided affix map
- *
- * @param affixes Map where the result of the parsing will be put
- * @param header Header line of the affix rule
- * @param reader BufferedReader to read the content of the rule from
- * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
- * pattern
- * @throws IOException Can be thrown while reading the rule
- */
- private void parseAffix(CharArrayMap> affixes,
- String header,
- LineNumberReader reader,
- String conditionPattern,
- boolean strict) throws IOException, ParseException {
- String args[] = header.split("\\s+");
-
- boolean crossProduct = args[2].equals("Y");
-
- int numLines = Integer.parseInt(args[3]);
- for (int i = 0; i < numLines; i++) {
- String line = reader.readLine();
- String ruleArgs[] = line.split("\\s+");
-
- if (ruleArgs.length < 5) {
- if (strict) {
- throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber());
- }
- continue;
- }
-
- HunspellAffix affix = new HunspellAffix();
-
- affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1]));
- affix.setStrip(ruleArgs[2].equals("0") ? "" : ruleArgs[2]);
-
- String affixArg = ruleArgs[3];
-
- int flagSep = affixArg.lastIndexOf('/');
- if (flagSep != -1) {
- String flagPart = affixArg.substring(flagSep + 1);
-
- if (aliasCount > 0) {
- flagPart = getAliasValue(Integer.parseInt(flagPart));
- }
-
- char appendFlags[] = flagParsingStrategy.parseFlags(flagPart);
- Arrays.sort(appendFlags);
- affix.setAppendFlags(appendFlags);
- affix.setAppend(affixArg.substring(0, flagSep));
- } else {
- affix.setAppend(affixArg);
- }
-
- String condition = ruleArgs[4];
- affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition));
- affix.setCrossProduct(crossProduct);
-
- List list = affixes.get(affix.getAppend());
- if (list == null) {
- list = new ArrayList();
- affixes.put(affix.getAppend(), list);
- }
-
- list.add(affix);
- }
- }
-
- /**
- * Parses the encoding specified in the affix file readable through the provided InputStream
- *
- * @param affix InputStream for reading the affix file
- * @return Encoding specified in the affix file
- * @throws IOException Can be thrown while reading from the InputStream
- * @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET }
- */
- private String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
- final StringBuilder encoding = new StringBuilder();
- for (;;) {
- encoding.setLength(0);
- int ch;
- while ((ch = affix.read()) >= 0) {
- if (ch == '\n') {
- break;
- }
- if (ch != '\r') {
- encoding.append((char)ch);
- }
- }
- if (
- encoding.length() == 0 || encoding.charAt(0) == '#' ||
- // this test only at the end as ineffective but would allow lines only containing spaces:
- encoding.toString().trim().length() == 0
- ) {
- if (ch < 0) {
- throw new ParseException("Unexpected end of affix file.", 0);
- }
- continue;
- }
- if ("SET ".equals(encoding.substring(0, 4))) {
- // cleanup the encoding string, too (whitespace)
- return encoding.substring(4).trim();
- }
- throw new ParseException("The first non-comment line in the affix file must "+
- "be a 'SET charset', was: '" + encoding +"'", 0);
- }
- }
-
- /**
- * Retrieves the CharsetDecoder for the given encoding. Note, This isn't perfect as I think ISCII-DEVANAGARI and
- * MICROSOFT-CP1251 etc are allowed...
- *
- * @param encoding Encoding to retrieve the CharsetDecoder for
- * @return CharSetDecoder for the given encoding
- */
- private CharsetDecoder getJavaEncoding(String encoding) {
- Charset charset = Charset.forName(encoding);
- return charset.newDecoder();
- }
-
- /**
- * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definition line taken from the affix file
- *
- * @param flagLine Line containing the flag information
- * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition
- */
- private FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
- String flagType = flagLine.substring(5);
-
- if (NUM_FLAG_TYPE.equals(flagType)) {
- return new NumFlagParsingStrategy();
- } else if (UTF8_FLAG_TYPE.equals(flagType)) {
- return new SimpleFlagParsingStrategy();
- } else if (LONG_FLAG_TYPE.equals(flagType)) {
- return new DoubleASCIIFlagParsingStrategy();
- }
-
- throw new IllegalArgumentException("Unknown flag type: " + flagType);
- }
-
- /**
- * Reads the dictionary file through the provided InputStream, building up the words map
- *
- * @param dictionary InputStream to read the dictionary file through
- * @param decoder CharsetDecoder used to decode the contents of the file
- * @throws IOException Can be thrown while reading from the file
- */
- private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder) throws IOException {
- BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder));
- // TODO: don't create millions of strings.
- String line = reader.readLine(); // first line is number of entries
- int numEntries = Integer.parseInt(line);
-
- // TODO: the flags themselves can be double-chars (long) or also numeric
- // either way the trick is to encode them as char... but they must be parsed differently
- while ((line = reader.readLine()) != null) {
- String entry;
- HunspellWord wordForm;
-
- int flagSep = line.lastIndexOf('/');
- if (flagSep == -1) {
- wordForm = NOFLAGS;
- entry = line;
- } else {
- // note, there can be comments (morph description) after a flag.
- // we should really look for any whitespace
- int end = line.indexOf('\t', flagSep);
- if (end == -1)
- end = line.length();
-
- String flagPart = line.substring(flagSep + 1, end);
- if (aliasCount > 0) {
- flagPart = getAliasValue(Integer.parseInt(flagPart));
- }
-
- wordForm = new HunspellWord(flagParsingStrategy.parseFlags(flagPart));
- Arrays.sort(wordForm.getFlags());
- entry = line.substring(0, flagSep);
- }
- if(ignoreCase) {
- entry = entry.toLowerCase(Locale.ROOT);
- }
-
- List entries = new ArrayList();
- entries.add(wordForm);
- words.put(entry, entries);
- }
- }
-
- public Version getVersion() {
- return version;
- }
-
- private void parseAlias(String line) {
- String ruleArgs[] = line.split("\\s+");
- if (aliases == null) {
- //first line should be the aliases count
- final int count = Integer.parseInt(ruleArgs[1]);
- aliases = new String[count];
- } else {
- aliases[aliasCount++] = ruleArgs[1];
- }
- }
-
- private String getAliasValue(int id) {
- try {
- return aliases[id - 1];
- } catch (IndexOutOfBoundsException ex) {
- throw new IllegalArgumentException("Bad flag alias number:" + id, ex);
- }
- }
-
- /**
- * Abstraction of the process of parsing flags taken from the affix and dic files
- */
- private static abstract class FlagParsingStrategy {
-
- /**
- * Parses the given String into a single flag
- *
- * @param rawFlag String to parse into a flag
- * @return Parsed flag
- */
- char parseFlag(String rawFlag) {
- return parseFlags(rawFlag)[0];
- }
-
- /**
- * Parses the given String into multiple flags
- *
- * @param rawFlags String to parse into flags
- * @return Parsed flags
- */
- abstract char[] parseFlags(String rawFlags);
- }
-
- /**
- * Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as a individual flags.
- * Can be used with both the ASCII and UTF-8 flag types.
- */
- private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {
- /**
- * {@inheritDoc}
- */
- @Override
- public char[] parseFlags(String rawFlags) {
- return rawFlags.toCharArray();
- }
- }
-
- /**
- * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form. In the case
- * of multiple flags, each number is separated by a comma.
- */
- private static class NumFlagParsingStrategy extends FlagParsingStrategy {
- /**
- * {@inheritDoc}
- */
- @Override
- public char[] parseFlags(String rawFlags) {
- String[] rawFlagParts = rawFlags.trim().split(",");
- char[] flags = new char[rawFlagParts.length];
-
- for (int i = 0; i < rawFlagParts.length; i++) {
- // note, removing the trailing X/leading I for nepali... what is the rule here?!
- flags[i] = (char) Integer.parseInt(rawFlagParts[i].replaceAll("[^0-9]", ""));
- }
-
- return flags;
- }
- }
-
- /**
- * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes
- * must be combined into a single character.
- *
- * TODO (rmuir) test
- */
- private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy {
-
- /**
- * {@inheritDoc}
- */
- @Override
- public char[] parseFlags(String rawFlags) {
- if (rawFlags.length() == 0) {
- return new char[0];
- }
-
- StringBuilder builder = new StringBuilder();
- for (int i = 0; i < rawFlags.length(); i+=2) {
- char cookedFlag = (char) ((int) rawFlags.charAt(i) + (int) rawFlags.charAt(i + 1));
- builder.append(cookedFlag);
- }
-
- char flags[] = new char[builder.length()];
- builder.getChars(0, builder.length(), flags, 0);
- return flags;
- }
- }
-
- public boolean isIgnoreCase() {
- return ignoreCase;
- }
-}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java
index 4ff0a741ad8..a9b512b7bbd 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java
@@ -18,14 +18,16 @@ package org.apache.lucene.analysis.hunspell;
*/
import java.io.IOException;
+import java.util.Collections;
+import java.util.Comparator;
import java.util.List;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.hunspell.HunspellStemmer.Stem;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.CharsRef;
/**
* TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple
@@ -41,71 +43,83 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
* {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory}
*
*
- *
+ * @lucene.experimental
*/
public final class HunspellStemFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
- private final HunspellStemmer stemmer;
+ private final Stemmer stemmer;
- private List buffer;
+ private List buffer;
private State savedState;
private final boolean dedup;
+ private final boolean longestOnly;
/** Create a {@link HunspellStemFilter} which deduplicates stems and has a maximum
* recursion level of 2.
- * @see #HunspellStemFilter(TokenStream, HunspellDictionary, int) */
- public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary) {
+ * @see #HunspellStemFilter(TokenStream, Dictionary, int) */
+ public HunspellStemFilter(TokenStream input, Dictionary dictionary) {
this(input, dictionary, 2);
}
/**
- * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
- * HunspellDictionary
+ * Creates a new Hunspell2StemFilter that will stem tokens from the given TokenStream using affix rules in the provided
+ * Dictionary
*
* @param input TokenStream whose tokens will be stemmed
* @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
* @param recursionCap maximum level of recursion stemmer can go into, defaults to 2
*/
- public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, int recursionCap) {
+ public HunspellStemFilter(TokenStream input, Dictionary dictionary, int recursionCap) {
this(input, dictionary, true, recursionCap);
}
/** Create a {@link HunspellStemFilter} which has a maximum recursion level of 2.
- * @see #HunspellStemFilter(TokenStream, HunspellDictionary, boolean, int) */
- public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup) {
+ * @see #HunspellStemFilter(TokenStream, Dictionary, boolean, int) */
+ public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup) {
this(input, dictionary, dedup, 2);
}
-
+
/**
* Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
- * HunspellDictionary
+ * Dictionary
*
* @param input TokenStream whose tokens will be stemmed
* @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
* @param dedup true if only unique terms should be output.
* @param recursionCap maximum level of recursion stemmer can go into, defaults to 2
*/
- public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup, int recursionCap) {
- super(input);
- this.dedup = dedup;
- this.stemmer = new HunspellStemmer(dictionary, recursionCap);
+ public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap) {
+ this(input, dictionary, dedup, recursionCap, false);
}
/**
- * {@inheritDoc}
+ * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
+ * Dictionary
+ *
+ * @param input TokenStream whose tokens will be stemmed
+ * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
+ * @param dedup true if only unique terms should be output.
+ * @param recursionCap maximum level of recursion stemmer can go into, defaults to 2
+ * @param longestOnly true if only the longest term should be output.
*/
+ public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap, boolean longestOnly) {
+ super(input);
+ this.dedup = dedup && longestOnly == false; // don't waste time deduping if longestOnly is set
+ this.stemmer = new Stemmer(dictionary, recursionCap);
+ this.longestOnly = longestOnly;
+ }
+
@Override
public boolean incrementToken() throws IOException {
if (buffer != null && !buffer.isEmpty()) {
- Stem nextStem = buffer.remove(0);
+ CharsRef nextStem = buffer.remove(0);
restoreState(savedState);
posIncAtt.setPositionIncrement(0);
- termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength());
- termAtt.setLength(nextStem.getStemLength());
+ termAtt.setEmpty().append(nextStem);
return true;
}
@@ -122,24 +136,41 @@ public final class HunspellStemFilter extends TokenFilter {
if (buffer.isEmpty()) { // we do not know this word, return it unchanged
return true;
}
+
+ if (longestOnly && buffer.size() > 1) {
+ Collections.sort(buffer, lengthComparator);
+ }
- Stem stem = buffer.remove(0);
- termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength());
- termAtt.setLength(stem.getStemLength());
+ CharsRef stem = buffer.remove(0);
+ termAtt.setEmpty().append(stem);
- if (!buffer.isEmpty()) {
- savedState = captureState();
+ if (longestOnly) {
+ buffer.clear();
+ } else {
+ if (!buffer.isEmpty()) {
+ savedState = captureState();
+ }
}
return true;
}
- /**
- * {@inheritDoc}
- */
@Override
public void reset() throws IOException {
super.reset();
buffer = null;
}
+
+ static final Comparator lengthComparator = new Comparator() {
+ @Override
+ public int compare(CharsRef o1, CharsRef o2) {
+ int cmp = Integer.compare(o2.length, o1.length);
+ if (cmp == 0) {
+ // tie break on text
+ return o2.compareTo(o1);
+ } else {
+ return cmp;
+ }
+ }
+ };
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java
index 63e621c2ab9..e632b489d51 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java
@@ -31,89 +31,75 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.util.IOUtils;
/**
- * TokenFilterFactory that creates instances of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}.
- * Example config for British English including a custom dictionary, case insensitive matching:
+ * TokenFilterFactory that creates instances of {@link HunspellStemFilter}.
+ * Example config for British English:
*
* <filter class="solr.HunspellStemFilterFactory"
- * dictionary="en_GB.dic,my_custom.dic"
- * affix="en_GB.aff"
- * ignoreCase="true" />
+ * dictionary="en_GB.dic,my_custom.dic"
+ * affix="en_GB.aff"
+ * ignoreCase="false"
+ * longestOnly="false" />
* Both parameters dictionary and affix are mandatory.
- *
- * The parameter ignoreCase (true/false) controls whether matching is case sensitive or not. Default false.
- *
- * The parameter strictAffixParsing (true/false) controls whether the affix parsing is strict or not. Default true.
- * If strict an error while reading an affix rule causes a ParseException, otherwise is ignored.
- *
* Dictionaries for many languages are available through the OpenOffice project.
*
* See http://wiki.apache.org/solr/Hunspell
+ * @lucene.experimental
*/
public class HunspellStemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
- private static final String PARAM_DICTIONARY = "dictionary";
- private static final String PARAM_AFFIX = "affix";
- private static final String PARAM_IGNORE_CASE = "ignoreCase";
- private static final String PARAM_STRICT_AFFIX_PARSING = "strictAffixParsing";
+ private static final String PARAM_DICTIONARY = "dictionary";
+ private static final String PARAM_AFFIX = "affix";
private static final String PARAM_RECURSION_CAP = "recursionCap";
+ private static final String PARAM_IGNORE_CASE = "ignoreCase";
+ private static final String PARAM_LONGEST_ONLY = "longestOnly";
- private final String dictionaryArg;
+ private final String dictionaryFiles;
private final String affixFile;
private final boolean ignoreCase;
- private final boolean strictAffixParsing;
- private HunspellDictionary dictionary;
+ private final boolean longestOnly;
+ private Dictionary dictionary;
private int recursionCap;
/** Creates a new HunspellStemFilterFactory */
public HunspellStemFilterFactory(Map args) {
super(args);
- assureMatchVersion();
- dictionaryArg = require(args, PARAM_DICTIONARY);
+ dictionaryFiles = require(args, PARAM_DICTIONARY);
affixFile = get(args, PARAM_AFFIX);
ignoreCase = getBoolean(args, PARAM_IGNORE_CASE, false);
- strictAffixParsing = getBoolean(args, PARAM_STRICT_AFFIX_PARSING, true);
recursionCap = getInt(args, PARAM_RECURSION_CAP, 2);
+ longestOnly = getBoolean(args, PARAM_LONGEST_ONLY, false);
+ // this isnt necessary: we properly load all dictionaries.
+ // but recognize and ignore for back compat
+ getBoolean(args, "strictAffixParsing", true);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
}
- /**
- * Loads the hunspell dictionary and affix files defined in the configuration
- *
- * @param loader ResourceLoader used to load the files
- */
@Override
public void inform(ResourceLoader loader) throws IOException {
- String dictionaryFiles[] = dictionaryArg.split(",");
+ String dicts[] = dictionaryFiles.split(",");
InputStream affix = null;
List dictionaries = new ArrayList();
try {
dictionaries = new ArrayList();
- for (String file : dictionaryFiles) {
+ for (String file : dicts) {
dictionaries.add(loader.openResource(file));
}
affix = loader.openResource(affixFile);
- this.dictionary = new HunspellDictionary(affix, dictionaries, luceneMatchVersion, ignoreCase, strictAffixParsing);
+ this.dictionary = new Dictionary(affix, dictionaries, ignoreCase);
} catch (ParseException e) {
- throw new IOException("Unable to load hunspell data! [dictionary=" + dictionaryArg + ",affix=" + affixFile + "]", e);
+ throw new IOException("Unable to load hunspell data! [dictionary=" + dictionaries + ",affix=" + affixFile + "]", e);
} finally {
IOUtils.closeWhileHandlingException(affix);
IOUtils.closeWhileHandlingException(dictionaries);
}
}
- /**
- * Creates an instance of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter} that will filter the given
- * TokenStream
- *
- * @param tokenStream TokenStream that will be filtered
- * @return HunspellStemFilter that filters the TokenStream
- */
@Override
public TokenStream create(TokenStream tokenStream) {
- return new HunspellStemFilter(tokenStream, dictionary, recursionCap);
+ return new HunspellStemFilter(tokenStream, dictionary, true, recursionCap, longestOnly);
}
}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
deleted file mode 100644
index ae2948284d6..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java
+++ /dev/null
@@ -1,392 +0,0 @@
-package org.apache.lucene.analysis.hunspell;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.analysis.util.CharacterUtils;
-import org.apache.lucene.util.Version;
-
-/**
- * HunspellStemmer uses the affix rules declared in the HunspellDictionary to generate one or more stems for a word. It
- * conforms to the algorithm in the original hunspell algorithm, including recursive suffix stripping.
- */
-public class HunspellStemmer {
- private final int recursionCap;
- private final HunspellDictionary dictionary;
- private final StringBuilder segment = new StringBuilder();
- private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_CURRENT);
-
- /**
- * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems. Uses the
- * default recursion cap of 2
(based on Hunspell documentation).
- *
- * @param dictionary HunspellDictionary that will be used to create the stems
- */
- public HunspellStemmer(HunspellDictionary dictionary) {
- this(dictionary, 2);
- }
-
- /**
- * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems
- *
- * @param dictionary HunspellDictionary that will be used to create the stems
- * @param recursionCap maximum level of recursion stemmer can go into
- */
- public HunspellStemmer(HunspellDictionary dictionary, int recursionCap) {
- this.dictionary = dictionary;
- this.recursionCap = recursionCap;
- }
-
- /**
- * Find the stem(s) of the provided word
- *
- * @param word Word to find the stems for
- * @return List of stems for the word
- */
- public List stem(String word) {
- return stem(word.toCharArray(), word.length());
- }
-
- /**
- * Find the stem(s) of the provided word
- *
- * @param word Word to find the stems for
- * @return List of stems for the word
- */
- public List stem(char word[], int length) {
- List stems = new ArrayList();
- if (dictionary.lookupWord(word, 0, length) != null) {
- stems.add(new Stem(word, length));
- }
- stems.addAll(stem(word, length, null, 0));
- return stems;
- }
-
- /**
- * Find the unique stem(s) of the provided word
- *
- * @param word Word to find the stems for
- * @return List of stems for the word
- */
- public List uniqueStems(char word[], int length) {
- List stems = new ArrayList();
- CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, dictionary.isIgnoreCase());
- if (dictionary.lookupWord(word, 0, length) != null) {
- stems.add(new Stem(word, length));
- terms.add(word);
- }
- List otherStems = stem(word, length, null, 0);
- for (Stem s : otherStems) {
- if (!terms.contains(s.stem)) {
- stems.add(s);
- terms.add(s.stem);
- }
- }
- return stems;
- }
-
- // ================================================= Helper Methods ================================================
-
- /**
- * Generates a list of stems for the provided word
- *
- * @param word Word to generate the stems for
- * @param flags Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step
- * @param recursionDepth Level of recursion this stemming step is at
- * @return List of stems, pr an empty if no stems are found
- */
- private List stem(char word[], int length, char[] flags, int recursionDepth) {
- List stems = new ArrayList();
-
- for (int i = 0; i < length; i++) {
- List suffixes = dictionary.lookupSuffix(word, i, length - i);
- if (suffixes == null) {
- continue;
- }
-
- for (HunspellAffix suffix : suffixes) {
- if (hasCrossCheckedFlag(suffix.getFlag(), flags)) {
- int deAffixedLength = length - suffix.getAppend().length();
- // TODO: can we do this in-place?
- String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString();
-
- List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth);
- for (Stem stem : stemList) {
- stem.addSuffix(suffix);
- }
-
- stems.addAll(stemList);
- }
- }
- }
-
- for (int i = length - 1; i >= 0; i--) {
- List prefixes = dictionary.lookupPrefix(word, 0, i);
- if (prefixes == null) {
- continue;
- }
-
- for (HunspellAffix prefix : prefixes) {
- if (hasCrossCheckedFlag(prefix.getFlag(), flags)) {
- int deAffixedStart = prefix.getAppend().length();
- int deAffixedLength = length - deAffixedStart;
-
- String strippedWord = new StringBuilder().append(prefix.getStrip())
- .append(word, deAffixedStart, deAffixedLength)
- .toString();
-
- List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth);
- for (Stem stem : stemList) {
- stem.addPrefix(prefix);
- }
-
- stems.addAll(stemList);
- }
- }
- }
-
- return stems;
- }
-
- /**
- * Applies the affix rule to the given word, producing a list of stems if any are found
- *
- * @param strippedWord Word the affix has been removed and the strip added
- * @param affix HunspellAffix representing the affix rule itself
- * @param recursionDepth Level of recursion this stemming step is at
- * @return List of stems for the word, or an empty list if none are found
- */
- @SuppressWarnings("unchecked")
- public List applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) {
- if(dictionary.isIgnoreCase()) {
- charUtils.toLowerCase(strippedWord, 0, strippedWord.length);
- }
- segment.setLength(0);
- segment.append(strippedWord, 0, length);
- if (!affix.checkCondition(segment)) {
- return Collections.EMPTY_LIST;
- }
-
- List stems = new ArrayList();
-
- List words = dictionary.lookupWord(strippedWord, 0, length);
- if (words != null) {
- for (HunspellWord hunspellWord : words) {
- if (hunspellWord.hasFlag(affix.getFlag())) {
- stems.add(new Stem(strippedWord, length));
- }
- }
- }
-
- if (affix.isCrossProduct() && recursionDepth < recursionCap) {
- stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), ++recursionDepth));
- }
-
- return stems;
- }
-
- /**
- * Checks if the given flag cross checks with the given array of flags
- *
- * @param flag Flag to cross check with the array of flags
- * @param flags Array of flags to cross check against. Can be {@code null}
- * @return {@code true} if the flag is found in the array or the array is {@code null}, {@code false} otherwise
- */
- private boolean hasCrossCheckedFlag(char flag, char[] flags) {
- return flags == null || Arrays.binarySearch(flags, flag) >= 0;
- }
-
- /**
- * Stem represents all information known about a stem of a word. This includes the stem, and the prefixes and suffixes
- * that were used to change the word into the stem.
- */
- public static class Stem {
-
- private final List prefixes = new ArrayList();
- private final List suffixes = new ArrayList();
- private final char stem[];
- private final int stemLength;
-
- /**
- * Creates a new Stem wrapping the given word stem
- *
- * @param stem Stem of a word
- */
- public Stem(char stem[], int stemLength) {
- this.stem = stem;
- this.stemLength = stemLength;
- }
-
- /**
- * Adds a prefix to the list of prefixes used to generate this stem. Because it is assumed that prefixes are added
- * depth first, the prefix is added to the front of the list
- *
- * @param prefix Prefix to add to the list of prefixes for this stem
- */
- public void addPrefix(HunspellAffix prefix) {
- prefixes.add(0, prefix);
- }
-
- /**
- * Adds a suffix to the list of suffixes used to generate this stem. Because it is assumed that suffixes are added
- * depth first, the suffix is added to the end of the list
- *
- * @param suffix Suffix to add to the list of suffixes for this stem
- */
- public void addSuffix(HunspellAffix suffix) {
- suffixes.add(suffix);
- }
-
- /**
- * Returns the list of prefixes used to generate the stem
- *
- * @return List of prefixes used to generate the stem or an empty list if no prefixes were required
- */
- public List getPrefixes() {
- return prefixes;
- }
-
- /**
- * Returns the list of suffixes used to generate the stem
- *
- * @return List of suffixes used to generate the stem or an empty list if no suffixes were required
- */
- public List getSuffixes() {
- return suffixes;
- }
-
- /**
- * Returns the actual word stem itself
- *
- * @return Word stem itself
- */
- public char[] getStem() {
- return stem;
- }
-
- /**
- * @return the stemLength
- */
- public int getStemLength() {
- return stemLength;
- }
-
- public String getStemString() {
- return new String(stem, 0, stemLength);
- }
-
- }
-
-
- // ================================================= Entry Point ===================================================
-
- /*
- * HunspellStemmer entry point. Accepts two arguments: location of affix file and location of dic file
- *
- * @param args Program arguments. Should contain location of affix file and location of dic file
- * @throws IOException Can be thrown while reading from the files
- * @throws ParseException Can be thrown while parsing the files
- public static void main(String[] args) throws IOException, ParseException {
- boolean ignoreCase = false;
- int offset = 0;
-
- if (args.length < 2) {
- System.out.println("usage: HunspellStemmer [-i] ");
- System.exit(1);
- }
-
- if(args[offset].equals("-i")) {
- ignoreCase = true;
- System.out.println("Ignoring case. All stems will be returned lowercased");
- offset++;
- }
-
- InputStream affixInputStream = new FileInputStream(args[offset++]);
- InputStream dicInputStream = new FileInputStream(args[offset++]);
-
- // :Post-Release-Update-Version.LUCENE_XY:
- HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_50, ignoreCase);
-
- affixInputStream.close();
- dicInputStream.close();
-
- HunspellStemmer stemmer = new HunspellStemmer(dictionary);
-
- Scanner scanner = new Scanner(System.in, Charset.defaultCharset().name());
-
- System.out.print("> ");
- while (scanner.hasNextLine()) {
- String word = scanner.nextLine();
-
- if ("exit".equals(word)) {
- break;
- }
-
- printStemResults(word, stemmer.stem(word.toCharArray(), word.length()));
-
- System.out.print("> ");
- }
- }
-
- * Prints the results of the stemming of a word
- *
- * @param originalWord Word that has been stemmed
- * @param stems Stems of the word
- private static void printStemResults(String originalWord, List stems) {
- StringBuilder builder = new StringBuilder().append("stem(").append(originalWord).append(")").append("\n");
-
- for (Stem stem : stems) {
- builder.append("- ").append(stem.getStem()).append(": ");
-
- for (HunspellAffix prefix : stem.getPrefixes()) {
- builder.append(prefix.getAppend()).append("+");
-
- if (hasText(prefix.getStrip())) {
- builder.append(prefix.getStrip()).append("-");
- }
- }
-
- builder.append(stem.getStem());
-
- for (HunspellAffix suffix : stem.getSuffixes()) {
- if (hasText(suffix.getStrip())) {
- builder.append("-").append(suffix.getStrip());
- }
-
- builder.append("+").append(suffix.getAppend());
- }
- builder.append("\n");
- }
-
- System.out.println(builder);
- }
-
- * Simple utility to check if the given String has any text
- *
- * @param str String to check if it has any text
- * @return {@code true} if the String has text, {@code false} otherwise
- private static boolean hasText(String str) {
- return str != null && str.length() > 0;
- }
- */
-}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java
deleted file mode 100644
index fe216d30dc8..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java
+++ /dev/null
@@ -1,63 +0,0 @@
-package org.apache.lucene.analysis.hunspell;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.Arrays;
-
-/**
- * A dictionary (.dic) entry with its associated flags.
- */
-public class HunspellWord {
-
- private final char flags[]; // sorted, can we represent more concisely?
-
- /**
- * Creates a new HunspellWord with no associated flags
- */
- public HunspellWord() {
- flags = null;
- }
-
- /**
- * Constructs a new HunspellWord with the given flags
- *
- * @param flags Flags to associate with the word
- */
- public HunspellWord(char[] flags) {
- this.flags = flags;
- }
-
- /**
- * Checks whether the word has the given flag associated with it
- *
- * @param flag Flag to check whether it is associated with the word
- * @return {@code true} if the flag is associated, {@code false} otherwise
- */
- public boolean hasFlag(char flag) {
- return flags != null && Arrays.binarySearch(flags, flag) >= 0;
- }
-
- /**
- * Returns the flags associated with the word
- *
- * @return Flags associated with the word
- */
- public char[] getFlags() {
- return flags;
- }
-}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ISO8859_14Decoder.java
similarity index 98%
rename from lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java
rename to lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ISO8859_14Decoder.java
index 4de0d4bc051..2d87947ab3d 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ISO8859_14Decoder.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis.hunspell2;
+package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
similarity index 92%
rename from lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
rename to lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
index d6b0133830a..18e6588ce7a 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis.hunspell2;
+package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -24,6 +24,7 @@ import java.util.List;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.util.CharacterUtils;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
@@ -37,9 +38,10 @@ import org.apache.lucene.util.Version;
final class Stemmer {
private final int recursionCap;
private final Dictionary dictionary;
- private BytesRef scratch = new BytesRef();
+ private final BytesRef scratch = new BytesRef();
private final StringBuilder segment = new StringBuilder();
private final ByteArrayDataInput affixReader;
+ private final CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_CURRENT);
/**
* Constructs a new Stemmer which will use the provided Dictionary to create its stems. Uses the
@@ -80,6 +82,9 @@ final class Stemmer {
* @return List of stems for the word
*/
public List stem(char word[], int length) {
+ if (dictionary.ignoreCase) {
+ charUtils.toLowerCase(word, 0, length);
+ }
List stems = new ArrayList();
if (dictionary.lookupWord(word, 0, length, scratch) != null) {
stems.add(new CharsRef(word, 0, length));
@@ -95,20 +100,19 @@ final class Stemmer {
* @return List of stems for the word
*/
public List uniqueStems(char word[], int length) {
- List stems = new ArrayList();
- CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, false);
- if (dictionary.lookupWord(word, 0, length, scratch) != null) {
- stems.add(new CharsRef(word, 0, length));
- terms.add(word);
+ List stems = stem(word, length);
+ if (stems.size() < 2) {
+ return stems;
}
- List otherStems = stem(word, length, Dictionary.NOFLAGS, 0);
- for (CharsRef s : otherStems) {
+ CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, dictionary.ignoreCase);
+ List deduped = new ArrayList<>();
+ for (CharsRef s : stems) {
if (!terms.contains(s)) {
- stems.add(s);
+ deduped.add(s);
terms.add(s);
}
}
- return stems;
+ return deduped;
}
// ================================================= Helper Methods ================================================
@@ -188,7 +192,7 @@ final class Stemmer {
* @param recursionDepth Level of recursion this stemming step is at
* @return List of stems for the word, or an empty list if none are found
*/
- public List applyAffix(char strippedWord[], int length, int affix, int recursionDepth) {
+ List applyAffix(char strippedWord[], int length, int affix, int recursionDepth) {
segment.setLength(0);
segment.append(strippedWord, 0, length);
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java
deleted file mode 100644
index 00ff88469be..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java
+++ /dev/null
@@ -1,137 +0,0 @@
-package org.apache.lucene.analysis.hunspell2;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.List;
-
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.util.CharsRef;
-
-/**
- * TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple
- * stems, this filter can emit multiple tokens for each consumed token
- *
- *
- * Note: This filter is aware of the {@link KeywordAttribute}. To prevent
- * certain terms from being passed to the stemmer
- * {@link KeywordAttribute#isKeyword()} should be set to true
- * in a previous {@link TokenStream}.
- *
- * Note: For including the original term as well as the stemmed version, see
- * {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory}
- *
- *
- * @lucene.experimental
- */
-public final class Hunspell2StemFilter extends TokenFilter {
-
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
- private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
- private final Stemmer stemmer;
-
- private List buffer;
- private State savedState;
-
- private final boolean dedup;
-
- /** Create a {@link Hunspell2StemFilter} which deduplicates stems and has a maximum
- * recursion level of 2.
- * @see #Hunspell2StemFilter(TokenStream, Dictionary, int) */
- public Hunspell2StemFilter(TokenStream input, Dictionary dictionary) {
- this(input, dictionary, 2);
- }
-
- /**
- * Creates a new Hunspell2StemFilter that will stem tokens from the given TokenStream using affix rules in the provided
- * Dictionary
- *
- * @param input TokenStream whose tokens will be stemmed
- * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
- * @param recursionCap maximum level of recursion stemmer can go into, defaults to 2
- */
- public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, int recursionCap) {
- this(input, dictionary, true, recursionCap);
- }
-
- /** Create a {@link Hunspell2StemFilter} which has a maximum recursion level of 2.
- * @see #Hunspell2StemFilter(TokenStream, Dictionary, boolean, int) */
- public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup) {
- this(input, dictionary, dedup, 2);
- }
-
- /**
- * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided
- * Dictionary
- *
- * @param input TokenStream whose tokens will be stemmed
- * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens
- * @param dedup true if only unique terms should be output.
- * @param recursionCap maximum level of recursion stemmer can go into, defaults to 2
- */
- public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap) {
- super(input);
- this.dedup = dedup;
- this.stemmer = new Stemmer(dictionary, recursionCap);
- }
-
- @Override
- public boolean incrementToken() throws IOException {
- if (buffer != null && !buffer.isEmpty()) {
- CharsRef nextStem = buffer.remove(0);
- restoreState(savedState);
- posIncAtt.setPositionIncrement(0);
- termAtt.setEmpty().append(nextStem);
- return true;
- }
-
- if (!input.incrementToken()) {
- return false;
- }
-
- if (keywordAtt.isKeyword()) {
- return true;
- }
-
- buffer = dedup ? stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length());
-
- if (buffer.isEmpty()) { // we do not know this word, return it unchanged
- return true;
- }
-
- CharsRef stem = buffer.remove(0);
- termAtt.setEmpty().append(stem);
-
- if (!buffer.isEmpty()) {
- savedState = captureState();
- }
-
- return true;
- }
-
- @Override
- public void reset() throws IOException {
- super.reset();
- buffer = null;
- }
-}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java
deleted file mode 100644
index 6ce73698dfd..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java
+++ /dev/null
@@ -1,80 +0,0 @@
-package org.apache.lucene.analysis.hunspell2;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.text.ParseException;
-import java.util.Map;
-
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.ResourceLoader;
-import org.apache.lucene.analysis.util.ResourceLoaderAware;
-import org.apache.lucene.analysis.util.TokenFilterFactory;
-
-/**
- * TokenFilterFactory that creates instances of {@link Hunspell2StemFilter}.
- * Example config for British English:
- *
- * <filter class="solr.Hunspell2StemFilterFactory"
- * dictionary="en_GB.dic"
- * affix="en_GB.aff" />
- * Both parameters dictionary and affix are mandatory.
- * Dictionaries for many languages are available through the OpenOffice project.
- *
- * See http://wiki.apache.org/solr/Hunspell
- * @lucene.experimental
- */
-public class Hunspell2StemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware {
- private static final String PARAM_DICTIONARY = "dictionary";
- private static final String PARAM_AFFIX = "affix";
- private static final String PARAM_RECURSION_CAP = "recursionCap";
-
- private final String dictionaryFile;
- private final String affixFile;
- private Dictionary dictionary;
- private int recursionCap;
-
- /** Creates a new Hunspell2StemFilterFactory */
- public Hunspell2StemFilterFactory(Map args) {
- super(args);
- dictionaryFile = require(args, PARAM_DICTIONARY);
- affixFile = get(args, PARAM_AFFIX);
- recursionCap = getInt(args, PARAM_RECURSION_CAP, 2);
- if (!args.isEmpty()) {
- throw new IllegalArgumentException("Unknown parameters: " + args);
- }
- }
-
- @Override
- public void inform(ResourceLoader loader) throws IOException {
- try (InputStream affix = loader.openResource(affixFile);
- InputStream dictionary = loader.openResource(dictionaryFile)) {
- try {
- this.dictionary = new Dictionary(affix, dictionary);
- } catch (ParseException e) {
- throw new RuntimeException(e);
- }
- }
- }
-
- @Override
- public TokenStream create(TokenStream tokenStream) {
- return new Hunspell2StemFilter(tokenStream, dictionary, recursionCap);
- }
-}
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html
deleted file mode 100644
index 196591969e8..00000000000
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html
+++ /dev/null
@@ -1,26 +0,0 @@
-
-
-
-Stemming TokenFilter using a Java implementation of the
-Hunspell stemming algorithm.
-
-Dictionaries can be found on
-OpenOffice's wiki
-
-
-
diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
index e4ca7c6802c..04fc80cf59c 100644
--- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
+++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
@@ -51,7 +51,6 @@ org.apache.lucene.analysis.hi.HindiNormalizationFilterFactory
org.apache.lucene.analysis.hi.HindiStemFilterFactory
org.apache.lucene.analysis.hu.HungarianLightStemFilterFactory
org.apache.lucene.analysis.hunspell.HunspellStemFilterFactory
-org.apache.lucene.analysis.hunspell2.Hunspell2StemFilterFactory
org.apache.lucene.analysis.id.IndonesianStemFilterFactory
org.apache.lucene.analysis.in.IndicNormalizationFilterFactory
org.apache.lucene.analysis.it.ItalianLightStemFilterFactory
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
index bca5e1ede50..617e7523b69 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java
@@ -62,8 +62,8 @@ import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
-import org.apache.lucene.analysis.hunspell.HunspellDictionary;
-import org.apache.lucene.analysis.hunspell.HunspellDictionaryTest;
+import org.apache.lucene.analysis.hunspell.Dictionary;
+import org.apache.lucene.analysis.hunspell.TestHunspellStemFilter;
import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter;
import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter;
@@ -406,13 +406,13 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
return new IdentityEncoder(); // the other encoders will throw exceptions if tokens arent numbers?
}
});
- put(HunspellDictionary.class, new ArgProducer() {
+ put(Dictionary.class, new ArgProducer() {
@Override public Object create(Random random) {
// TODO: make nastier
- InputStream affixStream = HunspellDictionaryTest.class.getResourceAsStream("test.aff");
- InputStream dictStream = HunspellDictionaryTest.class.getResourceAsStream("test.dic");
+ InputStream affixStream = TestHunspellStemFilter.class.getResourceAsStream("simple.aff");
+ InputStream dictStream = TestHunspellStemFilter.class.getResourceAsStream("simple.dic");
try {
- return new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
+ return new Dictionary(affixStream, dictStream);
} catch (Exception ex) {
Rethrow.rethrow(ex);
return null; // unreachable code
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
deleted file mode 100644
index fd8f9211727..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java
+++ /dev/null
@@ -1,201 +0,0 @@
-package org.apache.lucene.analysis.hunspell;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.text.ParseException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.lucene.util.LuceneTestCase;
-import org.junit.Assert;
-import org.junit.Test;
-
-public class HunspellDictionaryTest extends LuceneTestCase {
-
- private class CloseCheckInputStream extends InputStream {
- private InputStream delegate;
-
- private boolean closed = false;
-
- public CloseCheckInputStream(InputStream delegate) {
- super();
- this.delegate = delegate;
- }
-
- @Override
- public int read() throws IOException {
- return delegate.read();
- }
-
- @Override
- public int hashCode() {
- return delegate.hashCode();
- }
-
- @Override
- public int read(byte[] b) throws IOException {
- return delegate.read(b);
- }
-
- @Override
- public boolean equals(Object obj) {
- return delegate.equals(obj);
- }
-
- @Override
- public int read(byte[] b, int off, int len) throws IOException {
- return delegate.read(b, off, len);
- }
-
- @Override
- public long skip(long n) throws IOException {
- return delegate.skip(n);
- }
-
- @Override
- public String toString() {
- return delegate.toString();
- }
-
- @Override
- public int available() throws IOException {
- return delegate.available();
- }
-
- @Override
- public void close() throws IOException {
- this.closed = true;
- delegate.close();
- }
-
- @Override
- public void mark(int readlimit) {
- delegate.mark(readlimit);
- }
-
- @Override
- public void reset() throws IOException {
- delegate.reset();
- }
-
- @Override
- public boolean markSupported() {
- return delegate.markSupported();
- }
-
- public boolean isClosed() {
- return this.closed;
- }
-
- }
-
- @Test
- public void testResourceCleanup() throws IOException, ParseException {
- CloseCheckInputStream affixStream = new CloseCheckInputStream(getClass().getResourceAsStream("testCompressed.aff"));
- CloseCheckInputStream dictStream = new CloseCheckInputStream(getClass().getResourceAsStream("testCompressed.dic"));
-
- new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
-
- assertFalse(affixStream.isClosed());
- assertFalse(dictStream.isClosed());
-
- affixStream.close();
- dictStream.close();
-
- assertTrue(affixStream.isClosed());
- assertTrue(dictStream.isClosed());
- }
-
- @Test
- public void testHunspellDictionary_loadDicAff() throws IOException, ParseException {
- InputStream affixStream = getClass().getResourceAsStream("test.aff");
- InputStream dictStream = getClass().getResourceAsStream("test.dic");
-
- HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
- assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
- assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
- assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
- assertEquals("Wrong number of flags for lucen", 1, dictionary.lookupWord(new char[]{'l', 'u', 'c', 'e', 'n'}, 0, 5).get(0).getFlags().length);
-
- affixStream.close();
- dictStream.close();
- }
-
- @Test
- public void testHunspellDictionary_multipleDictWithOverride() throws IOException, ParseException {
- InputStream affixStream = getClass().getResourceAsStream("test.aff");
- List dictStreams = new ArrayList();
- dictStreams.add(getClass().getResourceAsStream("test.dic"));
- dictStreams.add(getClass().getResourceAsStream("testOverride.dic"));
-
- HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStreams, TEST_VERSION_CURRENT, false);
- assertEquals("Wrong number of flags for lucen", 3, dictionary.lookupWord(new char[]{'l', 'u', 'c', 'e', 'n'}, 0, 5).get(0).getFlags().length);
- assertEquals("Wrong number of flags for bar", 1, dictionary.lookupWord(new char[]{'b', 'a', 'r'}, 0, 3).get(0).getFlags().length);
-
- affixStream.close();
- for(InputStream dstream : dictStreams) {
- dstream.close();
- }
- }
-
- @Test
- public void testCompressedHunspellDictionary_loadDicAff() throws IOException, ParseException {
- InputStream affixStream = getClass().getResourceAsStream("testCompressed.aff");
- InputStream dictStream = getClass().getResourceAsStream("testCompressed.dic");
-
- HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT);
- assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
- assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
- assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
-
- affixStream.close();
- dictStream.close();
- }
-
- @Test
- public void testHunspellDictionary_loadDicWrongAff() throws IOException, ParseException {
- InputStream affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff");
- InputStream dictStream = getClass().getResourceAsStream("test.dic");
-
- HunspellDictionary dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT, false, false);
- assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size());
- assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size());
- assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size());
- //strict parsing disabled: malformed rule is not loaded
- assertNull(dictionary.lookupPrefix(new char[]{'a'}, 0, 1));
- affixStream.close();
- dictStream.close();
-
- affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff");
- dictStream = getClass().getResourceAsStream("test.dic");
- //strict parsing enabled: malformed rule causes ParseException
- try {
- dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT, false, true);
- Assert.fail();
- } catch(ParseException e) {
- Assert.assertEquals("The affix file contains a rule with less than five elements", e.getMessage());
- Assert.assertEquals(23, e.getErrorOffset());
- }
-
- affixStream.close();
- dictStream.close();
- }
-}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java
deleted file mode 100644
index dd273fa8dc5..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java
+++ /dev/null
@@ -1,92 +0,0 @@
-package org.apache.lucene.analysis.hunspell;
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.text.ParseException;
-import java.util.Arrays;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.core.KeywordTokenizer;
-import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
-import org.apache.lucene.analysis.util.CharArraySet;
-import org.apache.lucene.util.TestUtil;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
-
-public class HunspellStemFilterTest extends BaseTokenStreamTestCase {
-
- private static HunspellDictionary DICTIONARY;
- @BeforeClass
- public static void beforeClass() throws IOException, ParseException {
- DICTIONARY = createDict(true);
- }
- @AfterClass
- public static void afterClass() {
- DICTIONARY = null;
- }
- public static HunspellDictionary createDict(boolean ignoreCase) throws IOException, ParseException {
- InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
- InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
-
- return new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT, ignoreCase);
- }
-
- /**
- * Simple test for KeywordAttribute
- */
- public void testKeywordAttribute() throws IOException {
- MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome");
- tokenizer.setEnableChecks(true);
- HunspellStemFilter filter = new HunspellStemFilter(tokenizer, DICTIONARY, TestUtil.nextInt(random(), 1, 3));
- assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1});
-
- // assert with keywork marker
- tokenizer = whitespaceMockTokenizer("lucene is awesome");
- CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true);
- filter = new HunspellStemFilter(new SetKeywordMarkerFilter(tokenizer, set), DICTIONARY, TestUtil.nextInt(random(), 1, 3));
- assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
- }
-
- /** blast some random strings through the analyzer */
- public void testRandomStrings() throws Exception {
- Analyzer analyzer = new Analyzer() {
-
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, TestUtil.nextInt(random(), 1, 3)));
- }
- };
- checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
- }
-
- public void testEmptyTerm() throws IOException {
- Analyzer a = new Analyzer() {
- @Override
- protected TokenStreamComponents createComponents(String fieldName) {
- Tokenizer tokenizer = new KeywordTokenizer();
- return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, TestUtil.nextInt(random(), 1, 3)));
- }
- };
- checkOneTerm(a, "", "");
- }
-}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java
deleted file mode 100644
index 66a9410c27a..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java
+++ /dev/null
@@ -1,137 +0,0 @@
-package org.apache.lucene.analysis.hunspell;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.Version;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.text.ParseException;
-import java.util.List;
-
-import static junit.framework.Assert.assertEquals;
-
-public class HunspellStemmerTest extends LuceneTestCase {
-
- private static HunspellStemmer stemmer;
-
- @BeforeClass
- public static void beforeClass() throws IOException, ParseException {
- createStemmer(true);
- }
-
- @AfterClass
- public static void afterClass() {
- stemmer = null;
- }
-
- @Test
- public void testStem_simpleSuffix() {
- List stems = stemmer.stem("lucene");
-
- assertEquals(2, stems.size());
- assertEquals("lucene", stems.get(0).getStemString());
- assertEquals("lucen", stems.get(1).getStemString());
-
- stems = stemmer.stem("mahoute");
- assertEquals(1, stems.size());
- assertEquals("mahout", stems.get(0).getStemString());
- }
-
- @Test
- public void testStem_simplePrefix() {
- List stems = stemmer.stem("solr");
-
- assertEquals(1, stems.size());
- assertEquals("olr", stems.get(0).getStemString());
- }
-
- @Test
- public void testStem_recursiveSuffix() {
- List stems = stemmer.stem("abcd");
-
- assertEquals(1, stems.size());
- assertEquals("ab", stems.get(0).getStemString());
- }
-
- @Test
- public void testStem_ignoreCase() throws IOException, ParseException {
- List stems;
- createStemmer(true);
-
- stems = stemmer.stem("apache");
- assertEquals(1, stems.size());
- assertEquals("apach", stems.get(0).getStemString());
-
- stems = stemmer.stem("APACHE");
- assertEquals(1, stems.size());
- assertEquals("apach", stems.get(0).getStemString());
-
- stems = stemmer.stem("Apache");
- assertEquals(1, stems.size());
- assertEquals("apach", stems.get(0).getStemString());
-
- stems = stemmer.stem("foos");
- assertEquals(1, stems.size());
- assertEquals("foo", stems.get(0).getStemString());
-
- stems = stemmer.stem("mood");
- assertEquals(1, stems.size());
- assertEquals("moo", stems.get(0).getStemString());
-
- stems = stemmer.stem("Foos");
- assertEquals(1, stems.size());
- assertEquals("foo", stems.get(0).getStemString());
-
- // The "Foo" rule gets overridden by the "foo" rule, and we don't merge
- stems = stemmer.stem("Food");
- assertEquals(0, stems.size());
-
- stems = stemmer.stem("Mood");
- assertEquals(1, stems.size());
- assertEquals("moo", stems.get(0).getStemString());
- }
-
- @Test
- public void testStem_caseSensitive() throws IOException, ParseException {
- createStemmer(false);
- List stems = stemmer.stem("apache");
- assertEquals(0, stems.size());
-
- stems = stemmer.stem("Apache");
- assertEquals(1, stems.size());
- assertEquals("Apach", stems.get(0).getStemString());
- }
-
-
- private static void createStemmer(boolean ignoreCase) throws IOException, ParseException {
- InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff");
- InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic");
-
- HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT, ignoreCase);
- stemmer = new HunspellStemmer(dictionary);
-
- affixStream.close();
- dictStream.close();
- }
-
-}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
similarity index 93%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
index d00fc634944..3322eb109a6 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis.hunspell2;
+package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -22,7 +22,7 @@ import java.io.InputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
-import org.apache.lucene.analysis.hunspell.HunspellDictionary;
+import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.RamUsageEstimator;
@@ -33,7 +33,7 @@ import org.junit.Ignore;
* wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/
* Note some of the files differ only in case. This may be a problem on your operating system!
*/
-//@Ignore("enable manually")
+@Ignore("enable manually")
public class TestAllDictionaries extends LuceneTestCase {
// set this to the location of where you downloaded all the files
@@ -162,21 +162,11 @@ public class TestAllDictionaries extends LuceneTestCase {
assert dicEntry != null;
ZipEntry affEntry = zip.getEntry(tests[i+2]);
assert affEntry != null;
-
- // get ram from previous impl
- String oldRAM = "FAIL";
- try (InputStream dictionary = zip.getInputStream(dicEntry);
- InputStream affix = zip.getInputStream(affEntry)) {
- try {
- HunspellDictionary dic = new HunspellDictionary(affix, dictionary, TEST_VERSION_CURRENT);
- oldRAM = RamUsageEstimator.humanSizeOf(dic);
- } catch (Throwable t) {}
- }
try (InputStream dictionary = zip.getInputStream(dicEntry);
InputStream affix = zip.getInputStream(affEntry)) {
Dictionary dic = new Dictionary(affix, dictionary);
- System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
+ System.out.println(tests[i] + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" +
"words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " +
"flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " +
"strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " +
@@ -204,7 +194,7 @@ public class TestAllDictionaries extends LuceneTestCase {
try (InputStream dictionary = zip.getInputStream(dicEntry);
InputStream affix = zip.getInputStream(affEntry)) {
- Dictionary dic = new Dictionary(affix, dictionary);
+ new Dictionary(affix, dictionary);
}
}
}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java
new file mode 100644
index 00000000000..64bdb41e8c7
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java
@@ -0,0 +1,110 @@
+package org.apache.lucene.analysis.hunspell;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.hunspell.Dictionary;
+import org.apache.lucene.analysis.hunspell.Stemmer;
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.LuceneTestCase;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+public class TestCaseInsensitive extends LuceneTestCase {
+ private static Stemmer stemmer;
+
+ @BeforeClass
+ public static void beforeClass() throws Exception {
+ try (InputStream affixStream = TestCaseInsensitive.class.getResourceAsStream("simple.aff");
+ InputStream dictStream = TestCaseInsensitive.class.getResourceAsStream("mixedcase.dic")) {
+ Dictionary dictionary = new Dictionary(affixStream, Collections.singletonList(dictStream), true);
+ stemmer = new Stemmer(dictionary);
+ }
+ }
+
+ @AfterClass
+ public static void afterClass() {
+ stemmer = null;
+ }
+
+ public void testCaseInsensitivity() {
+ assertStemsTo("lucene", "lucene", "lucen");
+ assertStemsTo("LuCeNe", "lucene", "lucen");
+ assertStemsTo("mahoute", "mahout");
+ assertStemsTo("MaHoUte", "mahout");
+ }
+
+ public void testSimplePrefix() {
+ assertStemsTo("solr", "olr");
+ }
+
+ public void testRecursiveSuffix() {
+ assertStemsTo("abcd", "ab");
+ }
+
+ // all forms unmunched from dictionary
+ public void testAllStems() {
+ assertStemsTo("ab", "ab");
+ assertStemsTo("abc", "ab");
+ assertStemsTo("apach", "apach");
+ assertStemsTo("apache", "apach");
+ assertStemsTo("foo", "foo");
+ assertStemsTo("food", "foo");
+ assertStemsTo("foos", "foo");
+ assertStemsTo("lucen", "lucen");
+ assertStemsTo("lucene", "lucen", "lucene");
+ assertStemsTo("mahout", "mahout");
+ assertStemsTo("mahoute", "mahout");
+ assertStemsTo("moo", "moo");
+ assertStemsTo("mood", "moo");
+ assertStemsTo("olr", "olr");
+ assertStemsTo("solr", "olr");
+ }
+
+ // some bogus stuff that should not stem (empty lists)!
+ public void testBogusStems() {
+ assertStemsTo("abs");
+ assertStemsTo("abe");
+ assertStemsTo("sab");
+ assertStemsTo("sapach");
+ assertStemsTo("sapache");
+ assertStemsTo("apachee");
+ assertStemsTo("sfoo");
+ assertStemsTo("sfoos");
+ assertStemsTo("fooss");
+ assertStemsTo("lucenee");
+ assertStemsTo("solre");
+ }
+
+ private void assertStemsTo(String s, String... expected) {
+ Arrays.sort(expected);
+
+ List&lt;CharsRef&gt; stems = stemmer.stem(s);
+ String actual[] = new String[stems.size()];
+ for (int i = 0; i < actual.length; i++) {
+ actual[i] = stems.get(i).toString();
+ }
+ Arrays.sort(actual);
+
+ assertArrayEquals(expected, actual);
+ }
+}
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
similarity index 97%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java
rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
index e8e0fd0d030..6cbe931d376 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis.hunspell2;
+package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -22,6 +22,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
+import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java
similarity index 75%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java
rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java
index eafb1f272cf..af48427d522 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis.hunspell2;
+package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -26,13 +26,15 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
+import org.apache.lucene.analysis.hunspell.Dictionary;
+import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.TestUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
-public class TestHunspell2StemFilter extends BaseTokenStreamTestCase {
+public class TestHunspellStemFilter extends BaseTokenStreamTestCase {
private static Dictionary dictionary;
@BeforeClass
@@ -52,13 +54,21 @@ public class TestHunspell2StemFilter extends BaseTokenStreamTestCase {
public void testKeywordAttribute() throws IOException {
MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome");
tokenizer.setEnableChecks(true);
- Hunspell2StemFilter filter = new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3));
+ HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3));
assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1});
// assert with keyword marker
tokenizer = whitespaceMockTokenizer("lucene is awesome");
CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true);
- filter = new Hunspell2StemFilter(new SetKeywordMarkerFilter(tokenizer, set), dictionary, TestUtil.nextInt(random(), 1, 3));
+ filter = new HunspellStemFilter(new SetKeywordMarkerFilter(tokenizer, set), dictionary, TestUtil.nextInt(random(), 1, 3));
+ assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
+ }
+
+ /** simple test for longestOnly option */
+ public void testLongestOnly() throws IOException {
+ MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome");
+ tokenizer.setEnableChecks(true);
+ HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, true, TestUtil.nextInt(random(), 1, 3), true);
assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
}
@@ -68,7 +78,7 @@ public class TestHunspell2StemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
- return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
+ return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
}
};
checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER);
@@ -79,7 +89,7 @@ public class TestHunspell2StemFilter extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new KeywordTokenizer();
- return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
+ return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)));
}
};
checkOneTerm(a, "", "");
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java
index e8e232ce60b..f4302035dbc 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.hunspell;
import java.io.Reader;
import java.io.StringReader;
-import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
@@ -31,17 +30,17 @@ public class TestHunspellStemFilterFactory extends BaseTokenStreamFactoryTestCas
public void testStemming() throws Exception {
Reader reader = new StringReader("abc");
TokenStream stream = whitespaceMockTokenizer(reader);
- stream = tokenFilterFactory("HunspellStem",
- "dictionary", "test.dic",
- "affix", "test.aff").create(stream);
+ stream = tokenFilterFactory("HunspellStem",
+ "dictionary", "simple.dic",
+ "affix", "simple.aff").create(stream);
assertTokenStreamContents(stream, new String[] { "ab" });
}
/** Test that bogus arguments result in exception */
public void testBogusArguments() throws Exception {
try {
- tokenFilterFactory("HunspellStem",
- "dictionary", "test.dic",
+ tokenFilterFactory("HunspellStem",
+ "dictionary", "simple.dic",
"bogusArg", "bogusValue");
fail();
} catch (IllegalArgumentException expected) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java
similarity index 95%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java
rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java
index 4dec107f314..dca9faa6b16 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java
@@ -1,4 +1,4 @@
-package org.apache.lucene.analysis.hunspell2;
+package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -17,6 +17,8 @@ package org.apache.lucene.analysis.hunspell2;
* limitations under the License.
*/
+import org.apache.lucene.analysis.hunspell.Dictionary;
+import org.apache.lucene.analysis.hunspell.Stemmer;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.AfterClass;
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/broken.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/broken.aff
similarity index 100%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/broken.aff
rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/broken.aff
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.aff
similarity index 100%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.aff
rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.aff
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.dic
similarity index 100%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.dic
rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.dic
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/mixedcase.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/mixedcase.dic
new file mode 100644
index 00000000000..9fae253279e
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/mixedcase.dic
@@ -0,0 +1,10 @@
+9
+Ab/C
+apach/A
+Foo/D
+foo/E
+Lucen/A
+Lucene
+mahout/A
+Moo/E
+olr/B
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.aff
similarity index 100%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.aff
rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.aff
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.dic
similarity index 100%
rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.dic
rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.dic
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff
deleted file mode 100644
index db9423dcad1..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff
+++ /dev/null
@@ -1,20 +0,0 @@
-SET UTF-8
-TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
-
-SFX A Y 3
-SFX A 0 e n
-SFX A 0 e t
-SFX A 0 e h
-
-SFX C Y 2
-SFX C 0 d/C c
-SFX C 0 c b
-
-SFX D Y 1
-SFX D 0 s o
-
-SFX E Y 1
-SFX E 0 d o
-
-PFX B Y 1
-PFX B 0 s o
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic
deleted file mode 100644
index 12efd8fccb2..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic
+++ /dev/null
@@ -1,10 +0,0 @@
-9
-lucen/A
-lucene
-mahout/A
-olr/B
-ab/C
-Apach/A
-Foo/E
-foo/D
-Moo/E
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.aff
deleted file mode 100644
index e4a1b37300f..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.aff
+++ /dev/null
@@ -1,29 +0,0 @@
-SET UTF-8
-TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
-
-FLAG long
-
-AF 5
-AF AA
-AF BB
-AF CC
-AF DD
-AF EE
-
-SFX AA Y 3
-SFX AA 0 e n
-SFX AA 0 e t
-SFX AA 0 e h
-
-SFX CC Y 2
-SFX CC 0 d/3 c
-SFX CC 0 c b
-
-SFX DD Y 1
-SFX DD 0 s o
-
-SFX EE Y 1
-SFX EE 0 d o
-
-PFX BB Y 1
-PFX BB 0 s o
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.dic
deleted file mode 100644
index bf237662017..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.dic
+++ /dev/null
@@ -1,9 +0,0 @@
-6
-lucen/1
-lucene
-mahout/1
-olr/2
-ab/3
-Apach/1
-foo/4
-Foo/5
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testOverride.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testOverride.dic
deleted file mode 100644
index c1111ef562b..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testOverride.dic
+++ /dev/null
@@ -1,3 +0,0 @@
-2
-lucen/ABC
-bar/A
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff
deleted file mode 100644
index 3b780cd1d7b..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff
+++ /dev/null
@@ -1,24 +0,0 @@
-SET UTF-8
-TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
-
-SFX A Y 3
-SFX A 0 e n
-SFX A 0 e t
-SFX A 0 e h
-
-SFX C Y 2
-SFX C 0 d/C c
-SFX C 0 c b
-
-SFX D Y 1
-SFX D 0 s o
-
-SFX E Y 1
-SFX E 0 d o
-
-PFX B Y 1
-PFX B 0 s o
-
-#wrong rule (only 4 elements)
-PFX A0 Y 1
-PFX A0 0 a
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java
deleted file mode 100644
index d95e2be04b6..00000000000
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java
+++ /dev/null
@@ -1,50 +0,0 @@
-package org.apache.lucene.analysis.hunspell2;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.Reader;
-import java.io.StringReader;
-
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase;
-
-/**
- * Simple tests to ensure the Hunspell stemmer loads from factory
- */
-public class TestHunspell2StemFilterFactory extends BaseTokenStreamFactoryTestCase {
- public void testStemming() throws Exception {
- Reader reader = new StringReader("abc");
- TokenStream stream = whitespaceMockTokenizer(reader);
- stream = tokenFilterFactory("Hunspell2Stem",
- "dictionary", "simple.dic",
- "affix", "simple.aff").create(stream);
- assertTokenStreamContents(stream, new String[] { "ab" });
- }
-
- /** Test that bogus arguments result in exception */
- public void testBogusArguments() throws Exception {
- try {
- tokenFilterFactory("Hunspell2Stem",
- "dictionary", "simple.dic",
- "bogusArg", "bogusValue");
- fail();
- } catch (IllegalArgumentException expected) {
- assertTrue(expected.getMessage().contains("Unknown parameters"));
- }
- }
-}