From 6a4e1e3a9262913c0284123503d361e599009534 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 24 Feb 2014 04:40:07 +0000 Subject: [PATCH 01/17] create branch git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571136 13f79535-47bb-0310-9956-ffa450edef68 From 2e0fc562bc239ea897023796160a8870eddd2a48 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 24 Feb 2014 04:41:03 +0000 Subject: [PATCH 02/17] LUCENE-5468: commit current state git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571137 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/hunspell2/Affix.java | 157 +++++ .../lucene/analysis/hunspell2/Dictionary.java | 606 ++++++++++++++++++ .../hunspell2/Hunspell2StemFilter.java | 139 ++++ .../hunspell2/Hunspell2StemFilterFactory.java | 80 +++ .../analysis/hunspell2/ISO8859_14Decoder.java | 60 ++ .../lucene/analysis/hunspell2/Stemmer.java | 288 +++++++++ .../lucene/analysis/hunspell2/package.html | 26 + ...he.lucene.analysis.util.TokenFilterFactory | 1 + .../hunspell2/TestAllDictionaries.java | 205 ++++++ .../analysis/hunspell2/TestDictionary.java | 109 ++++ .../hunspell2/TestHunspell2StemFilter.java | 87 +++ .../TestHunspell2StemFilterFactory.java | 50 ++ .../analysis/hunspell2/TestStemmer.java | 105 +++ .../lucene/analysis/hunspell2/broken.aff | 24 + .../lucene/analysis/hunspell2/compressed.aff | 29 + .../lucene/analysis/hunspell2/compressed.dic | 9 + .../lucene/analysis/hunspell2/simple.aff | 20 + .../lucene/analysis/hunspell2/simple.dic | 10 + 18 files changed, 2005 insertions(+) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java create mode 100644 
lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/broken.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.dic create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.dic diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java new file mode 100644 index 00000000000..41c3553fb77 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java @@ -0,0 +1,157 @@ +package org.apache.lucene.analysis.hunspell2; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.regex.Pattern; + +/** + * Wrapper class representing a hunspell affix + */ +final class Affix { + + private String append; // the affix itself, what is appended + private char appendFlags[]; // continuation class flags + private String strip; + + private String condition; + private Pattern conditionPattern; + + private char flag; + + private boolean crossProduct; + + /** + * Checks whether the given text matches the conditional pattern on this affix + * + * @param text Text to check if it matches the affix's conditional pattern + * @return {@code true} if the text meets the condition, {@code false} otherwise + */ + public boolean checkCondition(CharSequence text) { + return conditionPattern.matcher(text).matches(); + } + + /** + * Returns the append defined for the affix + * + * @return Defined append + */ + public String getAppend() { + return append; + } + + /** + * Sets the append defined for the affix + * + * @param append Defined append for the affix + */ + public void setAppend(String append) { + this.append = append; + } + + /** + * Returns the flags defined for the affix append + * + * @return Flags defined for the affix append + */ + public char[] getAppendFlags() { + return appendFlags; + } + + /** + * Sets the flags defined for the affix append + 
* + * @param appendFlags Flags defined for the affix append + */ + public void setAppendFlags(char[] appendFlags) { + this.appendFlags = appendFlags; + } + + /** + * Returns the stripping characters defined for the affix + * + * @return Stripping characters defined for the affix + */ + public String getStrip() { + return strip; + } + + /** + * Sets the stripping characters defined for the affix + * + * @param strip Stripping characters defined for the affix + */ + public void setStrip(String strip) { + this.strip = strip; + } + + /** + * Returns the condition that must be met before the affix can be applied + * + * @return Condition that must be met before the affix can be applied + */ + public String getCondition() { + return condition; + } + + /** + * Sets the condition that must be met before the affix can be applied + * + * @param condition Condition to be met before affix application + * @param pattern Condition as a regular expression pattern + */ + public void setCondition(String condition, String pattern) { + this.condition = condition; + this.conditionPattern = Pattern.compile(pattern); + } + + /** + * Returns the affix flag + * + * @return Affix flag + */ + public char getFlag() { + return flag; + } + + /** + * Sets the affix flag + * + * @param flag Affix flag + */ + public void setFlag(char flag) { + this.flag = flag; + } + + /** + * Returns whether the affix is defined as cross product + * + * @return {@code true} if the affix is cross product, {@code false} otherwise + */ + public boolean isCrossProduct() { + return crossProduct; + } + + /** + * Sets whether the affix is defined as cross product + * + * @param crossProduct Whether the affix is defined as cross product + */ + public void setCrossProduct(boolean crossProduct) { + this.crossProduct = crossProduct; + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java 
new file mode 100644 index 00000000000..a7b9a58f080 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java @@ -0,0 +1,606 @@ +package org.apache.lucene.analysis.hunspell2; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.util.CharArrayMap; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util.Version; +import org.apache.lucene.util.fst.Builder; +import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.PositiveIntOutputs; + +import java.io.*; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.TreeMap; + +/** + * In-memory structure for the dictionary (.dic) and affix (.aff) + * data of a hunspell dictionary. 
+ */ +public class Dictionary { + + static final char[] NOFLAGS = new char[0]; + + private static final String ALIAS_KEY = "AF"; + private static final String PREFIX_KEY = "PFX"; + private static final String SUFFIX_KEY = "SFX"; + private static final String FLAG_KEY = "FLAG"; + + private static final String NUM_FLAG_TYPE = "num"; + private static final String UTF8_FLAG_TYPE = "UTF-8"; + private static final String LONG_FLAG_TYPE = "long"; + + private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*"; + private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s"; + + public CharArrayMap> prefixes; + public CharArrayMap> suffixes; + + // the entries in the .dic file, mapping to their set of flags. + // the fst output is the ordinal for flagLookup + public FST words; + // the list of unique flagsets (wordforms). theoretically huge, but practically + // small (e.g. for polish this is 756), otherwise humans wouldn't be able to deal with it either. + public BytesRefHash flagLookup = new BytesRefHash(); + + private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy + + private String[] aliases; + private int aliasCount = 0; + + /** + * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix + * and dictionary files. + * You have to close the provided InputStreams yourself. + * + * @param affix InputStream for reading the hunspell affix file (won't be closed). + * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed). 
+ * @throws IOException Can be thrown while reading from the InputStreams + * @throws ParseException Can be thrown if the content of the files does not meet expected formats + */ + public Dictionary(InputStream affix, InputStream dictionary) throws IOException, ParseException { + BufferedInputStream buffered = new BufferedInputStream(affix, 8192); + buffered.mark(8192); + String encoding = getDictionaryEncoding(affix); + buffered.reset(); + CharsetDecoder decoder = getJavaEncoding(encoding); + readAffixFile(buffered, decoder); + TreeMap tempWords = new TreeMap(); + flagLookup.add(new BytesRef()); // no flags -> ord 0 + readDictionaryFile(dictionary, decoder, tempWords); + PositiveIntOutputs o = PositiveIntOutputs.getSingleton(); + Builder b = new Builder(FST.INPUT_TYPE.BYTE4, o); // nocommit: byte4 + IntsRef scratchInts = new IntsRef(); + for (Map.Entry e : tempWords.entrySet()) { + UnicodeUtil.UTF8toUTF32(e.getKey(), scratchInts); + b.add(scratchInts, e.getValue().longValue()); + } + words = b.finish(); + } + + /** + * Looks up words that match the String created from the given char array, offset and length + * + * @param word Char array to generate the String from + * @param offset Offset in the char array that the String starts at + * @param length Length from the offset that the String is + * @return List of HunspellWords that match the generated String, or {@code null} if none are found + */ + char[] lookupWord(char word[], int offset, int length, BytesRef scratch) { + Integer ord = null; + try { + ord = lookupOrd(word, offset, length); + } catch (IOException ex) { /* bogus */ } + if (ord == null) { + return null; + } + return decodeFlags(flagLookup.get(ord, scratch)); + } + + public Integer lookupOrd(char word[], int offset, int length) throws IOException { + final FST.BytesReader bytesReader = words.getBytesReader(); + final FST.Arc arc = words.getFirstArc(new FST.Arc()); + // Accumulate output as we go + final Long NO_OUTPUT = words.outputs.getNoOutput(); + 
Long output = NO_OUTPUT; + + int l = offset + length; + for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) { + cp = Character.codePointAt(word, i, l); + if (words.findTargetArc(cp, arc, arc, bytesReader) == null) { + return null; + } else if (arc.output != NO_OUTPUT) { + output = words.outputs.add(output, arc.output); + } + } + if (words.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) { + return null; + } else if (arc.output != NO_OUTPUT) { + return words.outputs.add(output, arc.output).intValue(); + } else { + return output.intValue(); + } + } + + /** + * Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length + * + * @param word Char array to generate the String from + * @param offset Offset in the char array that the String starts at + * @param length Length from the offset that the String is + * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found + */ + public List lookupPrefix(char word[], int offset, int length) { + return prefixes.get(word, offset, length); + } + + /** + * Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length + * + * @param word Char array to generate the String from + * @param offset Offset in the char array that the String starts at + * @param length Length from the offset that the String is + * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found + */ + List lookupSuffix(char word[], int offset, int length) { + return suffixes.get(word, offset, length); + } + + /** + * Reads the affix file through the provided InputStream, building up the prefix and suffix maps + * + * @param affixStream InputStream to read the content of the affix file from + * @param decoder CharsetDecoder to decode the content of the file + * @throws IOException Can be thrown 
while reading from the InputStream + */ + private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException { + prefixes = new CharArrayMap>(Version.LUCENE_CURRENT, 8, false); + suffixes = new CharArrayMap>(Version.LUCENE_CURRENT, 8, false); + + LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder)); + String line = null; + while ((line = reader.readLine()) != null) { + if (line.startsWith(ALIAS_KEY)) { + parseAlias(line); + } else if (line.startsWith(PREFIX_KEY)) { + parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN); + } else if (line.startsWith(SUFFIX_KEY)) { + parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN); + } else if (line.startsWith(FLAG_KEY)) { + // Assume that the FLAG line comes before any prefix or suffixes + // Store the strategy so it can be used when parsing the dic file + flagParsingStrategy = getFlagParsingStrategy(line); + } + } + } + + /** + * Parses a specific affix rule putting the result into the provided affix map + * + * @param affixes Map where the result of the parsing will be put + * @param header Header line of the affix rule + * @param reader BufferedReader to read the content of the rule from + * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex + * pattern + * @throws IOException Can be thrown while reading the rule + */ + private void parseAffix(CharArrayMap> affixes, + String header, + LineNumberReader reader, + String conditionPattern) throws IOException, ParseException { + String args[] = header.split("\\s+"); + + boolean crossProduct = args[2].equals("Y"); + + int numLines = Integer.parseInt(args[3]); + for (int i = 0; i < numLines; i++) { + String line = reader.readLine(); + String ruleArgs[] = line.split("\\s+"); + + if (ruleArgs.length < 5) { + throw new ParseException("The affix file contains a rule with less than five elements", 
reader.getLineNumber()); + } + + Affix affix = new Affix(); + + affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1])); + affix.setStrip(ruleArgs[2].equals("0") ? "" : ruleArgs[2]); + + String affixArg = ruleArgs[3]; + + int flagSep = affixArg.lastIndexOf('/'); + if (flagSep != -1) { + String flagPart = affixArg.substring(flagSep + 1); + + if (aliasCount > 0) { + flagPart = getAliasValue(Integer.parseInt(flagPart)); + } + + char appendFlags[] = flagParsingStrategy.parseFlags(flagPart); + Arrays.sort(appendFlags); + affix.setAppendFlags(appendFlags); + affix.setAppend(affixArg.substring(0, flagSep)); + } else { + affix.setAppend(affixArg); + } + + String condition = ruleArgs[4]; + // at least the gascon affix file has this issue + if (condition.startsWith("[") && !condition.endsWith("]")) { + condition = condition + "]"; + } + // "dash hasn't got special meaning" (we must escape it) + if (condition.indexOf('-') >= 0) { + condition = condition.replace("-", "\\-"); + } + affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition)); + affix.setCrossProduct(crossProduct); + + List list = affixes.get(affix.getAppend()); + if (list == null) { + list = new ArrayList(); + affixes.put(affix.getAppend(), list); + } + + list.add(affix); + } + } + + /** + * Parses the encoding specified in the affix file readable through the provided InputStream + * + * @param affix InputStream for reading the affix file + * @return Encoding specified in the affix file + * @throws IOException Can be thrown while reading from the InputStream + * @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET } + */ + private String getDictionaryEncoding(InputStream affix) throws IOException, ParseException { + final StringBuilder encoding = new StringBuilder(); + for (;;) { + encoding.setLength(0); + int ch; + while ((ch = affix.read()) >= 0) { + if (ch == '\n') { + break; + } + if (ch != '\r') { + 
encoding.append((char)ch); + } + } + if ( + encoding.length() == 0 || encoding.charAt(0) == '#' || + // this test only at the end as ineffective but would allow lines only containing spaces: + encoding.toString().trim().length() == 0 + ) { + if (ch < 0) { + throw new ParseException("Unexpected end of affix file.", 0); + } + continue; + } + if (encoding.length() > 4 && "SET ".equals(encoding.substring(0, 4))) { + // cleanup the encoding string, too (whitespace) + return encoding.substring(4).trim(); + } + } + } + + static final Map CHARSET_ALIASES; + static { + Map m = new HashMap<>(); + m.put("microsoft-cp1251", "windows-1251"); + m.put("TIS620-2533", "TIS-620"); + CHARSET_ALIASES = Collections.unmodifiableMap(m); + } + + /** + * Retrieves the CharsetDecoder for the given encoding. Note, This isn't perfect as I think ISCII-DEVANAGARI and + * MICROSOFT-CP1251 etc are allowed... + * + * @param encoding Encoding to retrieve the CharsetDecoder for + * @return CharSetDecoder for the given encoding + */ + private CharsetDecoder getJavaEncoding(String encoding) { + if ("ISO8859-14".equals(encoding)) { + return new ISO8859_14Decoder(); + } + String canon = CHARSET_ALIASES.get(encoding); + if (canon != null) { + encoding = canon; + } + Charset charset = Charset.forName(encoding); + return charset.newDecoder().onMalformedInput(CodingErrorAction.REPLACE); + } + + /** + * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definition line taken from the affix file + * + * @param flagLine Line containing the flag information + * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition + */ + private FlagParsingStrategy getFlagParsingStrategy(String flagLine) { + String flagType = flagLine.substring(5); + + if (NUM_FLAG_TYPE.equals(flagType)) { + return new NumFlagParsingStrategy(); + } else if (UTF8_FLAG_TYPE.equals(flagType)) { + return new SimpleFlagParsingStrategy(); + } else if (LONG_FLAG_TYPE.equals(flagType)) 
{ + return new DoubleASCIIFlagParsingStrategy(); + } + + throw new IllegalArgumentException("Unknown flag type: " + flagType); + } + + /** + * Reads the dictionary file through the provided InputStream, building up the words map + * + * @param dictionary InputStream to read the dictionary file through + * @param decoder CharsetDecoder used to decode the contents of the file + * @throws IOException Can be thrown while reading from the file + */ + private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder, TreeMap words) throws IOException { + BytesRef flagsScratch = new BytesRef(); + BytesRef flagsScratch2 = new BytesRef(); + + BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder)); + // TODO: don't create millions of strings. + String line = reader.readLine(); // first line is number of entries + // sometimes the number of entries has a comment/copyright after it + line = line.replaceFirst("\\s*\\#.*$", ""); + int numEntries = Integer.parseInt(line); + + // TODO: the flags themselves can be double-chars (long) or also numeric + // either way the trick is to encode them as char... but they must be parsed differently + while ((line = reader.readLine()) != null) { + String entry; + char wordForm[]; + + int flagSep = line.lastIndexOf('/'); + if (flagSep == -1) { + wordForm = NOFLAGS; + entry = line; + } else { + // note, there can be comments (morph description) after a flag. 
+ // we should really look for any whitespace + int end = line.indexOf('\t', flagSep); + if (end == -1) + end = line.length(); + + String flagPart = line.substring(flagSep + 1, end); + if (aliasCount > 0) { + flagPart = getAliasValue(Integer.parseInt(flagPart)); + } + + wordForm = flagParsingStrategy.parseFlags(flagPart); + Arrays.sort(wordForm); + entry = line.substring(0, flagSep); + } + + BytesRef scratch = new BytesRef(entry); + Integer existingOrd = words.get(scratch); + final char mergedEntries[]; + if (existingOrd == null || existingOrd == 0) { + mergedEntries = wordForm; + } else { + flagLookup.get(existingOrd, flagsScratch2); + mergedEntries = merge(decodeFlags(flagsScratch2), wordForm); + } + + final int hashCode = encodeFlagsWithHash(flagsScratch, mergedEntries); + int ord = flagLookup.add(flagsScratch, hashCode); + if (ord < 0) { + // already exists in our hash + ord = (-ord)-1; + } + + words.put(scratch, ord); + } + } + + static char[] decodeFlags(BytesRef b) { + int len = b.length >>> 1; + char flags[] = new char[len]; + int upto = 0; + int end = b.offset + b.length; + for (int i = b.offset; i < end; i += 2) { + flags[upto++] = (char)((b.bytes[i] << 8) | (b.bytes[i+1] & 0xff)); + } + return flags; + } + + static int encodeFlagsWithHash(BytesRef b, char flags[]) { + int hash = 0; + int len = flags.length << 1; + b.grow(len); + b.length = len; + int upto = b.offset; + for (int i = 0; i < flags.length; i++) { + int flag = flags[i]; + hash = 31*hash + (b.bytes[upto++] = (byte) ((flag >> 8) & 0xff)); + hash = 31*hash + (b.bytes[upto++] = (byte) (flag & 0xff)); + } + return hash; + } + + private void parseAlias(String line) { + String ruleArgs[] = line.split("\\s+"); + if (aliases == null) { + //first line should be the aliases count + final int count = Integer.parseInt(ruleArgs[1]); + aliases = new String[count]; + } else { + aliases[aliasCount++] = ruleArgs[1]; + } + } + + private String getAliasValue(int id) { + try { + return aliases[id - 1]; + } catch 
(IndexOutOfBoundsException ex) { + throw new IllegalArgumentException("Bad flag alias number:" + id, ex); + } + } + + /** + * Abstraction of the process of parsing flags taken from the affix and dic files + */ + private static abstract class FlagParsingStrategy { + + /** + * Parses the given String into a single flag + * + * @param rawFlag String to parse into a flag + * @return Parsed flag + */ + char parseFlag(String rawFlag) { + return parseFlags(rawFlag)[0]; + } + + /** + * Parses the given String into multiple flags + * + * @param rawFlags String to parse into flags + * @return Parsed flags + */ + abstract char[] parseFlags(String rawFlags); + } + + /** + * Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as a individual flags. + * Can be used with both the ASCII and UTF-8 flag types. + */ + private static class SimpleFlagParsingStrategy extends FlagParsingStrategy { + @Override + public char[] parseFlags(String rawFlags) { + return rawFlags.toCharArray(); + } + } + + /** + * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form. In the case + * of multiple flags, each number is separated by a comma. + */ + private static class NumFlagParsingStrategy extends FlagParsingStrategy { + @Override + public char[] parseFlags(String rawFlags) { + String[] rawFlagParts = rawFlags.trim().split(","); + char[] flags = new char[rawFlagParts.length]; + int upto = 0; + + for (int i = 0; i < rawFlagParts.length; i++) { + // note, removing the trailing X/leading I for nepali... what is the rule here?! 
+ String replacement = rawFlagParts[i].replaceAll("[^0-9]", ""); + // note, ignoring empty flags (this happens in danish, for example) + if (replacement.isEmpty()) { + continue; + } + flags[upto++] = (char) Integer.parseInt(replacement); + } + + if (upto < flags.length) { + flags = Arrays.copyOf(flags, upto); + } + return flags; + } + } + + /** + * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes + * must be combined into a single character. + * + * TODO (rmuir) test + */ + private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy { + + @Override + public char[] parseFlags(String rawFlags) { + if (rawFlags.length() == 0) { + return new char[0]; + } + + StringBuilder builder = new StringBuilder(); + for (int i = 0; i < rawFlags.length(); i+=2) { + char cookedFlag = (char) ((int) rawFlags.charAt(i) + (int) rawFlags.charAt(i + 1)); + builder.append(cookedFlag); + } + + char flags[] = new char[builder.length()]; + builder.getChars(0, builder.length(), flags, 0); + return flags; + } + } + + static boolean hasFlag(char flags[], char flag) { + return Arrays.binarySearch(flags, flag) >= 0; + } + + static char[] merge(char[] flags1, char[] flags2) { + char merged[] = new char[flags1.length + flags2.length]; + int i1 = 0, i2 = 0; + int last = -1; + int upto = 0; + + while (i1 < flags1.length && i2 < flags2.length) { + final char next; + if (flags1[i1] <= flags2[i2]) { + next = flags1[i1++]; + } else { + next = flags2[i2++]; + } + if (next != last) { + merged[upto++] = next; + last = next; + } + } + + while (i1 < flags1.length) { + char next = flags1[i1++]; + if (next != last) { + merged[upto++] = next; + last = next; + } + } + + while (i2 < flags2.length) { + char next = flags2[i2++]; + if (next != last) { + merged[upto++] = next; + last = next; + } + } + + if (merged.length != upto) { + merged = Arrays.copyOf(merged, upto); + } + + return merged; + } +} diff --git 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java new file mode 100644 index 00000000000..f9dfb770ab2 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java @@ -0,0 +1,139 @@ +package org.apache.lucene.analysis.hunspell2; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.hunspell2.Stemmer.Stem; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; + +/** + * TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple + * stems, this filter can emit multiple tokens for each consumed token + * + *

+ * Note: This filter is aware of the {@link KeywordAttribute}. To prevent + * certain terms from being passed to the stemmer + * {@link KeywordAttribute#isKeyword()} should be set to {@code true} + * in a previous {@link TokenStream}. + * + * Note: For including the original term as well as the stemmed version, see + * {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory} + *

+ * + * @lucene.experimental + */ +public final class Hunspell2StemFilter extends TokenFilter { + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); + private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class); + private final Stemmer stemmer; + + private List buffer; + private State savedState; + + private final boolean dedup; + + /** Create a {@link Hunspell2StemFilter} which deduplicates stems and has a maximum + * recursion level of 2. + * @see #Hunspell2StemFilter(TokenStream, Dictionary, int) */ + public Hunspell2StemFilter(TokenStream input, Dictionary dictionary) { + this(input, dictionary, 2); + } + + /** + * Creates a new Hunspell2StemFilter that will stem tokens from the given TokenStream using affix rules in the provided + * Dictionary + * + * @param input TokenStream whose tokens will be stemmed + * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens + * @param recursionCap maximum level of recursion stemmer can go into, defaults to 2 + */ + public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, int recursionCap) { + this(input, dictionary, true, recursionCap); + } + + /** Create a {@link Hunspell2StemFilter} which has a maximum recursion level of 2. + * @see #Hunspell2StemFilter(TokenStream, Dictionary, boolean, int) */ + public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup) { + this(input, dictionary, dedup, 2); + } + + /** + * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided + * Dictionary + * + * @param input TokenStream whose tokens will be stemmed + * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens + * @param dedup true if only unique terms should be output. 
+ * @param recursionCap maximum level of recursion stemmer can go into, defaults to 2 + */ + public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap) { + super(input); + this.dedup = dedup; + this.stemmer = new Stemmer(dictionary, recursionCap); + } + + @Override + public boolean incrementToken() throws IOException { + if (buffer != null && !buffer.isEmpty()) { + Stem nextStem = buffer.remove(0); + restoreState(savedState); + posIncAtt.setPositionIncrement(0); + termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength()); + termAtt.setLength(nextStem.getStemLength()); + return true; + } + + if (!input.incrementToken()) { + return false; + } + + if (keywordAtt.isKeyword()) { + return true; + } + + buffer = dedup ? stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length()); + + if (buffer.isEmpty()) { // we do not know this word, return it unchanged + return true; + } + + Stem stem = buffer.remove(0); + termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength()); + termAtt.setLength(stem.getStemLength()); + + if (!buffer.isEmpty()) { + savedState = captureState(); + } + + return true; + } + + @Override + public void reset() throws IOException { + super.reset(); + buffer = null; + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java new file mode 100644 index 00000000000..6ce73698dfd --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java @@ -0,0 +1,80 @@ +package org.apache.lucene.analysis.hunspell2; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.InputStream; +import java.text.ParseException; +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.ResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoaderAware; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * TokenFilterFactory that creates instances of {@link Hunspell2StemFilter}. + * Example config for British English: + *
+ * <filter class="solr.Hunspell2StemFilterFactory"
+ *         dictionary="en_GB.dic"
+ *         affix="en_GB.aff" />
+ * Both parameters dictionary and affix are mandatory. + * Dictionaries for many languages are available through the OpenOffice project. + * + * See http://wiki.apache.org/solr/Hunspell + * @lucene.experimental + */ +public class Hunspell2StemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { + private static final String PARAM_DICTIONARY = "dictionary"; + private static final String PARAM_AFFIX = "affix"; + private static final String PARAM_RECURSION_CAP = "recursionCap"; + + private final String dictionaryFile; + private final String affixFile; + private Dictionary dictionary; + private int recursionCap; + + /** Creates a new Hunspell2StemFilterFactory; the dictionary and affix parameters are both required, recursionCap falls back to 2 */ + public Hunspell2StemFilterFactory(Map args) { + super(args); + dictionaryFile = require(args, PARAM_DICTIONARY); + affixFile = require(args, PARAM_AFFIX); + recursionCap = getInt(args, PARAM_RECURSION_CAP, 2); + if (!args.isEmpty()) { + throw new IllegalArgumentException("Unknown parameters: " + args); + } + } + + @Override + public void inform(ResourceLoader loader) throws IOException { + try (InputStream affix = loader.openResource(affixFile); + InputStream dictionary = loader.openResource(dictionaryFile)) { + try { + this.dictionary = new Dictionary(affix, dictionary); + } catch (ParseException e) { + throw new RuntimeException(e); + } + } + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new Hunspell2StemFilter(tokenStream, dictionary, recursionCap); + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java new file mode 100644 index 00000000000..4de0d4bc051 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java @@ -0,0 +1,60 @@ +package org.apache.lucene.analysis.hunspell2; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + *
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CoderResult; + +import org.apache.lucene.util.IOUtils; + +// many hunspell dictionaries use this encoding, yet java does not have it?!?! +final class ISO8859_14Decoder extends CharsetDecoder { + + static final char TABLE[] = new char[] { + 0x00A0, 0x1E02, 0x1E03, 0x00A3, 0x010A, 0x010B, 0x1E0A, 0x00A7, + 0x1E80, 0x00A9, 0x1E82, 0x1E0B, 0x1EF2, 0x00AD, 0x00AE, 0x0178, + 0x1E1E, 0x1E1F, 0x0120, 0x0121, 0x1E40, 0x1E41, 0x00B6, 0x1E56, + 0x1E81, 0x1E57, 0x1E83, 0x1E60, 0x1EF3, 0x1E84, 0x1E85, 0x1E61, + 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, + 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, + 0x0174, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x1E6A, + 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x0176, 0x00DF, + 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, + 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, + 0x0175, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x1E6B, + 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x0177, 0x00FF + }; + + ISO8859_14Decoder() { + super(IOUtils.CHARSET_UTF_8, 1f, 1f); + } + + @Override + protected 
CoderResult decodeLoop(ByteBuffer in, CharBuffer out) { + while (in.hasRemaining() && out.hasRemaining()) { + char ch = (char) (in.get() & 0xff); + if (ch >= 0xA0) { + ch = TABLE[ch - 0xA0]; + } + out.put(ch); + } + return in.hasRemaining() ? CoderResult.OVERFLOW : CoderResult.UNDERFLOW; + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java new file mode 100644 index 00000000000..7d36c81e4ae --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java @@ -0,0 +1,288 @@ +package org.apache.lucene.analysis.hunspell2; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.Version; + +/** + * Stemmer uses the affix rules declared in the Dictionary to generate one or more stems for a word. It + * conforms to the algorithm in the original hunspell algorithm, including recursive suffix stripping. 
+ */ +final class Stemmer { + private final int recursionCap; + private final Dictionary dictionary; + private BytesRef scratch = new BytesRef(); + private final StringBuilder segment = new StringBuilder(); + + /** + * Constructs a new Stemmer which will use the provided Dictionary to create its stems. Uses the + * default recursion cap of 2 (based on Hunspell documentation). + * + * @param dictionary Dictionary that will be used to create the stems + */ + public Stemmer(Dictionary dictionary) { + this(dictionary, 2); + } + + /** + * Constructs a new Stemmer which will use the provided Dictionary to create its stems. + * + * @param dictionary Dictionary that will be used to create the stems + * @param recursionCap maximum level of recursion stemmer can go into + */ + public Stemmer(Dictionary dictionary, int recursionCap) { + this.dictionary = dictionary; + this.recursionCap = recursionCap; + } + + /** + * Find the stem(s) of the provided word. + * + * @param word Word to find the stems for + * @return List of stems for the word + */ + public List stem(String word) { + return stem(word.toCharArray(), word.length()); + } + + /** + * Find the stem(s) of the provided word + * + * @param word Buffer holding the word to find the stems for; only the first {@code length} chars are read + * @return List of stems for the word + */ + public List stem(char word[], int length) { + List stems = new ArrayList(); + if (dictionary.lookupWord(word, 0, length, scratch) != null) { + stems.add(new Stem(word, length)); + } + stems.addAll(stem(word, length, null, 0)); + return stems; + } + + /** + * Find the unique stem(s) of the provided word; the word itself is keyed by its first {@code length} chars only + * + * @param word Buffer holding the word to find the stems for; it may be longer than {@code length} + * @return List of stems for the word + */ + public List uniqueStems(char word[], int length) { + List stems = new ArrayList(); + CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, false); + if (dictionary.lookupWord(word, 0, length, scratch) != null) { + stems.add(new Stem(word, length)); + terms.add(new String(word, 0, length)); + } + List otherStems = stem(word, length,
null, 0); + for (Stem s : otherStems) { + if (!terms.contains(s.stem)) { + stems.add(s); + terms.add(s.stem); + } + } + return stems; + } + + // ================================================= Helper Methods ================================================ + + /** + * Generates a list of stems for the provided word + * + * @param word Word to generate the stems for + * @param flags Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step + * @param recursionDepth Level of recursion this stemming step is at + * @return List of stems, or empty list if no stems are found + */ + private List stem(char word[], int length, char[] flags, int recursionDepth) { + List stems = new ArrayList(); + + for (int i = 0; i < length; i++) { + List suffixes = dictionary.lookupSuffix(word, i, length - i); + if (suffixes == null) { + continue; + } + + for (Affix suffix : suffixes) { + if (hasCrossCheckedFlag(suffix.getFlag(), flags)) { + int deAffixedLength = length - suffix.getAppend().length(); + // TODO: can we do this in-place? 
+ String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString(); + + List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth); + for (Stem stem : stemList) { + stem.addSuffix(suffix); + } + + stems.addAll(stemList); + } + } + } + + for (int i = length - 1; i >= 0; i--) { + List prefixes = dictionary.lookupPrefix(word, 0, i); + if (prefixes == null) { + continue; + } + + for (Affix prefix : prefixes) { + if (hasCrossCheckedFlag(prefix.getFlag(), flags)) { + int deAffixedStart = prefix.getAppend().length(); + int deAffixedLength = length - deAffixedStart; + + String strippedWord = new StringBuilder().append(prefix.getStrip()) + .append(word, deAffixedStart, deAffixedLength) + .toString(); + + List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth); + for (Stem stem : stemList) { + stem.addPrefix(prefix); + } + + stems.addAll(stemList); + } + } + } + + return stems; + } + + /** + * Applies the affix rule to the given word, producing a list of stems if any are found + * + * @param strippedWord Word the affix has been removed and the strip added + * @param affix HunspellAffix representing the affix rule itself + * @param recursionDepth Level of recursion this stemming step is at + * @return List of stems for the word, or an empty list if none are found + */ + public List applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) { + segment.setLength(0); + segment.append(strippedWord, 0, length); + if (!affix.checkCondition(segment)) { + return Collections.emptyList(); + } + + List stems = new ArrayList(); + + char wordFlags[] = dictionary.lookupWord(strippedWord, 0, length, scratch); + if (wordFlags != null && Dictionary.hasFlag(wordFlags, affix.getFlag())) { + stems.add(new Stem(strippedWord, length)); + } + + if (affix.isCrossProduct() && recursionDepth < recursionCap) { + stems.addAll(stem(strippedWord, 
length, affix.getAppendFlags(), ++recursionDepth)); + } + + return stems; + } + + /** + * Checks if the given flag cross checks with the given array of flags + * + * @param flag Flag to cross check with the array of flags + * @param flags Array of flags to cross check against. Can be {@code null} + * @return {@code true} if the flag is found in the array or the array is {@code null}, {@code false} otherwise + */ + private boolean hasCrossCheckedFlag(char flag, char[] flags) { + return flags == null || Arrays.binarySearch(flags, flag) >= 0; + } + + /** + * Stem represents all information known about a stem of a word. This includes the stem, and the prefixes and suffixes + * that were used to change the word into the stem. + */ + public static class Stem { + + private final List prefixes = new ArrayList(); + private final List suffixes = new ArrayList(); + private final char stem[]; + private final int stemLength; + + /** + * Creates a new Stem wrapping the given word stem + * + * @param stem Stem of a word + */ + public Stem(char stem[], int stemLength) { + this.stem = stem; + this.stemLength = stemLength; + } + + /** + * Adds a prefix to the list of prefixes used to generate this stem. Because it is assumed that prefixes are added + * depth first, the prefix is added to the front of the list + * + * @param prefix Prefix to add to the list of prefixes for this stem + */ + public void addPrefix(Affix prefix) { + prefixes.add(0, prefix); + } + + /** + * Adds a suffix to the list of suffixes used to generate this stem. 
Because it is assumed that suffixes are added + * depth first, the suffix is added to the end of the list + * + * @param suffix Suffix to add to the list of suffixes for this stem + */ + public void addSuffix(Affix suffix) { + suffixes.add(suffix); + } + + /** + * Returns the list of prefixes used to generate the stem + * + * @return List of prefixes used to generate the stem or an empty list if no prefixes were required + */ + public List getPrefixes() { + return prefixes; + } + + /** + * Returns the list of suffixes used to generate the stem + * + * @return List of suffixes used to generate the stem or an empty list if no suffixes were required + */ + public List getSuffixes() { + return suffixes; + } + + /** + * Returns the text of the word's stem. + * @see #getStemLength() + */ + public char[] getStem() { + return stem; + } + + /** Returns the valid length of the text in {@link #getStem()} */ + public int getStemLength() { + return stemLength; + } + + /** Only use this if you really need a string (e.g. for testing) */ + public String getStemString() { + return new String(stem, 0, stemLength); + } + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html new file mode 100644 index 00000000000..196591969e8 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html @@ -0,0 +1,26 @@ + + + +Stemming TokenFilter using a Java implementation of the +Hunspell stemming algorithm. +

+Dictionaries can be found on +OpenOffice's wiki +

+ + diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory index 04fc80cf59c..e4ca7c6802c 100644 --- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory +++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory @@ -51,6 +51,7 @@ org.apache.lucene.analysis.hi.HindiNormalizationFilterFactory org.apache.lucene.analysis.hi.HindiStemFilterFactory org.apache.lucene.analysis.hu.HungarianLightStemFilterFactory org.apache.lucene.analysis.hunspell.HunspellStemFilterFactory +org.apache.lucene.analysis.hunspell2.Hunspell2StemFilterFactory org.apache.lucene.analysis.id.IndonesianStemFilterFactory org.apache.lucene.analysis.in.IndicNormalizationFilterFactory org.apache.lucene.analysis.it.ItalianLightStemFilterFactory diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java new file mode 100644 index 00000000000..02ccedb9be7 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java @@ -0,0 +1,205 @@ +package org.apache.lucene.analysis.hunspell2; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.File; +import java.io.InputStream; +import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; + +import org.apache.lucene.analysis.hunspell.HunspellDictionary; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.RamUsageEstimator; + +/** + * Can be retrieved via: + * wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/ + * Note some of the files differ only in case. This may be a problem on your operating system! + */ +//@Ignore("enable manually") +public class TestAllDictionaries extends LuceneTestCase { + + // set this to the location of where you downloaded all the files + static final File DICTIONARY_HOME = + new File("/Users/rmuir/hunspell/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries"); + + final String tests[] = { + /* zip file */ /* dictionary */ /* affix */ + "af_ZA.zip", "af_ZA.dic", "af_ZA.aff", + "ak_GH.zip", "ak_GH.dic", "ak_GH.aff", + "bg_BG.zip", "bg_BG.dic", "bg_BG.aff", + "ca_ANY.zip", "catalan.dic", "catalan.aff", + "ca_ES.zip", "ca_ES.dic", "ca_ES.aff", + "cop_EG.zip", "cop_EG.dic", "cop_EG.aff", + "cs_CZ.zip", "cs_CZ.dic", "cs_CZ.aff", + "cy_GB.zip", "cy_GB.dic", "cy_GB.aff", + "da_DK.zip", "da_DK.dic", "da_DK.aff", + "de_AT.zip", "de_AT.dic", "de_AT.aff", + "de_CH.zip", "de_CH.dic", "de_CH.aff", + "de_DE.zip", "de_DE.dic", "de_DE.aff", + "de_DE_comb.zip", "de_DE_comb.dic", "de_DE_comb.aff", + "de_DE_frami.zip", "de_DE_frami.dic", "de_DE_frami.aff", + 
"de_DE_neu.zip", "de_DE_neu.dic", "de_DE_neu.aff", + "el_GR.zip", "el_GR.dic", "el_GR.aff", + "en_AU.zip", "en_AU.dic", "en_AU.aff", + "en_CA.zip", "en_CA.dic", "en_CA.aff", + "en_GB-oed.zip", "en_GB-oed.dic", "en_GB-oed.aff", + "en_GB.zip", "en_GB.dic", "en_GB.aff", + "en_NZ.zip", "en_NZ.dic", "en_NZ.aff", + "eo.zip", "eo_l3.dic", "eo_l3.aff", + "eo_EO.zip", "eo_EO.dic", "eo_EO.aff", + "es_AR.zip", "es_AR.dic", "es_AR.aff", + "es_BO.zip", "es_BO.dic", "es_BO.aff", + "es_CL.zip", "es_CL.dic", "es_CL.aff", + "es_CO.zip", "es_CO.dic", "es_CO.aff", + "es_CR.zip", "es_CR.dic", "es_CR.aff", + "es_CU.zip", "es_CU.dic", "es_CU.aff", + "es_DO.zip", "es_DO.dic", "es_DO.aff", + "es_EC.zip", "es_EC.dic", "es_EC.aff", + "es_ES.zip", "es_ES.dic", "es_ES.aff", + "es_GT.zip", "es_GT.dic", "es_GT.aff", + "es_HN.zip", "es_HN.dic", "es_HN.aff", + "es_MX.zip", "es_MX.dic", "es_MX.aff", + "es_NEW.zip", "es_NEW.dic", "es_NEW.aff", + "es_NI.zip", "es_NI.dic", "es_NI.aff", + "es_PA.zip", "es_PA.dic", "es_PA.aff", + "es_PE.zip", "es_PE.dic", "es_PE.aff", + "es_PR.zip", "es_PR.dic", "es_PR.aff", + "es_PY.zip", "es_PY.dic", "es_PY.aff", + "es_SV.zip", "es_SV.dic", "es_SV.aff", + "es_UY.zip", "es_UY.dic", "es_UY.aff", + "es_VE.zip", "es_VE.dic", "es_VE.aff", + "et_EE.zip", "et_EE.dic", "et_EE.aff", + "fo_FO.zip", "fo_FO.dic", "fo_FO.aff", + "fr_FR-1990_1-3-2.zip", "fr_FR-1990.dic", "fr_FR-1990.aff", + "fr_FR-classique_1-3-2.zip", "fr_FR-classique.dic", "fr_FR-classique.aff", + "fr_FR_1-3-2.zip", "fr_FR.dic", "fr_FR.aff", + "fy_NL.zip", "fy_NL.dic", "fy_NL.aff", + "ga_IE.zip", "ga_IE.dic", "ga_IE.aff", + "gd_GB.zip", "gd_GB.dic", "gd_GB.aff", + "gl_ES.zip", "gl_ES.dic", "gl_ES.aff", + "gsc_FR.zip", "gsc_FR.dic", "gsc_FR.aff", + "gu_IN.zip", "gu_IN.dic", "gu_IN.aff", + "he_IL.zip", "he_IL.dic", "he_IL.aff", + "hi_IN.zip", "hi_IN.dic", "hi_IN.aff", + "hil_PH.zip", "hil_PH.dic", "hil_PH.aff", + "hr_HR.zip", "hr_HR.dic", "hr_HR.aff", + "hu_HU.zip", "hu_HU.dic", "hu_HU.aff", + "hu_HU_comb.zip", 
"hu_HU.dic", "hu_HU.aff", + "ia.zip", "ia.dic", "ia.aff", + "id_ID.zip", "id_ID.dic", "id_ID.aff", + "it_IT.zip", "it_IT.dic", "it_IT.aff", + "ku_TR.zip", "ku_TR.dic", "ku_TR.aff", + "la.zip", "la.dic", "la.aff", + "lt_LT.zip", "lt_LT.dic", "lt_LT.aff", + "lv_LV.zip", "lv_LV.dic", "lv_LV.aff", + "mg_MG.zip", "mg_MG.dic", "mg_MG.aff", + "mi_NZ.zip", "mi_NZ.dic", "mi_NZ.aff", + "mk_MK.zip", "mk_MK.dic", "mk_MK.aff", + "mos_BF.zip", "mos_BF.dic", "mos_BF.aff", + "mr_IN.zip", "mr_IN.dic", "mr_IN.aff", + "ms_MY.zip", "ms_MY.dic", "ms_MY.aff", + "nb_NO.zip", "nb_NO.dic", "nb_NO.aff", + "ne_NP.zip", "ne_NP.dic", "ne_NP.aff", + "nl_NL.zip", "nl_NL.dic", "nl_NL.aff", + "nl_med.zip", "nl_med.dic", "nl_med.aff", + "nn_NO.zip", "nn_NO.dic", "nn_NO.aff", + "nr_ZA.zip", "nr_ZA.dic", "nr_ZA.aff", + "ns_ZA.zip", "ns_ZA.dic", "ns_ZA.aff", + "ny_MW.zip", "ny_MW.dic", "ny_MW.aff", + "oc_FR.zip", "oc_FR.dic", "oc_FR.aff", + "pl_PL.zip", "pl_PL.dic", "pl_PL.aff", + "pt_BR.zip", "pt_BR.dic", "pt_BR.aff", + "pt_PT.zip", "pt_PT.dic", "pt_PT.aff", + "ro_RO.zip", "ro_RO.dic", "ro_RO.aff", + "ru_RU.zip", "ru_RU.dic", "ru_RU.aff", + "ru_RU_ye.zip", "ru_RU_ie.dic", "ru_RU_ie.aff", + "ru_RU_yo.zip", "ru_RU_yo.dic", "ru_RU_yo.aff", + "rw_RW.zip", "rw_RW.dic", "rw_RW.aff", + "sk_SK.zip", "sk_SK.dic", "sk_SK.aff", + "sl_SI.zip", "sl_SI.dic", "sl_SI.aff", + "sq_AL.zip", "sq_AL.dic", "sq_AL.aff", + "ss_ZA.zip", "ss_ZA.dic", "ss_ZA.aff", + "st_ZA.zip", "st_ZA.dic", "st_ZA.aff", + "sv_SE.zip", "sv_SE.dic", "sv_SE.aff", + "sw_KE.zip", "sw_KE.dic", "sw_KE.aff", + "tet_ID.zip", "tet_ID.dic", "tet_ID.aff", + "th_TH.zip", "th_TH.dic", "th_TH.aff", + "tl_PH.zip", "tl_PH.dic", "tl_PH.aff", + "tn_ZA.zip", "tn_ZA.dic", "tn_ZA.aff", + "ts_ZA.zip", "ts_ZA.dic", "ts_ZA.aff", + "uk_UA.zip", "uk_UA.dic", "uk_UA.aff", + "ve_ZA.zip", "ve_ZA.dic", "ve_ZA.aff", + "vi_VN.zip", "vi_VN.dic", "vi_VN.aff", + "xh_ZA.zip", "xh_ZA.dic", "xh_ZA.aff", + "zu_ZA.zip", "zu_ZA.dic", "zu_ZA.aff", + }; + + public void test() throws 
Exception { + for (int i = 0; i < tests.length; i += 3) { + File f = new File(DICTIONARY_HOME, tests[i]); + assert f.exists(); + + try (ZipFile zip = new ZipFile(f, IOUtils.CHARSET_UTF_8)) { + ZipEntry dicEntry = zip.getEntry(tests[i+1]); + assert dicEntry != null; + ZipEntry affEntry = zip.getEntry(tests[i+2]); + assert affEntry != null; + + // get ram from previous impl + String oldRAM = "FAIL"; + try (InputStream dictionary = zip.getInputStream(dicEntry); + InputStream affix = zip.getInputStream(affEntry)) { + try { + HunspellDictionary dic = new HunspellDictionary(affix, dictionary, TEST_VERSION_CURRENT); + oldRAM = RamUsageEstimator.humanSizeOf(dic); + } catch (Throwable t) {} + } + + try (InputStream dictionary = zip.getInputStream(dicEntry); + InputStream affix = zip.getInputStream(affEntry)) { + Dictionary dic = new Dictionary(affix, dictionary); + System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic)); + } + } + } + } + + public void testOneDictionary() throws Exception { + String toTest = "hu_HU.zip"; + for (int i = 0; i < tests.length; i += 3) { /* tests holds (zip, dictionary, affix) triples, so only every third slot is a zip name */ + if (tests[i].equals(toTest)) { + File f = new File(DICTIONARY_HOME, tests[i]); + assert f.exists(); + + try (ZipFile zip = new ZipFile(f, IOUtils.CHARSET_UTF_8)) { + ZipEntry dicEntry = zip.getEntry(tests[i+1]); + assert dicEntry != null; + ZipEntry affEntry = zip.getEntry(tests[i+2]); + assert affEntry != null; + + try (InputStream dictionary = zip.getInputStream(dicEntry); + InputStream affix = zip.getInputStream(affEntry)) { + Dictionary dic = new Dictionary(affix, dictionary); + } + } + } + } + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java new file mode 100644 index 00000000000..14c6e8967d0 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java @@ -0,0 +1,109 @@ +package
org.apache.lucene.analysis.hunspell2; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.FilterInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.text.ParseException; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; + +public class TestDictionary extends LuceneTestCase { + + public void testSimpleDictionary() throws Exception { + InputStream affixStream = getClass().getResourceAsStream("simple.aff"); + InputStream dictStream = getClass().getResourceAsStream("simple.dic"); + + Dictionary dictionary = new Dictionary(affixStream, dictStream); + assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size()); + assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size()); + char flags[] = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef()); + assertNotNull(flags); + assertEquals(1, flags.length); + assertEquals("Wrong number of flags for lucen", 1, dictionary.lookupWord(new char[]{'l', 'u', 'c', 'e', 'n'}, 0, 5, new BytesRef()).length); + + affixStream.close(); + dictStream.close(); + } + + public void testCompressedDictionary() throws Exception { + InputStream affixStream = 
getClass().getResourceAsStream("compressed.aff"); + InputStream dictStream = getClass().getResourceAsStream("compressed.dic"); + + Dictionary dictionary = new Dictionary(affixStream, dictStream); + assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size()); + assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size()); + assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef()).length); + + affixStream.close(); + dictStream.close(); + } + + // malformed rule causes ParseException + public void testInvalidData() throws Exception { + InputStream affixStream = getClass().getResourceAsStream("broken.aff"); + InputStream dictStream = getClass().getResourceAsStream("simple.dic"); + + try { + new Dictionary(affixStream, dictStream); + fail("didn't get expected exception"); + } catch (ParseException expected) { + assertEquals("The affix file contains a rule with less than five elements", expected.getMessage()); + assertEquals(23, expected.getErrorOffset()); + } + + affixStream.close(); + dictStream.close(); + } + + private class CloseCheckInputStream extends FilterInputStream { + private boolean closed = false; + + public CloseCheckInputStream(InputStream delegate) { + super(delegate); + } + + @Override + public void close() throws IOException { + this.closed = true; + super.close(); + } + + public boolean isClosed() { + return this.closed; + } + } + + public void testResourceCleanup() throws Exception { + CloseCheckInputStream affixStream = new CloseCheckInputStream(getClass().getResourceAsStream("compressed.aff")); + CloseCheckInputStream dictStream = new CloseCheckInputStream(getClass().getResourceAsStream("compressed.dic")); + + new Dictionary(affixStream, dictStream); + + assertFalse(affixStream.isClosed()); + assertFalse(dictStream.isClosed()); + + affixStream.close(); + dictStream.close(); + + assertTrue(affixStream.isClosed()); + assertTrue(dictStream.isClosed()); + } +} diff --git 
a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java new file mode 100644 index 00000000000..eafb1f272cf --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java @@ -0,0 +1,87 @@ +package org.apache.lucene.analysis.hunspell2; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; +import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.util.TestUtil; +import org.junit.AfterClass; +import org.junit.BeforeClass; + +public class TestHunspell2StemFilter extends BaseTokenStreamTestCase { + private static Dictionary dictionary; + + @BeforeClass + public static void beforeClass() throws Exception { + try (InputStream affixStream = TestStemmer.class.getResourceAsStream("simple.aff"); + InputStream dictStream = TestStemmer.class.getResourceAsStream("simple.dic")) { + dictionary = new Dictionary(affixStream, dictStream); + } + } + + @AfterClass + public static void afterClass() { + dictionary = null; + } + + /** Simple test for KeywordAttribute */ + public void testKeywordAttribute() throws IOException { + MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome"); + tokenizer.setEnableChecks(true); + Hunspell2StemFilter filter = new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)); + assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1}); + + // assert with keyword marker + tokenizer = whitespaceMockTokenizer("lucene is awesome"); + CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true); + filter = new Hunspell2StemFilter(new SetKeywordMarkerFilter(tokenizer, set), dictionary, TestUtil.nextInt(random(), 1, 3)); + assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1}); + } + + /** blast some random strings through the analyzer */ + public void 
testRandomStrings() throws Exception { + Analyzer analyzer = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3))); + } + }; + checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER); + } + + public void testEmptyTerm() throws IOException { + Analyzer a = new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tokenizer = new KeywordTokenizer(); + return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3))); + } + }; + checkOneTerm(a, "", ""); + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java new file mode 100644 index 00000000000..d95e2be04b6 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java @@ -0,0 +1,50 @@ +package org.apache.lucene.analysis.hunspell2; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Reader; +import java.io.StringReader; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase; + +/** + * Simple tests to ensure the Hunspell stemmer loads from factory + */ +public class TestHunspell2StemFilterFactory extends BaseTokenStreamFactoryTestCase { + public void testStemming() throws Exception { + Reader reader = new StringReader("abc"); + TokenStream stream = whitespaceMockTokenizer(reader); + stream = tokenFilterFactory("Hunspell2Stem", + "dictionary", "simple.dic", + "affix", "simple.aff").create(stream); + assertTokenStreamContents(stream, new String[] { "ab" }); + } + + /** Test that bogus arguments result in exception */ + public void testBogusArguments() throws Exception { + try { + tokenFilterFactory("Hunspell2Stem", + "dictionary", "simple.dic", + "bogusArg", "bogusValue"); + fail(); + } catch (IllegalArgumentException expected) { + assertTrue(expected.getMessage().contains("Unknown parameters")); + } + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java new file mode 100644 index 00000000000..ea98f65256f --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java @@ -0,0 +1,105 @@ +package org.apache.lucene.analysis.hunspell2; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.hunspell2.Stemmer.Stem; +import org.apache.lucene.util.LuceneTestCase; +import org.junit.AfterClass; +import org.junit.BeforeClass; + +import java.io.InputStream; +import java.util.Arrays; +import java.util.List; + +public class TestStemmer extends LuceneTestCase { + private static Stemmer stemmer; + + @BeforeClass + public static void beforeClass() throws Exception { + try (InputStream affixStream = TestStemmer.class.getResourceAsStream("simple.aff"); + InputStream dictStream = TestStemmer.class.getResourceAsStream("simple.dic")) { + Dictionary dictionary = new Dictionary(affixStream, dictStream); + stemmer = new Stemmer(dictionary); + } + } + + @AfterClass + public static void afterClass() { + stemmer = null; + } + + public void testSimpleSuffix() { + assertStemsTo("lucene", "lucene", "lucen"); + assertStemsTo("mahoute", "mahout"); + } + + public void testSimplePrefix() { + assertStemsTo("solr", "olr"); + } + + public void testRecursiveSuffix() { + assertStemsTo("abcd", "ab"); + } + + // all forms unmunched from dictionary + public void testAllStems() { + assertStemsTo("ab", "ab"); + assertStemsTo("abc", "ab"); + assertStemsTo("apach", "apach"); + assertStemsTo("apache", "apach"); + assertStemsTo("foo", "foo"); + assertStemsTo("food", "foo"); + assertStemsTo("foos", "foo"); + assertStemsTo("lucen", "lucen"); + assertStemsTo("lucene", "lucen", "lucene"); + assertStemsTo("mahout", "mahout"); + assertStemsTo("mahoute", "mahout"); + assertStemsTo("moo", "moo"); + assertStemsTo("mood", "moo"); + 
assertStemsTo("olr", "olr"); + assertStemsTo("solr", "olr"); + } + + // some bogus stuff that should not stem (empty lists)! + public void testBogusStems() { + assertStemsTo("abs"); + assertStemsTo("abe"); + assertStemsTo("sab"); + assertStemsTo("sapach"); + assertStemsTo("sapache"); + assertStemsTo("apachee"); + assertStemsTo("sfoo"); + assertStemsTo("sfoos"); + assertStemsTo("fooss"); + assertStemsTo("lucenee"); + assertStemsTo("solre"); + } + + private void assertStemsTo(String s, String... expected) { + Arrays.sort(expected); + + List stems = stemmer.stem(s); + String actual[] = new String[stems.size()]; + for (int i = 0; i < actual.length; i++) { + actual[i] = stems.get(i).getStemString(); + } + Arrays.sort(actual); + + assertArrayEquals(expected, actual); + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/broken.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/broken.aff new file mode 100644 index 00000000000..3b780cd1d7b --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/broken.aff @@ -0,0 +1,24 @@ +SET UTF-8 +TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ + +SFX A Y 3 +SFX A 0 e n +SFX A 0 e t +SFX A 0 e h + +SFX C Y 2 +SFX C 0 d/C c +SFX C 0 c b + +SFX D Y 1 +SFX D 0 s o + +SFX E Y 1 +SFX E 0 d o + +PFX B Y 1 +PFX B 0 s o + +#wrong rule (only 4 elements) +PFX A0 Y 1 +PFX A0 0 a \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.aff new file mode 100644 index 00000000000..e4a1b37300f --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.aff @@ -0,0 +1,29 @@ +SET UTF-8 +TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ + +FLAG long + +AF 5 +AF AA +AF BB +AF CC +AF DD +AF EE + +SFX AA Y 3 +SFX AA 0 e n +SFX AA 0 e t +SFX AA 0 e h + +SFX CC Y 2 
+SFX CC 0 d/3 c +SFX CC 0 c b + +SFX DD Y 1 +SFX DD 0 s o + +SFX EE Y 1 +SFX EE 0 d o + +PFX BB Y 1 +PFX BB 0 s o diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.dic new file mode 100644 index 00000000000..dd3890fae31 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.dic @@ -0,0 +1,9 @@ +6 +ab/3 +apach/1 +foo/4 +foo/5 +lucen/1 +lucene +mahout/1 +olr/2 diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.aff new file mode 100644 index 00000000000..db9423dcad1 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.aff @@ -0,0 +1,20 @@ +SET UTF-8 +TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ + +SFX A Y 3 +SFX A 0 e n +SFX A 0 e t +SFX A 0 e h + +SFX C Y 2 +SFX C 0 d/C c +SFX C 0 c b + +SFX D Y 1 +SFX D 0 s o + +SFX E Y 1 +SFX E 0 d o + +PFX B Y 1 +PFX B 0 s o \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.dic new file mode 100644 index 00000000000..f7bbab3ba67 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.dic @@ -0,0 +1,10 @@ +9 +ab/C +apach/A +foo/D +foo/E +lucen/A +lucene +mahout/A +moo/E +olr/B From e541984b62bdc3acc5947f761d40371c75b38de3 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 24 Feb 2014 06:04:28 +0000 Subject: [PATCH 03/17] remove treemap (TODO: refactor the sorter and use that) git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571154 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/hunspell2/Dictionary.java | 90 +++++++++++++------ 1 file changed, 62 insertions(+), 
28 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java index a7b9a58f080..c8068aa41b9 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java @@ -35,11 +35,11 @@ import java.text.ParseException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; -import java.util.TreeMap; /** * In-memory structure for the dictionary (.dic) and affix (.aff) @@ -93,16 +93,10 @@ public class Dictionary { buffered.reset(); CharsetDecoder decoder = getJavaEncoding(encoding); readAffixFile(buffered, decoder); - TreeMap tempWords = new TreeMap(); flagLookup.add(new BytesRef()); // no flags -> ord 0 - readDictionaryFile(dictionary, decoder, tempWords); PositiveIntOutputs o = PositiveIntOutputs.getSingleton(); - Builder b = new Builder(FST.INPUT_TYPE.BYTE4, o); // nocommit: byte4 - IntsRef scratchInts = new IntsRef(); - for (Map.Entry e : tempWords.entrySet()) { - UnicodeUtil.UTF8toUTF32(e.getKey(), scratchInts); - b.add(scratchInts, e.getValue().longValue()); - } + Builder b = new Builder(FST.INPUT_TYPE.BYTE4, o); + readDictionaryFile(dictionary, decoder, b); words = b.finish(); } @@ -366,20 +360,51 @@ public class Dictionary { * @param decoder CharsetDecoder used to decode the contents of the file * @throws IOException Can be thrown while reading from the file */ - private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder, TreeMap words) throws IOException { + private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder, Builder words) throws IOException { BytesRef flagsScratch = new BytesRef(); - BytesRef flagsScratch2 = 
new BytesRef(); + IntsRef scratchInts = new IntsRef(); BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder)); // TODO: don't create millions of strings. - String line = reader.readLine(); // first line is number of entries + String line = reader.readLine(); // first line is number of entries (approximately, sometimes) // sometimes the number of entries has a comment/copyright after it line = line.replaceFirst("\\s*\\#.*$", ""); int numEntries = Integer.parseInt(line); + String lines[] = new String[numEntries]; + int upto = 0; + while ((line = reader.readLine()) != null) { + if (upto == lines.length) { + lines = Arrays.copyOf(lines, (int)(lines.length * 1.25)); + } + lines[upto++] = line; + } + + // TODO: just replace this with offline sorter? + Arrays.sort(lines, 0, upto, new Comparator() { + @Override + public int compare(String o1, String o2) { + int sep1 = o1.lastIndexOf('/'); + if (sep1 >= 0) { + o1 = o1.substring(0, sep1); + } + + int sep2 = o2.lastIndexOf('/'); + if (sep2 >= 0) { + o2 = o2.substring(0, sep2); + } + return o1.compareTo(o2); + } + }); + // TODO: the flags themselves can be double-chars (long) or also numeric // either way the trick is to encode them as char... 
but they must be parsed differently - while ((line = reader.readLine()) != null) { + + BytesRef currentEntry = new BytesRef(); + char currentFlags[] = new char[0]; + + for (int i = 0; i < upto; i++) { + line = lines[i]; String entry; char wordForm[]; @@ -405,24 +430,33 @@ public class Dictionary { } BytesRef scratch = new BytesRef(entry); - Integer existingOrd = words.get(scratch); - final char mergedEntries[]; - if (existingOrd == null || existingOrd == 0) { - mergedEntries = wordForm; + int cmp = scratch.compareTo(currentEntry); + if (cmp < 0) { + throw new IllegalArgumentException("out of order: " + scratch.utf8ToString() + " < " + currentEntry.utf8ToString()); + } else if (cmp == 0) { + currentFlags = merge(currentFlags, wordForm); } else { - flagLookup.get(existingOrd, flagsScratch2); - mergedEntries = merge(decodeFlags(flagsScratch2), wordForm); + final int hashCode = encodeFlagsWithHash(flagsScratch, currentFlags); + int ord = flagLookup.add(flagsScratch, hashCode); + if (ord < 0) { + // already exists in our hash + ord = (-ord)-1; + } + UnicodeUtil.UTF8toUTF32(currentEntry, scratchInts); + words.add(scratchInts, (long)ord); + currentEntry = scratch; + currentFlags = wordForm; } - - final int hashCode = encodeFlagsWithHash(flagsScratch, mergedEntries); - int ord = flagLookup.add(flagsScratch, hashCode); - if (ord < 0) { - // already exists in our hash - ord = (-ord)-1; - } - - words.put(scratch, ord); } + + final int hashCode = encodeFlagsWithHash(flagsScratch, currentFlags); + int ord = flagLookup.add(flagsScratch, hashCode); + if (ord < 0) { + // already exists in our hash + ord = (-ord)-1; + } + UnicodeUtil.UTF8toUTF32(currentEntry, scratchInts); + words.add(scratchInts, (long)ord); } static char[] decodeFlags(BytesRef b) { From ad20d99b3571181504d956a5056c449de5968afd Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 24 Feb 2014 14:53:21 +0000 Subject: [PATCH 04/17] break out this class git-svn-id: 
https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571305 13f79535-47bb-0310-9956-ffa450edef68 --- .../hunspell2/Hunspell2StemFilter.java | 1 - .../lucene/analysis/hunspell2/Stem.java | 98 +++++++++++++++++++ .../lucene/analysis/hunspell2/Stemmer.java | 78 --------------- .../hunspell2/TestAllDictionaries.java | 3 +- .../analysis/hunspell2/TestStemmer.java | 1 - 5 files changed, 100 insertions(+), 81 deletions(-) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stem.java diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java index f9dfb770ab2..45941345342 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java @@ -22,7 +22,6 @@ import java.util.List; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.hunspell2.Stemmer.Stem; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stem.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stem.java new file mode 100644 index 00000000000..d3c8d4c86ab --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stem.java @@ -0,0 +1,98 @@ +package org.apache.lucene.analysis.hunspell2; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.ArrayList; +import java.util.List; + +/** + * Stem represents all information known about a stem of a word. This includes the stem, and the prefixes and suffixes + * that were used to change the word into the stem. + */ +final class Stem { + final List prefixes = new ArrayList(); + final List suffixes = new ArrayList(); + final char stem[]; + final int stemLength; + + /** + * Creates a new Stem wrapping the given word stem + * + * @param stem Stem of a word + */ + public Stem(char stem[], int stemLength) { + this.stem = stem; + this.stemLength = stemLength; + } + + /** + * Adds a prefix to the list of prefixes used to generate this stem. Because it is assumed that prefixes are added + * depth first, the prefix is added to the front of the list + * + * @param prefix Prefix to add to the list of prefixes for this stem + */ + public void addPrefix(Affix prefix) { + prefixes.add(0, prefix); + } + + /** + * Adds a suffix to the list of suffixes used to generate this stem. 
Because it is assumed that suffixes are added + * depth first, the suffix is added to the end of the list + * + * @param suffix Suffix to add to the list of suffixes for this stem + */ + public void addSuffix(Affix suffix) { + suffixes.add(suffix); + } + + /** + * Returns the list of prefixes used to generate the stem + * + * @return List of prefixes used to generate the stem or an empty list if no prefixes were required + */ + public List getPrefixes() { + return prefixes; + } + + /** + * Returns the list of suffixes used to generate the stem + * + * @return List of suffixes used to generate the stem or an empty list if no suffixes were required + */ + public List getSuffixes() { + return suffixes; + } + + /** + * Returns the text of the word's stem. + * @see #getStemLength() + */ + public char[] getStem() { + return stem; + } + + /** Returns the valid length of the text in {@link #getStem()} */ + public int getStemLength() { + return stemLength; + } + + /** Only use this if you really need a string (e.g. for testing) */ + public String getStemString() { + return new String(stem, 0, stemLength); + } +} \ No newline at end of file diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java index 7d36c81e4ae..aa00836d6fe 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java @@ -207,82 +207,4 @@ final class Stemmer { private boolean hasCrossCheckedFlag(char flag, char[] flags) { return flags == null || Arrays.binarySearch(flags, flag) >= 0; } - - /** - * Stem represents all information known about a stem of a word. This includes the stem, and the prefixes and suffixes - * that were used to change the word into the stem. 
- */ - public static class Stem { - - private final List prefixes = new ArrayList(); - private final List suffixes = new ArrayList(); - private final char stem[]; - private final int stemLength; - - /** - * Creates a new Stem wrapping the given word stem - * - * @param stem Stem of a word - */ - public Stem(char stem[], int stemLength) { - this.stem = stem; - this.stemLength = stemLength; - } - - /** - * Adds a prefix to the list of prefixes used to generate this stem. Because it is assumed that prefixes are added - * depth first, the prefix is added to the front of the list - * - * @param prefix Prefix to add to the list of prefixes for this stem - */ - public void addPrefix(Affix prefix) { - prefixes.add(0, prefix); - } - - /** - * Adds a suffix to the list of suffixes used to generate this stem. Because it is assumed that suffixes are added - * depth first, the suffix is added to the end of the list - * - * @param suffix Suffix to add to the list of suffixes for this stem - */ - public void addSuffix(Affix suffix) { - suffixes.add(suffix); - } - - /** - * Returns the list of prefixes used to generate the stem - * - * @return List of prefixes used to generate the stem or an empty list if no prefixes were required - */ - public List getPrefixes() { - return prefixes; - } - - /** - * Returns the list of suffixes used to generate the stem - * - * @return List of suffixes used to generate the stem or an empty list if no suffixes were required - */ - public List getSuffixes() { - return suffixes; - } - - /** - * Returns the text of the word's stem. - * @see #getStemLength() - */ - public char[] getStem() { - return stem; - } - - /** Returns the valid length of the text in {@link #getStem()} */ - public int getStemLength() { - return stemLength; - } - - /** Only use this if you really need a string (e.g. 
for testing) */ - public String getStemString() { - return new String(stem, 0, stemLength); - } - } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java index 02ccedb9be7..ecb21b97a7c 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java @@ -26,13 +26,14 @@ import org.apache.lucene.analysis.hunspell.HunspellDictionary; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.RamUsageEstimator; +import org.junit.Ignore; /** * Can be retrieved via: * wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/ * Note some of the files differ only in case. This may be a problem on your operating system! */ -//@Ignore("enable manually") +@Ignore("enable manually") public class TestAllDictionaries extends LuceneTestCase { // set this to the location of where you downloaded all the files diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java index ea98f65256f..a8ac2a83fa9 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java @@ -17,7 +17,6 @@ package org.apache.lucene.analysis.hunspell2; * limitations under the License. 
*/ -import org.apache.lucene.analysis.hunspell2.Stemmer.Stem; import org.apache.lucene.util.LuceneTestCase; import org.junit.AfterClass; import org.junit.BeforeClass; From 7f6a40e15782d97dfedddcf6cc2b42f9c811654d Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 24 Feb 2014 15:45:07 +0000 Subject: [PATCH 05/17] LUCENE-5468: factor OfflineSorter out of suggest git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571321 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/lucene/util}/BytesRefArray.java | 2 +- .../apache/lucene/util/OfflineSorter.java} | 33 ++++++---- .../lucene/util}/TestBytesRefArray.java | 2 +- .../lucene/util/TestOfflineSorter.java} | 60 ++++++++++--------- .../search/suggest/BufferedInputIterator.java | 1 + .../lucene/search/suggest/InMemorySorter.java | 1 + .../search/suggest/SortedInputIterator.java | 15 ++--- .../suggest/analyzing/AnalyzingSuggester.java | 12 ++-- .../suggest/analyzing/FreeTextSuggester.java | 5 +- .../search/suggest/fst/ExternalRefSorter.java | 25 ++++---- .../suggest/fst/FSTCompletionLookup.java | 29 ++++----- .../suggest/fst/WFSTCompletionLookup.java | 2 +- .../suggest/fst/BytesRefSortersTest.java | 4 +- .../search/suggest/fst/LargeInputFST.java | 11 +++- 14 files changed, 114 insertions(+), 88 deletions(-) rename lucene/{suggest/src/java/org/apache/lucene/search/suggest => core/src/java/org/apache/lucene/util}/BytesRefArray.java (99%) rename lucene/{suggest/src/java/org/apache/lucene/search/suggest/Sort.java => core/src/java/org/apache/lucene/util/OfflineSorter.java} (95%) rename lucene/{suggest/src/test/org/apache/lucene/search/suggest => core/src/test/org/apache/lucene/util}/TestBytesRefArray.java (98%) rename lucene/{suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java => core/src/test/org/apache/lucene/util/TestOfflineSorter.java} (72%) diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/BytesRefArray.java 
b/lucene/core/src/java/org/apache/lucene/util/BytesRefArray.java similarity index 99% rename from lucene/suggest/src/java/org/apache/lucene/search/suggest/BytesRefArray.java rename to lucene/core/src/java/org/apache/lucene/util/BytesRefArray.java index e7a44fc37e0..eb0aa1a808e 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/BytesRefArray.java +++ b/lucene/core/src/java/org/apache/lucene/util/BytesRefArray.java @@ -1,4 +1,4 @@ -package org.apache.lucene.search.suggest; +package org.apache.lucene.util; /* * Licensed to the Apache Software Foundation (ASF) under one or more diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/Sort.java b/lucene/core/src/java/org/apache/lucene/util/OfflineSorter.java similarity index 95% rename from lucene/suggest/src/java/org/apache/lucene/search/suggest/Sort.java rename to lucene/core/src/java/org/apache/lucene/util/OfflineSorter.java index 8c6c20f1444..76781f8b8c7 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/Sort.java +++ b/lucene/core/src/java/org/apache/lucene/util/OfflineSorter.java @@ -1,4 +1,4 @@ -package org.apache.lucene.search.suggest; +package org.apache.lucene.util; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,11 +17,24 @@ package org.apache.lucene.search.suggest; * limitations under the License. 
*/ -import java.io.*; -import java.util.*; - -import org.apache.lucene.util.*; -import org.apache.lucene.util.PriorityQueue; +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.Closeable; +import java.io.DataInput; +import java.io.DataInputStream; +import java.io.DataOutput; +import java.io.DataOutputStream; +import java.io.EOFException; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.Locale; /** * On-disk sorting of byte arrays. Each byte array (entry) is a composed of the following @@ -35,7 +48,7 @@ import org.apache.lucene.util.PriorityQueue; * @lucene.experimental * @lucene.internal */ -public final class Sort { +public final class OfflineSorter { /** Convenience constant for megabytes */ public final static long MB = 1024 * 1024; /** Convenience constant for gigabytes */ @@ -170,7 +183,7 @@ public final class Sort { * @see #defaultTempDir() * @see BufferSize#automatic() */ - public Sort() throws IOException { + public OfflineSorter() throws IOException { this(DEFAULT_COMPARATOR, BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES); } @@ -180,14 +193,14 @@ public final class Sort { * @see #defaultTempDir() * @see BufferSize#automatic() */ - public Sort(Comparator comparator) throws IOException { + public OfflineSorter(Comparator comparator) throws IOException { this(comparator, BufferSize.automatic(), defaultTempDir(), MAX_TEMPFILES); } /** * All-details constructor. 
*/ - public Sort(Comparator comparator, BufferSize ramBufferSize, File tempDirectory, int maxTempfiles) { + public OfflineSorter(Comparator comparator, BufferSize ramBufferSize, File tempDirectory, int maxTempfiles) { if (ramBufferSize.bytes < ABSOLUTE_MIN_SORT_BUFFER_SIZE) { throw new IllegalArgumentException(MIN_BUFFER_SIZE_MSG + ": " + ramBufferSize.bytes); } diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/TestBytesRefArray.java b/lucene/core/src/test/org/apache/lucene/util/TestBytesRefArray.java similarity index 98% rename from lucene/suggest/src/test/org/apache/lucene/search/suggest/TestBytesRefArray.java rename to lucene/core/src/test/org/apache/lucene/util/TestBytesRefArray.java index 935b71bc529..9fcd6a1b6df 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/TestBytesRefArray.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestBytesRefArray.java @@ -1,4 +1,4 @@ -package org.apache.lucene.search.suggest; +package org.apache.lucene.util; /* * Licensed to the Apache Software Foundation (ASF) under one or more diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java b/lucene/core/src/test/org/apache/lucene/util/TestOfflineSorter.java similarity index 72% rename from lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java rename to lucene/core/src/test/org/apache/lucene/util/TestOfflineSorter.java index 540fadedf11..b7f14d02ffb 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/TestSort.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestOfflineSorter.java @@ -1,4 +1,4 @@ -package org.apache.lucene.search.suggest.fst; +package org.apache.lucene.util; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,70 +17,72 @@ package org.apache.lucene.search.suggest.fst; * limitations under the License. 
*/ -import java.io.*; +import java.io.DataInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; -import org.apache.lucene.search.suggest.Sort; -import org.apache.lucene.search.suggest.Sort.BufferSize; -import org.apache.lucene.search.suggest.Sort.ByteSequencesWriter; -import org.apache.lucene.search.suggest.Sort.SortInfo; -import org.apache.lucene.util.*; -import org.junit.*; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.OfflineSorter; +import org.apache.lucene.util.OfflineSorter.BufferSize; +import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter; +import org.apache.lucene.util.OfflineSorter.SortInfo; +import org.apache.lucene.util.TestUtil; /** * Tests for on-disk merge sorting. */ -public class TestSort extends LuceneTestCase { +public class TestOfflineSorter extends LuceneTestCase { private File tempDir; - @Before - public void prepareTempDir() throws IOException { + @Override + public void setUp() throws Exception { + super.setUp(); tempDir = TestUtil.getTempDir("mergesort"); TestUtil.rmDir(tempDir); tempDir.mkdirs(); } - @After - public void cleanup() throws IOException { + @Override + public void tearDown() throws Exception { if (tempDir != null) TestUtil.rmDir(tempDir); + super.tearDown(); } - @Test public void testEmpty() throws Exception { - checkSort(new Sort(), new byte [][] {}); + checkSort(new OfflineSorter(), new byte [][] {}); } - @Test public void testSingleLine() throws Exception { - checkSort(new Sort(), new byte [][] { + checkSort(new OfflineSorter(), new byte [][] { "Single line only.".getBytes("UTF-8") }); } - @Test public void testIntermediateMerges() throws Exception { // Sort 20 mb worth of data with 1mb buffer, binary merging. 
- SortInfo info = checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(1), Sort.defaultTempDir(), 2), - generateRandom((int)Sort.MB * 20)); + SortInfo info = checkSort(new OfflineSorter(OfflineSorter.DEFAULT_COMPARATOR, BufferSize.megabytes(1), OfflineSorter.defaultTempDir(), 2), + generateRandom((int)OfflineSorter.MB * 20)); assertTrue(info.mergeRounds > 10); } - @Test public void testSmallRandom() throws Exception { // Sort 20 mb worth of data with 1mb buffer. - SortInfo sortInfo = checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(1), Sort.defaultTempDir(), Sort.MAX_TEMPFILES), - generateRandom((int)Sort.MB * 20)); + SortInfo sortInfo = checkSort(new OfflineSorter(OfflineSorter.DEFAULT_COMPARATOR, BufferSize.megabytes(1), OfflineSorter.defaultTempDir(), OfflineSorter.MAX_TEMPFILES), + generateRandom((int)OfflineSorter.MB * 20)); assertEquals(1, sortInfo.mergeRounds); } - @Test @Nightly + @Nightly public void testLargerRandom() throws Exception { // Sort 100MB worth of data with 15mb buffer. - checkSort(new Sort(Sort.DEFAULT_COMPARATOR, BufferSize.megabytes(16), Sort.defaultTempDir(), Sort.MAX_TEMPFILES), - generateRandom((int)Sort.MB * 100)); + checkSort(new OfflineSorter(OfflineSorter.DEFAULT_COMPARATOR, BufferSize.megabytes(16), OfflineSorter.defaultTempDir(), OfflineSorter.MAX_TEMPFILES), + generateRandom((int)OfflineSorter.MB * 100)); } private byte[][] generateRandom(int howMuchData) { @@ -108,9 +110,9 @@ public class TestSort extends LuceneTestCase { } }; /** - * Check sorting data on an instance of {@link Sort}. + * Check sorting data on an instance of {@link OfflineSorter}. 
*/ - private SortInfo checkSort(Sort sort, byte[][] data) throws IOException { + private SortInfo checkSort(OfflineSorter sort, byte[][] data) throws IOException { File unsorted = writeAll("unsorted", data); Arrays.sort(data, unsignedByteOrderComparator); @@ -147,7 +149,7 @@ public class TestSort extends LuceneTestCase { private File writeAll(String name, byte[][] data) throws IOException { File file = new File(tempDir, name); - ByteSequencesWriter w = new Sort.ByteSequencesWriter(file); + ByteSequencesWriter w = new OfflineSorter.ByteSequencesWriter(file); for (byte [] datum : data) { w.write(datum); } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/BufferedInputIterator.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/BufferedInputIterator.java index b9772fafebd..96c7cf85f60 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/BufferedInputIterator.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/BufferedInputIterator.java @@ -21,6 +21,7 @@ import java.io.IOException; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefArray; import org.apache.lucene.util.Counter; /** diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/InMemorySorter.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/InMemorySorter.java index 0efc3a5fa7a..42e19a8f9b9 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/InMemorySorter.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/InMemorySorter.java @@ -21,6 +21,7 @@ import java.util.Comparator; import org.apache.lucene.search.suggest.fst.BytesRefSorter; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefArray; import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.Counter; diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedInputIterator.java 
b/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedInputIterator.java index d804f38e1b1..d7011d435d9 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedInputIterator.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/SortedInputIterator.java @@ -21,13 +21,14 @@ import java.io.File; import java.io.IOException; import java.util.Comparator; -import org.apache.lucene.search.suggest.Sort.ByteSequencesReader; -import org.apache.lucene.search.suggest.Sort.ByteSequencesWriter; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.OfflineSorter; +import org.apache.lucene.util.OfflineSorter.ByteSequencesReader; +import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter; /** * This wrapper buffers incoming elements and makes sure they are sorted based on given comparator. 
@@ -141,13 +142,13 @@ public class SortedInputIterator implements InputIterator { } }; - private Sort.ByteSequencesReader sort() throws IOException { + private ByteSequencesReader sort() throws IOException { String prefix = getClass().getSimpleName(); - File directory = Sort.defaultTempDir(); + File directory = OfflineSorter.defaultTempDir(); tempInput = File.createTempFile(prefix, ".input", directory); tempSorted = File.createTempFile(prefix, ".sorted", directory); - final Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput); + final OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput); boolean success = false; try { BytesRef spare; @@ -158,8 +159,8 @@ public class SortedInputIterator implements InputIterator { encode(writer, output, buffer, spare, source.payload(), source.weight()); } writer.close(); - new Sort(tieBreakByCostComparator).sort(tempInput, tempSorted); - ByteSequencesReader reader = new Sort.ByteSequencesReader(tempSorted); + new OfflineSorter(tieBreakByCostComparator).sort(tempInput, tempSorted); + ByteSequencesReader reader = new OfflineSorter.ByteSequencesReader(tempSorted); success = true; return reader; diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java index 6b2c1f6bbe1..5dad351a83f 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java @@ -31,7 +31,6 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStreamToAutomaton; import org.apache.lucene.search.suggest.InputIterator; import org.apache.lucene.search.suggest.Lookup; -import org.apache.lucene.search.suggest.Sort; import org.apache.lucene.store.ByteArrayDataInput; import 
org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.store.DataInput; @@ -56,6 +55,7 @@ import org.apache.lucene.util.fst.PairOutputs; import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.Util.MinResult; import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.OfflineSorter; /** * Suggester that first analyzes the surface form, adds the @@ -380,14 +380,14 @@ public class AnalyzingSuggester extends Lookup { @Override public void build(InputIterator iterator) throws IOException { String prefix = getClass().getSimpleName(); - File directory = Sort.defaultTempDir(); + File directory = OfflineSorter.defaultTempDir(); File tempInput = File.createTempFile(prefix, ".input", directory); File tempSorted = File.createTempFile(prefix, ".sorted", directory); hasPayloads = iterator.hasPayloads(); - Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput); - Sort.ByteSequencesReader reader = null; + OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput); + OfflineSorter.ByteSequencesReader reader = null; BytesRef scratch = new BytesRef(); TokenStreamToAutomaton ts2a = getTokenStreamToAutomaton(); @@ -463,12 +463,12 @@ public class AnalyzingSuggester extends Lookup { writer.close(); // Sort all input/output pairs (required by FST.Builder): - new Sort(new AnalyzingComparator(hasPayloads)).sort(tempInput, tempSorted); + new OfflineSorter(new AnalyzingComparator(hasPayloads)).sort(tempInput, tempSorted); // Free disk space: tempInput.delete(); - reader = new Sort.ByteSequencesReader(tempSorted); + reader = new OfflineSorter.ByteSequencesReader(tempSorted); PairOutputs outputs = new PairOutputs(PositiveIntOutputs.getSingleton(), ByteSequenceOutputs.getSingleton()); Builder> builder = new Builder>(FST.INPUT_TYPE.BYTE1, outputs); diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java 
b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java index db332474c9a..f425235e272 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/FreeTextSuggester.java @@ -20,6 +20,7 @@ package org.apache.lucene.search.suggest.analyzing; // TODO // - test w/ syns // - add pruning of low-freq ngrams? + import java.io.File; import java.io.IOException; //import java.io.PrintWriter; @@ -54,7 +55,6 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.suggest.InputIterator; import org.apache.lucene.search.suggest.Lookup; -import org.apache.lucene.search.suggest.Sort; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; @@ -74,6 +74,7 @@ import org.apache.lucene.util.fst.Outputs; import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.Util.MinResult; import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.OfflineSorter; /** * Builds an ngram model from the text sent to {@link @@ -287,7 +288,7 @@ public class FreeTextSuggester extends Lookup { } String prefix = getClass().getSimpleName(); - File directory = Sort.defaultTempDir(); + File directory = OfflineSorter.defaultTempDir(); // TODO: messy ... java7 has Files.createTempDirectory // ... 
but 4.x is java6: File tempIndexPath = null; diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java index 0a06b861e83..8ceb937e74d 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/ExternalRefSorter.java @@ -17,14 +17,15 @@ package org.apache.lucene.search.suggest.fst; * limitations under the License. */ -import java.io.*; +import java.io.Closeable; +import java.io.File; +import java.io.IOException; import java.util.Comparator; -import org.apache.lucene.search.suggest.Sort; -import org.apache.lucene.search.suggest.Sort.ByteSequencesReader; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.OfflineSorter; /** * Builds and iterates over sequences stored on disk. @@ -32,19 +33,19 @@ import org.apache.lucene.util.IOUtils; * @lucene.internal */ public class ExternalRefSorter implements BytesRefSorter, Closeable { - private final Sort sort; - private Sort.ByteSequencesWriter writer; + private final OfflineSorter sort; + private OfflineSorter.ByteSequencesWriter writer; private File input; private File sorted; /** * Will buffer all sequences to a temporary file and then sort (all on-disk). 
*/ - public ExternalRefSorter(Sort sort) throws IOException { + public ExternalRefSorter(OfflineSorter sort) throws IOException { this.sort = sort; this.input = File.createTempFile("RefSorter-", ".raw", - Sort.defaultTempDir()); - this.writer = new Sort.ByteSequencesWriter(input); + OfflineSorter.defaultTempDir()); + this.writer = new OfflineSorter.ByteSequencesWriter(input); } @Override @@ -59,14 +60,14 @@ public class ExternalRefSorter implements BytesRefSorter, Closeable { closeWriter(); sorted = File.createTempFile("RefSorter-", ".sorted", - Sort.defaultTempDir()); + OfflineSorter.defaultTempDir()); sort.sort(input, sorted); input.delete(); input = null; } - return new ByteSequenceIterator(new Sort.ByteSequencesReader(sorted)); + return new ByteSequenceIterator(new OfflineSorter.ByteSequencesReader(sorted)); } private void closeWriter() throws IOException { @@ -93,10 +94,10 @@ public class ExternalRefSorter implements BytesRefSorter, Closeable { * Iterate over byte refs in a file. */ class ByteSequenceIterator implements BytesRefIterator { - private final ByteSequencesReader reader; + private final OfflineSorter.ByteSequencesReader reader; private BytesRef scratch = new BytesRef(); - public ByteSequenceIterator(ByteSequencesReader reader) { + public ByteSequenceIterator(OfflineSorter.ByteSequencesReader reader) { this.reader = reader; } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java index 38132cad444..704c1fb9c5b 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/FSTCompletionLookup.java @@ -19,26 +19,27 @@ package org.apache.lucene.search.suggest.fst; import java.io.File; import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; import java.util.ArrayList; import java.util.List; 
import org.apache.lucene.search.suggest.InputIterator; import org.apache.lucene.search.suggest.Lookup; -import org.apache.lucene.search.suggest.Sort.SortInfo; -import org.apache.lucene.search.suggest.Sort; import org.apache.lucene.search.suggest.fst.FSTCompletion.Completion; import org.apache.lucene.search.suggest.tst.TSTLookup; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; -import org.apache.lucene.store.InputStreamDataInput; -import org.apache.lucene.store.OutputStreamDataOutput; -import org.apache.lucene.util.*; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.NoOutputs; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.OfflineSorter; +import org.apache.lucene.util.OfflineSorter.SortInfo; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.UnicodeUtil; /** * An adapter from {@link Lookup} API to {@link FSTCompletion}. 
@@ -150,12 +151,12 @@ public class FSTCompletionLookup extends Lookup { throw new IllegalArgumentException("this suggester doesn't support payloads"); } File tempInput = File.createTempFile( - FSTCompletionLookup.class.getSimpleName(), ".input", Sort.defaultTempDir()); + FSTCompletionLookup.class.getSimpleName(), ".input", OfflineSorter.defaultTempDir()); File tempSorted = File.createTempFile( - FSTCompletionLookup.class.getSimpleName(), ".sorted", Sort.defaultTempDir()); + FSTCompletionLookup.class.getSimpleName(), ".sorted", OfflineSorter.defaultTempDir()); - Sort.ByteSequencesWriter writer = new Sort.ByteSequencesWriter(tempInput); - Sort.ByteSequencesReader reader = null; + OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput); + OfflineSorter.ByteSequencesReader reader = null; ExternalRefSorter sorter = null; // Push floats up front before sequences to sort them. For now, assume they are non-negative. @@ -180,13 +181,13 @@ public class FSTCompletionLookup extends Lookup { // We don't know the distribution of scores and we need to bucket them, so we'll sort // and divide into equal buckets. 
- SortInfo info = new Sort().sort(tempInput, tempSorted); + SortInfo info = new OfflineSorter().sort(tempInput, tempSorted); tempInput.delete(); FSTCompletionBuilder builder = new FSTCompletionBuilder( - buckets, sorter = new ExternalRefSorter(new Sort()), sharedTailLength); + buckets, sorter = new ExternalRefSorter(new OfflineSorter()), sharedTailLength); final int inputLines = info.lines; - reader = new Sort.ByteSequencesReader(tempSorted); + reader = new OfflineSorter.ByteSequencesReader(tempSorted); long line = 0; int previousBucket = 0; int previousScore = 0; diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java index ab668413831..d654f182e48 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/fst/WFSTCompletionLookup.java @@ -25,7 +25,6 @@ import java.util.List; import org.apache.lucene.search.suggest.InputIterator; import org.apache.lucene.search.suggest.Lookup; -import org.apache.lucene.search.suggest.Sort.ByteSequencesWriter; import org.apache.lucene.search.suggest.SortedInputIterator; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ByteArrayDataOutput; @@ -43,6 +42,7 @@ import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.Util.MinResult; import org.apache.lucene.util.fst.Util; +import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter; /** * Suggester based on a weighted FST: it first traverses the prefix, diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java index f8ccd35b55c..82775898475 100644 --- 
a/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/BytesRefSortersTest.java @@ -18,16 +18,16 @@ package org.apache.lucene.search.suggest.fst; */ import org.apache.lucene.search.suggest.InMemorySorter; -import org.apache.lucene.search.suggest.Sort; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefIterator; import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.OfflineSorter; import org.junit.Test; public class BytesRefSortersTest extends LuceneTestCase { @Test public void testExternalRefSorter() throws Exception { - ExternalRefSorter s = new ExternalRefSorter(new Sort()); + ExternalRefSorter s = new ExternalRefSorter(new OfflineSorter()); check(s); s.close(); } diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/LargeInputFST.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/LargeInputFST.java index 48a1409d9f9..0cb6c668d02 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/LargeInputFST.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/fst/LargeInputFST.java @@ -17,10 +17,15 @@ package org.apache.lucene.search.suggest.fst; * limitations under the License. */ -import java.io.*; -import org.apache.lucene.search.suggest.Sort; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; + import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.OfflineSorter; /** * Try to build a suggester from a large data set. 
The input is a simple text @@ -33,7 +38,7 @@ public class LargeInputFST { int buckets = 20; int shareMaxTail = 10; - ExternalRefSorter sorter = new ExternalRefSorter(new Sort()); + ExternalRefSorter sorter = new ExternalRefSorter(new OfflineSorter()); FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter, shareMaxTail); BufferedReader reader = new BufferedReader( From 803226ece41e9b10eafcc11664f943b9ae3db8b7 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 24 Feb 2014 17:11:07 +0000 Subject: [PATCH 06/17] LUCENE-5468: sort dictionary data with offline sorter git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571356 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/hunspell2/Dictionary.java | 72 ++++++++++++------- 1 file changed, 48 insertions(+), 24 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java index c8068aa41b9..10baa403413 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java @@ -20,7 +20,11 @@ package org.apache.lucene.analysis.hunspell2; import org.apache.lucene.analysis.util.CharArrayMap; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefHash; +import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.OfflineSorter; +import org.apache.lucene.util.OfflineSorter.ByteSequencesReader; +import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.Version; import org.apache.lucene.util.fst.Builder; @@ -75,6 +79,8 @@ public class Dictionary { private String[] aliases; private int aliasCount = 0; + + private final File tempDir = OfflineSorter.defaultTempDir(); // TODO: make 
this configurable? /** * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix @@ -364,38 +370,53 @@ public class Dictionary { BytesRef flagsScratch = new BytesRef(); IntsRef scratchInts = new IntsRef(); - BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder)); - // TODO: don't create millions of strings. - String line = reader.readLine(); // first line is number of entries (approximately, sometimes) - // sometimes the number of entries has a comment/copyright after it - line = line.replaceFirst("\\s*\\#.*$", ""); - int numEntries = Integer.parseInt(line); + BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder)); + String line = lines.readLine(); // first line is number of entries (approximately, sometimes) - String lines[] = new String[numEntries]; - int upto = 0; - while ((line = reader.readLine()) != null) { - if (upto == lines.length) { - lines = Arrays.copyOf(lines, (int)(lines.length * 1.25)); + File unsorted = File.createTempFile("unsorted", "dat", tempDir); + try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) { + while ((line = lines.readLine()) != null) { + writer.write(line.getBytes(IOUtils.CHARSET_UTF_8)); } - lines[upto++] = line; } + File sorted = File.createTempFile("sorted", "dat", tempDir); - // TODO: just replace this with offline sorter? 
- Arrays.sort(lines, 0, upto, new Comparator() { + OfflineSorter sorter = new OfflineSorter(new Comparator() { + BytesRef scratch1 = new BytesRef(); + BytesRef scratch2 = new BytesRef(); + @Override - public int compare(String o1, String o2) { - int sep1 = o1.lastIndexOf('/'); - if (sep1 >= 0) { - o1 = o1.substring(0, sep1); + public int compare(BytesRef o1, BytesRef o2) { + scratch1.bytes = o1.bytes; + scratch1.offset = o1.offset; + scratch1.length = o1.length; + + for (int i = scratch1.length - 1; i >= 0; i--) { + if (scratch1.bytes[scratch1.offset + i] == '/') { + scratch1.length = i; + break; + } } - int sep2 = o2.lastIndexOf('/'); - if (sep2 >= 0) { - o2 = o2.substring(0, sep2); + scratch2.bytes = o2.bytes; + scratch2.offset = o2.offset; + scratch2.length = o2.length; + + for (int i = scratch2.length - 1; i >= 0; i--) { + if (scratch2.bytes[scratch2.offset + i] == '/') { + scratch2.length = i; + break; + } } - return o1.compareTo(o2); + + return scratch1.compareTo(scratch2); } }); + sorter.sort(unsorted, sorted); + unsorted.delete(); + + ByteSequencesReader reader = new ByteSequencesReader(sorted); + BytesRef scratchLine = new BytesRef(); // TODO: the flags themselves can be double-chars (long) or also numeric // either way the trick is to encode them as char... 
but they must be parsed differently @@ -403,8 +424,8 @@ public class Dictionary { BytesRef currentEntry = new BytesRef(); char currentFlags[] = new char[0]; - for (int i = 0; i < upto; i++) { - line = lines[i]; + while (reader.read(scratchLine)) { + line = scratchLine.utf8ToString(); String entry; char wordForm[]; @@ -457,6 +478,9 @@ public class Dictionary { } UnicodeUtil.UTF8toUTF32(currentEntry, scratchInts); words.add(scratchInts, (long)ord); + + reader.close(); + sorted.delete(); } static char[] decodeFlags(BytesRef b) { From 10f548d205e9443872c919f7af0ac1b01c735ed3 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 25 Feb 2014 19:18:09 +0000 Subject: [PATCH 07/17] LUCENE-5468: deduplicate patterns used by affix condition check git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571788 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/hunspell2/Affix.java | 16 ++--------- .../lucene/analysis/hunspell2/Dictionary.java | 27 ++++++++++++++++--- .../hunspell2/TestAllDictionaries.java | 10 ++++--- 3 files changed, 32 insertions(+), 21 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java index 41c3553fb77..443c006c97d 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java @@ -28,7 +28,6 @@ final class Affix { private char appendFlags[]; // continuation class flags private String strip; - private String condition; private Pattern conditionPattern; private char flag; @@ -99,24 +98,13 @@ final class Affix { this.strip = strip; } - /** - * Returns the condition that must be met before the affix can be applied - * - * @return Condition that must be met before the affix can be applied - */ - public String getCondition() { - return condition; - } - /** * Sets the condition that 
must be met before the affix can be applied * - * @param condition Condition to be met before affix application * @param pattern Condition as a regular expression pattern */ - public void setCondition(String condition, String pattern) { - this.condition = condition; - this.conditionPattern = Pattern.compile(pattern); + public void setCondition(Pattern pattern) { + this.conditionPattern = pattern; } /** diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java index 10baa403413..0456d9946d3 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java @@ -44,6 +44,7 @@ import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.regex.Pattern; /** * In-memory structure for the dictionary (.dic) and affix (.aff) @@ -68,6 +69,12 @@ public class Dictionary { public CharArrayMap> prefixes; public CharArrayMap> suffixes; + // all Patterns used by prefixes and suffixes. these are typically re-used across + // many affix stripping rules. so these are deduplicated, to save RAM. + // TODO: maybe don't use Pattern for the condition check... + // TODO: when we cut over Affix to FST, just store integer index to this. + public ArrayList patterns = new ArrayList<>(); + // the entries in the .dic file, mapping to their set of flags. 
// the fst output is the ordinal for flagLookup public FST words; @@ -184,6 +191,7 @@ public class Dictionary { private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException { prefixes = new CharArrayMap>(Version.LUCENE_CURRENT, 8, false); suffixes = new CharArrayMap>(Version.LUCENE_CURRENT, 8, false); + Map seenPatterns = new HashMap<>(); LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder)); String line = null; @@ -191,9 +199,9 @@ public class Dictionary { if (line.startsWith(ALIAS_KEY)) { parseAlias(line); } else if (line.startsWith(PREFIX_KEY)) { - parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN); + parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns); } else if (line.startsWith(SUFFIX_KEY)) { - parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN); + parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, seenPatterns); } else if (line.startsWith(FLAG_KEY)) { // Assume that the FLAG line comes before any prefix or suffixes // Store the strategy so it can be used when parsing the dic file @@ -210,12 +218,14 @@ public class Dictionary { * @param reader BufferedReader to read the content of the rule from * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex * pattern + * @param seenPatterns map from condition -> index of patterns, for deduplication. 
* @throws IOException Can be thrown while reading the rule */ private void parseAffix(CharArrayMap> affixes, String header, LineNumberReader reader, - String conditionPattern) throws IOException, ParseException { + String conditionPattern, + Map seenPatterns) throws IOException, ParseException { String args[] = header.split("\\s+"); boolean crossProduct = args[2].equals("Y"); @@ -261,7 +271,16 @@ public class Dictionary { if (condition.indexOf('-') >= 0) { condition = condition.replace("-", "\\-"); } - affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition)); + // deduplicate patterns + String regex = String.format(Locale.ROOT, conditionPattern, condition); + Integer patternIndex = seenPatterns.get(regex); + if (patternIndex == null) { + patternIndex = patterns.size(); + seenPatterns.put(regex, patternIndex); + Pattern pattern = Pattern.compile(regex); + patterns.add(pattern); + } + affix.setCondition(patterns.get(patternIndex)); affix.setCrossProduct(crossProduct); List list = affixes.get(affix.getAppend()); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java index ecb21b97a7c..9f9bce98236 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java @@ -33,12 +33,12 @@ import org.junit.Ignore; * wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/ * Note some of the files differ only in case. This may be a problem on your operating system! 
*/ -@Ignore("enable manually") +//@Ignore("enable manually") public class TestAllDictionaries extends LuceneTestCase { // set this to the location of where you downloaded all the files static final File DICTIONARY_HOME = - new File("/Users/rmuir/hunspell/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries"); + new File("/data/archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries"); final String tests[] = { /* zip file */ /* dictionary */ /* affix */ @@ -176,7 +176,11 @@ public class TestAllDictionaries extends LuceneTestCase { try (InputStream dictionary = zip.getInputStream(dicEntry); InputStream affix = zip.getInputStream(affEntry)) { Dictionary dic = new Dictionary(affix, dictionary); - System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic)); + System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" + + "words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " + + "flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " + + "prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " + + "suffixes=" + RamUsageEstimator.humanSizeOf(dic.suffixes) + ")"); } } } From 48f55644505c5bf553b2225f9559c351621194b0 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 25 Feb 2014 19:44:32 +0000 Subject: [PATCH 08/17] LUCENE-5468: remove redundant 'append' in Affix git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571802 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/hunspell2/Affix.java | 19 ------------------- .../lucene/analysis/hunspell2/Dictionary.java | 8 +++----- .../lucene/analysis/hunspell2/Stemmer.java | 5 +++-- 3 files changed, 6 insertions(+), 26 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java index 443c006c97d..47a81480d6c 100644 --- 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java @@ -24,7 +24,6 @@ import java.util.regex.Pattern; */ final class Affix { - private String append; // the affix itself, what is appended private char appendFlags[]; // continuation class flags private String strip; @@ -44,24 +43,6 @@ final class Affix { return conditionPattern.matcher(text).matches(); } - /** - * Returns the append defined for the affix - * - * @return Defined append - */ - public String getAppend() { - return append; - } - - /** - * Sets the append defined for the affix - * - * @param append Defined append for the affix - */ - public void setAppend(String append) { - this.append = append; - } - /** * Returns the flags defined for the affix append * diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java index 0456d9946d3..713bc92210c 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java @@ -257,9 +257,7 @@ public class Dictionary { char appendFlags[] = flagParsingStrategy.parseFlags(flagPart); Arrays.sort(appendFlags); affix.setAppendFlags(appendFlags); - affix.setAppend(affixArg.substring(0, flagSep)); - } else { - affix.setAppend(affixArg); + affixArg = affixArg.substring(0, flagSep); } String condition = ruleArgs[4]; @@ -283,10 +281,10 @@ public class Dictionary { affix.setCondition(patterns.get(patternIndex)); affix.setCrossProduct(crossProduct); - List list = affixes.get(affix.getAppend()); + List list = affixes.get(affixArg); if (list == null) { list = new ArrayList(); - affixes.put(affix.getAppend(), list); + affixes.put(affixArg, list); } list.add(affix); diff --git 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java index aa00836d6fe..62096ef96cf 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java @@ -126,7 +126,8 @@ final class Stemmer { for (Affix suffix : suffixes) { if (hasCrossCheckedFlag(suffix.getFlag(), flags)) { - int deAffixedLength = length - suffix.getAppend().length(); + int appendLength = length - i; + int deAffixedLength = length - appendLength; // TODO: can we do this in-place? String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString(); @@ -148,7 +149,7 @@ final class Stemmer { for (Affix prefix : prefixes) { if (hasCrossCheckedFlag(prefix.getFlag(), flags)) { - int deAffixedStart = prefix.getAppend().length(); + int deAffixedStart = i; int deAffixedLength = length - deAffixedStart; String strippedWord = new StringBuilder().append(prefix.getStrip()) From caaa01d2207b47e0f917760f09b9288710f1615c Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 25 Feb 2014 20:07:05 +0000 Subject: [PATCH 09/17] LUCENE-5468: Stem -> CharsRef git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571807 13f79535-47bb-0310-9956-ffa450edef68 --- .../hunspell2/Hunspell2StemFilter.java | 13 ++- .../lucene/analysis/hunspell2/Stem.java | 98 ------------------- .../lucene/analysis/hunspell2/Stemmer.java | 43 ++++---- .../analysis/hunspell2/TestStemmer.java | 5 +- 4 files changed, 28 insertions(+), 131 deletions(-) delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stem.java diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java index 
45941345342..00ff88469be 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java @@ -25,6 +25,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.util.CharsRef; /** * TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple @@ -49,7 +50,7 @@ public final class Hunspell2StemFilter extends TokenFilter { private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class); private final Stemmer stemmer; - private List buffer; + private List buffer; private State savedState; private final boolean dedup; @@ -97,11 +98,10 @@ public final class Hunspell2StemFilter extends TokenFilter { @Override public boolean incrementToken() throws IOException { if (buffer != null && !buffer.isEmpty()) { - Stem nextStem = buffer.remove(0); + CharsRef nextStem = buffer.remove(0); restoreState(savedState); posIncAtt.setPositionIncrement(0); - termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength()); - termAtt.setLength(nextStem.getStemLength()); + termAtt.setEmpty().append(nextStem); return true; } @@ -119,9 +119,8 @@ public final class Hunspell2StemFilter extends TokenFilter { return true; } - Stem stem = buffer.remove(0); - termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength()); - termAtt.setLength(stem.getStemLength()); + CharsRef stem = buffer.remove(0); + termAtt.setEmpty().append(stem); if (!buffer.isEmpty()) { savedState = captureState(); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stem.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stem.java 
deleted file mode 100644 index d3c8d4c86ab..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stem.java +++ /dev/null @@ -1,98 +0,0 @@ -package org.apache.lucene.analysis.hunspell2; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.ArrayList; -import java.util.List; - -/** - * Stem represents all information known about a stem of a word. This includes the stem, and the prefixes and suffixes - * that were used to change the word into the stem. - */ -final class Stem { - final List prefixes = new ArrayList(); - final List suffixes = new ArrayList(); - final char stem[]; - final int stemLength; - - /** - * Creates a new Stem wrapping the given word stem - * - * @param stem Stem of a word - */ - public Stem(char stem[], int stemLength) { - this.stem = stem; - this.stemLength = stemLength; - } - - /** - * Adds a prefix to the list of prefixes used to generate this stem. 
Because it is assumed that prefixes are added - * depth first, the prefix is added to the front of the list - * - * @param prefix Prefix to add to the list of prefixes for this stem - */ - public void addPrefix(Affix prefix) { - prefixes.add(0, prefix); - } - - /** - * Adds a suffix to the list of suffixes used to generate this stem. Because it is assumed that suffixes are added - * depth first, the suffix is added to the end of the list - * - * @param suffix Suffix to add to the list of suffixes for this stem - */ - public void addSuffix(Affix suffix) { - suffixes.add(suffix); - } - - /** - * Returns the list of prefixes used to generate the stem - * - * @return List of prefixes used to generate the stem or an empty list if no prefixes were required - */ - public List getPrefixes() { - return prefixes; - } - - /** - * Returns the list of suffixes used to generate the stem - * - * @return List of suffixes used to generate the stem or an empty list if no suffixes were required - */ - public List getSuffixes() { - return suffixes; - } - - /** - * Returns the text of the word's stem. - * @see #getStemLength() - */ - public char[] getStem() { - return stem; - } - - /** Returns the valid length of the text in {@link #getStem()} */ - public int getStemLength() { - return stemLength; - } - - /** Only use this if you really need a string (e.g. 
for testing) */ - public String getStemString() { - return new String(stem, 0, stemLength); - } -} \ No newline at end of file diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java index 62096ef96cf..7919ad56be7 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java @@ -24,6 +24,7 @@ import java.util.List; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.Version; /** @@ -63,7 +64,7 @@ final class Stemmer { * @param word Word to find the stems for * @return List of stems for the word */ - public List stem(String word) { + public List stem(String word) { return stem(word.toCharArray(), word.length()); } @@ -73,10 +74,10 @@ final class Stemmer { * @param word Word to find the stems for * @return List of stems for the word */ - public List stem(char word[], int length) { - List stems = new ArrayList(); + public List stem(char word[], int length) { + List stems = new ArrayList(); if (dictionary.lookupWord(word, 0, length, scratch) != null) { - stems.add(new Stem(word, length)); + stems.add(new CharsRef(word, 0, length)); } stems.addAll(stem(word, length, null, 0)); return stems; @@ -88,18 +89,18 @@ final class Stemmer { * @param word Word to find the stems for * @return List of stems for the word */ - public List uniqueStems(char word[], int length) { - List stems = new ArrayList(); + public List uniqueStems(char word[], int length) { + List stems = new ArrayList(); CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, false); if (dictionary.lookupWord(word, 0, length, scratch) != null) { - stems.add(new Stem(word, length)); + stems.add(new CharsRef(word, 0, length)); terms.add(word); } - List 
otherStems = stem(word, length, null, 0); - for (Stem s : otherStems) { - if (!terms.contains(s.stem)) { + List otherStems = stem(word, length, null, 0); + for (CharsRef s : otherStems) { + if (!terms.contains(s)) { stems.add(s); - terms.add(s.stem); + terms.add(s); } } return stems; @@ -115,8 +116,8 @@ final class Stemmer { * @param recursionDepth Level of recursion this stemming step is at * @return List of stems, or empty list if no stems are found */ - private List stem(char word[], int length, char[] flags, int recursionDepth) { - List stems = new ArrayList(); + private List stem(char word[], int length, char[] flags, int recursionDepth) { + List stems = new ArrayList(); for (int i = 0; i < length; i++) { List suffixes = dictionary.lookupSuffix(word, i, length - i); @@ -131,10 +132,7 @@ final class Stemmer { // TODO: can we do this in-place? String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString(); - List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth); - for (Stem stem : stemList) { - stem.addSuffix(suffix); - } + List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth); stems.addAll(stemList); } @@ -156,10 +154,7 @@ final class Stemmer { .append(word, deAffixedStart, deAffixedLength) .toString(); - List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth); - for (Stem stem : stemList) { - stem.addPrefix(prefix); - } + List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth); stems.addAll(stemList); } @@ -177,18 +172,18 @@ final class Stemmer { * @param recursionDepth Level of recursion this stemming step is at * @return List of stems for the word, or an empty list if none are found */ - public List applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) { + public List applyAffix(char strippedWord[], int 
length, Affix affix, int recursionDepth) { segment.setLength(0); segment.append(strippedWord, 0, length); if (!affix.checkCondition(segment)) { return Collections.emptyList(); } - List stems = new ArrayList(); + List stems = new ArrayList(); char wordFlags[] = dictionary.lookupWord(strippedWord, 0, length, scratch); if (wordFlags != null && Dictionary.hasFlag(wordFlags, affix.getFlag())) { - stems.add(new Stem(strippedWord, length)); + stems.add(new CharsRef(strippedWord, 0, length)); } if (affix.isCrossProduct() && recursionDepth < recursionCap) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java index a8ac2a83fa9..4dec107f314 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java @@ -17,6 +17,7 @@ package org.apache.lucene.analysis.hunspell2; * limitations under the License. */ +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.LuceneTestCase; import org.junit.AfterClass; import org.junit.BeforeClass; @@ -92,10 +93,10 @@ public class TestStemmer extends LuceneTestCase { private void assertStemsTo(String s, String... 
expected) { Arrays.sort(expected); - List stems = stemmer.stem(s); + List stems = stemmer.stem(s); String actual[] = new String[stems.size()]; for (int i = 0; i < actual.length; i++) { - actual[i] = stems.get(i).getStemString(); + actual[i] = stems.get(i).toString(); } Arrays.sort(actual); From d7cc408585363c1b2ea5d7c725515829cc4f5ff7 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 25 Feb 2014 22:29:27 +0000 Subject: [PATCH 10/17] LUCENE-5468: make Affix fixed-width git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1571844 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/hunspell2/Affix.java | 41 ++++++---------- .../lucene/analysis/hunspell2/Dictionary.java | 47 +++++++++++++++---- .../lucene/analysis/hunspell2/Stemmer.java | 24 +++++++--- 3 files changed, 68 insertions(+), 44 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java index 47a81480d6c..eb67f60e763 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java @@ -17,38 +17,23 @@ package org.apache.lucene.analysis.hunspell2; * limitations under the License. 
*/ -import java.util.regex.Pattern; - /** * Wrapper class representing a hunspell affix */ final class Affix { - private char appendFlags[]; // continuation class flags - private String strip; - - private Pattern conditionPattern; - - private char flag; - + private int appendFlags; // continuation class flags + private int condition; // check condition private boolean crossProduct; - - /** - * Checks whether the given text matches the conditional pattern on this affix - * - * @param text Text to check if it matches the affix's conditional pattern - * @return {@code true} if the text meets the condition, {@code false} otherwise - */ - public boolean checkCondition(CharSequence text) { - return conditionPattern.matcher(text).matches(); - } + private char flag; + private int strip; /** * Returns the flags defined for the affix append * * @return Flags defined for the affix append */ - public char[] getAppendFlags() { + public int getAppendFlags() { return appendFlags; } @@ -57,7 +42,7 @@ final class Affix { * * @param appendFlags Flags defined for the affix append */ - public void setAppendFlags(char[] appendFlags) { + public void setAppendFlags(int appendFlags) { this.appendFlags = appendFlags; } @@ -66,7 +51,7 @@ final class Affix { * * @return Stripping characters defined for the affix */ - public String getStrip() { + public int getStrip() { return strip; } @@ -75,17 +60,19 @@ final class Affix { * * @param strip Stripping characters defined for the affix */ - public void setStrip(String strip) { + public void setStrip(int strip) { this.strip = strip; } /** * Sets the condition that must be met before the affix can be applied - * - * @param pattern Condition as a regular expression pattern */ - public void setCondition(Pattern pattern) { - this.conditionPattern = pattern; + public void setCondition(int condition) { + this.condition = condition; + } + + public int getCondition() { + return condition; } /** diff --git 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java index 713bc92210c..35c7aee6081 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java @@ -81,6 +81,9 @@ public class Dictionary { // the list of unique flagsets (wordforms). theoretically huge, but practically // small (e.g. for polish this is 756), otherwise humans wouldn't be able to deal with it either. public BytesRefHash flagLookup = new BytesRefHash(); + + // the list of unique strip affixes. + public BytesRefHash stripLookup = new BytesRefHash(); private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy @@ -107,6 +110,7 @@ public class Dictionary { CharsetDecoder decoder = getJavaEncoding(encoding); readAffixFile(buffered, decoder); flagLookup.add(new BytesRef()); // no flags -> ord 0 + stripLookup.add(new BytesRef()); // no strip -> ord 0 PositiveIntOutputs o = PositiveIntOutputs.getSingleton(); Builder b = new Builder(FST.INPUT_TYPE.BYTE4, o); readDictionaryFile(dictionary, decoder, b); @@ -226,6 +230,8 @@ public class Dictionary { LineNumberReader reader, String conditionPattern, Map seenPatterns) throws IOException, ParseException { + + BytesRef scratch = new BytesRef(); String args[] = header.split("\\s+"); boolean crossProduct = args[2].equals("Y"); @@ -239,25 +245,23 @@ public class Dictionary { throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber()); } - Affix affix = new Affix(); - affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1])); - affix.setStrip(ruleArgs[2].equals("0") ? "" : ruleArgs[2]); - + char flag = flagParsingStrategy.parseFlag(ruleArgs[1]); + String strip = ruleArgs[2].equals("0") ? 
"" : ruleArgs[2]; String affixArg = ruleArgs[3]; + char appendFlags[] = null; int flagSep = affixArg.lastIndexOf('/'); if (flagSep != -1) { String flagPart = affixArg.substring(flagSep + 1); - + affixArg = affixArg.substring(0, flagSep); + if (aliasCount > 0) { flagPart = getAliasValue(Integer.parseInt(flagPart)); } - char appendFlags[] = flagParsingStrategy.parseFlags(flagPart); + appendFlags = flagParsingStrategy.parseFlags(flagPart); Arrays.sort(appendFlags); - affix.setAppendFlags(appendFlags); - affixArg = affixArg.substring(0, flagSep); } String condition = ruleArgs[4]; @@ -269,8 +273,10 @@ public class Dictionary { if (condition.indexOf('-') >= 0) { condition = condition.replace("-", "\\-"); } - // deduplicate patterns + String regex = String.format(Locale.ROOT, conditionPattern, condition); + + // deduplicate patterns Integer patternIndex = seenPatterns.get(regex); if (patternIndex == null) { patternIndex = patterns.size(); @@ -278,8 +284,29 @@ public class Dictionary { Pattern pattern = Pattern.compile(regex); patterns.add(pattern); } - affix.setCondition(patterns.get(patternIndex)); + + Affix affix = new Affix(); + scratch.copyChars(strip); + int ord = stripLookup.add(scratch); + if (ord < 0) { + // already exists in our hash + ord = (-ord)-1; + } + affix.setStrip(ord); + affix.setFlag(flag); + affix.setCondition(patternIndex); affix.setCrossProduct(crossProduct); + if (appendFlags == null) { + appendFlags = NOFLAGS; + } + + final int hashCode = encodeFlagsWithHash(scratch, appendFlags); + ord = flagLookup.add(scratch, hashCode); + if (ord < 0) { + // already exists in our hash + ord = (-ord)-1; + } + affix.setAppendFlags(ord); List list = affixes.get(affixArg); if (list == null) { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java index 7919ad56be7..b2057c501b2 100644 --- 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java @@ -21,6 +21,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; +import java.util.regex.Pattern; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.util.BytesRef; @@ -79,7 +80,7 @@ final class Stemmer { if (dictionary.lookupWord(word, 0, length, scratch) != null) { stems.add(new CharsRef(word, 0, length)); } - stems.addAll(stem(word, length, null, 0)); + stems.addAll(stem(word, length, Dictionary.NOFLAGS, 0)); return stems; } @@ -96,7 +97,7 @@ final class Stemmer { stems.add(new CharsRef(word, 0, length)); terms.add(word); } - List otherStems = stem(word, length, null, 0); + List otherStems = stem(word, length, Dictionary.NOFLAGS, 0); for (CharsRef s : otherStems) { if (!terms.contains(s)) { stems.add(s); @@ -117,7 +118,9 @@ final class Stemmer { * @return List of stems, or empty list if no stems are found */ private List stem(char word[], int length, char[] flags, int recursionDepth) { + // TODO: allow this stuff to be reused by tokenfilter List stems = new ArrayList(); + BytesRef scratch = new BytesRef(); for (int i = 0; i < length; i++) { List suffixes = dictionary.lookupSuffix(word, i, length - i); @@ -130,7 +133,8 @@ final class Stemmer { int appendLength = length - i; int deAffixedLength = length - appendLength; // TODO: can we do this in-place? 
- String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString(); + dictionary.stripLookup.get(suffix.getStrip(), scratch); + String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(scratch.utf8ToString()).toString(); List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth); @@ -150,7 +154,8 @@ final class Stemmer { int deAffixedStart = i; int deAffixedLength = length - deAffixedStart; - String strippedWord = new StringBuilder().append(prefix.getStrip()) + dictionary.stripLookup.get(prefix.getStrip(), scratch); + String strippedWord = new StringBuilder().append(scratch.utf8ToString()) .append(word, deAffixedStart, deAffixedLength) .toString(); @@ -175,7 +180,9 @@ final class Stemmer { public List applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) { segment.setLength(0); segment.append(strippedWord, 0, length); - if (!affix.checkCondition(segment)) { + + Pattern pattern = dictionary.patterns.get(affix.getCondition()); + if (!pattern.matcher(segment).matches()) { return Collections.emptyList(); } @@ -187,7 +194,10 @@ final class Stemmer { } if (affix.isCrossProduct() && recursionDepth < recursionCap) { - stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), ++recursionDepth)); + BytesRef scratch = new BytesRef(); + dictionary.flagLookup.get(affix.getAppendFlags(), scratch); + char appendFlags[] = Dictionary.decodeFlags(scratch); + stems.addAll(stem(strippedWord, length, appendFlags, ++recursionDepth)); } return stems; @@ -201,6 +211,6 @@ final class Stemmer { * @return {@code true} if the flag is found in the array or the array is {@code null}, {@code false} otherwise */ private boolean hasCrossCheckedFlag(char flag, char[] flags) { - return flags == null || Arrays.binarySearch(flags, flag) >= 0; + return flags.length == 0 || Arrays.binarySearch(flags, flag) >= 0; } } From 9896e610d36a68a9f331132b889bd326ae7d4163 
Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 27 Feb 2014 16:19:21 +0000 Subject: [PATCH 11/17] LUCENE-5468: don't create unnecessary objects git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572643 13f79535-47bb-0310-9956-ffa450edef68 --- .../src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java index b2057c501b2..54dce381b1a 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java @@ -120,7 +120,6 @@ final class Stemmer { private List stem(char word[], int length, char[] flags, int recursionDepth) { // TODO: allow this stuff to be reused by tokenfilter List stems = new ArrayList(); - BytesRef scratch = new BytesRef(); for (int i = 0; i < length; i++) { List suffixes = dictionary.lookupSuffix(word, i, length - i); @@ -194,7 +193,6 @@ final class Stemmer { } if (affix.isCrossProduct() && recursionDepth < recursionCap) { - BytesRef scratch = new BytesRef(); dictionary.flagLookup.get(affix.getAppendFlags(), scratch); char appendFlags[] = Dictionary.decodeFlags(scratch); stems.addAll(stem(strippedWord, length, appendFlags, ++recursionDepth)); From cdec14902bc86e7826c3194199dceaa40991c153 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 27 Feb 2014 17:19:15 +0000 Subject: [PATCH 12/17] LUCENE-5468: encode affix data as 8 bytes per affix, before cutting over to FST git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572660 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/hunspell2/Affix.java | 113 ------------------ .../lucene/analysis/hunspell2/Dictionary.java | 64 ++++++---- .../lucene/analysis/hunspell2/Stemmer.java | 43 +++++-- 3 files changed, 71 
insertions(+), 149 deletions(-) delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java deleted file mode 100644 index eb67f60e763..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Affix.java +++ /dev/null @@ -1,113 +0,0 @@ -package org.apache.lucene.analysis.hunspell2; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * Wrapper class representing a hunspell affix - */ -final class Affix { - - private int appendFlags; // continuation class flags - private int condition; // check condition - private boolean crossProduct; - private char flag; - private int strip; - - /** - * Returns the flags defined for the affix append - * - * @return Flags defined for the affix append - */ - public int getAppendFlags() { - return appendFlags; - } - - /** - * Sets the flags defined for the affix append - * - * @param appendFlags Flags defined for the affix append - */ - public void setAppendFlags(int appendFlags) { - this.appendFlags = appendFlags; - } - - /** - * Returns the stripping characters defined for the affix - * - * @return Stripping characters defined for the affix - */ - public int getStrip() { - return strip; - } - - /** - * Sets the stripping characters defined for the affix - * - * @param strip Stripping characters defined for the affix - */ - public void setStrip(int strip) { - this.strip = strip; - } - - /** - * Sets the condition that must be met before the affix can be applied - */ - public void setCondition(int condition) { - this.condition = condition; - } - - public int getCondition() { - return condition; - } - - /** - * Returns the affix flag - * - * @return Affix flag - */ - public char getFlag() { - return flag; - } - - /** - * Sets the affix flag - * - * @param flag Affix flag - */ - public void setFlag(char flag) { - this.flag = flag; - } - - /** - * Returns whether the affix is defined as cross product - * - * @return {@code true} if the affix is cross product, {@code false} otherwise - */ - public boolean isCrossProduct() { - return crossProduct; - } - - /** - * Sets whether the affix is defined as cross product - * - * @param crossProduct Whether the affix is defined as cross product - */ - public void setCrossProduct(boolean crossProduct) { - this.crossProduct = crossProduct; - } -} diff --git 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java index 35c7aee6081..b30bdaa1e92 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java @@ -18,6 +18,8 @@ package org.apache.lucene.analysis.hunspell2; */ import org.apache.lucene.analysis.util.CharArrayMap; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefHash; import org.apache.lucene.util.IOUtils; @@ -66,8 +68,8 @@ public class Dictionary { private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*"; private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s"; - public CharArrayMap> prefixes; - public CharArrayMap> suffixes; + public CharArrayMap> prefixes; + public CharArrayMap> suffixes; // all Patterns used by prefixes and suffixes. these are typically re-used across // many affix stripping rules. so these are deduplicated, to save RAM. @@ -84,6 +86,10 @@ public class Dictionary { // the list of unique strip affixes. 
public BytesRefHash stripLookup = new BytesRefHash(); + + // 8 bytes per affix + public byte[] affixData = new byte[64]; + private int currentAffix = 0; private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy @@ -169,7 +175,7 @@ public class Dictionary { * @param length Length from the offset that the String is * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found */ - public List lookupPrefix(char word[], int offset, int length) { + public List lookupPrefix(char word[], int offset, int length) { return prefixes.get(word, offset, length); } @@ -181,7 +187,7 @@ public class Dictionary { * @param length Length from the offset that the String is * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found */ - List lookupSuffix(char word[], int offset, int length) { + List lookupSuffix(char word[], int offset, int length) { return suffixes.get(word, offset, length); } @@ -193,8 +199,8 @@ public class Dictionary { * @throws IOException Can be thrown while reading from the InputStream */ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException { - prefixes = new CharArrayMap>(Version.LUCENE_CURRENT, 8, false); - suffixes = new CharArrayMap>(Version.LUCENE_CURRENT, 8, false); + prefixes = new CharArrayMap>(Version.LUCENE_CURRENT, 8, false); + suffixes = new CharArrayMap>(Version.LUCENE_CURRENT, 8, false); Map seenPatterns = new HashMap<>(); LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder)); @@ -225,7 +231,7 @@ public class Dictionary { * @param seenPatterns map from condition -> index of patterns, for deduplication. 
* @throws IOException Can be thrown while reading the rule */ - private void parseAffix(CharArrayMap> affixes, + private void parseAffix(CharArrayMap> affixes, String header, LineNumberReader reader, String conditionPattern, @@ -237,14 +243,20 @@ public class Dictionary { boolean crossProduct = args[2].equals("Y"); int numLines = Integer.parseInt(args[3]); + affixData = ArrayUtil.grow(affixData, (currentAffix << 3) + (numLines << 3)); + ByteArrayDataOutput affixWriter = new ByteArrayDataOutput(affixData, currentAffix << 3, numLines << 3); + for (int i = 0; i < numLines; i++) { + if (currentAffix > Short.MAX_VALUE) { + throw new UnsupportedOperationException("Too many affixes, please report this to dev@lucene.apache.org"); + } + assert affixWriter.getPosition() == currentAffix << 3; String line = reader.readLine(); String ruleArgs[] = line.split("\\s+"); if (ruleArgs.length < 5) { throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber()); } - char flag = flagParsingStrategy.parseFlag(ruleArgs[1]); String strip = ruleArgs[2].equals("0") ? 
"" : ruleArgs[2]; @@ -285,36 +297,42 @@ public class Dictionary { patterns.add(pattern); } - Affix affix = new Affix(); scratch.copyChars(strip); - int ord = stripLookup.add(scratch); - if (ord < 0) { + int stripOrd = stripLookup.add(scratch); + if (stripOrd < 0) { // already exists in our hash - ord = (-ord)-1; + stripOrd = (-stripOrd)-1; } - affix.setStrip(ord); - affix.setFlag(flag); - affix.setCondition(patternIndex); - affix.setCrossProduct(crossProduct); + if (appendFlags == null) { appendFlags = NOFLAGS; } final int hashCode = encodeFlagsWithHash(scratch, appendFlags); - ord = flagLookup.add(scratch, hashCode); - if (ord < 0) { + int appendFlagsOrd = flagLookup.add(scratch, hashCode); + if (appendFlagsOrd < 0) { // already exists in our hash - ord = (-ord)-1; + appendFlagsOrd = (-appendFlagsOrd)-1; + } else if (appendFlagsOrd > Short.MAX_VALUE) { + // this limit is probably flexible, but its a good sanity check too + throw new UnsupportedOperationException("Too many unique flags, please report this to dev@lucene.apache.org"); } - affix.setAppendFlags(ord); - List list = affixes.get(affixArg); + affixWriter.writeShort((short)flag); + affixWriter.writeShort((short)stripOrd); + // encode crossProduct into patternIndex + int patternOrd = patternIndex.intValue() << 1 | (crossProduct ? 
1 : 0); + affixWriter.writeShort((short)patternOrd); + affixWriter.writeShort((short)appendFlagsOrd); + + List list = affixes.get(affixArg); if (list == null) { - list = new ArrayList(); + list = new ArrayList(); affixes.put(affixArg, list); } - list.add(affix); + list.add((char)currentAffix); + currentAffix++; } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java index 54dce381b1a..4eaff6a9e95 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java @@ -24,6 +24,7 @@ import java.util.List; import java.util.regex.Pattern; import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.Version; @@ -37,6 +38,7 @@ final class Stemmer { private final Dictionary dictionary; private BytesRef scratch = new BytesRef(); private final StringBuilder segment = new StringBuilder(); + private final ByteArrayDataInput affixReader; /** * Constructs a new Stemmer which will use the provided Dictionary to create its stems. 
Uses the @@ -56,6 +58,7 @@ final class Stemmer { */ public Stemmer(Dictionary dictionary, int recursionCap) { this.dictionary = dictionary; + this.affixReader = new ByteArrayDataInput(dictionary.affixData); this.recursionCap = recursionCap; } @@ -122,17 +125,20 @@ final class Stemmer { List stems = new ArrayList(); for (int i = 0; i < length; i++) { - List suffixes = dictionary.lookupSuffix(word, i, length - i); + List suffixes = dictionary.lookupSuffix(word, i, length - i); if (suffixes == null) { continue; } - for (Affix suffix : suffixes) { - if (hasCrossCheckedFlag(suffix.getFlag(), flags)) { + for (Character suffix : suffixes) { + affixReader.setPosition(8 * suffix); + char flag = (char) (affixReader.readShort() & 0xffff); + if (hasCrossCheckedFlag(flag, flags)) { int appendLength = length - i; int deAffixedLength = length - appendLength; // TODO: can we do this in-place? - dictionary.stripLookup.get(suffix.getStrip(), scratch); + char stripOrd = (char) (affixReader.readShort() & 0xffff); + dictionary.stripLookup.get(stripOrd, scratch); String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(scratch.utf8ToString()).toString(); List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth); @@ -143,17 +149,20 @@ final class Stemmer { } for (int i = length - 1; i >= 0; i--) { - List prefixes = dictionary.lookupPrefix(word, 0, i); + List prefixes = dictionary.lookupPrefix(word, 0, i); if (prefixes == null) { continue; } - for (Affix prefix : prefixes) { - if (hasCrossCheckedFlag(prefix.getFlag(), flags)) { + for (Character prefix : prefixes) { + affixReader.setPosition(8 * prefix); + char flag = (char) (affixReader.readShort() & 0xffff); + if (hasCrossCheckedFlag(flag, flags)) { int deAffixedStart = i; int deAffixedLength = length - deAffixedStart; + char stripOrd = (char) (affixReader.readShort() & 0xffff); - dictionary.stripLookup.get(prefix.getStrip(), scratch); + 
dictionary.stripLookup.get(stripOrd, scratch); String strippedWord = new StringBuilder().append(scratch.utf8ToString()) .append(word, deAffixedStart, deAffixedLength) .toString(); @@ -176,11 +185,19 @@ final class Stemmer { * @param recursionDepth Level of recursion this stemming step is at * @return List of stems for the word, or an empty list if none are found */ - public List applyAffix(char strippedWord[], int length, Affix affix, int recursionDepth) { + public List applyAffix(char strippedWord[], int length, char affix, int recursionDepth) { segment.setLength(0); segment.append(strippedWord, 0, length); - Pattern pattern = dictionary.patterns.get(affix.getCondition()); + affixReader.setPosition(8 * affix); + char flag = (char) (affixReader.readShort() & 0xffff); + affixReader.skipBytes(2); // strip + int condition = (char) (affixReader.readShort() & 0xffff); + boolean crossProduct = (condition & 1) == 1; + condition >>>= 1; + char append = (char) (affixReader.readShort() & 0xffff); + + Pattern pattern = dictionary.patterns.get(condition); if (!pattern.matcher(segment).matches()) { return Collections.emptyList(); } @@ -188,12 +205,12 @@ final class Stemmer { List stems = new ArrayList(); char wordFlags[] = dictionary.lookupWord(strippedWord, 0, length, scratch); - if (wordFlags != null && Dictionary.hasFlag(wordFlags, affix.getFlag())) { + if (wordFlags != null && Dictionary.hasFlag(wordFlags, flag)) { stems.add(new CharsRef(strippedWord, 0, length)); } - if (affix.isCrossProduct() && recursionDepth < recursionCap) { - dictionary.flagLookup.get(affix.getAppendFlags(), scratch); + if (crossProduct && recursionDepth < recursionCap) { + dictionary.flagLookup.get(append, scratch); char appendFlags[] = Dictionary.decodeFlags(scratch); stems.addAll(stem(strippedWord, length, appendFlags, ++recursionDepth)); } From b2b86dd8add14c8e16d9b794707a3948834a6e68 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 27 Feb 2014 17:53:30 +0000 Subject: [PATCH 13/17] 
LUCENE-5468: convert affixes to FST git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572666 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/analysis/hunspell2/Dictionary.java | 78 ++++++++++++++++--- .../lucene/analysis/hunspell2/Stemmer.java | 13 ++-- .../hunspell2/TestAllDictionaries.java | 3 + .../analysis/hunspell2/TestDictionary.java | 8 +- 4 files changed, 83 insertions(+), 19 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java index b30bdaa1e92..b9f9c82c2f5 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java @@ -31,7 +31,9 @@ import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util.Version; import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FST; +import org.apache.lucene.util.fst.IntSequenceOutputs; import org.apache.lucene.util.fst.PositiveIntOutputs; +import org.apache.lucene.util.fst.Util; import java.io.*; import java.nio.charset.Charset; @@ -46,6 +48,7 @@ import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.TreeMap; import java.util.regex.Pattern; /** @@ -68,8 +71,8 @@ public class Dictionary { private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*"; private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s"; - public CharArrayMap> prefixes; - public CharArrayMap> suffixes; + public FST prefixes; + public FST suffixes; // all Patterns used by prefixes and suffixes. these are typically re-used across // many affix stripping rules. so these are deduplicated, to save RAM. 
@@ -137,7 +140,7 @@ public class Dictionary { ord = lookupOrd(word, offset, length); } catch (IOException ex) { /* bogus */ } if (ord == null) { - return null; + return null; } return decodeFlags(flagLookup.get(ord, scratch)); } @@ -175,8 +178,8 @@ public class Dictionary { * @param length Length from the offset that the String is * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found */ - public List lookupPrefix(char word[], int offset, int length) { - return prefixes.get(word, offset, length); + IntsRef lookupPrefix(char word[], int offset, int length) { + return lookupAffix(prefixes, word, offset, length); } /** @@ -187,8 +190,42 @@ public class Dictionary { * @param length Length from the offset that the String is * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found */ - List lookupSuffix(char word[], int offset, int length) { - return suffixes.get(word, offset, length); + IntsRef lookupSuffix(char word[], int offset, int length) { + return lookupAffix(suffixes, word, offset, length); + } + + // TODO: this is pretty stupid, considering how the stemming algorithm works + // we can speed it up to be significantly faster! 
+ IntsRef lookupAffix(FST fst, char word[], int offset, int length) { + if (fst == null) { + return null; + } + final FST.BytesReader bytesReader = fst.getBytesReader(); + final FST.Arc arc = fst.getFirstArc(new FST.Arc()); + // Accumulate output as we go + final IntsRef NO_OUTPUT = fst.outputs.getNoOutput(); + IntsRef output = NO_OUTPUT; + + int l = offset + length; + try { + for (int i = offset, cp = 0; i < l; i += Character.charCount(cp)) { + cp = Character.codePointAt(word, i, l); + if (fst.findTargetArc(cp, arc, arc, bytesReader) == null) { + return null; + } else if (arc.output != NO_OUTPUT) { + output = fst.outputs.add(output, arc.output); + } + } + if (fst.findTargetArc(FST.END_LABEL, arc, arc, bytesReader) == null) { + return null; + } else if (arc.output != NO_OUTPUT) { + return fst.outputs.add(output, arc.output); + } else { + return output; + } + } catch (IOException bogus) { + throw new RuntimeException(bogus); + } } /** @@ -199,8 +236,8 @@ public class Dictionary { * @throws IOException Can be thrown while reading from the InputStream */ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException { - prefixes = new CharArrayMap>(Version.LUCENE_CURRENT, 8, false); - suffixes = new CharArrayMap>(Version.LUCENE_CURRENT, 8, false); + TreeMap> prefixes = new TreeMap<>(); + TreeMap> suffixes = new TreeMap<>(); Map seenPatterns = new HashMap<>(); LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder)); @@ -218,6 +255,27 @@ public class Dictionary { flagParsingStrategy = getFlagParsingStrategy(line); } } + + this.prefixes = affixFST(prefixes); + this.suffixes = affixFST(suffixes); + } + + private FST affixFST(TreeMap> affixes) throws IOException { + IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); + Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs); + + IntsRef scratch = new IntsRef(); + for (Map.Entry> entry : affixes.entrySet()) { + 
Util.toUTF32(entry.getKey(), scratch); + List entries = entry.getValue(); + IntsRef output = new IntsRef(entries.size()); + int upto = 0; + for (Character c : entries) { + output.ints[output.length++] = c; + } + builder.add(scratch, output); + } + return builder.finish(); } /** @@ -231,7 +289,7 @@ public class Dictionary { * @param seenPatterns map from condition -> index of patterns, for deduplication. * @throws IOException Can be thrown while reading the rule */ - private void parseAffix(CharArrayMap> affixes, + private void parseAffix(TreeMap> affixes, String header, LineNumberReader reader, String conditionPattern, diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java index 4eaff6a9e95..d6b0133830a 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java @@ -27,6 +27,7 @@ import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.Version; /** @@ -125,12 +126,13 @@ final class Stemmer { List stems = new ArrayList(); for (int i = 0; i < length; i++) { - List suffixes = dictionary.lookupSuffix(word, i, length - i); + IntsRef suffixes = dictionary.lookupSuffix(word, i, length - i); if (suffixes == null) { continue; } - for (Character suffix : suffixes) { + for (int j = 0; j < suffixes.length; j++) { + int suffix = suffixes.ints[suffixes.offset + j]; affixReader.setPosition(8 * suffix); char flag = (char) (affixReader.readShort() & 0xffff); if (hasCrossCheckedFlag(flag, flags)) { @@ -149,12 +151,13 @@ final class Stemmer { } for (int i = length - 1; i >= 0; i--) { - List prefixes = dictionary.lookupPrefix(word, 0, i); + IntsRef 
prefixes = dictionary.lookupPrefix(word, 0, i); if (prefixes == null) { continue; } - for (Character prefix : prefixes) { + for (int j = 0; j < prefixes.length; j++) { + int prefix = prefixes.ints[prefixes.offset + j]; affixReader.setPosition(8 * prefix); char flag = (char) (affixReader.readShort() & 0xffff); if (hasCrossCheckedFlag(flag, flags)) { @@ -185,7 +188,7 @@ final class Stemmer { * @param recursionDepth Level of recursion this stemming step is at * @return List of stems for the word, or an empty list if none are found */ - public List applyAffix(char strippedWord[], int length, char affix, int recursionDepth) { + public List applyAffix(char strippedWord[], int length, int affix, int recursionDepth) { segment.setLength(0); segment.append(strippedWord, 0, length); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java index 9f9bce98236..d00fc634944 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java @@ -179,6 +179,9 @@ public class TestAllDictionaries extends LuceneTestCase { System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" + "words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " + "flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " + + "strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " + + "conditions=" + RamUsageEstimator.humanSizeOf(dic.patterns) + ", " + + "affixData=" + RamUsageEstimator.humanSizeOf(dic.affixData) + ", " + "prefixes=" + RamUsageEstimator.humanSizeOf(dic.prefixes) + ", " + "suffixes=" + RamUsageEstimator.humanSizeOf(dic.suffixes) + ")"); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java 
b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java index 14c6e8967d0..e8e0fd0d030 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java @@ -32,8 +32,8 @@ public class TestDictionary extends LuceneTestCase { InputStream dictStream = getClass().getResourceAsStream("simple.dic"); Dictionary dictionary = new Dictionary(affixStream, dictStream); - assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size()); - assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size()); + assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length); + assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length); char flags[] = dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef()); assertNotNull(flags); assertEquals(1, flags.length); @@ -48,8 +48,8 @@ public class TestDictionary extends LuceneTestCase { InputStream dictStream = getClass().getResourceAsStream("compressed.dic"); Dictionary dictionary = new Dictionary(affixStream, dictStream); - assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size()); - assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size()); + assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).length); + assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).length); assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3, new BytesRef()).length); affixStream.close(); From c4f4beb27e6cb636b0b151b4288f2230e350adc4 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 27 Feb 2014 20:19:27 +0000 Subject: [PATCH 14/17] LUCENE-5468: hunspell2 -> hunspell (with previous options and tests) git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572718 13f79535-47bb-0310-9956-ffa450edef68 --- .../{hunspell2 => hunspell}/Dictionary.java | 85 ++- 
.../analysis/hunspell/HunspellAffix.java | 157 ------ .../analysis/hunspell/HunspellDictionary.java | 507 ------------------ .../analysis/hunspell/HunspellStemFilter.java | 89 ++- .../hunspell/HunspellStemFilterFactory.java | 62 +-- .../analysis/hunspell/HunspellStemmer.java | 392 -------------- .../analysis/hunspell/HunspellWord.java | 63 --- .../ISO8859_14Decoder.java | 2 +- .../{hunspell2 => hunspell}/Stemmer.java | 28 +- .../hunspell2/Hunspell2StemFilter.java | 137 ----- .../hunspell2/Hunspell2StemFilterFactory.java | 80 --- .../lucene/analysis/hunspell2/package.html | 26 - ...he.lucene.analysis.util.TokenFilterFactory | 1 - .../analysis/core/TestRandomChains.java | 12 +- .../hunspell/HunspellDictionaryTest.java | 201 ------- .../hunspell/HunspellStemFilterTest.java | 92 ---- .../hunspell/HunspellStemmerTest.java | 137 ----- .../TestAllDictionaries.java | 20 +- .../hunspell/TestCaseInsensitive.java | 110 ++++ .../TestDictionary.java | 3 +- .../TestHunspellStemFilter.java} | 22 +- .../TestHunspellStemFilterFactory.java | 11 +- .../{hunspell2 => hunspell}/TestStemmer.java | 4 +- .../{hunspell2 => hunspell}/broken.aff | 0 .../{hunspell2 => hunspell}/compressed.aff | 0 .../{hunspell2 => hunspell}/compressed.dic | 0 .../lucene/analysis/hunspell/mixedcase.dic | 10 + .../{hunspell2 => hunspell}/simple.aff | 0 .../{hunspell2 => hunspell}/simple.dic | 0 .../apache/lucene/analysis/hunspell/test.aff | 20 - .../apache/lucene/analysis/hunspell/test.dic | 10 - .../analysis/hunspell/testCompressed.aff | 29 - .../analysis/hunspell/testCompressed.dic | 9 - .../lucene/analysis/hunspell/testOverride.dic | 3 - .../analysis/hunspell/testWrongAffixRule.aff | 24 - .../TestHunspell2StemFilterFactory.java | 50 -- 36 files changed, 320 insertions(+), 2076 deletions(-) rename lucene/analysis/common/src/java/org/apache/lucene/analysis/{hunspell2 => hunspell}/Dictionary.java (90%) delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java 
delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java rename lucene/analysis/common/src/java/org/apache/lucene/analysis/{hunspell2 => hunspell}/ISO8859_14Decoder.java (98%) rename lucene/analysis/common/src/java/org/apache/lucene/analysis/{hunspell2 => hunspell}/Stemmer.java (92%) delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java delete mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/TestAllDictionaries.java (93%) create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/TestDictionary.java (97%) rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2/TestHunspell2StemFilter.java => hunspell/TestHunspellStemFilter.java} (75%) rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/TestStemmer.java (95%) rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/broken.aff (100%) rename 
lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/compressed.aff (100%) rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/compressed.dic (100%) create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/mixedcase.dic rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/simple.aff (100%) rename lucene/analysis/common/src/test/org/apache/lucene/analysis/{hunspell2 => hunspell}/simple.dic (100%) delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.aff delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.dic delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testOverride.dic delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff delete mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java similarity index 90% rename from lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java rename to lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index b9f9c82c2f5..7bbf27fb817 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis.hunspell2; +package org.apache.lucene.analysis.hunspell; 
/* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,7 +17,6 @@ package org.apache.lucene.analysis.hunspell2; * limitations under the License. */ -import org.apache.lucene.analysis.util.CharArrayMap; import org.apache.lucene.store.ByteArrayDataOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -28,14 +27,19 @@ import org.apache.lucene.util.OfflineSorter; import org.apache.lucene.util.OfflineSorter.ByteSequencesReader; import org.apache.lucene.util.OfflineSorter.ByteSequencesWriter; import org.apache.lucene.util.UnicodeUtil; -import org.apache.lucene.util.Version; import org.apache.lucene.util.fst.Builder; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.IntSequenceOutputs; import org.apache.lucene.util.fst.PositiveIntOutputs; import org.apache.lucene.util.fst.Util; -import java.io.*; +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.LineNumberReader; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CodingErrorAction; @@ -71,27 +75,27 @@ public class Dictionary { private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*"; private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s"; - public FST prefixes; - public FST suffixes; + FST prefixes; + FST suffixes; // all Patterns used by prefixes and suffixes. these are typically re-used across // many affix stripping rules. so these are deduplicated, to save RAM. // TODO: maybe don't use Pattern for the condition check... // TODO: when we cut over Affix to FST, just store integer index to this. - public ArrayList patterns = new ArrayList<>(); + ArrayList patterns = new ArrayList<>(); // the entries in the .dic file, mapping to their set of flags. 
// the fst output is the ordinal for flagLookup - public FST words; + FST words; // the list of unique flagsets (wordforms). theoretically huge, but practically // small (e.g. for polish this is 756), otherwise humans wouldn't be able to deal with it either. - public BytesRefHash flagLookup = new BytesRefHash(); + BytesRefHash flagLookup = new BytesRefHash(); // the list of unique strip affixes. - public BytesRefHash stripLookup = new BytesRefHash(); + BytesRefHash stripLookup = new BytesRefHash(); // 8 bytes per affix - public byte[] affixData = new byte[64]; + byte[] affixData = new byte[64]; private int currentAffix = 0; private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy @@ -100,7 +104,11 @@ public class Dictionary { private int aliasCount = 0; private final File tempDir = OfflineSorter.defaultTempDir(); // TODO: make this configurable? - + + public static final int IGNORE_CASE = 1; + + boolean ignoreCase; + /** * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix * and dictionary files. @@ -112,6 +120,21 @@ public class Dictionary { * @throws ParseException Can be thrown if the content of the files does not meet expected formats */ public Dictionary(InputStream affix, InputStream dictionary) throws IOException, ParseException { + this(affix, Collections.singletonList(dictionary), false); + } + + /** + * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix + * and dictionary files. + * You have to close the provided InputStreams yourself. + * + * @param affix InputStream for reading the hunspell affix file (won't be closed). + * @param dictionaries InputStream for reading the hunspell dictionary files (won't be closed). 
+ * @throws IOException Can be thrown while reading from the InputStreams + * @throws ParseException Can be thrown if the content of the files does not meet expected formats + */ + public Dictionary(InputStream affix, List dictionaries, boolean ignoreCase) throws IOException, ParseException { + this.ignoreCase = ignoreCase; BufferedInputStream buffered = new BufferedInputStream(affix, 8192); buffered.mark(8192); String encoding = getDictionaryEncoding(affix); @@ -122,7 +145,7 @@ public class Dictionary { stripLookup.add(new BytesRef()); // no strip -> ord 0 PositiveIntOutputs o = PositiveIntOutputs.getSingleton(); Builder b = new Builder(FST.INPUT_TYPE.BYTE4, o); - readDictionaryFile(dictionary, decoder, b); + readDictionaryFiles(dictionaries, decoder, b); words = b.finish(); } @@ -145,7 +168,7 @@ public class Dictionary { return decodeFlags(flagLookup.get(ord, scratch)); } - public Integer lookupOrd(char word[], int offset, int length) throws IOException { + Integer lookupOrd(char word[], int offset, int length) throws IOException { final FST.BytesReader bytesReader = words.getBytesReader(); final FST.Arc arc = words.getFirstArc(new FST.Arc()); // Accumulate output as we go @@ -269,7 +292,6 @@ public class Dictionary { Util.toUTF32(entry.getKey(), scratch); List entries = entry.getValue(); IntsRef output = new IntsRef(entries.size()); - int upto = 0; for (Character c : entries) { output.ints[output.length++] = c; } @@ -480,23 +502,39 @@ public class Dictionary { } /** - * Reads the dictionary file through the provided InputStream, building up the words map + * Reads the dictionary file through the provided InputStreams, building up the words map * - * @param dictionary InputStream to read the dictionary file through + * @param dictionaries InputStreams to read the dictionary file through * @param decoder CharsetDecoder used to decode the contents of the file * @throws IOException Can be thrown while reading from the file */ - private void 
readDictionaryFile(InputStream dictionary, CharsetDecoder decoder, Builder words) throws IOException { + private void readDictionaryFiles(List dictionaries, CharsetDecoder decoder, Builder words) throws IOException { BytesRef flagsScratch = new BytesRef(); IntsRef scratchInts = new IntsRef(); - BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder)); - String line = lines.readLine(); // first line is number of entries (approximately, sometimes) - File unsorted = File.createTempFile("unsorted", "dat", tempDir); try (ByteSequencesWriter writer = new ByteSequencesWriter(unsorted)) { - while ((line = lines.readLine()) != null) { - writer.write(line.getBytes(IOUtils.CHARSET_UTF_8)); + for (InputStream dictionary : dictionaries) { + BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder)); + String line = lines.readLine(); // first line is number of entries (approximately, sometimes) + + while ((line = lines.readLine()) != null) { + if (ignoreCase) { + int flagSep = line.lastIndexOf('/'); + if (flagSep == -1) { + writer.write(line.toLowerCase(Locale.ROOT).getBytes(IOUtils.CHARSET_UTF_8)); + } else { + StringBuilder sb = new StringBuilder(); + sb.append(line.substring(0, flagSep).toLowerCase(Locale.ROOT)); + if (flagSep < line.length()) { + sb.append(line.substring(flagSep, line.length())); + } + writer.write(sb.toString().getBytes(IOUtils.CHARSET_UTF_8)); + } + } else { + writer.write(line.getBytes(IOUtils.CHARSET_UTF_8)); + } + } } } File sorted = File.createTempFile("sorted", "dat", tempDir); @@ -544,6 +582,7 @@ public class Dictionary { BytesRef currentEntry = new BytesRef(); char currentFlags[] = new char[0]; + String line; while (reader.read(scratchLine)) { line = scratchLine.utf8ToString(); String entry; diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java deleted file mode 
100644 index 97376c0b15e..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellAffix.java +++ /dev/null @@ -1,157 +0,0 @@ -package org.apache.lucene.analysis.hunspell; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.util.regex.Pattern; - -/** - * Wrapper class representing a hunspell affix - */ -public class HunspellAffix { - - private String append; // the affix itself, what is appended - private char appendFlags[]; // continuation class flags - private String strip; - - private String condition; - private Pattern conditionPattern; - - private char flag; - - private boolean crossProduct; - - /** - * Checks whether the given text matches the conditional pattern on this affix - * - * @param text Text to check if it matches the affix's conditional pattern - * @return {@code true} if the text meets the condition, {@code false} otherwise - */ - public boolean checkCondition(CharSequence text) { - return conditionPattern.matcher(text).matches(); - } - - /** - * Returns the append defined for the affix - * - * @return Defined append - */ - public String getAppend() { - return append; - } - - /** - * Sets the append defined for the affix - * - * @param append Defined append for the affix - */ - public void setAppend(String append) { - this.append = append; - } - - /** - * Returns the flags defined for the affix append - * - * @return Flags defined for the affix append - */ - public char[] getAppendFlags() { - return appendFlags; - } - - /** - * Sets the flags defined for the affix append - * - * @param appendFlags Flags defined for the affix append - */ - public void setAppendFlags(char[] appendFlags) { - this.appendFlags = appendFlags; - } - - /** - * Returns the stripping characters defined for the affix - * - * @return Stripping characters defined for the affix - */ - public String getStrip() { - return strip; - } - - /** - * Sets the stripping characters defined for the affix - * - * @param strip Stripping characters defined for the affix - */ - public void setStrip(String strip) { - this.strip = strip; - } - - /** - * Returns the condition that must be met before the affix can be applied - * - * @return Condition that must be met before the affix can be 
applied - */ - public String getCondition() { - return condition; - } - - /** - * Sets the condition that must be met before the affix can be applied - * - * @param condition Condition to be met before affix application - * @param pattern Condition as a regular expression pattern - */ - public void setCondition(String condition, String pattern) { - this.condition = condition; - this.conditionPattern = Pattern.compile(pattern); - } - - /** - * Returns the affix flag - * - * @return Affix flag - */ - public char getFlag() { - return flag; - } - - /** - * Sets the affix flag - * - * @param flag Affix flag - */ - public void setFlag(char flag) { - this.flag = flag; - } - - /** - * Returns whether the affix is defined as cross product - * - * @return {@code true} if the affix is cross product, {@code false} otherwise - */ - public boolean isCrossProduct() { - return crossProduct; - } - - /** - * Sets whether the affix is defined as cross product - * - * @param crossProduct Whether the affix is defined as cross product - */ - public void setCrossProduct(boolean crossProduct) { - this.crossProduct = crossProduct; - } -} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java deleted file mode 100644 index ccb53f57d29..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java +++ /dev/null @@ -1,507 +0,0 @@ -package org.apache.lucene.analysis.hunspell; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.analysis.util.CharArrayMap; -import org.apache.lucene.util.Version; - -import java.io.*; -import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; -import java.text.ParseException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Locale; - -/** - * In-memory structure for the dictionary (.dic) and affix (.aff) - * data of a hunspell dictionary. - */ -public class HunspellDictionary { - - static final HunspellWord NOFLAGS = new HunspellWord(); - - private static final String ALIAS_KEY = "AF"; - private static final String PREFIX_KEY = "PFX"; - private static final String SUFFIX_KEY = "SFX"; - private static final String FLAG_KEY = "FLAG"; - - private static final String NUM_FLAG_TYPE = "num"; - private static final String UTF8_FLAG_TYPE = "UTF-8"; - private static final String LONG_FLAG_TYPE = "long"; - - private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*"; - private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s"; - - private static final boolean IGNORE_CASE_DEFAULT = false; - private static final boolean STRICT_AFFIX_PARSING_DEFAULT = true; - - private CharArrayMap> words; - private CharArrayMap> prefixes; - private CharArrayMap> suffixes; - - private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy - private boolean ignoreCase = IGNORE_CASE_DEFAULT; - - private final Version version; - - private String[] aliases; - private int aliasCount = 0; - - /** - * Creates a new 
HunspellDictionary containing the information read from the provided InputStreams to hunspell affix - * and dictionary files. - * You have to close the provided InputStreams yourself. - * - * @param affix InputStream for reading the hunspell affix file (won't be closed). - * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed). - * @param version Lucene Version - * @throws IOException Can be thrown while reading from the InputStreams - * @throws ParseException Can be thrown if the content of the files does not meet expected formats - */ - public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException { - this(affix, Arrays.asList(dictionary), version, IGNORE_CASE_DEFAULT); - } - - /** - * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix - * and dictionary files. - * You have to close the provided InputStreams yourself. - * - * @param affix InputStream for reading the hunspell affix file (won't be closed). - * @param dictionary InputStream for reading the hunspell dictionary file (won't be closed). - * @param version Lucene Version - * @param ignoreCase If true, dictionary matching will be case insensitive - * @throws IOException Can be thrown while reading from the InputStreams - * @throws ParseException Can be thrown if the content of the files does not meet expected formats - */ - public HunspellDictionary(InputStream affix, InputStream dictionary, Version version, boolean ignoreCase) throws IOException, ParseException { - this(affix, Arrays.asList(dictionary), version, ignoreCase); - } - - /** - * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix - * and dictionary files. - * You have to close the provided InputStreams yourself. - * - * @param affix InputStream for reading the hunspell affix file (won't be closed). 
- * @param dictionaries InputStreams for reading the hunspell dictionary file (won't be closed). - * @param version Lucene Version - * @param ignoreCase If true, dictionary matching will be case insensitive - * @throws IOException Can be thrown while reading from the InputStreams - * @throws ParseException Can be thrown if the content of the files does not meet expected formats - */ - public HunspellDictionary(InputStream affix, List dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException { - this(affix, dictionaries, version, ignoreCase, STRICT_AFFIX_PARSING_DEFAULT); - } - - /** - * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix - * and dictionary files. - * You have to close the provided InputStreams yourself. - * - * @param affix InputStream for reading the hunspell affix file (won't be closed). - * @param dictionaries InputStreams for reading the hunspell dictionary file (won't be closed). 
- * @param version Lucene Version - * @param ignoreCase If true, dictionary matching will be case insensitive - * @param strictAffixParsing Affix strict parsing enabled or not (an error while reading a rule causes exception or is ignored) - * @throws IOException Can be thrown while reading from the InputStreams - * @throws ParseException Can be thrown if the content of the files does not meet expected formats - */ - public HunspellDictionary(InputStream affix, List dictionaries, Version version, boolean ignoreCase, boolean strictAffixParsing) throws IOException, ParseException { - this.version = version; - this.ignoreCase = ignoreCase; - String encoding = getDictionaryEncoding(affix); - CharsetDecoder decoder = getJavaEncoding(encoding); - readAffixFile(affix, decoder, strictAffixParsing); - words = new CharArrayMap>(version, 65535 /* guess */, this.ignoreCase); - for (InputStream dictionary : dictionaries) { - readDictionaryFile(dictionary, decoder); - } - } - - /** - * Looks up HunspellWords that match the String created from the given char array, offset and length - * - * @param word Char array to generate the String from - * @param offset Offset in the char array that the String starts at - * @param length Length from the offset that the String is - * @return List of HunspellWords that match the generated String, or {@code null} if none are found - */ - public List lookupWord(char word[], int offset, int length) { - return words.get(word, offset, length); - } - - /** - * Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length - * - * @param word Char array to generate the String from - * @param offset Offset in the char array that the String starts at - * @param length Length from the offset that the String is - * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found - */ - public List lookupPrefix(char word[], int offset, int 
length) { - return prefixes.get(word, offset, length); - } - - /** - * Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length - * - * @param word Char array to generate the String from - * @param offset Offset in the char array that the String starts at - * @param length Length from the offset that the String is - * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found - */ - public List lookupSuffix(char word[], int offset, int length) { - return suffixes.get(word, offset, length); - } - - /** - * Reads the affix file through the provided InputStream, building up the prefix and suffix maps - * - * @param affixStream InputStream to read the content of the affix file from - * @param decoder CharsetDecoder to decode the content of the file - * @throws IOException Can be thrown while reading from the InputStream - */ - private void readAffixFile(InputStream affixStream, CharsetDecoder decoder, boolean strict) throws IOException, ParseException { - prefixes = new CharArrayMap>(version, 8, ignoreCase); - suffixes = new CharArrayMap>(version, 8, ignoreCase); - - LineNumberReader reader = new LineNumberReader(new InputStreamReader(affixStream, decoder)); - String line = null; - while ((line = reader.readLine()) != null) { - if (line.startsWith(ALIAS_KEY)) { - parseAlias(line); - } else if (line.startsWith(PREFIX_KEY)) { - parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, strict); - } else if (line.startsWith(SUFFIX_KEY)) { - parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN, strict); - } else if (line.startsWith(FLAG_KEY)) { - // Assume that the FLAG line comes before any prefix or suffixes - // Store the strategy so it can be used when parsing the dic file - flagParsingStrategy = getFlagParsingStrategy(line); - } - } - } - - /** - * Parses a specific affix rule putting the result into the provided affix map 
- * - * @param affixes Map where the result of the parsing will be put - * @param header Header line of the affix rule - * @param reader BufferedReader to read the content of the rule from - * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex - * pattern - * @throws IOException Can be thrown while reading the rule - */ - private void parseAffix(CharArrayMap> affixes, - String header, - LineNumberReader reader, - String conditionPattern, - boolean strict) throws IOException, ParseException { - String args[] = header.split("\\s+"); - - boolean crossProduct = args[2].equals("Y"); - - int numLines = Integer.parseInt(args[3]); - for (int i = 0; i < numLines; i++) { - String line = reader.readLine(); - String ruleArgs[] = line.split("\\s+"); - - if (ruleArgs.length < 5) { - if (strict) { - throw new ParseException("The affix file contains a rule with less than five elements", reader.getLineNumber()); - } - continue; - } - - HunspellAffix affix = new HunspellAffix(); - - affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1])); - affix.setStrip(ruleArgs[2].equals("0") ? 
"" : ruleArgs[2]); - - String affixArg = ruleArgs[3]; - - int flagSep = affixArg.lastIndexOf('/'); - if (flagSep != -1) { - String flagPart = affixArg.substring(flagSep + 1); - - if (aliasCount > 0) { - flagPart = getAliasValue(Integer.parseInt(flagPart)); - } - - char appendFlags[] = flagParsingStrategy.parseFlags(flagPart); - Arrays.sort(appendFlags); - affix.setAppendFlags(appendFlags); - affix.setAppend(affixArg.substring(0, flagSep)); - } else { - affix.setAppend(affixArg); - } - - String condition = ruleArgs[4]; - affix.setCondition(condition, String.format(Locale.ROOT, conditionPattern, condition)); - affix.setCrossProduct(crossProduct); - - List list = affixes.get(affix.getAppend()); - if (list == null) { - list = new ArrayList(); - affixes.put(affix.getAppend(), list); - } - - list.add(affix); - } - } - - /** - * Parses the encoding specified in the affix file readable through the provided InputStream - * - * @param affix InputStream for reading the affix file - * @return Encoding specified in the affix file - * @throws IOException Can be thrown while reading from the InputStream - * @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET } - */ - private String getDictionaryEncoding(InputStream affix) throws IOException, ParseException { - final StringBuilder encoding = new StringBuilder(); - for (;;) { - encoding.setLength(0); - int ch; - while ((ch = affix.read()) >= 0) { - if (ch == '\n') { - break; - } - if (ch != '\r') { - encoding.append((char)ch); - } - } - if ( - encoding.length() == 0 || encoding.charAt(0) == '#' || - // this test only at the end as ineffective but would allow lines only containing spaces: - encoding.toString().trim().length() == 0 - ) { - if (ch < 0) { - throw new ParseException("Unexpected end of affix file.", 0); - } - continue; - } - if ("SET ".equals(encoding.substring(0, 4))) { - // cleanup the encoding string, too (whitespace) - return 
encoding.substring(4).trim(); - } - throw new ParseException("The first non-comment line in the affix file must "+ - "be a 'SET charset', was: '" + encoding +"'", 0); - } - } - - /** - * Retrieves the CharsetDecoder for the given encoding. Note, This isn't perfect as I think ISCII-DEVANAGARI and - * MICROSOFT-CP1251 etc are allowed... - * - * @param encoding Encoding to retrieve the CharsetDecoder for - * @return CharSetDecoder for the given encoding - */ - private CharsetDecoder getJavaEncoding(String encoding) { - Charset charset = Charset.forName(encoding); - return charset.newDecoder(); - } - - /** - * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definition line taken from the affix file - * - * @param flagLine Line containing the flag information - * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definition - */ - private FlagParsingStrategy getFlagParsingStrategy(String flagLine) { - String flagType = flagLine.substring(5); - - if (NUM_FLAG_TYPE.equals(flagType)) { - return new NumFlagParsingStrategy(); - } else if (UTF8_FLAG_TYPE.equals(flagType)) { - return new SimpleFlagParsingStrategy(); - } else if (LONG_FLAG_TYPE.equals(flagType)) { - return new DoubleASCIIFlagParsingStrategy(); - } - - throw new IllegalArgumentException("Unknown flag type: " + flagType); - } - - /** - * Reads the dictionary file through the provided InputStream, building up the words map - * - * @param dictionary InputStream to read the dictionary file through - * @param decoder CharsetDecoder used to decode the contents of the file - * @throws IOException Can be thrown while reading from the file - */ - private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder) throws IOException { - BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder)); - // TODO: don't create millions of strings. 
- String line = reader.readLine(); // first line is number of entries - int numEntries = Integer.parseInt(line); - - // TODO: the flags themselves can be double-chars (long) or also numeric - // either way the trick is to encode them as char... but they must be parsed differently - while ((line = reader.readLine()) != null) { - String entry; - HunspellWord wordForm; - - int flagSep = line.lastIndexOf('/'); - if (flagSep == -1) { - wordForm = NOFLAGS; - entry = line; - } else { - // note, there can be comments (morph description) after a flag. - // we should really look for any whitespace - int end = line.indexOf('\t', flagSep); - if (end == -1) - end = line.length(); - - String flagPart = line.substring(flagSep + 1, end); - if (aliasCount > 0) { - flagPart = getAliasValue(Integer.parseInt(flagPart)); - } - - wordForm = new HunspellWord(flagParsingStrategy.parseFlags(flagPart)); - Arrays.sort(wordForm.getFlags()); - entry = line.substring(0, flagSep); - } - if(ignoreCase) { - entry = entry.toLowerCase(Locale.ROOT); - } - - List entries = new ArrayList(); - entries.add(wordForm); - words.put(entry, entries); - } - } - - public Version getVersion() { - return version; - } - - private void parseAlias(String line) { - String ruleArgs[] = line.split("\\s+"); - if (aliases == null) { - //first line should be the aliases count - final int count = Integer.parseInt(ruleArgs[1]); - aliases = new String[count]; - } else { - aliases[aliasCount++] = ruleArgs[1]; - } - } - - private String getAliasValue(int id) { - try { - return aliases[id - 1]; - } catch (IndexOutOfBoundsException ex) { - throw new IllegalArgumentException("Bad flag alias number:" + id, ex); - } - } - - /** - * Abstraction of the process of parsing flags taken from the affix and dic files - */ - private static abstract class FlagParsingStrategy { - - /** - * Parses the given String into a single flag - * - * @param rawFlag String to parse into a flag - * @return Parsed flag - */ - char parseFlag(String rawFlag) 
{ - return parseFlags(rawFlag)[0]; - } - - /** - * Parses the given String into multiple flags - * - * @param rawFlags String to parse into flags - * @return Parsed flags - */ - abstract char[] parseFlags(String rawFlags); - } - - /** - * Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as a individual flags. - * Can be used with both the ASCII and UTF-8 flag types. - */ - private static class SimpleFlagParsingStrategy extends FlagParsingStrategy { - /** - * {@inheritDoc} - */ - @Override - public char[] parseFlags(String rawFlags) { - return rawFlags.toCharArray(); - } - } - - /** - * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form. In the case - * of multiple flags, each number is separated by a comma. - */ - private static class NumFlagParsingStrategy extends FlagParsingStrategy { - /** - * {@inheritDoc} - */ - @Override - public char[] parseFlags(String rawFlags) { - String[] rawFlagParts = rawFlags.trim().split(","); - char[] flags = new char[rawFlagParts.length]; - - for (int i = 0; i < rawFlagParts.length; i++) { - // note, removing the trailing X/leading I for nepali... what is the rule here?! - flags[i] = (char) Integer.parseInt(rawFlagParts[i].replaceAll("[^0-9]", "")); - } - - return flags; - } - } - - /** - * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes - * must be combined into a single character. 
- * - * TODO (rmuir) test - */ - private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy { - - /** - * {@inheritDoc} - */ - @Override - public char[] parseFlags(String rawFlags) { - if (rawFlags.length() == 0) { - return new char[0]; - } - - StringBuilder builder = new StringBuilder(); - for (int i = 0; i < rawFlags.length(); i+=2) { - char cookedFlag = (char) ((int) rawFlags.charAt(i) + (int) rawFlags.charAt(i + 1)); - builder.append(cookedFlag); - } - - char flags[] = new char[builder.length()]; - builder.getChars(0, builder.length(), flags, 0); - return flags; - } - } - - public boolean isIgnoreCase() { - return ignoreCase; - } -} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java index 4ff0a741ad8..a9b512b7bbd 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java @@ -18,14 +18,16 @@ package org.apache.lucene.analysis.hunspell; */ import java.io.IOException; +import java.util.Collections; +import java.util.Comparator; import java.util.List; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.hunspell.HunspellStemmer.Stem; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.util.CharsRef; /** * TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple @@ -41,71 +43,83 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; * {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory} *

* - * + * @lucene.experimental */ public final class HunspellStemFilter extends TokenFilter { private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class); - private final HunspellStemmer stemmer; + private final Stemmer stemmer; - private List buffer; + private List buffer; private State savedState; private final boolean dedup; + private final boolean longestOnly; /** Create a {@link HunspellStemFilter} which deduplicates stems and has a maximum * recursion level of 2. - * @see #HunspellStemFilter(TokenStream, HunspellDictionary, int) */ - public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary) { + * @see #HunspellStemFilter(TokenStream, Dictionary, int) */ + public HunspellStemFilter(TokenStream input, Dictionary dictionary) { this(input, dictionary, 2); } /** - * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided - * HunspellDictionary + * Creates a new Hunspell2StemFilter that will stem tokens from the given TokenStream using affix rules in the provided + * Dictionary * * @param input TokenStream whose tokens will be stemmed * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens * @param recursionCap maximum level of recursion stemmer can go into, defaults to 2 */ - public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, int recursionCap) { + public HunspellStemFilter(TokenStream input, Dictionary dictionary, int recursionCap) { this(input, dictionary, true, recursionCap); } /** Create a {@link HunspellStemFilter} which has a maximum recursion level of 2. 
- * @see #HunspellStemFilter(TokenStream, HunspellDictionary, boolean, int) */ - public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup) { + * @see #HunspellStemFilter(TokenStream, Dictionary, boolean, int) */ + public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup) { this(input, dictionary, dedup, 2); } - + /** * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided - * HunspellDictionary + * Dictionary * * @param input TokenStream whose tokens will be stemmed * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens * @param dedup true if only unique terms should be output. * @param recursionCap maximum level of recursion stemmer can go into, defaults to 2 */ - public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, boolean dedup, int recursionCap) { - super(input); - this.dedup = dedup; - this.stemmer = new HunspellStemmer(dictionary, recursionCap); + public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap) { + this(input, dictionary, dedup, recursionCap, false); } /** - * {@inheritDoc} + * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided + * Dictionary + * + * @param input TokenStream whose tokens will be stemmed + * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens + * @param dedup true if only unique terms should be output. + * @param recursionCap maximum level of recursion stemmer can go into, defaults to 2 + * @param longestOnly true if only the longest term should be output. 
*/ + public HunspellStemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap, boolean longestOnly) { + super(input); + this.dedup = dedup && longestOnly == false; // don't waste time deduping if longestOnly is set + this.stemmer = new Stemmer(dictionary, recursionCap); + this.longestOnly = longestOnly; + } + @Override public boolean incrementToken() throws IOException { if (buffer != null && !buffer.isEmpty()) { - Stem nextStem = buffer.remove(0); + CharsRef nextStem = buffer.remove(0); restoreState(savedState); posIncAtt.setPositionIncrement(0); - termAtt.copyBuffer(nextStem.getStem(), 0, nextStem.getStemLength()); - termAtt.setLength(nextStem.getStemLength()); + termAtt.setEmpty().append(nextStem); return true; } @@ -122,24 +136,41 @@ public final class HunspellStemFilter extends TokenFilter { if (buffer.isEmpty()) { // we do not know this word, return it unchanged return true; } + + if (longestOnly && buffer.size() > 1) { + Collections.sort(buffer, lengthComparator); + } - Stem stem = buffer.remove(0); - termAtt.copyBuffer(stem.getStem(), 0, stem.getStemLength()); - termAtt.setLength(stem.getStemLength()); + CharsRef stem = buffer.remove(0); + termAtt.setEmpty().append(stem); - if (!buffer.isEmpty()) { - savedState = captureState(); + if (longestOnly) { + buffer.clear(); + } else { + if (!buffer.isEmpty()) { + savedState = captureState(); + } } return true; } - /** - * {@inheritDoc} - */ @Override public void reset() throws IOException { super.reset(); buffer = null; } + + static final Comparator lengthComparator = new Comparator() { + @Override + public int compare(CharsRef o1, CharsRef o2) { + int cmp = Integer.compare(o2.length, o1.length); + if (cmp == 0) { + // tie break on text + return o2.compareTo(o1); + } else { + return cmp; + } + } + }; } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java 
b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java index 63e621c2ab9..e632b489d51 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilterFactory.java @@ -31,89 +31,75 @@ import org.apache.lucene.analysis.util.TokenFilterFactory; import org.apache.lucene.util.IOUtils; /** - * TokenFilterFactory that creates instances of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter}. - * Example config for British English including a custom dictionary, case insensitive matching: + * TokenFilterFactory that creates instances of {@link HunspellStemFilter}. + * Example config for British English: *
  * <filter class="solr.HunspellStemFilterFactory"
- *    dictionary="en_GB.dic,my_custom.dic"
- *    affix="en_GB.aff"
- *    ignoreCase="true" />
+ * dictionary="en_GB.dic,my_custom.dic" + * affix="en_GB.aff" + * ignoreCase="false" + * longestOnly="false" /> * Both parameters dictionary and affix are mandatory. - *
- * The parameter ignoreCase (true/false) controls whether matching is case sensitive or not. Default false. - *
- * The parameter strictAffixParsing (true/false) controls whether the affix parsing is strict or not. Default true. - * If strict an error while reading an affix rule causes a ParseException, otherwise is ignored. - *
* Dictionaries for many languages are available through the OpenOffice project. * * See http://wiki.apache.org/solr/Hunspell + * @lucene.experimental */ public class HunspellStemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - private static final String PARAM_DICTIONARY = "dictionary"; - private static final String PARAM_AFFIX = "affix"; - private static final String PARAM_IGNORE_CASE = "ignoreCase"; - private static final String PARAM_STRICT_AFFIX_PARSING = "strictAffixParsing"; + private static final String PARAM_DICTIONARY = "dictionary"; + private static final String PARAM_AFFIX = "affix"; private static final String PARAM_RECURSION_CAP = "recursionCap"; + private static final String PARAM_IGNORE_CASE = "ignoreCase"; + private static final String PARAM_LONGEST_ONLY = "longestOnly"; - private final String dictionaryArg; + private final String dictionaryFiles; private final String affixFile; private final boolean ignoreCase; - private final boolean strictAffixParsing; - private HunspellDictionary dictionary; + private final boolean longestOnly; + private Dictionary dictionary; private int recursionCap; /** Creates a new HunspellStemFilterFactory */ public HunspellStemFilterFactory(Map args) { super(args); - assureMatchVersion(); - dictionaryArg = require(args, PARAM_DICTIONARY); + dictionaryFiles = require(args, PARAM_DICTIONARY); affixFile = get(args, PARAM_AFFIX); ignoreCase = getBoolean(args, PARAM_IGNORE_CASE, false); - strictAffixParsing = getBoolean(args, PARAM_STRICT_AFFIX_PARSING, true); recursionCap = getInt(args, PARAM_RECURSION_CAP, 2); + longestOnly = getBoolean(args, PARAM_LONGEST_ONLY, false); + // this isnt necessary: we properly load all dictionaries. 
+ // but recognize and ignore for back compat + getBoolean(args, "strictAffixParsing", true); if (!args.isEmpty()) { throw new IllegalArgumentException("Unknown parameters: " + args); } } - /** - * Loads the hunspell dictionary and affix files defined in the configuration - * - * @param loader ResourceLoader used to load the files - */ @Override public void inform(ResourceLoader loader) throws IOException { - String dictionaryFiles[] = dictionaryArg.split(","); + String dicts[] = dictionaryFiles.split(","); InputStream affix = null; List dictionaries = new ArrayList(); try { dictionaries = new ArrayList(); - for (String file : dictionaryFiles) { + for (String file : dicts) { dictionaries.add(loader.openResource(file)); } affix = loader.openResource(affixFile); - this.dictionary = new HunspellDictionary(affix, dictionaries, luceneMatchVersion, ignoreCase, strictAffixParsing); + this.dictionary = new Dictionary(affix, dictionaries, ignoreCase); } catch (ParseException e) { - throw new IOException("Unable to load hunspell data! [dictionary=" + dictionaryArg + ",affix=" + affixFile + "]", e); + throw new IOException("Unable to load hunspell data! 
[dictionary=" + dictionaries + ",affix=" + affixFile + "]", e); } finally { IOUtils.closeWhileHandlingException(affix); IOUtils.closeWhileHandlingException(dictionaries); } } - /** - * Creates an instance of {@link org.apache.lucene.analysis.hunspell.HunspellStemFilter} that will filter the given - * TokenStream - * - * @param tokenStream TokenStream that will be filtered - * @return HunspellStemFilter that filters the TokenStream - */ @Override public TokenStream create(TokenStream tokenStream) { - return new HunspellStemFilter(tokenStream, dictionary, recursionCap); + return new HunspellStemFilter(tokenStream, dictionary, true, recursionCap, longestOnly); } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java deleted file mode 100644 index ae2948284d6..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemmer.java +++ /dev/null @@ -1,392 +0,0 @@ -package org.apache.lucene.analysis.hunspell; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; - -import org.apache.lucene.analysis.util.CharArraySet; -import org.apache.lucene.analysis.util.CharacterUtils; -import org.apache.lucene.util.Version; - -/** - * HunspellStemmer uses the affix rules declared in the HunspellDictionary to generate one or more stems for a word. It - * conforms to the algorithm in the original hunspell algorithm, including recursive suffix stripping. - */ -public class HunspellStemmer { - private final int recursionCap; - private final HunspellDictionary dictionary; - private final StringBuilder segment = new StringBuilder(); - private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_CURRENT); - - /** - * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems. Uses the - * default recursion cap of 2 (based on Hunspell documentation). - * - * @param dictionary HunspellDictionary that will be used to create the stems - */ - public HunspellStemmer(HunspellDictionary dictionary) { - this(dictionary, 2); - } - - /** - * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems - * - * @param dictionary HunspellDictionary that will be used to create the stems - * @param recursionCap maximum level of recursion stemmer can go into - */ - public HunspellStemmer(HunspellDictionary dictionary, int recursionCap) { - this.dictionary = dictionary; - this.recursionCap = recursionCap; - } - - /** - * Find the stem(s) of the provided word - * - * @param word Word to find the stems for - * @return List of stems for the word - */ - public List stem(String word) { - return stem(word.toCharArray(), word.length()); - } - - /** - * Find the stem(s) of the provided word - * - * @param word Word to find the stems for - * @return List of stems for the word - */ - public List stem(char word[], int length) { - List stems = new ArrayList(); - 
if (dictionary.lookupWord(word, 0, length) != null) { - stems.add(new Stem(word, length)); - } - stems.addAll(stem(word, length, null, 0)); - return stems; - } - - /** - * Find the unique stem(s) of the provided word - * - * @param word Word to find the stems for - * @return List of stems for the word - */ - public List uniqueStems(char word[], int length) { - List stems = new ArrayList(); - CharArraySet terms = new CharArraySet(dictionary.getVersion(), 8, dictionary.isIgnoreCase()); - if (dictionary.lookupWord(word, 0, length) != null) { - stems.add(new Stem(word, length)); - terms.add(word); - } - List otherStems = stem(word, length, null, 0); - for (Stem s : otherStems) { - if (!terms.contains(s.stem)) { - stems.add(s); - terms.add(s.stem); - } - } - return stems; - } - - // ================================================= Helper Methods ================================================ - - /** - * Generates a list of stems for the provided word - * - * @param word Word to generate the stems for - * @param flags Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step - * @param recursionDepth Level of recursion this stemming step is at - * @return List of stems, pr an empty if no stems are found - */ - private List stem(char word[], int length, char[] flags, int recursionDepth) { - List stems = new ArrayList(); - - for (int i = 0; i < length; i++) { - List suffixes = dictionary.lookupSuffix(word, i, length - i); - if (suffixes == null) { - continue; - } - - for (HunspellAffix suffix : suffixes) { - if (hasCrossCheckedFlag(suffix.getFlag(), flags)) { - int deAffixedLength = length - suffix.getAppend().length(); - // TODO: can we do this in-place? 
- String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(suffix.getStrip()).toString(); - - List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, recursionDepth); - for (Stem stem : stemList) { - stem.addSuffix(suffix); - } - - stems.addAll(stemList); - } - } - } - - for (int i = length - 1; i >= 0; i--) { - List prefixes = dictionary.lookupPrefix(word, 0, i); - if (prefixes == null) { - continue; - } - - for (HunspellAffix prefix : prefixes) { - if (hasCrossCheckedFlag(prefix.getFlag(), flags)) { - int deAffixedStart = prefix.getAppend().length(); - int deAffixedLength = length - deAffixedStart; - - String strippedWord = new StringBuilder().append(prefix.getStrip()) - .append(word, deAffixedStart, deAffixedLength) - .toString(); - - List stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, recursionDepth); - for (Stem stem : stemList) { - stem.addPrefix(prefix); - } - - stems.addAll(stemList); - } - } - } - - return stems; - } - - /** - * Applies the affix rule to the given word, producing a list of stems if any are found - * - * @param strippedWord Word the affix has been removed and the strip added - * @param affix HunspellAffix representing the affix rule itself - * @param recursionDepth Level of recursion this stemming step is at - * @return List of stems for the word, or an empty list if none are found - */ - @SuppressWarnings("unchecked") - public List applyAffix(char strippedWord[], int length, HunspellAffix affix, int recursionDepth) { - if(dictionary.isIgnoreCase()) { - charUtils.toLowerCase(strippedWord, 0, strippedWord.length); - } - segment.setLength(0); - segment.append(strippedWord, 0, length); - if (!affix.checkCondition(segment)) { - return Collections.EMPTY_LIST; - } - - List stems = new ArrayList(); - - List words = dictionary.lookupWord(strippedWord, 0, length); - if (words != null) { - for (HunspellWord hunspellWord : words) { - if 
(hunspellWord.hasFlag(affix.getFlag())) { - stems.add(new Stem(strippedWord, length)); - } - } - } - - if (affix.isCrossProduct() && recursionDepth < recursionCap) { - stems.addAll(stem(strippedWord, length, affix.getAppendFlags(), ++recursionDepth)); - } - - return stems; - } - - /** - * Checks if the given flag cross checks with the given array of flags - * - * @param flag Flag to cross check with the array of flags - * @param flags Array of flags to cross check against. Can be {@code null} - * @return {@code true} if the flag is found in the array or the array is {@code null}, {@code false} otherwise - */ - private boolean hasCrossCheckedFlag(char flag, char[] flags) { - return flags == null || Arrays.binarySearch(flags, flag) >= 0; - } - - /** - * Stem represents all information known about a stem of a word. This includes the stem, and the prefixes and suffixes - * that were used to change the word into the stem. - */ - public static class Stem { - - private final List prefixes = new ArrayList(); - private final List suffixes = new ArrayList(); - private final char stem[]; - private final int stemLength; - - /** - * Creates a new Stem wrapping the given word stem - * - * @param stem Stem of a word - */ - public Stem(char stem[], int stemLength) { - this.stem = stem; - this.stemLength = stemLength; - } - - /** - * Adds a prefix to the list of prefixes used to generate this stem. Because it is assumed that prefixes are added - * depth first, the prefix is added to the front of the list - * - * @param prefix Prefix to add to the list of prefixes for this stem - */ - public void addPrefix(HunspellAffix prefix) { - prefixes.add(0, prefix); - } - - /** - * Adds a suffix to the list of suffixes used to generate this stem. 
Because it is assumed that suffixes are added - * depth first, the suffix is added to the end of the list - * - * @param suffix Suffix to add to the list of suffixes for this stem - */ - public void addSuffix(HunspellAffix suffix) { - suffixes.add(suffix); - } - - /** - * Returns the list of prefixes used to generate the stem - * - * @return List of prefixes used to generate the stem or an empty list if no prefixes were required - */ - public List getPrefixes() { - return prefixes; - } - - /** - * Returns the list of suffixes used to generate the stem - * - * @return List of suffixes used to generate the stem or an empty list if no suffixes were required - */ - public List getSuffixes() { - return suffixes; - } - - /** - * Returns the actual word stem itself - * - * @return Word stem itself - */ - public char[] getStem() { - return stem; - } - - /** - * @return the stemLength - */ - public int getStemLength() { - return stemLength; - } - - public String getStemString() { - return new String(stem, 0, stemLength); - } - - } - - - // ================================================= Entry Point =================================================== - - /* - * HunspellStemmer entry point. Accepts two arguments: location of affix file and location of dic file - * - * @param args Program arguments. Should contain location of affix file and location of dic file - * @throws IOException Can be thrown while reading from the files - * @throws ParseException Can be thrown while parsing the files - public static void main(String[] args) throws IOException, ParseException { - boolean ignoreCase = false; - int offset = 0; - - if (args.length < 2) { - System.out.println("usage: HunspellStemmer [-i] "); - System.exit(1); - } - - if(args[offset].equals("-i")) { - ignoreCase = true; - System.out.println("Ignoring case. 
All stems will be returned lowercased"); - offset++; - } - - InputStream affixInputStream = new FileInputStream(args[offset++]); - InputStream dicInputStream = new FileInputStream(args[offset++]); - - // :Post-Release-Update-Version.LUCENE_XY: - HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_50, ignoreCase); - - affixInputStream.close(); - dicInputStream.close(); - - HunspellStemmer stemmer = new HunspellStemmer(dictionary); - - Scanner scanner = new Scanner(System.in, Charset.defaultCharset().name()); - - System.out.print("> "); - while (scanner.hasNextLine()) { - String word = scanner.nextLine(); - - if ("exit".equals(word)) { - break; - } - - printStemResults(word, stemmer.stem(word.toCharArray(), word.length())); - - System.out.print("> "); - } - } - - * Prints the results of the stemming of a word - * - * @param originalWord Word that has been stemmed - * @param stems Stems of the word - private static void printStemResults(String originalWord, List stems) { - StringBuilder builder = new StringBuilder().append("stem(").append(originalWord).append(")").append("\n"); - - for (Stem stem : stems) { - builder.append("- ").append(stem.getStem()).append(": "); - - for (HunspellAffix prefix : stem.getPrefixes()) { - builder.append(prefix.getAppend()).append("+"); - - if (hasText(prefix.getStrip())) { - builder.append(prefix.getStrip()).append("-"); - } - } - - builder.append(stem.getStem()); - - for (HunspellAffix suffix : stem.getSuffixes()) { - if (hasText(suffix.getStrip())) { - builder.append("-").append(suffix.getStrip()); - } - - builder.append("+").append(suffix.getAppend()); - } - builder.append("\n"); - } - - System.out.println(builder); - } - - * Simple utility to check if the given String has any text - * - * @param str String to check if it has any text - * @return {@code true} if the String has text, {@code false} otherwise - private static boolean hasText(String str) { - return str != null && 
str.length() > 0; - } - */ -} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java deleted file mode 100644 index fe216d30dc8..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellWord.java +++ /dev/null @@ -1,63 +0,0 @@ -package org.apache.lucene.analysis.hunspell; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.util.Arrays; - -/** - * A dictionary (.dic) entry with its associated flags. - */ -public class HunspellWord { - - private final char flags[]; // sorted, can we represent more concisely? 
- - /** - * Creates a new HunspellWord with no associated flags - */ - public HunspellWord() { - flags = null; - } - - /** - * Constructs a new HunspellWord with the given flags - * - * @param flags Flags to associate with the word - */ - public HunspellWord(char[] flags) { - this.flags = flags; - } - - /** - * Checks whether the word has the given flag associated with it - * - * @param flag Flag to check whether it is associated with the word - * @return {@code true} if the flag is associated, {@code false} otherwise - */ - public boolean hasFlag(char flag) { - return flags != null && Arrays.binarySearch(flags, flag) >= 0; - } - - /** - * Returns the flags associated with the word - * - * @return Flags associated with the word - */ - public char[] getFlags() { - return flags; - } -} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ISO8859_14Decoder.java similarity index 98% rename from lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java rename to lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ISO8859_14Decoder.java index 4de0d4bc051..2d87947ab3d 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/ISO8859_14Decoder.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/ISO8859_14Decoder.java @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis.hunspell2; +package org.apache.lucene.analysis.hunspell; /* * Licensed to the Apache Software Foundation (ASF) under one or more diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java similarity index 92% rename from lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java rename to 
lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java index d6b0133830a..18e6588ce7a 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis.hunspell2; +package org.apache.lucene.analysis.hunspell; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -24,6 +24,7 @@ import java.util.List; import java.util.regex.Pattern; import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.analysis.util.CharacterUtils; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; @@ -37,9 +38,10 @@ import org.apache.lucene.util.Version; final class Stemmer { private final int recursionCap; private final Dictionary dictionary; - private BytesRef scratch = new BytesRef(); + private final BytesRef scratch = new BytesRef(); private final StringBuilder segment = new StringBuilder(); private final ByteArrayDataInput affixReader; + private final CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_CURRENT); /** * Constructs a new Stemmer which will use the provided Dictionary to create its stems. 
Uses the @@ -80,6 +82,9 @@ final class Stemmer { * @return List of stems for the word */ public List stem(char word[], int length) { + if (dictionary.ignoreCase) { + charUtils.toLowerCase(word, 0, length); + } List stems = new ArrayList(); if (dictionary.lookupWord(word, 0, length, scratch) != null) { stems.add(new CharsRef(word, 0, length)); @@ -95,20 +100,19 @@ final class Stemmer { * @return List of stems for the word */ public List uniqueStems(char word[], int length) { - List stems = new ArrayList(); - CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, false); - if (dictionary.lookupWord(word, 0, length, scratch) != null) { - stems.add(new CharsRef(word, 0, length)); - terms.add(word); + List stems = stem(word, length); + if (stems.size() < 2) { + return stems; } - List otherStems = stem(word, length, Dictionary.NOFLAGS, 0); - for (CharsRef s : otherStems) { + CharArraySet terms = new CharArraySet(Version.LUCENE_CURRENT, 8, dictionary.ignoreCase); + List deduped = new ArrayList<>(); + for (CharsRef s : stems) { if (!terms.contains(s)) { - stems.add(s); + deduped.add(s); terms.add(s); } } - return stems; + return deduped; } // ================================================= Helper Methods ================================================ @@ -188,7 +192,7 @@ final class Stemmer { * @param recursionDepth Level of recursion this stemming step is at * @return List of stems for the word, or an empty list if none are found */ - public List applyAffix(char strippedWord[], int length, int affix, int recursionDepth) { + List applyAffix(char strippedWord[], int length, int affix, int recursionDepth) { segment.setLength(0); segment.append(strippedWord, 0, length); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java deleted file mode 100644 index 00ff88469be..00000000000 --- 
a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilter.java +++ /dev/null @@ -1,137 +0,0 @@ -package org.apache.lucene.analysis.hunspell2; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.util.List; - -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.util.CharsRef; - -/** - * TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a word having multiple - * stems, this filter can emit multiple tokens for each consumed token - * - *

- * Note: This filter is aware of the {@link KeywordAttribute}. To prevent - * certain terms from being passed to the stemmer - * {@link KeywordAttribute#isKeyword()} should be set to true - * in a previous {@link TokenStream}. - * - * Note: For including the original term as well as the stemmed version, see - * {@link org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory} - *

- * - * @lucene.experimental - */ -public final class Hunspell2StemFilter extends TokenFilter { - - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); - private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class); - private final Stemmer stemmer; - - private List buffer; - private State savedState; - - private final boolean dedup; - - /** Create a {@link Hunspell2StemFilter} which deduplicates stems and has a maximum - * recursion level of 2. - * @see #Hunspell2StemFilter(TokenStream, Dictionary, int) */ - public Hunspell2StemFilter(TokenStream input, Dictionary dictionary) { - this(input, dictionary, 2); - } - - /** - * Creates a new Hunspell2StemFilter that will stem tokens from the given TokenStream using affix rules in the provided - * Dictionary - * - * @param input TokenStream whose tokens will be stemmed - * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens - * @param recursionCap maximum level of recursion stemmer can go into, defaults to 2 - */ - public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, int recursionCap) { - this(input, dictionary, true, recursionCap); - } - - /** Create a {@link Hunspell2StemFilter} which has a maximum recursion level of 2. - * @see #Hunspell2StemFilter(TokenStream, Dictionary, boolean, int) */ - public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup) { - this(input, dictionary, dedup, 2); - } - - /** - * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided - * Dictionary - * - * @param input TokenStream whose tokens will be stemmed - * @param dictionary HunspellDictionary containing the affix rules and words that will be used to stem the tokens - * @param dedup true if only unique terms should be output. 
- * @param recursionCap maximum level of recursion stemmer can go into, defaults to 2 - */ - public Hunspell2StemFilter(TokenStream input, Dictionary dictionary, boolean dedup, int recursionCap) { - super(input); - this.dedup = dedup; - this.stemmer = new Stemmer(dictionary, recursionCap); - } - - @Override - public boolean incrementToken() throws IOException { - if (buffer != null && !buffer.isEmpty()) { - CharsRef nextStem = buffer.remove(0); - restoreState(savedState); - posIncAtt.setPositionIncrement(0); - termAtt.setEmpty().append(nextStem); - return true; - } - - if (!input.incrementToken()) { - return false; - } - - if (keywordAtt.isKeyword()) { - return true; - } - - buffer = dedup ? stemmer.uniqueStems(termAtt.buffer(), termAtt.length()) : stemmer.stem(termAtt.buffer(), termAtt.length()); - - if (buffer.isEmpty()) { // we do not know this word, return it unchanged - return true; - } - - CharsRef stem = buffer.remove(0); - termAtt.setEmpty().append(stem); - - if (!buffer.isEmpty()) { - savedState = captureState(); - } - - return true; - } - - @Override - public void reset() throws IOException { - super.reset(); - buffer = null; - } -} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java deleted file mode 100644 index 6ce73698dfd..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/Hunspell2StemFilterFactory.java +++ /dev/null @@ -1,80 +0,0 @@ -package org.apache.lucene.analysis.hunspell2; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.InputStream; -import java.text.ParseException; -import java.util.Map; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.util.ResourceLoader; -import org.apache.lucene.analysis.util.ResourceLoaderAware; -import org.apache.lucene.analysis.util.TokenFilterFactory; - -/** - * TokenFilterFactory that creates instances of {@link Hunspell2StemFilter}. - * Example config for British English: - *
- * <filter class="solr.Hunspell2StemFilterFactory"
- *         dictionary="en_GB.dic"
- *         affix="en_GB.aff" />
- * Both parameters dictionary and affix are mandatory. - * Dictionaries for many languages are available through the OpenOffice project. - * - * See http://wiki.apache.org/solr/Hunspell - * @lucene.experimental - */ -public class Hunspell2StemFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { - private static final String PARAM_DICTIONARY = "dictionary"; - private static final String PARAM_AFFIX = "affix"; - private static final String PARAM_RECURSION_CAP = "recursionCap"; - - private final String dictionaryFile; - private final String affixFile; - private Dictionary dictionary; - private int recursionCap; - - /** Creates a new Hunspell2StemFilterFactory */ - public Hunspell2StemFilterFactory(Map args) { - super(args); - dictionaryFile = require(args, PARAM_DICTIONARY); - affixFile = get(args, PARAM_AFFIX); - recursionCap = getInt(args, PARAM_RECURSION_CAP, 2); - if (!args.isEmpty()) { - throw new IllegalArgumentException("Unknown parameters: " + args); - } - } - - @Override - public void inform(ResourceLoader loader) throws IOException { - try (InputStream affix = loader.openResource(affixFile); - InputStream dictionary = loader.openResource(dictionaryFile)) { - try { - this.dictionary = new Dictionary(affix, dictionary); - } catch (ParseException e) { - throw new RuntimeException(e); - } - } - } - - @Override - public TokenStream create(TokenStream tokenStream) { - return new Hunspell2StemFilter(tokenStream, dictionary, recursionCap); - } -} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html deleted file mode 100644 index 196591969e8..00000000000 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell2/package.html +++ /dev/null @@ -1,26 +0,0 @@ - - - -Stemming TokenFilter using a Java implementation of the -Hunspell stemming algorithm. -

-Dictionaries can be found on -OpenOffice's wiki -

- - diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory index e4ca7c6802c..04fc80cf59c 100644 --- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory +++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory @@ -51,7 +51,6 @@ org.apache.lucene.analysis.hi.HindiNormalizationFilterFactory org.apache.lucene.analysis.hi.HindiStemFilterFactory org.apache.lucene.analysis.hu.HungarianLightStemFilterFactory org.apache.lucene.analysis.hunspell.HunspellStemFilterFactory -org.apache.lucene.analysis.hunspell2.Hunspell2StemFilterFactory org.apache.lucene.analysis.id.IndonesianStemFilterFactory org.apache.lucene.analysis.in.IndicNormalizationFilterFactory org.apache.lucene.analysis.it.ItalianLightStemFilterFactory diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index bca5e1ede50..617e7523b69 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -62,8 +62,8 @@ import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter; import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter; import org.apache.lucene.analysis.compound.TestCompoundWordTokenFilter; import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree; -import org.apache.lucene.analysis.hunspell.HunspellDictionary; -import org.apache.lucene.analysis.hunspell.HunspellDictionaryTest; +import org.apache.lucene.analysis.hunspell.Dictionary; +import org.apache.lucene.analysis.hunspell.TestHunspellStemFilter; import 
org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter; import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter; import org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter; @@ -406,13 +406,13 @@ public class TestRandomChains extends BaseTokenStreamTestCase { return new IdentityEncoder(); // the other encoders will throw exceptions if tokens arent numbers? } }); - put(HunspellDictionary.class, new ArgProducer() { + put(Dictionary.class, new ArgProducer() { @Override public Object create(Random random) { // TODO: make nastier - InputStream affixStream = HunspellDictionaryTest.class.getResourceAsStream("test.aff"); - InputStream dictStream = HunspellDictionaryTest.class.getResourceAsStream("test.dic"); + InputStream affixStream = TestHunspellStemFilter.class.getResourceAsStream("simple.aff"); + InputStream dictStream = TestHunspellStemFilter.class.getResourceAsStream("simple.dic"); try { - return new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT); + return new Dictionary(affixStream, dictStream); } catch (Exception ex) { Rethrow.rethrow(ex); return null; // unreachable code diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java deleted file mode 100644 index fd8f9211727..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellDictionaryTest.java +++ /dev/null @@ -1,201 +0,0 @@ -package org.apache.lucene.analysis.hunspell; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.InputStream; -import java.text.ParseException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import org.apache.lucene.util.LuceneTestCase; -import org.junit.Assert; -import org.junit.Test; - -public class HunspellDictionaryTest extends LuceneTestCase { - - private class CloseCheckInputStream extends InputStream { - private InputStream delegate; - - private boolean closed = false; - - public CloseCheckInputStream(InputStream delegate) { - super(); - this.delegate = delegate; - } - - @Override - public int read() throws IOException { - return delegate.read(); - } - - @Override - public int hashCode() { - return delegate.hashCode(); - } - - @Override - public int read(byte[] b) throws IOException { - return delegate.read(b); - } - - @Override - public boolean equals(Object obj) { - return delegate.equals(obj); - } - - @Override - public int read(byte[] b, int off, int len) throws IOException { - return delegate.read(b, off, len); - } - - @Override - public long skip(long n) throws IOException { - return delegate.skip(n); - } - - @Override - public String toString() { - return delegate.toString(); - } - - @Override - public int available() throws IOException { - return delegate.available(); - } - - @Override - public void close() throws IOException { - this.closed = true; - delegate.close(); - } - - @Override - public void mark(int readlimit) { - delegate.mark(readlimit); - } - - @Override - public void reset() throws IOException { - delegate.reset(); - } - - @Override 
- public boolean markSupported() { - return delegate.markSupported(); - } - - public boolean isClosed() { - return this.closed; - } - - } - - @Test - public void testResourceCleanup() throws IOException, ParseException { - CloseCheckInputStream affixStream = new CloseCheckInputStream(getClass().getResourceAsStream("testCompressed.aff")); - CloseCheckInputStream dictStream = new CloseCheckInputStream(getClass().getResourceAsStream("testCompressed.dic")); - - new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT); - - assertFalse(affixStream.isClosed()); - assertFalse(dictStream.isClosed()); - - affixStream.close(); - dictStream.close(); - - assertTrue(affixStream.isClosed()); - assertTrue(dictStream.isClosed()); - } - - @Test - public void testHunspellDictionary_loadDicAff() throws IOException, ParseException { - InputStream affixStream = getClass().getResourceAsStream("test.aff"); - InputStream dictStream = getClass().getResourceAsStream("test.dic"); - - HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT); - assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size()); - assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size()); - assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size()); - assertEquals("Wrong number of flags for lucen", 1, dictionary.lookupWord(new char[]{'l', 'u', 'c', 'e', 'n'}, 0, 5).get(0).getFlags().length); - - affixStream.close(); - dictStream.close(); - } - - @Test - public void testHunspellDictionary_multipleDictWithOverride() throws IOException, ParseException { - InputStream affixStream = getClass().getResourceAsStream("test.aff"); - List dictStreams = new ArrayList(); - dictStreams.add(getClass().getResourceAsStream("test.dic")); - dictStreams.add(getClass().getResourceAsStream("testOverride.dic")); - - HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStreams, TEST_VERSION_CURRENT, false); - 
assertEquals("Wrong number of flags for lucen", 3, dictionary.lookupWord(new char[]{'l', 'u', 'c', 'e', 'n'}, 0, 5).get(0).getFlags().length); - assertEquals("Wrong number of flags for bar", 1, dictionary.lookupWord(new char[]{'b', 'a', 'r'}, 0, 3).get(0).getFlags().length); - - affixStream.close(); - for(InputStream dstream : dictStreams) { - dstream.close(); - } - } - - @Test - public void testCompressedHunspellDictionary_loadDicAff() throws IOException, ParseException { - InputStream affixStream = getClass().getResourceAsStream("testCompressed.aff"); - InputStream dictStream = getClass().getResourceAsStream("testCompressed.dic"); - - HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT); - assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size()); - assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size()); - assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size()); - - affixStream.close(); - dictStream.close(); - } - - @Test - public void testHunspellDictionary_loadDicWrongAff() throws IOException, ParseException { - InputStream affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff"); - InputStream dictStream = getClass().getResourceAsStream("test.dic"); - - HunspellDictionary dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT, false, false); - assertEquals(3, dictionary.lookupSuffix(new char[]{'e'}, 0, 1).size()); - assertEquals(1, dictionary.lookupPrefix(new char[]{'s'}, 0, 1).size()); - assertEquals(1, dictionary.lookupWord(new char[]{'o', 'l', 'r'}, 0, 3).size()); - //strict parsing disabled: malformed rule is not loaded - assertNull(dictionary.lookupPrefix(new char[]{'a'}, 0, 1)); - affixStream.close(); - dictStream.close(); - - affixStream = getClass().getResourceAsStream("testWrongAffixRule.aff"); - dictStream = getClass().getResourceAsStream("test.dic"); - //strict parsing enabled: malformed rule 
causes ParseException - try { - dictionary = new HunspellDictionary(affixStream, Arrays.asList(dictStream), TEST_VERSION_CURRENT, false, true); - Assert.fail(); - } catch(ParseException e) { - Assert.assertEquals("The affix file contains a rule with less than five elements", e.getMessage()); - Assert.assertEquals(23, e.getErrorOffset()); - } - - affixStream.close(); - dictStream.close(); - } -} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java deleted file mode 100644 index dd273fa8dc5..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemFilterTest.java +++ /dev/null @@ -1,92 +0,0 @@ -package org.apache.lucene.analysis.hunspell; -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; -import java.io.InputStream; -import java.text.ParseException; -import java.util.Arrays; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.KeywordTokenizer; -import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; -import org.apache.lucene.analysis.util.CharArraySet; -import org.apache.lucene.util.TestUtil; -import org.junit.AfterClass; -import org.junit.BeforeClass; - -public class HunspellStemFilterTest extends BaseTokenStreamTestCase { - - private static HunspellDictionary DICTIONARY; - @BeforeClass - public static void beforeClass() throws IOException, ParseException { - DICTIONARY = createDict(true); - } - @AfterClass - public static void afterClass() { - DICTIONARY = null; - } - public static HunspellDictionary createDict(boolean ignoreCase) throws IOException, ParseException { - InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff"); - InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic"); - - return new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT, ignoreCase); - } - - /** - * Simple test for KeywordAttribute - */ - public void testKeywordAttribute() throws IOException { - MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome"); - tokenizer.setEnableChecks(true); - HunspellStemFilter filter = new HunspellStemFilter(tokenizer, DICTIONARY, TestUtil.nextInt(random(), 1, 3)); - assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1}); - - // assert with keywork marker - tokenizer = whitespaceMockTokenizer("lucene is awesome"); - CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true); - filter = new HunspellStemFilter(new 
SetKeywordMarkerFilter(tokenizer, set), DICTIONARY, TestUtil.nextInt(random(), 1, 3)); - assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1}); - } - - /** blast some random strings through the analyzer */ - public void testRandomStrings() throws Exception { - Analyzer analyzer = new Analyzer() { - - @Override - protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); - return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, TestUtil.nextInt(random(), 1, 3))); - } - }; - checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER); - } - - public void testEmptyTerm() throws IOException { - Analyzer a = new Analyzer() { - @Override - protected TokenStreamComponents createComponents(String fieldName) { - Tokenizer tokenizer = new KeywordTokenizer(); - return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, DICTIONARY, TestUtil.nextInt(random(), 1, 3))); - } - }; - checkOneTerm(a, "", ""); - } -} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java deleted file mode 100644 index 66a9410c27a..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/HunspellStemmerTest.java +++ /dev/null @@ -1,137 +0,0 @@ -package org.apache.lucene.analysis.hunspell; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.Version; -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.junit.Test; - -import java.io.IOException; -import java.io.InputStream; -import java.text.ParseException; -import java.util.List; - -import static junit.framework.Assert.assertEquals; - -public class HunspellStemmerTest extends LuceneTestCase { - - private static HunspellStemmer stemmer; - - @BeforeClass - public static void beforeClass() throws IOException, ParseException { - createStemmer(true); - } - - @AfterClass - public static void afterClass() { - stemmer = null; - } - - @Test - public void testStem_simpleSuffix() { - List stems = stemmer.stem("lucene"); - - assertEquals(2, stems.size()); - assertEquals("lucene", stems.get(0).getStemString()); - assertEquals("lucen", stems.get(1).getStemString()); - - stems = stemmer.stem("mahoute"); - assertEquals(1, stems.size()); - assertEquals("mahout", stems.get(0).getStemString()); - } - - @Test - public void testStem_simplePrefix() { - List stems = stemmer.stem("solr"); - - assertEquals(1, stems.size()); - assertEquals("olr", stems.get(0).getStemString()); - } - - @Test - public void testStem_recursiveSuffix() { - List stems = stemmer.stem("abcd"); - - assertEquals(1, stems.size()); - assertEquals("ab", stems.get(0).getStemString()); - } - - @Test - public void testStem_ignoreCase() throws IOException, ParseException { - List stems; - createStemmer(true); - - stems = stemmer.stem("apache"); - assertEquals(1, stems.size()); - 
assertEquals("apach", stems.get(0).getStemString()); - - stems = stemmer.stem("APACHE"); - assertEquals(1, stems.size()); - assertEquals("apach", stems.get(0).getStemString()); - - stems = stemmer.stem("Apache"); - assertEquals(1, stems.size()); - assertEquals("apach", stems.get(0).getStemString()); - - stems = stemmer.stem("foos"); - assertEquals(1, stems.size()); - assertEquals("foo", stems.get(0).getStemString()); - - stems = stemmer.stem("mood"); - assertEquals(1, stems.size()); - assertEquals("moo", stems.get(0).getStemString()); - - stems = stemmer.stem("Foos"); - assertEquals(1, stems.size()); - assertEquals("foo", stems.get(0).getStemString()); - - // The "Foo" rule gets overridden by the "foo" rule, and we don't merge - stems = stemmer.stem("Food"); - assertEquals(0, stems.size()); - - stems = stemmer.stem("Mood"); - assertEquals(1, stems.size()); - assertEquals("moo", stems.get(0).getStemString()); - } - - @Test - public void testStem_caseSensitive() throws IOException, ParseException { - createStemmer(false); - List stems = stemmer.stem("apache"); - assertEquals(0, stems.size()); - - stems = stemmer.stem("Apache"); - assertEquals(1, stems.size()); - assertEquals("Apach", stems.get(0).getStemString()); - } - - - private static void createStemmer(boolean ignoreCase) throws IOException, ParseException { - InputStream affixStream = HunspellStemmerTest.class.getResourceAsStream("test.aff"); - InputStream dictStream = HunspellStemmerTest.class.getResourceAsStream("test.dic"); - - HunspellDictionary dictionary = new HunspellDictionary(affixStream, dictStream, TEST_VERSION_CURRENT, ignoreCase); - stemmer = new HunspellStemmer(dictionary); - - affixStream.close(); - dictStream.close(); - } - -} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java similarity index 93% rename from 
lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java index d00fc634944..3322eb109a6 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestAllDictionaries.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestAllDictionaries.java @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis.hunspell2; +package org.apache.lucene.analysis.hunspell; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -22,7 +22,7 @@ import java.io.InputStream; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; -import org.apache.lucene.analysis.hunspell.HunspellDictionary; +import org.apache.lucene.analysis.hunspell.Dictionary; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.RamUsageEstimator; @@ -33,7 +33,7 @@ import org.junit.Ignore; * wget --mirror -np http://archive.services.openoffice.org/pub/mirror/OpenOffice.org/contrib/dictionaries/ * Note some of the files differ only in case. This may be a problem on your operating system! 
*/ -//@Ignore("enable manually") +@Ignore("enable manually") public class TestAllDictionaries extends LuceneTestCase { // set this to the location of where you downloaded all the files @@ -162,21 +162,11 @@ public class TestAllDictionaries extends LuceneTestCase { assert dicEntry != null; ZipEntry affEntry = zip.getEntry(tests[i+2]); assert affEntry != null; - - // get ram from previous impl - String oldRAM = "FAIL"; - try (InputStream dictionary = zip.getInputStream(dicEntry); - InputStream affix = zip.getInputStream(affEntry)) { - try { - HunspellDictionary dic = new HunspellDictionary(affix, dictionary, TEST_VERSION_CURRENT); - oldRAM = RamUsageEstimator.humanSizeOf(dic); - } catch (Throwable t) {} - } try (InputStream dictionary = zip.getInputStream(dicEntry); InputStream affix = zip.getInputStream(affEntry)) { Dictionary dic = new Dictionary(affix, dictionary); - System.out.println(tests[i] + "\t" + oldRAM + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" + + System.out.println(tests[i] + "\t" + RamUsageEstimator.humanSizeOf(dic) + "\t(" + "words=" + RamUsageEstimator.humanSizeOf(dic.words) + ", " + "flags=" + RamUsageEstimator.humanSizeOf(dic.flagLookup) + ", " + "strips=" + RamUsageEstimator.humanSizeOf(dic.stripLookup) + ", " + @@ -204,7 +194,7 @@ public class TestAllDictionaries extends LuceneTestCase { try (InputStream dictionary = zip.getInputStream(dicEntry); InputStream affix = zip.getInputStream(affEntry)) { - Dictionary dic = new Dictionary(affix, dictionary); + new Dictionary(affix, dictionary); } } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java new file mode 100644 index 00000000000..64bdb41e8c7 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCaseInsensitive.java @@ -0,0 +1,110 @@ +package org.apache.lucene.analysis.hunspell; + +/* + * Licensed to 
the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.hunspell.Dictionary; +import org.apache.lucene.analysis.hunspell.Stemmer; +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.LuceneTestCase; +import org.junit.AfterClass; +import org.junit.BeforeClass; + +import java.io.InputStream; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +public class TestCaseInsensitive extends LuceneTestCase { + private static Stemmer stemmer; + + @BeforeClass + public static void beforeClass() throws Exception { + try (InputStream affixStream = TestCaseInsensitive.class.getResourceAsStream("simple.aff"); + InputStream dictStream = TestCaseInsensitive.class.getResourceAsStream("mixedcase.dic")) { + Dictionary dictionary = new Dictionary(affixStream, Collections.singletonList(dictStream), true); + stemmer = new Stemmer(dictionary); + } + } + + @AfterClass + public static void afterClass() { + stemmer = null; + } + + public void testCaseInsensitivity() { + assertStemsTo("lucene", "lucene", "lucen"); + assertStemsTo("LuCeNe", "lucene", "lucen"); + assertStemsTo("mahoute", "mahout"); + assertStemsTo("MaHoUte", "mahout"); + } + + public void testSimplePrefix() { + 
assertStemsTo("solr", "olr"); + } + + public void testRecursiveSuffix() { + assertStemsTo("abcd", "ab"); + } + + // all forms unmunched from dictionary + public void testAllStems() { + assertStemsTo("ab", "ab"); + assertStemsTo("abc", "ab"); + assertStemsTo("apach", "apach"); + assertStemsTo("apache", "apach"); + assertStemsTo("foo", "foo"); + assertStemsTo("food", "foo"); + assertStemsTo("foos", "foo"); + assertStemsTo("lucen", "lucen"); + assertStemsTo("lucene", "lucen", "lucene"); + assertStemsTo("mahout", "mahout"); + assertStemsTo("mahoute", "mahout"); + assertStemsTo("moo", "moo"); + assertStemsTo("mood", "moo"); + assertStemsTo("olr", "olr"); + assertStemsTo("solr", "olr"); + } + + // some bogus stuff that should not stem (empty lists)! + public void testBogusStems() { + assertStemsTo("abs"); + assertStemsTo("abe"); + assertStemsTo("sab"); + assertStemsTo("sapach"); + assertStemsTo("sapache"); + assertStemsTo("apachee"); + assertStemsTo("sfoo"); + assertStemsTo("sfoos"); + assertStemsTo("fooss"); + assertStemsTo("lucenee"); + assertStemsTo("solre"); + } + + private void assertStemsTo(String s, String... 
expected) { + Arrays.sort(expected); + + List stems = stemmer.stem(s); + String actual[] = new String[stems.size()]; + for (int i = 0; i < actual.length; i++) { + actual[i] = stems.get(i).toString(); + } + Arrays.sort(actual); + + assertArrayEquals(expected, actual); + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java similarity index 97% rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java index e8e0fd0d030..6cbe931d376 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestDictionary.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis.hunspell2; +package org.apache.lucene.analysis.hunspell; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -22,6 +22,7 @@ import java.io.IOException; import java.io.InputStream; import java.text.ParseException; +import org.apache.lucene.analysis.hunspell.Dictionary; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.LuceneTestCase; diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java similarity index 75% rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java index eafb1f272cf..af48427d522 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilter.java +++ 
b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilter.java @@ -1,4 +1,4 @@ -package org.apache.lucene.analysis.hunspell2; +package org.apache.lucene.analysis.hunspell; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -26,13 +26,15 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.KeywordTokenizer; +import org.apache.lucene.analysis.hunspell.Dictionary; +import org.apache.lucene.analysis.hunspell.HunspellStemFilter; import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.util.TestUtil; import org.junit.AfterClass; import org.junit.BeforeClass; -public class TestHunspell2StemFilter extends BaseTokenStreamTestCase { +public class TestHunspellStemFilter extends BaseTokenStreamTestCase { private static Dictionary dictionary; @BeforeClass @@ -52,13 +54,21 @@ public class TestHunspell2StemFilter extends BaseTokenStreamTestCase { public void testKeywordAttribute() throws IOException { MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome"); tokenizer.setEnableChecks(true); - Hunspell2StemFilter filter = new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)); + HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3)); assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1}); // assert with keyword marker tokenizer = whitespaceMockTokenizer("lucene is awesome"); CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, Arrays.asList("Lucene"), true); - filter = new Hunspell2StemFilter(new SetKeywordMarkerFilter(tokenizer, set), dictionary, TestUtil.nextInt(random(), 1, 3)); + filter = new HunspellStemFilter(new 
SetKeywordMarkerFilter(tokenizer, set), dictionary, TestUtil.nextInt(random(), 1, 3)); + assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1}); + } + + /** simple test for longestOnly option */ + public void testLongestOnly() throws IOException { + MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome"); + tokenizer.setEnableChecks(true); + HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, true, TestUtil.nextInt(random(), 1, 3), true); assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1}); } @@ -68,7 +78,7 @@ public class TestHunspell2StemFilter extends BaseTokenStreamTestCase { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); - return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3))); + return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3))); } }; checkRandomData(random(), analyzer, 1000*RANDOM_MULTIPLIER); @@ -79,7 +89,7 @@ public class TestHunspell2StemFilter extends BaseTokenStreamTestCase { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new KeywordTokenizer(); - return new TokenStreamComponents(tokenizer, new Hunspell2StemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3))); + return new TokenStreamComponents(tokenizer, new HunspellStemFilter(tokenizer, dictionary, TestUtil.nextInt(random(), 1, 3))); } }; checkOneTerm(a, "", ""); diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java index e8e232ce60b..f4302035dbc 100644 --- 
a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java @@ -20,7 +20,6 @@ package org.apache.lucene.analysis.hunspell; import java.io.Reader; import java.io.StringReader; -import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase; @@ -31,17 +30,17 @@ public class TestHunspellStemFilterFactory extends BaseTokenStreamFactoryTestCas public void testStemming() throws Exception { Reader reader = new StringReader("abc"); TokenStream stream = whitespaceMockTokenizer(reader); - stream = tokenFilterFactory("HunspellStem", - "dictionary", "test.dic", - "affix", "test.aff").create(stream); + stream = tokenFilterFactory("Hunspell2Stem", + "dictionary", "simple.dic", + "affix", "simple.aff").create(stream); assertTokenStreamContents(stream, new String[] { "ab" }); } /** Test that bogus arguments result in exception */ public void testBogusArguments() throws Exception { try { - tokenFilterFactory("HunspellStem", - "dictionary", "test.dic", + tokenFilterFactory("Hunspell2Stem", + "dictionary", "simple.dic", "bogusArg", "bogusValue"); fail(); } catch (IllegalArgumentException expected) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java similarity index 95% rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java index 4dec107f314..dca9faa6b16 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestStemmer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestStemmer.java @@ -1,4 +1,4 @@ 
-package org.apache.lucene.analysis.hunspell2; +package org.apache.lucene.analysis.hunspell; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -17,6 +17,8 @@ package org.apache.lucene.analysis.hunspell2; * limitations under the License. */ +import org.apache.lucene.analysis.hunspell.Dictionary; +import org.apache.lucene.analysis.hunspell.Stemmer; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.LuceneTestCase; import org.junit.AfterClass; diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/broken.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/broken.aff similarity index 100% rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/broken.aff rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/broken.aff diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.aff similarity index 100% rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.aff rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.aff diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.dic similarity index 100% rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/compressed.dic rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.dic diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/mixedcase.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/mixedcase.dic new file mode 100644 index 00000000000..9fae253279e --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/mixedcase.dic @@ -0,0 +1,10 
@@ +9 +Ab/C +apach/A +Foo/D +foo/E +Lucen/A +Lucene +mahout/A +Moo/E +olr/B diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.aff similarity index 100% rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.aff rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.aff diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.dic similarity index 100% rename from lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/simple.dic rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/simple.dic diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff deleted file mode 100644 index db9423dcad1..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.aff +++ /dev/null @@ -1,20 +0,0 @@ -SET UTF-8 -TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ - -SFX A Y 3 -SFX A 0 e n -SFX A 0 e t -SFX A 0 e h - -SFX C Y 2 -SFX C 0 d/C c -SFX C 0 c b - -SFX D Y 1 -SFX D 0 s o - -SFX E Y 1 -SFX E 0 d o - -PFX B Y 1 -PFX B 0 s o \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic deleted file mode 100644 index 12efd8fccb2..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/test.dic +++ /dev/null @@ -1,10 +0,0 @@ -9 -lucen/A -lucene -mahout/A -olr/B -ab/C -Apach/A -Foo/E -foo/D -Moo/E \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.aff 
b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.aff deleted file mode 100644 index e4a1b37300f..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.aff +++ /dev/null @@ -1,29 +0,0 @@ -SET UTF-8 -TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ - -FLAG long - -AF 5 -AF AA -AF BB -AF CC -AF DD -AF EE - -SFX AA Y 3 -SFX AA 0 e n -SFX AA 0 e t -SFX AA 0 e h - -SFX CC Y 2 -SFX CC 0 d/3 c -SFX CC 0 c b - -SFX DD Y 1 -SFX DD 0 s o - -SFX EE Y 1 -SFX EE 0 d o - -PFX BB Y 1 -PFX BB 0 s o diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.dic deleted file mode 100644 index bf237662017..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testCompressed.dic +++ /dev/null @@ -1,9 +0,0 @@ -6 -lucen/1 -lucene -mahout/1 -olr/2 -ab/3 -Apach/1 -foo/4 -Foo/5 diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testOverride.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testOverride.dic deleted file mode 100644 index c1111ef562b..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testOverride.dic +++ /dev/null @@ -1,3 +0,0 @@ -2 -lucen/ABC -bar/A \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff deleted file mode 100644 index 3b780cd1d7b..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/testWrongAffixRule.aff +++ /dev/null @@ -1,24 +0,0 @@ -SET UTF-8 -TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ - -SFX A Y 3 -SFX A 0 e n -SFX A 0 e t -SFX A 0 e h - -SFX C Y 2 -SFX C 0 d/C c -SFX C 0 c b - -SFX D Y 1 -SFX D 0 s o - -SFX E 
Y 1 -SFX E 0 d o - -PFX B Y 1 -PFX B 0 s o - -#wrong rule (only 4 elements) -PFX A0 Y 1 -PFX A0 0 a \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java deleted file mode 100644 index d95e2be04b6..00000000000 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell2/TestHunspell2StemFilterFactory.java +++ /dev/null @@ -1,50 +0,0 @@ -package org.apache.lucene.analysis.hunspell2; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.Reader; -import java.io.StringReader; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.util.BaseTokenStreamFactoryTestCase; - -/** - * Simple tests to ensure the Hunspell stemmer loads from factory - */ -public class TestHunspell2StemFilterFactory extends BaseTokenStreamFactoryTestCase { - public void testStemming() throws Exception { - Reader reader = new StringReader("abc"); - TokenStream stream = whitespaceMockTokenizer(reader); - stream = tokenFilterFactory("Hunspell2Stem", - "dictionary", "simple.dic", - "affix", "simple.aff").create(stream); - assertTokenStreamContents(stream, new String[] { "ab" }); - } - - /** Test that bogus arguments result in exception */ - public void testBogusArguments() throws Exception { - try { - tokenFilterFactory("Hunspell2Stem", - "dictionary", "simple.dic", - "bogusArg", "bogusValue"); - fail(); - } catch (IllegalArgumentException expected) { - assertTrue(expected.getMessage().contains("Unknown parameters")); - } - } -} From a51e85f91f3a189bf6b4a7806856d915d31285f3 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 27 Feb 2014 20:19:50 +0000 Subject: [PATCH 15/17] fix oops git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572719 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/lucene/analysis/hunspell/HunspellStemFilter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java index a9b512b7bbd..87de53aee63 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/HunspellStemFilter.java @@ -66,7 +66,7 @@ public final class HunspellStemFilter extends TokenFilter { } /** - * Creates a new Hunspell2StemFilter that will 
stem tokens from the given TokenStream using affix rules in the provided + * Creates a new HunspellStemFilter that will stem tokens from the given TokenStream using affix rules in the provided * Dictionary * * @param input TokenStream whose tokens will be stemmed From 66ccdead470caa1daba3e9f619fec7ceb87b699a Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 27 Feb 2014 20:39:35 +0000 Subject: [PATCH 16/17] LUCENE-5468: fix precommit+test git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572724 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 6 ++++++ .../analysis/hunspell/TestHunspellStemFilterFactory.java | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index ebe7bb12a8a..5b9245fcaf1 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -82,6 +82,12 @@ API Changes * LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues that supports random access to the ordinals in a document. (Robert Muir) +Optimizations + +* LUCENE-5468: HunspellStemFilter uses 10 to 100x less RAM. It also loads + all known openoffice dictionaries without error, and supports an additional + longestOnly option for a less aggressive approach. 
(Robert Muir) + Bug fixes * LUCENE-5450: Fix getField() NPE issues with SpanOr/SpanNear when they have an diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java index f4302035dbc..b671f6dbdbf 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestHunspellStemFilterFactory.java @@ -30,7 +30,7 @@ public class TestHunspellStemFilterFactory extends BaseTokenStreamFactoryTestCas public void testStemming() throws Exception { Reader reader = new StringReader("abc"); TokenStream stream = whitespaceMockTokenizer(reader); - stream = tokenFilterFactory("Hunspell2Stem", + stream = tokenFilterFactory("HunspellStem", "dictionary", "simple.dic", "affix", "simple.aff").create(stream); assertTokenStreamContents(stream, new String[] { "ab" }); @@ -39,7 +39,7 @@ public class TestHunspellStemFilterFactory extends BaseTokenStreamFactoryTestCas /** Test that bogus arguments result in exception */ public void testBogusArguments() throws Exception { try { - tokenFilterFactory("Hunspell2Stem", + tokenFilterFactory("HunspellStem", "dictionary", "simple.dic", "bogusArg", "bogusValue"); fail(); From 0c5f1c42a8bbc744c519d25fee7481b77d474a49 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 27 Feb 2014 20:42:52 +0000 Subject: [PATCH 17/17] LUCENE-5468: add additional change git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5468@1572727 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 5b9245fcaf1..e9679b25b05 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -82,6 +82,8 @@ API Changes * LUCENE-5454: Add RandomAccessOrds, an optional extension of 
SortedSetDocValues that supports random access to the ordinals in a document. (Robert Muir) +* LUCENE-5468: Move offline Sort (from suggest module) to OfflineSort. (Robert Muir) + Optimizations * LUCENE-5468: HunspellStemFilter uses 10 to 100x less RAM. It also loads