diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 612022375e5..b34944a36d8 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -195,6 +195,8 @@ Bug Fixes * LUCENE-5827: Make all Directory implementations correctly fail with IllegalArgumentException if slices are out of bounds. (Uwe SChindler) +* LUCENE-5838: Fix hunspell when the .aff file has over 64k affixes. (Robert Muir) + Test Framework * LUCENE-5786: Unflushed/ truncated events file (hung testing subprocess). diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 312a6e994b3..f8308f0ad68 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -289,8 +289,8 @@ public class Dictionary { * @throws IOException Can be thrown while reading from the InputStream */ private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException { - TreeMap> prefixes = new TreeMap<>(); - TreeMap> suffixes = new TreeMap<>(); + TreeMap> prefixes = new TreeMap<>(); + TreeMap> suffixes = new TreeMap<>(); Map seenPatterns = new HashMap<>(); // zero condition -> 0 ord @@ -397,16 +397,15 @@ public class Dictionary { stripOffsets[currentIndex] = currentOffset; } - private FST affixFST(TreeMap> affixes) throws IOException { + private FST affixFST(TreeMap> affixes) throws IOException { IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); Builder builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs); - IntsRef scratch = new IntsRef(); - for (Map.Entry> entry : affixes.entrySet()) { + for (Map.Entry> entry : affixes.entrySet()) { Util.toUTF32(entry.getKey(), scratch); - List entries = entry.getValue(); + List entries = entry.getValue(); IntsRef output = new IntsRef(entries.size()); - for (Character c : entries) { + for (Integer c : entries) { output.ints[output.length++] = c; } builder.add(scratch, output); @@ -444,7 +443,7 @@ public class Dictionary { * @param seenPatterns map from condition -> index of patterns, for deduplication. * @throws IOException Can be thrown while reading the rule */ - private void parseAffix(TreeMap> affixes, + private void parseAffix(TreeMap> affixes, String header, LineNumberReader reader, String conditionPattern, @@ -564,13 +563,12 @@ public class Dictionary { affixArg = new StringBuilder(affixArg).reverse().toString(); } - List list = affixes.get(affixArg); + List list = affixes.get(affixArg); if (list == null) { list = new ArrayList<>(); affixes.put(affixArg, list); } - - list.add((char)currentAffix); + list.add(currentAffix); currentAffix++; } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/Test64kAffixes.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/Test64kAffixes.java new file mode 100644 index 00000000000..30ce732cf00 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/Test64kAffixes.java @@ -0,0 +1,69 @@ +package org.apache.lucene.analysis.hunspell; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.InputStream; +import java.io.OutputStreamWriter; +import java.nio.charset.StandardCharsets; +import java.util.List; + +import org.apache.lucene.util.CharsRef; +import org.apache.lucene.util.LuceneTestCase; + +/** Tests that > 64k affixes actually works and doesnt overflow some internal int */ +public class Test64kAffixes extends LuceneTestCase { + + public void test() throws Exception { + File tempDir = createTempDir("64kaffixes"); + File affix = new File(tempDir, "64kaffixes.aff"); + File dict = new File(tempDir, "64kaffixes.dic"); + + BufferedWriter affixWriter = new BufferedWriter( + new OutputStreamWriter( + new FileOutputStream(affix), StandardCharsets.UTF_8)); + + // 65k affixes with flag 1, then an affix with flag 2 + affixWriter.write("SET UTF-8\nFLAG num\nSFX 1 Y 65536\n"); + for (int i = 0; i < 65536; i++) { + affixWriter.write("SFX 1 0 " + Integer.toHexString(i) + " .\n"); + } + affixWriter.write("SFX 2 Y 1\nSFX 2 0 s\n"); + affixWriter.close(); + + BufferedWriter dictWriter = new BufferedWriter( + new OutputStreamWriter( + new FileOutputStream(dict), StandardCharsets.UTF_8)); + + // drink signed with affix 2 (takes -s) + dictWriter.write("1\ndrink/2\n"); + dictWriter.close(); + + try (InputStream affStream = new FileInputStream(affix); InputStream dictStream = new FileInputStream(dict)) { + Dictionary dictionary = new Dictionary(affStream, dictStream); + Stemmer stemmer = new Stemmer(dictionary); + // drinks should still stem to drink + List stems = stemmer.stem("drinks"); + assertEquals(1, stems.size()); + assertEquals("drink", stems.get(0).toString()); + } + } +}