LUCENE-5838: fix hunspell when .aff file has over 64k affixes

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1612349 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2014-07-21 17:07:48 +00:00
parent 052b84db2d
commit 0901f25e20
3 changed files with 80 additions and 11 deletions

View File

@ -195,6 +195,8 @@ Bug Fixes
* LUCENE-5827: Make all Directory implementations correctly fail with
IllegalArgumentException if slices are out of bounds. (Uwe SChindler)
* LUCENE-5838: Fix hunspell when the .aff file has over 64k affixes. (Robert Muir)
Test Framework
* LUCENE-5786: Unflushed/ truncated events file (hung testing subprocess).

View File

@ -289,8 +289,8 @@ public class Dictionary {
* @throws IOException Can be thrown while reading from the InputStream
*/
private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
TreeMap<String, List<Character>> prefixes = new TreeMap<>();
TreeMap<String, List<Character>> suffixes = new TreeMap<>();
TreeMap<String, List<Integer>> prefixes = new TreeMap<>();
TreeMap<String, List<Integer>> suffixes = new TreeMap<>();
Map<String,Integer> seenPatterns = new HashMap<>();
// zero condition -> 0 ord
@ -397,16 +397,15 @@ public class Dictionary {
stripOffsets[currentIndex] = currentOffset;
}
private FST<IntsRef> affixFST(TreeMap<String,List<Character>> affixes) throws IOException {
private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException {
IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
IntsRef scratch = new IntsRef();
for (Map.Entry<String,List<Character>> entry : affixes.entrySet()) {
for (Map.Entry<String,List<Integer>> entry : affixes.entrySet()) {
Util.toUTF32(entry.getKey(), scratch);
List<Character> entries = entry.getValue();
List<Integer> entries = entry.getValue();
IntsRef output = new IntsRef(entries.size());
for (Character c : entries) {
for (Integer c : entries) {
output.ints[output.length++] = c;
}
builder.add(scratch, output);
@ -444,7 +443,7 @@ public class Dictionary {
* @param seenPatterns map from condition -> index of patterns, for deduplication.
* @throws IOException Can be thrown while reading the rule
*/
private void parseAffix(TreeMap<String,List<Character>> affixes,
private void parseAffix(TreeMap<String,List<Integer>> affixes,
String header,
LineNumberReader reader,
String conditionPattern,
@ -564,13 +563,12 @@ public class Dictionary {
affixArg = new StringBuilder(affixArg).reverse().toString();
}
List<Character> list = affixes.get(affixArg);
List<Integer> list = affixes.get(affixArg);
if (list == null) {
list = new ArrayList<>();
affixes.put(affixArg, list);
}
list.add((char)currentAffix);
list.add(currentAffix);
currentAffix++;
}
}

View File

@ -0,0 +1,69 @@
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.List;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.LuceneTestCase;
/** Tests that > 64k affixes actually works and doesnt overflow some internal int */
public class Test64kAffixes extends LuceneTestCase {
public void test() throws Exception {
File tempDir = createTempDir("64kaffixes");
File affix = new File(tempDir, "64kaffixes.aff");
File dict = new File(tempDir, "64kaffixes.dic");
BufferedWriter affixWriter = new BufferedWriter(
new OutputStreamWriter(
new FileOutputStream(affix), StandardCharsets.UTF_8));
// 65k affixes with flag 1, then an affix with flag 2
affixWriter.write("SET UTF-8\nFLAG num\nSFX 1 Y 65536\n");
for (int i = 0; i < 65536; i++) {
affixWriter.write("SFX 1 0 " + Integer.toHexString(i) + " .\n");
}
affixWriter.write("SFX 2 Y 1\nSFX 2 0 s\n");
affixWriter.close();
BufferedWriter dictWriter = new BufferedWriter(
new OutputStreamWriter(
new FileOutputStream(dict), StandardCharsets.UTF_8));
// drink signed with affix 2 (takes -s)
dictWriter.write("1\ndrink/2\n");
dictWriter.close();
try (InputStream affStream = new FileInputStream(affix); InputStream dictStream = new FileInputStream(dict)) {
Dictionary dictionary = new Dictionary(affStream, dictStream);
Stemmer stemmer = new Stemmer(dictionary);
// drinks should still stem to drink
List<CharsRef> stems = stemmer.stem("drinks");
assertEquals(1, stems.size());
assertEquals("drink", stems.get(0).toString());
}
}
}