mirror of https://github.com/apache/lucene.git
LUCENE-5838: fix hunspell when .aff file has over 64k affixes
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1612349 13f79535-47bb-0310-9956-ffa450edef68
parent 052b84db2d
commit 0901f25e20
lucene/CHANGES.txt
@@ -195,6 +195,8 @@ Bug Fixes
 * LUCENE-5827: Make all Directory implementations correctly fail with
   IllegalArgumentException if slices are out of bounds. (Uwe Schindler)
 
+* LUCENE-5838: Fix hunspell when the .aff file has over 64k affixes. (Robert Muir)
+
 Test Framework
 
 * LUCENE-5786: Unflushed/truncated events file (hung testing subprocess).
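For context on the entry above: a hunspell .aff file assigns each affix rule a running ordinal, and the old code stored that ordinal as a char. A minimal sketch (illustrative only, not part of the commit) of why a char-typed ordinal wraps back to 0 once more than 65,535 rules have been read, while an int keeps counting:

public class CharOverflowDemo {
  public static void main(String[] args) {
    int currentAffix = 65536;             // ordinal of the 65537th affix rule
    char truncated = (char) currentAffix; // char is an unsigned 16-bit value: wraps to 0
    System.out.println((int) truncated);  // prints 0 -> collides with rule #0
    System.out.println(currentAffix);     // prints 65536 -> stays unique as an int
  }
}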
org/apache/lucene/analysis/hunspell/Dictionary.java
@@ -289,8 +289,8 @@ public class Dictionary {
    * @throws IOException Can be thrown while reading from the InputStream
    */
   private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException, ParseException {
-    TreeMap<String, List<Character>> prefixes = new TreeMap<>();
-    TreeMap<String, List<Character>> suffixes = new TreeMap<>();
+    TreeMap<String, List<Integer>> prefixes = new TreeMap<>();
+    TreeMap<String, List<Integer>> suffixes = new TreeMap<>();
     Map<String,Integer> seenPatterns = new HashMap<>();
 
     // zero condition -> 0 ord
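The seenPatterns map above deduplicates condition patterns as the affix rules are read (see the "@param seenPatterns" note in the later hunk). A rough sketch of that idea, standalone and illustrative only; the variable names mirror the diff but this is not Dictionary's actual parsing code:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class PatternDedupSketch {
  public static void main(String[] args) {
    Map<String,Integer> seenPatterns = new HashMap<>();
    List<String> patterns = new ArrayList<>();
    // Many affix rules share the same condition (e.g. "." matches anything),
    // so each distinct condition gets one index and later rules reuse it.
    for (String condition : new String[] {".", "[^aeiou]y", "."}) {
      Integer patternIndex = seenPatterns.get(condition);
      if (patternIndex == null) {
        patternIndex = patterns.size();
        patterns.add(condition);
        seenPatterns.put(condition, patternIndex);
      }
      System.out.println(condition + " -> pattern index " + patternIndex);
    }
  }
}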
@@ -397,16 +397,15 @@ public class Dictionary {
     stripOffsets[currentIndex] = currentOffset;
   }
 
-  private FST<IntsRef> affixFST(TreeMap<String,List<Character>> affixes) throws IOException {
+  private FST<IntsRef> affixFST(TreeMap<String,List<Integer>> affixes) throws IOException {
     IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
     Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
 
     IntsRef scratch = new IntsRef();
-    for (Map.Entry<String,List<Character>> entry : affixes.entrySet()) {
+    for (Map.Entry<String,List<Integer>> entry : affixes.entrySet()) {
       Util.toUTF32(entry.getKey(), scratch);
-      List<Character> entries = entry.getValue();
+      List<Integer> entries = entry.getValue();
       IntsRef output = new IntsRef(entries.size());
-      for (Character c : entries) {
+      for (Integer c : entries) {
         output.ints[output.length++] = c;
       }
       builder.add(scratch, output);
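affixFST compiles those per-affix ordinal lists into an FST keyed by the affix text. A self-contained sketch of the same pattern; it assumes the Lucene 4.x org.apache.lucene.util.fst API seen in the hunk above, and the example data is made up:

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.IntSequenceOutputs;
import org.apache.lucene.util.fst.Util;

public class AffixFstSketch {
  public static void main(String[] args) throws Exception {
    // Affix text -> int ordinals of the rules that append it (made-up data).
    TreeMap<String, List<Integer>> affixes = new TreeMap<>();
    affixes.put("es", Arrays.asList(1));
    affixes.put("s", Arrays.asList(0, 2));   // two rules append "s"

    IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
    Builder<IntsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE4, outputs);
    IntsRef scratch = new IntsRef();
    // TreeMap iterates keys in sorted order, which the FST builder requires.
    for (Map.Entry<String, List<Integer>> entry : affixes.entrySet()) {
      List<Integer> ords = entry.getValue();
      IntsRef output = new IntsRef(ords.size());
      for (Integer ord : ords) {
        output.ints[output.length++] = ord;  // int ordinal, safe past 65,535
      }
      builder.add(Util.toUTF32(entry.getKey(), scratch), output);
    }
    FST<IntsRef> fst = builder.finish();

    // Look up the ordinals stored under the affix text "s".
    System.out.println(Util.get(fst, Util.toUTF32("s", new IntsRef())));
  }
}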
@@ -444,7 +443,7 @@ public class Dictionary {
    * @param seenPatterns map from condition -> index of patterns, for deduplication.
    * @throws IOException Can be thrown while reading the rule
    */
-  private void parseAffix(TreeMap<String,List<Character>> affixes,
+  private void parseAffix(TreeMap<String,List<Integer>> affixes,
                           String header,
                           LineNumberReader reader,
                           String conditionPattern,
@@ -564,13 +563,12 @@ public class Dictionary {
         affixArg = new StringBuilder(affixArg).reverse().toString();
       }
 
-      List<Character> list = affixes.get(affixArg);
+      List<Integer> list = affixes.get(affixArg);
       if (list == null) {
         list = new ArrayList<>();
         affixes.put(affixArg, list);
       }
 
-      list.add((char)currentAffix);
+      list.add(currentAffix);
       currentAffix++;
     }
   }
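The change above is where the ordinal is actually recorded: each parsed rule appends its int ordinal to the list kept per affix text, and that map is what affixFST later compiles. A small standalone sketch of that bookkeeping; the line parsing here is deliberately simplified and is not Dictionary's real parser:

import java.util.ArrayList;
import java.util.List;
import java.util.TreeMap;

public class AffixOrdinalSketch {
  public static void main(String[] args) {
    String[] ruleLines = {
        "SFX 1 0 s .",    // flag 1: strip nothing, append "s", any condition
        "SFX 1 0 es .",
        "SFX 2 0 s ."     // flag 2 also appends "s"
    };
    TreeMap<String, List<Integer>> suffixes = new TreeMap<>();
    int currentAffix = 0;                        // int ordinal: no wrap past 65,535
    for (String line : ruleLines) {
      String affixArg = line.split("\\s+")[3];   // the appended text, e.g. "s"
      List<Integer> list = suffixes.get(affixArg);
      if (list == null) {
        list = new ArrayList<>();
        suffixes.put(affixArg, list);
      }
      list.add(currentAffix);                    // was list.add((char)currentAffix)
      currentAffix++;
    }
    System.out.println(suffixes);                // prints {es=[1], s=[0, 2]}
  }
}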
org/apache/lucene/analysis/hunspell/Test64kAffixes.java (new file)
@@ -0,0 +1,69 @@
+package org.apache.lucene.analysis.hunspell;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import org.apache.lucene.util.CharsRef;
+import org.apache.lucene.util.LuceneTestCase;
+
+/** Tests that > 64k affixes actually works and doesnt overflow some internal int */
+public class Test64kAffixes extends LuceneTestCase {
+
+  public void test() throws Exception {
+    File tempDir = createTempDir("64kaffixes");
+    File affix = new File(tempDir, "64kaffixes.aff");
+    File dict = new File(tempDir, "64kaffixes.dic");
+
+    BufferedWriter affixWriter = new BufferedWriter(
+        new OutputStreamWriter(
+            new FileOutputStream(affix), StandardCharsets.UTF_8));
+
+    // 65k affixes with flag 1, then an affix with flag 2
+    affixWriter.write("SET UTF-8\nFLAG num\nSFX 1 Y 65536\n");
+    for (int i = 0; i < 65536; i++) {
+      affixWriter.write("SFX 1 0 " + Integer.toHexString(i) + " .\n");
+    }
+    affixWriter.write("SFX 2 Y 1\nSFX 2 0 s\n");
+    affixWriter.close();
+
+    BufferedWriter dictWriter = new BufferedWriter(
+        new OutputStreamWriter(
+            new FileOutputStream(dict), StandardCharsets.UTF_8));
+
+    // drink signed with affix 2 (takes -s)
+    dictWriter.write("1\ndrink/2\n");
+    dictWriter.close();
+
+    try (InputStream affStream = new FileInputStream(affix); InputStream dictStream = new FileInputStream(dict)) {
+      Dictionary dictionary = new Dictionary(affStream, dictStream);
+      Stemmer stemmer = new Stemmer(dictionary);
+      // drinks should still stem to drink
+      List<CharsRef> stems = stemmer.stem("drinks");
+      assertEquals(1, stems.size());
+      assertEquals("drink", stems.get(0).toString());
+    }
+  }
+}
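Beyond the direct Stemmer check in the test above, a loaded Dictionary is normally wired into an analysis chain. A hedged usage sketch, not part of the commit: the file paths are placeholders, and it assumes the Lucene 4.x KeywordTokenizer and HunspellStemFilter constructors:

import java.io.FileInputStream;
import java.io.InputStream;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.HunspellStemFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class HunspellUsageSketch {
  public static void main(String[] args) throws Exception {
    // Placeholder paths -- substitute a real .aff/.dic pair.
    try (InputStream aff = new FileInputStream("lang.aff");
         InputStream dic = new FileInputStream("lang.dic")) {
      Dictionary dictionary = new Dictionary(aff, dic);
      TokenStream stream = new HunspellStemFilter(
          new KeywordTokenizer(new StringReader("drinks")), dictionary);
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        System.out.println(term.toString()); // stems emitted for "drinks"
      }
      stream.end();
      stream.close();
    }
  }
}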