diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 9ab4421986a..567e26f519b 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -951,7 +951,7 @@ public class Dictionary { reuse.append(caseFold(word.charAt(i))); } reuse.append(FLAG_SEPARATOR); - flagParsingStrategy.appendFlag(HIDDEN_FLAG, reuse); + reuse.append(HIDDEN_FLAG); reuse.append(afterSep, afterSep.charAt(0) == FLAG_SEPARATOR ? 1 : 0, afterSep.length()); writer.write(reuse.toString().getBytes(StandardCharsets.UTF_8)); } @@ -1039,12 +1039,17 @@ public class Dictionary { entry = line.substring(0, end); } else { end = line.indexOf(MORPH_SEPARATOR); - String flagPart = line.substring(flagSep + 1, end); - if (aliasCount > 0) { + boolean hidden = line.charAt(flagSep + 1) == HIDDEN_FLAG; + String flagPart = line.substring(flagSep + (hidden ? 2 : 1), end); + if (aliasCount > 0 && !flagPart.isEmpty()) { flagPart = getAliasValue(Integer.parseInt(flagPart)); } wordForm = flagParsingStrategy.parseFlags(flagPart); + if (hidden) { + wordForm = ArrayUtil.growExact(wordForm, wordForm.length + 1); + wordForm[wordForm.length - 1] = HIDDEN_FLAG; + } Arrays.sort(wordForm); entry = line.substring(0, flagSep); } @@ -1275,8 +1280,6 @@ public class Dictionary { * @return Parsed flags */ abstract char[] parseFlags(String rawFlags); - - abstract void appendFlag(char flag, StringBuilder to); } /** @@ -1288,11 +1291,6 @@ public class Dictionary { public char[] parseFlags(String rawFlags) { return rawFlags.toCharArray(); } - - @Override - void appendFlag(char flag, StringBuilder to) { - to.append(flag); - } } /** Used to read flags as UTF-8 even if the rest of the file is in the default (8-bit) encoding */ @@ -1301,11 +1299,6 @@ public class Dictionary { public char[] parseFlags(String rawFlags) { return new String(rawFlags.getBytes(DEFAULT_CHARSET), StandardCharsets.UTF_8).toCharArray(); } - - @Override - void appendFlag(char flag, StringBuilder to) { - to.append(new String(String.valueOf(flag).getBytes(StandardCharsets.UTF_8), DEFAULT_CHARSET)); - } } /** @@ -1339,12 +1332,6 @@ public class Dictionary { } return flags; } - - @Override - void appendFlag(char flag, StringBuilder to) { - to.append((int) flag); - to.append(","); - } } /** @@ -1379,12 +1366,6 @@ public class Dictionary { builder.getChars(0, builder.length(), flags, 0); return flags; } - - @Override - void appendFlag(char flag, StringBuilder to) { - to.append((char) (flag >> 8)); - to.append((char) (flag & 0xff)); - } } boolean hasFlag(int entryId, char flag, BytesRef scratch) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCompressed.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCompressed.java new file mode 100644 index 00000000000..cb5d09e59cb --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestCompressed.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.analysis.hunspell; + +import org.junit.BeforeClass; + +public class TestCompressed extends StemmerTestBase { + @BeforeClass + public static void beforeClass() throws Exception { + init("compressed.aff", "compressed.dic"); + } + + public void test() { + assertStemsTo("apach", "apach"); + assertStemsTo("apache", "apach"); + assertStemsTo("apachee"); + + assertStemsTo("XYZ", "XYZ", "Xyz"); + assertStemsTo("XYZs", "XYZ"); + assertStemsTo("XYZS", "Xyz"); + assertStemsTo("xyz"); + + assertStemsTo("mixedCase", "mixedCase"); + assertStemsTo("MIXEDCASE", "Mixedcase"); + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java index 110f487c29b..289f4ed81ab 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestDictionary.java @@ -20,10 +20,8 @@ import java.io.ByteArrayInputStream; import java.io.FilterInputStream; import java.io.IOException; import java.io.InputStream; -import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.text.ParseException; -import java.util.Random; import org.apache.lucene.store.ByteBuffersDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; @@ -293,31 +291,6 @@ public class TestDictionary extends LuceneTestCase { assertEquals(src, new String(strategy.parseFlags(asAscii))); } - @Test - public void testFlagSerialization() { - Random r = random(); - char[] flags = new char[r.nextInt(10)]; - for (int i = 0; i < flags.length; i++) { - flags[i] = (char) r.nextInt(Character.MIN_SURROGATE); - } - - String[] flagLines = {"FLAG long", "FLAG UTF-8", "FLAG num"}; - Charset[] charsets = {StandardCharsets.UTF_8, Dictionary.DEFAULT_CHARSET}; - for (String flagLine : flagLines) { - for (Charset charset : charsets) { - Dictionary.FlagParsingStrategy strategy = - Dictionary.getFlagParsingStrategy(flagLine, charset); - StringBuilder serialized = new StringBuilder(); - for (char flag : flags) { - strategy.appendFlag(flag, serialized); - } - - char[] deserialized = strategy.parseFlags(serialized.toString()); - assertEquals(new String(flags), new String(deserialized)); - } - } - } - private Directory getDirectory() { return newDirectory(); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.aff index c747c27ef80..93601f100fb 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.aff +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.aff @@ -20,7 +20,7 @@ SFX CC 0 d/3 c SFX CC 0 c b SFX DD Y 1 -SFX DD 0 s o +SFX DD 0 s . SFX EE Y 1 SFX EE 0 d o diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.dic index dd3890fae31..e162d77c3f4 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.dic +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/compressed.dic @@ -1,4 +1,4 @@ -6 +10 ab/3 apach/1 foo/4 @@ -7,3 +7,5 @@ lucen/1 lucene mahout/1 olr/2 +XYZ/4 +mixedCase \ No newline at end of file