From 513359ba8cb7f117d14f9b5080799c2bf0f4eb74 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 15 Jul 2014 10:59:29 +0000 Subject: [PATCH] LUCENE-5823: recognize hunspell FULLSTRIP option in the affix file git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1610644 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 5 +-- .../lucene/analysis/hunspell/Dictionary.java | 6 ++++ .../lucene/analysis/hunspell/Stemmer.java | 6 ++-- .../analysis/hunspell/TestFullStrip.java | 31 +++++++++++++++++++ .../lucene/analysis/hunspell/fullstrip.aff | 6 ++++ .../lucene/analysis/hunspell/fullstrip.dic | 2 ++ 6 files changed, 52 insertions(+), 4 deletions(-) create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestFullStrip.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.dic diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a13b673c2a8..d1e47be5c2b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -159,8 +159,9 @@ Bug Fixes * LUCENE-5817: Fix hunspell zero-affix handling: previously only zero-strips worked correctly. (Robert Muir) -* LUCENE-5818: Fix hunspell overgeneration for short strings that also match affixes. - (Robert Muir) +* LUCENE-5818, LUCENE-5823: Fix hunspell overgeneration for short strings that also + match affixes, words are only stripped to a zero-length string if FULLSTRIP option + is specifed in the dictionary. (Robert Muir) Test Framework diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java index 17a4db07dd2..179887a0a66 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Dictionary.java @@ -84,6 +84,7 @@ public class Dictionary { private static final String IGNORE_KEY = "IGNORE"; private static final String ICONV_KEY = "ICONV"; private static final String OCONV_KEY = "OCONV"; + private static final String FULLSTRIP_KEY = "FULLSTRIP"; private static final String NUM_FLAG_TYPE = "num"; private static final String UTF8_FLAG_TYPE = "UTF-8"; @@ -150,6 +151,9 @@ public class Dictionary { boolean needsInputCleaning; boolean needsOutputCleaning; + // true if we can strip suffixes "down to nothing" + boolean fullStrip; + /** * Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix * and dictionary files. @@ -334,6 +338,8 @@ public class Dictionary { oconv = res; needsOutputCleaning |= oconv != null; } + } else if (line.startsWith(FULLSTRIP_KEY)) { + fullStrip = true; } } diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java index e9a7a3e9184..1030920972a 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Stemmer.java @@ -218,7 +218,8 @@ final class Stemmer { fst.getFirstArc(arc); IntsRef NO_OUTPUT = outputs.getNoOutput(); IntsRef output = NO_OUTPUT; - for (int i = 0; i < length-1; i++) { + int limit = dictionary.fullStrip ? length : length-1; + for (int i = 0; i < limit; i++) { if (i > 0) { int ch = word[i-1]; if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) { @@ -292,7 +293,8 @@ final class Stemmer { fst.getFirstArc(arc); IntsRef NO_OUTPUT = outputs.getNoOutput(); IntsRef output = NO_OUTPUT; - for (int i = length; i > 0; i--) { + int limit = dictionary.fullStrip ? 0 : 1; + for (int i = length; i >= limit; i--) { if (i < length) { int ch = word[i]; if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestFullStrip.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestFullStrip.java new file mode 100644 index 00000000000..988aa9bed34 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestFullStrip.java @@ -0,0 +1,31 @@ +package org.apache.lucene.analysis.hunspell; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.junit.BeforeClass; + +public class TestFullStrip extends StemmerTestBase { + @BeforeClass + public static void beforeClass() throws Exception { + init("fullstrip.aff", "fullstrip.dic"); + } + + public void testStemming() { + assertStemsTo("tasty", "beer"); + } +} diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.aff new file mode 100644 index 00000000000..9c2de7f7cff --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.aff @@ -0,0 +1,6 @@ +SET UTF-8 + +FULLSTRIP + +SFX A Y 1 +SFX A beer tasty . diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.dic new file mode 100644 index 00000000000..c948f184635 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/fullstrip.dic @@ -0,0 +1,2 @@ +1 +beer/A