mirror of https://github.com/apache/lucene.git
LUCENE-5485: hunspell circumfix support
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1573569 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
213eb2fd06
commit
4eed6367ff
|
@ -87,6 +87,8 @@ New Features
|
|||
* LUCENE-5479: FacetsConfig subclass can now customize the default
|
||||
per-dim facets configuration. (Rob Audenaerde via Mike McCandless)
|
||||
|
||||
* LUCENE-5485: Add circumfix support to HunspellStemFilter. (Robert Muir)
|
||||
|
||||
API Changes
|
||||
|
||||
* LUCENE-5454: Add RandomAccessOrds, an optional extension of SortedSetDocValues
|
||||
|
|
|
@ -66,6 +66,7 @@ public class Dictionary {
|
|||
private static final String SUFFIX_KEY = "SFX";
|
||||
private static final String FLAG_KEY = "FLAG";
|
||||
private static final String COMPLEXPREFIXES_KEY = "COMPLEXPREFIXES";
|
||||
private static final String CIRCUMFIX_KEY = "CIRCUMFIX";
|
||||
|
||||
private static final String NUM_FLAG_TYPE = "num";
|
||||
private static final String UTF8_FLAG_TYPE = "UTF-8";
|
||||
|
@ -107,6 +108,8 @@ public class Dictionary {
|
|||
boolean ignoreCase;
|
||||
boolean complexPrefixes;
|
||||
|
||||
int circumfix = -1; // circumfix flag, or -1 if one is not defined
|
||||
|
||||
/**
|
||||
* Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
|
||||
* and dictionary files.
|
||||
|
@ -240,6 +243,12 @@ public class Dictionary {
|
|||
flagParsingStrategy = getFlagParsingStrategy(line);
|
||||
} else if (line.equals(COMPLEXPREFIXES_KEY)) {
|
||||
complexPrefixes = true; // 2-stage prefix+1-stage suffix instead of 2-stage suffix+1-stage prefix
|
||||
} else if (line.startsWith(CIRCUMFIX_KEY)) {
|
||||
String parts[] = line.split("\\s+");
|
||||
if (parts.length != 2) {
|
||||
throw new ParseException("Illegal CIRCUMFIX declaration", reader.getLineNumber());
|
||||
}
|
||||
circumfix = flagParsingStrategy.parseFlag(parts[1]);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -81,7 +81,7 @@ final class Stemmer {
|
|||
stems.add(new CharsRef(word, 0, length));
|
||||
}
|
||||
}
|
||||
stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false));
|
||||
stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false));
|
||||
return stems;
|
||||
}
|
||||
|
||||
|
@ -122,9 +122,11 @@ final class Stemmer {
|
|||
* @param previousWasPrefix true if the previous removal was a prefix:
|
||||
* if we are removing a suffix, and it has no continuation requirements, its ok.
|
||||
* but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
|
||||
* @param circumfix true if the previous prefix removal was signed as a circumfix
|
||||
* this means inner most suffix must also contain circumfix flag.
|
||||
* @return List of stems, or empty list if no stems are found
|
||||
*/
|
||||
private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix) {
|
||||
private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix) {
|
||||
|
||||
// TODO: allow this stuff to be reused by tokenfilter
|
||||
List<CharsRef> stems = new ArrayList<CharsRef>();
|
||||
|
@ -171,7 +173,7 @@ final class Stemmer {
|
|||
.append(word, deAffixedStart, deAffixedLength)
|
||||
.toString();
|
||||
|
||||
List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, -1, recursionDepth, true);
|
||||
List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), prefix, -1, recursionDepth, true, circumfix);
|
||||
|
||||
stems.addAll(stemList);
|
||||
}
|
||||
|
@ -219,7 +221,7 @@ final class Stemmer {
|
|||
dictionary.stripLookup.get(stripOrd, scratch);
|
||||
String strippedWord = new StringBuilder().append(word, 0, deAffixedLength).append(scratch.utf8ToString()).toString();
|
||||
|
||||
List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, prefixFlag, recursionDepth, false);
|
||||
List<CharsRef> stemList = applyAffix(strippedWord.toCharArray(), strippedWord.length(), suffix, prefixFlag, recursionDepth, false, circumfix);
|
||||
|
||||
stems.addAll(stemList);
|
||||
}
|
||||
|
@ -242,7 +244,7 @@ final class Stemmer {
|
|||
* @param prefix true if we are removing a prefix (false if its a suffix)
|
||||
* @return List of stems for the word, or an empty list if none are found
|
||||
*/
|
||||
List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix) {
|
||||
List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) {
|
||||
segment.setLength(0);
|
||||
segment.append(strippedWord, 0, length);
|
||||
|
||||
|
@ -279,31 +281,49 @@ final class Stemmer {
|
|||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// if circumfix was previously set by a prefix, we must check this suffix,
|
||||
// to ensure it has it, and vice versa
|
||||
if (dictionary.circumfix != -1) {
|
||||
dictionary.flagLookup.get(append, scratch);
|
||||
char appendFlags[] = Dictionary.decodeFlags(scratch);
|
||||
boolean suffixCircumfix = Dictionary.hasFlag(appendFlags, (char)dictionary.circumfix);
|
||||
if (circumfix != suffixCircumfix) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
stems.add(new CharsRef(strippedWord, 0, length));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// if a circumfix flag is defined in the dictionary, and we are a prefix, we need to check if we have that flag
|
||||
if (dictionary.circumfix != -1 && !circumfix && prefix) {
|
||||
dictionary.flagLookup.get(append, scratch);
|
||||
char appendFlags[] = Dictionary.decodeFlags(scratch);
|
||||
circumfix = Dictionary.hasFlag(appendFlags, (char)dictionary.circumfix);
|
||||
}
|
||||
|
||||
if (crossProduct) {
|
||||
if (recursionDepth == 0) {
|
||||
if (prefix) {
|
||||
// we took away the first prefix.
|
||||
// COMPLEXPREFIXES = true: combine with a second prefix and another suffix
|
||||
// COMPLEXPREFIXES = false: combine with another suffix
|
||||
stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes, true, true));
|
||||
stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes, true, true, circumfix));
|
||||
} else if (!dictionary.complexPrefixes) {
|
||||
// we took away a suffix.
|
||||
// COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
|
||||
// COMPLEXPREFIXES = false: combine with another suffix
|
||||
stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false));
|
||||
stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix));
|
||||
}
|
||||
} else if (recursionDepth == 1) {
|
||||
if (prefix && dictionary.complexPrefixes) {
|
||||
// we took away the second prefix: go look for another suffix
|
||||
stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true));
|
||||
stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix));
|
||||
} else if (prefix == false && dictionary.complexPrefixes == false) {
|
||||
// we took away a prefix, then a suffix: go look for another suffix
|
||||
stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false));
|
||||
stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
package org.apache.lucene.analysis.hunspell;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
public class TestCircumfix extends StemmerTestBase {
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
init("circumfix.aff", "circumfix.dic");
|
||||
}
|
||||
|
||||
public void testCircumfix() {
|
||||
assertStemsTo("nagy", "nagy");
|
||||
assertStemsTo("nagyobb", "nagy");
|
||||
assertStemsTo("legnagyobb", "nagy");
|
||||
assertStemsTo("legeslegnagyobb", "nagy");
|
||||
assertStemsTo("nagyobbobb");
|
||||
assertStemsTo("legnagy");
|
||||
assertStemsTo("legeslegnagy");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,14 @@
|
|||
SET UTF-8
|
||||
|
||||
CIRCUMFIX X
|
||||
|
||||
PFX A Y 1
|
||||
PFX A 0 leg/X .
|
||||
|
||||
PFX B Y 1
|
||||
PFX B 0 legesleg/X .
|
||||
|
||||
SFX C Y 3
|
||||
SFX C 0 obb . +COMPARATIVE
|
||||
SFX C 0 obb/AX . +SUPERLATIVE
|
||||
SFX C 0 obb/BX . +SUPERSUPERLATIVE
|
|
@ -0,0 +1,2 @@
|
|||
1
|
||||
nagy/C [MN]
|
Loading…
Reference in New Issue