LUCENE-5826: Support proper hunspell case handling and related options

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1611161 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2014-07-16 20:01:49 +00:00
parent 1abbfdb1f7
commit a9bb31174c
18 changed files with 513 additions and 17 deletions

View File

@ -108,6 +108,9 @@ New Features
* LUCENE-5806: Extend expressions grammar to support array access in variables.
Added helper class VariableContext to parse complex variable into pieces.
(Ryan Ernst)
* LUCENE-5826: Support proper hunspell case handling, LANG, KEEPCASE, NEEDAFFIX,
and ONLYINCOMPOUND flags. (Robert Muir)
API Changes

View File

@ -85,6 +85,11 @@ public class Dictionary {
private static final String ICONV_KEY = "ICONV";
private static final String OCONV_KEY = "OCONV";
private static final String FULLSTRIP_KEY = "FULLSTRIP";
private static final String LANG_KEY = "LANG";
private static final String KEEPCASE_KEY = "KEEPCASE";
private static final String NEEDAFFIX_KEY = "NEEDAFFIX";
private static final String PSEUDOROOT_KEY = "PSEUDOROOT";
private static final String ONLYINCOMPOUND_KEY = "ONLYINCOMPOUND";
private static final String NUM_FLAG_TYPE = "num";
private static final String UTF8_FLAG_TYPE = "UTF-8";
@ -140,6 +145,9 @@ public class Dictionary {
boolean twoStageAffix; // if no affixes have continuation classes, no need to do 2-level affix stripping
int circumfix = -1; // circumfix flag, or -1 if one is not defined
int keepcase = -1; // keepcase flag, or -1 if one is not defined
int needaffix = -1; // needaffix flag, or -1 if one is not defined
int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined
// ignored characters (dictionary, affix, inputs)
private char[] ignore;
@ -154,6 +162,11 @@ public class Dictionary {
// true if we can strip suffixes "down to nothing"
boolean fullStrip;
// language declaration of the dictionary
String language;
// true if case algorithms should use alternate (Turkish/Azeri) mapping
boolean alternateCasing;
/**
* Creates a new Dictionary containing the information read from the provided InputStreams to hunspell affix
* and dictionary files.
@ -315,6 +328,24 @@ public class Dictionary {
throw new ParseException("Illegal CIRCUMFIX declaration", reader.getLineNumber());
}
circumfix = flagParsingStrategy.parseFlag(parts[1]);
} else if (line.startsWith(KEEPCASE_KEY)) {
String parts[] = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal KEEPCASE declaration", reader.getLineNumber());
}
keepcase = flagParsingStrategy.parseFlag(parts[1]);
} else if (line.startsWith(NEEDAFFIX_KEY) || line.startsWith(PSEUDOROOT_KEY)) {
String parts[] = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal NEEDAFFIX declaration", reader.getLineNumber());
}
needaffix = flagParsingStrategy.parseFlag(parts[1]);
} else if (line.startsWith(ONLYINCOMPOUND_KEY)) {
String parts[] = line.split("\\s+");
if (parts.length != 2) {
throw new ParseException("Illegal ONLYINCOMPOUND declaration", reader.getLineNumber());
}
onlyincompound = flagParsingStrategy.parseFlag(parts[1]);
} else if (line.startsWith(IGNORE_KEY)) {
String parts[] = line.split("\\s+");
if (parts.length != 2) {
@ -340,6 +371,9 @@ public class Dictionary {
}
} else if (line.startsWith(FULLSTRIP_KEY)) {
fullStrip = true;
} else if (line.startsWith(LANG_KEY)) {
language = line.substring(LANG_KEY.length()).trim();
alternateCasing = "tr_TR".equals(language) || "az_AZ".equals(language);
}
}
@ -1108,7 +1142,7 @@ public class Dictionary {
if (ignoreCase && iconv == null) {
// if we have no input conversion mappings, do this on-the-fly
ch = Character.toLowerCase(ch);
ch = caseFold(ch);
}
reuse.append(ch);
@ -1122,7 +1156,7 @@ public class Dictionary {
}
if (ignoreCase) {
for (int i = 0; i < reuse.length(); i++) {
reuse.setCharAt(i, Character.toLowerCase(reuse.charAt(i)));
reuse.setCharAt(i, caseFold(reuse.charAt(i)));
}
}
}
@ -1130,6 +1164,21 @@ public class Dictionary {
return reuse;
}
/** folds single character (according to LANG if present) */
char caseFold(char c) {
if (alternateCasing) {
if (c == 'I') {
return 'ı';
} else if (c == 'İ') {
return 'i';
} else {
return Character.toLowerCase(c);
}
} else {
return Character.toLowerCase(c);
}
}
// TODO: this could be more efficient!
static void applyMappings(FST<CharsRef> fst, StringBuilder sb) throws IOException {
final FST.BytesReader bytesReader = fst.getBytesReader();

View File

@ -100,17 +100,104 @@ final class Stemmer {
word = scratchBuffer;
}
int caseType = caseOf(word, length);
if (caseType == UPPER_CASE) {
// upper: union exact, title, lower
caseFoldTitle(word, length);
caseFoldLower(titleBuffer, length);
List<CharsRef> list = doStem(word, length, false);
list.addAll(doStem(titleBuffer, length, true));
list.addAll(doStem(lowerBuffer, length, true));
return list;
} else if (caseType == TITLE_CASE) {
// title: union exact, lower
caseFoldLower(word, length);
List<CharsRef> list = doStem(word, length, false);
list.addAll(doStem(lowerBuffer, length, true));
return list;
} else {
// exact match only
return doStem(word, length, false);
}
}
// temporary buffers for case variants
private char[] lowerBuffer = new char[8];
private char[] titleBuffer = new char[8];
private static final int EXACT_CASE = 0;
private static final int TITLE_CASE = 1;
private static final int UPPER_CASE = 2;
/** returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word */
private int caseOf(char word[], int length) {
if (dictionary.ignoreCase || length == 0 || !Character.isUpperCase(word[0])) {
return EXACT_CASE;
}
// determine if we are title or lowercase (or something funky, in which its exact)
boolean seenUpper = false;
boolean seenLower = false;
for (int i = 1; i < length; i++) {
boolean v = Character.isUpperCase(word[i]);
seenUpper |= v;
seenLower |= !v;
}
if (!seenLower) {
return UPPER_CASE;
} else if (!seenUpper) {
return TITLE_CASE;
} else {
return EXACT_CASE;
}
}
/** folds titlecase variant of word to titleBuffer */
private void caseFoldTitle(char word[], int length) {
titleBuffer = ArrayUtil.grow(titleBuffer, length);
System.arraycopy(word, 0, titleBuffer, 0, length);
for (int i = 1; i < length; i++) {
titleBuffer[i] = dictionary.caseFold(titleBuffer[i]);
}
}
/** folds lowercase variant of word (title cased) to lowerBuffer */
private void caseFoldLower(char word[], int length) {
lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
System.arraycopy(word, 0, lowerBuffer, 0, length);
lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
}
private List<CharsRef> doStem(char word[], int length, boolean caseVariant) {
List<CharsRef> stems = new ArrayList<>();
IntsRef forms = dictionary.lookupWord(word, 0, length);
if (forms != null) {
// TODO: some forms should not be added, e.g. ONLYINCOMPOUND
// just because it exists, does not make it valid...
for (int i = 0; i < forms.length; i += formStep) {
boolean checkKeepCase = caseVariant && dictionary.keepcase != -1;
boolean checkNeedAffix = dictionary.needaffix != -1;
boolean checkOnlyInCompound = dictionary.onlyincompound != -1;
if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) {
dictionary.flagLookup.get(forms.ints[forms.offset+i], scratch);
char wordFlags[] = Dictionary.decodeFlags(scratch);
// we are looking for a case variant, but this word does not allow it
if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char)dictionary.keepcase)) {
continue;
}
// we can't add this form, its a pseudostem requiring an affix
if (checkNeedAffix && Dictionary.hasFlag(wordFlags, (char)dictionary.needaffix)) {
continue;
}
// we can't add this form, it only belongs inside a compound word
if (checkOnlyInCompound && Dictionary.hasFlag(wordFlags, (char)dictionary.onlyincompound)) {
continue;
}
}
stems.add(newStem(word, length, forms, i));
}
}
try {
boolean v = stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false));
boolean v = stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false, caseVariant));
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
@ -203,9 +290,10 @@ final class Stemmer {
* but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse.
* @param circumfix true if the previous prefix removal was signed as a circumfix
* this means inner most suffix must also contain circumfix flag.
* @param caseVariant true if we are searching for a case variant. if the word has KEEPCASE flag it cannot succeed.
* @return List of stems, or empty list if no stems are found
*/
private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix) throws IOException {
private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix, boolean caseVariant) throws IOException {
// TODO: allow this stuff to be reused by tokenfilter
List<CharsRef> stems = new ArrayList<>();
@ -250,13 +338,22 @@ final class Stemmer {
final boolean compatible;
if (recursionDepth == 0) {
compatible = true;
if (dictionary.onlyincompound == -1) {
compatible = true;
} else {
// check if affix is allowed in a non-compound word
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
}
} else if (crossProduct) {
// cross check incoming continuation class (flag of previous affix) against list.
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
assert prevFlag >= 0;
compatible = hasCrossCheckedFlag((char)prevFlag, appendFlags, false);
boolean allowed = dictionary.onlyincompound == -1 ||
!Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
compatible = allowed && hasCrossCheckedFlag((char)prevFlag, appendFlags, false);
} else {
compatible = false;
}
@ -277,7 +374,7 @@ final class Stemmer {
System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);
List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1, recursionDepth, true, circumfix);
List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1, recursionDepth, true, circumfix, caseVariant);
stems.addAll(stemList);
}
@ -325,13 +422,22 @@ final class Stemmer {
final boolean compatible;
if (recursionDepth == 0) {
compatible = true;
if (dictionary.onlyincompound == -1) {
compatible = true;
} else {
// check if affix is allowed in a non-compound word
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
}
} else if (crossProduct) {
// cross check incoming continuation class (flag of previous affix) against list.
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
assert prevFlag >= 0;
compatible = hasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix);
boolean allowed = dictionary.onlyincompound == -1 ||
!Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
compatible = allowed && hasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix);
} else {
compatible = false;
}
@ -352,7 +458,7 @@ final class Stemmer {
System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag, recursionDepth, false, circumfix);
List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant);
stems.addAll(stemList);
}
@ -399,7 +505,7 @@ final class Stemmer {
* @param prefix true if we are removing a prefix (false if its a suffix)
* @return List of stems for the word, or an empty list if none are found
*/
List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix) throws IOException {
List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix, boolean caseVariant) throws IOException {
// TODO: just pass this in from before, no need to decode it twice
affixReader.setPosition(8 * affix);
char flag = (char) (affixReader.readShort() & 0xffff);
@ -439,6 +545,15 @@ final class Stemmer {
continue;
}
}
// we are looking for a case variant, but this word does not allow it
if (caseVariant && dictionary.keepcase != -1 && Dictionary.hasFlag(wordFlags, (char)dictionary.keepcase)) {
continue;
}
// we aren't decompounding (yet)
if (dictionary.onlyincompound != -1 && Dictionary.hasFlag(wordFlags, (char)dictionary.onlyincompound)) {
continue;
}
stems.add(newStem(strippedWord, length, forms, i));
}
}
@ -457,20 +572,20 @@ final class Stemmer {
// we took away the first prefix.
// COMPLEXPREFIXES = true: combine with a second prefix and another suffix
// COMPLEXPREFIXES = false: combine with a suffix
stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix));
stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix, caseVariant));
} else if (dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
// we took away a suffix.
// COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed
// COMPLEXPREFIXES = false: combine with another suffix
stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix));
stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix, caseVariant));
}
} else if (recursionDepth == 1) {
if (prefix && dictionary.complexPrefixes) {
// we took away the second prefix: go look for another suffix
stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix));
stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix, caseVariant));
} else if (prefix == false && dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
// we took away a prefix, then a suffix: go look for another suffix
stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix));
stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix, caseVariant));
}
}
}

View File

@ -0,0 +1,62 @@
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.junit.BeforeClass;
public class TestAlternateCasing extends StemmerTestBase {
@BeforeClass
public static void beforeClass() throws Exception {
init("alternate-casing.aff", "alternate-casing.dic");
}
public void testPossibilities() {
assertStemsTo("drink", "drink");
assertStemsTo("DRİNK", "drink");
assertStemsTo("DRINK");
assertStemsTo("drinki", "drink");
assertStemsTo("DRİNKİ", "drink");
assertStemsTo("DRİNKI");
assertStemsTo("DRINKI");
assertStemsTo("DRINKİ");
assertStemsTo("idrink", "drink");
assertStemsTo("İDRİNK", "drink");
assertStemsTo("IDRİNK");
assertStemsTo("IDRINK");
assertStemsTo("İDRINK");
assertStemsTo("idrinki", "drink");
assertStemsTo("İDRİNKİ", "drink");
assertStemsTo("rıver", "rıver");
assertStemsTo("RIVER", "rıver");
assertStemsTo("RİVER");
assertStemsTo("rıverı", "rıver");
assertStemsTo("RIVERI", "rıver");
assertStemsTo("RİVERI");
assertStemsTo("RİVERİ");
assertStemsTo("RIVERİ");
assertStemsTo("ırıver", "rıver");
assertStemsTo("IRIVER", "rıver");
assertStemsTo("IRİVER");
assertStemsTo("İRİVER");
assertStemsTo("İRIVER");
assertStemsTo("ırıverı", "rıver");
assertStemsTo("IRIVERI", "rıver");
assertStemsTo("Irıverı", "rıver");
}
}

View File

@ -0,0 +1,66 @@
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.junit.BeforeClass;
public class TestCaseSensitive extends StemmerTestBase {
@BeforeClass
public static void beforeClass() throws Exception {
init("casesensitive.aff", "casesensitive.dic");
}
public void testAllPossibilities() {
assertStemsTo("drink", "drink");
assertStemsTo("drinks", "drink");
assertStemsTo("drinkS", "drink");
assertStemsTo("gooddrinks", "drink");
assertStemsTo("Gooddrinks", "drink", "drink");
assertStemsTo("GOODdrinks", "drink");
assertStemsTo("gooddrinkS", "drink");
assertStemsTo("GooddrinkS", "drink");
assertStemsTo("gooddrink", "drink");
assertStemsTo("Gooddrink", "drink", "drink");
assertStemsTo("GOODdrink", "drink");
assertStemsTo("Drink", "drink", "Drink");
assertStemsTo("Drinks", "drink", "Drink");
assertStemsTo("DrinkS", "Drink");
assertStemsTo("goodDrinks", "Drink");
assertStemsTo("GoodDrinks", "Drink");
assertStemsTo("GOODDrinks", "Drink");
assertStemsTo("goodDrinkS", "Drink");
assertStemsTo("GoodDrinkS", "Drink");
assertStemsTo("GOODDrinkS", "Drink");
assertStemsTo("goodDrink", "Drink");
assertStemsTo("GoodDrink", "Drink");
assertStemsTo("GOODDrink", "Drink");
assertStemsTo("DRINK", "DRINK", "drink", "Drink");
assertStemsTo("DRINKs", "DRINK");
assertStemsTo("DRINKS", "DRINK", "drink", "Drink");
assertStemsTo("goodDRINKs", "DRINK");
assertStemsTo("GoodDRINKs", "DRINK");
assertStemsTo("GOODDRINKs", "DRINK");
assertStemsTo("goodDRINKS", "DRINK");
assertStemsTo("GoodDRINKS", "DRINK");
assertStemsTo("GOODDRINKS", "DRINK", "drink", "drink");
assertStemsTo("goodDRINK", "DRINK");
assertStemsTo("GoodDRINK", "DRINK");
assertStemsTo("GOODDRINK", "DRINK", "drink", "drink");
}
}

View File

@ -0,0 +1,45 @@
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.junit.BeforeClass;
public class TestKeepCase extends StemmerTestBase {
@BeforeClass
public static void beforeClass() throws Exception {
init("keepcase.aff", "keepcase.dic");
}
public void testPossibilities() {
assertStemsTo("drink", "drink");
assertStemsTo("Drink", "drink");
assertStemsTo("DRINK", "drink");
assertStemsTo("drinks", "drink");
assertStemsTo("Drinks", "drink");
assertStemsTo("DRINKS", "drink");
assertStemsTo("walk", "walk");
assertStemsTo("walks", "walk");
assertStemsTo("Walk");
assertStemsTo("Walks");
assertStemsTo("WALKS");
assertStemsTo("test", "test");
assertStemsTo("Test");
assertStemsTo("TEST");
}
}

View File

@ -0,0 +1,41 @@
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.junit.BeforeClass;
public class TestNeedAffix extends StemmerTestBase {
@BeforeClass
public static void beforeClass() throws Exception {
init("needaffix.aff", "needaffix.dic");
}
public void testPossibilities() {
assertStemsTo("drink", "drink");
assertStemsTo("drinks", "drink");
assertStemsTo("walk");
assertStemsTo("walks", "walk");
assertStemsTo("prewalk", "walk");
assertStemsTo("prewalks", "walk");
assertStemsTo("test");
assertStemsTo("pretest");
assertStemsTo("tests");
assertStemsTo("pretests");
}
}

View File

@ -0,0 +1,37 @@
package org.apache.lucene.analysis.hunspell;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.junit.BeforeClass;
public class TestOnlyInCompound extends StemmerTestBase {
@BeforeClass
public static void beforeClass() throws Exception {
init("onlyincompound.aff", "onlyincompound.dic");
}
public void testPossibilities() {
assertStemsTo("drink", "drink");
assertStemsTo("drinks", "drink");
assertStemsTo("drinked");
assertStemsTo("predrink");
assertStemsTo("predrinked");
assertStemsTo("walk");
}
}

View File

@ -0,0 +1,15 @@
SET UTF-8
LANG tr_TR
PFX A Y 1
PFX A 0 ı . +dotlessprefix
PFX B Y 1
PFX B 0 i . +dottedprefix
SFX X Y 1
SFX X 0 ı . +dotlesssuffix
SFX Y Y 1
SFX Y 0 i . +dottedsuffix

View File

@ -0,0 +1,4 @@
3
drink/BY
rıver/AX

View File

@ -0,0 +1,16 @@
SET UTF-8
PFX A Y 1
PFX A 0 good . +good
PFX B Y 1
PFX B 0 Good . +Good
PFX C Y 1
PFX C 0 GOOD . +GOOD
SFX X Y 1
SFX X 0 s . +s
SFX Y Y 1
SFX Y 0 S . +S

View File

@ -0,0 +1,4 @@
3
drink/XYABC
Drink/XYABC
DRINK/XYABC

View File

@ -0,0 +1,6 @@
SET UTF-8
KEEPCASE Z
SFX X Y 1
SFX X 0 s . +s

View File

@ -0,0 +1,4 @@
3
drink/X
walk/XZ
test/Z

View File

@ -0,0 +1,9 @@
SET UTF-8
NEEDAFFIX Z
PFX Y Y 1
PFX Y 0 pre . pre+
SFX X Y 1
SFX X 0 s . +s

View File

@ -0,0 +1,4 @@
3
drink/X
walk/XYZ
test/Z

View File

@ -0,0 +1,12 @@
SET UTF-8
ONLYINCOMPOUND A
PFX Y Y 1
PFX Y 0 pre/A . pre+
SFX X Y 1
SFX X 0 s . +s
SFX Z Y 1
SFX Z 0 ed/A . +ed

View File

@ -0,0 +1,4 @@
2
drink/XYZ
walk/A