LUCENE-9700: Hunspell: support words with trailing dots (#2249)

This commit is contained in:
Peter Gromov 2021-01-29 08:23:03 +01:00 committed by GitHub
parent a9ad02cc54
commit 800f4d0919
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 309 additions and 1 deletions

View File

@ -44,6 +44,14 @@ public class SpellChecker {
word = dictionary.cleanInput(word, new StringBuilder()).toString();
}
if (word.endsWith(".")) {
return spellWithTrailingDots(word);
}
return spellClean(word);
}
private boolean spellClean(String word) {
if (isNumber(word)) {
return true;
}
@ -67,6 +75,14 @@ public class SpellChecker {
return false;
}
/**
 * Spell-checks a word that ends with one or more dots.
 *
 * <p>Two candidates are tried in order: the word with every trailing dot removed, and then the
 * word with exactly one trailing dot kept (e.g. for {@code "etc..."} the candidates are {@code
 * "etc"} and {@code "etc."}). The word is accepted as soon as either candidate passes {@link
 * #spellClean(String)}.
 *
 * @param word the word to check; expected to end with {@code '.'}, as guaranteed by the caller
 * @return {@code true} if either candidate is accepted by {@code spellClean}
 */
private boolean spellWithTrailingDots(String word) {
  // Walk backwards to the first dot of the trailing run of dots.
  int firstTrailingDot = word.length() - 1;
  while (firstTrailingDot > 0 && word.charAt(firstTrailingDot - 1) == '.') {
    firstTrailingDot--;
  }

  String withoutDots = word.substring(0, firstTrailingDot);
  if (spellClean(withoutDots)) {
    return true;
  }

  // Dictionary entries may themselves end with a dot, so retry keeping a single one.
  String withSingleDot = word.substring(0, firstTrailingDot + 1);
  return spellClean(withSingleDot);
}
private boolean checkCaseVariants(char[] wordChars, WordCase wordCase) {
char[] caseVariant = wordChars;
if (wordCase == WordCase.UPPER) {

View File

@ -26,6 +26,16 @@ import org.apache.lucene.util.IOUtils;
import org.junit.Test;
public class SpellCheckerTest extends StemmerTestBase {
@Test
public void base() throws Exception {
doTest("base");
}
@Test
public void keepcase() throws Exception {
doTest("keepcase");
}
@Test
public void allcaps() throws Exception {
doTest("allcaps");

View File

@ -40,5 +40,11 @@ public class TestKeepCase extends StemmerTestBase {
assertStemsTo("test", "test");
assertStemsTo("Test");
assertStemsTo("TEST");
assertStemsTo("baz.", "baz.");
assertStemsTo("Baz.");
assertStemsTo("Quux.", "Quux.");
assertStemsTo("QUUX.");
}
}

View File

@ -0,0 +1,192 @@
# OpenOffice.org's en_US.aff file
SET ISO8859-1
TRY esianrtolcdugmphbyfvkwz'
WORDCHARS .'
PFX A Y 1
PFX A 0 re .
PFX I Y 1
PFX I 0 in .
PFX U Y 1
PFX U 0 un .
PFX C Y 1
PFX C 0 de .
PFX E Y 1
PFX E 0 dis .
PFX F Y 1
PFX F 0 con .
PFX K Y 1
PFX K 0 pro .
SFX V N 2
SFX V e ive e
SFX V 0 ive [^e]
SFX N Y 3
SFX N e ion e
SFX N y ication y
SFX N 0 en [^ey]
SFX X Y 3
SFX X e ions e
SFX X y ications y
SFX X 0 ens [^ey]
SFX H N 2
SFX H y ieth y
SFX H 0 th [^y]
SFX Y Y 1
SFX Y 0 ly .
SFX G Y 2
SFX G e ing e
SFX G 0 ing [^e]
SFX J Y 2
SFX J e ings e
SFX J 0 ings [^e]
SFX D Y 4
SFX D 0 d e
SFX D y ied [^aeiou]y
SFX D 0 ed [^ey]
SFX D 0 ed [aeiou]y
SFX T N 4
SFX T 0 st e
SFX T y iest [^aeiou]y
SFX T 0 est [aeiou]y
SFX T 0 est [^ey]
SFX R Y 4
SFX R 0 r e
SFX R y ier [^aeiou]y
SFX R 0 er [aeiou]y
SFX R 0 er [^ey]
SFX Z Y 4
SFX Z 0 rs e
SFX Z y iers [^aeiou]y
SFX Z 0 ers [aeiou]y
SFX Z 0 ers [^ey]
SFX S Y 4
SFX S y ies [^aeiou]y
SFX S 0 s [aeiou]y
SFX S 0 es [sxzh]
SFX S 0 s [^sxzhy]
SFX P Y 3
SFX P y iness [^aeiou]y
SFX P 0 ness [aeiou]y
SFX P 0 ness [^y]
SFX M Y 1
SFX M 0 's .
SFX B Y 3
SFX B 0 able [^aeiou]
SFX B 0 able ee
SFX B e able [^aeiou]e
SFX L Y 1
SFX L 0 ment .
REP 88
REP a ei
REP ei a
REP a ey
REP ey a
REP ai ie
REP ie ai
REP are air
REP are ear
REP are eir
REP air are
REP air ere
REP ere air
REP ere ear
REP ere eir
REP ear are
REP ear air
REP ear ere
REP eir are
REP eir ere
REP ch te
REP te ch
REP ch ti
REP ti ch
REP ch tu
REP tu ch
REP ch s
REP s ch
REP ch k
REP k ch
REP f ph
REP ph f
REP gh f
REP f gh
REP i igh
REP igh i
REP i uy
REP uy i
REP i ee
REP ee i
REP j di
REP di j
REP j gg
REP gg j
REP j ge
REP ge j
REP s ti
REP ti s
REP s ci
REP ci s
REP k cc
REP cc k
REP k qu
REP qu k
REP kw qu
REP o eau
REP eau o
REP o ew
REP ew o
REP oo ew
REP ew oo
REP ew ui
REP ui ew
REP oo ui
REP ui oo
REP ew u
REP u ew
REP oo u
REP u oo
REP u oe
REP oe u
REP u ieu
REP ieu u
REP ue ew
REP ew ue
REP uff ough
REP oo ieu
REP ieu oo
REP ier ear
REP ear ier
REP ear air
REP air ear
REP w qu
REP qu w
REP z ss
REP ss z
REP shun tion
REP shun sion
REP shun cion

View File

@ -0,0 +1,29 @@
28
created/U
create/XKVNGADS
imply/GNSDX
natural/PUY
like/USPBY
convey/BDGS
look/GZRDS
text
hello
said
sawyer
NASA
rotten
day
tomorrow
seven
FAQ/SM
can't
doesn't
etc
won't
lip
text
horrifying
speech
suggest
uncreate/V
Hunspell

View File

@ -0,0 +1,28 @@
created
uncreate
uncreated
imply
implied
unnatural
conveyed
sawyer
NASA
FAQs
can't
doesn't
won't
Created
Hello
HELLO
NASA
etc.
etc
HELLO
lip.
text.
NASA.
Text.
TEXT.
Hunspell.
HUNSPELL.
HUNSPELL...

View File

@ -0,0 +1,11 @@
loooked
texxt
hlelo
seid
rottenday
tomorow
seeeven
Nasa
horrorfying
peech
sugesst

View File

@ -1,4 +1,8 @@
3
7
drink/X
walk/XZ
test/Z
foo/Z
Bar/Z
baz./Z
Quux./Z

View File

@ -0,0 +1,4 @@
foo
Bar
baz.
Quux.

View File

@ -0,0 +1,8 @@
Foo
FOO
BAR
bar
Baz.
BAZ.
quux.
QUUX.