LUCENE-8124: Fixed HyphenationCompoundWordTokenFilter to correctly handle hyphenation patterns with indicator >= 7.

This commit is contained in:
Adrien Grand 2018-01-22 08:46:01 +01:00
parent fc6f3a45f8
commit f5e2267097
3 changed files with 20 additions and 2 deletions

View File

@ -146,6 +146,9 @@ Bug Fixes
* LUCENE-8130: Fix NullPointerException from TermStates.toString() (Mike McCandless)
* LUCENE-8124: Fixed HyphenationCompoundWordTokenFilter to correctly handle
hyphenation patterns with indicator >= 7. (Holger Bruch via Adrien Grand)
Other
* LUCENE-8111: IndexOrDocValuesQuery Javadoc references outdated method name.

View File

@ -89,7 +89,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
StringBuilder buf = new StringBuilder();
byte v = vspace.get(k++);
while (v != 0) {
char c = (char) ((v >>> 4) - 1 + '0');
char c = (char) (((v & 0xf0 )>>> 4) - 1 + '0');
buf.append(c);
c = (char) (v & 0x0f);
if (c == 0) {
@ -151,7 +151,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer {
StringBuilder buf = new StringBuilder();
byte v = vspace.get(k++);
while (v != 0) {
char c = (char) ((v >>> 4) - 1);
char c = (char) (((v & 0xf0 )>>> 4) - 1);
buf.append(c);
c = (char) (v & 0x0f);
if (c == 0) {

View File

@ -262,6 +262,21 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
}
}
/**
 * Runs {@link HyphenationCompoundWordTokenFilter} over the single token
 * "Rindfleisch", using the hyphenation tree loaded from the
 * {@code hyphenation-LUCENE-8124.xml} test resource, and checks the emitted tokens.
 */
public void testLucene8124() throws Exception {
  // Resolve the grammar relative to this test class and hand it to the SAX input source as a URL.
  String grammarUrl = getClass().getResource("hyphenation-LUCENE-8124.xml").toExternalForm();
  InputSource grammarSource = new InputSource(grammarUrl);
  HyphenationTree tree = HyphenationCompoundWordTokenFilter.getHyphenationTree(grammarSource);

  HyphenationCompoundWordTokenFilter filter =
      new HyphenationCompoundWordTokenFilter(whitespaceMockTokenizer("Rindfleisch"), tree);

  // TODO: "Rindfleisch" being emitted twice is a separate issue of
  // HyphenationCompoundWordTokenFilter; the expectation below pins the current output.
  String[] expectedTokens = new String[] { "Rindfleisch", "Rind", "Rindfleisch", "fleisch"};
  assertTokenStreamContents(filter, expectedTokens);
}
public static interface MockRetainAttribute extends Attribute {
void setRetain(boolean attr);