diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 6b90215e935..e95d066a7f3 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -146,6 +146,9 @@ Bug Fixes * LUCENE-8130: Fix NullPointerException from TermStates.toString() (Mike McCandless) +* LUCENE-8124: Fixed HyphenationCompoundWordTokenFilter to correctly handle + hyphenation patterns with indicator >= 7. (Holger Bruch via Adrien Grand) + Other * LUCENE-8111: IndexOrDocValuesQuery Javadoc references outdated method name. diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java index 0f7dd2b5c48..3c72b4fe045 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/compound/hyphenation/HyphenationTree.java @@ -89,7 +89,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer { StringBuilder buf = new StringBuilder(); byte v = vspace.get(k++); while (v != 0) { - char c = (char) ((v >>> 4) - 1 + '0'); + char c = (char) (((v & 0xf0 )>>> 4) - 1 + '0'); buf.append(c); c = (char) (v & 0x0f); if (c == 0) { @@ -151,7 +151,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer { StringBuilder buf = new StringBuilder(); byte v = vspace.get(k++); while (v != 0) { - char c = (char) ((v >>> 4) - 1); + char c = (char) (((v & 0xf0 )>>> 4) - 1); buf.append(c); c = (char) (v & 0x0f); if (c == 0) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java index ed3abe45b54..67a1bb42920 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java +++ 
b/lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java @@ -262,6 +262,21 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase { } } + + public void testLucene8124() throws Exception { + InputSource is = new InputSource(getClass().getResource("hyphenation-LUCENE-8124.xml").toExternalForm()); + HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter + .getHyphenationTree(is); + + HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter( + whitespaceMockTokenizer( + "Rindfleisch"), + hyphenator); + + // TODO Rindfleisch returned twice is another issue of the HyphenationCompoundWordTokenFilter + assertTokenStreamContents(tf, new String[] { "Rindfleisch", "Rind", "Rindfleisch", "fleisch"}); + } + public static interface MockRetainAttribute extends Attribute { void setRetain(boolean attr);