LUCENE-7916: Remove use of deprecated UScript.CODE_LIMIT in ICUTokenizer

This commit is contained in:
Robert Muir 2017-08-07 20:47:30 -04:00
parent a0ad20f5e6
commit a4db6ce3e6
5 changed files with 15 additions and 3 deletions

View File

@ -30,6 +30,11 @@ Bug Fixes
functions (Operations.isFinite and Operations.topsortState) to prevent
large automata from overflowing the stack (Robert Muir, Adrien Grand, Jim Ferenczi)
* LUCENE-7916: Prevent ArrayIndexOutOfBoundsException if ICUTokenizer is used
with a different ICU JAR version than it is compiled against. Note: this is
not recommended, because lucene-analyzers-icu contains binary data structures
specific to the ICU/Unicode versions it is built against. (Chris Koenig, Robert Muir)
======================= Lucene 7.0.0 =======================
New Features

View File

@ -61,6 +61,8 @@ import com.ibm.icu.text.Normalizer2;
* </p>
*/
public final class ICUFoldingFilter extends ICUNormalizer2Filter {
// TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
// maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
private static final Normalizer2 normalizer = Normalizer2.getInstance(
ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
"utr30", Normalizer2.Mode.COMPOSE);

View File

@ -17,6 +17,8 @@
package org.apache.lucene.analysis.icu.segmentation;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.BreakIterator;
@ -38,7 +40,7 @@ import com.ibm.icu.text.BreakIterator;
*/
final class CompositeBreakIterator {
private final ICUTokenizerConfig config;
private final BreakIteratorWrapper wordBreakers[] = new BreakIteratorWrapper[UScript.CODE_LIMIT];
private final BreakIteratorWrapper wordBreakers[] = new BreakIteratorWrapper[1 + UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT)];
private BreakIteratorWrapper rbbi;
private final ScriptIterator scriptIterator;

View File

@ -60,6 +60,10 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
// we keep the cjk breaking separate, that's because it cannot be customized (because dictionary
// is only triggered when kind = WORD, but kind = LINE by default and we have no non-evil way to change it)
private static final BreakIterator cjkBreakIterator = BreakIterator.getWordInstance(ULocale.ROOT);
// TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
// maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
// the same as ROOT, except no dictionary segmentation for cjk
private static final BreakIterator defaultBreakIterator =
readBreakIterator("Default.brk");

View File

@ -33,7 +33,6 @@ import org.apache.lucene.util.IOUtils;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
@ -108,7 +107,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
if (tailored.isEmpty()) {
config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords);
} else {
final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT];
final BreakIterator breakers[] = new BreakIterator[1 + UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT)];
for (Map.Entry<Integer,String> entry : tailored.entrySet()) {
int code = entry.getKey();
String resourcePath = entry.getValue();