mirror of https://github.com/apache/lucene.git
LUCENE-7916: Remove use of deprecated UScript.CODE_LIMIT in ICUTokenizer
This commit is contained in:
parent
a0ad20f5e6
commit
a4db6ce3e6
|
@ -30,6 +30,11 @@ Bug Fixes
|
|||
functions (Operations.isFinite and Operations.topsortState) to prevent
|
||||
large automata from overflowing the stack (Robert Muir, Adrien Grand, Jim Ferenczi)
|
||||
|
||||
* LUCENE-7916: Prevent ArrayIndexOutOfBoundsException if ICUTokenizer is used
|
||||
with a different ICU JAR version than it is compiled against. Note, this is
|
||||
not recommended, because lucene-analyzers-icu contains binary data structures
|
||||
specific to ICU/Unicode versions it is built against. (Chris Koenig, Robert Muir)
|
||||
|
||||
======================= Lucene 7.0.0 =======================
|
||||
|
||||
New Features
|
||||
|
|
|
@ -61,6 +61,8 @@ import com.ibm.icu.text.Normalizer2;
|
|||
* </p>
|
||||
*/
|
||||
public final class ICUFoldingFilter extends ICUNormalizer2Filter {
|
||||
// TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
|
||||
// maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
|
||||
private static final Normalizer2 normalizer = Normalizer2.getInstance(
|
||||
ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
|
||||
"utr30", Normalizer2.Mode.COMPOSE);
|
||||
|
|
|
@ -17,6 +17,8 @@
|
|||
package org.apache.lucene.analysis.icu.segmentation;
|
||||
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
|
||||
|
@ -38,7 +40,7 @@ import com.ibm.icu.text.BreakIterator;
|
|||
*/
|
||||
final class CompositeBreakIterator {
|
||||
private final ICUTokenizerConfig config;
|
||||
private final BreakIteratorWrapper wordBreakers[] = new BreakIteratorWrapper[UScript.CODE_LIMIT];
|
||||
private final BreakIteratorWrapper wordBreakers[] = new BreakIteratorWrapper[1 + UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT)];
|
||||
|
||||
private BreakIteratorWrapper rbbi;
|
||||
private final ScriptIterator scriptIterator;
|
||||
|
|
|
@ -60,6 +60,10 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
|
|||
// we keep the cjk breaking separate, that's because it cannot be customized (because dictionary
|
||||
// is only triggered when kind = WORD, but kind = LINE by default and we have no non-evil way to change it)
|
||||
private static final BreakIterator cjkBreakIterator = BreakIterator.getWordInstance(ULocale.ROOT);
|
||||
|
||||
// TODO: if the wrong version of the ICU jar is used, loading these data files may give a strange error.
|
||||
// maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
|
||||
|
||||
// the same as ROOT, except no dictionary segmentation for cjk
|
||||
private static final BreakIterator defaultBreakIterator =
|
||||
readBreakIterator("Default.brk");
|
||||
|
|
|
@ -33,7 +33,6 @@ import org.apache.lucene.util.IOUtils;
|
|||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
|
||||
|
@ -108,7 +107,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
|
|||
if (tailored.isEmpty()) {
|
||||
config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords);
|
||||
} else {
|
||||
final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT];
|
||||
final BreakIterator breakers[] = new BreakIterator[1 + UCharacter.getIntPropertyMaxValue(UProperty.SCRIPT)];
|
||||
for (Map.Entry<Integer,String> entry : tailored.entrySet()) {
|
||||
int code = entry.getKey();
|
||||
String resourcePath = entry.getValue();
|
||||
|
|
Loading…
Reference in New Issue