mirror of https://github.com/apache/lucene.git

Revert "LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens"

This reverts commit 972df6c69d.

parent 4bfcbc5c60
commit fafbb2635d
CHANGES.txt
@@ -136,8 +136,6 @@ Improvements
   position sensitive (e.g. part of a phrase) by having an accurate freq.
   (David Smiley)
 
-* LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir)
-
 * LUCENE-8129: A Unicode set filter can now be specified when using ICUFoldingFilter.
   (Ere Maijala)
 
BreakIteratorWrapper.java
@@ -16,84 +16,152 @@
  */
 package org.apache.lucene.analysis.icu.segmentation;
 
+import java.text.CharacterIterator;
+
 import com.ibm.icu.lang.UCharacter;
-import com.ibm.icu.lang.UProperty;
 import com.ibm.icu.text.BreakIterator;
 import com.ibm.icu.text.RuleBasedBreakIterator;
 import com.ibm.icu.text.UTF16;
-import com.ibm.icu.text.UnicodeSet;
 
 /**
- * Wraps RuleBasedBreakIterator, making object reuse convenient and
- * emitting a rule status for emoji sequences.
+ * Contain all the issues surrounding BreakIterators in ICU in one place.
+ * Basically this boils down to the fact that they aren't very friendly to any
+ * sort of OO design.
+ * <p>
+ * http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
+ * BreakIterator from RuleBasedBreakIterator
+ * <p>
+ * DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but
+ * doesn't actually behave as a subclass: it always returns 0 for
+ * getRuleStatus():
+ * http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
+ * tags
  * @lucene.experimental
  */
-final class BreakIteratorWrapper {
-  private final CharArrayIterator textIterator = new CharArrayIterator();
-  private final RuleBasedBreakIterator rbbi;
-  private char text[];
-  private int start;
-  private int status;
-
-  BreakIteratorWrapper(RuleBasedBreakIterator rbbi) {
-    this.rbbi = rbbi;
-  }
-
-  int current() {
-    return rbbi.current();
-  }
-
-  int getRuleStatus() {
-    return status;
-  }
-
-  int next() {
-    int current = rbbi.current();
-    int next = rbbi.next();
-    status = calcStatus(current, next);
-    return next;
-  }
-
-  /** Returns current rule status for the text between breaks. (determines token type) */
-  private int calcStatus(int current, int next) {
-    // to support presentation selectors, we need to handle alphanum, num, and none at least, so currently not worth optimizing.
-    // https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3AEmoji%3A%5D-%5B%3AEmoji_Presentation%3A%5D&g=Word_Break&i=
-    if (next != BreakIterator.DONE && isEmoji(current, next)) {
-      return ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS;
-    } else {
-      return rbbi.getRuleStatus();
-    }
-  }
-
-  // See unicode doc L2/16-315 and also the RBBI rules for rationale.
-  // we don't include regional indicators here, because they aren't ambiguous for tagging,
-  // they need only be treated special for segmentation.
-  static final UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").freeze();
-
-  /** Returns true if the current text represents emoji character or sequence */
-  private boolean isEmoji(int current, int next) {
-    int begin = start + current;
-    int end = start + next;
-    int codepoint = UTF16.charAt(text, 0, end, begin);
-    // TODO: this can be made more aggressive and future-proof if it uses [:Extended_Pictographic:]
-    if (UCharacter.hasBinaryProperty(codepoint, UProperty.EMOJI)) {
-      if (EMOJI_RK.contains(codepoint)) {
-        // if its in EmojiRK, we don't treat it as emoji unless there is evidence it forms emoji sequence,
-        // an emoji presentation selector or keycap follows.
-        int trailer = begin + Character.charCount(codepoint);
-        return trailer < end && (text[trailer] == 0xFE0F || text[trailer] == 0x20E3);
-      } else {
-        return true;
-      }
-    }
-    return false;
-  }
+abstract class BreakIteratorWrapper {
+  protected final CharArrayIterator textIterator = new CharArrayIterator();
+  protected char text[];
+  protected int start;
+  protected int length;
+
+  abstract int next();
+  abstract int current();
+  abstract int getRuleStatus();
+  abstract void setText(CharacterIterator text);
 
   void setText(char text[], int start, int length) {
     this.text = text;
     this.start = start;
+    this.length = length;
     textIterator.setText(text, start, length);
-    rbbi.setText(textIterator);
-    status = RuleBasedBreakIterator.WORD_NONE;
+    setText(textIterator);
+  }
+
+  /**
+   * If it's a RuleBasedBreakIterator, the rule status can be used for token type. If it's
+   * any other BreakIterator, the rulestatus method is not available, so treat
+   * it like a generic BreakIterator.
+   */
+  static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
+    if (breakIterator instanceof RuleBasedBreakIterator)
+      return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
+    else
+      return new BIWrapper(breakIterator);
+  }
+
+  /**
+   * RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as it's not
+   * a DictionaryBasedBreakIterator) behaves correctly.
+   */
+  static final class RBBIWrapper extends BreakIteratorWrapper {
+    private final RuleBasedBreakIterator rbbi;
+
+    RBBIWrapper(RuleBasedBreakIterator rbbi) {
+      this.rbbi = rbbi;
+    }
+
+    @Override
+    int current() {
+      return rbbi.current();
+    }
+
+    @Override
+    int getRuleStatus() {
+      return rbbi.getRuleStatus();
+    }
+
+    @Override
+    int next() {
+      return rbbi.next();
+    }
+
+    @Override
+    void setText(CharacterIterator text) {
+      rbbi.setText(text);
+    }
+  }
+
+  /**
+   * Generic BreakIterator wrapper: Either the rulestatus method is not
+   * available or always returns 0. Calculate a rulestatus here so it behaves
+   * like RuleBasedBreakIterator.
+   *
+   * Note: This is slower than RuleBasedBreakIterator.
+   */
+  static final class BIWrapper extends BreakIteratorWrapper {
+    private final BreakIterator bi;
+    private int status;
+
+    BIWrapper(BreakIterator bi) {
+      this.bi = bi;
+    }
+
+    @Override
+    int current() {
+      return bi.current();
+    }
+
+    @Override
+    int getRuleStatus() {
+      return status;
+    }
+
+    @Override
+    int next() {
+      int current = bi.current();
+      int next = bi.next();
+      status = calcStatus(current, next);
+      return next;
+    }
+
+    private int calcStatus(int current, int next) {
+      if (current == BreakIterator.DONE || next == BreakIterator.DONE)
+        return RuleBasedBreakIterator.WORD_NONE;
+
+      int begin = start + current;
+      int end = start + next;
+
+      int codepoint;
+      for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
+        codepoint = UTF16.charAt(text, 0, end, begin);
+
+        if (UCharacter.isDigit(codepoint))
+          return RuleBasedBreakIterator.WORD_NUMBER;
+        else if (UCharacter.isLetter(codepoint)) {
+          // TODO: try to separately specify ideographic, kana?
+          // [currently all bundled as letter for this case]
+          return RuleBasedBreakIterator.WORD_LETTER;
+        }
+      }
+
+      return RuleBasedBreakIterator.WORD_NONE;
+    }
+
+    @Override
+    void setText(CharacterIterator text) {
+      bi.setText(text);
+      status = RuleBasedBreakIterator.WORD_NONE;
+    }
   }
 }
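For context on the restored design: a RuleBasedBreakIterator already knows the rule status of each boundary, while a generic BreakIterator does not, so BIWrapper recomputes one by scanning the segment. Below is a minimal, self-contained sketch of that fallback (our own demo class, not part of the patch; it walks a String rather than the wrapper's char[] buffer):

    import com.ibm.icu.lang.UCharacter;
    import com.ibm.icu.text.BreakIterator;
    import com.ibm.icu.text.RuleBasedBreakIterator;

    public class StatusFallbackDemo {
      // Mirrors BIWrapper.calcStatus(): derive a word status for [begin, end)
      // when the iterator cannot supply one itself.
      static int fallbackStatus(String text, int begin, int end) {
        for (int i = begin; i < end; ) {
          int cp = text.codePointAt(i);
          if (UCharacter.isDigit(cp)) return RuleBasedBreakIterator.WORD_NUMBER;
          if (UCharacter.isLetter(cp)) return RuleBasedBreakIterator.WORD_LETTER;
          i += Character.charCount(cp);
        }
        return RuleBasedBreakIterator.WORD_NONE; // punctuation, whitespace, ...
      }

      public static void main(String[] args) {
        String s = "abc 123 ...";
        BreakIterator bi = BreakIterator.getWordInstance();
        bi.setText(s);
        for (int start = bi.first(), end = bi.next();
             end != BreakIterator.DONE; start = end, end = bi.next()) {
          System.out.println("'" + s.substring(start, end) + "' -> " + fallbackStatus(s, start, end));
        }
      }
    }

Note the loop advances by code point, not by char, so supplementary characters are classified whole.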

CompositeBreakIterator.java
@@ -123,7 +123,7 @@ final class CompositeBreakIterator {
 
   private BreakIteratorWrapper getBreakIterator(int scriptCode) {
     if (wordBreakers[scriptCode] == null)
-      wordBreakers[scriptCode] = new BreakIteratorWrapper(config.getBreakIterator(scriptCode));
+      wordBreakers[scriptCode] = BreakIteratorWrapper.wrap(config.getBreakIterator(scriptCode));
     return wordBreakers[scriptCode];
   }
 }
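The one-line change above is the visible end of the design swap: a constructor that demanded a RuleBasedBreakIterator becomes a static factory that can return a different wrapper per concrete argument type. A tiny illustration of that dispatch shape (demo class and strings are ours, not Lucene's):

    import com.ibm.icu.text.BreakIterator;
    import com.ibm.icu.text.RuleBasedBreakIterator;

    public class WrapDispatchDemo {
      // Same shape as BreakIteratorWrapper.wrap(): inspect the concrete type,
      // then choose an implementation strategy.
      static String strategyFor(BreakIterator bi) {
        return (bi instanceof RuleBasedBreakIterator)
            ? "RBBIWrapper: trust getRuleStatus() from the rules"
            : "BIWrapper: recompute a status by scanning the segment";
      }

      public static void main(String[] args) {
        // The stock word instance is rule-based in practice; any other
        // BreakIterator subclass would take the generic path.
        System.out.println(strategyFor(BreakIterator.getWordInstance()));
      }
    }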

DefaultICUTokenizerConfig.java
@@ -52,8 +52,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
   public static final String WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
   /** Token type for words that appear to be numbers */
   public static final String WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
-  /** Token type for words that appear to be emoji sequences */
-  public static final String WORD_EMOJI = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI];
 
   /*
    * the default breakiterators in use. these can be expensive to
@@ -67,9 +65,9 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
   // maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
 
   // the same as ROOT, except no dictionary segmentation for cjk
-  private static final RuleBasedBreakIterator defaultBreakIterator =
+  private static final BreakIterator defaultBreakIterator =
     readBreakIterator("Default.brk");
-  private static final RuleBasedBreakIterator myanmarSyllableIterator =
+  private static final BreakIterator myanmarSyllableIterator =
     readBreakIterator("MyanmarSyllable.brk");
 
   // TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
@@ -97,16 +95,16 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
   }
 
   @Override
-  public RuleBasedBreakIterator getBreakIterator(int script) {
+  public BreakIterator getBreakIterator(int script) {
     switch(script) {
-      case UScript.JAPANESE: return (RuleBasedBreakIterator)cjkBreakIterator.clone();
+      case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
       case UScript.MYANMAR:
         if (myanmarAsWords) {
-          return (RuleBasedBreakIterator)defaultBreakIterator.clone();
+          return (BreakIterator)defaultBreakIterator.clone();
         } else {
-          return (RuleBasedBreakIterator)myanmarSyllableIterator.clone();
+          return (BreakIterator)myanmarSyllableIterator.clone();
         }
-      default: return (RuleBasedBreakIterator)defaultBreakIterator.clone();
+      default: return (BreakIterator)defaultBreakIterator.clone();
     }
   }
 
@@ -121,8 +119,6 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
         return script == UScript.HANGUL ? WORD_HANGUL : WORD_LETTER;
       case RuleBasedBreakIterator.WORD_NUMBER:
         return WORD_NUMBER;
-      case EMOJI_SEQUENCE_STATUS:
-        return WORD_EMOJI;
       default: /* some other custom code */
         return "<OTHER>";
     }
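Loosening the field and return types to BreakIterator is safe here because callers never rely on rule-based behavior directly; they only clone. BreakIterator is stateful (it holds the current text and position), which is why the shared prototypes are cloned per call, as in every branch of the switch above. A small sketch of that clone-per-use pattern (demo class is ours):

    import com.ibm.icu.text.BreakIterator;

    public class ClonePerUseDemo {
      // A shared prototype must never be iterated directly; hand out clones,
      // as DefaultICUTokenizerConfig.getBreakIterator() does above.
      private static final BreakIterator PROTOTYPE = BreakIterator.getWordInstance();

      static BreakIterator newWordIterator() {
        return (BreakIterator) PROTOTYPE.clone(); // cheap vs. rebuilding from rules
      }

      public static void main(String[] args) {
        BreakIterator a = newWordIterator();
        BreakIterator b = newWordIterator();
        a.setText("one two");
        b.setText("three four");
        System.out.println(a.next() + " vs " + b.next()); // 3 vs 5: independent state
      }
    }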

ICUTokenizerConfig.java
@@ -16,7 +16,8 @@
  */
 package org.apache.lucene.analysis.icu.segmentation;
 
-import com.ibm.icu.text.RuleBasedBreakIterator;
+
+import com.ibm.icu.text.BreakIterator;
 
 /**
  * Class that allows for tailored Unicode Text Segmentation on
@@ -24,8 +25,6 @@ import com.ibm.icu.text.RuleBasedBreakIterator;
  * @lucene.experimental
  */
 public abstract class ICUTokenizerConfig {
-  /** Rule status for emoji sequences */
-  public static final int EMOJI_SEQUENCE_STATUS = 299;
 
   /**
    * Sole constructor. (For invocation by subclass
@@ -33,7 +32,7 @@ public abstract class ICUTokenizerConfig {
    */
   public ICUTokenizerConfig() {}
   /** Return a breakiterator capable of processing a given script. */
-  public abstract RuleBasedBreakIterator getBreakIterator(int script);
+  public abstract BreakIterator getBreakIterator(int script);
   /** Return a token type value for a given script and BreakIterator
    * rule status. */
   public abstract String getType(int script, int ruleStatus);
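With getBreakIterator() reverted to return BreakIterator, a tailored config may hand back any iterator implementation and the package-private wrapper adapts it. A hedged sketch of a custom config against the post-revert API (the override mirrors what ICUTokenizerFactory does in the next hunk; the no-arg word instance is purely illustrative):

    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig;
    import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;
    import org.apache.lucene.analysis.icu.segmentation.ICUTokenizerConfig;

    import com.ibm.icu.text.BreakIterator;

    public class CustomConfigDemo {
      static Tokenizer customTokenizer() {
        ICUTokenizerConfig config = new DefaultICUTokenizerConfig(true, true) {
          @Override
          public BreakIterator getBreakIterator(int script) {
            // any BreakIterator satisfies the reverted contract
            return BreakIterator.getWordInstance();
          }
        };
        return new ICUTokenizer(config);
      }
    }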

ICUTokenizerFactory.java
@@ -116,9 +116,9 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoaderAware {
       config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords) {
         
         @Override
-        public RuleBasedBreakIterator getBreakIterator(int script) {
+        public BreakIterator getBreakIterator(int script) {
           if (breakers[script] != null) {
-            return (RuleBasedBreakIterator) breakers[script].clone();
+            return (BreakIterator) breakers[script].clone();
           } else {
             return super.getBreakIterator(script);
           }
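For callers the factory is unchanged: it still resolves optional per-script rule files in inform() and hands out clones via the (now BreakIterator-typed) override above. A hedged usage sketch, assuming the Lucene 7.x package layout; the rulefiles value and the .rbbi resource name are illustrative and must actually resolve on the classpath for inform() to succeed:

    import java.util.HashMap;
    import java.util.Map;

    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.icu.segmentation.ICUTokenizerFactory;
    import org.apache.lucene.analysis.util.ClasspathResourceLoader;

    public class FactoryUsageDemo {
      public static void main(String[] args) throws Exception {
        Map<String, String> params = new HashMap<>();
        params.put("rulefiles", "Latn:my-latin-rules.rbbi"); // illustrative resource
        ICUTokenizerFactory factory = new ICUTokenizerFactory(params);
        factory.inform(new ClasspathResourceLoader(FactoryUsageDemo.class));
        Tokenizer tokenizer = factory.create();
      }
    }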

TestICUTokenizer.java
@@ -16,10 +16,13 @@
  */
 package org.apache.lucene.analysis.icu.segmentation;
 
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
 import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
 
 import com.ibm.icu.lang.UScript;
@@ -73,7 +76,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
         Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
-        return new TokenStreamComponents(tokenizer);
+        TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
+        return new TokenStreamComponents(tokenizer, filter);
       }
     };
   }
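The createComponents() change explains every lowercased expectation in the test hunks that follow: the test analyzer regains an ICUNormalizer2Filter, which by default applies NFKC with case folding (nfkc_cf). A standalone sketch of that folding (demo class is ours):

    import com.ibm.icu.text.Normalizer2;

    public class CaseFoldDemo {
      public static void main(String[] args) {
        // the same default instance ICUNormalizer2Filter uses
        Normalizer2 n = Normalizer2.getNFKCCasefoldInstance();
        System.out.println(n.normalize("B2B"));          // b2b
        System.out.println(n.normalize("εθελοντές"));    // εθελοντέσ (final sigma folds to σ)
        System.out.println(n.normalize("Վիքիպեդիայի"));  // վիքիպեդիայի
      }
    }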
@@ -86,8 +90,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
 
   public void testArmenian() throws Exception {
     assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
-        new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
-        "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "Վիքիպեդիայի", "կայքը" } );
+        new String[] { "վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
+        "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "վիքիպեդիայի", "կայքը" } );
   }
 
   public void testAmharic() throws Exception {
@@ -98,12 +102,12 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   public void testArabic() throws Exception {
     assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
-        new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
-        "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" } );
+        new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
+        "بالإنجليزية", "truth", "in", "numbers", "the", "wikipedia", "story", "سيتم", "إطلاقه", "في", "2008" } );
   }
 
   public void testAramaic() throws Exception {
     assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
-        new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "Wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
+        new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
         "ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"});
   }
 
@@ -121,7 +125,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
 
   public void testGreek() throws Exception {
     assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
-        new String[] { "Γράφεται", "σε", "συνεργασία", "από", "εθελοντές", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
+        new String[] { "γράφεται", "σε", "συνεργασία", "από", "εθελοντέσ", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
         "σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
   }
 
@@ -152,7 +156,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
    */
   public void testChinese() throws Exception {
     assertAnalyzesTo(a, "我是中国人。 1234 Tests ",
-        new String[] { "我", "是", "中", "国", "人", "1234", "Tests"});
+        new String[] { "我", "是", "中", "国", "人", "1234", "tests"});
   }
 
   public void testHebrew() throws Exception {
@@ -182,8 +186,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
   /* Tests from StandardAnalyzer, just to show behavior is similar */
   public void testAlphanumericSA() throws Exception {
     // alphanumeric tokens
-    assertAnalyzesTo(a, "B2B", new String[]{"B2B"});
-    assertAnalyzesTo(a, "2B", new String[]{"2B"});
+    assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
+    assertAnalyzesTo(a, "2B", new String[]{"2b"});
   }
 
   public void testDelimitersSA() throws Exception {
@@ -195,34 +199,34 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
 
   public void testApostrophesSA() throws Exception {
     // internal apostrophes: O'Reilly, you're, O'Reilly's
-    assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"});
+    assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
     assertAnalyzesTo(a, "you're", new String[]{"you're"});
     assertAnalyzesTo(a, "she's", new String[]{"she's"});
-    assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"});
+    assertAnalyzesTo(a, "Jim's", new String[]{"jim's"});
     assertAnalyzesTo(a, "don't", new String[]{"don't"});
-    assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"});
+    assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly's"});
   }
 
   public void testNumericSA() throws Exception {
     // floating point, serial, model numbers, ip addresses, etc.
     // every other segment must have at least one digit
     assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
-    assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
+    assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
     assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
     assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
   }
 
   public void testTextWithNumbersSA() throws Exception {
     // numbers
-    assertAnalyzesTo(a, "David has 5000 bones", new String[]{"David", "has", "5000", "bones"});
+    assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"});
   }
 
   public void testVariousTextSA() throws Exception {
     // various
-    assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"});
-    assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"});
-    assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"});
-    assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"});
+    assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"});
+    assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"});
+    assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"});
+    assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
   }
 
   public void testKoreanSA() throws Exception {
@@ -238,14 +242,14 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
 
   public void testOffsets() throws Exception {
     assertAnalyzesTo(a, "David has 5000 bones",
-        new String[] {"David", "has", "5000", "bones"},
+        new String[] {"david", "has", "5000", "bones"},
         new int[] {0, 6, 10, 15},
         new int[] {5, 9, 14, 20});
   }
 
   public void testTypes() throws Exception {
     assertAnalyzesTo(a, "David has 5000 bones",
-        new String[] {"David", "has", "5000", "bones"},
+        new String[] {"david", "has", "5000", "bones"},
         new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
   }
 
@@ -261,61 +265,6 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
         new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
   }
 
-  /** simple emoji */
-  public void testEmoji() throws Exception {
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩 💩💩",
-        new String[] { "💩", "💩", "💩" },
-        new String[] { "<EMOJI>", "<EMOJI>", "<EMOJI>" });
-  }
-
-  /** emoji zwj sequence */
-  public void testEmojiSequence() throws Exception {
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "👩❤️👩",
-        new String[] { "👩❤️👩" },
-        new String[] { "<EMOJI>" });
-  }
-
-  /** emoji zwj sequence with fitzpatrick modifier */
-  public void testEmojiSequenceWithModifier() throws Exception {
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "👨🏼⚕️",
-        new String[] { "👨🏼⚕️" },
-        new String[] { "<EMOJI>" });
-  }
-
-  /** regional indicator */
-  public void testEmojiRegionalIndicator() throws Exception {
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "🇺🇸🇺🇸",
-        new String[] { "🇺🇸", "🇺🇸" },
-        new String[] { "<EMOJI>", "<EMOJI>" });
-  }
-
-  /** variation sequence */
-  public void testEmojiVariationSequence() throws Exception {
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣",
-        new String[] { "#️⃣" },
-        new String[] { "<EMOJI>" });
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "3️⃣",
-        new String[] { "3️⃣",},
-        new String[] { "<EMOJI>" });
-  }
-
-  public void testEmojiTagSequence() throws Exception {
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "🏴",
-        new String[] { "🏴" },
-        new String[] { "<EMOJI>" });
-  }
-
-  public void testEmojiTokenization() throws Exception {
-    // simple emoji around latin
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "poo💩poo",
-        new String[] { "poo", "💩", "poo" },
-        new String[] { "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>" });
-    // simple emoji around non-latin
-    BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩中國💩",
-        new String[] { "💩", "中", "國", "💩" },
-        new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
-  }
-
   /** blast some random strings through the analyzer */
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);

TestICUTokenizerCJK.java
@@ -78,15 +78,6 @@ public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
     );
   }
 
-  /**
-   * dictionary segmentation with emoji
-   */
-  public void testSimpleJapaneseWithEmoji() throws Exception {
-    assertAnalyzesTo(a, "それはまだ実験段階にあります💩",
-        new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます", "💩" }
-    );
-  }
-
   public void testJapaneseTypes() throws Exception {
     assertAnalyzesTo(a, "仮名遣い カタカナ",
         new String[] { "仮名遣い", "カタカナ" },

StandardTokenizer.java
@@ -54,8 +54,6 @@ public final class StandardTokenizer extends Tokenizer {
   public static final int KATAKANA = 5;
   /** Hangul token type */
   public static final int HANGUL = 6;
-  /** Emoji token type. */
-  public static final int EMOJI = 7;
 
   /** String token types that correspond to token type int constants */
   public static final String [] TOKEN_TYPES = new String [] {
@@ -65,8 +63,7 @@ public final class StandardTokenizer extends Tokenizer {
     "<IDEOGRAPHIC>",
     "<HIRAGANA>",
     "<KATAKANA>",
-    "<HANGUL>",
-    "<EMOJI>"
+    "<HANGUL>"
   };
 
   /** Absolute maximum sized token */
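The array edit and the constant removal belong together: TOKEN_TYPES is indexed by the int constants, so dropping EMOJI = 7 without also dropping "<EMOJI>" (or vice versa) would misalign every type lookup. A tiny check of that invariant (demo class is ours):

    import org.apache.lucene.analysis.standard.StandardTokenizer;

    public class TokenTypesDemo {
      public static void main(String[] args) {
        // each constant must index its own label
        System.out.println(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM]);    // <NUM>
        System.out.println(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL]); // <HANGUL>
      }
    }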