LUCENE-8175: un-revert "LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens"

This was a casualty of war because it relied on new Unicode stuff.
This commit is contained in:
Robert Muir 2018-03-26 18:31:39 -04:00
parent bdfe1e69e6
commit 23bff7dbc2
9 changed files with 174 additions and 172 deletions

View File

@ -174,6 +174,8 @@ Improvements
position sensitive (e.g. part of a phrase) by having an accurate freq. position sensitive (e.g. part of a phrase) by having an accurate freq.
(David Smiley) (David Smiley)
* LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir)
* LUCENE-8129: A Unicode set filter can now be specified when using ICUFoldingFilter. * LUCENE-8129: A Unicode set filter can now be specified when using ICUFoldingFilter.
(Ere Maijala) (Ere Maijala)

View File

@ -16,152 +16,84 @@
*/ */
package org.apache.lucene.analysis.icu.segmentation; package org.apache.lucene.analysis.icu.segmentation;
import java.text.CharacterIterator;
import com.ibm.icu.lang.UCharacter; import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.BreakIterator; import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator; import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
/** /**
* Contain all the issues surrounding BreakIterators in ICU in one place. * Wraps RuleBasedBreakIterator, making object reuse convenient and
* Basically this boils down to the fact that they aren't very friendly to any * emitting a rule status for emoji sequences.
* sort of OO design.
* <p>
* http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
* BreakIterator from RuleBasedBreakIterator
* <p>
* DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but
* doesn't actually behave as a subclass: it always returns 0 for
* getRuleStatus():
* http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
* tags
* @lucene.experimental * @lucene.experimental
*/ */
abstract class BreakIteratorWrapper { final class BreakIteratorWrapper {
protected final CharArrayIterator textIterator = new CharArrayIterator(); private final CharArrayIterator textIterator = new CharArrayIterator();
protected char text[];
protected int start;
protected int length;
abstract int next();
abstract int current();
abstract int getRuleStatus();
abstract void setText(CharacterIterator text);
void setText(char text[], int start, int length) {
this.text = text;
this.start = start;
this.length = length;
textIterator.setText(text, start, length);
setText(textIterator);
}
/**
* If it's a RuleBasedBreakIterator, the rule status can be used for token type. If it's
* any other BreakIterator, the rulestatus method is not available, so treat
* it like a generic BreakIterator.
*/
static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
if (breakIterator instanceof RuleBasedBreakIterator)
return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
else
return new BIWrapper(breakIterator);
}
/**
* RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as it's not
* a DictionaryBasedBreakIterator) behaves correctly.
*/
static final class RBBIWrapper extends BreakIteratorWrapper {
private final RuleBasedBreakIterator rbbi; private final RuleBasedBreakIterator rbbi;
private char text[];
private int start;
private int status;
RBBIWrapper(RuleBasedBreakIterator rbbi) { BreakIteratorWrapper(RuleBasedBreakIterator rbbi) {
this.rbbi = rbbi; this.rbbi = rbbi;
} }
@Override
int current() { int current() {
return rbbi.current(); return rbbi.current();
} }
@Override
int getRuleStatus() {
return rbbi.getRuleStatus();
}
@Override
int next() {
return rbbi.next();
}
@Override
void setText(CharacterIterator text) {
rbbi.setText(text);
}
}
/**
* Generic BreakIterator wrapper: Either the rulestatus method is not
* available or always returns 0. Calculate a rulestatus here so it behaves
* like RuleBasedBreakIterator.
*
* Note: This is slower than RuleBasedBreakIterator.
*/
static final class BIWrapper extends BreakIteratorWrapper {
private final BreakIterator bi;
private int status;
BIWrapper(BreakIterator bi) {
this.bi = bi;
}
@Override
int current() {
return bi.current();
}
@Override
int getRuleStatus() { int getRuleStatus() {
return status; return status;
} }
@Override
int next() { int next() {
int current = bi.current(); int current = rbbi.current();
int next = bi.next(); int next = rbbi.next();
status = calcStatus(current, next); status = calcStatus(current, next);
return next; return next;
} }
/** Returns current rule status for the text between breaks. (determines token type) */
private int calcStatus(int current, int next) { private int calcStatus(int current, int next) {
if (current == BreakIterator.DONE || next == BreakIterator.DONE) // to support presentation selectors, we need to handle alphanum, num, and none at least, so currently not worth optimizing.
return RuleBasedBreakIterator.WORD_NONE; // https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3AEmoji%3A%5D-%5B%3AEmoji_Presentation%3A%5D&g=Word_Break&i=
if (next != BreakIterator.DONE && isEmoji(current, next)) {
return ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS;
} else {
return rbbi.getRuleStatus();
}
}
// See unicode doc L2/16-315 and also the RBBI rules for rationale.
// we don't include regional indicators here, because they aren't ambiguous for tagging,
// they need only be treated special for segmentation.
static final UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").freeze();
/** Returns true if the current text represents emoji character or sequence */
private boolean isEmoji(int current, int next) {
int begin = start + current; int begin = start + current;
int end = start + next; int end = start + next;
int codepoint = UTF16.charAt(text, 0, end, begin);
int codepoint; // TODO: this can be made more aggressive and future-proof if it uses [:Extended_Pictographic:]
for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) { if (UCharacter.hasBinaryProperty(codepoint, UProperty.EMOJI)) {
codepoint = UTF16.charAt(text, 0, end, begin); if (EMOJI_RK.contains(codepoint)) {
// if its in EmojiRK, we don't treat it as emoji unless there is evidence it forms emoji sequence,
if (UCharacter.isDigit(codepoint)) // an emoji presentation selector or keycap follows.
return RuleBasedBreakIterator.WORD_NUMBER; int trailer = begin + Character.charCount(codepoint);
else if (UCharacter.isLetter(codepoint)) { return trailer < end && (text[trailer] == 0xFE0F || text[trailer] == 0x20E3);
// TODO: try to separately specify ideographic, kana? } else {
// [currently all bundled as letter for this case] return true;
return RuleBasedBreakIterator.WORD_LETTER;
} }
} }
return false;
return RuleBasedBreakIterator.WORD_NONE;
} }
@Override void setText(char text[], int start, int length) {
void setText(CharacterIterator text) { this.text = text;
bi.setText(text); this.start = start;
textIterator.setText(text, start, length);
rbbi.setText(textIterator);
status = RuleBasedBreakIterator.WORD_NONE; status = RuleBasedBreakIterator.WORD_NONE;
} }
}
} }

View File

@ -123,7 +123,7 @@ final class CompositeBreakIterator {
private BreakIteratorWrapper getBreakIterator(int scriptCode) { private BreakIteratorWrapper getBreakIterator(int scriptCode) {
if (wordBreakers[scriptCode] == null) if (wordBreakers[scriptCode] == null)
wordBreakers[scriptCode] = BreakIteratorWrapper.wrap(config.getBreakIterator(scriptCode)); wordBreakers[scriptCode] = new BreakIteratorWrapper(config.getBreakIterator(scriptCode));
return wordBreakers[scriptCode]; return wordBreakers[scriptCode];
} }
} }

View File

@ -52,6 +52,8 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
public static final String WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM]; public static final String WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM];
/** Token type for words that appear to be numbers */ /** Token type for words that appear to be numbers */
public static final String WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM]; public static final String WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM];
/** Token type for words that appear to be emoji sequences */
public static final String WORD_EMOJI = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI];
/* /*
* the default breakiterators in use. these can be expensive to * the default breakiterators in use. these can be expensive to
@ -65,9 +67,9 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
// maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html // maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html
// the same as ROOT, except no dictionary segmentation for cjk // the same as ROOT, except no dictionary segmentation for cjk
private static final BreakIterator defaultBreakIterator = private static final RuleBasedBreakIterator defaultBreakIterator =
readBreakIterator("Default.brk"); readBreakIterator("Default.brk");
private static final BreakIterator myanmarSyllableIterator = private static final RuleBasedBreakIterator myanmarSyllableIterator =
readBreakIterator("MyanmarSyllable.brk"); readBreakIterator("MyanmarSyllable.brk");
// TODO: deprecate this boolean? you only care if you are doing super-expert stuff... // TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
@ -95,16 +97,16 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
} }
@Override @Override
public BreakIterator getBreakIterator(int script) { public RuleBasedBreakIterator getBreakIterator(int script) {
switch(script) { switch(script) {
case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone(); case UScript.JAPANESE: return (RuleBasedBreakIterator)cjkBreakIterator.clone();
case UScript.MYANMAR: case UScript.MYANMAR:
if (myanmarAsWords) { if (myanmarAsWords) {
return (BreakIterator)defaultBreakIterator.clone(); return (RuleBasedBreakIterator)defaultBreakIterator.clone();
} else { } else {
return (BreakIterator)myanmarSyllableIterator.clone(); return (RuleBasedBreakIterator)myanmarSyllableIterator.clone();
} }
default: return (BreakIterator)defaultBreakIterator.clone(); default: return (RuleBasedBreakIterator)defaultBreakIterator.clone();
} }
} }
@ -119,6 +121,8 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
return script == UScript.HANGUL ? WORD_HANGUL : WORD_LETTER; return script == UScript.HANGUL ? WORD_HANGUL : WORD_LETTER;
case RuleBasedBreakIterator.WORD_NUMBER: case RuleBasedBreakIterator.WORD_NUMBER:
return WORD_NUMBER; return WORD_NUMBER;
case EMOJI_SEQUENCE_STATUS:
return WORD_EMOJI;
default: /* some other custom code */ default: /* some other custom code */
return "<OTHER>"; return "<OTHER>";
} }

View File

@ -16,8 +16,7 @@
*/ */
package org.apache.lucene.analysis.icu.segmentation; package org.apache.lucene.analysis.icu.segmentation;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.BreakIterator;
/** /**
* Class that allows for tailored Unicode Text Segmentation on * Class that allows for tailored Unicode Text Segmentation on
@ -25,6 +24,8 @@ import com.ibm.icu.text.BreakIterator;
* @lucene.experimental * @lucene.experimental
*/ */
public abstract class ICUTokenizerConfig { public abstract class ICUTokenizerConfig {
/** Rule status for emoji sequences */
public static final int EMOJI_SEQUENCE_STATUS = 299;
/** /**
* Sole constructor. (For invocation by subclass * Sole constructor. (For invocation by subclass
@ -32,7 +33,7 @@ public abstract class ICUTokenizerConfig {
*/ */
public ICUTokenizerConfig() {} public ICUTokenizerConfig() {}
/** Return a breakiterator capable of processing a given script. */ /** Return a breakiterator capable of processing a given script. */
public abstract BreakIterator getBreakIterator(int script); public abstract RuleBasedBreakIterator getBreakIterator(int script);
/** Return a token type value for a given script and BreakIterator /** Return a token type value for a given script and BreakIterator
* rule status. */ * rule status. */
public abstract String getType(int script, int ruleStatus); public abstract String getType(int script, int ruleStatus);

View File

@ -116,9 +116,9 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords) { config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords) {
@Override @Override
public BreakIterator getBreakIterator(int script) { public RuleBasedBreakIterator getBreakIterator(int script) {
if (breakers[script] != null) { if (breakers[script] != null) {
return (BreakIterator) breakers[script].clone(); return (RuleBasedBreakIterator) breakers[script].clone();
} else { } else {
return super.getBreakIterator(script); return super.getBreakIterator(script);
} }

View File

@ -16,13 +16,10 @@
*/ */
package org.apache.lucene.analysis.icu.segmentation; package org.apache.lucene.analysis.icu.segmentation;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase; import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute; import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
import com.ibm.icu.lang.UScript; import com.ibm.icu.lang.UScript;
@ -76,8 +73,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
@Override @Override
protected TokenStreamComponents createComponents(String fieldName) { protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true)); Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
TokenFilter filter = new ICUNormalizer2Filter(tokenizer); return new TokenStreamComponents(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
} }
}; };
} }
@ -90,8 +86,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
public void testArmenian() throws Exception { public void testArmenian() throws Exception {
assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։", assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։",
new String[] { "վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից", new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից",
"ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "վիքիպեդիայի", "կայքը" } ); "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "Վիքիպեդիայի", "կայքը" } );
} }
public void testAmharic() throws Exception { public void testAmharic() throws Exception {
@ -102,12 +98,12 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
public void testArabic() throws Exception { public void testArabic() throws Exception {
assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.", assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.",
new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا", new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا",
"بالإنجليزية", "truth", "in", "numbers", "the", "wikipedia", "story", "سيتم", "إطلاقه", "في", "2008" } ); "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" } );
} }
public void testAramaic() throws Exception { public void testAramaic() throws Exception {
assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀", assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀",
new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ", new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "Wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ",
"ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"}); "ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"});
} }
@ -125,7 +121,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
public void testGreek() throws Exception { public void testGreek() throws Exception {
assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.", assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.",
new String[] { "γράφεται", "σε", "συνεργασία", "από", "εθελοντέσ", "με", "το", "λογισμικό", "wiki", "κάτι", "που", new String[] { "Γράφεται", "σε", "συνεργασία", "από", "εθελοντές", "με", "το", "λογισμικό", "wiki", "κάτι", "που",
"σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" }); "σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" });
} }
@ -156,7 +152,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
*/ */
public void testChinese() throws Exception { public void testChinese() throws Exception {
assertAnalyzesTo(a, "我是中国人。 ", assertAnalyzesTo(a, "我是中国人。 ",
new String[] { "", "", "", "", "", "1234", "tests"}); new String[] { "", "", "", "", "", "", ""});
} }
public void testHebrew() throws Exception { public void testHebrew() throws Exception {
@ -186,8 +182,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
/* Tests from StandardAnalyzer, just to show behavior is similar */ /* Tests from StandardAnalyzer, just to show behavior is similar */
public void testAlphanumericSA() throws Exception { public void testAlphanumericSA() throws Exception {
// alphanumeric tokens // alphanumeric tokens
assertAnalyzesTo(a, "B2B", new String[]{"b2b"}); assertAnalyzesTo(a, "B2B", new String[]{"B2B"});
assertAnalyzesTo(a, "2B", new String[]{"2b"}); assertAnalyzesTo(a, "2B", new String[]{"2B"});
} }
public void testDelimitersSA() throws Exception { public void testDelimitersSA() throws Exception {
@ -199,34 +195,34 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
public void testApostrophesSA() throws Exception { public void testApostrophesSA() throws Exception {
// internal apostrophes: O'Reilly, you're, O'Reilly's // internal apostrophes: O'Reilly, you're, O'Reilly's
assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"}); assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"});
assertAnalyzesTo(a, "you're", new String[]{"you're"}); assertAnalyzesTo(a, "you're", new String[]{"you're"});
assertAnalyzesTo(a, "she's", new String[]{"she's"}); assertAnalyzesTo(a, "she's", new String[]{"she's"});
assertAnalyzesTo(a, "Jim's", new String[]{"jim's"}); assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"});
assertAnalyzesTo(a, "don't", new String[]{"don't"}); assertAnalyzesTo(a, "don't", new String[]{"don't"});
assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly's"}); assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"});
} }
public void testNumericSA() throws Exception { public void testNumericSA() throws Exception {
// floating point, serial, model numbers, ip addresses, etc. // floating point, serial, model numbers, ip addresses, etc.
// every other segment must have at least one digit // every other segment must have at least one digit
assertAnalyzesTo(a, "21.35", new String[]{"21.35"}); assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"}); assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"});
assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"}); assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"}); assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"});
} }
public void testTextWithNumbersSA() throws Exception { public void testTextWithNumbersSA() throws Exception {
// numbers // numbers
assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"}); assertAnalyzesTo(a, "David has 5000 bones", new String[]{"David", "has", "5000", "bones"});
} }
public void testVariousTextSA() throws Exception { public void testVariousTextSA() throws Exception {
// various // various
assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"}); assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"});
assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"}); assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"});
assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"}); assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"});
assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"}); assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"});
} }
public void testKoreanSA() throws Exception { public void testKoreanSA() throws Exception {
@ -242,14 +238,14 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
public void testOffsets() throws Exception { public void testOffsets() throws Exception {
assertAnalyzesTo(a, "David has 5000 bones", assertAnalyzesTo(a, "David has 5000 bones",
new String[] {"david", "has", "5000", "bones"}, new String[] {"David", "has", "5000", "bones"},
new int[] {0, 6, 10, 15}, new int[] {0, 6, 10, 15},
new int[] {5, 9, 14, 20}); new int[] {5, 9, 14, 20});
} }
public void testTypes() throws Exception { public void testTypes() throws Exception {
assertAnalyzesTo(a, "David has 5000 bones", assertAnalyzesTo(a, "David has 5000 bones",
new String[] {"david", "has", "5000", "bones"}, new String[] {"David", "has", "5000", "bones"},
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" }); new String[] { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" });
} }
@ -265,6 +261,61 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" }); new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>" });
} }
/** simple emoji */
public void testEmoji() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩 💩💩",
new String[] { "💩", "💩", "💩" },
new String[] { "<EMOJI>", "<EMOJI>", "<EMOJI>" });
}
/** emoji zwj sequence */
public void testEmojiSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "👩‍❤️‍👩",
new String[] { "👩‍❤️‍👩" },
new String[] { "<EMOJI>" });
}
/** emoji zwj sequence with fitzpatrick modifier */
public void testEmojiSequenceWithModifier() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "👨🏼‍⚕️",
new String[] { "👨🏼‍⚕️" },
new String[] { "<EMOJI>" });
}
/** regional indicator */
public void testEmojiRegionalIndicator() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "🇺🇸🇺🇸",
new String[] { "🇺🇸", "🇺🇸" },
new String[] { "<EMOJI>", "<EMOJI>" });
}
/** variation sequence */
public void testEmojiVariationSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣",
new String[] { "#️⃣" },
new String[] { "<EMOJI>" });
BaseTokenStreamTestCase.assertAnalyzesTo(a, "3",
new String[] { "3",},
new String[] { "<EMOJI>" });
}
public void testEmojiTagSequence() throws Exception {
BaseTokenStreamTestCase.assertAnalyzesTo(a, "🏴󠁧󠁢󠁥󠁮󠁧󠁿",
new String[] { "🏴󠁧󠁢󠁥󠁮󠁧󠁿" },
new String[] { "<EMOJI>" });
}
public void testEmojiTokenization() throws Exception {
// simple emoji around latin
BaseTokenStreamTestCase.assertAnalyzesTo(a, "poo💩poo",
new String[] { "poo", "💩", "poo" },
new String[] { "<ALPHANUM>", "<EMOJI>", "<ALPHANUM>" });
// simple emoji around non-latin
BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩中國💩",
new String[] { "💩", "", "", "💩" },
new String[] { "<EMOJI>", "<IDEOGRAPHIC>", "<IDEOGRAPHIC>", "<EMOJI>" });
}
/** blast some random strings through the analyzer */ /** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception { public void testRandomStrings() throws Exception {
checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER); checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER);

View File

@ -78,6 +78,15 @@ public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
); );
} }
/**
* dictionary segmentation with emoji
*/
public void testSimpleJapaneseWithEmoji() throws Exception {
assertAnalyzesTo(a, "それはまだ実験段階にあります💩",
new String[] { "それ", "", "まだ", "実験", "段階", "", "あり", "ます", "💩" }
);
}
public void testJapaneseTypes() throws Exception { public void testJapaneseTypes() throws Exception {
assertAnalyzesTo(a, "仮名遣い カタカナ", assertAnalyzesTo(a, "仮名遣い カタカナ",
new String[] { "仮名遣い", "カタカナ" }, new String[] { "仮名遣い", "カタカナ" },

View File

@ -54,6 +54,8 @@ public final class StandardTokenizer extends Tokenizer {
public static final int KATAKANA = 5; public static final int KATAKANA = 5;
/** Hangul token type */ /** Hangul token type */
public static final int HANGUL = 6; public static final int HANGUL = 6;
/** Emoji token type. */
public static final int EMOJI = 7;
/** String token types that correspond to token type int constants */ /** String token types that correspond to token type int constants */
public static final String [] TOKEN_TYPES = new String [] { public static final String [] TOKEN_TYPES = new String [] {
@ -63,7 +65,8 @@ public final class StandardTokenizer extends Tokenizer {
"<IDEOGRAPHIC>", "<IDEOGRAPHIC>",
"<HIRAGANA>", "<HIRAGANA>",
"<KATAKANA>", "<KATAKANA>",
"<HANGUL>" "<HANGUL>",
"<EMOJI>"
}; };
/** Absolute maximum sized token */ /** Absolute maximum sized token */