From 23bff7dbc207083af2ccb1b308c121ac18c36508 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 26 Mar 2018 18:31:39 -0400 Subject: [PATCH] LUCENE-8175: un-revert "LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens"" This was a casualty of war because it relied on new unicode stuff --- lucene/CHANGES.txt | 2 + .../segmentation/BreakIteratorWrapper.java | 198 ++++++------------ .../segmentation/CompositeBreakIterator.java | 2 +- .../DefaultICUTokenizerConfig.java | 18 +- .../icu/segmentation/ICUTokenizerConfig.java | 9 +- .../icu/segmentation/ICUTokenizerFactory.java | 4 +- .../icu/segmentation/TestICUTokenizer.java | 99 ++++++--- .../icu/segmentation/TestICUTokenizerCJK.java | 9 + .../analysis/standard/StandardTokenizer.java | 5 +- 9 files changed, 174 insertions(+), 172 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 5b703e80c15..84fb260d588 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -174,6 +174,8 @@ Improvements position sensitive (e.g. part of a phrase) by having an accurate freq. (David Smiley) +* LUCENE-8125: ICUTokenizer support for emoji/emoji sequence tokens. (Robert Muir) + * LUCENE-8129: A Unicode set filter can now be specified when using ICUFoldingFilter. 
(Ere Maijala) diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java index d8ecb77d401..9e5050d55b8 100644 --- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/BreakIteratorWrapper.java @@ -16,152 +16,84 @@ */ package org.apache.lucene.analysis.icu.segmentation; - -import java.text.CharacterIterator; - import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; import com.ibm.icu.text.BreakIterator; import com.ibm.icu.text.RuleBasedBreakIterator; import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; /** - * Contain all the issues surrounding BreakIterators in ICU in one place. - * Basically this boils down to the fact that they aren't very friendly to any - * sort of OO design. - *

- * http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to - * BreakIterator from RuleBasedBreakIterator - *

- * DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but - * doesn't actually behave as a subclass: it always returns 0 for - * getRuleStatus(): - * http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type - * tags + * Wraps RuleBasedBreakIterator, making object reuse convenient and + * emitting a rule status for emoji sequences. * @lucene.experimental */ -abstract class BreakIteratorWrapper { - protected final CharArrayIterator textIterator = new CharArrayIterator(); - protected char text[]; - protected int start; - protected int length; +final class BreakIteratorWrapper { + private final CharArrayIterator textIterator = new CharArrayIterator(); + private final RuleBasedBreakIterator rbbi; + private char text[]; + private int start; + private int status; + + BreakIteratorWrapper(RuleBasedBreakIterator rbbi) { + this.rbbi = rbbi; + } + + int current() { + return rbbi.current(); + } - abstract int next(); - abstract int current(); - abstract int getRuleStatus(); - abstract void setText(CharacterIterator text); + int getRuleStatus() { + return status; + } + + int next() { + int current = rbbi.current(); + int next = rbbi.next(); + status = calcStatus(current, next); + return next; + } + + /** Returns current rule status for the text between breaks. (determines token type) */ + private int calcStatus(int current, int next) { + // to support presentation selectors, we need to handle alphanum, num, and none at least, so currently not worth optimizing. + // https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3AEmoji%3A%5D-%5B%3AEmoji_Presentation%3A%5D&g=Word_Break&i= + if (next != BreakIterator.DONE && isEmoji(current, next)) { + return ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS; + } else { + return rbbi.getRuleStatus(); + } + } + + // See unicode doc L2/16-315 and also the RBBI rules for rationale. 
+ // we don't include regional indicators here, because they aren't ambiguous for tagging, + // they need only be treated special for segmentation. + static final UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").freeze(); + + /** Returns true if the current text represents emoji character or sequence */ + private boolean isEmoji(int current, int next) { + int begin = start + current; + int end = start + next; + int codepoint = UTF16.charAt(text, 0, end, begin); + // TODO: this can be made more aggressive and future-proof if it uses [:Extended_Pictographic:] + if (UCharacter.hasBinaryProperty(codepoint, UProperty.EMOJI)) { + if (EMOJI_RK.contains(codepoint)) { + // if its in EmojiRK, we don't treat it as emoji unless there is evidence it forms emoji sequence, + // an emoji presentation selector or keycap follows. + int trailer = begin + Character.charCount(codepoint); + return trailer < end && (text[trailer] == 0xFE0F || text[trailer] == 0x20E3); + } else { + return true; + } + } + return false; + } void setText(char text[], int start, int length) { this.text = text; this.start = start; - this.length = length; textIterator.setText(text, start, length); - setText(textIterator); - } - - /** - * If it's a RuleBasedBreakIterator, the rule status can be used for token type. If it's - * any other BreakIterator, the rulestatus method is not available, so treat - * it like a generic BreakIterator. - */ - static BreakIteratorWrapper wrap(BreakIterator breakIterator) { - if (breakIterator instanceof RuleBasedBreakIterator) - return new RBBIWrapper((RuleBasedBreakIterator) breakIterator); - else - return new BIWrapper(breakIterator); - } - - /** - * RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as it's not - * a DictionaryBasedBreakIterator) behaves correctly. 
- */ - static final class RBBIWrapper extends BreakIteratorWrapper { - private final RuleBasedBreakIterator rbbi; - - RBBIWrapper(RuleBasedBreakIterator rbbi) { - this.rbbi = rbbi; - } - - @Override - int current() { - return rbbi.current(); - } - - @Override - int getRuleStatus() { - return rbbi.getRuleStatus(); - } - - @Override - int next() { - return rbbi.next(); - } - - @Override - void setText(CharacterIterator text) { - rbbi.setText(text); - } - } - - /** - * Generic BreakIterator wrapper: Either the rulestatus method is not - * available or always returns 0. Calculate a rulestatus here so it behaves - * like RuleBasedBreakIterator. - * - * Note: This is slower than RuleBasedBreakIterator. - */ - static final class BIWrapper extends BreakIteratorWrapper { - private final BreakIterator bi; - private int status; - - BIWrapper(BreakIterator bi) { - this.bi = bi; - } - - @Override - int current() { - return bi.current(); - } - - @Override - int getRuleStatus() { - return status; - } - - @Override - int next() { - int current = bi.current(); - int next = bi.next(); - status = calcStatus(current, next); - return next; - } - - private int calcStatus(int current, int next) { - if (current == BreakIterator.DONE || next == BreakIterator.DONE) - return RuleBasedBreakIterator.WORD_NONE; - - int begin = start + current; - int end = start + next; - - int codepoint; - for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) { - codepoint = UTF16.charAt(text, 0, end, begin); - - if (UCharacter.isDigit(codepoint)) - return RuleBasedBreakIterator.WORD_NUMBER; - else if (UCharacter.isLetter(codepoint)) { - // TODO: try to separately specify ideographic, kana? 
- // [currently all bundled as letter for this case] - return RuleBasedBreakIterator.WORD_LETTER; - } - } - - return RuleBasedBreakIterator.WORD_NONE; - } - - @Override - void setText(CharacterIterator text) { - bi.setText(text); - status = RuleBasedBreakIterator.WORD_NONE; - } + rbbi.setText(textIterator); + status = RuleBasedBreakIterator.WORD_NONE; } } diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java index 096eada2de3..3cb39edb92d 100644 --- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java @@ -123,7 +123,7 @@ final class CompositeBreakIterator { private BreakIteratorWrapper getBreakIterator(int scriptCode) { if (wordBreakers[scriptCode] == null) - wordBreakers[scriptCode] = BreakIteratorWrapper.wrap(config.getBreakIterator(scriptCode)); + wordBreakers[scriptCode] = new BreakIteratorWrapper(config.getBreakIterator(scriptCode)); return wordBreakers[scriptCode]; } } diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java index 50a6b4c71d8..10e6c671817 100644 --- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/DefaultICUTokenizerConfig.java @@ -52,6 +52,8 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig { public static final String WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM]; /** Token type for words that appear to be numbers */ public static final String WORD_NUMBER = 
StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM]; + /** Token type for words that appear to be emoji sequences */ + public static final String WORD_EMOJI = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI]; /* * the default breakiterators in use. these can be expensive to @@ -65,9 +67,9 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig { // maybe add an explicit check? http://icu-project.org/apiref/icu4j/com/ibm/icu/util/VersionInfo.html // the same as ROOT, except no dictionary segmentation for cjk - private static final BreakIterator defaultBreakIterator = + private static final RuleBasedBreakIterator defaultBreakIterator = readBreakIterator("Default.brk"); - private static final BreakIterator myanmarSyllableIterator = + private static final RuleBasedBreakIterator myanmarSyllableIterator = readBreakIterator("MyanmarSyllable.brk"); // TODO: deprecate this boolean? you only care if you are doing super-expert stuff... @@ -95,16 +97,16 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig { } @Override - public BreakIterator getBreakIterator(int script) { + public RuleBasedBreakIterator getBreakIterator(int script) { switch(script) { - case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone(); + case UScript.JAPANESE: return (RuleBasedBreakIterator)cjkBreakIterator.clone(); case UScript.MYANMAR: if (myanmarAsWords) { - return (BreakIterator)defaultBreakIterator.clone(); + return (RuleBasedBreakIterator)defaultBreakIterator.clone(); } else { - return (BreakIterator)myanmarSyllableIterator.clone(); + return (RuleBasedBreakIterator)myanmarSyllableIterator.clone(); } - default: return (BreakIterator)defaultBreakIterator.clone(); + default: return (RuleBasedBreakIterator)defaultBreakIterator.clone(); } } @@ -119,6 +121,8 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig { return script == UScript.HANGUL ? 
WORD_HANGUL : WORD_LETTER; case RuleBasedBreakIterator.WORD_NUMBER: return WORD_NUMBER; + case EMOJI_SEQUENCE_STATUS: + return WORD_EMOJI; default: /* some other custom code */ return "<OTHER>"; } diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java index 69694fc0780..e2d3dae3d75 100644 --- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerConfig.java @@ -16,8 +16,7 @@ */ package org.apache.lucene.analysis.icu.segmentation; - -import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.RuleBasedBreakIterator; /** * Class that allows for tailored Unicode Text Segmentation on @@ -25,14 +24,16 @@ import com.ibm.icu.text.BreakIterator; * @lucene.experimental */ public abstract class ICUTokenizerConfig { - + /** Rule status for emoji sequences */ + public static final int EMOJI_SEQUENCE_STATUS = 299; + /** * Sole constructor. (For invocation by subclass * constructors, typically implicit.) */ public ICUTokenizerConfig() {} /** Return a breakiterator capable of processing a given script. */ - public abstract BreakIterator getBreakIterator(int script); + public abstract RuleBasedBreakIterator getBreakIterator(int script); /** Return a token type value for a given script and BreakIterator * rule status. 
*/ public abstract String getType(int script, int ruleStatus); diff --git a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java index 4d29b0c36bc..0cd4cf28e5f 100644 --- a/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java +++ b/lucene/analysis/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizerFactory.java @@ -116,9 +116,9 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa config = new DefaultICUTokenizerConfig(cjkAsWords, myanmarAsWords) { @Override - public BreakIterator getBreakIterator(int script) { + public RuleBasedBreakIterator getBreakIterator(int script) { if (breakers[script] != null) { - return (BreakIterator) breakers[script].clone(); + return (RuleBasedBreakIterator) breakers[script].clone(); } else { return super.getBreakIterator(script); } diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java index 027baa35705..98939752cbe 100644 --- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizer.java @@ -16,13 +16,10 @@ */ package org.apache.lucene.analysis.icu.segmentation; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.icu.ICUNormalizer2Filter; import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute; import com.ibm.icu.lang.UScript; @@ -76,8 +73,7 @@ public class TestICUTokenizer extends 
BaseTokenStreamTestCase { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true)); - TokenFilter filter = new ICUNormalizer2Filter(tokenizer); - return new TokenStreamComponents(tokenizer, filter); + return new TokenStreamComponents(tokenizer); } }; } @@ -90,8 +86,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase { public void testArmenian() throws Exception { assertAnalyzesTo(a, "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) գրվել են կամավորների կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ ով կարող է բացել Վիքիպեդիայի կայքը։", - new String[] { "վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից", - "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "վիքիպեդիայի", "կայքը" } ); + new String[] { "Վիքիպեդիայի", "13", "միլիոն", "հոդվածները", "4,600", "հայերեն", "վիքիպեդիայում", "գրվել", "են", "կամավորների", "կողմից", + "ու", "համարյա", "բոլոր", "հոդվածները", "կարող", "է", "խմբագրել", "ցանկաց", "մարդ", "ով", "կարող", "է", "բացել", "Վիքիպեդիայի", "կայքը" } ); } public void testAmharic() throws Exception { @@ -102,12 +98,12 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase { public void testArabic() throws Exception { assertAnalyzesTo(a, "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة ويكيبيديا\" (بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه في 2008.", new String[] { "الفيلم", "الوثائقي", "الأول", "عن", "ويكيبيديا", "يسمى", "الحقيقة", "بالأرقام", "قصة", "ويكيبيديا", - "بالإنجليزية", "truth", "in", "numbers", "the", "wikipedia", "story", "سيتم", "إطلاقه", "في", "2008" } ); + "بالإنجليزية", "Truth", "in", "Numbers", "The", "Wikipedia", "Story", "سيتم", "إطلاقه", "في", "2008" } ); } public void testAramaic() throws 
Exception { assertAnalyzesTo(a, "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀", - new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ", + new String[] { "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "Wikipedia", "ܗܘ", "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ", "ܕܐܢܛܪܢܛ", "ܒܠܫܢ̈ܐ", "ܣܓܝܐ̈ܐ", "ܫܡܗ", "ܐܬܐ", "ܡܢ", "ܡ̈ܠܬܐ", "ܕ", "ܘܝܩܝ", "ܘ", "ܐܝܢܣܩܠܘܦܕܝܐ"}); } @@ -125,7 +121,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase { public void testGreek() throws Exception { assertAnalyzesTo(a, "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι που σημαίνει ότι άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.", - new String[] { "γράφεται", "σε", "συνεργασία", "από", "εθελοντέσ", "με", "το", "λογισμικό", "wiki", "κάτι", "που", + new String[] { "Γράφεται", "σε", "συνεργασία", "από", "εθελοντές", "με", "το", "λογισμικό", "wiki", "κάτι", "που", "σημαίνει", "ότι", "άρθρα", "μπορεί", "να", "προστεθούν", "ή", "να", "αλλάξουν", "από", "τον", "καθένα" }); } @@ -156,7 +152,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase { */ public void testChinese() throws Exception { assertAnalyzesTo(a, "我是中国人。 1234 Tests ", - new String[] { "我", "是", "中", "国", "人", "1234", "tests"}); + new String[] { "我", "是", "中", "国", "人", "1234", "Tests"}); } public void testHebrew() throws Exception { @@ -186,8 +182,8 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase { /* Tests from StandardAnalyzer, just to show behavior is similar */ public void testAlphanumericSA() throws Exception { // alphanumeric tokens - assertAnalyzesTo(a, "B2B", new String[]{"b2b"}); - assertAnalyzesTo(a, "2B", new String[]{"2b"}); + assertAnalyzesTo(a, "B2B", new String[]{"B2B"}); + assertAnalyzesTo(a, "2B", new String[]{"2B"}); } public void testDelimitersSA() throws Exception { @@ -199,34 +195,34 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase { public 
void testApostrophesSA() throws Exception { // internal apostrophes: O'Reilly, you're, O'Reilly's - assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"}); + assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"}); assertAnalyzesTo(a, "you're", new String[]{"you're"}); assertAnalyzesTo(a, "she's", new String[]{"she's"}); - assertAnalyzesTo(a, "Jim's", new String[]{"jim's"}); + assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"}); assertAnalyzesTo(a, "don't", new String[]{"don't"}); - assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly's"}); + assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"}); } public void testNumericSA() throws Exception { // floating point, serial, model numbers, ip addresses, etc. // every other segment must have at least one digit assertAnalyzesTo(a, "21.35", new String[]{"21.35"}); - assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"}); + assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"}); assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"}); assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"}); } public void testTextWithNumbersSA() throws Exception { // numbers - assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"}); + assertAnalyzesTo(a, "David has 5000 bones", new String[]{"David", "has", "5000", "bones"}); } public void testVariousTextSA() throws Exception { // various - assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"}); - assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"}); - assertAnalyzesTo(a, "foo bar . 
FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"}); - assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"}); + assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"}); + assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"}); + assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"}); + assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"}); } public void testKoreanSA() throws Exception { @@ -242,14 +238,14 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase { public void testOffsets() throws Exception { assertAnalyzesTo(a, "David has 5000 bones", - new String[] {"david", "has", "5000", "bones"}, + new String[] {"David", "has", "5000", "bones"}, new int[] {0, 6, 10, 15}, new int[] {5, 9, 14, 20}); } public void testTypes() throws Exception { assertAnalyzesTo(a, "David has 5000 bones", - new String[] {"david", "has", "5000", "bones"}, + new String[] {"David", "has", "5000", "bones"}, new String[] { "", "", "", "" }); } @@ -265,6 +261,61 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase { new String[] { "", "", "", "", "" }); } + /** simple emoji */ + public void testEmoji() throws Exception { + BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩 💩💩", + new String[] { "💩", "💩", "💩" }, + new String[] { "", "", "" }); + } + + /** emoji zwj sequence */ + public void testEmojiSequence() throws Exception { + BaseTokenStreamTestCase.assertAnalyzesTo(a, "👩‍❤️‍👩", + new String[] { "👩‍❤️‍👩" }, + new String[] { "" }); + } + + /** emoji zwj sequence with fitzpatrick modifier */ + public void testEmojiSequenceWithModifier() throws Exception { + BaseTokenStreamTestCase.assertAnalyzesTo(a, "👨🏼‍⚕️", + new String[] { "👨🏼‍⚕️" }, + new String[] { "" }); + } + + /** regional indicator */ + public void testEmojiRegionalIndicator() throws Exception { + BaseTokenStreamTestCase.assertAnalyzesTo(a, "🇺🇸🇺🇸", + new String[] { 
"🇺🇸", "🇺🇸" }, + new String[] { "", "" }); + } + + /** variation sequence */ + public void testEmojiVariationSequence() throws Exception { + BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣", + new String[] { "#️⃣" }, + new String[] { "" }); + BaseTokenStreamTestCase.assertAnalyzesTo(a, "3️⃣", + new String[] { "3️⃣",}, + new String[] { "" }); + } + + public void testEmojiTagSequence() throws Exception { + BaseTokenStreamTestCase.assertAnalyzesTo(a, "🏴󠁧󠁢󠁥󠁮󠁧󠁿", + new String[] { "🏴󠁧󠁢󠁥󠁮󠁧󠁿" }, + new String[] { "" }); + } + + public void testEmojiTokenization() throws Exception { + // simple emoji around latin + BaseTokenStreamTestCase.assertAnalyzesTo(a, "poo💩poo", + new String[] { "poo", "💩", "poo" }, + new String[] { "", "", "" }); + // simple emoji around non-latin + BaseTokenStreamTestCase.assertAnalyzesTo(a, "💩中國💩", + new String[] { "💩", "中", "國", "💩" }, + new String[] { "", "", "", "" }); + } + /** blast some random strings through the analyzer */ public void testRandomStrings() throws Exception { checkRandomData(random(), a, 1000*RANDOM_MULTIPLIER); diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java index 3d859ee4e34..51fb0e61ea1 100644 --- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestICUTokenizerCJK.java @@ -78,6 +78,15 @@ public class TestICUTokenizerCJK extends BaseTokenStreamTestCase { ); } + /** + * dictionary segmentation with emoji + */ + public void testSimpleJapaneseWithEmoji() throws Exception { + assertAnalyzesTo(a, "それはまだ実験段階にあります💩", + new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます", "💩" } + ); + } + public void testJapaneseTypes() throws Exception { assertAnalyzesTo(a, "仮名遣い カタカナ", new String[] { "仮名遣い", "カタカナ" }, diff --git 
a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java index 04101246460..50d1f9fb5d0 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java @@ -54,6 +54,8 @@ public final class StandardTokenizer extends Tokenizer { public static final int KATAKANA = 5; /** Hangul token type */ public static final int HANGUL = 6; + /** Emoji token type. */ + public static final int EMOJI = 7; /** String token types that correspond to token type int constants */ public static final String [] TOKEN_TYPES = new String [] { @@ -63,7 +65,8 @@ public final class StandardTokenizer extends Tokenizer { "<IDEOGRAPHIC>", "<HIRAGANA>", "<KATAKANA>", - "<HANGUL>" + "<HANGUL>", + "<EMOJI>" }; /** Absolute maximum sized token */