LUCENE-3747: Support Unicode 6.1.0.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1365971 13f79535-47bb-0310-9956-ffa450edef68
Steven Rowe 2012-07-26 12:56:35 +00:00
parent 924f6f730d
commit 5abc76ea42
51 changed files with 10260 additions and 7354 deletions

View File

@ -98,7 +98,7 @@
<classpathentry kind="lib" path="lucene/test-framework/lib/ant-junit-1.8.2.jar"/>
<classpathentry kind="lib" path="lucene/test-framework/lib/junit-4.10.jar"/>
<classpathentry kind="lib" path="lucene/sandbox/lib/jakarta-regexp-1.4.jar"/>
<classpathentry kind="lib" path="lucene/analysis/icu/lib/icu4j-4.8.1.1.jar"/>
<classpathentry kind="lib" path="lucene/analysis/icu/lib/icu4j-49.1.jar"/>
<classpathentry kind="lib" path="lucene/analysis/phonetic/lib/commons-codec-1.6.jar"/>
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-fsa-1.5.3.jar"/>
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-polish-1.5.3.jar"/>

View File

@ -143,7 +143,7 @@
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>4.8.1.1</version>
<version>49.1</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>

View File

@ -67,6 +67,8 @@ API Changes
parallels with docScore and the default implementation is correct.
(Robert Muir)
* LUCENE-3747: Support Unicode 6.1.0. (Steve Rowe)
Optimizations
* LUCENE-4171: Performance improvements to Packed64.

View File

@ -1,162 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
CharacterEntities = ( "AElig" | "Aacute" | "Acirc" | "Agrave" | "Alpha"
| "Aring" | "Atilde" | "Auml" | "Beta" | "Ccedil" | "Chi"
| "Dagger" | "Delta" | "ETH" | "Eacute" | "Ecirc"
| "Egrave" | "Epsilon" | "Eta" | "Euml" | "Gamma"
| "Iacute" | "Icirc" | "Igrave" | "Iota" | "Iuml" | "Kappa"
| "Lambda" | "Mu" | "Ntilde" | "Nu" | "OElig" | "Oacute"
| "Ocirc" | "Ograve" | "Omega" | "Omicron" | "Oslash"
| "Otilde" | "Ouml" | "Phi" | "Pi" | "Prime" | "Psi"
| "Rho" | "Scaron" | "Sigma" | "THORN" | "Tau" | "Theta"
| "Uacute" | "Ucirc" | "Ugrave" | "Upsilon" | "Uuml" | "Xi"
| "Yacute" | "Yuml" | "Zeta" | "aacute" | "acirc" | "acute"
| "aelig" | "agrave" | "alefsym" | "alpha" | "amp" | "AMP"
| "and" | "ang" | "apos" | "aring" | "asymp" | "atilde"
| "auml" | "bdquo" | "beta" | "brvbar" | "bull" | "cap"
| "ccedil" | "cedil" | "cent" | "chi" | "circ" | "clubs"
| "cong" | "copy" | "COPY" | "crarr" | "cup" | "curren"
| "dArr" | "dagger" | "darr" | "deg" | "delta" | "diams"
| "divide" | "eacute" | "ecirc" | "egrave" | "empty"
| "emsp" | "ensp" | "epsilon" | "equiv" | "eta" | "eth"
| "euml" | "euro" | "exist" | "fnof" | "forall" | "frac12"
| "frac14" | "frac34" | "frasl" | "gamma" | "ge" | "gt"
| "GT" | "hArr" | "harr" | "hearts" | "hellip" | "iacute"
| "icirc" | "iexcl" | "igrave" | "image" | "infin" | "int"
| "iota" | "iquest" | "isin" | "iuml" | "kappa" | "lArr"
| "lambda" | "lang" | "laquo" | "larr" | "lceil" | "ldquo"
| "le" | "lfloor" | "lowast" | "loz" | "lrm" | "lsaquo"
| "lsquo" | "lt" | "LT" | "macr" | "mdash" | "micro"
| "middot" | "minus" | "mu" | "nabla" | "nbsp" | "ndash"
| "ne" | "ni" | "not" | "notin" | "nsub" | "ntilde" | "nu"
| "oacute" | "ocirc" | "oelig" | "ograve" | "oline"
| "omega" | "omicron" | "oplus" | "or" | "ordf" | "ordm"
| "oslash" | "otilde" | "otimes" | "ouml" | "para" | "part"
| "permil" | "perp" | "phi" | "pi" | "piv" | "plusmn"
| "pound" | "prime" | "prod" | "prop" | "psi" | "quot"
| "QUOT" | "rArr" | "radic" | "rang" | "raquo" | "rarr"
| "rceil" | "rdquo" | "real" | "reg" | "REG" | "rfloor"
| "rho" | "rlm" | "rsaquo" | "rsquo" | "sbquo" | "scaron"
| "sdot" | "sect" | "shy" | "sigma" | "sigmaf" | "sim"
| "spades" | "sub" | "sube" | "sum" | "sup" | "sup1"
| "sup2" | "sup3" | "supe" | "szlig" | "tau" | "there4"
| "theta" | "thetasym" | "thinsp" | "thorn" | "tilde"
| "times" | "trade" | "uArr" | "uacute" | "uarr" | "ucirc"
| "ugrave" | "uml" | "upsih" | "upsilon" | "uuml"
| "weierp" | "xi" | "yacute" | "yen" | "yuml" | "zeta"
| "zwj" | "zwnj" )
%{
private static final Map<String,String> upperCaseVariantsAccepted
= new HashMap<String,String>();
static {
upperCaseVariantsAccepted.put("quot", "QUOT");
upperCaseVariantsAccepted.put("copy", "COPY");
upperCaseVariantsAccepted.put("gt", "GT");
upperCaseVariantsAccepted.put("lt", "LT");
upperCaseVariantsAccepted.put("reg", "REG");
upperCaseVariantsAccepted.put("amp", "AMP");
}
private static final CharArrayMap<Character> entityValues
= new CharArrayMap<Character>(Version.LUCENE_40, 253, false);
static {
String[] entities = {
"AElig", "\u00C6", "Aacute", "\u00C1", "Acirc", "\u00C2",
"Agrave", "\u00C0", "Alpha", "\u0391", "Aring", "\u00C5",
"Atilde", "\u00C3", "Auml", "\u00C4", "Beta", "\u0392",
"Ccedil", "\u00C7", "Chi", "\u03A7", "Dagger", "\u2021",
"Delta", "\u0394", "ETH", "\u00D0", "Eacute", "\u00C9",
"Ecirc", "\u00CA", "Egrave", "\u00C8", "Epsilon", "\u0395",
"Eta", "\u0397", "Euml", "\u00CB", "Gamma", "\u0393", "Iacute", "\u00CD",
"Icirc", "\u00CE", "Igrave", "\u00CC", "Iota", "\u0399",
"Iuml", "\u00CF", "Kappa", "\u039A", "Lambda", "\u039B", "Mu", "\u039C",
"Ntilde", "\u00D1", "Nu", "\u039D", "OElig", "\u0152",
"Oacute", "\u00D3", "Ocirc", "\u00D4", "Ograve", "\u00D2",
"Omega", "\u03A9", "Omicron", "\u039F", "Oslash", "\u00D8",
"Otilde", "\u00D5", "Ouml", "\u00D6", "Phi", "\u03A6", "Pi", "\u03A0",
"Prime", "\u2033", "Psi", "\u03A8", "Rho", "\u03A1", "Scaron", "\u0160",
"Sigma", "\u03A3", "THORN", "\u00DE", "Tau", "\u03A4", "Theta", "\u0398",
"Uacute", "\u00DA", "Ucirc", "\u00DB", "Ugrave", "\u00D9",
"Upsilon", "\u03A5", "Uuml", "\u00DC", "Xi", "\u039E",
"Yacute", "\u00DD", "Yuml", "\u0178", "Zeta", "\u0396",
"aacute", "\u00E1", "acirc", "\u00E2", "acute", "\u00B4",
"aelig", "\u00E6", "agrave", "\u00E0", "alefsym", "\u2135",
"alpha", "\u03B1", "amp", "\u0026", "and", "\u2227", "ang", "\u2220",
"apos", "\u0027", "aring", "\u00E5", "asymp", "\u2248",
"atilde", "\u00E3", "auml", "\u00E4", "bdquo", "\u201E",
"beta", "\u03B2", "brvbar", "\u00A6", "bull", "\u2022", "cap", "\u2229",
"ccedil", "\u00E7", "cedil", "\u00B8", "cent", "\u00A2", "chi", "\u03C7",
"circ", "\u02C6", "clubs", "\u2663", "cong", "\u2245", "copy", "\u00A9",
"crarr", "\u21B5", "cup", "\u222A", "curren", "\u00A4", "dArr", "\u21D3",
"dagger", "\u2020", "darr", "\u2193", "deg", "\u00B0", "delta", "\u03B4",
"diams", "\u2666", "divide", "\u00F7", "eacute", "\u00E9",
"ecirc", "\u00EA", "egrave", "\u00E8", "empty", "\u2205",
"emsp", "\u2003", "ensp", "\u2002", "epsilon", "\u03B5",
"equiv", "\u2261", "eta", "\u03B7", "eth", "\u00F0", "euml", "\u00EB",
"euro", "\u20AC", "exist", "\u2203", "fnof", "\u0192",
"forall", "\u2200", "frac12", "\u00BD", "frac14", "\u00BC",
"frac34", "\u00BE", "frasl", "\u2044", "gamma", "\u03B3", "ge", "\u2265",
"gt", "\u003E", "hArr", "\u21D4", "harr", "\u2194", "hearts", "\u2665",
"hellip", "\u2026", "iacute", "\u00ED", "icirc", "\u00EE",
"iexcl", "\u00A1", "igrave", "\u00EC", "image", "\u2111",
"infin", "\u221E", "int", "\u222B", "iota", "\u03B9", "iquest", "\u00BF",
"isin", "\u2208", "iuml", "\u00EF", "kappa", "\u03BA", "lArr", "\u21D0",
"lambda", "\u03BB", "lang", "\u2329", "laquo", "\u00AB",
"larr", "\u2190", "lceil", "\u2308", "ldquo", "\u201C", "le", "\u2264",
"lfloor", "\u230A", "lowast", "\u2217", "loz", "\u25CA", "lrm", "\u200E",
"lsaquo", "\u2039", "lsquo", "\u2018", "lt", "\u003C", "macr", "\u00AF",
"mdash", "\u2014", "micro", "\u00B5", "middot", "\u00B7",
"minus", "\u2212", "mu", "\u03BC", "nabla", "\u2207", "nbsp", " ",
"ndash", "\u2013", "ne", "\u2260", "ni", "\u220B", "not", "\u00AC",
"notin", "\u2209", "nsub", "\u2284", "ntilde", "\u00F1", "nu", "\u03BD",
"oacute", "\u00F3", "ocirc", "\u00F4", "oelig", "\u0153",
"ograve", "\u00F2", "oline", "\u203E", "omega", "\u03C9",
"omicron", "\u03BF", "oplus", "\u2295", "or", "\u2228", "ordf", "\u00AA",
"ordm", "\u00BA", "oslash", "\u00F8", "otilde", "\u00F5",
"otimes", "\u2297", "ouml", "\u00F6", "para", "\u00B6", "part", "\u2202",
"permil", "\u2030", "perp", "\u22A5", "phi", "\u03C6", "pi", "\u03C0",
"piv", "\u03D6", "plusmn", "\u00B1", "pound", "\u00A3",
"prime", "\u2032", "prod", "\u220F", "prop", "\u221D", "psi", "\u03C8",
"quot", "\"", "rArr", "\u21D2", "radic", "\u221A", "rang", "\u232A",
"raquo", "\u00BB", "rarr", "\u2192", "rceil", "\u2309",
"rdquo", "\u201D", "real", "\u211C", "reg", "\u00AE", "rfloor", "\u230B",
"rho", "\u03C1", "rlm", "\u200F", "rsaquo", "\u203A", "rsquo", "\u2019",
"sbquo", "\u201A", "scaron", "\u0161", "sdot", "\u22C5",
"sect", "\u00A7", "shy", "\u00AD", "sigma", "\u03C3", "sigmaf", "\u03C2",
"sim", "\u223C", "spades", "\u2660", "sub", "\u2282", "sube", "\u2286",
"sum", "\u2211", "sup", "\u2283", "sup1", "\u00B9", "sup2", "\u00B2",
"sup3", "\u00B3", "supe", "\u2287", "szlig", "\u00DF", "tau", "\u03C4",
"there4", "\u2234", "theta", "\u03B8", "thetasym", "\u03D1",
"thinsp", "\u2009", "thorn", "\u00FE", "tilde", "\u02DC",
"times", "\u00D7", "trade", "\u2122", "uArr", "\u21D1",
"uacute", "\u00FA", "uarr", "\u2191", "ucirc", "\u00FB",
"ugrave", "\u00F9", "uml", "\u00A8", "upsih", "\u03D2",
"upsilon", "\u03C5", "uuml", "\u00FC", "weierp", "\u2118",
"xi", "\u03BE", "yacute", "\u00FD", "yen", "\u00A5", "yuml", "\u00FF",
"zeta", "\u03B6", "zwj", "\u200D", "zwnj", "\u200C"
};
for (int i = 0 ; i < entities.length ; i += 2) {
Character value = entities[i + 1].charAt(0);
entityValues.put(entities[i], value);
String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);
if (upperCaseVariant != null) {
entityValues.put(upperCaseVariant, value);
}
}
}
%}
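
The deleted macro file above paired each HTML character-entity name with its replacement character for HTMLStripCharFilter, accepting all-uppercase spellings only for a small whitelist. For readers unfamiliar with the pattern, here is a minimal, self-contained sketch of the same lookup idea; it uses a plain HashMap in place of Lucene's CharArrayMap, truncates the table to a few entries, and the class name is illustrative only.

import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

// Sketch of the entity table above: names map to characters, and only a
// whitelisted few (mirroring upperCaseVariantsAccepted) also accept an
// all-uppercase spelling.
public class EntityTableSketch {
  private static final Set<String> UPPER_OK =
      Set.of("quot", "copy", "gt", "lt", "reg", "amp");
  private static final Map<String, Character> VALUES = new HashMap<>();
  static {
    String[] entities = {
      "amp", "\u0026", "lt", "\u003C", "gt", "\u003E",
      "quot", "\"", "copy", "\u00A9", "nbsp", "\u00A0",
    };
    for (int i = 0; i < entities.length; i += 2) {
      Character value = entities[i + 1].charAt(0);
      VALUES.put(entities[i], value);
      if (UPPER_OK.contains(entities[i])) {
        VALUES.put(entities[i].toUpperCase(Locale.ROOT), value);
      }
    }
  }

  /** Returns the character for an entity name such as "amp", or null. */
  static Character lookup(String name) {
    return VALUES.get(name);
  }

  public static void main(String[] args) {
    System.out.println(lookup("amp"));  // &
    System.out.println(lookup("AMP"));  // & (whitelisted uppercase variant)
    System.out.println(lookup("Amp"));  // null (mixed case not accepted)
  }
}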

View File

@ -14,45 +14,52 @@
* limitations under the License.
*/
// Generated using ICU4J 4.8.1.1 on Friday, January 13, 2012 6:20:39 PM UTC
// Generated using ICU4J 49.1.0.0 on Sunday, July 15, 2012 5:42:00 AM UTC
// by org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros
ID_Start_Supp = (
[\uD81A][\uDC00-\uDE38]
[\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
| [\uD81A][\uDC00-\uDE38]
| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB]
| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
| [\uD82C][\uDC00\uDC01]
| [\uD804][\uDC03-\uDC37\uDC83-\uDCAF]
| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
| [\uD81B][\uDF00-\uDF44\uDF50\uDF93-\uDF9F]
| [\uD87E][\uDC00-\uDE1D]
| [\uD804][\uDC03-\uDC37\uDC83-\uDCAF\uDCD0-\uDCE8\uDD03-\uDD26\uDD83-\uDDB2\uDDC1-\uDDC4]
| [\uD83B][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB]
| [\uD809][\uDC00-\uDC62]
| [\uD808][\uDC00-\uDF6E]
| [\uD803][\uDC00-\uDC48]
| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
| [\uD80D][\uDC00-\uDC2E]
| [\uD805][\uDE80-\uDEAA]
| [\uD86E][\uDC00-\uDC1D]
| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
| [\uD801][\uDC00-\uDC9D]
)
ID_Continue_Supp = (
[\uD81A][\uDC00-\uDE38]
| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
| [\uD804][\uDC00-\uDC46\uDC66-\uDC6F\uDC80-\uDCBA]
| [\uD82C][\uDC00\uDC01]
| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00-\uDE03\uDE05\uDE06\uDE0C-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE38-\uDE3A\uDE3F\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
| [\uD81B][\uDF00-\uDF44\uDF50-\uDF7E\uDF8F-\uDF9F]
| [\uD801][\uDC00-\uDC9D\uDCA0-\uDCA9]
| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
| [\uD87E][\uDC00-\uDE1D]
| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00-\uDE03\uDE05\uDE06\uDE0C-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE38-\uDE3A\uDE3F\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
| [\uD805][\uDE80-\uDEB7\uDEC0-\uDEC9]
| [\uD83B][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB]
| [\uD809][\uDC00-\uDC62]
| [\uD808][\uDC00-\uDF6E]
| [\uD803][\uDC00-\uDC48]
| [\uD80D][\uDC00-\uDC2E]
| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDDFD\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
| [\uD804][\uDC00-\uDC46\uDC66-\uDC6F\uDC80-\uDCBA\uDCD0-\uDCE8\uDCF0-\uDCF9\uDD00-\uDD34\uDD36-\uDD3F\uDD80-\uDDC4\uDDD0-\uDDD9]
| [\uD86E][\uDC00-\uDC1D]
| [\uDB40][\uDD00-\uDDEF]
| [\uD834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44]
| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB\uDFCE-\uDFFF]
)
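
Each alternative in these regenerated macros is a surrogate-pair pattern: JFlex matches UTF-16 code units, not code points, so a supplementary code point has to be written as a high-surrogate class followed by a low-surrogate range. A small sketch of that arithmetic, assuming nothing beyond java.lang.Character, using the Bamum Supplement block (U+16800.., matched above by [\uD81A][\uDC00-\uDE38]) as the example:

// Split a supplementary code point into the UTF-16 surrogate pair that
// the JFlex macros above match one code unit at a time.
public class SurrogatePairSketch {
  public static void main(String[] args) {
    int cp = 0x16800; // first code point of the Bamum Supplement block
    char high = Character.highSurrogate(cp); // '\uD81A'
    char low = Character.lowSurrogate(cp);   // '\uDC00'
    System.out.printf("U+%04X -> \\u%04X \\u%04X%n", cp, (int) high, (int) low);
    // Character.toChars performs the same split in one call:
    char[] units = Character.toChars(cp);
    assert units[0] == high && units[1] == low;
  }
}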

View File

@ -15,8 +15,8 @@
*/
// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
// file version from Sunday, March 18, 2012 4:34:02 AM UTC
// generated on Sunday, March 18, 2012 4:02:55 PM UTC
// file version from Saturday, July 14, 2012 4:34:14 AM UTC
// generated on Sunday, July 15, 2012 12:59:44 AM UTC
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
ASCIITLD = "." (
@ -310,6 +310,7 @@ ASCIITLD = "." (
| [xX][nN]--[kK][pP][rR][wW]13[dD]
| [xX][nN]--[kK][pP][rR][yY]57[dD]
| [xX][nN]--[lL][gG][bB][bB][aA][tT]1[aA][dD]8[jJ]
| [xX][nN]--[mM][gG][bB]9[aA][wW][bB][fF]
| [xX][nN]--[mM][gG][bB][aA][aA][mM]7[aA]8[hH]
| [xX][nN]--[mM][gG][bB][aA][yY][hH]7[gG][pP][aA]
| [xX][nN]--[mM][gG][bB][bB][hH]1[aA]71[eE]
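
The regenerated TLD alternation (here gaining the one new internationalized TLD, xn--mgb9awbf) spells out case-insensitivity one character at a time rather than relying on a grammar-wide flag. A hedged sketch of that expansion, in the spirit of GenerateJflexTLDMacros; the helper and class names are illustrative, not the generator's actual code:

// Expand a TLD into the per-character case-insensitive pattern style used
// above, e.g. "xn--mgb9awbf" -> "[xX][nN]--[mM][gG][bB]9[aA][wW][bB][fF]".
public class TldPatternSketch {
  static String toCaseInsensitivePattern(String tld) {
    StringBuilder sb = new StringBuilder();
    for (char c : tld.toCharArray()) {
      if (Character.isLetter(c)) {
        sb.append('[').append(Character.toLowerCase(c))
          .append(Character.toUpperCase(c)).append(']');
      } else {
        sb.append(c); // digits and '-' have no case to fold
      }
    }
    return sb.toString();
  }

  public static void main(String[] args) {
    System.out.println(toCaseInsensitivePattern("xn--mgb9awbf"));
    // prints: [xX][nN]--[mM][gG][bB]9[aA][wW][bB][fF]
  }
}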

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 16:59 */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/15/12 1:57 AM */
package org.apache.lucene.analysis.standard;
@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
* on 08.07.12 16:59 from the specification file
* <tt>C:/Users/Uwe Schindler/Projects/lucene/lucene4199/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
* on 7/15/12 1:57 AM from the specification file
* <tt>C:/cygwin/home/s/svn/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
*/
class ClassicTokenizerImpl implements StandardTokenizerInterface {

View File

@ -14,22 +14,25 @@
* limitations under the License.
*/
// Generated using ICU4J 4.8.1.1 on Sunday, July 8, 2012 2:59:49 PM UTC
// Generated using ICU4J 49.1.0.0 on Sunday, July 15, 2012 5:57:26 AM UTC
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
ALetterSupp = (
([\ud80d][\uDC00-\uDC2E])
([\ud83b][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB])
| ([\ud81a][\uDC00-\uDE38])
| ([\ud81b][\uDF00-\uDF44\uDF50\uDF93-\uDF9F])
| ([\ud835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB])
| ([\ud80d][\uDC00-\uDC2E])
| ([\ud80c][\uDC00-\uDFFF])
| ([\ud809][\uDC00-\uDC62])
| ([\ud808][\uDC00-\uDF6E])
| ([\ud81a][\uDC00-\uDE38])
| ([\ud804][\uDC03-\uDC37\uDC83-\uDCAF])
| ([\ud835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB])
| ([\ud805][\uDE80-\uDEAA])
| ([\ud804][\uDC03-\uDC37\uDC83-\uDCAF\uDCD0-\uDCE8\uDD03-\uDD26\uDD83-\uDDB2\uDDC1-\uDDC4])
| ([\ud801][\uDC00-\uDC9D])
| ([\ud800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5])
| ([\ud803][\uDC00-\uDC48])
| ([\ud802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72])
| ([\ud802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72])
)
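
The churn in these macros is driven entirely by the ICU4J data upgrade (4.8.1.1 to 49.1, which bundles Unicode 6.1 property data). For instance, the \uDD80-\uDDB7 span newly added to the [\ud802] class above corresponds to the Meroitic blocks starting at U+10980, introduced in Unicode 6.1 with WordBreak = ALetter. A sketch of querying that property through ICU4J, assuming an icu4j-49.1 dependency on the classpath; the chosen code point is just an example:

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;

// Ask ICU4J (and thus its bundled Unicode data) for a code point's
// Word_Break category -- the property behind the ALetterSupp macro above.
public class WordBreakPropertySketch {
  public static void main(String[] args) {
    int cp = 0x10980; // MEROITIC HIEROGLYPHIC LETTER A, new in Unicode 6.1
    int wb = UCharacter.getIntPropertyValue(cp, UProperty.WORD_BREAK);
    System.out.println(wb == UCharacter.WordBreak.ALETTER); // true with 6.1 data
  }
}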
FormatSupp = (
([\ud804][\uDCBD])
@ -37,14 +40,17 @@ FormatSupp = (
| ([\udb40][\uDC01\uDC20-\uDC7F])
)
ExtendSupp = (
([\ud804][\uDC00-\uDC02\uDC38-\uDC46\uDC80-\uDC82\uDCB0-\uDCBA])
([\ud81b][\uDF51-\uDF7E\uDF8F-\uDF92])
| ([\ud805][\uDEAB-\uDEB7])
| ([\ud804][\uDC00-\uDC02\uDC38-\uDC46\uDC80-\uDC82\uDCB0-\uDCBA\uDD00-\uDD02\uDD27-\uDD34\uDD80-\uDD82\uDDB3-\uDDC0])
| ([\ud834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44])
| ([\ud800][\uDDFD])
| ([\udb40][\uDD00-\uDDEF])
| ([\ud802][\uDE01-\uDE03\uDE05\uDE06\uDE0C-\uDE0F\uDE38-\uDE3A\uDE3F])
)
NumericSupp = (
([\ud804][\uDC66-\uDC6F])
([\ud805][\uDEC0-\uDEC9])
| ([\ud804][\uDC66-\uDC6F\uDCF0-\uDCF9\uDD36-\uDD3F\uDDD0-\uDDD9])
| ([\ud835][\uDFCE-\uDFFF])
| ([\ud801][\uDCA0-\uDCA9])
)
@ -109,7 +115,8 @@ HanSupp = (
| ([\ud84d][\uDC00-\uDFFF])
| ([\ud84c][\uDC00-\uDFFF])
| ([\ud84f][\uDC00-\uDFFF])
| ([\ud84e][\uDC00-\uDFFF])
| ([\ud841][\uDC00-\uDFFF])
| ([\ud840][\uDC00-\uDFFF])
| ([\ud843][\uDC00-\uDFFF])

View File

@ -36,7 +36,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
*/
%%
%unicode 6.0
%unicode 6.1
%integer
%final
%public

View File

@ -39,7 +39,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
*/
%%
%unicode 6.0
%unicode 6.1
%integer
%final
%public

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 17:00 */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/15/12 1:57 AM */
package org.apache.lucene.analysis.wikipedia;
@ -25,8 +25,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
* on 08.07.12 17:00 from the specification file
* <tt>C:/Users/Uwe Schindler/Projects/lucene/lucene4199/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
* on 7/15/12 1:57 AM from the specification file
* <tt>C:/cygwin/home/s/svn/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
*/
class WikipediaTokenizerImpl {

View File

@ -202,7 +202,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
}
public void testUnicodeWordBreaks() throws Exception {
WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
WordBreakTestUnicode_6_1_0 wordBreakTest = new WordBreakTestUnicode_6_1_0();
wordBreakTest.test(a);
}

View File

@ -424,7 +424,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
}
public void testUnicodeWordBreaks() throws Exception {
WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
WordBreakTestUnicode_6_1_0 wordBreakTest = new WordBreakTestUnicode_6_1_0();
wordBreakTest.test(a);
}

View File

@ -1,6 +1,6 @@
package org.apache.lucene.analysis.core;
/*
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@ -23,7 +23,7 @@ import org.junit.Ignore;
/**
* This class was automatically generated by generateJavaUnicodeWordBreakTest.pl
* from: http://www.unicode.org/Public/6.0.0/ucd/auxiliary/WordBreakTest.txt
* from: http://www.unicode.org/Public/6.1.0/ucd/auxiliary/WordBreakTest.txt
*
* WordBreakTest.txt indicates the points in the provided character sequences
* at which conforming implementations must and must not break words. This
@ -32,16 +32,16 @@ import org.junit.Ignore;
* sequences bounded by word breaks and containing at least one character
* from one of the following character sets:
*
* \p{Script = Han} (From http://www.unicode.org/Public/6.0.0/ucd/Scripts.txt)
* \p{Script = Han} (From http://www.unicode.org/Public/6.1.0/ucd/Scripts.txt)
* \p{Script = Hiragana}
* \p{LineBreak = Complex_Context} (From http://www.unicode.org/Public/6.0.0/ucd/LineBreak.txt)
* \p{WordBreak = ALetter} (From http://www.unicode.org/Public/6.0.0/ucd/auxiliary/WordBreakProperty.txt)
* \p{LineBreak = Complex_Context} (From http://www.unicode.org/Public/6.1.0/ucd/LineBreak.txt)
* \p{WordBreak = ALetter} (From http://www.unicode.org/Public/6.1.0/ucd/auxiliary/WordBreakProperty.txt)
* \p{WordBreak = Katakana}
* \p{WordBreak = Numeric} (Excludes full-width Arabic digits)
* [\uFF10-\uFF19] (Full-width Arabic digits)
*/
@Ignore
public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
public class WordBreakTestUnicode_6_1_0 extends BaseTokenStreamTestCase {
public void test(Analyzer analyzer) throws Exception {
// ÷ 0001 ÷ 0001 ÷ # ÷ [0.2] <START OF HEADING> (Other) ÷ [999.0] <START OF HEADING> (Other) ÷ [0.3]
@ -52,27 +52,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u0001\u0308\u0001",
new String[] { });
// ÷ 0001 ÷ 000D ÷ # ÷ [0.2] <START OF HEADING> (Other) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0001 ÷ 000D ÷ # ÷ [0.2] <START OF HEADING> (Other) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0001\r",
new String[] { });
// ÷ 0001 × 0308 ÷ 000D ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0001 × 0308 ÷ 000D ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0001\u0308\r",
new String[] { });
// ÷ 0001 ÷ 000A ÷ # ÷ [0.2] <START OF HEADING> (Other) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0001 ÷ 000A ÷ # ÷ [0.2] <START OF HEADING> (Other) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0001\n",
new String[] { });
// ÷ 0001 × 0308 ÷ 000A ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0001 × 0308 ÷ 000A ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0001\u0308\n",
new String[] { });
// ÷ 0001 ÷ 000B ÷ # ÷ [0.2] <START OF HEADING> (Other) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0001 ÷ 000B ÷ # ÷ [0.2] <START OF HEADING> (Other) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0001\u000B",
new String[] { });
// ÷ 0001 × 0308 ÷ 000B ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0001 × 0308 ÷ 000B ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0001\u0308\u000B",
new String[] { });
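
A reading aid for the generated comment/assertion pairs in this file, since the diff touches only the bracketed rule numbers: in WordBreakTest.txt notation, ÷ marks a position where a conforming segmenter must break and × one where it must not, and the bracketed number names the UAX #29 rule deciding each position. Unicode 6.1 renumbered the rule previously cited as [3.11] to [3.2], which is the only change in these comment lines; the assertions themselves are unchanged. A runnable sketch of the same boundary decision using the JDK's word BreakIterator (whose data may lag the exact Unicode version under test, so treat it as illustration, not as the test's mechanism):

import java.text.BreakIterator;

// Reproduce the "÷ 0041 × 0308 ÷ 000D ÷" decision: A plus COMBINING
// DIAERESIS form one segment (no break before U+0308), and a break is
// forced before the CR, yielding segments [0,2) and [2,3).
public class WordBreakSketch {
  public static void main(String[] args) {
    String s = "\u0041\u0308\r"; // A + COMBINING DIAERESIS + CR
    BreakIterator it = BreakIterator.getWordInstance();
    it.setText(s);
    for (int start = it.first(), end = it.next();
         end != BreakIterator.DONE;
         start = end, end = it.next()) {
      System.out.printf("segment %d..%d%n", start, end); // expect 0..2, 2..3
    }
  }
}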
@ -232,7 +232,7 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\r\r",
new String[] { });
// ÷ 000D ÷ 0308 ÷ 000D ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 000D ÷ 0308 ÷ 000D ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\r\u0308\r",
new String[] { });
@ -240,7 +240,7 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\r\n",
new String[] { });
// ÷ 000D ÷ 0308 ÷ 000A ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 000D ÷ 0308 ÷ 000A ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\r\u0308\n",
new String[] { });
@ -248,7 +248,7 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\r\u000B",
new String[] { });
// ÷ 000D ÷ 0308 ÷ 000B ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 000D ÷ 0308 ÷ 000B ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\r\u0308\u000B",
new String[] { });
@ -408,7 +408,7 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\n\r",
new String[] { });
// ÷ 000A ÷ 0308 ÷ 000D ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 000A ÷ 0308 ÷ 000D ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\n\u0308\r",
new String[] { });
@ -416,7 +416,7 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\n\n",
new String[] { });
// ÷ 000A ÷ 0308 ÷ 000A ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 000A ÷ 0308 ÷ 000A ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\n\u0308\n",
new String[] { });
@ -424,7 +424,7 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\n\u000B",
new String[] { });
// ÷ 000A ÷ 0308 ÷ 000B ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 000A ÷ 0308 ÷ 000B ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\n\u0308\u000B",
new String[] { });
@ -584,7 +584,7 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u000B\r",
new String[] { });
// ÷ 000B ÷ 0308 ÷ 000D ÷ # ÷ [0.2] <LINE TABULATION> (Newline) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 000B ÷ 0308 ÷ 000D ÷ # ÷ [0.2] <LINE TABULATION> (Newline) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u000B\u0308\r",
new String[] { });
@ -592,7 +592,7 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u000B\n",
new String[] { });
// ÷ 000B ÷ 0308 ÷ 000A ÷ # ÷ [0.2] <LINE TABULATION> (Newline) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 000B ÷ 0308 ÷ 000A ÷ # ÷ [0.2] <LINE TABULATION> (Newline) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u000B\u0308\n",
new String[] { });
@ -600,7 +600,7 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u000B\u000B",
new String[] { });
// ÷ 000B ÷ 0308 ÷ 000B ÷ # ÷ [0.2] <LINE TABULATION> (Newline) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 000B ÷ 0308 ÷ 000B ÷ # ÷ [0.2] <LINE TABULATION> (Newline) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u000B\u0308\u000B",
new String[] { });
@ -756,27 +756,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u3031\u0308\u0001",
new String[] { "\u3031\u0308" });
// ÷ 3031 ÷ 000D ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 3031 ÷ 000D ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u3031\r",
new String[] { "\u3031" });
// ÷ 3031 × 0308 ÷ 000D ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 3031 × 0308 ÷ 000D ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u3031\u0308\r",
new String[] { "\u3031\u0308" });
// ÷ 3031 ÷ 000A ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 3031 ÷ 000A ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u3031\n",
new String[] { "\u3031" });
// ÷ 3031 × 0308 ÷ 000A ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 3031 × 0308 ÷ 000A ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u3031\u0308\n",
new String[] { "\u3031\u0308" });
// ÷ 3031 ÷ 000B ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 3031 ÷ 000B ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u3031\u000B",
new String[] { "\u3031" });
// ÷ 3031 × 0308 ÷ 000B ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 3031 × 0308 ÷ 000B ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u3031\u0308\u000B",
new String[] { "\u3031\u0308" });
@ -932,27 +932,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u0041\u0308\u0001",
new String[] { "\u0041\u0308" });
// ÷ 0041 ÷ 000D ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0041 ÷ 000D ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0041\r",
new String[] { "\u0041" });
// ÷ 0041 × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0041 × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0041\u0308\r",
new String[] { "\u0041\u0308" });
// ÷ 0041 ÷ 000A ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0041 ÷ 000A ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0041\n",
new String[] { "\u0041" });
// ÷ 0041 × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0041 × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0041\u0308\n",
new String[] { "\u0041\u0308" });
// ÷ 0041 ÷ 000B ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0041 ÷ 000B ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0041\u000B",
new String[] { "\u0041" });
// ÷ 0041 × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0041 × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0041\u0308\u000B",
new String[] { "\u0041\u0308" });
@ -1108,27 +1108,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u003A\u0308\u0001",
new String[] { });
// ÷ 003A ÷ 000D ÷ # ÷ [0.2] COLON (MidLetter) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 003A ÷ 000D ÷ # ÷ [0.2] COLON (MidLetter) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u003A\r",
new String[] { });
// ÷ 003A × 0308 ÷ 000D ÷ # ÷ [0.2] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 003A × 0308 ÷ 000D ÷ # ÷ [0.2] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u003A\u0308\r",
new String[] { });
// ÷ 003A ÷ 000A ÷ # ÷ [0.2] COLON (MidLetter) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 003A ÷ 000A ÷ # ÷ [0.2] COLON (MidLetter) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u003A\n",
new String[] { });
// ÷ 003A × 0308 ÷ 000A ÷ # ÷ [0.2] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 003A × 0308 ÷ 000A ÷ # ÷ [0.2] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u003A\u0308\n",
new String[] { });
// ÷ 003A ÷ 000B ÷ # ÷ [0.2] COLON (MidLetter) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 003A ÷ 000B ÷ # ÷ [0.2] COLON (MidLetter) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u003A\u000B",
new String[] { });
// ÷ 003A × 0308 ÷ 000B ÷ # ÷ [0.2] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 003A × 0308 ÷ 000B ÷ # ÷ [0.2] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u003A\u0308\u000B",
new String[] { });
@ -1284,27 +1284,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u002C\u0308\u0001",
new String[] { });
// ÷ 002C ÷ 000D ÷ # ÷ [0.2] COMMA (MidNum) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 002C ÷ 000D ÷ # ÷ [0.2] COMMA (MidNum) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u002C\r",
new String[] { });
// ÷ 002C × 0308 ÷ 000D ÷ # ÷ [0.2] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 002C × 0308 ÷ 000D ÷ # ÷ [0.2] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u002C\u0308\r",
new String[] { });
// ÷ 002C ÷ 000A ÷ # ÷ [0.2] COMMA (MidNum) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 002C ÷ 000A ÷ # ÷ [0.2] COMMA (MidNum) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u002C\n",
new String[] { });
// ÷ 002C × 0308 ÷ 000A ÷ # ÷ [0.2] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 002C × 0308 ÷ 000A ÷ # ÷ [0.2] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u002C\u0308\n",
new String[] { });
// ÷ 002C ÷ 000B ÷ # ÷ [0.2] COMMA (MidNum) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 002C ÷ 000B ÷ # ÷ [0.2] COMMA (MidNum) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u002C\u000B",
new String[] { });
// ÷ 002C × 0308 ÷ 000B ÷ # ÷ [0.2] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 002C × 0308 ÷ 000B ÷ # ÷ [0.2] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u002C\u0308\u000B",
new String[] { });
@ -1460,27 +1460,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u0027\u0308\u0001",
new String[] { });
// ÷ 0027 ÷ 000D ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0027 ÷ 000D ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0027\r",
new String[] { });
// ÷ 0027 × 0308 ÷ 000D ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0027 × 0308 ÷ 000D ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0027\u0308\r",
new String[] { });
// ÷ 0027 ÷ 000A ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0027 ÷ 000A ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0027\n",
new String[] { });
// ÷ 0027 × 0308 ÷ 000A ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0027 × 0308 ÷ 000A ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0027\u0308\n",
new String[] { });
// ÷ 0027 ÷ 000B ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0027 ÷ 000B ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0027\u000B",
new String[] { });
// ÷ 0027 × 0308 ÷ 000B ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0027 × 0308 ÷ 000B ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0027\u0308\u000B",
new String[] { });
@ -1636,27 +1636,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u0030\u0308\u0001",
new String[] { "\u0030\u0308" });
// ÷ 0030 ÷ 000D ÷ # ÷ [0.2] DIGIT ZERO (Numeric) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0030 ÷ 000D ÷ # ÷ [0.2] DIGIT ZERO (Numeric) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0030\r",
new String[] { "\u0030" });
// ÷ 0030 × 0308 ÷ 000D ÷ # ÷ [0.2] DIGIT ZERO (Numeric) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0030 × 0308 ÷ 000D ÷ # ÷ [0.2] DIGIT ZERO (Numeric) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0030\u0308\r",
new String[] { "\u0030\u0308" });
// ÷ 0030 ÷ 000A ÷ # ÷ [0.2] DIGIT ZERO (Numeric) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0030 ÷ 000A ÷ # ÷ [0.2] DIGIT ZERO (Numeric) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0030\n",
new String[] { "\u0030" });
// ÷ 0030 × 0308 ÷ 000A ÷ # ÷ [0.2] DIGIT ZERO (Numeric) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0030 × 0308 ÷ 000A ÷ # ÷ [0.2] DIGIT ZERO (Numeric) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0030\u0308\n",
new String[] { "\u0030\u0308" });
// ÷ 0030 ÷ 000B ÷ # ÷ [0.2] DIGIT ZERO (Numeric) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0030 ÷ 000B ÷ # ÷ [0.2] DIGIT ZERO (Numeric) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0030\u000B",
new String[] { "\u0030" });
// ÷ 0030 × 0308 ÷ 000B ÷ # ÷ [0.2] DIGIT ZERO (Numeric) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0030 × 0308 ÷ 000B ÷ # ÷ [0.2] DIGIT ZERO (Numeric) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0030\u0308\u000B",
new String[] { "\u0030\u0308" });
@ -1812,27 +1812,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u005F\u0308\u0001",
new String[] { });
// ÷ 005F ÷ 000D ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 005F ÷ 000D ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u005F\r",
new String[] { });
// ÷ 005F × 0308 ÷ 000D ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 005F × 0308 ÷ 000D ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u005F\u0308\r",
new String[] { });
// ÷ 005F ÷ 000A ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 005F ÷ 000A ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u005F\n",
new String[] { });
// ÷ 005F × 0308 ÷ 000A ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 005F × 0308 ÷ 000A ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u005F\u0308\n",
new String[] { });
// ÷ 005F ÷ 000B ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 005F ÷ 000B ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u005F\u000B",
new String[] { });
// ÷ 005F × 0308 ÷ 000B ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 005F × 0308 ÷ 000B ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u005F\u0308\u000B",
new String[] { });
@ -1988,27 +1988,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u00AD\u0308\u0001",
new String[] { });
// ÷ 00AD ÷ 000D ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 00AD ÷ 000D ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u00AD\r",
new String[] { });
// ÷ 00AD × 0308 ÷ 000D ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 00AD × 0308 ÷ 000D ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u00AD\u0308\r",
new String[] { });
// ÷ 00AD ÷ 000A ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 00AD ÷ 000A ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u00AD\n",
new String[] { });
// ÷ 00AD × 0308 ÷ 000A ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 00AD × 0308 ÷ 000A ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u00AD\u0308\n",
new String[] { });
// ÷ 00AD ÷ 000B ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 00AD ÷ 000B ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u00AD\u000B",
new String[] { });
// ÷ 00AD × 0308 ÷ 000B ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 00AD × 0308 ÷ 000B ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u00AD\u0308\u000B",
new String[] { });
@ -2164,27 +2164,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u0300\u0308\u0001",
new String[] { });
// ÷ 0300 ÷ 000D ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0300 ÷ 000D ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0300\r",
new String[] { });
// ÷ 0300 × 0308 ÷ 000D ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0300 × 0308 ÷ 000D ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0300\u0308\r",
new String[] { });
// ÷ 0300 ÷ 000A ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0300 ÷ 000A ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0300\n",
new String[] { });
// ÷ 0300 × 0308 ÷ 000A ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0300 × 0308 ÷ 000A ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0300\u0308\n",
new String[] { });
// ÷ 0300 ÷ 000B ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0300 ÷ 000B ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0300\u000B",
new String[] { });
// ÷ 0300 × 0308 ÷ 000B ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0300 × 0308 ÷ 000B ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0300\u0308\u000B",
new String[] { });
@ -2340,27 +2340,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u0061\u2060\u0308\u0001",
new String[] { "\u0061\u2060\u0308" });
// ÷ 0061 × 2060 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0061 × 2060 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u2060\r",
new String[] { "\u0061\u2060" });
// ÷ 0061 × 2060 × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0061 × 2060 × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u2060\u0308\r",
new String[] { "\u0061\u2060\u0308" });
// ÷ 0061 × 2060 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0061 × 2060 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u2060\n",
new String[] { "\u0061\u2060" });
// ÷ 0061 × 2060 × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0061 × 2060 × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u2060\u0308\n",
new String[] { "\u0061\u2060\u0308" });
// ÷ 0061 × 2060 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0061 × 2060 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u2060\u000B",
new String[] { "\u0061\u2060" });
// ÷ 0061 × 2060 × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0061 × 2060 × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u2060\u0308\u000B",
new String[] { "\u0061\u2060\u0308" });
@ -2516,27 +2516,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u0061\u003A\u0308\u0001",
new String[] { "\u0061" });
// ÷ 0061 ÷ 003A ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0061 ÷ 003A ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u003A\r",
new String[] { "\u0061" });
// ÷ 0061 ÷ 003A × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0061 ÷ 003A × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u003A\u0308\r",
new String[] { "\u0061" });
// ÷ 0061 ÷ 003A ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0061 ÷ 003A ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u003A\n",
new String[] { "\u0061" });
// ÷ 0061 ÷ 003A × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0061 ÷ 003A × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u003A\u0308\n",
new String[] { "\u0061" });
// ÷ 0061 ÷ 003A ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0061 ÷ 003A ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u003A\u000B",
new String[] { "\u0061" });
// ÷ 0061 ÷ 003A × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0061 ÷ 003A × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u003A\u0308\u000B",
new String[] { "\u0061" });
@ -2692,27 +2692,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u0061\u0027\u0308\u0001",
new String[] { "\u0061" });
// ÷ 0061 ÷ 0027 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0061 ÷ 0027 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u0027\r",
new String[] { "\u0061" });
// ÷ 0061 ÷ 0027 × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0061 ÷ 0027 × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u0027\u0308\r",
new String[] { "\u0061" });
// ÷ 0061 ÷ 0027 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0061 ÷ 0027 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u0027\n",
new String[] { "\u0061" });
// ÷ 0061 ÷ 0027 × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0061 ÷ 0027 × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u0027\u0308\n",
new String[] { "\u0061" });
// ÷ 0061 ÷ 0027 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0061 ÷ 0027 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u0027\u000B",
new String[] { "\u0061" });
// ÷ 0061 ÷ 0027 × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0061 ÷ 0027 × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u0027\u0308\u000B",
new String[] { "\u0061" });
@ -2868,27 +2868,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u0061\u0027\u2060\u0308\u0001",
new String[] { "\u0061" });
// ÷ 0061 ÷ 0027 × 2060 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0061 ÷ 0027 × 2060 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u0027\u2060\r",
new String[] { "\u0061" });
// ÷ 0061 ÷ 0027 × 2060 × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0061 ÷ 0027 × 2060 × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u0027\u2060\u0308\r",
new String[] { "\u0061" });
// ÷ 0061 ÷ 0027 × 2060 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0061 ÷ 0027 × 2060 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u0027\u2060\n",
new String[] { "\u0061" });
// ÷ 0061 ÷ 0027 × 2060 × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0061 ÷ 0027 × 2060 × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u0027\u2060\u0308\n",
new String[] { "\u0061" });
// ÷ 0061 ÷ 0027 × 2060 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0061 ÷ 0027 × 2060 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u0027\u2060\u000B",
new String[] { "\u0061" });
// ÷ 0061 ÷ 0027 × 2060 × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0061 ÷ 0027 × 2060 × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u0027\u2060\u0308\u000B",
new String[] { "\u0061" });
@ -3044,27 +3044,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u0061\u002C\u0308\u0001",
new String[] { "\u0061" });
// ÷ 0061 ÷ 002C ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0061 ÷ 002C ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u002C\r",
new String[] { "\u0061" });
// ÷ 0061 ÷ 002C × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0061 ÷ 002C × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u002C\u0308\r",
new String[] { "\u0061" });
// ÷ 0061 ÷ 002C ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0061 ÷ 002C ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u002C\n",
new String[] { "\u0061" });
// ÷ 0061 ÷ 002C × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0061 ÷ 002C × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u002C\u0308\n",
new String[] { "\u0061" });
// ÷ 0061 ÷ 002C ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0061 ÷ 002C ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u002C\u000B",
new String[] { "\u0061" });
// ÷ 0061 ÷ 002C × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0061 ÷ 002C × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0061\u002C\u0308\u000B",
new String[] { "\u0061" });
@ -3220,27 +3220,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u0031\u003A\u0308\u0001",
new String[] { "\u0031" });
// ÷ 0031 ÷ 003A ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0031 ÷ 003A ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u003A\r",
new String[] { "\u0031" });
// ÷ 0031 ÷ 003A × 0308 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0031 ÷ 003A × 0308 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u003A\u0308\r",
new String[] { "\u0031" });
// ÷ 0031 ÷ 003A ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0031 ÷ 003A ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u003A\n",
new String[] { "\u0031" });
// ÷ 0031 ÷ 003A × 0308 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0031 ÷ 003A × 0308 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u003A\u0308\n",
new String[] { "\u0031" });
// ÷ 0031 ÷ 003A ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0031 ÷ 003A ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u003A\u000B",
new String[] { "\u0031" });
// ÷ 0031 ÷ 003A × 0308 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0031 ÷ 003A × 0308 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u003A\u0308\u000B",
new String[] { "\u0031" });
@ -3396,27 +3396,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u0031\u0027\u0308\u0001",
new String[] { "\u0031" });
// ÷ 0031 ÷ 0027 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0031 ÷ 0027 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u0027\r",
new String[] { "\u0031" });
// ÷ 0031 ÷ 0027 × 0308 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0031 ÷ 0027 × 0308 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u0027\u0308\r",
new String[] { "\u0031" });
// ÷ 0031 ÷ 0027 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0031 ÷ 0027 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u0027\n",
new String[] { "\u0031" });
// ÷ 0031 ÷ 0027 × 0308 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0031 ÷ 0027 × 0308 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u0027\u0308\n",
new String[] { "\u0031" });
// ÷ 0031 ÷ 0027 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0031 ÷ 0027 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u0027\u000B",
new String[] { "\u0031" });
// ÷ 0031 ÷ 0027 × 0308 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0031 ÷ 0027 × 0308 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u0027\u0308\u000B",
new String[] { "\u0031" });
@ -3572,27 +3572,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u0031\u002C\u0308\u0001",
new String[] { "\u0031" });
// ÷ 0031 ÷ 002C ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0031 ÷ 002C ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u002C\r",
new String[] { "\u0031" });
// ÷ 0031 ÷ 002C × 0308 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0031 ÷ 002C × 0308 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u002C\u0308\r",
new String[] { "\u0031" });
// ÷ 0031 ÷ 002C ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0031 ÷ 002C ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u002C\n",
new String[] { "\u0031" });
// ÷ 0031 ÷ 002C × 0308 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0031 ÷ 002C × 0308 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u002C\u0308\n",
new String[] { "\u0031" });
// ÷ 0031 ÷ 002C ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0031 ÷ 002C ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u002C\u000B",
new String[] { "\u0031" });
// ÷ 0031 ÷ 002C × 0308 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0031 ÷ 002C × 0308 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u002C\u0308\u000B",
new String[] { "\u0031" });
@ -3748,27 +3748,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
assertAnalyzesTo(analyzer, "\u0031\u002E\u2060\u0308\u0001",
new String[] { "\u0031" });
// ÷ 0031 ÷ 002E × 2060 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0031 ÷ 002E × 2060 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u002E\u2060\r",
new String[] { "\u0031" });
// ÷ 0031 ÷ 002E × 2060 × 0308 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
// ÷ 0031 ÷ 002E × 2060 × 0308 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u002E\u2060\u0308\r",
new String[] { "\u0031" });
// ÷ 0031 ÷ 002E × 2060 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0031 ÷ 002E × 2060 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u002E\u2060\n",
new String[] { "\u0031" });
// ÷ 0031 ÷ 002E × 2060 × 0308 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
// ÷ 0031 ÷ 002E × 2060 × 0308 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u002E\u2060\u0308\n",
new String[] { "\u0031" });
// ÷ 0031 ÷ 002E × 2060 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0031 ÷ 002E × 2060 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u002E\u2060\u000B",
new String[] { "\u0031" });
// ÷ 0031 ÷ 002E × 2060 × 0308 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
// ÷ 0031 ÷ 002E × 2060 × 0308 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
assertAnalyzesTo(analyzer, "\u0031\u002E\u2060\u0308\u000B",
new String[] { "\u0031" });


@ -26,7 +26,7 @@
<import file="../analysis-module-build.xml"/>
<path id="icujar">
<pathelement location="lib/icu4j-4.8.1.1.jar"/>
<pathelement location="lib/icu4j-49.1.jar"/>
</path>
<path id="classpath">
@ -37,19 +37,32 @@
<target name="compile-core" depends="jar-analyzers-common, common.compile-core" />
<property name="gennorm2.src.dir" value="src/data/utr30"/>
<property name="gennorm2.src.files"
value="nfkc.txt nfkc_cf.txt BasicFoldings.txt DiacriticFolding.txt DingbatFolding.txt HanRadicalFolding.txt NativeDigitFolding.txt"/>
<property name="utr30.data.dir" location="src/data/utr30"/>
<target name="gen-utr30-data-files" depends="compile-tools">
<java
classname="org.apache.lucene.analysis.icu.GenerateUTR30DataFiles"
dir="${utr30.data.dir}"
fork="true"
failonerror="true">
<classpath>
<path refid="icujar"/>
<pathelement location="${build.dir}/classes/tools"/>
</classpath>
</java>
</target>
<property name="gennorm2.src.files"
value="nfc.txt nfkc.txt nfkc_cf.txt BasicFoldings.txt DiacriticFolding.txt DingbatFolding.txt HanRadicalFolding.txt NativeDigitFolding.txt"/>
<property name="gennorm2.tmp" value="${build.dir}/gennorm2/utr30.tmp"/>
<property name="gennorm2.dst" value="src/resources/org/apache/lucene/analysis/icu/utr30.nrm"/>
<target name="gennorm2">
<target name="gennorm2" depends="gen-utr30-data-files">
<echo>Note that the gennorm2 and icupkg tools must be on your PATH. These tools
are part of the ICU4C package. See http://site.icu-project.org/ </echo>
<mkdir dir="${build.dir}/gennorm2"/>
<exec executable="gennorm2" failonerror="true">
<arg value="-v"/>
<arg value="-s"/>
<arg value="${gennorm2.src.dir}"/>
<arg value="${utr30.data.dir}"/>
<arg line="${gennorm2.src.files}"/>
<arg value="-o"/>
<arg value="${gennorm2.tmp}"/>


@ -19,7 +19,7 @@
<ivy-module version="2.0">
<info organisation="org.apache.lucene" module="analyzers-icu"/>
<dependencies>
<dependency org="com.ibm.icu" name="icu4j" rev="4.8.1.1" transitive="false"/>
<dependency org="com.ibm.icu" name="icu4j" rev="49.1" transitive="false"/>
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies>
</ivy-module>


@ -1 +0,0 @@
654b7021e7bb6c5b7b35c88d23cac1022c7b3d6b


@ -0,0 +1 @@
fbf7a438e6bf3660e0da2fd77dd1df1635fe503c


@ -2,7 +2,7 @@ ICU License - ICU 1.8.1 and later
COPYRIGHT AND PERMISSION NOTICE
Copyright (c) 1995-2008 International Business Machines Corporation and others
Copyright (c) 1995-2012 International Business Machines Corporation and others
All rights reserved.


@ -1,3 +1,3 @@
ICU4J, (under modules/analysis/icu) is licensed under an MIT style license
(modules/analysis/icu/lib/icu4j-LICENSE-BSD_LIKE.txt) and Copyright (c) 1995-2011
ICU4J, (under lucene/analysis/icu) is licensed under an MIT style license
(lucene/analysis/icu/lib/icu4j-LICENSE-BSD_LIKE.txt) and Copyright (c) 1995-2012
International Business Machines Corporation and others


@ -20,67 +20,96 @@
### Custom Normalization mappings for UTR#30
### (http://www.unicode.org/reports/tr30/tr30-4.html)
###
### Created from Unicode 5.2 UCD
###
#### WARNING ####
#### Rule: lines direct content generation.
#### All non-comments will be REMOVED when this file's contents
#### are generated by 'ant gen-utr30-data-files'.
#### Use "# Rule: verbatim" to keep non-comments up until
#### the next "# Rule:" line.
#### WARNING ####
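# Illustration (hypothetical, not a directive from this file): a "Rule:" line
# whose set is [\u00B2\u00B3\u00B9] and whose target is Numeric_Value would be
# expanded by 'ant gen-utr30-data-files' into mappings such as "00B2>0032".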
## Accent removal
# See DiacriticFolding.txt
## Case Folding (done by cf)
## Canonical Duplicates Folding (done by cd)
## Dashes folding
# [[:Dash:][:Pd:]]-2053(swung dash) > U+002D
# Rule: [[[[:Dash:][:Pd:]]-[\u2053\uFE31\uFE32]] - [\u002D]] > 002D
058A>002D
05BE>002D
1400>002D
1806>002D
2010..2015>002D
207B>002D
208B>002D
2212>002D
2E17>002D
2E1A>002D
2E3A..2E3B>002D
301C>002D
3030>002D
30A0>002D
#2053>002D
2212>002D
# FE31,FE32,FE58,FE63,FF0D done by kd
FE58>002D
FE63>002D
FF0D>002D
## Greek letterforms folding (done by kd)
## Hebrew alternates folding (done by kd)
## Jamo folding (done by kd)
## Math symbol folding (done by kd)
## Native digit folding
# See NativeDigitFolding.txt
## Nobreak folding (done by kd)
## Overline Folding
FE49..FE4C>203E
## Overline Folding (done by kd)
## Positional forms folding (done by kd)
## Small forms folding (done by kd)
## Space Folding
# [:Zs:] > U+0020
# Rule: [[:Zs:] - [:Changes_When_NFKC_Casefolded=Yes:] - [\u0020]] > 0020
1680>0020
180E>0020
# 00A0, 2000..200A,202F,205F,3000 done by kd
## Spacing Accents folding (done by kd)
## Subscript folding (done by kd)
## Symbol folding (done by kd)
## Underline Folding
# Rule: verbatim
2017>005E
FE4D..FE4F>005E
## Diacritic Folding
#
# See DiacriticFolding.txt
## Vertical forms folding (done by kd)
## Han Radical Folding
# See HanRadicalFolding.txt
## Letter Form Folding (done by kd)
## Superscript folding
# Additions to kd:
# Rule: verbatim
02C0>0294
02C1>0295
06E5>0648
06E6>064A
## Suzhou Numeral Folding
# Additions to kd:
# Rule: verbatim
3021>4E00
3022>4E8C
3023>4E09
@ -92,6 +121,7 @@ FE4D..FE4F>005E
3029>4E5D
## Width Folding (done by kd)
# Punctuation Folding
# Rule: verbatim
00AB>0022
00BB>0022
201C..201E>0022


@ -24,41 +24,45 @@
### Created from Unicode 5.2 UCD
###
# Removes diacritics, as defined by [:Diacritic:]
# These may or may not be combining marks
#### WARNING ####
#### Rule: lines direct content generation.
#### All non-comments will be REMOVED when this file's contents
#### are generated by 'ant gen-utr30-data-files'.
#### Use "# Rule: verbatim" to keep non-comments up until
#### the next "# Rule:" line.
#### WARNING ####
## Remove diacritics
# Rule: [:Diacritic:] >
005E>
0060>
00B7>
02B9..02D7>
02DE>
02DF>
02E5..033F>
0342>
0346..034E>
00A8>
00AF>
00B4>
00B7..00B8>
02B0..034E>
0350..0357>
035D..0362>
0375>
0374..0375>
037A>
0384..0385>
0483..0487>
0559>
0591..05A1>
05A3..05BD>
05BF>
05C1>
05C2>
05C1..05C2>
05C4>
064B..0652>
0657>
0658>
06DF>
06E0>
06E5>
06E6>
0657..0658>
06DF..06E0>
06E5..06E6>
06EA..06EC>
0730..074A>
07A6..07B0>
07EB..07F5>
0818>
0819>
0818..0819>
08E4..08FE>
093C>
094D>
0951..0954>
@ -80,24 +84,19 @@
0E47..0E4C>
0E4E>
0EC8..0ECC>
0F18>
0F19>
0F18..0F19>
0F35>
0F37>
0F39>
0F3E>
0F3F>
0F3E..0F3F>
0F82..0F84>
0F86>
0F87>
0F86..0F87>
0FC6>
1037>
1039>
103A>
1039..103A>
1087..108D>
108F>
109A>
109B>
109A..109B>
17C9..17D3>
17DD>
1939..193B>
@ -106,31 +105,33 @@
1B34>
1B44>
1B6B..1B73>
1BAA>
1C36>
1C37>
1BAA..1BAB>
1C36..1C37>
1C78..1C7D>
1CD0..1CE8>
1CED>
1D2F>
1D3B>
1D4E>
1CF4>
1D2C..1D6A>
1DC4..1DCF>
1DFD..1DFF>
1FBD>
1FBF..1FC1>
1FCD..1FCF>
1FDD..1FDF>
1FED..1FEF>
1FFD..1FFE>
2CEF..2CF1>
2E2F>
302A..302F>
3099>
309A>
3099..309C>
30FC>
A66F>
A67C>
A67D>
A67C..A67D>
A67F>
A6F0>
A6F1>
A6F0..A6F1>
A717..A721>
A788>
A7F8..A7F9>
A8C4>
A8E0..A8F1>
A92B..A92E>
@ -139,12 +140,20 @@ A9B3>
A9C0>
AA7B>
AABF..AAC2>
ABEC>
ABED>
AAF6>
ABEC..ABED>
FB1E>
FE20..FE26>
110B9>
110BA>
FF3E>
FF40>
FF70>
FF9E..FF9F>
FFE3>
110B9..110BA>
11133..11134>
111C0>
116B6..116B7>
16F8F..16F9F>
1D167..1D169>
1D16D..1D172>
1D17B..1D182>
@ -153,6 +162,7 @@ FE20..FE26>
# Latin script "composed" that do not further decompose, so decompose here
# These are from AsciiFoldingFilter
# Rule: verbatim
00E6>0061 0065
00F0>0064
00F8>006F
@ -491,6 +501,7 @@ A7FF>004D
# Cyrillic script "composed" that do not further decompose, so decompose here
# These are from UTR#30 DiacriticFolding.txt
# Rule: verbatim
047D>0461
048B>0439
@ -520,6 +531,7 @@ A7FF>004D
04CE>043C
# Additional signs and diacritic, from examination of [:Mark:]&[:Lm:]
# Rule: verbatim
0358..035C>
05A2>
05C5>
@ -555,6 +567,7 @@ A802>
1D242..1D244>
# Additional Arabic/Hebrew decompositions
# Rule: verbatim
05F3>0027
05F4>0022
0629>0647


@ -24,8 +24,17 @@
### Created from Unicode 5.2 UCD
###
#### WARNING ####
#### Rule: lines direct content generation.
#### All non-comments will be REMOVED when this file's contents
#### are generated by 'ant gen-utr30-data-files'.
#### Use "# Rule: verbatim" to keep non-comments up until
#### the next "# Rule:" line.
#### WARNING ####
# Folds dingbats and other adorned forms
# Generated from ASCIIFoldingFilter
# Rule: verbatim
24EB>0031 0031
24EC>0031 0032
24ED>0031 0033


@ -24,6 +24,16 @@
### Created from UTR#30 HanRadicalFolding.txt
###
#### WARNING ####
#### Rule: lines direct content generation.
#### All non-comments will be REMOVED when this file's contents
#### are generated by 'ant gen-utr30-data-files'.
#### Use "# Rule: verbatim" to keep non-comments up until
#### the next "# Rule:" line.
#### WARNING ####
# Rule: verbatim
# CJK Radicals
2E81>5382
2E82>4E5B


@ -1,7 +1,7 @@
# Copyright 2001-2010 Unicode, Inc.
#
# Copyright 2001-2012 Unicode, Inc.
#
# Disclaimer
#
#
# This source code is provided as is by Unicode, Inc. No claims are
# made as to fitness for any particular purpose. No warranties of any
# kind are expressed or implied. The recipient agrees to determine
@ -9,463 +9,485 @@
# purchased on magnetic or optical media from Unicode, Inc., the
# sole remedy for any claim will be exchange of defective media
# within 90 days of receipt.
#
#
# Limitations on Rights to Redistribute This Code
#
#
# Unicode, Inc. hereby grants the right to freely use the information
# supplied in this file in the creation of products supporting the
# Unicode Standard, and to make copies of this file in any form
# for internal or external distribution as long as this notice
# remains attached.
### Custom Normalization mappings for UTR#30
### Custom Normalization mappings for UTR#30
### (http://www.unicode.org/reports/tr30/tr30-4.html)
###
### Created from Unicode 5.2 UCD
###
#### WARNING ####
#### Rule: lines direct content generation.
#### All non-comments will be REMOVED when this file's contents
#### are generated by 'ant gen-utr30-data-files'.
#### Use "# Rule: verbatim" to keep non-comments up until
#### the next "# Rule:" line.
#### WARNING ####
## Native digit folding
# [:Nd:] > Ascii digit equivalent
# Arabic-Indic
0660>0030
0661>0031
0662>0032
0663>0033
0664>0034
0665>0035
0666>0036
0667>0037
0668>0038
0669>0039
# Eastern Arabic-Indic
06F0>0030
06F1>0031
06F2>0032
06F3>0033
06F4>0034
06F5>0035
06F6>0036
06F7>0037
06F8>0038
06F9>0039
# NKo
07C0>0030
07C1>0031
07C2>0032
07C3>0033
07C4>0034
07C5>0035
07C6>0036
07C7>0037
07C8>0038
07C9>0039
# Devanagari
0966>0030
0967>0031
0968>0032
0969>0033
096A>0034
096B>0035
096C>0036
096D>0037
096E>0038
096F>0039
# Bengali
09E6>0030
09E7>0031
09E8>0032
09E9>0033
09EA>0034
09EB>0035
09EC>0036
09ED>0037
09EE>0038
09EF>0039
# Gurmukhi
0A66>0030
0A67>0031
0A68>0032
0A69>0033
0A6A>0034
0A6B>0035
0A6C>0036
0A6D>0037
0A6E>0038
0A6F>0039
# Gujarati
0AE6>0030
0AE7>0031
0AE8>0032
0AE9>0033
0AEA>0034
0AEB>0035
0AEC>0036
0AED>0037
0AEE>0038
0AEF>0039
# Oriya
0B66>0030
0B67>0031
0B68>0032
0B69>0033
0B6A>0034
0B6B>0035
0B6C>0036
0B6D>0037
0B6E>0038
0B6F>0039
# Tamil
0BE6>0030
0BE7>0031
0BE8>0032
0BE9>0033
0BEA>0034
0BEB>0035
0BEC>0036
0BED>0037
0BEE>0038
0BEF>0039
# Telugu
0C66>0030
0C67>0031
0C68>0032
0C69>0033
0C6A>0034
0C6B>0035
0C6C>0036
0C6D>0037
0C6E>0038
0C6F>0039
# Kannada
0CE6>0030
0CE7>0031
0CE8>0032
0CE9>0033
0CEA>0034
0CEB>0035
0CEC>0036
0CED>0037
0CEE>0038
0CEF>0039
# Malayalam
0D66>0030
0D67>0031
0D68>0032
0D69>0033
0D6A>0034
0D6B>0035
0D6C>0036
0D6D>0037
0D6E>0038
0D6F>0039
# Thai
0E50>0030
0E51>0031
0E52>0032
0E53>0033
0E54>0034
0E55>0035
0E56>0036
0E57>0037
0E58>0038
0E59>0039
# Lao
0ED0>0030
0ED1>0031
0ED2>0032
0ED3>0033
0ED4>0034
0ED5>0035
0ED6>0036
0ED7>0037
0ED8>0038
0ED9>0039
# Tibetan
0F20>0030
0F21>0031
0F22>0032
0F23>0033
0F24>0034
0F25>0035
0F26>0036
0F27>0037
0F28>0038
0F29>0039
# Myanmar
1040>0030
1041>0031
1042>0032
1043>0033
1044>0034
1045>0035
1046>0036
1047>0037
1048>0038
1049>0039
# Myanmar Shan
1090>0030
1091>0031
1092>0032
1093>0033
1094>0034
1095>0035
1096>0036
1097>0037
1098>0038
1099>0039
# Khmer
17E0>0030
17E1>0031
17E2>0032
17E3>0033
17E4>0034
17E5>0035
17E6>0036
17E7>0037
17E8>0038
17E9>0039
# Mongolian
1810>0030
1811>0031
1812>0032
1813>0033
1814>0034
1815>0035
1816>0036
1817>0037
1818>0038
1819>0039
# Limbu
1946>0030
1947>0031
1948>0032
1949>0033
194A>0034
194B>0035
194C>0036
194D>0037
194E>0038
194F>0039
# New Tai Lue
19D0>0030
19D1>0031
19D2>0032
19D3>0033
19D4>0034
19D5>0035
19D6>0036
19D7>0037
19D8>0038
19D9>0039
# New Tai Lue Tham Digit One
19DA>0031
# Tai Tham Hora
1A80>0030
1A81>0031
1A82>0032
1A83>0033
1A84>0034
1A85>0035
1A86>0036
1A87>0037
1A88>0038
1A89>0039
# Tai Tham Tham
1A90>0030
1A91>0031
1A92>0032
1A93>0033
1A94>0034
1A95>0035
1A96>0036
1A97>0037
1A98>0038
1A99>0039
# Balinese
1B50>0030
1B51>0031
1B52>0032
1B53>0033
1B54>0034
1B55>0035
1B56>0036
1B57>0037
1B58>0038
1B59>0039
# Sundanese
1BB0>0030
1BB1>0031
1BB2>0032
1BB3>0033
1BB4>0034
1BB5>0035
1BB6>0036
1BB7>0037
1BB8>0038
1BB9>0039
# Lepcha
1C40>0030
1C41>0031
1C42>0032
1C43>0033
1C44>0034
1C45>0035
1C46>0036
1C47>0037
1C48>0038
1C49>0039
# Ol Chiki
1C50>0030
1C51>0031
1C52>0032
1C53>0033
1C54>0034
1C55>0035
1C56>0036
1C57>0037
1C58>0038
1C59>0039
# Vai
A620>0030
A621>0031
A622>0032
A623>0033
A624>0034
A625>0035
A626>0036
A627>0037
A628>0038
A629>0039
# Saurashtra
A8D0>0030
A8D1>0031
A8D2>0032
A8D3>0033
A8D4>0034
A8D5>0035
A8D6>0036
A8D7>0037
A8D8>0038
A8D9>0039
# Kayah Li
A900>0030
A901>0031
A902>0032
A903>0033
A904>0034
A905>0035
A906>0036
A907>0037
A908>0038
A909>0039
# Javanese
A9D0>0030
A9D1>0031
A9D2>0032
A9D3>0033
A9D4>0034
A9D5>0035
A9D6>0036
A9D7>0037
A9D8>0038
A9D9>0039
# Cham
AA50>0030
AA51>0031
AA52>0032
AA53>0033
AA54>0034
AA55>0035
AA56>0036
AA57>0037
AA58>0038
AA59>0039
# Meetei Mayek
ABF0>0030
ABF1>0031
ABF2>0032
ABF3>0033
ABF4>0034
ABF5>0035
ABF6>0036
ABF7>0037
ABF8>0038
ABF9>0039
# Halfwidth and Fullwidth Forms (done by kd)
# Osmanya
104A0>0030
104A1>0031
104A2>0032
104A3>0033
104A4>0034
104A5>0035
104A6>0036
104A7>0037
104A8>0038
104A9>0039
# Brahmi
11066>0030
11067>0031
11068>0032
11069>0033
1106A>0034
1106B>0035
1106C>0036
1106D>0037
1106E>0038
1106F>0039
# Mathematical Alphanumeric Symbols - Bold digits
1D7CE>0030
1D7CF>0031
1D7D0>0032
1D7D1>0033
1D7D2>0034
1D7D3>0035
1D7D4>0036
1D7D5>0037
1D7D6>0038
1D7D7>0039
# Mathematical Alphanumeric Symbols - Double-struck digits
1D7D8>0030
1D7D9>0031
1D7DA>0032
1D7DB>0033
1D7DC>0034
1D7DD>0035
1D7DE>0036
1D7DF>0037
1D7E0>0038
1D7E1>0039
# Mathematical Alphanumeric Symbols - Sans-serif digits
1D7E2>0030
1D7E3>0031
1D7E4>0032
1D7E5>0033
1D7E6>0034
1D7E7>0035
1D7E8>0036
1D7E9>0037
1D7EA>0038
1D7EB>0039
# Mathematical Alphanumeric Symbols - Sans-serif bold digits
1D7EC>0030
1D7ED>0031
1D7EE>0032
1D7EF>0033
1D7F0>0034
1D7F1>0035
1D7F2>0036
1D7F3>0037
1D7F4>0038
1D7F5>0039
# Mathematical Alphanumeric Symbols - Monospace digits
1D7F6>0030
1D7F7>0031
1D7F8>0032
1D7F9>0033
1D7FA>0034
1D7FB>0035
1D7FC>0036
1D7FD>0037
1D7FE>0038
1D7FF>0039
# Rule: [[[:Numeric_Type=Digit:][:Nd:]] - [[:Changes_When_NFKC_Casefolded=Yes:][:Block=Superscripts_And_Subscripts:][\u00B2\u00B3\u00B9][\u0030-\u0039]]] > Numeric_Value
0660>0030 # ARABIC-INDIC DIGIT ZERO
0661>0031 # ARABIC-INDIC DIGIT ONE
0662>0032 # ARABIC-INDIC DIGIT TWO
0663>0033 # ARABIC-INDIC DIGIT THREE
0664>0034 # ARABIC-INDIC DIGIT FOUR
0665>0035 # ARABIC-INDIC DIGIT FIVE
0666>0036 # ARABIC-INDIC DIGIT SIX
0667>0037 # ARABIC-INDIC DIGIT SEVEN
0668>0038 # ARABIC-INDIC DIGIT EIGHT
0669>0039 # ARABIC-INDIC DIGIT NINE
06F0>0030 # EXTENDED ARABIC-INDIC DIGIT ZERO
06F1>0031 # EXTENDED ARABIC-INDIC DIGIT ONE
06F2>0032 # EXTENDED ARABIC-INDIC DIGIT TWO
06F3>0033 # EXTENDED ARABIC-INDIC DIGIT THREE
06F4>0034 # EXTENDED ARABIC-INDIC DIGIT FOUR
06F5>0035 # EXTENDED ARABIC-INDIC DIGIT FIVE
06F6>0036 # EXTENDED ARABIC-INDIC DIGIT SIX
06F7>0037 # EXTENDED ARABIC-INDIC DIGIT SEVEN
06F8>0038 # EXTENDED ARABIC-INDIC DIGIT EIGHT
06F9>0039 # EXTENDED ARABIC-INDIC DIGIT NINE
07C0>0030 # NKO DIGIT ZERO
07C1>0031 # NKO DIGIT ONE
07C2>0032 # NKO DIGIT TWO
07C3>0033 # NKO DIGIT THREE
07C4>0034 # NKO DIGIT FOUR
07C5>0035 # NKO DIGIT FIVE
07C6>0036 # NKO DIGIT SIX
07C7>0037 # NKO DIGIT SEVEN
07C8>0038 # NKO DIGIT EIGHT
07C9>0039 # NKO DIGIT NINE
0966>0030 # DEVANAGARI DIGIT ZERO
0967>0031 # DEVANAGARI DIGIT ONE
0968>0032 # DEVANAGARI DIGIT TWO
0969>0033 # DEVANAGARI DIGIT THREE
096A>0034 # DEVANAGARI DIGIT FOUR
096B>0035 # DEVANAGARI DIGIT FIVE
096C>0036 # DEVANAGARI DIGIT SIX
096D>0037 # DEVANAGARI DIGIT SEVEN
096E>0038 # DEVANAGARI DIGIT EIGHT
096F>0039 # DEVANAGARI DIGIT NINE
09E6>0030 # BENGALI DIGIT ZERO
09E7>0031 # BENGALI DIGIT ONE
09E8>0032 # BENGALI DIGIT TWO
09E9>0033 # BENGALI DIGIT THREE
09EA>0034 # BENGALI DIGIT FOUR
09EB>0035 # BENGALI DIGIT FIVE
09EC>0036 # BENGALI DIGIT SIX
09ED>0037 # BENGALI DIGIT SEVEN
09EE>0038 # BENGALI DIGIT EIGHT
09EF>0039 # BENGALI DIGIT NINE
0A66>0030 # GURMUKHI DIGIT ZERO
0A67>0031 # GURMUKHI DIGIT ONE
0A68>0032 # GURMUKHI DIGIT TWO
0A69>0033 # GURMUKHI DIGIT THREE
0A6A>0034 # GURMUKHI DIGIT FOUR
0A6B>0035 # GURMUKHI DIGIT FIVE
0A6C>0036 # GURMUKHI DIGIT SIX
0A6D>0037 # GURMUKHI DIGIT SEVEN
0A6E>0038 # GURMUKHI DIGIT EIGHT
0A6F>0039 # GURMUKHI DIGIT NINE
0AE6>0030 # GUJARATI DIGIT ZERO
0AE7>0031 # GUJARATI DIGIT ONE
0AE8>0032 # GUJARATI DIGIT TWO
0AE9>0033 # GUJARATI DIGIT THREE
0AEA>0034 # GUJARATI DIGIT FOUR
0AEB>0035 # GUJARATI DIGIT FIVE
0AEC>0036 # GUJARATI DIGIT SIX
0AED>0037 # GUJARATI DIGIT SEVEN
0AEE>0038 # GUJARATI DIGIT EIGHT
0AEF>0039 # GUJARATI DIGIT NINE
0B66>0030 # ORIYA DIGIT ZERO
0B67>0031 # ORIYA DIGIT ONE
0B68>0032 # ORIYA DIGIT TWO
0B69>0033 # ORIYA DIGIT THREE
0B6A>0034 # ORIYA DIGIT FOUR
0B6B>0035 # ORIYA DIGIT FIVE
0B6C>0036 # ORIYA DIGIT SIX
0B6D>0037 # ORIYA DIGIT SEVEN
0B6E>0038 # ORIYA DIGIT EIGHT
0B6F>0039 # ORIYA DIGIT NINE
0BE6>0030 # TAMIL DIGIT ZERO
0BE7>0031 # TAMIL DIGIT ONE
0BE8>0032 # TAMIL DIGIT TWO
0BE9>0033 # TAMIL DIGIT THREE
0BEA>0034 # TAMIL DIGIT FOUR
0BEB>0035 # TAMIL DIGIT FIVE
0BEC>0036 # TAMIL DIGIT SIX
0BED>0037 # TAMIL DIGIT SEVEN
0BEE>0038 # TAMIL DIGIT EIGHT
0BEF>0039 # TAMIL DIGIT NINE
0C66>0030 # TELUGU DIGIT ZERO
0C67>0031 # TELUGU DIGIT ONE
0C68>0032 # TELUGU DIGIT TWO
0C69>0033 # TELUGU DIGIT THREE
0C6A>0034 # TELUGU DIGIT FOUR
0C6B>0035 # TELUGU DIGIT FIVE
0C6C>0036 # TELUGU DIGIT SIX
0C6D>0037 # TELUGU DIGIT SEVEN
0C6E>0038 # TELUGU DIGIT EIGHT
0C6F>0039 # TELUGU DIGIT NINE
0CE6>0030 # KANNADA DIGIT ZERO
0CE7>0031 # KANNADA DIGIT ONE
0CE8>0032 # KANNADA DIGIT TWO
0CE9>0033 # KANNADA DIGIT THREE
0CEA>0034 # KANNADA DIGIT FOUR
0CEB>0035 # KANNADA DIGIT FIVE
0CEC>0036 # KANNADA DIGIT SIX
0CED>0037 # KANNADA DIGIT SEVEN
0CEE>0038 # KANNADA DIGIT EIGHT
0CEF>0039 # KANNADA DIGIT NINE
0D66>0030 # MALAYALAM DIGIT ZERO
0D67>0031 # MALAYALAM DIGIT ONE
0D68>0032 # MALAYALAM DIGIT TWO
0D69>0033 # MALAYALAM DIGIT THREE
0D6A>0034 # MALAYALAM DIGIT FOUR
0D6B>0035 # MALAYALAM DIGIT FIVE
0D6C>0036 # MALAYALAM DIGIT SIX
0D6D>0037 # MALAYALAM DIGIT SEVEN
0D6E>0038 # MALAYALAM DIGIT EIGHT
0D6F>0039 # MALAYALAM DIGIT NINE
0E50>0030 # THAI DIGIT ZERO
0E51>0031 # THAI DIGIT ONE
0E52>0032 # THAI DIGIT TWO
0E53>0033 # THAI DIGIT THREE
0E54>0034 # THAI DIGIT FOUR
0E55>0035 # THAI DIGIT FIVE
0E56>0036 # THAI DIGIT SIX
0E57>0037 # THAI DIGIT SEVEN
0E58>0038 # THAI DIGIT EIGHT
0E59>0039 # THAI DIGIT NINE
0ED0>0030 # LAO DIGIT ZERO
0ED1>0031 # LAO DIGIT ONE
0ED2>0032 # LAO DIGIT TWO
0ED3>0033 # LAO DIGIT THREE
0ED4>0034 # LAO DIGIT FOUR
0ED5>0035 # LAO DIGIT FIVE
0ED6>0036 # LAO DIGIT SIX
0ED7>0037 # LAO DIGIT SEVEN
0ED8>0038 # LAO DIGIT EIGHT
0ED9>0039 # LAO DIGIT NINE
0F20>0030 # TIBETAN DIGIT ZERO
0F21>0031 # TIBETAN DIGIT ONE
0F22>0032 # TIBETAN DIGIT TWO
0F23>0033 # TIBETAN DIGIT THREE
0F24>0034 # TIBETAN DIGIT FOUR
0F25>0035 # TIBETAN DIGIT FIVE
0F26>0036 # TIBETAN DIGIT SIX
0F27>0037 # TIBETAN DIGIT SEVEN
0F28>0038 # TIBETAN DIGIT EIGHT
0F29>0039 # TIBETAN DIGIT NINE
1040>0030 # MYANMAR DIGIT ZERO
1041>0031 # MYANMAR DIGIT ONE
1042>0032 # MYANMAR DIGIT TWO
1043>0033 # MYANMAR DIGIT THREE
1044>0034 # MYANMAR DIGIT FOUR
1045>0035 # MYANMAR DIGIT FIVE
1046>0036 # MYANMAR DIGIT SIX
1047>0037 # MYANMAR DIGIT SEVEN
1048>0038 # MYANMAR DIGIT EIGHT
1049>0039 # MYANMAR DIGIT NINE
1090>0030 # MYANMAR SHAN DIGIT ZERO
1091>0031 # MYANMAR SHAN DIGIT ONE
1092>0032 # MYANMAR SHAN DIGIT TWO
1093>0033 # MYANMAR SHAN DIGIT THREE
1094>0034 # MYANMAR SHAN DIGIT FOUR
1095>0035 # MYANMAR SHAN DIGIT FIVE
1096>0036 # MYANMAR SHAN DIGIT SIX
1097>0037 # MYANMAR SHAN DIGIT SEVEN
1098>0038 # MYANMAR SHAN DIGIT EIGHT
1099>0039 # MYANMAR SHAN DIGIT NINE
1369>0031 # ETHIOPIC DIGIT ONE
136A>0032 # ETHIOPIC DIGIT TWO
136B>0033 # ETHIOPIC DIGIT THREE
136C>0034 # ETHIOPIC DIGIT FOUR
136D>0035 # ETHIOPIC DIGIT FIVE
136E>0036 # ETHIOPIC DIGIT SIX
136F>0037 # ETHIOPIC DIGIT SEVEN
1370>0038 # ETHIOPIC DIGIT EIGHT
1371>0039 # ETHIOPIC DIGIT NINE
17E0>0030 # KHMER DIGIT ZERO
17E1>0031 # KHMER DIGIT ONE
17E2>0032 # KHMER DIGIT TWO
17E3>0033 # KHMER DIGIT THREE
17E4>0034 # KHMER DIGIT FOUR
17E5>0035 # KHMER DIGIT FIVE
17E6>0036 # KHMER DIGIT SIX
17E7>0037 # KHMER DIGIT SEVEN
17E8>0038 # KHMER DIGIT EIGHT
17E9>0039 # KHMER DIGIT NINE
1810>0030 # MONGOLIAN DIGIT ZERO
1811>0031 # MONGOLIAN DIGIT ONE
1812>0032 # MONGOLIAN DIGIT TWO
1813>0033 # MONGOLIAN DIGIT THREE
1814>0034 # MONGOLIAN DIGIT FOUR
1815>0035 # MONGOLIAN DIGIT FIVE
1816>0036 # MONGOLIAN DIGIT SIX
1817>0037 # MONGOLIAN DIGIT SEVEN
1818>0038 # MONGOLIAN DIGIT EIGHT
1819>0039 # MONGOLIAN DIGIT NINE
1946>0030 # LIMBU DIGIT ZERO
1947>0031 # LIMBU DIGIT ONE
1948>0032 # LIMBU DIGIT TWO
1949>0033 # LIMBU DIGIT THREE
194A>0034 # LIMBU DIGIT FOUR
194B>0035 # LIMBU DIGIT FIVE
194C>0036 # LIMBU DIGIT SIX
194D>0037 # LIMBU DIGIT SEVEN
194E>0038 # LIMBU DIGIT EIGHT
194F>0039 # LIMBU DIGIT NINE
19D0>0030 # NEW TAI LUE DIGIT ZERO
19D1>0031 # NEW TAI LUE DIGIT ONE
19D2>0032 # NEW TAI LUE DIGIT TWO
19D3>0033 # NEW TAI LUE DIGIT THREE
19D4>0034 # NEW TAI LUE DIGIT FOUR
19D5>0035 # NEW TAI LUE DIGIT FIVE
19D6>0036 # NEW TAI LUE DIGIT SIX
19D7>0037 # NEW TAI LUE DIGIT SEVEN
19D8>0038 # NEW TAI LUE DIGIT EIGHT
19D9>0039 # NEW TAI LUE DIGIT NINE
19DA>0031 # NEW TAI LUE THAM DIGIT ONE
1A80>0030 # TAI THAM HORA DIGIT ZERO
1A81>0031 # TAI THAM HORA DIGIT ONE
1A82>0032 # TAI THAM HORA DIGIT TWO
1A83>0033 # TAI THAM HORA DIGIT THREE
1A84>0034 # TAI THAM HORA DIGIT FOUR
1A85>0035 # TAI THAM HORA DIGIT FIVE
1A86>0036 # TAI THAM HORA DIGIT SIX
1A87>0037 # TAI THAM HORA DIGIT SEVEN
1A88>0038 # TAI THAM HORA DIGIT EIGHT
1A89>0039 # TAI THAM HORA DIGIT NINE
1A90>0030 # TAI THAM THAM DIGIT ZERO
1A91>0031 # TAI THAM THAM DIGIT ONE
1A92>0032 # TAI THAM THAM DIGIT TWO
1A93>0033 # TAI THAM THAM DIGIT THREE
1A94>0034 # TAI THAM THAM DIGIT FOUR
1A95>0035 # TAI THAM THAM DIGIT FIVE
1A96>0036 # TAI THAM THAM DIGIT SIX
1A97>0037 # TAI THAM THAM DIGIT SEVEN
1A98>0038 # TAI THAM THAM DIGIT EIGHT
1A99>0039 # TAI THAM THAM DIGIT NINE
1B50>0030 # BALINESE DIGIT ZERO
1B51>0031 # BALINESE DIGIT ONE
1B52>0032 # BALINESE DIGIT TWO
1B53>0033 # BALINESE DIGIT THREE
1B54>0034 # BALINESE DIGIT FOUR
1B55>0035 # BALINESE DIGIT FIVE
1B56>0036 # BALINESE DIGIT SIX
1B57>0037 # BALINESE DIGIT SEVEN
1B58>0038 # BALINESE DIGIT EIGHT
1B59>0039 # BALINESE DIGIT NINE
1BB0>0030 # SUNDANESE DIGIT ZERO
1BB1>0031 # SUNDANESE DIGIT ONE
1BB2>0032 # SUNDANESE DIGIT TWO
1BB3>0033 # SUNDANESE DIGIT THREE
1BB4>0034 # SUNDANESE DIGIT FOUR
1BB5>0035 # SUNDANESE DIGIT FIVE
1BB6>0036 # SUNDANESE DIGIT SIX
1BB7>0037 # SUNDANESE DIGIT SEVEN
1BB8>0038 # SUNDANESE DIGIT EIGHT
1BB9>0039 # SUNDANESE DIGIT NINE
1C40>0030 # LEPCHA DIGIT ZERO
1C41>0031 # LEPCHA DIGIT ONE
1C42>0032 # LEPCHA DIGIT TWO
1C43>0033 # LEPCHA DIGIT THREE
1C44>0034 # LEPCHA DIGIT FOUR
1C45>0035 # LEPCHA DIGIT FIVE
1C46>0036 # LEPCHA DIGIT SIX
1C47>0037 # LEPCHA DIGIT SEVEN
1C48>0038 # LEPCHA DIGIT EIGHT
1C49>0039 # LEPCHA DIGIT NINE
1C50>0030 # OL CHIKI DIGIT ZERO
1C51>0031 # OL CHIKI DIGIT ONE
1C52>0032 # OL CHIKI DIGIT TWO
1C53>0033 # OL CHIKI DIGIT THREE
1C54>0034 # OL CHIKI DIGIT FOUR
1C55>0035 # OL CHIKI DIGIT FIVE
1C56>0036 # OL CHIKI DIGIT SIX
1C57>0037 # OL CHIKI DIGIT SEVEN
1C58>0038 # OL CHIKI DIGIT EIGHT
1C59>0039 # OL CHIKI DIGIT NINE
24F5>0031 # DOUBLE CIRCLED DIGIT ONE
24F6>0032 # DOUBLE CIRCLED DIGIT TWO
24F7>0033 # DOUBLE CIRCLED DIGIT THREE
24F8>0034 # DOUBLE CIRCLED DIGIT FOUR
24F9>0035 # DOUBLE CIRCLED DIGIT FIVE
24FA>0036 # DOUBLE CIRCLED DIGIT SIX
24FB>0037 # DOUBLE CIRCLED DIGIT SEVEN
24FC>0038 # DOUBLE CIRCLED DIGIT EIGHT
24FD>0039 # DOUBLE CIRCLED DIGIT NINE
24FF>0030 # NEGATIVE CIRCLED DIGIT ZERO
2776>0031 # DINGBAT NEGATIVE CIRCLED DIGIT ONE
2777>0032 # DINGBAT NEGATIVE CIRCLED DIGIT TWO
2778>0033 # DINGBAT NEGATIVE CIRCLED DIGIT THREE
2779>0034 # DINGBAT NEGATIVE CIRCLED DIGIT FOUR
277A>0035 # DINGBAT NEGATIVE CIRCLED DIGIT FIVE
277B>0036 # DINGBAT NEGATIVE CIRCLED DIGIT SIX
277C>0037 # DINGBAT NEGATIVE CIRCLED DIGIT SEVEN
277D>0038 # DINGBAT NEGATIVE CIRCLED DIGIT EIGHT
277E>0039 # DINGBAT NEGATIVE CIRCLED DIGIT NINE
2780>0031 # DINGBAT CIRCLED SANS-SERIF DIGIT ONE
2781>0032 # DINGBAT CIRCLED SANS-SERIF DIGIT TWO
2782>0033 # DINGBAT CIRCLED SANS-SERIF DIGIT THREE
2783>0034 # DINGBAT CIRCLED SANS-SERIF DIGIT FOUR
2784>0035 # DINGBAT CIRCLED SANS-SERIF DIGIT FIVE
2785>0036 # DINGBAT CIRCLED SANS-SERIF DIGIT SIX
2786>0037 # DINGBAT CIRCLED SANS-SERIF DIGIT SEVEN
2787>0038 # DINGBAT CIRCLED SANS-SERIF DIGIT EIGHT
2788>0039 # DINGBAT CIRCLED SANS-SERIF DIGIT NINE
278A>0031 # DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ONE
278B>0032 # DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT TWO
278C>0033 # DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT THREE
278D>0034 # DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT FOUR
278E>0035 # DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT FIVE
278F>0036 # DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT SIX
2790>0037 # DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT SEVEN
2791>0038 # DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT EIGHT
2792>0039 # DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT NINE
A620>0030 # VAI DIGIT ZERO
A621>0031 # VAI DIGIT ONE
A622>0032 # VAI DIGIT TWO
A623>0033 # VAI DIGIT THREE
A624>0034 # VAI DIGIT FOUR
A625>0035 # VAI DIGIT FIVE
A626>0036 # VAI DIGIT SIX
A627>0037 # VAI DIGIT SEVEN
A628>0038 # VAI DIGIT EIGHT
A629>0039 # VAI DIGIT NINE
A8D0>0030 # SAURASHTRA DIGIT ZERO
A8D1>0031 # SAURASHTRA DIGIT ONE
A8D2>0032 # SAURASHTRA DIGIT TWO
A8D3>0033 # SAURASHTRA DIGIT THREE
A8D4>0034 # SAURASHTRA DIGIT FOUR
A8D5>0035 # SAURASHTRA DIGIT FIVE
A8D6>0036 # SAURASHTRA DIGIT SIX
A8D7>0037 # SAURASHTRA DIGIT SEVEN
A8D8>0038 # SAURASHTRA DIGIT EIGHT
A8D9>0039 # SAURASHTRA DIGIT NINE
A900>0030 # KAYAH LI DIGIT ZERO
A901>0031 # KAYAH LI DIGIT ONE
A902>0032 # KAYAH LI DIGIT TWO
A903>0033 # KAYAH LI DIGIT THREE
A904>0034 # KAYAH LI DIGIT FOUR
A905>0035 # KAYAH LI DIGIT FIVE
A906>0036 # KAYAH LI DIGIT SIX
A907>0037 # KAYAH LI DIGIT SEVEN
A908>0038 # KAYAH LI DIGIT EIGHT
A909>0039 # KAYAH LI DIGIT NINE
A9D0>0030 # JAVANESE DIGIT ZERO
A9D1>0031 # JAVANESE DIGIT ONE
A9D2>0032 # JAVANESE DIGIT TWO
A9D3>0033 # JAVANESE DIGIT THREE
A9D4>0034 # JAVANESE DIGIT FOUR
A9D5>0035 # JAVANESE DIGIT FIVE
A9D6>0036 # JAVANESE DIGIT SIX
A9D7>0037 # JAVANESE DIGIT SEVEN
A9D8>0038 # JAVANESE DIGIT EIGHT
A9D9>0039 # JAVANESE DIGIT NINE
AA50>0030 # CHAM DIGIT ZERO
AA51>0031 # CHAM DIGIT ONE
AA52>0032 # CHAM DIGIT TWO
AA53>0033 # CHAM DIGIT THREE
AA54>0034 # CHAM DIGIT FOUR
AA55>0035 # CHAM DIGIT FIVE
AA56>0036 # CHAM DIGIT SIX
AA57>0037 # CHAM DIGIT SEVEN
AA58>0038 # CHAM DIGIT EIGHT
AA59>0039 # CHAM DIGIT NINE
ABF0>0030 # MEETEI MAYEK DIGIT ZERO
ABF1>0031 # MEETEI MAYEK DIGIT ONE
ABF2>0032 # MEETEI MAYEK DIGIT TWO
ABF3>0033 # MEETEI MAYEK DIGIT THREE
ABF4>0034 # MEETEI MAYEK DIGIT FOUR
ABF5>0035 # MEETEI MAYEK DIGIT FIVE
ABF6>0036 # MEETEI MAYEK DIGIT SIX
ABF7>0037 # MEETEI MAYEK DIGIT SEVEN
ABF8>0038 # MEETEI MAYEK DIGIT EIGHT
ABF9>0039 # MEETEI MAYEK DIGIT NINE
104A0>0030 # OSMANYA DIGIT ZERO
104A1>0031 # OSMANYA DIGIT ONE
104A2>0032 # OSMANYA DIGIT TWO
104A3>0033 # OSMANYA DIGIT THREE
104A4>0034 # OSMANYA DIGIT FOUR
104A5>0035 # OSMANYA DIGIT FIVE
104A6>0036 # OSMANYA DIGIT SIX
104A7>0037 # OSMANYA DIGIT SEVEN
104A8>0038 # OSMANYA DIGIT EIGHT
104A9>0039 # OSMANYA DIGIT NINE
10A40>0031 # KHAROSHTHI DIGIT ONE
10A41>0032 # KHAROSHTHI DIGIT TWO
10A42>0033 # KHAROSHTHI DIGIT THREE
10A43>0034 # KHAROSHTHI DIGIT FOUR
10E60>0031 # RUMI DIGIT ONE
10E61>0032 # RUMI DIGIT TWO
10E62>0033 # RUMI DIGIT THREE
10E63>0034 # RUMI DIGIT FOUR
10E64>0035 # RUMI DIGIT FIVE
10E65>0036 # RUMI DIGIT SIX
10E66>0037 # RUMI DIGIT SEVEN
10E67>0038 # RUMI DIGIT EIGHT
10E68>0039 # RUMI DIGIT NINE
11052>0031 # BRAHMI NUMBER ONE
11053>0032 # BRAHMI NUMBER TWO
11054>0033 # BRAHMI NUMBER THREE
11055>0034 # BRAHMI NUMBER FOUR
11056>0035 # BRAHMI NUMBER FIVE
11057>0036 # BRAHMI NUMBER SIX
11058>0037 # BRAHMI NUMBER SEVEN
11059>0038 # BRAHMI NUMBER EIGHT
1105A>0039 # BRAHMI NUMBER NINE
11066>0030 # BRAHMI DIGIT ZERO
11067>0031 # BRAHMI DIGIT ONE
11068>0032 # BRAHMI DIGIT TWO
11069>0033 # BRAHMI DIGIT THREE
1106A>0034 # BRAHMI DIGIT FOUR
1106B>0035 # BRAHMI DIGIT FIVE
1106C>0036 # BRAHMI DIGIT SIX
1106D>0037 # BRAHMI DIGIT SEVEN
1106E>0038 # BRAHMI DIGIT EIGHT
1106F>0039 # BRAHMI DIGIT NINE
110F0>0030 # SORA SOMPENG DIGIT ZERO
110F1>0031 # SORA SOMPENG DIGIT ONE
110F2>0032 # SORA SOMPENG DIGIT TWO
110F3>0033 # SORA SOMPENG DIGIT THREE
110F4>0034 # SORA SOMPENG DIGIT FOUR
110F5>0035 # SORA SOMPENG DIGIT FIVE
110F6>0036 # SORA SOMPENG DIGIT SIX
110F7>0037 # SORA SOMPENG DIGIT SEVEN
110F8>0038 # SORA SOMPENG DIGIT EIGHT
110F9>0039 # SORA SOMPENG DIGIT NINE
11136>0030 # CHAKMA DIGIT ZERO
11137>0031 # CHAKMA DIGIT ONE
11138>0032 # CHAKMA DIGIT TWO
11139>0033 # CHAKMA DIGIT THREE
1113A>0034 # CHAKMA DIGIT FOUR
1113B>0035 # CHAKMA DIGIT FIVE
1113C>0036 # CHAKMA DIGIT SIX
1113D>0037 # CHAKMA DIGIT SEVEN
1113E>0038 # CHAKMA DIGIT EIGHT
1113F>0039 # CHAKMA DIGIT NINE
111D0>0030 # SHARADA DIGIT ZERO
111D1>0031 # SHARADA DIGIT ONE
111D2>0032 # SHARADA DIGIT TWO
111D3>0033 # SHARADA DIGIT THREE
111D4>0034 # SHARADA DIGIT FOUR
111D5>0035 # SHARADA DIGIT FIVE
111D6>0036 # SHARADA DIGIT SIX
111D7>0037 # SHARADA DIGIT SEVEN
111D8>0038 # SHARADA DIGIT EIGHT
111D9>0039 # SHARADA DIGIT NINE
116C0>0030 # TAKRI DIGIT ZERO
116C1>0031 # TAKRI DIGIT ONE
116C2>0032 # TAKRI DIGIT TWO
116C3>0033 # TAKRI DIGIT THREE
116C4>0034 # TAKRI DIGIT FOUR
116C5>0035 # TAKRI DIGIT FIVE
116C6>0036 # TAKRI DIGIT SIX
116C7>0037 # TAKRI DIGIT SEVEN
116C8>0038 # TAKRI DIGIT EIGHT
116C9>0039 # TAKRI DIGIT NINE

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -1,52 +1,18 @@
# Copyright 2001-2010 Unicode, Inc.
#
# Disclaimer
#
# This source code is provided as is by Unicode, Inc. No claims are
# made as to fitness for any particular purpose. No warranties of any
# kind are expressed or implied. The recipient agrees to determine
# applicability of information provided. If this file has been
# purchased on magnetic or optical media from Unicode, Inc., the
# sole remedy for any claim will be exchange of defective media
# within 90 days of receipt.
#
# Limitations on Rights to Redistribute This Code
#
# Unicode, Inc. hereby grants the right to freely use the information
# supplied in this file in the creation of products supporting the
# Unicode Standard, and to make copies of this file in any form
# for internal or external distribution as long as this notice
# remains attached.
#
# Extracted from:
# DerivedNormalizationProps-6.0.0.txt
# Date: 2010-05-20, 15:14:12 GMT [MD]
#
# Unicode Character Database
# Copyright (c) 1991-2010 Unicode, Inc.
# Copyright (c) 1991-2012 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
# file name: nfkc_cf.txt
#
# machine-generated by ICU preparseucd.py
#
# This file contains the Unicode NFKC_CF mappings,
# extracted from the UCD file DerivedNormalizationProps.txt,
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
# ================================================
# This file has been reformatted into syntax for the
# gennorm2 Normalizer2 data generator tool.
# Only the NFKC_CF mappings are retained and reformatted.
# Reformatting via regular expression: s/ *; NFKC_CF; */>/
# Use this file as the second gennorm2 input file after nfkc.txt.
# ================================================
# Derived Property: NFKC_Casefold (NFKC_CF)
# This property removes certain variations from characters: case, compatibility, and default-ignorables.
# It is used for loose matching and certain types of identifiers.
# It is constructed by applying NFKC, CaseFolding, and removal of Default_Ignorable_Code_Points.
# The process of applying these transformations is repeated until a stable result is produced.
# WARNING: Application to STRINGS must apply NFC after mapping each character, because characters may interact.
# For more information, see [http://www.unicode.org/reports/tr44/]
# Omitted code points are unchanged by this mapping.
# @missing: 0000..10FFFF; NFKC_CF; <code point>
# All code points not explicitly listed for NFKC_Casefold
# have the value <codepoint>.
* Unicode 6.1.0
0041>0061
0042>0062
@ -656,6 +622,8 @@
10C3>2D23
10C4>2D24
10C5>2D25
10C7>2D27
10CD>2D2D
10FC>10DC
115F..1160>
17B4..17B5>
@ -1061,9 +1029,7 @@
2049>0021 003F
2057>2032 2032 2032 2032
205F>0020
2060..2064>
2065..2069>
206A..206F>
2060..206F>
2070>0030
2071>0069
2074>0034
@ -1470,6 +1436,7 @@
2CE2>2CE3
2CEB>2CEC
2CED>2CEE
2CF2>2CF3
2D6F>2D61
2E9F>6BCD
2EF3>9F9F
@ -2390,11 +2357,15 @@ A786>A787
A78B>A78C
A78D>0265
A790>A791
A792>A793
A7A0>A7A1
A7A2>A7A3
A7A4>A7A5
A7A6>A7A7
A7A8>A7A9
A7AA>0266
A7F8>0127
A7F9>0153
F900>8C48
F901>66F4
F902>8ECA
@ -2684,6 +2655,8 @@ FA2A>98EF
FA2B>98FC
FA2C>9928
FA2D>9DB4
FA2E>90DE
FA2F>96B7
FA30>4FAE
FA31>50E7
FA32>514D
@ -4773,6 +4746,147 @@ FFF0..FFF8>
1D7FD>0037
1D7FE>0038
1D7FF>0039
1EE00>0627
1EE01>0628
1EE02>062C
1EE03>062F
1EE05>0648
1EE06>0632
1EE07>062D
1EE08>0637
1EE09>064A
1EE0A>0643
1EE0B>0644
1EE0C>0645
1EE0D>0646
1EE0E>0633
1EE0F>0639
1EE10>0641
1EE11>0635
1EE12>0642
1EE13>0631
1EE14>0634
1EE15>062A
1EE16>062B
1EE17>062E
1EE18>0630
1EE19>0636
1EE1A>0638
1EE1B>063A
1EE1C>066E
1EE1D>06BA
1EE1E>06A1
1EE1F>066F
1EE21>0628
1EE22>062C
1EE24>0647
1EE27>062D
1EE29>064A
1EE2A>0643
1EE2B>0644
1EE2C>0645
1EE2D>0646
1EE2E>0633
1EE2F>0639
1EE30>0641
1EE31>0635
1EE32>0642
1EE34>0634
1EE35>062A
1EE36>062B
1EE37>062E
1EE39>0636
1EE3B>063A
1EE42>062C
1EE47>062D
1EE49>064A
1EE4B>0644
1EE4D>0646
1EE4E>0633
1EE4F>0639
1EE51>0635
1EE52>0642
1EE54>0634
1EE57>062E
1EE59>0636
1EE5B>063A
1EE5D>06BA
1EE5F>066F
1EE61>0628
1EE62>062C
1EE64>0647
1EE67>062D
1EE68>0637
1EE69>064A
1EE6A>0643
1EE6C>0645
1EE6D>0646
1EE6E>0633
1EE6F>0639
1EE70>0641
1EE71>0635
1EE72>0642
1EE74>0634
1EE75>062A
1EE76>062B
1EE77>062E
1EE79>0636
1EE7A>0638
1EE7B>063A
1EE7C>066E
1EE7E>06A1
1EE80>0627
1EE81>0628
1EE82>062C
1EE83>062F
1EE84>0647
1EE85>0648
1EE86>0632
1EE87>062D
1EE88>0637
1EE89>064A
1EE8B>0644
1EE8C>0645
1EE8D>0646
1EE8E>0633
1EE8F>0639
1EE90>0641
1EE91>0635
1EE92>0642
1EE93>0631
1EE94>0634
1EE95>062A
1EE96>062B
1EE97>062E
1EE98>0630
1EE99>0636
1EE9A>0638
1EE9B>063A
1EEA1>0628
1EEA2>062C
1EEA3>062F
1EEA5>0648
1EEA6>0632
1EEA7>062D
1EEA8>0637
1EEA9>064A
1EEAB>0644
1EEAC>0645
1EEAD>0646
1EEAE>0633
1EEAF>0639
1EEB0>0641
1EEB1>0635
1EEB2>0642
1EEB3>0631
1EEB4>0634
1EEB5>062A
1EEB6>062B
1EEB7>062E
1EEB8>0630
1EEB9>0636
1EEBA>0638
1EEBB>063A
1F100>0030 002E
1F101>0030 002C
1F102>0031 002C
@ -4847,6 +4961,8 @@ FFF0..FFF8>
1F14D>0073 0073
1F14E>0070 0070 0076
1F14F>0077 0063
1F16A>006D 0063
1F16B>006D 0064
1F190>0064 006A
1F200>307B 304B
1F201>30B3 30B3
@ -5437,12 +5553,4 @@ FFF0..FFF8>
2FA1B>9F16
2FA1C>9F3B
2FA1D>2A600
E0000>
E0001>
E0002..E001F>
E0020..E007F>
E0080..E00FF>
E0100..E01EF>
E01F0..E0FFF>
# Total code points: 9792
E0000..E0FFF>


@ -0,0 +1,273 @@
package org.apache.lucene.analysis.icu;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Downloads/generates lucene/analysis/icu/src/data/utr30/*.txt
*
* ASSUMPTION: This class will be run with current directory set to
* lucene/analysis/icu/src/data/utr30/
*
* <ol>
* <li>
* Downloads nfc.txt, nfkc.txt and nfkc_cf.txt from icu-project.org,
* overwriting the versions in lucene/analysis/icu/src/data/utr30/.
* </li>
* <li>
* Converts round-trip mappings in nfc.txt (containing '=')
* that map to at least one [:Diacritic:] character
* into one-way mappings ('>' instead of '=').
* </li>
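* <li>
* Expands each "# Rule:" directive in the other *.txt files in this
* directory, regenerating the mapping lines that follow it (see
* expandRulesInUTR30DataFiles below).
* </li>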
* </ol>
*/
public class GenerateUTR30DataFiles {
private static final String ICU_SVN_TAG_URL
= "http://source.icu-project.org/repos/icu/icu/tags";
private static final String ICU_RELEASE_TAG = "release-49-1-2";
private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
private static final String NFC_TXT = "nfc.txt";
private static final String NFKC_TXT = "nfkc.txt";
private static final String NFKC_CF_TXT = "nfkc_cf.txt";
private static byte[] DOWNLOAD_BUFFER = new byte[8192];
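// Line-classification patterns for the utr30 data files: round-trip
// mappings ("lhs = rhs") in nfc.txt, "# Rule: verbatim" guards,
// "# Rule: <UnicodeSet> > <target>" generation directives, blank/comment
// lines, and right-hand sides that request per-code-point Numeric_Value
// output.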
private static final Pattern ROUND_TRIP_MAPPING_LINE_PATTERN
= Pattern.compile("^\\s*([^=]+?)\\s*=\\s*(.*)$");
private static final Pattern VERBATIM_RULE_LINE_PATTERN
= Pattern.compile("^#\\s*Rule:\\s*verbatim\\s*$", Pattern.CASE_INSENSITIVE);
private static final Pattern RULE_LINE_PATTERN
= Pattern.compile("^#\\s*Rule:\\s*(.*)>(.*)", Pattern.CASE_INSENSITIVE);
private static final Pattern BLANK_OR_COMMENT_LINE_PATTERN
= Pattern.compile("^\\s*(?:#.*)?$");
private static final Pattern NUMERIC_VALUE_PATTERN
= Pattern.compile("Numeric[-\\s_]*Value", Pattern.CASE_INSENSITIVE);
public static void main(String args[]) {
try {
getNFKCDataFilesFromIcuProject();
expandRulesInUTR30DataFiles();
} catch (Throwable t) {
t.printStackTrace(System.err);
System.exit(1);
}
}
private static void expandRulesInUTR30DataFiles() throws IOException {
FileFilter filter = new FileFilter() {
@Override
public boolean accept(File pathname) {
String name = pathname.getName();
return pathname.isFile() && name.matches(".*\\.(?s:txt)")
&& ! name.equals(NFC_TXT) && ! name.equals(NFKC_TXT)
&& ! name.equals(NFKC_CF_TXT);
}
};
for (File file : new File(".").listFiles(filter)) {
expandDataFileRules(file);
}
}
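// Rewrites one data file in place: comment lines pass through, each
// "# Rule: <set> > <target>" directive is re-expanded into fresh mapping
// lines, and previously generated non-comment lines are dropped unless
// guarded by a preceding "# Rule: verbatim".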
private static void expandDataFileRules(File file) throws IOException {
final FileInputStream stream = new FileInputStream(file);
final InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
final BufferedReader bufferedReader = new BufferedReader(reader);
StringBuilder builder = new StringBuilder();
String line;
boolean verbatim = false;
boolean modified = false;
int lineNum = 0;
try {
while (null != (line = bufferedReader.readLine())) {
++lineNum;
if (VERBATIM_RULE_LINE_PATTERN.matcher(line).matches()) {
verbatim = true;
builder.append(line).append("\n");
} else {
Matcher ruleMatcher = RULE_LINE_PATTERN.matcher(line);
if (ruleMatcher.matches()) {
verbatim = false;
builder.append(line).append("\n");
try {
String leftHandSide = ruleMatcher.group(1).trim();
String rightHandSide = ruleMatcher.group(2).trim();
expandSingleRule(builder, leftHandSide, rightHandSide);
} catch (IllegalArgumentException e) {
System.err.println
("ERROR in " + file.getName() + " line #" + lineNum + ":");
e.printStackTrace(System.err);
System.exit(1);
}
modified = true;
} else {
if (BLANK_OR_COMMENT_LINE_PATTERN.matcher(line).matches()) {
builder.append(line).append("\n");
} else {
if (verbatim) {
builder.append(line).append("\n");
} else {
modified = true;
}
}
}
}
}
} finally {
bufferedReader.close();
}
if (modified) {
System.err.println("Expanding rules in and overwriting " + file.getName());
final FileOutputStream out = new FileOutputStream(file, false);
Writer writer = new OutputStreamWriter(out, "UTF-8");
try {
writer.write(builder.toString());
} finally {
writer.close();
}
}
}
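// Fetches nfkc.txt, nfkc_cf.txt and nfc.txt from the ICU release tag.
// While copying nfc.txt, any round-trip mapping ("=") whose right-hand
// side contains a [:Diacritic:] character (or U+0653..U+0656, which
// gennorm2 rejects in round-trip mappings) is rewritten as a one-way
// mapping (">").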
private static void getNFKCDataFilesFromIcuProject() throws IOException {
URL icuTagsURL = new URL(ICU_SVN_TAG_URL + "/");
URL icuReleaseTagURL = new URL(icuTagsURL, ICU_RELEASE_TAG + "/");
URL norm2url = new URL(icuReleaseTagURL, ICU_DATA_NORM2_PATH + "/");
System.err.print("Downloading " + NFKC_TXT + " ... ");
download(new URL(norm2url, NFKC_TXT), NFKC_TXT);
System.err.println("done.");
System.err.print("Downloading " + NFKC_CF_TXT + " ... ");
download(new URL(norm2url, NFKC_CF_TXT), NFKC_CF_TXT);
System.err.println("done.");
System.err.print("Downloading " + NFKC_CF_TXT + " and making diacritic rules one-way ... ");
URLConnection connection = openConnection(new URL(norm2url, NFC_TXT));
BufferedReader reader = new BufferedReader
(new InputStreamReader(connection.getInputStream(), "UTF-8"));
Writer writer = new OutputStreamWriter(new FileOutputStream(NFC_TXT), "UTF-8");
try {
String line;
while (null != (line = reader.readLine())) {
Matcher matcher = ROUND_TRIP_MAPPING_LINE_PATTERN.matcher(line);
if (matcher.matches()) {
final String leftHandSide = matcher.group(1);
final String rightHandSide = matcher.group(2).trim();
List<String> diacritics = new ArrayList<String>();
for (String outputCodePoint : rightHandSide.split("\\s+")) {
int ch = Integer.parseInt(outputCodePoint, 16);
if (UCharacter.hasBinaryProperty(ch, UProperty.DIACRITIC)
// gennorm2 fails if U+0653-U+0656 are included in round-trip mappings
|| (ch >= 0x653 && ch <= 0x656)) {
diacritics.add(outputCodePoint);
}
}
if ( ! diacritics.isEmpty()) {
StringBuilder replacementLine = new StringBuilder();
replacementLine.append(leftHandSide).append(">").append(rightHandSide);
replacementLine.append(" # one-way: diacritic");
if (diacritics.size() > 1) {
replacementLine.append("s");
}
for (String diacritic : diacritics) {
replacementLine.append(" ").append(diacritic);
}
line = replacementLine.toString();
}
}
writer.write(line);
writer.write("\n");
}
} finally {
reader.close();
writer.close();
}
System.err.println("done.");
}

/** Downloads the given URL to the named local file. */
private static void download(URL url, String outputFile) throws IOException {
final URLConnection connection = openConnection(url);
final InputStream inputStream = connection.getInputStream();
final OutputStream outputStream = new FileOutputStream(outputFile);
int numBytes;
try {
while (-1 != (numBytes = inputStream.read(DOWNLOAD_BUFFER))) {
outputStream.write(DOWNLOAD_BUFFER, 0, numBytes);
}
} finally {
inputStream.close();
outputStream.close();
}
}

/** Opens a connection with HTTP caching disabled so a fresh copy is always fetched. */
private static URLConnection openConnection(URL url) throws IOException {
final URLConnection connection = url.openConnection();
connection.setUseCaches(false);
connection.addRequestProperty("Cache-Control", "no-cache");
connection.connect();
return connection;
}

/** Expands one rule: each code point or range in the left-hand UnicodeSet gets its own mapping line. */
private static void expandSingleRule(StringBuilder builder, String leftHandSide, String rightHandSide)
throws IllegalArgumentException {
UnicodeSet set = new UnicodeSet(leftHandSide, UnicodeSet.IGNORE_SPACE);
boolean numericValue = NUMERIC_VALUE_PATTERN.matcher(rightHandSide).matches();
for (UnicodeSetIterator it = new UnicodeSetIterator(set) ; it.nextRange() ; ) {
if (it.codepoint != UnicodeSetIterator.IS_STRING) {
if (numericValue) {
for (int cp = it.codepoint ; cp <= it.codepointEnd ; ++cp) {
builder.append(String.format("%04X", cp)).append('>');
builder.append(String.format("%04X", 0x30 + UCharacter.getNumericValue(cp)));
builder.append(" # ").append(UCharacter.getName(cp));
builder.append("\n");
}
} else {
builder.append(String.format("%04X", it.codepoint));
if (it.codepointEnd > it.codepoint) {
builder.append("..").append(String.format("%04X", it.codepointEnd));
}
builder.append('>').append(rightHandSide).append("\n");
}
} else {
System.err.println("ERROR: String '" + it.getString() + "' found in UnicodeSet");
System.exit(1);
}
}
}
}
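
As a rough standalone illustration of the range expansion that expandSingleRule performs — a minimal sketch assuming only icu4j-49.1 on the classpath; the [:Diacritic:] set and the empty right-hand side are arbitrary examples, not rules taken from the shipped data files:

import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;

public class ExpandRuleSketch {
  public static void main(String[] args) {
    // Example left-hand side: every character with the Diacritic binary property.
    UnicodeSet set = new UnicodeSet("[:Diacritic:]");
    StringBuilder builder = new StringBuilder();
    // One "XXXX>" or "XXXX..YYYY>" line per contiguous range, mirroring the
    // non-numeric-value branch above; an empty target maps the input to nothing.
    for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.nextRange(); ) {
      if (it.codepoint != UnicodeSetIterator.IS_STRING) {
        builder.append(String.format("%04X", it.codepoint));
        if (it.codepointEnd > it.codepoint) {
          builder.append("..").append(String.format("%04X", it.codepointEnd));
        }
        builder.append(">\n");
      }
    }
    System.out.print(builder);
  }
}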

View File

@@ -64,7 +64,7 @@
</target>
<path id="tools.dependencies">
<fileset dir="../icu/lib" includes="icu4j-4.8.1.1.jar"/>
<fileset dir="../icu/lib" includes="icu4j-49.1.jar"/>
</path>
<path id="tools.classpath">

View File

@@ -150,7 +150,7 @@
<pathelement path="${analyzers-common.jar}"/>
<pathelement path="${queryparser.jar}"/>
<pathelement path="${facet.jar}"/>
<fileset dir="${common.dir}/analysis/icu/lib" includes="icu4j-4.8.1.1.jar"/>
<fileset dir="${common.dir}/analysis/icu/lib" includes="icu4j-49.1.jar"/>
<path refid="base.classpath"/>
<fileset dir="lib">
<include name="commons-compress-1.2.jar"/>
@@ -208,7 +208,7 @@
<path id="collation.runtime.classpath">
<path refid="run.classpath"/>
<pathelement path="${analyzers-icu.jar}"/>
<fileset dir="${common.dir}/analysis/icu/lib" includes="icu4j-4.8.1.1.jar"/>
<fileset dir="${common.dir}/analysis/icu/lib" includes="icu4j-49.1.jar"/>
</path>
<target name="collation" depends="compile,jar-analyzers-icu,top-100k-wiki-word-files">

View File

@@ -352,7 +352,6 @@ public abstract class LuceneTestCase extends Assert {
.around(new TestRuleNoStaticHooksShadowing())
.around(new TestRuleNoInstanceHooksOverrides())
.around(new SystemPropertiesInvariantRule(IGNORED_INVARIANT_PROPERTIES))
.around(new TestRuleIcuHack())
.around(classNameRule = new TestRuleStoreClassName())
.around(new TestRuleReportUncaughtExceptions())
.around(classEnvRule = new TestRuleSetupAndRestoreClassEnv());

View File

@@ -1,55 +0,0 @@
package org.apache.lucene.util;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicBoolean;
import org.junit.rules.TestRule;
import org.junit.runner.Description;
import org.junit.runners.model.Statement;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
final class TestRuleIcuHack implements TestRule {
/** Globally only check hack once. */
private static volatile AtomicBoolean icuTested = new AtomicBoolean(false);
@Override
public Statement apply(final Statement s, Description d) {
return new Statement() {
@Override
public void evaluate() throws Throwable {
// START hack to init ICU safely before we randomize locales.
// ICU fails during classloading when a special Java7-only locale is the default
// see: http://bugs.icu-project.org/trac/ticket/8734
if (!icuTested.getAndSet(true)) {
Locale previous = Locale.getDefault();
try {
Locale.setDefault(Locale.ROOT);
Class.forName("com.ibm.icu.util.ULocale");
} catch (ClassNotFoundException cnfe) {
// ignore if no ICU is in classpath
} finally {
Locale.setDefault(previous);
}
}
s.evaluate();
}
};
}
}
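
The rule above existed only to work around ICU ticket #8734, where ICU4J failed during class loading if the default locale was one only constructible under Java 7; icu4j-49.1 fixes this, so LuceneTestCase's rule chain drops it. A minimal standalone check of that scenario — a sketch, with the script-qualified locale below chosen as an arbitrary example:

import java.util.Locale;

public class IcuLocaleLoadCheck {
  public static void main(String[] args) throws Exception {
    // A script-qualified locale of the kind that only Java 7+ can build.
    Locale scripted = new Locale.Builder()
        .setLanguage("sr").setScript("Latn").setRegion("RS").build();
    Locale previous = Locale.getDefault();
    try {
      Locale.setDefault(scripted);
      // Under icu4j-4.8.1.1 this class load could fail (ICU ticket #8734);
      // under icu4j-49.1 it succeeds.
      Class.forName("com.ibm.icu.util.ULocale");
      System.out.println("ICU loaded under " + scripted.toLanguageTag());
    } finally {
      Locale.setDefault(previous);
    }
  }
}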

View File

@@ -19,7 +19,7 @@
<ivy-module version="2.0">
<info organisation="org.apache.solr" module="analysis-extras"/>
<dependencies>
<dependency org="com.ibm.icu" name="icu4j" rev="4.8.1.1" transitive="false"/>
<dependency org="com.ibm.icu" name="icu4j" rev="49.1" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-polish" rev="1.5.3" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.5.3" transitive="false"/>
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.5.3" transitive="false"/>

View File

@@ -1 +0,0 @@
654b7021e7bb6c5b7b35c88d23cac1022c7b3d6b

View File

@@ -0,0 +1 @@
fbf7a438e6bf3660e0da2fd77dd1df1635fe503c

View File

@@ -2,7 +2,7 @@ ICU License - ICU 1.8.1 and later
COPYRIGHT AND PERMISSION NOTICE
Copyright (c) 1995-2008 International Business Machines Corporation and others
Copyright (c) 1995-2012 International Business Machines Corporation and others
All rights reserved.

View File

@@ -1,3 +1,3 @@
ICU4J, (under contrib/icu) is licensed under an MIT styles license
(contrib/icu/lib/ICU-LICENSE.txt) and Copyright (c) 1995-2008
ICU4J, (under modules/analysis/icu) is licensed under an MIT style license
(modules/analysis/icu/lib/icu4j-LICENSE-BSD_LIKE.txt) and Copyright (c) 1995-2012
International Business Machines Corporation and others

View File

@@ -52,7 +52,7 @@
<dependency org="rome" name="rome" rev="0.9" transitive="false"/>
<dependency org="jdom" name="jdom" rev="1.0" transitive="false"/>
<!-- Other ExtracingRequestHandler dependencies -->
<dependency org="com.ibm.icu" name="icu4j" rev="4.8.1.1" transitive="false"/>
<dependency org="com.ibm.icu" name="icu4j" rev="49.1" transitive="false"/>
<dependency org="xerces" name="xercesImpl" rev="2.8.1" transitive="false"/>
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
</dependencies>

View File

@@ -1 +0,0 @@
654b7021e7bb6c5b7b35c88d23cac1022c7b3d6b

View File

@@ -0,0 +1 @@
fbf7a438e6bf3660e0da2fd77dd1df1635fe503c

View File

@@ -2,7 +2,7 @@ ICU License - ICU 1.8.1 and later
COPYRIGHT AND PERMISSION NOTICE
Copyright (c) 1995-2008 International Business Machines Corporation and others
Copyright (c) 1995-2012 International Business Machines Corporation and others
All rights reserved.

View File

@@ -1,3 +1,3 @@
ICU4J, (under contrib/icu) is licensed under an MIT styles license
(contrib/icu/lib/ICU-LICENSE.txt) and Copyright (c) 1995-2008
ICU4J, (under modules/analysis/icu) is licensed under an MIT style license
(modules/analysis/icu/lib/icu4j-LICENSE-BSD_LIKE.txt) and Copyright (c) 1995-2012
International Business Machines Corporation and others