mirror of https://github.com/apache/lucene.git
LUCENE-3747: Support Unicode 6.1.0.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1365971 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
924f6f730d
commit
5abc76ea42
|
@ -98,7 +98,7 @@
|
|||
<classpathentry kind="lib" path="lucene/test-framework/lib/ant-junit-1.8.2.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/test-framework/lib/junit-4.10.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/sandbox/lib/jakarta-regexp-1.4.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/analysis/icu/lib/icu4j-4.8.1.1.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/analysis/icu/lib/icu4j-49.1.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/analysis/phonetic/lib/commons-codec-1.6.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-fsa-1.5.3.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/analysis/morfologik/lib/morfologik-polish-1.5.3.jar"/>
|
||||
|
|
|
@ -143,7 +143,7 @@
|
|||
<dependency>
|
||||
<groupId>com.ibm.icu</groupId>
|
||||
<artifactId>icu4j</artifactId>
|
||||
<version>4.8.1.1</version>
|
||||
<version>49.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
|
|
|
@ -67,6 +67,8 @@ API Changes
|
|||
parallels with docScore and the default implementation is correct.
|
||||
(Robert Muir)
|
||||
|
||||
* LUCENE-3747: Support Unicode 6.1.0. (Steve Rowe)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-4171: Performance improvements to Packed64.
|
||||
|
|
|
@ -1,162 +0,0 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
CharacterEntities = ( "AElig" | "Aacute" | "Acirc" | "Agrave" | "Alpha"
|
||||
| "Aring" | "Atilde" | "Auml" | "Beta" | "Ccedil" | "Chi"
|
||||
| "Dagger" | "Delta" | "ETH" | "Eacute" | "Ecirc"
|
||||
| "Egrave" | "Epsilon" | "Eta" | "Euml" | "Gamma"
|
||||
| "Iacute" | "Icirc" | "Igrave" | "Iota" | "Iuml" | "Kappa"
|
||||
| "Lambda" | "Mu" | "Ntilde" | "Nu" | "OElig" | "Oacute"
|
||||
| "Ocirc" | "Ograve" | "Omega" | "Omicron" | "Oslash"
|
||||
| "Otilde" | "Ouml" | "Phi" | "Pi" | "Prime" | "Psi"
|
||||
| "Rho" | "Scaron" | "Sigma" | "THORN" | "Tau" | "Theta"
|
||||
| "Uacute" | "Ucirc" | "Ugrave" | "Upsilon" | "Uuml" | "Xi"
|
||||
| "Yacute" | "Yuml" | "Zeta" | "aacute" | "acirc" | "acute"
|
||||
| "aelig" | "agrave" | "alefsym" | "alpha" | "amp" | "AMP"
|
||||
| "and" | "ang" | "apos" | "aring" | "asymp" | "atilde"
|
||||
| "auml" | "bdquo" | "beta" | "brvbar" | "bull" | "cap"
|
||||
| "ccedil" | "cedil" | "cent" | "chi" | "circ" | "clubs"
|
||||
| "cong" | "copy" | "COPY" | "crarr" | "cup" | "curren"
|
||||
| "dArr" | "dagger" | "darr" | "deg" | "delta" | "diams"
|
||||
| "divide" | "eacute" | "ecirc" | "egrave" | "empty"
|
||||
| "emsp" | "ensp" | "epsilon" | "equiv" | "eta" | "eth"
|
||||
| "euml" | "euro" | "exist" | "fnof" | "forall" | "frac12"
|
||||
| "frac14" | "frac34" | "frasl" | "gamma" | "ge" | "gt"
|
||||
| "GT" | "hArr" | "harr" | "hearts" | "hellip" | "iacute"
|
||||
| "icirc" | "iexcl" | "igrave" | "image" | "infin" | "int"
|
||||
| "iota" | "iquest" | "isin" | "iuml" | "kappa" | "lArr"
|
||||
| "lambda" | "lang" | "laquo" | "larr" | "lceil" | "ldquo"
|
||||
| "le" | "lfloor" | "lowast" | "loz" | "lrm" | "lsaquo"
|
||||
| "lsquo" | "lt" | "LT" | "macr" | "mdash" | "micro"
|
||||
| "middot" | "minus" | "mu" | "nabla" | "nbsp" | "ndash"
|
||||
| "ne" | "ni" | "not" | "notin" | "nsub" | "ntilde" | "nu"
|
||||
| "oacute" | "ocirc" | "oelig" | "ograve" | "oline"
|
||||
| "omega" | "omicron" | "oplus" | "or" | "ordf" | "ordm"
|
||||
| "oslash" | "otilde" | "otimes" | "ouml" | "para" | "part"
|
||||
| "permil" | "perp" | "phi" | "pi" | "piv" | "plusmn"
|
||||
| "pound" | "prime" | "prod" | "prop" | "psi" | "quot"
|
||||
| "QUOT" | "rArr" | "radic" | "rang" | "raquo" | "rarr"
|
||||
| "rceil" | "rdquo" | "real" | "reg" | "REG" | "rfloor"
|
||||
| "rho" | "rlm" | "rsaquo" | "rsquo" | "sbquo" | "scaron"
|
||||
| "sdot" | "sect" | "shy" | "sigma" | "sigmaf" | "sim"
|
||||
| "spades" | "sub" | "sube" | "sum" | "sup" | "sup1"
|
||||
| "sup2" | "sup3" | "supe" | "szlig" | "tau" | "there4"
|
||||
| "theta" | "thetasym" | "thinsp" | "thorn" | "tilde"
|
||||
| "times" | "trade" | "uArr" | "uacute" | "uarr" | "ucirc"
|
||||
| "ugrave" | "uml" | "upsih" | "upsilon" | "uuml"
|
||||
| "weierp" | "xi" | "yacute" | "yen" | "yuml" | "zeta"
|
||||
| "zwj" | "zwnj" )
|
||||
%{
|
||||
private static final Map<String,String> upperCaseVariantsAccepted
|
||||
= new HashMap<String,String>();
|
||||
static {
|
||||
upperCaseVariantsAccepted.put("quot", "QUOT");
|
||||
upperCaseVariantsAccepted.put("copy", "COPY");
|
||||
upperCaseVariantsAccepted.put("gt", "GT");
|
||||
upperCaseVariantsAccepted.put("lt", "LT");
|
||||
upperCaseVariantsAccepted.put("reg", "REG");
|
||||
upperCaseVariantsAccepted.put("amp", "AMP");
|
||||
}
|
||||
private static final CharArrayMap<Character> entityValues
|
||||
= new CharArrayMap<Character>(Version.LUCENE_40, 253, false);
|
||||
static {
|
||||
String[] entities = {
|
||||
"AElig", "\u00C6", "Aacute", "\u00C1", "Acirc", "\u00C2",
|
||||
"Agrave", "\u00C0", "Alpha", "\u0391", "Aring", "\u00C5",
|
||||
"Atilde", "\u00C3", "Auml", "\u00C4", "Beta", "\u0392",
|
||||
"Ccedil", "\u00C7", "Chi", "\u03A7", "Dagger", "\u2021",
|
||||
"Delta", "\u0394", "ETH", "\u00D0", "Eacute", "\u00C9",
|
||||
"Ecirc", "\u00CA", "Egrave", "\u00C8", "Epsilon", "\u0395",
|
||||
"Eta", "\u0397", "Euml", "\u00CB", "Gamma", "\u0393", "Iacute", "\u00CD",
|
||||
"Icirc", "\u00CE", "Igrave", "\u00CC", "Iota", "\u0399",
|
||||
"Iuml", "\u00CF", "Kappa", "\u039A", "Lambda", "\u039B", "Mu", "\u039C",
|
||||
"Ntilde", "\u00D1", "Nu", "\u039D", "OElig", "\u0152",
|
||||
"Oacute", "\u00D3", "Ocirc", "\u00D4", "Ograve", "\u00D2",
|
||||
"Omega", "\u03A9", "Omicron", "\u039F", "Oslash", "\u00D8",
|
||||
"Otilde", "\u00D5", "Ouml", "\u00D6", "Phi", "\u03A6", "Pi", "\u03A0",
|
||||
"Prime", "\u2033", "Psi", "\u03A8", "Rho", "\u03A1", "Scaron", "\u0160",
|
||||
"Sigma", "\u03A3", "THORN", "\u00DE", "Tau", "\u03A4", "Theta", "\u0398",
|
||||
"Uacute", "\u00DA", "Ucirc", "\u00DB", "Ugrave", "\u00D9",
|
||||
"Upsilon", "\u03A5", "Uuml", "\u00DC", "Xi", "\u039E",
|
||||
"Yacute", "\u00DD", "Yuml", "\u0178", "Zeta", "\u0396",
|
||||
"aacute", "\u00E1", "acirc", "\u00E2", "acute", "\u00B4",
|
||||
"aelig", "\u00E6", "agrave", "\u00E0", "alefsym", "\u2135",
|
||||
"alpha", "\u03B1", "amp", "\u0026", "and", "\u2227", "ang", "\u2220",
|
||||
"apos", "\u0027", "aring", "\u00E5", "asymp", "\u2248",
|
||||
"atilde", "\u00E3", "auml", "\u00E4", "bdquo", "\u201E",
|
||||
"beta", "\u03B2", "brvbar", "\u00A6", "bull", "\u2022", "cap", "\u2229",
|
||||
"ccedil", "\u00E7", "cedil", "\u00B8", "cent", "\u00A2", "chi", "\u03C7",
|
||||
"circ", "\u02C6", "clubs", "\u2663", "cong", "\u2245", "copy", "\u00A9",
|
||||
"crarr", "\u21B5", "cup", "\u222A", "curren", "\u00A4", "dArr", "\u21D3",
|
||||
"dagger", "\u2020", "darr", "\u2193", "deg", "\u00B0", "delta", "\u03B4",
|
||||
"diams", "\u2666", "divide", "\u00F7", "eacute", "\u00E9",
|
||||
"ecirc", "\u00EA", "egrave", "\u00E8", "empty", "\u2205",
|
||||
"emsp", "\u2003", "ensp", "\u2002", "epsilon", "\u03B5",
|
||||
"equiv", "\u2261", "eta", "\u03B7", "eth", "\u00F0", "euml", "\u00EB",
|
||||
"euro", "\u20AC", "exist", "\u2203", "fnof", "\u0192",
|
||||
"forall", "\u2200", "frac12", "\u00BD", "frac14", "\u00BC",
|
||||
"frac34", "\u00BE", "frasl", "\u2044", "gamma", "\u03B3", "ge", "\u2265",
|
||||
"gt", "\u003E", "hArr", "\u21D4", "harr", "\u2194", "hearts", "\u2665",
|
||||
"hellip", "\u2026", "iacute", "\u00ED", "icirc", "\u00EE",
|
||||
"iexcl", "\u00A1", "igrave", "\u00EC", "image", "\u2111",
|
||||
"infin", "\u221E", "int", "\u222B", "iota", "\u03B9", "iquest", "\u00BF",
|
||||
"isin", "\u2208", "iuml", "\u00EF", "kappa", "\u03BA", "lArr", "\u21D0",
|
||||
"lambda", "\u03BB", "lang", "\u2329", "laquo", "\u00AB",
|
||||
"larr", "\u2190", "lceil", "\u2308", "ldquo", "\u201C", "le", "\u2264",
|
||||
"lfloor", "\u230A", "lowast", "\u2217", "loz", "\u25CA", "lrm", "\u200E",
|
||||
"lsaquo", "\u2039", "lsquo", "\u2018", "lt", "\u003C", "macr", "\u00AF",
|
||||
"mdash", "\u2014", "micro", "\u00B5", "middot", "\u00B7",
|
||||
"minus", "\u2212", "mu", "\u03BC", "nabla", "\u2207", "nbsp", " ",
|
||||
"ndash", "\u2013", "ne", "\u2260", "ni", "\u220B", "not", "\u00AC",
|
||||
"notin", "\u2209", "nsub", "\u2284", "ntilde", "\u00F1", "nu", "\u03BD",
|
||||
"oacute", "\u00F3", "ocirc", "\u00F4", "oelig", "\u0153",
|
||||
"ograve", "\u00F2", "oline", "\u203E", "omega", "\u03C9",
|
||||
"omicron", "\u03BF", "oplus", "\u2295", "or", "\u2228", "ordf", "\u00AA",
|
||||
"ordm", "\u00BA", "oslash", "\u00F8", "otilde", "\u00F5",
|
||||
"otimes", "\u2297", "ouml", "\u00F6", "para", "\u00B6", "part", "\u2202",
|
||||
"permil", "\u2030", "perp", "\u22A5", "phi", "\u03C6", "pi", "\u03C0",
|
||||
"piv", "\u03D6", "plusmn", "\u00B1", "pound", "\u00A3",
|
||||
"prime", "\u2032", "prod", "\u220F", "prop", "\u221D", "psi", "\u03C8",
|
||||
"quot", "\"", "rArr", "\u21D2", "radic", "\u221A", "rang", "\u232A",
|
||||
"raquo", "\u00BB", "rarr", "\u2192", "rceil", "\u2309",
|
||||
"rdquo", "\u201D", "real", "\u211C", "reg", "\u00AE", "rfloor", "\u230B",
|
||||
"rho", "\u03C1", "rlm", "\u200F", "rsaquo", "\u203A", "rsquo", "\u2019",
|
||||
"sbquo", "\u201A", "scaron", "\u0161", "sdot", "\u22C5",
|
||||
"sect", "\u00A7", "shy", "\u00AD", "sigma", "\u03C3", "sigmaf", "\u03C2",
|
||||
"sim", "\u223C", "spades", "\u2660", "sub", "\u2282", "sube", "\u2286",
|
||||
"sum", "\u2211", "sup", "\u2283", "sup1", "\u00B9", "sup2", "\u00B2",
|
||||
"sup3", "\u00B3", "supe", "\u2287", "szlig", "\u00DF", "tau", "\u03C4",
|
||||
"there4", "\u2234", "theta", "\u03B8", "thetasym", "\u03D1",
|
||||
"thinsp", "\u2009", "thorn", "\u00FE", "tilde", "\u02DC",
|
||||
"times", "\u00D7", "trade", "\u2122", "uArr", "\u21D1",
|
||||
"uacute", "\u00FA", "uarr", "\u2191", "ucirc", "\u00FB",
|
||||
"ugrave", "\u00F9", "uml", "\u00A8", "upsih", "\u03D2",
|
||||
"upsilon", "\u03C5", "uuml", "\u00FC", "weierp", "\u2118",
|
||||
"xi", "\u03BE", "yacute", "\u00FD", "yen", "\u00A5", "yuml", "\u00FF",
|
||||
"zeta", "\u03B6", "zwj", "\u200D", "zwnj", "\u200C"
|
||||
};
|
||||
for (int i = 0 ; i < entities.length ; i += 2) {
|
||||
Character value = entities[i + 1].charAt(0);
|
||||
entityValues.put(entities[i], value);
|
||||
String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);
|
||||
if (upperCaseVariant != null) {
|
||||
entityValues.put(upperCaseVariant, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
%}
|
|
@ -14,45 +14,52 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// Generated using ICU4J 4.8.1.1 on Friday, January 13, 2012 6:20:39 PM UTC
|
||||
// Generated using ICU4J 49.1.0.0 on Sunday, July 15, 2012 5:42:00 AM UTC
|
||||
// by org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros
|
||||
|
||||
|
||||
ID_Start_Supp = (
|
||||
[\uD81A][\uDC00-\uDE38]
|
||||
[\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
|
||||
| [\uD81A][\uDC00-\uDE38]
|
||||
| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
|
||||
| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB]
|
||||
| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
|
||||
| [\uD82C][\uDC00\uDC01]
|
||||
| [\uD804][\uDC03-\uDC37\uDC83-\uDCAF]
|
||||
| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
|
||||
| [\uD81B][\uDF00-\uDF44\uDF50\uDF93-\uDF9F]
|
||||
| [\uD87E][\uDC00-\uDE1D]
|
||||
| [\uD804][\uDC03-\uDC37\uDC83-\uDCAF\uDCD0-\uDCE8\uDD03-\uDD26\uDD83-\uDDB2\uDDC1-\uDDC4]
|
||||
| [\uD83B][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB]
|
||||
| [\uD809][\uDC00-\uDC62]
|
||||
| [\uD808][\uDC00-\uDF6E]
|
||||
| [\uD803][\uDC00-\uDC48]
|
||||
| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
|
||||
| [\uD80D][\uDC00-\uDC2E]
|
||||
| [\uD805][\uDE80-\uDEAA]
|
||||
| [\uD86E][\uDC00-\uDC1D]
|
||||
| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
|
||||
| [\uD801][\uDC00-\uDC9D]
|
||||
)
|
||||
ID_Continue_Supp = (
|
||||
[\uD81A][\uDC00-\uDE38]
|
||||
| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
|
||||
| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
|
||||
| [\uD804][\uDC00-\uDC46\uDC66-\uDC6F\uDC80-\uDCBA]
|
||||
| [\uD82C][\uDC00\uDC01]
|
||||
| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00-\uDE03\uDE05\uDE06\uDE0C-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE38-\uDE3A\uDE3F\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
|
||||
| [\uD81B][\uDF00-\uDF44\uDF50-\uDF7E\uDF8F-\uDF9F]
|
||||
| [\uD801][\uDC00-\uDC9D\uDCA0-\uDCA9]
|
||||
| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
|
||||
| [\uD87E][\uDC00-\uDE1D]
|
||||
| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00-\uDE03\uDE05\uDE06\uDE0C-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE38-\uDE3A\uDE3F\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
|
||||
| [\uD805][\uDE80-\uDEB7\uDEC0-\uDEC9]
|
||||
| [\uD83B][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB]
|
||||
| [\uD809][\uDC00-\uDC62]
|
||||
| [\uD808][\uDC00-\uDF6E]
|
||||
| [\uD803][\uDC00-\uDC48]
|
||||
| [\uD80D][\uDC00-\uDC2E]
|
||||
| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDDFD\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
|
||||
| [\uD804][\uDC00-\uDC46\uDC66-\uDC6F\uDC80-\uDCBA\uDCD0-\uDCE8\uDCF0-\uDCF9\uDD00-\uDD34\uDD36-\uDD3F\uDD80-\uDDC4\uDDD0-\uDDD9]
|
||||
| [\uD86E][\uDC00-\uDC1D]
|
||||
| [\uDB40][\uDD00-\uDDEF]
|
||||
| [\uD834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44]
|
||||
| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB\uDFCE-\uDFFF]
|
||||
)
|
||||
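// NOTE: the generator run captured the JVM stderr banner "Picked up JAVA_TOOL_OPTIONS: -Dfile.encoding=UTF-8" here; it is tool output, not part of the macro definitions.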
|
||||
|
|
|
@ -15,8 +15,8 @@
|
|||
*/
|
||||
|
||||
// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
|
||||
// file version from Sunday, March 18, 2012 4:34:02 AM UTC
|
||||
// generated on Sunday, March 18, 2012 4:02:55 PM UTC
|
||||
// file version from Saturday, July 14, 2012 4:34:14 AM UTC
|
||||
// generated on Sunday, July 15, 2012 12:59:44 AM UTC
|
||||
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
|
||||
|
||||
ASCIITLD = "." (
|
||||
|
@ -310,6 +310,7 @@ ASCIITLD = "." (
|
|||
| [xX][nN]--[kK][pP][rR][wW]13[dD]
|
||||
| [xX][nN]--[kK][pP][rR][yY]57[dD]
|
||||
| [xX][nN]--[lL][gG][bB][bB][aA][tT]1[aA][dD]8[jJ]
|
||||
| [xX][nN]--[mM][gG][bB]9[aA][wW][bB][fF]
|
||||
| [xX][nN]--[mM][gG][bB][aA][aA][mM]7[aA]8[hH]
|
||||
| [xX][nN]--[mM][gG][bB][aA][yY][hH]7[gG][pP][aA]
|
||||
| [xX][nN]--[mM][gG][bB][bB][hH]1[aA]71[eE]
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 16:59 */
|
||||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/15/12 1:57 AM */
|
||||
|
||||
package org.apache.lucene.analysis.standard;
|
||||
|
||||
|
@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
/**
|
||||
* This class is a scanner generated by
|
||||
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
|
||||
* on 08.07.12 16:59 from the specification file
|
||||
* <tt>C:/Users/Uwe Schindler/Projects/lucene/lucene4199/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
|
||||
* on 7/15/12 1:57 AM from the specification file
|
||||
* <tt>C:/cygwin/home/s/svn/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
|
||||
*/
|
||||
class ClassicTokenizerImpl implements StandardTokenizerInterface {
|
||||
|
||||
|
|
|
@ -14,22 +14,25 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// Generated using ICU4J 4.8.1.1 on Sunday, July 8, 2012 2:59:49 PM UTC
|
||||
// Generated using ICU4J 49.1.0.0 on Sunday, July 15, 2012 5:57:26 AM UTC
|
||||
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
|
||||
|
||||
|
||||
ALetterSupp = (
|
||||
([\ud80d][\uDC00-\uDC2E])
|
||||
([\ud83b][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB])
|
||||
| ([\ud81a][\uDC00-\uDE38])
|
||||
| ([\ud81b][\uDF00-\uDF44\uDF50\uDF93-\uDF9F])
|
||||
| ([\ud835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB])
|
||||
| ([\ud80d][\uDC00-\uDC2E])
|
||||
| ([\ud80c][\uDC00-\uDFFF])
|
||||
| ([\ud809][\uDC00-\uDC62])
|
||||
| ([\ud808][\uDC00-\uDF6E])
|
||||
| ([\ud81a][\uDC00-\uDE38])
|
||||
| ([\ud804][\uDC03-\uDC37\uDC83-\uDCAF])
|
||||
| ([\ud835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB])
|
||||
| ([\ud805][\uDE80-\uDEAA])
|
||||
| ([\ud804][\uDC03-\uDC37\uDC83-\uDCAF\uDCD0-\uDCE8\uDD03-\uDD26\uDD83-\uDDB2\uDDC1-\uDDC4])
|
||||
| ([\ud801][\uDC00-\uDC9D])
|
||||
| ([\ud800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5])
|
||||
| ([\ud803][\uDC00-\uDC48])
|
||||
| ([\ud802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72])
|
||||
| ([\ud802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72])
|
||||
)
|
||||
FormatSupp = (
|
||||
([\ud804][\uDCBD])
|
||||
|
@ -37,14 +40,17 @@ FormatSupp = (
|
|||
| ([\udb40][\uDC01\uDC20-\uDC7F])
|
||||
)
|
||||
ExtendSupp = (
|
||||
([\ud804][\uDC00-\uDC02\uDC38-\uDC46\uDC80-\uDC82\uDCB0-\uDCBA])
|
||||
([\ud81b][\uDF51-\uDF7E\uDF8F-\uDF92])
|
||||
| ([\ud805][\uDEAB-\uDEB7])
|
||||
| ([\ud804][\uDC00-\uDC02\uDC38-\uDC46\uDC80-\uDC82\uDCB0-\uDCBA\uDD00-\uDD02\uDD27-\uDD34\uDD80-\uDD82\uDDB3-\uDDC0])
|
||||
| ([\ud834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44])
|
||||
| ([\ud800][\uDDFD])
|
||||
| ([\udb40][\uDD00-\uDDEF])
|
||||
| ([\ud802][\uDE01-\uDE03\uDE05\uDE06\uDE0C-\uDE0F\uDE38-\uDE3A\uDE3F])
|
||||
)
|
||||
NumericSupp = (
|
||||
([\ud804][\uDC66-\uDC6F])
|
||||
([\ud805][\uDEC0-\uDEC9])
|
||||
| ([\ud804][\uDC66-\uDC6F\uDCF0-\uDCF9\uDD36-\uDD3F\uDDD0-\uDDD9])
|
||||
| ([\ud835][\uDFCE-\uDFFF])
|
||||
| ([\ud801][\uDCA0-\uDCA9])
|
||||
)
|
||||
|
@ -109,7 +115,8 @@ HanSupp = (
|
|||
| ([\ud84d][\uDC00-\uDFFF])
|
||||
| ([\ud84c][\uDC00-\uDFFF])
|
||||
| ([\ud84f][\uDC00-\uDFFF])
|
||||
| ([\ud84e][\uDC00-\uDFFF])
|
||||
| ([\ud84e][\uDC00-\uDFFF])
|
||||
| ([\ud841][\uDC00-\uDFFF])
|
||||
| ([\ud840][\uDC00-\uDFFF])
|
||||
| ([\ud843][\uDC00-\uDFFF])
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -36,7 +36,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
*/
|
||||
%%
|
||||
|
||||
%unicode 6.0
|
||||
%unicode 6.1
|
||||
%integer
|
||||
%final
|
||||
%public
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -39,7 +39,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
*/
|
||||
%%
|
||||
|
||||
%unicode 6.0
|
||||
%unicode 6.1
|
||||
%integer
|
||||
%final
|
||||
%public
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 08.07.12 17:00 */
|
||||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/15/12 1:57 AM */
|
||||
|
||||
package org.apache.lucene.analysis.wikipedia;
|
||||
|
||||
|
@ -25,8 +25,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
/**
|
||||
* This class is a scanner generated by
|
||||
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
|
||||
* on 08.07.12 17:00 from the specification file
|
||||
* <tt>C:/Users/Uwe Schindler/Projects/lucene/lucene4199/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
|
||||
* on 7/15/12 1:57 AM from the specification file
|
||||
* <tt>C:/cygwin/home/s/svn/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
|
||||
*/
|
||||
class WikipediaTokenizerImpl {
|
||||
|
||||
|
|
|
@ -202,7 +202,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testUnicodeWordBreaks() throws Exception {
|
||||
WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
|
||||
WordBreakTestUnicode_6_1_0 wordBreakTest = new WordBreakTestUnicode_6_1_0();
|
||||
wordBreakTest.test(a);
|
||||
}
|
||||
|
||||
|
|
|
@ -424,7 +424,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
|
|||
}
|
||||
|
||||
public void testUnicodeWordBreaks() throws Exception {
|
||||
WordBreakTestUnicode_6_0_0 wordBreakTest = new WordBreakTestUnicode_6_0_0();
|
||||
WordBreakTestUnicode_6_1_0 wordBreakTest = new WordBreakTestUnicode_6_1_0();
|
||||
wordBreakTest.test(a);
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
package org.apache.lucene.analysis.core;
|
||||
|
||||
/*
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
|
@ -23,7 +23,7 @@ import org.junit.Ignore;
|
|||
|
||||
/**
|
||||
* This class was automatically generated by generateJavaUnicodeWordBreakTest.pl
|
||||
* from: http://www.unicode.org/Public/6.0.0/ucd/auxiliary/WordBreakTest.txt
|
||||
* from: http://www.unicode.org/Public/6.1.0/ucd/auxiliary/WordBreakTest.txt
|
||||
*
|
||||
* WordBreakTest.txt indicates the points in the provided character sequences
|
||||
* at which conforming implementations must and must not break words. This
|
||||
|
@ -32,16 +32,16 @@ import org.junit.Ignore;
|
|||
* sequences bounded by word breaks and containing at least one character
|
||||
* from one of the following character sets:
|
||||
*
|
||||
* \p{Script = Han} (From http://www.unicode.org/Public/6.0.0/ucd/Scripts.txt)
|
||||
* \p{Script = Han} (From http://www.unicode.org/Public/6.1.0/ucd/Scripts.txt)
|
||||
* \p{Script = Hiragana}
|
||||
* \p{LineBreak = Complex_Context} (From http://www.unicode.org/Public/6.0.0/ucd/LineBreak.txt)
|
||||
* \p{WordBreak = ALetter} (From http://www.unicode.org/Public/6.0.0/ucd/auxiliary/WordBreakProperty.txt)
|
||||
* \p{LineBreak = Complex_Context} (From http://www.unicode.org/Public/6.1.0/ucd/LineBreak.txt)
|
||||
* \p{WordBreak = ALetter} (From http://www.unicode.org/Public/6.1.0/ucd/auxiliary/WordBreakProperty.txt)
|
||||
* \p{WordBreak = Katakana}
|
||||
* \p{WordBreak = Numeric} (Excludes full-width Arabic digits)
|
||||
* [\uFF10-\uFF19] (Full-width Arabic digits)
|
||||
*/
|
||||
@Ignore
|
||||
public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
||||
public class WordBreakTestUnicode_6_1_0 extends BaseTokenStreamTestCase {
|
||||
|
||||
public void test(Analyzer analyzer) throws Exception {
|
||||
// ÷ 0001 ÷ 0001 ÷ # ÷ [0.2] <START OF HEADING> (Other) ÷ [999.0] <START OF HEADING> (Other) ÷ [0.3]
|
||||
|
@ -52,27 +52,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u0001\u0308\u0001",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 0001 ÷ 000D ÷ # ÷ [0.2] <START OF HEADING> (Other) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0001 ÷ 000D ÷ # ÷ [0.2] <START OF HEADING> (Other) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0001\r",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 0001 × 0308 ÷ 000D ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0001 × 0308 ÷ 000D ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0001\u0308\r",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 0001 ÷ 000A ÷ # ÷ [0.2] <START OF HEADING> (Other) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0001 ÷ 000A ÷ # ÷ [0.2] <START OF HEADING> (Other) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0001\n",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 0001 × 0308 ÷ 000A ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0001 × 0308 ÷ 000A ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0001\u0308\n",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 0001 ÷ 000B ÷ # ÷ [0.2] <START OF HEADING> (Other) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0001 ÷ 000B ÷ # ÷ [0.2] <START OF HEADING> (Other) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0001\u000B",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 0001 × 0308 ÷ 000B ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0001 × 0308 ÷ 000B ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0001\u0308\u000B",
|
||||
new String[] { });
|
||||
|
||||
|
@ -232,7 +232,7 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\r\r",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 000D ÷ 0308 ÷ 000D ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 000D ÷ 0308 ÷ 000D ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\r\u0308\r",
|
||||
new String[] { });
|
||||
|
||||
|
@ -240,7 +240,7 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\r\n",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 000D ÷ 0308 ÷ 000A ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 000D ÷ 0308 ÷ 000A ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\r\u0308\n",
|
||||
new String[] { });
|
||||
|
||||
|
@ -248,7 +248,7 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\r\u000B",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 000D ÷ 0308 ÷ 000B ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 000D ÷ 0308 ÷ 000B ÷ # ÷ [0.2] <CARRIAGE RETURN (CR)> (CR) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\r\u0308\u000B",
|
||||
new String[] { });
|
||||
|
||||
|
@ -408,7 +408,7 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\n\r",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 000A ÷ 0308 ÷ 000D ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 000A ÷ 0308 ÷ 000D ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\n\u0308\r",
|
||||
new String[] { });
|
||||
|
||||
|
@ -416,7 +416,7 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\n\n",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 000A ÷ 0308 ÷ 000A ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 000A ÷ 0308 ÷ 000A ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\n\u0308\n",
|
||||
new String[] { });
|
||||
|
||||
|
@ -424,7 +424,7 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\n\u000B",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 000A ÷ 0308 ÷ 000B ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 000A ÷ 0308 ÷ 000B ÷ # ÷ [0.2] <LINE FEED (LF)> (LF) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\n\u0308\u000B",
|
||||
new String[] { });
|
||||
|
||||
|
@ -584,7 +584,7 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u000B\r",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 000B ÷ 0308 ÷ 000D ÷ # ÷ [0.2] <LINE TABULATION> (Newline) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 000B ÷ 0308 ÷ 000D ÷ # ÷ [0.2] <LINE TABULATION> (Newline) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u000B\u0308\r",
|
||||
new String[] { });
|
||||
|
||||
|
@ -592,7 +592,7 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u000B\n",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 000B ÷ 0308 ÷ 000A ÷ # ÷ [0.2] <LINE TABULATION> (Newline) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 000B ÷ 0308 ÷ 000A ÷ # ÷ [0.2] <LINE TABULATION> (Newline) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u000B\u0308\n",
|
||||
new String[] { });
|
||||
|
||||
|
@ -600,7 +600,7 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u000B\u000B",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 000B ÷ 0308 ÷ 000B ÷ # ÷ [0.2] <LINE TABULATION> (Newline) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 000B ÷ 0308 ÷ 000B ÷ # ÷ [0.2] <LINE TABULATION> (Newline) ÷ [3.1] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u000B\u0308\u000B",
|
||||
new String[] { });
|
||||
|
||||
|
@ -756,27 +756,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u3031\u0308\u0001",
|
||||
new String[] { "\u3031\u0308" });
|
||||
|
||||
// ÷ 3031 ÷ 000D ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 3031 ÷ 000D ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u3031\r",
|
||||
new String[] { "\u3031" });
|
||||
|
||||
// ÷ 3031 × 0308 ÷ 000D ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 3031 × 0308 ÷ 000D ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u3031\u0308\r",
|
||||
new String[] { "\u3031\u0308" });
|
||||
|
||||
// ÷ 3031 ÷ 000A ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 3031 ÷ 000A ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u3031\n",
|
||||
new String[] { "\u3031" });
|
||||
|
||||
// ÷ 3031 × 0308 ÷ 000A ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 3031 × 0308 ÷ 000A ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u3031\u0308\n",
|
||||
new String[] { "\u3031\u0308" });
|
||||
|
||||
// ÷ 3031 ÷ 000B ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 3031 ÷ 000B ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u3031\u000B",
|
||||
new String[] { "\u3031" });
|
||||
|
||||
// ÷ 3031 × 0308 ÷ 000B ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 3031 × 0308 ÷ 000B ÷ # ÷ [0.2] VERTICAL KANA REPEAT MARK (Katakana) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u3031\u0308\u000B",
|
||||
new String[] { "\u3031\u0308" });
|
||||
|
||||
|
@ -932,27 +932,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u0041\u0308\u0001",
|
||||
new String[] { "\u0041\u0308" });
|
||||
|
||||
// ÷ 0041 ÷ 000D ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0041 ÷ 000D ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0041\r",
|
||||
new String[] { "\u0041" });
|
||||
|
||||
// ÷ 0041 × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0041 × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0041\u0308\r",
|
||||
new String[] { "\u0041\u0308" });
|
||||
|
||||
// ÷ 0041 ÷ 000A ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0041 ÷ 000A ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0041\n",
|
||||
new String[] { "\u0041" });
|
||||
|
||||
// ÷ 0041 × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0041 × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0041\u0308\n",
|
||||
new String[] { "\u0041\u0308" });
|
||||
|
||||
// ÷ 0041 ÷ 000B ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0041 ÷ 000B ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0041\u000B",
|
||||
new String[] { "\u0041" });
|
||||
|
||||
// ÷ 0041 × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0041 × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN CAPITAL LETTER A (ALetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0041\u0308\u000B",
|
||||
new String[] { "\u0041\u0308" });
|
||||
|
||||
|
@ -1108,27 +1108,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u003A\u0308\u0001",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 003A ÷ 000D ÷ # ÷ [0.2] COLON (MidLetter) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 003A ÷ 000D ÷ # ÷ [0.2] COLON (MidLetter) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u003A\r",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 003A × 0308 ÷ 000D ÷ # ÷ [0.2] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 003A × 0308 ÷ 000D ÷ # ÷ [0.2] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u003A\u0308\r",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 003A ÷ 000A ÷ # ÷ [0.2] COLON (MidLetter) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 003A ÷ 000A ÷ # ÷ [0.2] COLON (MidLetter) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u003A\n",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 003A × 0308 ÷ 000A ÷ # ÷ [0.2] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 003A × 0308 ÷ 000A ÷ # ÷ [0.2] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u003A\u0308\n",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 003A ÷ 000B ÷ # ÷ [0.2] COLON (MidLetter) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 003A ÷ 000B ÷ # ÷ [0.2] COLON (MidLetter) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u003A\u000B",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 003A × 0308 ÷ 000B ÷ # ÷ [0.2] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 003A × 0308 ÷ 000B ÷ # ÷ [0.2] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u003A\u0308\u000B",
|
||||
new String[] { });
|
||||
|
||||
|
@ -1284,27 +1284,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u002C\u0308\u0001",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 002C ÷ 000D ÷ # ÷ [0.2] COMMA (MidNum) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 002C ÷ 000D ÷ # ÷ [0.2] COMMA (MidNum) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u002C\r",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 002C × 0308 ÷ 000D ÷ # ÷ [0.2] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 002C × 0308 ÷ 000D ÷ # ÷ [0.2] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u002C\u0308\r",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 002C ÷ 000A ÷ # ÷ [0.2] COMMA (MidNum) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 002C ÷ 000A ÷ # ÷ [0.2] COMMA (MidNum) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u002C\n",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 002C × 0308 ÷ 000A ÷ # ÷ [0.2] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 002C × 0308 ÷ 000A ÷ # ÷ [0.2] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u002C\u0308\n",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 002C ÷ 000B ÷ # ÷ [0.2] COMMA (MidNum) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 002C ÷ 000B ÷ # ÷ [0.2] COMMA (MidNum) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u002C\u000B",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 002C × 0308 ÷ 000B ÷ # ÷ [0.2] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 002C × 0308 ÷ 000B ÷ # ÷ [0.2] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u002C\u0308\u000B",
|
||||
new String[] { });
|
||||
|
||||
|
@ -1460,27 +1460,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u0027\u0308\u0001",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 0027 ÷ 000D ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0027 ÷ 000D ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0027\r",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 0027 × 0308 ÷ 000D ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0027 × 0308 ÷ 000D ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0027\u0308\r",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 0027 ÷ 000A ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0027 ÷ 000A ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0027\n",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 0027 × 0308 ÷ 000A ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0027 × 0308 ÷ 000A ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0027\u0308\n",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 0027 ÷ 000B ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0027 ÷ 000B ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0027\u000B",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 0027 × 0308 ÷ 000B ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0027 × 0308 ÷ 000B ÷ # ÷ [0.2] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0027\u0308\u000B",
|
||||
new String[] { });
|
||||
|
||||
|
@ -1636,27 +1636,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u0030\u0308\u0001",
|
||||
new String[] { "\u0030\u0308" });
|
||||
|
||||
// ÷ 0030 ÷ 000D ÷ # ÷ [0.2] DIGIT ZERO (Numeric) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0030 ÷ 000D ÷ # ÷ [0.2] DIGIT ZERO (Numeric) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0030\r",
|
||||
new String[] { "\u0030" });
|
||||
|
||||
// ÷ 0030 × 0308 ÷ 000D ÷ # ÷ [0.2] DIGIT ZERO (Numeric) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0030 × 0308 ÷ 000D ÷ # ÷ [0.2] DIGIT ZERO (Numeric) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0030\u0308\r",
|
||||
new String[] { "\u0030\u0308" });
|
||||
|
||||
// ÷ 0030 ÷ 000A ÷ # ÷ [0.2] DIGIT ZERO (Numeric) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0030 ÷ 000A ÷ # ÷ [0.2] DIGIT ZERO (Numeric) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0030\n",
|
||||
new String[] { "\u0030" });
|
||||
|
||||
// ÷ 0030 × 0308 ÷ 000A ÷ # ÷ [0.2] DIGIT ZERO (Numeric) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0030 × 0308 ÷ 000A ÷ # ÷ [0.2] DIGIT ZERO (Numeric) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0030\u0308\n",
|
||||
new String[] { "\u0030\u0308" });
|
||||
|
||||
// ÷ 0030 ÷ 000B ÷ # ÷ [0.2] DIGIT ZERO (Numeric) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0030 ÷ 000B ÷ # ÷ [0.2] DIGIT ZERO (Numeric) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0030\u000B",
|
||||
new String[] { "\u0030" });
|
||||
|
||||
// ÷ 0030 × 0308 ÷ 000B ÷ # ÷ [0.2] DIGIT ZERO (Numeric) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0030 × 0308 ÷ 000B ÷ # ÷ [0.2] DIGIT ZERO (Numeric) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0030\u0308\u000B",
|
||||
new String[] { "\u0030\u0308" });
|
||||
|
||||
|
@ -1812,27 +1812,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u005F\u0308\u0001",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 005F ÷ 000D ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 005F ÷ 000D ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u005F\r",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 005F × 0308 ÷ 000D ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 005F × 0308 ÷ 000D ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u005F\u0308\r",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 005F ÷ 000A ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 005F ÷ 000A ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u005F\n",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 005F × 0308 ÷ 000A ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 005F × 0308 ÷ 000A ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u005F\u0308\n",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 005F ÷ 000B ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 005F ÷ 000B ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u005F\u000B",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 005F × 0308 ÷ 000B ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 005F × 0308 ÷ 000B ÷ # ÷ [0.2] LOW LINE (ExtendNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u005F\u0308\u000B",
|
||||
new String[] { });
|
||||
|
||||
|
@ -1988,27 +1988,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u00AD\u0308\u0001",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 00AD ÷ 000D ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 00AD ÷ 000D ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u00AD\r",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 00AD × 0308 ÷ 000D ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 00AD × 0308 ÷ 000D ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u00AD\u0308\r",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 00AD ÷ 000A ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 00AD ÷ 000A ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u00AD\n",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 00AD × 0308 ÷ 000A ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 00AD × 0308 ÷ 000A ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u00AD\u0308\n",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 00AD ÷ 000B ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 00AD ÷ 000B ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u00AD\u000B",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 00AD × 0308 ÷ 000B ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 00AD × 0308 ÷ 000B ÷ # ÷ [0.2] SOFT HYPHEN (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u00AD\u0308\u000B",
|
||||
new String[] { });
|
||||
|
||||
|
@ -2164,27 +2164,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u0300\u0308\u0001",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 0300 ÷ 000D ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0300 ÷ 000D ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0300\r",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 0300 × 0308 ÷ 000D ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0300 × 0308 ÷ 000D ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0300\u0308\r",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 0300 ÷ 000A ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0300 ÷ 000A ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0300\n",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 0300 × 0308 ÷ 000A ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0300 × 0308 ÷ 000A ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0300\u0308\n",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 0300 ÷ 000B ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0300 ÷ 000B ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0300\u000B",
|
||||
new String[] { });
|
||||
|
||||
// ÷ 0300 × 0308 ÷ 000B ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0300 × 0308 ÷ 000B ÷ # ÷ [0.2] COMBINING GRAVE ACCENT (Extend_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0300\u0308\u000B",
|
||||
new String[] { });
|
||||
|
||||
|
@ -2340,27 +2340,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u0061\u2060\u0308\u0001",
|
||||
new String[] { "\u0061\u2060\u0308" });
|
||||
|
||||
// ÷ 0061 × 2060 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0061 × 2060 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u2060\r",
|
||||
new String[] { "\u0061\u2060" });
|
||||
|
||||
// ÷ 0061 × 2060 × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0061 × 2060 × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u2060\u0308\r",
|
||||
new String[] { "\u0061\u2060\u0308" });
|
||||
|
||||
// ÷ 0061 × 2060 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0061 × 2060 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u2060\n",
|
||||
new String[] { "\u0061\u2060" });
|
||||
|
||||
// ÷ 0061 × 2060 × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0061 × 2060 × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u2060\u0308\n",
|
||||
new String[] { "\u0061\u2060\u0308" });
|
||||
|
||||
// ÷ 0061 × 2060 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0061 × 2060 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u2060\u000B",
|
||||
new String[] { "\u0061\u2060" });
|
||||
|
||||
// ÷ 0061 × 2060 × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0061 × 2060 × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u2060\u0308\u000B",
|
||||
new String[] { "\u0061\u2060\u0308" });
|
||||
|
||||
|
@ -2516,27 +2516,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u0061\u003A\u0308\u0001",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 003A ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 003A ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u003A\r",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 003A × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 003A × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u003A\u0308\r",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 003A ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 003A ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u003A\n",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 003A × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 003A × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u003A\u0308\n",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 003A ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 003A ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u003A\u000B",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 003A × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 003A × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u003A\u0308\u000B",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
|
@ -2692,27 +2692,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u0061\u0027\u0308\u0001",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 0027 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 0027 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u0027\r",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 0027 × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 0027 × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u0027\u0308\r",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 0027 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 0027 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u0027\n",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 0027 × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 0027 × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u0027\u0308\n",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 0027 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 0027 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u0027\u000B",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 0027 × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 0027 × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u0027\u0308\u000B",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
|
@ -2868,27 +2868,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u0061\u0027\u2060\u0308\u0001",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 0027 × 2060 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 0027 × 2060 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u0027\u2060\r",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 0027 × 2060 × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 0027 × 2060 × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u0027\u2060\u0308\r",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 0027 × 2060 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 0027 × 2060 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u0027\u2060\n",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 0027 × 2060 × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 0027 × 2060 × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u0027\u2060\u0308\n",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 0027 × 2060 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 0027 × 2060 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u0027\u2060\u000B",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 0027 × 2060 × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 0027 × 2060 × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u0027\u2060\u0308\u000B",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
|
@ -3044,27 +3044,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u0061\u002C\u0308\u0001",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 002C ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 002C ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u002C\r",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 002C × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 002C × 0308 ÷ 000D ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u002C\u0308\r",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 002C ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 002C ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u002C\n",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 002C × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 002C × 0308 ÷ 000A ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u002C\u0308\n",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 002C ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 002C ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u002C\u000B",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
// ÷ 0061 ÷ 002C × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0061 ÷ 002C × 0308 ÷ 000B ÷ # ÷ [0.2] LATIN SMALL LETTER A (ALetter) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0061\u002C\u0308\u000B",
|
||||
new String[] { "\u0061" });
|
||||
|
||||
|
@ -3220,27 +3220,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u0031\u003A\u0308\u0001",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 003A ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 003A ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u003A\r",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 003A × 0308 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 003A × 0308 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u003A\u0308\r",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 003A ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 003A ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u003A\n",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 003A × 0308 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 003A × 0308 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u003A\u0308\n",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 003A ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 003A ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u003A\u000B",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 003A × 0308 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 003A × 0308 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COLON (MidLetter) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u003A\u0308\u000B",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
|
@ -3396,27 +3396,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u0031\u0027\u0308\u0001",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 0027 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 0027 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u0027\r",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 0027 × 0308 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 0027 × 0308 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u0027\u0308\r",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 0027 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 0027 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u0027\n",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 0027 × 0308 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 0027 × 0308 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u0027\u0308\n",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 0027 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 0027 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u0027\u000B",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 0027 × 0308 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 0027 × 0308 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] APOSTROPHE (MidNumLet) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u0027\u0308\u000B",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
|
@ -3572,27 +3572,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u0031\u002C\u0308\u0001",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 002C ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 002C ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u002C\r",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 002C × 0308 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 002C × 0308 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u002C\u0308\r",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 002C ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 002C ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u002C\n",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 002C × 0308 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 002C × 0308 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u002C\u0308\n",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 002C ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 002C ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u002C\u000B",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 002C × 0308 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 002C × 0308 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] COMMA (MidNum) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u002C\u0308\u000B",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
|
@ -3748,27 +3748,27 @@ public class WordBreakTestUnicode_6_0_0 extends BaseTokenStreamTestCase {
|
|||
assertAnalyzesTo(analyzer, "\u0031\u002E\u2060\u0308\u0001",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 002E × 2060 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 002E × 2060 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u002E\u2060\r",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 002E × 2060 × 0308 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 002E × 2060 × 0308 ÷ 000D ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u002E\u2060\u0308\r",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 002E × 2060 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 002E × 2060 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u002E\u2060\n",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 002E × 2060 × 0308 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 002E × 2060 × 0308 ÷ 000A ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE FEED (LF)> (LF) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u002E\u2060\u0308\n",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 002E × 2060 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 002E × 2060 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u002E\u2060\u000B",
|
||||
new String[] { "\u0031" });
|
||||
|
||||
// ÷ 0031 ÷ 002E × 2060 × 0308 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.11] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
// ÷ 0031 ÷ 002E × 2060 × 0308 ÷ 000B ÷ # ÷ [0.2] DIGIT ONE (Numeric) ÷ [999.0] FULL STOP (MidNumLet) × [4.0] WORD JOINER (Format_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [3.2] <LINE TABULATION> (Newline) ÷ [0.3]
|
||||
assertAnalyzesTo(analyzer, "\u0031\u002E\u2060\u0308\u000B",
|
||||
new String[] { "\u0031" });
|
||||
|
|
@ -26,7 +26,7 @@
|
|||
<import file="../analysis-module-build.xml"/>
|
||||
|
||||
<path id="icujar">
|
||||
<pathelement location="lib/icu4j-4.8.1.1.jar"/>
|
||||
<pathelement location="lib/icu4j-49.1.jar"/>
|
||||
</path>
|
||||
|
||||
<path id="classpath">
|
||||
|
@ -37,19 +37,32 @@
|
|||
|
||||
<target name="compile-core" depends="jar-analyzers-common, common.compile-core" />
|
||||
|
||||
<property name="gennorm2.src.dir" value="src/data/utr30"/>
|
||||
<property name="gennorm2.src.files"
|
||||
value="nfkc.txt nfkc_cf.txt BasicFoldings.txt DiacriticFolding.txt DingbatFolding.txt HanRadicalFolding.txt NativeDigitFolding.txt"/>
|
||||
<property name="utr30.data.dir" location="src/data/utr30"/>
|
||||
<target name="gen-utr30-data-files" depends="compile-tools">
|
||||
<java
|
||||
classname="org.apache.lucene.analysis.icu.GenerateUTR30DataFiles"
|
||||
dir="${utr30.data.dir}"
|
||||
fork="true"
|
||||
failonerror="true">
|
||||
<classpath>
|
||||
<path refid="icujar"/>
|
||||
<pathelement location="${build.dir}/classes/tools"/>
|
||||
</classpath>
|
||||
</java>
|
||||
</target>
|
||||
|
||||
<property name="gennorm2.src.files"
|
||||
value="nfc.txt nfkc.txt nfkc_cf.txt BasicFoldings.txt DiacriticFolding.txt DingbatFolding.txt HanRadicalFolding.txt NativeDigitFolding.txt"/>
|
||||
<property name="gennorm2.tmp" value="${build.dir}/gennorm2/utr30.tmp"/>
|
||||
<property name="gennorm2.dst" value="src/resources/org/apache/lucene/analysis/icu/utr30.nrm"/>
|
||||
<target name="gennorm2">
|
||||
<target name="gennorm2" depends="gen-utr30-data-files">
|
||||
<echo>Note that the gennorm2 and icupkg tools must be on your PATH. These tools
|
||||
are part of the ICU4C package. See http://site.icu-project.org/ </echo>
|
||||
<mkdir dir="${build.dir}/gennorm2"/>
|
||||
<exec executable="gennorm2" failonerror="true">
|
||||
<arg value="-v"/>
|
||||
<arg value="-s"/>
|
||||
<arg value="${gennorm2.src.dir}"/>
|
||||
<arg value="${utr30.data.dir}"/>
|
||||
<arg line="${gennorm2.src.files}"/>
|
||||
<arg value="-o"/>
|
||||
<arg value="${gennorm2.tmp}"/>
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
<ivy-module version="2.0">
|
||||
<info organisation="org.apache.lucene" module="analyzers-icu"/>
|
||||
<dependencies>
|
||||
<dependency org="com.ibm.icu" name="icu4j" rev="4.8.1.1" transitive="false"/>
|
||||
<dependency org="com.ibm.icu" name="icu4j" rev="49.1" transitive="false"/>
|
||||
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
|
||||
</dependencies>
|
||||
</ivy-module>
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
654b7021e7bb6c5b7b35c88d23cac1022c7b3d6b
|
|
@ -0,0 +1 @@
|
|||
fbf7a438e6bf3660e0da2fd77dd1df1635fe503c
|
|
@ -2,7 +2,7 @@ ICU License - ICU 1.8.1 and later
|
|||
|
||||
COPYRIGHT AND PERMISSION NOTICE
|
||||
|
||||
Copyright (c) 1995-2008 International Business Machines Corporation and others
|
||||
Copyright (c) 1995-2012 International Business Machines Corporation and others
|
||||
|
||||
All rights reserved.
|
||||
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
ICU4J, (under modules/analysis/icu) is licensed under an MIT style license
|
||||
(modules/analysis/icu/lib/icu4j-LICENSE-BSD_LIKE.txt) and Copyright (c) 1995-2011
|
||||
ICU4J, (under lucene/analysis/icu) is licensed under an MIT style license
|
||||
(lucene/analysis/icu/lib/icu4j-LICENSE-BSD_LIKE.txt) and Copyright (c) 1995-2012
|
||||
International Business Machines Corporation and others
|
|
@ -20,67 +20,96 @@
|
|||
|
||||
### Custom Normalization mappings for UTR#30
|
||||
### (http://www.unicode.org/reports/tr30/tr30-4.html)
|
||||
###
|
||||
### Created from Unicode 5.2 UCD
|
||||
###
|
||||
|
||||
#### WARNING ####
|
||||
#### Rule: lines direct content generation.
|
||||
#### All non-comments will be REMOVED when this file's contents
|
||||
#### are generated by 'ant gen-utr30-data-files'.
|
||||
#### Use "# Rule: verbatim" to keep non-comments up until
|
||||
#### the next "# Rule:" line.
|
||||
#### WARNING ####
|
||||
|
||||
## Accent removal
|
||||
# See DiacriticFolding.txt
|
||||
|
||||
## Case Folding (done by cf)
|
||||
|
||||
## Canonical Duplicates Folding (done by cd)
|
||||
|
||||
## Dashes folding
|
||||
# [[:Dash:][:Pd:]]-2053(swung dash) > U+002D
|
||||
# Rule: [[[[:Dash:][:Pd:]]-[\u2053\uFE31\uFE32]] - [\u002D]] > 002D
|
||||
058A>002D
|
||||
05BE>002D
|
||||
1400>002D
|
||||
1806>002D
|
||||
2010..2015>002D
|
||||
207B>002D
|
||||
208B>002D
|
||||
2212>002D
|
||||
2E17>002D
|
||||
2E1A>002D
|
||||
2E3A..2E3B>002D
|
||||
301C>002D
|
||||
3030>002D
|
||||
30A0>002D
|
||||
#2053>002D
|
||||
2212>002D
|
||||
# FE31,FE32,FE58,FE63,FF0D done by kd
|
||||
FE58>002D
|
||||
FE63>002D
|
||||
FF0D>002D
|
||||
|
||||
## Greek letterforms folding (done by kd)
|
||||
|
||||
## Hebrew alternates folding (done by kd)
|
||||
|
||||
## Jamo folding (done by kd)
|
||||
|
||||
## Math symbol folding (done by kd)
|
||||
|
||||
## Native digit folding
|
||||
# See NativeDigitFolding.txt
|
||||
|
||||
## Nobreak folding (done by kd)
|
||||
## Overline Folding
|
||||
FE49..FE4C>203E
|
||||
|
||||
## Overline Folding (done by kd)
|
||||
|
||||
## Positional forms folding (done by kd)
|
||||
|
||||
## Small forms folding (done by kd)
|
||||
|
||||
## Space Folding
|
||||
# [:Zs:] > U+0020
|
||||
# Rule: [[:Zs:] - [:Changes_When_NFKC_Casefolded=Yes:] - [\u0020]] > 0020
|
||||
1680>0020
|
||||
180E>0020
|
||||
# 00A0, 2000..200A,202F,205F,3000 done by kd
|
||||
|
||||
## Spacing Accents folding (done by kd)
|
||||
|
||||
## Subscript folding (done by kd)
|
||||
|
||||
## Symbol folding (done by kd)
|
||||
|
||||
## Underline Folding
|
||||
# Rule: verbatim
|
||||
2017>005E
|
||||
FE4D..FE4F>005E
|
||||
|
||||
## Diacritic Folding
|
||||
#
|
||||
# See DiacriticFolding.txt
|
||||
|
||||
## Vertical forms folding (done by kd)
|
||||
|
||||
## Han Radical Folding
|
||||
# See HanRadicalFolding.txt
|
||||
|
||||
## Letter Form Folding (done by kd)
|
||||
## Superscript folding
|
||||
# Additions to kd:
|
||||
# Rule: verbatim
|
||||
02C0>0294
|
||||
02C1>0295
|
||||
06E5>0648
|
||||
06E6>064A
|
||||
## Suzhou Numeral Folding
|
||||
# Additions to kd:
|
||||
# Rule: verbatim
|
||||
3021>4E00
|
||||
3022>4E8C
|
||||
3023>4E09
|
||||
|
@ -92,6 +121,7 @@ FE4D..FE4F>005E
|
|||
3029>4E5D
|
||||
## Width Folding (done by kd)
|
||||
# Punctuation Folding
|
||||
# Rule: verbatim
|
||||
00AB>0022
|
||||
00BB>0022
|
||||
201C..201E>0022
|
||||
|
|
|
@ -24,41 +24,45 @@
|
|||
### Created from Unicode 5.2 UCD
|
||||
###
|
||||
|
||||
# Removes diacritics, as defined by [:Diacritic:]
|
||||
# These may or may not be combining marks
|
||||
#### WARNING ####
|
||||
#### Rule: lines direct content generation.
|
||||
#### All non-comments will be REMOVED when this file's contents
|
||||
#### are generated by 'ant gen-utr30-data-files'.
|
||||
#### Use "# Rule: verbatim" to keep non-comments up until
|
||||
#### the next "# Rule:" line.
|
||||
#### WARNING ####
|
||||
|
||||
## Remove diacritics
|
||||
# Rule: [:Diacritic:] >
|
||||
005E>
|
||||
0060>
|
||||
00B7>
|
||||
02B9..02D7>
|
||||
02DE>
|
||||
02DF>
|
||||
02E5..033F>
|
||||
0342>
|
||||
0346..034E>
|
||||
00A8>
|
||||
00AF>
|
||||
00B4>
|
||||
00B7..00B8>
|
||||
02B0..034E>
|
||||
0350..0357>
|
||||
035D..0362>
|
||||
0375>
|
||||
0374..0375>
|
||||
037A>
|
||||
0384..0385>
|
||||
0483..0487>
|
||||
0559>
|
||||
0591..05A1>
|
||||
05A3..05BD>
|
||||
05BF>
|
||||
05C1>
|
||||
05C2>
|
||||
05C1..05C2>
|
||||
05C4>
|
||||
064B..0652>
|
||||
0657>
|
||||
0658>
|
||||
06DF>
|
||||
06E0>
|
||||
06E5>
|
||||
06E6>
|
||||
0657..0658>
|
||||
06DF..06E0>
|
||||
06E5..06E6>
|
||||
06EA..06EC>
|
||||
0730..074A>
|
||||
07A6..07B0>
|
||||
07EB..07F5>
|
||||
0818>
|
||||
0819>
|
||||
0818..0819>
|
||||
08E4..08FE>
|
||||
093C>
|
||||
094D>
|
||||
0951..0954>
|
||||
|
@ -80,24 +84,19 @@
|
|||
0E47..0E4C>
|
||||
0E4E>
|
||||
0EC8..0ECC>
|
||||
0F18>
|
||||
0F19>
|
||||
0F18..0F19>
|
||||
0F35>
|
||||
0F37>
|
||||
0F39>
|
||||
0F3E>
|
||||
0F3F>
|
||||
0F3E..0F3F>
|
||||
0F82..0F84>
|
||||
0F86>
|
||||
0F87>
|
||||
0F86..0F87>
|
||||
0FC6>
|
||||
1037>
|
||||
1039>
|
||||
103A>
|
||||
1039..103A>
|
||||
1087..108D>
|
||||
108F>
|
||||
109A>
|
||||
109B>
|
||||
109A..109B>
|
||||
17C9..17D3>
|
||||
17DD>
|
||||
1939..193B>
|
||||
|
@ -106,31 +105,33 @@
|
|||
1B34>
|
||||
1B44>
|
||||
1B6B..1B73>
|
||||
1BAA>
|
||||
1C36>
|
||||
1C37>
|
||||
1BAA..1BAB>
|
||||
1C36..1C37>
|
||||
1C78..1C7D>
|
||||
1CD0..1CE8>
|
||||
1CED>
|
||||
1D2F>
|
||||
1D3B>
|
||||
1D4E>
|
||||
1CF4>
|
||||
1D2C..1D6A>
|
||||
1DC4..1DCF>
|
||||
1DFD..1DFF>
|
||||
1FBD>
|
||||
1FBF..1FC1>
|
||||
1FCD..1FCF>
|
||||
1FDD..1FDF>
|
||||
1FED..1FEF>
|
||||
1FFD..1FFE>
|
||||
2CEF..2CF1>
|
||||
2E2F>
|
||||
302A..302F>
|
||||
3099>
|
||||
309A>
|
||||
3099..309C>
|
||||
30FC>
|
||||
A66F>
|
||||
A67C>
|
||||
A67D>
|
||||
A67C..A67D>
|
||||
A67F>
|
||||
A6F0>
|
||||
A6F1>
|
||||
A6F0..A6F1>
|
||||
A717..A721>
|
||||
A788>
|
||||
A7F8..A7F9>
|
||||
A8C4>
|
||||
A8E0..A8F1>
|
||||
A92B..A92E>
|
||||
|
@ -139,12 +140,20 @@ A9B3>
|
|||
A9C0>
|
||||
AA7B>
|
||||
AABF..AAC2>
|
||||
ABEC>
|
||||
ABED>
|
||||
AAF6>
|
||||
ABEC..ABED>
|
||||
FB1E>
|
||||
FE20..FE26>
|
||||
110B9>
|
||||
110BA>
|
||||
FF3E>
|
||||
FF40>
|
||||
FF70>
|
||||
FF9E..FF9F>
|
||||
FFE3>
|
||||
110B9..110BA>
|
||||
11133..11134>
|
||||
111C0>
|
||||
116B6..116B7>
|
||||
16F8F..16F9F>
|
||||
1D167..1D169>
|
||||
1D16D..1D172>
|
||||
1D17B..1D182>
|
||||
|
@ -153,6 +162,7 @@ FE20..FE26>
|
|||
|
||||
# Latin script "composed" that do not further decompose, so decompose here
|
||||
# These are from AsciiFoldingFilter
|
||||
# Rule: verbatim
|
||||
00E6>0061 0065
|
||||
00F0>0064
|
||||
00F8>006F
|
||||
|
@ -491,6 +501,7 @@ A7FF>004D
|
|||
|
||||
# Cyrillic script "composed" that do not further decompose, so decompose here
|
||||
# These are from UTR#30 DiacriticFolding.txt
|
||||
# Rule: verbatim
|
||||
|
||||
047D>0461
|
||||
048B>0439
|
||||
|
@ -520,6 +531,7 @@ A7FF>004D
|
|||
04CE>043C
|
||||
|
||||
# Additional signs and diacritic, from examination of [:Mark:]&[:Lm:]
|
||||
# Rule: verbatim
|
||||
0358..035C>
|
||||
05A2>
|
||||
05C5>
|
||||
|
@ -555,6 +567,7 @@ A802>
|
|||
1D242..1D244>
|
||||
|
||||
# Additional Arabic/Hebrew decompositions
|
||||
# Rule: verbatim
|
||||
05F3>0027
|
||||
05F4>0022
|
||||
0629>0647
|
||||
|
|
|
@ -24,8 +24,17 @@
|
|||
### Created from Unicode 5.2 UCD
|
||||
###
|
||||
|
||||
#### WARNING ####
|
||||
#### Rule: lines direct content generation.
|
||||
#### All non-comments will be REMOVED when this file's contents
|
||||
#### are generated by 'ant gen-utr30-data-files'.
|
||||
#### Use "# Rule: verbatim" to keep non-comments up until
|
||||
#### the next "# Rule:" line.
|
||||
#### WARNING ####
|
||||
|
||||
# Folds dingbats and other adorned forms
|
||||
# Generated from ASCIIFoldingFilter
|
||||
# Rule: verbatim
|
||||
24EB>0031 0031
|
||||
24EC>0031 0032
|
||||
24ED>0031 0033
|
||||
|
|
|
@ -24,6 +24,16 @@
|
|||
### Created from UTR#30 HanRadicalFolding.txt
|
||||
###
|
||||
|
||||
#### WARNING ####
|
||||
#### Rule: lines direct content generation.
|
||||
#### All non-comments will be REMOVED when this file's contents
|
||||
#### are generated by 'ant gen-utr30-data-files'.
|
||||
#### Use "# Rule: verbatim" to keep non-comments up until
|
||||
#### the next "# Rule:" line.
|
||||
#### WARNING ####
|
||||
|
||||
# Rule: verbatim
|
||||
|
||||
# CJK Radicals
|
||||
2E81>5382
|
||||
2E82>4E5B
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# Copyright 2001-2010 Unicode, Inc.
|
||||
#
|
||||
# Copyright 2001-2012 Unicode, Inc.
|
||||
#
|
||||
# Disclaimer
|
||||
#
|
||||
#
|
||||
# This source code is provided as is by Unicode, Inc. No claims are
|
||||
# made as to fitness for any particular purpose. No warranties of any
|
||||
# kind are expressed or implied. The recipient agrees to determine
|
||||
|
@ -9,463 +9,485 @@
|
|||
# purchased on magnetic or optical media from Unicode, Inc., the
|
||||
# sole remedy for any claim will be exchange of defective media
|
||||
# within 90 days of receipt.
|
||||
#
|
||||
#
|
||||
# Limitations on Rights to Redistribute This Code
|
||||
#
|
||||
#
|
||||
# Unicode, Inc. hereby grants the right to freely use the information
|
||||
# supplied in this file in the creation of products supporting the
|
||||
# Unicode Standard, and to make copies of this file in any form
|
||||
# for internal or external distribution as long as this notice
|
||||
# remains attached.
|
||||
|
||||
### Custom Normalization mappings for UTR#30
|
||||
### Custom Normalization mappings for UTR#30
|
||||
### (http://www.unicode.org/reports/tr30/tr30-4.html)
|
||||
###
|
||||
### Created from Unicode 5.2 UCD
|
||||
###
|
||||
|
||||
#### WARNING ####
|
||||
#### Rule: lines direct content generation.
|
||||
#### All non-comments will be REMOVED when this file's contents
|
||||
#### are generated by 'ant gen-utr30-data-files'.
|
||||
#### Use "# Rule: verbatim" to keep non-comments up until
|
||||
#### the next "# Rule:" line.
|
||||
#### WARNING ####
|
||||
|
||||
## Native digit folding
|
||||
# [:Nd:] > Ascii digit equivalent
|
||||
# Arabic-Indic
|
||||
0660>0030
|
||||
0661>0031
|
||||
0662>0032
|
||||
0663>0033
|
||||
0664>0034
|
||||
0665>0035
|
||||
0666>0036
|
||||
0667>0037
|
||||
0668>0038
|
||||
0669>0039
|
||||
# Eastern Arabic-Indic
|
||||
06F0>0030
|
||||
06F1>0031
|
||||
06F2>0032
|
||||
06F3>0033
|
||||
06F4>0034
|
||||
06F5>0035
|
||||
06F6>0036
|
||||
06F7>0037
|
||||
06F8>0038
|
||||
06F9>0039
|
||||
# NKo
|
||||
07C0>0030
|
||||
07C1>0031
|
||||
07C2>0032
|
||||
07C3>0033
|
||||
07C4>0034
|
||||
07C5>0035
|
||||
07C6>0036
|
||||
07C7>0037
|
||||
07C8>0038
|
||||
07C9>0039
|
||||
# Devanagari
|
||||
0966>0030
|
||||
0967>0031
|
||||
0968>0032
|
||||
0969>0033
|
||||
096A>0034
|
||||
096B>0035
|
||||
096C>0036
|
||||
096D>0037
|
||||
096E>0038
|
||||
096F>0039
|
||||
# Bengali
|
||||
09E6>0030
|
||||
09E7>0031
|
||||
09E8>0032
|
||||
09E9>0033
|
||||
09EA>0034
|
||||
09EB>0035
|
||||
09EC>0036
|
||||
09ED>0037
|
||||
09EE>0038
|
||||
09EF>0039
|
||||
# Gurmukhi
|
||||
0A66>0030
|
||||
0A67>0031
|
||||
0A68>0032
|
||||
0A69>0033
|
||||
0A6A>0034
|
||||
0A6B>0035
|
||||
0A6C>0036
|
||||
0A6D>0037
|
||||
0A6E>0038
|
||||
0A6F>0039
|
||||
# Gujarati
|
||||
0AE6>0030
|
||||
0AE7>0031
|
||||
0AE8>0032
|
||||
0AE9>0033
|
||||
0AEA>0034
|
||||
0AEB>0035
|
||||
0AEC>0036
|
||||
0AED>0037
|
||||
0AEE>0038
|
||||
0AEF>0039
|
||||
# Oriya
|
||||
0B66>0030
|
||||
0B67>0031
|
||||
0B68>0032
|
||||
0B69>0033
|
||||
0B6A>0034
|
||||
0B6B>0035
|
||||
0B6C>0036
|
||||
0B6D>0037
|
||||
0B6E>0038
|
||||
0B6F>0039
|
||||
# Tamil
|
||||
0BE6>0030
|
||||
0BE7>0031
|
||||
0BE8>0032
|
||||
0BE9>0033
|
||||
0BEA>0034
|
||||
0BEB>0035
|
||||
0BEC>0036
|
||||
0BED>0037
|
||||
0BEE>0038
|
||||
0BEF>0039
|
||||
# Telugu
|
||||
0C66>0030
|
||||
0C67>0031
|
||||
0C68>0032
|
||||
0C69>0033
|
||||
0C6A>0034
|
||||
0C6B>0035
|
||||
0C6C>0036
|
||||
0C6D>0037
|
||||
0C6E>0038
|
||||
0C6F>0039
|
||||
# Kannada
|
||||
0CE6>0030
|
||||
0CE7>0031
|
||||
0CE8>0032
|
||||
0CE9>0033
|
||||
0CEA>0034
|
||||
0CEB>0035
|
||||
0CEC>0036
|
||||
0CED>0037
|
||||
0CEE>0038
|
||||
0CEF>0039
|
||||
# Malayalam
|
||||
0D66>0030
|
||||
0D67>0031
|
||||
0D68>0032
|
||||
0D69>0033
|
||||
0D6A>0034
|
||||
0D6B>0035
|
||||
0D6C>0036
|
||||
0D6D>0037
|
||||
0D6E>0038
|
||||
0D6F>0039
|
||||
# Thai
|
||||
0E50>0030
|
||||
0E51>0031
|
||||
0E52>0032
|
||||
0E53>0033
|
||||
0E54>0034
|
||||
0E55>0035
|
||||
0E56>0036
|
||||
0E57>0037
|
||||
0E58>0038
|
||||
0E59>0039
|
||||
# Lao
|
||||
0ED0>0030
|
||||
0ED1>0031
|
||||
0ED2>0032
|
||||
0ED3>0033
|
||||
0ED4>0034
|
||||
0ED5>0035
|
||||
0ED6>0036
|
||||
0ED7>0037
|
||||
0ED8>0038
|
||||
0ED9>0039
|
||||
# Tibetan
|
||||
0F20>0030
|
||||
0F21>0031
|
||||
0F22>0032
|
||||
0F23>0033
|
||||
0F24>0034
|
||||
0F25>0035
|
||||
0F26>0036
|
||||
0F27>0037
|
||||
0F28>0038
|
||||
0F29>0039
|
||||
# Myanmar
|
||||
1040>0030
|
||||
1041>0031
|
||||
1042>0032
|
||||
1043>0033
|
||||
1044>0034
|
||||
1045>0035
|
||||
1046>0036
|
||||
1047>0037
|
||||
1048>0038
|
||||
1049>0039
|
||||
# Myanmar Shan
|
||||
1090>0030
|
||||
1091>0031
|
||||
1092>0032
|
||||
1093>0033
|
||||
1094>0034
|
||||
1095>0035
|
||||
1096>0036
|
||||
1097>0037
|
||||
1098>0038
|
||||
1099>0039
|
||||
# Khmer
|
||||
17E0>0030
|
||||
17E1>0031
|
||||
17E2>0032
|
||||
17E3>0033
|
||||
17E4>0034
|
||||
17E5>0035
|
||||
17E6>0036
|
||||
17E7>0037
|
||||
17E8>0038
|
||||
17E9>0039
|
||||
# Mongolian
|
||||
1810>0030
|
||||
1811>0031
|
||||
1812>0032
|
||||
1813>0033
|
||||
1814>0034
|
||||
1815>0035
|
||||
1816>0036
|
||||
1817>0037
|
||||
1818>0038
|
||||
1819>0039
|
||||
# Limbu
|
||||
1946>0030
|
||||
1947>0031
|
||||
1948>0032
|
||||
1949>0033
|
||||
194A>0034
|
||||
194B>0035
|
||||
194C>0036
|
||||
194D>0037
|
||||
194E>0038
|
||||
194F>0039
|
||||
# New Tai Lue
|
||||
19D0>0030
|
||||
19D1>0031
|
||||
19D2>0032
|
||||
19D3>0033
|
||||
19D4>0034
|
||||
19D5>0035
|
||||
19D6>0036
|
||||
19D7>0037
|
||||
19D8>0038
|
||||
19D9>0039
|
||||
# New Tai Lue Tham Digit One
|
||||
19DA>0031
|
||||
# Tai Tham Hora
|
||||
1A80>0030
|
||||
1A81>0031
|
||||
1A82>0032
|
||||
1A83>0033
|
||||
1A84>0034
|
||||
1A85>0035
|
||||
1A86>0036
|
||||
1A87>0037
|
||||
1A88>0038
|
||||
1A89>0039
|
||||
# Tai Tham Tham
|
||||
1A90>0030
|
||||
1A91>0031
|
||||
1A92>0032
|
||||
1A93>0033
|
||||
1A94>0034
|
||||
1A95>0035
|
||||
1A96>0036
|
||||
1A97>0037
|
||||
1A98>0038
|
||||
1A99>0039
|
||||
# Balinese
|
||||
1B50>0030
|
||||
1B51>0031
|
||||
1B52>0032
|
||||
1B53>0033
|
||||
1B54>0034
|
||||
1B55>0035
|
||||
1B56>0036
|
||||
1B57>0037
|
||||
1B58>0038
|
||||
1B59>0039
|
||||
# Sundanese
|
||||
1BB0>0030
|
||||
1BB1>0031
|
||||
1BB2>0032
|
||||
1BB3>0033
|
||||
1BB4>0034
|
||||
1BB5>0035
|
||||
1BB6>0036
|
||||
1BB7>0037
|
||||
1BB8>0038
|
||||
1BB9>0039
|
||||
# Lepcha
|
||||
1C40>0030
|
||||
1C41>0031
|
||||
1C42>0032
|
||||
1C43>0033
|
||||
1C44>0034
|
||||
1C45>0035
|
||||
1C46>0036
|
||||
1C47>0037
|
||||
1C48>0038
|
||||
1C49>0039
|
||||
# Ol Chiki
|
||||
1C50>0030
|
||||
1C51>0031
|
||||
1C52>0032
|
||||
1C53>0033
|
||||
1C54>0034
|
||||
1C55>0035
|
||||
1C56>0036
|
||||
1C57>0037
|
||||
1C58>0038
|
||||
1C59>0039
|
||||
# Vai
|
||||
A620>0030
|
||||
A621>0031
|
||||
A622>0032
|
||||
A623>0033
|
||||
A624>0034
|
||||
A625>0035
|
||||
A626>0036
|
||||
A627>0037
|
||||
A628>0038
|
||||
A629>0039
|
||||
# Saurashtra
|
||||
A8D0>0030
|
||||
A8D1>0031
|
||||
A8D2>0032
|
||||
A8D3>0033
|
||||
A8D4>0034
|
||||
A8D5>0035
|
||||
A8D6>0036
|
||||
A8D7>0037
|
||||
A8D8>0038
|
||||
A8D9>0039
|
||||
# Kayah Li
|
||||
A900>0030
|
||||
A901>0031
|
||||
A902>0032
|
||||
A903>0033
|
||||
A904>0034
|
||||
A905>0035
|
||||
A906>0036
|
||||
A907>0037
|
||||
A908>0038
|
||||
A909>0039
|
||||
# Javanese
|
||||
A9D0>0030
|
||||
A9D1>0031
|
||||
A9D2>0032
|
||||
A9D3>0033
|
||||
A9D4>0034
|
||||
A9D5>0035
|
||||
A9D6>0036
|
||||
A9D7>0037
|
||||
A9D8>0038
|
||||
A9D9>0039
|
||||
# Cham
|
||||
AA50>0030
|
||||
AA51>0031
|
||||
AA52>0032
|
||||
AA53>0033
|
||||
AA54>0034
|
||||
AA55>0035
|
||||
AA56>0036
|
||||
AA57>0037
|
||||
AA58>0038
|
||||
AA59>0039
|
||||
# Meetei Mayek
|
||||
ABF0>0030
|
||||
ABF1>0031
|
||||
ABF2>0032
|
||||
ABF3>0033
|
||||
ABF4>0034
|
||||
ABF5>0035
|
||||
ABF6>0036
|
||||
ABF7>0037
|
||||
ABF8>0038
|
||||
ABF9>0039
|
||||
# Halfwidth and Fullwidth Forms (done by kd)
|
||||
# Osmanya
|
||||
104A0>0030
|
||||
104A1>0031
|
||||
104A2>0032
|
||||
104A3>0033
|
||||
104A4>0034
|
||||
104A5>0035
|
||||
104A6>0036
|
||||
104A7>0037
|
||||
104A8>0038
|
||||
104A9>0039
|
||||
# Brahmi
|
||||
11066>0030
|
||||
11067>0031
|
||||
11068>0032
|
||||
11069>0033
|
||||
1106A>0034
|
||||
1106B>0035
|
||||
1106C>0036
|
||||
1106D>0037
|
||||
1106E>0038
|
||||
1106F>0039
|
||||
# Mathematical Alphanumeric Symbols - Bold digits
|
||||
1D7CE>0030
|
||||
1D7CF>0031
|
||||
1D7D0>0032
|
||||
1D7D1>0033
|
||||
1D7D2>0034
|
||||
1D7D3>0035
|
||||
1D7D4>0036
|
||||
1D7D5>0037
|
||||
1D7D6>0038
|
||||
1D7D7>0039
|
||||
# Mathematical Alphanumeric Symbols - Double-struck digits
|
||||
1D7D8>0030
|
||||
1D7D9>0031
|
||||
1D7DA>0032
|
||||
1D7DB>0033
|
||||
1D7DC>0034
|
||||
1D7DD>0035
|
||||
1D7DE>0036
|
||||
1D7DF>0037
|
||||
1D7E0>0038
|
||||
1D7E1>0039
|
||||
# Mathematical Alphanumeric Symbols - Sans-serif digits
|
||||
1D7E2>0030
|
||||
1D7E3>0031
|
||||
1D7E4>0032
|
||||
1D7E5>0033
|
||||
1D7E6>0034
|
||||
1D7E7>0035
|
||||
1D7E8>0036
|
||||
1D7E9>0037
|
||||
1D7EA>0038
|
||||
1D7EB>0039
|
||||
# Mathematical Alphanumeric Symbols - Sans-serif bold digits
|
||||
1D7EC>0030
|
||||
1D7ED>0031
|
||||
1D7EE>0032
|
||||
1D7EF>0033
|
||||
1D7F0>0034
|
||||
1D7F1>0035
|
||||
1D7F2>0036
|
||||
1D7F3>0037
|
||||
1D7F4>0038
|
||||
1D7F5>0039
|
||||
# Mathematical Alphanumeric Symbols - Monospace digits
|
||||
1D7F6>0030
|
||||
1D7F7>0031
|
||||
1D7F8>0032
|
||||
1D7F9>0033
|
||||
1D7FA>0034
|
||||
1D7FB>0035
|
||||
1D7FC>0036
|
||||
1D7FD>0037
|
||||
1D7FE>0038
|
||||
1D7FF>0039
|
||||
# Rule: [[[:Numeric_Type=Digit:][:Nd:]] - [[:Changes_When_NFKC_Casefolded=Yes:][:Block=Superscripts_And_Subscripts:][\u00B2\u00B3\u00B9][\u0030-\u0039]]] > Numeric_Value
|
||||
0660>0030 # ARABIC-INDIC DIGIT ZERO
|
||||
0661>0031 # ARABIC-INDIC DIGIT ONE
|
||||
0662>0032 # ARABIC-INDIC DIGIT TWO
|
||||
0663>0033 # ARABIC-INDIC DIGIT THREE
|
||||
0664>0034 # ARABIC-INDIC DIGIT FOUR
|
||||
0665>0035 # ARABIC-INDIC DIGIT FIVE
|
||||
0666>0036 # ARABIC-INDIC DIGIT SIX
|
||||
0667>0037 # ARABIC-INDIC DIGIT SEVEN
|
||||
0668>0038 # ARABIC-INDIC DIGIT EIGHT
|
||||
0669>0039 # ARABIC-INDIC DIGIT NINE
|
||||
06F0>0030 # EXTENDED ARABIC-INDIC DIGIT ZERO
|
||||
06F1>0031 # EXTENDED ARABIC-INDIC DIGIT ONE
|
||||
06F2>0032 # EXTENDED ARABIC-INDIC DIGIT TWO
|
||||
06F3>0033 # EXTENDED ARABIC-INDIC DIGIT THREE
|
||||
06F4>0034 # EXTENDED ARABIC-INDIC DIGIT FOUR
|
||||
06F5>0035 # EXTENDED ARABIC-INDIC DIGIT FIVE
|
||||
06F6>0036 # EXTENDED ARABIC-INDIC DIGIT SIX
|
||||
06F7>0037 # EXTENDED ARABIC-INDIC DIGIT SEVEN
|
||||
06F8>0038 # EXTENDED ARABIC-INDIC DIGIT EIGHT
|
||||
06F9>0039 # EXTENDED ARABIC-INDIC DIGIT NINE
|
||||
07C0>0030 # NKO DIGIT ZERO
|
||||
07C1>0031 # NKO DIGIT ONE
|
||||
07C2>0032 # NKO DIGIT TWO
|
||||
07C3>0033 # NKO DIGIT THREE
|
||||
07C4>0034 # NKO DIGIT FOUR
|
||||
07C5>0035 # NKO DIGIT FIVE
|
||||
07C6>0036 # NKO DIGIT SIX
|
||||
07C7>0037 # NKO DIGIT SEVEN
|
||||
07C8>0038 # NKO DIGIT EIGHT
|
||||
07C9>0039 # NKO DIGIT NINE
|
||||
0966>0030 # DEVANAGARI DIGIT ZERO
|
||||
0967>0031 # DEVANAGARI DIGIT ONE
|
||||
0968>0032 # DEVANAGARI DIGIT TWO
|
||||
0969>0033 # DEVANAGARI DIGIT THREE
|
||||
096A>0034 # DEVANAGARI DIGIT FOUR
|
||||
096B>0035 # DEVANAGARI DIGIT FIVE
|
||||
096C>0036 # DEVANAGARI DIGIT SIX
|
||||
096D>0037 # DEVANAGARI DIGIT SEVEN
|
||||
096E>0038 # DEVANAGARI DIGIT EIGHT
|
||||
096F>0039 # DEVANAGARI DIGIT NINE
|
||||
09E6>0030 # BENGALI DIGIT ZERO
|
||||
09E7>0031 # BENGALI DIGIT ONE
|
||||
09E8>0032 # BENGALI DIGIT TWO
|
||||
09E9>0033 # BENGALI DIGIT THREE
|
||||
09EA>0034 # BENGALI DIGIT FOUR
|
||||
09EB>0035 # BENGALI DIGIT FIVE
|
||||
09EC>0036 # BENGALI DIGIT SIX
|
||||
09ED>0037 # BENGALI DIGIT SEVEN
|
||||
09EE>0038 # BENGALI DIGIT EIGHT
|
||||
09EF>0039 # BENGALI DIGIT NINE
|
||||
0A66>0030 # GURMUKHI DIGIT ZERO
|
||||
0A67>0031 # GURMUKHI DIGIT ONE
|
||||
0A68>0032 # GURMUKHI DIGIT TWO
|
||||
0A69>0033 # GURMUKHI DIGIT THREE
|
||||
0A6A>0034 # GURMUKHI DIGIT FOUR
|
||||
0A6B>0035 # GURMUKHI DIGIT FIVE
|
||||
0A6C>0036 # GURMUKHI DIGIT SIX
|
||||
0A6D>0037 # GURMUKHI DIGIT SEVEN
|
||||
0A6E>0038 # GURMUKHI DIGIT EIGHT
|
||||
0A6F>0039 # GURMUKHI DIGIT NINE
|
||||
0AE6>0030 # GUJARATI DIGIT ZERO
|
||||
0AE7>0031 # GUJARATI DIGIT ONE
|
||||
0AE8>0032 # GUJARATI DIGIT TWO
|
||||
0AE9>0033 # GUJARATI DIGIT THREE
|
||||
0AEA>0034 # GUJARATI DIGIT FOUR
|
||||
0AEB>0035 # GUJARATI DIGIT FIVE
|
||||
0AEC>0036 # GUJARATI DIGIT SIX
|
||||
0AED>0037 # GUJARATI DIGIT SEVEN
|
||||
0AEE>0038 # GUJARATI DIGIT EIGHT
|
||||
0AEF>0039 # GUJARATI DIGIT NINE
|
||||
0B66>0030 # ORIYA DIGIT ZERO
|
||||
0B67>0031 # ORIYA DIGIT ONE
|
||||
0B68>0032 # ORIYA DIGIT TWO
|
||||
0B69>0033 # ORIYA DIGIT THREE
|
||||
0B6A>0034 # ORIYA DIGIT FOUR
|
||||
0B6B>0035 # ORIYA DIGIT FIVE
|
||||
0B6C>0036 # ORIYA DIGIT SIX
|
||||
0B6D>0037 # ORIYA DIGIT SEVEN
|
||||
0B6E>0038 # ORIYA DIGIT EIGHT
|
||||
0B6F>0039 # ORIYA DIGIT NINE
|
||||
0BE6>0030 # TAMIL DIGIT ZERO
|
||||
0BE7>0031 # TAMIL DIGIT ONE
|
||||
0BE8>0032 # TAMIL DIGIT TWO
|
||||
0BE9>0033 # TAMIL DIGIT THREE
|
||||
0BEA>0034 # TAMIL DIGIT FOUR
|
||||
0BEB>0035 # TAMIL DIGIT FIVE
|
||||
0BEC>0036 # TAMIL DIGIT SIX
|
||||
0BED>0037 # TAMIL DIGIT SEVEN
|
||||
0BEE>0038 # TAMIL DIGIT EIGHT
|
||||
0BEF>0039 # TAMIL DIGIT NINE
|
||||
0C66>0030 # TELUGU DIGIT ZERO
|
||||
0C67>0031 # TELUGU DIGIT ONE
|
||||
0C68>0032 # TELUGU DIGIT TWO
|
||||
0C69>0033 # TELUGU DIGIT THREE
|
||||
0C6A>0034 # TELUGU DIGIT FOUR
|
||||
0C6B>0035 # TELUGU DIGIT FIVE
|
||||
0C6C>0036 # TELUGU DIGIT SIX
|
||||
0C6D>0037 # TELUGU DIGIT SEVEN
|
||||
0C6E>0038 # TELUGU DIGIT EIGHT
|
||||
0C6F>0039 # TELUGU DIGIT NINE
|
||||
0CE6>0030 # KANNADA DIGIT ZERO
|
||||
0CE7>0031 # KANNADA DIGIT ONE
|
||||
0CE8>0032 # KANNADA DIGIT TWO
|
||||
0CE9>0033 # KANNADA DIGIT THREE
|
||||
0CEA>0034 # KANNADA DIGIT FOUR
|
||||
0CEB>0035 # KANNADA DIGIT FIVE
|
||||
0CEC>0036 # KANNADA DIGIT SIX
|
||||
0CED>0037 # KANNADA DIGIT SEVEN
|
||||
0CEE>0038 # KANNADA DIGIT EIGHT
|
||||
0CEF>0039 # KANNADA DIGIT NINE
|
||||
0D66>0030 # MALAYALAM DIGIT ZERO
|
||||
0D67>0031 # MALAYALAM DIGIT ONE
|
||||
0D68>0032 # MALAYALAM DIGIT TWO
|
||||
0D69>0033 # MALAYALAM DIGIT THREE
|
||||
0D6A>0034 # MALAYALAM DIGIT FOUR
|
||||
0D6B>0035 # MALAYALAM DIGIT FIVE
|
||||
0D6C>0036 # MALAYALAM DIGIT SIX
|
||||
0D6D>0037 # MALAYALAM DIGIT SEVEN
|
||||
0D6E>0038 # MALAYALAM DIGIT EIGHT
|
||||
0D6F>0039 # MALAYALAM DIGIT NINE
|
||||
0E50>0030 # THAI DIGIT ZERO
|
||||
0E51>0031 # THAI DIGIT ONE
|
||||
0E52>0032 # THAI DIGIT TWO
|
||||
0E53>0033 # THAI DIGIT THREE
|
||||
0E54>0034 # THAI DIGIT FOUR
|
||||
0E55>0035 # THAI DIGIT FIVE
|
||||
0E56>0036 # THAI DIGIT SIX
|
||||
0E57>0037 # THAI DIGIT SEVEN
|
||||
0E58>0038 # THAI DIGIT EIGHT
|
||||
0E59>0039 # THAI DIGIT NINE
|
||||
0ED0>0030 # LAO DIGIT ZERO
|
||||
0ED1>0031 # LAO DIGIT ONE
|
||||
0ED2>0032 # LAO DIGIT TWO
|
||||
0ED3>0033 # LAO DIGIT THREE
|
||||
0ED4>0034 # LAO DIGIT FOUR
|
||||
0ED5>0035 # LAO DIGIT FIVE
|
||||
0ED6>0036 # LAO DIGIT SIX
|
||||
0ED7>0037 # LAO DIGIT SEVEN
|
||||
0ED8>0038 # LAO DIGIT EIGHT
|
||||
0ED9>0039 # LAO DIGIT NINE
|
||||
0F20>0030 # TIBETAN DIGIT ZERO
|
||||
0F21>0031 # TIBETAN DIGIT ONE
|
||||
0F22>0032 # TIBETAN DIGIT TWO
|
||||
0F23>0033 # TIBETAN DIGIT THREE
|
||||
0F24>0034 # TIBETAN DIGIT FOUR
|
||||
0F25>0035 # TIBETAN DIGIT FIVE
|
||||
0F26>0036 # TIBETAN DIGIT SIX
|
||||
0F27>0037 # TIBETAN DIGIT SEVEN
|
||||
0F28>0038 # TIBETAN DIGIT EIGHT
|
||||
0F29>0039 # TIBETAN DIGIT NINE
|
||||
1040>0030 # MYANMAR DIGIT ZERO
|
||||
1041>0031 # MYANMAR DIGIT ONE
|
||||
1042>0032 # MYANMAR DIGIT TWO
|
||||
1043>0033 # MYANMAR DIGIT THREE
|
||||
1044>0034 # MYANMAR DIGIT FOUR
|
||||
1045>0035 # MYANMAR DIGIT FIVE
|
||||
1046>0036 # MYANMAR DIGIT SIX
|
||||
1047>0037 # MYANMAR DIGIT SEVEN
|
||||
1048>0038 # MYANMAR DIGIT EIGHT
|
||||
1049>0039 # MYANMAR DIGIT NINE
|
||||
1090>0030 # MYANMAR SHAN DIGIT ZERO
|
||||
1091>0031 # MYANMAR SHAN DIGIT ONE
|
||||
1092>0032 # MYANMAR SHAN DIGIT TWO
|
||||
1093>0033 # MYANMAR SHAN DIGIT THREE
|
||||
1094>0034 # MYANMAR SHAN DIGIT FOUR
|
||||
1095>0035 # MYANMAR SHAN DIGIT FIVE
|
||||
1096>0036 # MYANMAR SHAN DIGIT SIX
|
||||
1097>0037 # MYANMAR SHAN DIGIT SEVEN
|
||||
1098>0038 # MYANMAR SHAN DIGIT EIGHT
|
||||
1099>0039 # MYANMAR SHAN DIGIT NINE
|
||||
1369>0031 # ETHIOPIC DIGIT ONE
|
||||
136A>0032 # ETHIOPIC DIGIT TWO
|
||||
136B>0033 # ETHIOPIC DIGIT THREE
|
||||
136C>0034 # ETHIOPIC DIGIT FOUR
|
||||
136D>0035 # ETHIOPIC DIGIT FIVE
|
||||
136E>0036 # ETHIOPIC DIGIT SIX
|
||||
136F>0037 # ETHIOPIC DIGIT SEVEN
|
||||
1370>0038 # ETHIOPIC DIGIT EIGHT
|
||||
1371>0039 # ETHIOPIC DIGIT NINE
|
||||
17E0>0030 # KHMER DIGIT ZERO
|
||||
17E1>0031 # KHMER DIGIT ONE
|
||||
17E2>0032 # KHMER DIGIT TWO
|
||||
17E3>0033 # KHMER DIGIT THREE
|
||||
17E4>0034 # KHMER DIGIT FOUR
|
||||
17E5>0035 # KHMER DIGIT FIVE
|
||||
17E6>0036 # KHMER DIGIT SIX
|
||||
17E7>0037 # KHMER DIGIT SEVEN
|
||||
17E8>0038 # KHMER DIGIT EIGHT
|
||||
17E9>0039 # KHMER DIGIT NINE
|
||||
1810>0030 # MONGOLIAN DIGIT ZERO
|
||||
1811>0031 # MONGOLIAN DIGIT ONE
|
||||
1812>0032 # MONGOLIAN DIGIT TWO
|
||||
1813>0033 # MONGOLIAN DIGIT THREE
|
||||
1814>0034 # MONGOLIAN DIGIT FOUR
|
||||
1815>0035 # MONGOLIAN DIGIT FIVE
|
||||
1816>0036 # MONGOLIAN DIGIT SIX
|
||||
1817>0037 # MONGOLIAN DIGIT SEVEN
|
||||
1818>0038 # MONGOLIAN DIGIT EIGHT
|
||||
1819>0039 # MONGOLIAN DIGIT NINE
|
||||
1946>0030 # LIMBU DIGIT ZERO
|
||||
1947>0031 # LIMBU DIGIT ONE
|
||||
1948>0032 # LIMBU DIGIT TWO
|
||||
1949>0033 # LIMBU DIGIT THREE
|
||||
194A>0034 # LIMBU DIGIT FOUR
|
||||
194B>0035 # LIMBU DIGIT FIVE
|
||||
194C>0036 # LIMBU DIGIT SIX
|
||||
194D>0037 # LIMBU DIGIT SEVEN
|
||||
194E>0038 # LIMBU DIGIT EIGHT
|
||||
194F>0039 # LIMBU DIGIT NINE
|
||||
19D0>0030 # NEW TAI LUE DIGIT ZERO
|
||||
19D1>0031 # NEW TAI LUE DIGIT ONE
|
||||
19D2>0032 # NEW TAI LUE DIGIT TWO
|
||||
19D3>0033 # NEW TAI LUE DIGIT THREE
|
||||
19D4>0034 # NEW TAI LUE DIGIT FOUR
|
||||
19D5>0035 # NEW TAI LUE DIGIT FIVE
|
||||
19D6>0036 # NEW TAI LUE DIGIT SIX
|
||||
19D7>0037 # NEW TAI LUE DIGIT SEVEN
|
||||
19D8>0038 # NEW TAI LUE DIGIT EIGHT
|
||||
19D9>0039 # NEW TAI LUE DIGIT NINE
|
||||
19DA>0031 # NEW TAI LUE THAM DIGIT ONE
|
||||
1A80>0030 # TAI THAM HORA DIGIT ZERO
|
||||
1A81>0031 # TAI THAM HORA DIGIT ONE
|
||||
1A82>0032 # TAI THAM HORA DIGIT TWO
|
||||
1A83>0033 # TAI THAM HORA DIGIT THREE
|
||||
1A84>0034 # TAI THAM HORA DIGIT FOUR
|
||||
1A85>0035 # TAI THAM HORA DIGIT FIVE
|
||||
1A86>0036 # TAI THAM HORA DIGIT SIX
|
||||
1A87>0037 # TAI THAM HORA DIGIT SEVEN
|
||||
1A88>0038 # TAI THAM HORA DIGIT EIGHT
|
||||
1A89>0039 # TAI THAM HORA DIGIT NINE
|
||||
1A90>0030 # TAI THAM THAM DIGIT ZERO
|
||||
1A91>0031 # TAI THAM THAM DIGIT ONE
|
||||
1A92>0032 # TAI THAM THAM DIGIT TWO
|
||||
1A93>0033 # TAI THAM THAM DIGIT THREE
|
||||
1A94>0034 # TAI THAM THAM DIGIT FOUR
|
||||
1A95>0035 # TAI THAM THAM DIGIT FIVE
|
||||
1A96>0036 # TAI THAM THAM DIGIT SIX
|
||||
1A97>0037 # TAI THAM THAM DIGIT SEVEN
|
||||
1A98>0038 # TAI THAM THAM DIGIT EIGHT
|
||||
1A99>0039 # TAI THAM THAM DIGIT NINE
|
||||
1B50>0030 # BALINESE DIGIT ZERO
|
||||
1B51>0031 # BALINESE DIGIT ONE
|
||||
1B52>0032 # BALINESE DIGIT TWO
|
||||
1B53>0033 # BALINESE DIGIT THREE
|
||||
1B54>0034 # BALINESE DIGIT FOUR
|
||||
1B55>0035 # BALINESE DIGIT FIVE
|
||||
1B56>0036 # BALINESE DIGIT SIX
|
||||
1B57>0037 # BALINESE DIGIT SEVEN
|
||||
1B58>0038 # BALINESE DIGIT EIGHT
|
||||
1B59>0039 # BALINESE DIGIT NINE
|
||||
1BB0>0030 # SUNDANESE DIGIT ZERO
|
||||
1BB1>0031 # SUNDANESE DIGIT ONE
|
||||
1BB2>0032 # SUNDANESE DIGIT TWO
|
||||
1BB3>0033 # SUNDANESE DIGIT THREE
|
||||
1BB4>0034 # SUNDANESE DIGIT FOUR
|
||||
1BB5>0035 # SUNDANESE DIGIT FIVE
|
||||
1BB6>0036 # SUNDANESE DIGIT SIX
|
||||
1BB7>0037 # SUNDANESE DIGIT SEVEN
|
||||
1BB8>0038 # SUNDANESE DIGIT EIGHT
|
||||
1BB9>0039 # SUNDANESE DIGIT NINE
|
||||
1C40>0030 # LEPCHA DIGIT ZERO
|
||||
1C41>0031 # LEPCHA DIGIT ONE
|
||||
1C42>0032 # LEPCHA DIGIT TWO
|
||||
1C43>0033 # LEPCHA DIGIT THREE
|
||||
1C44>0034 # LEPCHA DIGIT FOUR
|
||||
1C45>0035 # LEPCHA DIGIT FIVE
|
||||
1C46>0036 # LEPCHA DIGIT SIX
|
||||
1C47>0037 # LEPCHA DIGIT SEVEN
|
||||
1C48>0038 # LEPCHA DIGIT EIGHT
|
||||
1C49>0039 # LEPCHA DIGIT NINE
|
||||
1C50>0030 # OL CHIKI DIGIT ZERO
|
||||
1C51>0031 # OL CHIKI DIGIT ONE
|
||||
1C52>0032 # OL CHIKI DIGIT TWO
|
||||
1C53>0033 # OL CHIKI DIGIT THREE
|
||||
1C54>0034 # OL CHIKI DIGIT FOUR
|
||||
1C55>0035 # OL CHIKI DIGIT FIVE
|
||||
1C56>0036 # OL CHIKI DIGIT SIX
|
||||
1C57>0037 # OL CHIKI DIGIT SEVEN
|
||||
1C58>0038 # OL CHIKI DIGIT EIGHT
|
||||
1C59>0039 # OL CHIKI DIGIT NINE
|
||||
24F5>0031 # DOUBLE CIRCLED DIGIT ONE
|
||||
24F6>0032 # DOUBLE CIRCLED DIGIT TWO
|
||||
24F7>0033 # DOUBLE CIRCLED DIGIT THREE
|
||||
24F8>0034 # DOUBLE CIRCLED DIGIT FOUR
|
||||
24F9>0035 # DOUBLE CIRCLED DIGIT FIVE
|
||||
24FA>0036 # DOUBLE CIRCLED DIGIT SIX
|
||||
24FB>0037 # DOUBLE CIRCLED DIGIT SEVEN
|
||||
24FC>0038 # DOUBLE CIRCLED DIGIT EIGHT
|
||||
24FD>0039 # DOUBLE CIRCLED DIGIT NINE
|
||||
24FF>0030 # NEGATIVE CIRCLED DIGIT ZERO
|
||||
2776>0031 # DINGBAT NEGATIVE CIRCLED DIGIT ONE
|
||||
2777>0032 # DINGBAT NEGATIVE CIRCLED DIGIT TWO
|
||||
2778>0033 # DINGBAT NEGATIVE CIRCLED DIGIT THREE
|
||||
2779>0034 # DINGBAT NEGATIVE CIRCLED DIGIT FOUR
|
||||
277A>0035 # DINGBAT NEGATIVE CIRCLED DIGIT FIVE
|
||||
277B>0036 # DINGBAT NEGATIVE CIRCLED DIGIT SIX
|
||||
277C>0037 # DINGBAT NEGATIVE CIRCLED DIGIT SEVEN
|
||||
277D>0038 # DINGBAT NEGATIVE CIRCLED DIGIT EIGHT
|
||||
277E>0039 # DINGBAT NEGATIVE CIRCLED DIGIT NINE
|
||||
2780>0031 # DINGBAT CIRCLED SANS-SERIF DIGIT ONE
|
||||
2781>0032 # DINGBAT CIRCLED SANS-SERIF DIGIT TWO
|
||||
2782>0033 # DINGBAT CIRCLED SANS-SERIF DIGIT THREE
|
||||
2783>0034 # DINGBAT CIRCLED SANS-SERIF DIGIT FOUR
|
||||
2784>0035 # DINGBAT CIRCLED SANS-SERIF DIGIT FIVE
|
||||
2785>0036 # DINGBAT CIRCLED SANS-SERIF DIGIT SIX
|
||||
2786>0037 # DINGBAT CIRCLED SANS-SERIF DIGIT SEVEN
|
||||
2787>0038 # DINGBAT CIRCLED SANS-SERIF DIGIT EIGHT
|
||||
2788>0039 # DINGBAT CIRCLED SANS-SERIF DIGIT NINE
|
||||
278A>0031 # DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ONE
|
||||
278B>0032 # DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT TWO
|
||||
278C>0033 # DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT THREE
|
||||
278D>0034 # DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT FOUR
|
||||
278E>0035 # DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT FIVE
|
||||
278F>0036 # DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT SIX
|
||||
2790>0037 # DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT SEVEN
|
||||
2791>0038 # DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT EIGHT
|
||||
2792>0039 # DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT NINE
|
||||
A620>0030 # VAI DIGIT ZERO
|
||||
A621>0031 # VAI DIGIT ONE
|
||||
A622>0032 # VAI DIGIT TWO
|
||||
A623>0033 # VAI DIGIT THREE
|
||||
A624>0034 # VAI DIGIT FOUR
|
||||
A625>0035 # VAI DIGIT FIVE
|
||||
A626>0036 # VAI DIGIT SIX
|
||||
A627>0037 # VAI DIGIT SEVEN
|
||||
A628>0038 # VAI DIGIT EIGHT
|
||||
A629>0039 # VAI DIGIT NINE
|
||||
A8D0>0030 # SAURASHTRA DIGIT ZERO
|
||||
A8D1>0031 # SAURASHTRA DIGIT ONE
|
||||
A8D2>0032 # SAURASHTRA DIGIT TWO
|
||||
A8D3>0033 # SAURASHTRA DIGIT THREE
|
||||
A8D4>0034 # SAURASHTRA DIGIT FOUR
|
||||
A8D5>0035 # SAURASHTRA DIGIT FIVE
|
||||
A8D6>0036 # SAURASHTRA DIGIT SIX
|
||||
A8D7>0037 # SAURASHTRA DIGIT SEVEN
|
||||
A8D8>0038 # SAURASHTRA DIGIT EIGHT
|
||||
A8D9>0039 # SAURASHTRA DIGIT NINE
|
||||
A900>0030 # KAYAH LI DIGIT ZERO
|
||||
A901>0031 # KAYAH LI DIGIT ONE
|
||||
A902>0032 # KAYAH LI DIGIT TWO
|
||||
A903>0033 # KAYAH LI DIGIT THREE
|
||||
A904>0034 # KAYAH LI DIGIT FOUR
|
||||
A905>0035 # KAYAH LI DIGIT FIVE
|
||||
A906>0036 # KAYAH LI DIGIT SIX
|
||||
A907>0037 # KAYAH LI DIGIT SEVEN
|
||||
A908>0038 # KAYAH LI DIGIT EIGHT
|
||||
A909>0039 # KAYAH LI DIGIT NINE
|
||||
A9D0>0030 # JAVANESE DIGIT ZERO
|
||||
A9D1>0031 # JAVANESE DIGIT ONE
|
||||
A9D2>0032 # JAVANESE DIGIT TWO
|
||||
A9D3>0033 # JAVANESE DIGIT THREE
|
||||
A9D4>0034 # JAVANESE DIGIT FOUR
|
||||
A9D5>0035 # JAVANESE DIGIT FIVE
|
||||
A9D6>0036 # JAVANESE DIGIT SIX
|
||||
A9D7>0037 # JAVANESE DIGIT SEVEN
|
||||
A9D8>0038 # JAVANESE DIGIT EIGHT
|
||||
A9D9>0039 # JAVANESE DIGIT NINE
|
||||
AA50>0030 # CHAM DIGIT ZERO
|
||||
AA51>0031 # CHAM DIGIT ONE
|
||||
AA52>0032 # CHAM DIGIT TWO
|
||||
AA53>0033 # CHAM DIGIT THREE
|
||||
AA54>0034 # CHAM DIGIT FOUR
|
||||
AA55>0035 # CHAM DIGIT FIVE
|
||||
AA56>0036 # CHAM DIGIT SIX
|
||||
AA57>0037 # CHAM DIGIT SEVEN
|
||||
AA58>0038 # CHAM DIGIT EIGHT
|
||||
AA59>0039 # CHAM DIGIT NINE
|
||||
ABF0>0030 # MEETEI MAYEK DIGIT ZERO
|
||||
ABF1>0031 # MEETEI MAYEK DIGIT ONE
|
||||
ABF2>0032 # MEETEI MAYEK DIGIT TWO
|
||||
ABF3>0033 # MEETEI MAYEK DIGIT THREE
|
||||
ABF4>0034 # MEETEI MAYEK DIGIT FOUR
|
||||
ABF5>0035 # MEETEI MAYEK DIGIT FIVE
|
||||
ABF6>0036 # MEETEI MAYEK DIGIT SIX
|
||||
ABF7>0037 # MEETEI MAYEK DIGIT SEVEN
|
||||
ABF8>0038 # MEETEI MAYEK DIGIT EIGHT
|
||||
ABF9>0039 # MEETEI MAYEK DIGIT NINE
|
||||
104A0>0030 # OSMANYA DIGIT ZERO
|
||||
104A1>0031 # OSMANYA DIGIT ONE
|
||||
104A2>0032 # OSMANYA DIGIT TWO
|
||||
104A3>0033 # OSMANYA DIGIT THREE
|
||||
104A4>0034 # OSMANYA DIGIT FOUR
|
||||
104A5>0035 # OSMANYA DIGIT FIVE
|
||||
104A6>0036 # OSMANYA DIGIT SIX
|
||||
104A7>0037 # OSMANYA DIGIT SEVEN
|
||||
104A8>0038 # OSMANYA DIGIT EIGHT
|
||||
104A9>0039 # OSMANYA DIGIT NINE
|
||||
10A40>0031 # KHAROSHTHI DIGIT ONE
|
||||
10A41>0032 # KHAROSHTHI DIGIT TWO
|
||||
10A42>0033 # KHAROSHTHI DIGIT THREE
|
||||
10A43>0034 # KHAROSHTHI DIGIT FOUR
|
||||
10E60>0031 # RUMI DIGIT ONE
|
||||
10E61>0032 # RUMI DIGIT TWO
|
||||
10E62>0033 # RUMI DIGIT THREE
|
||||
10E63>0034 # RUMI DIGIT FOUR
|
||||
10E64>0035 # RUMI DIGIT FIVE
|
||||
10E65>0036 # RUMI DIGIT SIX
|
||||
10E66>0037 # RUMI DIGIT SEVEN
|
||||
10E67>0038 # RUMI DIGIT EIGHT
|
||||
10E68>0039 # RUMI DIGIT NINE
|
||||
11052>0031 # BRAHMI NUMBER ONE
|
||||
11053>0032 # BRAHMI NUMBER TWO
|
||||
11054>0033 # BRAHMI NUMBER THREE
|
||||
11055>0034 # BRAHMI NUMBER FOUR
|
||||
11056>0035 # BRAHMI NUMBER FIVE
|
||||
11057>0036 # BRAHMI NUMBER SIX
|
||||
11058>0037 # BRAHMI NUMBER SEVEN
|
||||
11059>0038 # BRAHMI NUMBER EIGHT
|
||||
1105A>0039 # BRAHMI NUMBER NINE
|
||||
11066>0030 # BRAHMI DIGIT ZERO
|
||||
11067>0031 # BRAHMI DIGIT ONE
|
||||
11068>0032 # BRAHMI DIGIT TWO
|
||||
11069>0033 # BRAHMI DIGIT THREE
|
||||
1106A>0034 # BRAHMI DIGIT FOUR
|
||||
1106B>0035 # BRAHMI DIGIT FIVE
|
||||
1106C>0036 # BRAHMI DIGIT SIX
|
||||
1106D>0037 # BRAHMI DIGIT SEVEN
|
||||
1106E>0038 # BRAHMI DIGIT EIGHT
|
||||
1106F>0039 # BRAHMI DIGIT NINE
|
||||
110F0>0030 # SORA SOMPENG DIGIT ZERO
|
||||
110F1>0031 # SORA SOMPENG DIGIT ONE
|
||||
110F2>0032 # SORA SOMPENG DIGIT TWO
|
||||
110F3>0033 # SORA SOMPENG DIGIT THREE
|
||||
110F4>0034 # SORA SOMPENG DIGIT FOUR
|
||||
110F5>0035 # SORA SOMPENG DIGIT FIVE
|
||||
110F6>0036 # SORA SOMPENG DIGIT SIX
|
||||
110F7>0037 # SORA SOMPENG DIGIT SEVEN
|
||||
110F8>0038 # SORA SOMPENG DIGIT EIGHT
|
||||
110F9>0039 # SORA SOMPENG DIGIT NINE
|
||||
11136>0030 # CHAKMA DIGIT ZERO
|
||||
11137>0031 # CHAKMA DIGIT ONE
|
||||
11138>0032 # CHAKMA DIGIT TWO
|
||||
11139>0033 # CHAKMA DIGIT THREE
|
||||
1113A>0034 # CHAKMA DIGIT FOUR
|
||||
1113B>0035 # CHAKMA DIGIT FIVE
|
||||
1113C>0036 # CHAKMA DIGIT SIX
|
||||
1113D>0037 # CHAKMA DIGIT SEVEN
|
||||
1113E>0038 # CHAKMA DIGIT EIGHT
|
||||
1113F>0039 # CHAKMA DIGIT NINE
|
||||
111D0>0030 # SHARADA DIGIT ZERO
|
||||
111D1>0031 # SHARADA DIGIT ONE
|
||||
111D2>0032 # SHARADA DIGIT TWO
|
||||
111D3>0033 # SHARADA DIGIT THREE
|
||||
111D4>0034 # SHARADA DIGIT FOUR
|
||||
111D5>0035 # SHARADA DIGIT FIVE
|
||||
111D6>0036 # SHARADA DIGIT SIX
|
||||
111D7>0037 # SHARADA DIGIT SEVEN
|
||||
111D8>0038 # SHARADA DIGIT EIGHT
|
||||
111D9>0039 # SHARADA DIGIT NINE
|
||||
116C0>0030 # TAKRI DIGIT ZERO
|
||||
116C1>0031 # TAKRI DIGIT ONE
|
||||
116C2>0032 # TAKRI DIGIT TWO
|
||||
116C3>0033 # TAKRI DIGIT THREE
|
||||
116C4>0034 # TAKRI DIGIT FOUR
|
||||
116C5>0035 # TAKRI DIGIT FIVE
|
||||
116C6>0036 # TAKRI DIGIT SIX
|
||||
116C7>0037 # TAKRI DIGIT SEVEN
|
||||
116C8>0038 # TAKRI DIGIT EIGHT
|
||||
116C9>0039 # TAKRI DIGIT NINE
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -1,52 +1,18 @@
|
|||
# Copyright 2001-2010 Unicode, Inc.
|
||||
#
|
||||
# Disclaimer
|
||||
#
|
||||
# This source code is provided as is by Unicode, Inc. No claims are
|
||||
# made as to fitness for any particular purpose. No warranties of any
|
||||
# kind are expressed or implied. The recipient agrees to determine
|
||||
# applicability of information provided. If this file has been
|
||||
# purchased on magnetic or optical media from Unicode, Inc., the
|
||||
# sole remedy for any claim will be exchange of defective media
|
||||
# within 90 days of receipt.
|
||||
#
|
||||
# Limitations on Rights to Redistribute This Code
|
||||
#
|
||||
# Unicode, Inc. hereby grants the right to freely use the information
|
||||
# supplied in this file in the creation of products supporting the
|
||||
# Unicode Standard, and to make copies of this file in any form
|
||||
# for internal or external distribution as long as this notice
|
||||
# remains attached.
|
||||
#
|
||||
# Extracted from:
|
||||
# DerivedNormalizationProps-6.0.0.txt
|
||||
# Date: 2010-05-20, 15:14:12 GMT [MD]
|
||||
#
|
||||
# Unicode Character Database
|
||||
# Copyright (c) 1991-2010 Unicode, Inc.
|
||||
# Copyright (c) 1991-2012 Unicode, Inc.
|
||||
# For terms of use, see http://www.unicode.org/terms_of_use.html
|
||||
# For documentation, see http://www.unicode.org/reports/tr44/
|
||||
#
|
||||
# file name: nfkc_cf.txt
|
||||
#
|
||||
# machine-generated by ICU preparseucd.py
|
||||
#
|
||||
# This file contains the Unicode NFKC_CF mappings,
|
||||
# extracted from the UCD file DerivedNormalizationProps.txt,
|
||||
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
|
||||
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
|
||||
|
||||
# ================================================
|
||||
# This file has been reformatted into syntax for the
|
||||
# gennorm2 Normalizer2 data generator tool.
|
||||
# Only the NFKC_CF mappings are retained and reformatted.
|
||||
# Reformatting via regular expression: s/ *; NFKC_CF; */>/
|
||||
# Use this file as the second gennorm2 input file after nfkc.txt.
|
||||
# ================================================
|
||||
|
||||
# Derived Property: NFKC_Casefold (NFKC_CF)
|
||||
# This property removes certain variations from characters: case, compatibility, and default-ignorables.
|
||||
# It is used for loose matching and certain types of identifiers.
|
||||
# It is constructed by applying NFKC, CaseFolding, and removal of Default_Ignorable_Code_Points.
|
||||
# The process of applying these transformations is repeated until a stable result is produced.
|
||||
# WARNING: Application to STRINGS must apply NFC after mapping each character, because characters may interact.
|
||||
# For more information, see [http://www.unicode.org/reports/tr44/]
|
||||
# Omitted code points are unchanged by this mapping.
|
||||
# @missing: 0000..10FFFF; NFKC_CF; <code point>
|
||||
|
||||
# All code points not explicitly listed for NFKC_Casefold
|
||||
# have the value <codepoint>.
|
||||
* Unicode 6.1.0
|
||||
|
||||
0041>0061
|
||||
0042>0062
|
||||
|
@ -656,6 +622,8 @@
|
|||
10C3>2D23
|
||||
10C4>2D24
|
||||
10C5>2D25
|
||||
10C7>2D27
|
||||
10CD>2D2D
|
||||
10FC>10DC
|
||||
115F..1160>
|
||||
17B4..17B5>
|
||||
|
@ -1061,9 +1029,7 @@
|
|||
2049>0021 003F
|
||||
2057>2032 2032 2032 2032
|
||||
205F>0020
|
||||
2060..2064>
|
||||
2065..2069>
|
||||
206A..206F>
|
||||
2060..206F>
|
||||
2070>0030
|
||||
2071>0069
|
||||
2074>0034
|
||||
|
@ -1470,6 +1436,7 @@
|
|||
2CE2>2CE3
|
||||
2CEB>2CEC
|
||||
2CED>2CEE
|
||||
2CF2>2CF3
|
||||
2D6F>2D61
|
||||
2E9F>6BCD
|
||||
2EF3>9F9F
|
||||
|
@ -2390,11 +2357,15 @@ A786>A787
|
|||
A78B>A78C
|
||||
A78D>0265
|
||||
A790>A791
|
||||
A792>A793
|
||||
A7A0>A7A1
|
||||
A7A2>A7A3
|
||||
A7A4>A7A5
|
||||
A7A6>A7A7
|
||||
A7A8>A7A9
|
||||
A7AA>0266
|
||||
A7F8>0127
|
||||
A7F9>0153
|
||||
F900>8C48
|
||||
F901>66F4
|
||||
F902>8ECA
|
||||
|
@ -2684,6 +2655,8 @@ FA2A>98EF
|
|||
FA2B>98FC
|
||||
FA2C>9928
|
||||
FA2D>9DB4
|
||||
FA2E>90DE
|
||||
FA2F>96B7
|
||||
FA30>4FAE
|
||||
FA31>50E7
|
||||
FA32>514D
|
||||
|
@ -4773,6 +4746,147 @@ FFF0..FFF8>
|
|||
1D7FD>0037
|
||||
1D7FE>0038
|
||||
1D7FF>0039
|
||||
1EE00>0627
|
||||
1EE01>0628
|
||||
1EE02>062C
|
||||
1EE03>062F
|
||||
1EE05>0648
|
||||
1EE06>0632
|
||||
1EE07>062D
|
||||
1EE08>0637
|
||||
1EE09>064A
|
||||
1EE0A>0643
|
||||
1EE0B>0644
|
||||
1EE0C>0645
|
||||
1EE0D>0646
|
||||
1EE0E>0633
|
||||
1EE0F>0639
|
||||
1EE10>0641
|
||||
1EE11>0635
|
||||
1EE12>0642
|
||||
1EE13>0631
|
||||
1EE14>0634
|
||||
1EE15>062A
|
||||
1EE16>062B
|
||||
1EE17>062E
|
||||
1EE18>0630
|
||||
1EE19>0636
|
||||
1EE1A>0638
|
||||
1EE1B>063A
|
||||
1EE1C>066E
|
||||
1EE1D>06BA
|
||||
1EE1E>06A1
|
||||
1EE1F>066F
|
||||
1EE21>0628
|
||||
1EE22>062C
|
||||
1EE24>0647
|
||||
1EE27>062D
|
||||
1EE29>064A
|
||||
1EE2A>0643
|
||||
1EE2B>0644
|
||||
1EE2C>0645
|
||||
1EE2D>0646
|
||||
1EE2E>0633
|
||||
1EE2F>0639
|
||||
1EE30>0641
|
||||
1EE31>0635
|
||||
1EE32>0642
|
||||
1EE34>0634
|
||||
1EE35>062A
|
||||
1EE36>062B
|
||||
1EE37>062E
|
||||
1EE39>0636
|
||||
1EE3B>063A
|
||||
1EE42>062C
|
||||
1EE47>062D
|
||||
1EE49>064A
|
||||
1EE4B>0644
|
||||
1EE4D>0646
|
||||
1EE4E>0633
|
||||
1EE4F>0639
|
||||
1EE51>0635
|
||||
1EE52>0642
|
||||
1EE54>0634
|
||||
1EE57>062E
|
||||
1EE59>0636
|
||||
1EE5B>063A
|
||||
1EE5D>06BA
|
||||
1EE5F>066F
|
||||
1EE61>0628
|
||||
1EE62>062C
|
||||
1EE64>0647
|
||||
1EE67>062D
|
||||
1EE68>0637
|
||||
1EE69>064A
|
||||
1EE6A>0643
|
||||
1EE6C>0645
|
||||
1EE6D>0646
|
||||
1EE6E>0633
|
||||
1EE6F>0639
|
||||
1EE70>0641
|
||||
1EE71>0635
|
||||
1EE72>0642
|
||||
1EE74>0634
|
||||
1EE75>062A
|
||||
1EE76>062B
|
||||
1EE77>062E
|
||||
1EE79>0636
|
||||
1EE7A>0638
|
||||
1EE7B>063A
|
||||
1EE7C>066E
|
||||
1EE7E>06A1
|
||||
1EE80>0627
|
||||
1EE81>0628
|
||||
1EE82>062C
|
||||
1EE83>062F
|
||||
1EE84>0647
|
||||
1EE85>0648
|
||||
1EE86>0632
|
||||
1EE87>062D
|
||||
1EE88>0637
|
||||
1EE89>064A
|
||||
1EE8B>0644
|
||||
1EE8C>0645
|
||||
1EE8D>0646
|
||||
1EE8E>0633
|
||||
1EE8F>0639
|
||||
1EE90>0641
|
||||
1EE91>0635
|
||||
1EE92>0642
|
||||
1EE93>0631
|
||||
1EE94>0634
|
||||
1EE95>062A
|
||||
1EE96>062B
|
||||
1EE97>062E
|
||||
1EE98>0630
|
||||
1EE99>0636
|
||||
1EE9A>0638
|
||||
1EE9B>063A
|
||||
1EEA1>0628
|
||||
1EEA2>062C
|
||||
1EEA3>062F
|
||||
1EEA5>0648
|
||||
1EEA6>0632
|
||||
1EEA7>062D
|
||||
1EEA8>0637
|
||||
1EEA9>064A
|
||||
1EEAB>0644
|
||||
1EEAC>0645
|
||||
1EEAD>0646
|
||||
1EEAE>0633
|
||||
1EEAF>0639
|
||||
1EEB0>0641
|
||||
1EEB1>0635
|
||||
1EEB2>0642
|
||||
1EEB3>0631
|
||||
1EEB4>0634
|
||||
1EEB5>062A
|
||||
1EEB6>062B
|
||||
1EEB7>062E
|
||||
1EEB8>0630
|
||||
1EEB9>0636
|
||||
1EEBA>0638
|
||||
1EEBB>063A
|
||||
1F100>0030 002E
|
||||
1F101>0030 002C
|
||||
1F102>0031 002C
|
||||
|
@ -4847,6 +4961,8 @@ FFF0..FFF8>
|
|||
1F14D>0073 0073
|
||||
1F14E>0070 0070 0076
|
||||
1F14F>0077 0063
|
||||
1F16A>006D 0063
|
||||
1F16B>006D 0064
|
||||
1F190>0064 006A
|
||||
1F200>307B 304B
|
||||
1F201>30B3 30B3
|
||||
|
@ -5437,12 +5553,4 @@ FFF0..FFF8>
|
|||
2FA1B>9F16
|
||||
2FA1C>9F3B
|
||||
2FA1D>2A600
|
||||
E0000>
|
||||
E0001>
|
||||
E0002..E001F>
|
||||
E0020..E007F>
|
||||
E0080..E00FF>
|
||||
E0100..E01EF>
|
||||
E01F0..E0FFF>
|
||||
|
||||
# Total code points: 9792
|
||||
E0000..E0FFF>
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,273 @@
|
|||
package org.apache.lucene.analysis.icu;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileFilter;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.io.Writer;
|
||||
import java.net.URL;
|
||||
import java.net.URLConnection;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Downloads/generates lucene/analysis/icu/src/data/utr30/*.txt
|
||||
*
|
||||
* ASSUMPTION: This class will be run with current directory set to
|
||||
* lucene/analysis/icu/src/data/utr30/
|
||||
*
|
||||
* <ol>
|
||||
* <li>
|
||||
* Downloads nfc.txt, nfkc.txt and nfkc_cf.txt from icu-project.org,
|
||||
* overwriting the versions in lucene/analysis/icu/src/data/utr30/.
|
||||
* </li>
|
||||
* <li>
|
||||
* Converts round-trip mappings in nfc.txt (containing '=')
|
||||
* that map to at least one [:Diacritic:] character
|
||||
* into one-way mappings ('>' instead of '=').
|
||||
* </li>
|
||||
* </ol>
|
||||
*/
|
||||
public class GenerateUTR30DataFiles {
|
||||
private static final String ICU_SVN_TAG_URL
|
||||
= "http://source.icu-project.org/repos/icu/icu/tags";
|
||||
private static final String ICU_RELEASE_TAG = "release-49-1-2";
|
||||
private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
|
||||
private static final String NFC_TXT = "nfc.txt";
|
||||
private static final String NFKC_TXT = "nfkc.txt";
|
||||
private static final String NFKC_CF_TXT = "nfkc_cf.txt";
|
||||
private static byte[] DOWNLOAD_BUFFER = new byte[8192];
|
||||
private static final Pattern ROUND_TRIP_MAPPING_LINE_PATTERN
|
||||
= Pattern.compile("^\\s*([^=]+?)\\s*=\\s*(.*)$");
|
||||
private static final Pattern VERBATIM_RULE_LINE_PATTERN
|
||||
= Pattern.compile("^#\\s*Rule:\\s*verbatim\\s*$", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern RULE_LINE_PATTERN
|
||||
= Pattern.compile("^#\\s*Rule:\\s*(.*)>(.*)", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern BLANK_OR_COMMENT_LINE_PATTERN
|
||||
= Pattern.compile("^\\s*(?:#.*)?$");
|
||||
private static final Pattern NUMERIC_VALUE_PATTERN
|
||||
= Pattern.compile("Numeric[-\\s_]*Value", Pattern.CASE_INSENSITIVE);
|
||||
|
||||
public static void main(String args[]) {
|
||||
try {
|
||||
getNFKCDataFilesFromIcuProject();
|
||||
expandRulesInUTR30DataFiles();
|
||||
} catch (Throwable t) {
|
||||
t.printStackTrace(System.err);
|
||||
System.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
private static void expandRulesInUTR30DataFiles() throws IOException {
|
||||
FileFilter filter = new FileFilter() {
|
||||
@Override
|
||||
public boolean accept(File pathname) {
|
||||
String name = pathname.getName();
|
||||
return pathname.isFile() && name.matches(".*\\.(?s:txt)")
|
||||
&& ! name.equals(NFC_TXT) && ! name.equals(NFKC_TXT)
|
||||
&& ! name.equals(NFKC_CF_TXT);
|
||||
}
|
||||
};
|
||||
for (File file : new File(".").listFiles(filter)) {
|
||||
expandDataFileRules(file);
|
||||
}
|
||||
}
|
||||
|
||||
private static void expandDataFileRules(File file) throws IOException {
|
||||
final FileInputStream stream = new FileInputStream(file);
|
||||
final InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
|
||||
final BufferedReader bufferedReader = new BufferedReader(reader);
|
||||
StringBuilder builder = new StringBuilder();
|
||||
String line;
|
||||
boolean verbatim = false;
|
||||
boolean modified = false;
|
||||
int lineNum = 0;
|
||||
try {
|
||||
while (null != (line = bufferedReader.readLine())) {
|
||||
++lineNum;
|
||||
if (VERBATIM_RULE_LINE_PATTERN.matcher(line).matches()) {
|
||||
verbatim = true;
|
||||
builder.append(line).append("\n");
|
||||
} else {
|
||||
Matcher ruleMatcher = RULE_LINE_PATTERN.matcher(line);
|
||||
if (ruleMatcher.matches()) {
|
||||
verbatim = false;
|
||||
builder.append(line).append("\n");
|
||||
try {
|
||||
String leftHandSide = ruleMatcher.group(1).trim();
|
||||
String rightHandSide = ruleMatcher.group(2).trim();
|
||||
expandSingleRule(builder, leftHandSide, rightHandSide);
|
||||
} catch (IllegalArgumentException e) {
|
||||
System.err.println
|
||||
("ERROR in " + file.getName() + " line #" + lineNum + ":");
|
||||
e.printStackTrace(System.err);
|
||||
System.exit(1);
|
||||
}
|
||||
modified = true;
|
||||
} else {
|
||||
if (BLANK_OR_COMMENT_LINE_PATTERN.matcher(line).matches()) {
|
||||
builder.append(line).append("\n");
|
||||
} else {
|
||||
if (verbatim) {
|
||||
builder.append(line).append("\n");
|
||||
} else {
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
bufferedReader.close();
|
||||
}
|
||||
if (modified) {
|
||||
System.err.println("Expanding rules in and overwriting " + file.getName());
|
||||
final FileOutputStream out = new FileOutputStream(file, false);
|
||||
Writer writer = new OutputStreamWriter(out, "UTF-8");
|
||||
try {
|
||||
writer.write(builder.toString());
|
||||
} finally {
|
||||
writer.close();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static void getNFKCDataFilesFromIcuProject() throws IOException {
|
||||
URL icuTagsURL = new URL(ICU_SVN_TAG_URL + "/");
|
||||
URL icuReleaseTagURL = new URL(icuTagsURL, ICU_RELEASE_TAG + "/");
|
||||
URL norm2url = new URL(icuReleaseTagURL, ICU_DATA_NORM2_PATH + "/");
|
||||
|
||||
System.err.print("Downloading " + NFKC_TXT + " ... ");
|
||||
download(new URL(norm2url, NFKC_TXT), NFKC_TXT);
|
||||
System.err.println("done.");
|
||||
System.err.print("Downloading " + NFKC_CF_TXT + " ... ");
|
||||
download(new URL(norm2url, NFKC_CF_TXT), NFKC_CF_TXT);
|
||||
System.err.println("done.");
|
||||
|
||||
System.err.print("Downloading " + NFKC_CF_TXT + " and making diacritic rules one-way ... ");
|
||||
URLConnection connection = openConnection(new URL(norm2url, NFC_TXT));
|
||||
BufferedReader reader = new BufferedReader
|
||||
(new InputStreamReader(connection.getInputStream(), "UTF-8"));
|
||||
Writer writer = new OutputStreamWriter(new FileOutputStream(NFC_TXT), "UTF-8");
|
||||
try {
|
||||
String line;
|
||||
|
||||
while (null != (line = reader.readLine())) {
|
||||
Matcher matcher = ROUND_TRIP_MAPPING_LINE_PATTERN.matcher(line);
|
||||
if (matcher.matches()) {
|
||||
final String leftHandSide = matcher.group(1);
|
||||
final String rightHandSide = matcher.group(2).trim();
|
||||
List<String> diacritics = new ArrayList<String>();
|
||||
for (String outputCodePoint : rightHandSide.split("\\s+")) {
|
||||
int ch = Integer.parseInt(outputCodePoint, 16);
|
||||
if (UCharacter.hasBinaryProperty(ch, UProperty.DIACRITIC)
|
||||
// gennorm2 fails if U+0653-U+0656 are included in round-trip mappings
|
||||
|| (ch >= 0x653 && ch <= 0x656)) {
|
||||
diacritics.add(outputCodePoint);
|
||||
}
|
||||
}
|
||||
if ( ! diacritics.isEmpty()) {
|
||||
StringBuilder replacementLine = new StringBuilder();
|
||||
replacementLine.append(leftHandSide).append(">").append(rightHandSide);
|
||||
replacementLine.append(" # one-way: diacritic");
|
||||
if (diacritics.size() > 1) {
|
||||
replacementLine.append("s");
|
||||
}
|
||||
for (String diacritic : diacritics) {
|
||||
replacementLine.append(" ").append(diacritic);
|
||||
}
|
||||
line = replacementLine.toString();
|
||||
}
|
||||
}
|
||||
writer.write(line);
|
||||
writer.write("\n");
|
||||
}
|
||||
} finally {
|
||||
reader.close();
|
||||
writer.close();
|
||||
}
|
||||
System.err.println("done.");
|
||||
}
|
||||
|
||||
private static void download(URL url, String outputFile)
|
||||
throws IOException {
|
||||
final URLConnection connection = openConnection(url);
|
||||
final InputStream inputStream = connection.getInputStream();
|
||||
final OutputStream outputStream = new FileOutputStream(outputFile);
|
||||
int numBytes;
|
||||
try {
|
||||
while (-1 != (numBytes = inputStream.read(DOWNLOAD_BUFFER))) {
|
||||
outputStream.write(DOWNLOAD_BUFFER, 0, numBytes);
|
||||
}
|
||||
} finally {
|
||||
inputStream.close();
|
||||
outputStream.close();
|
||||
}
|
||||
}
|
||||
|
||||
private static URLConnection openConnection(URL url) throws IOException {
|
||||
final URLConnection connection = url.openConnection();
|
||||
connection.setUseCaches(false);
|
||||
connection.addRequestProperty("Cache-Control", "no-cache");
|
||||
connection.connect();
|
||||
return connection;
|
||||
}
|
||||
|
||||
private static void expandSingleRule
|
||||
(StringBuilder builder, String leftHandSide, String rightHandSide)
|
||||
throws IllegalArgumentException {
|
||||
UnicodeSet set = new UnicodeSet(leftHandSide, UnicodeSet.IGNORE_SPACE);
|
||||
boolean numericValue = NUMERIC_VALUE_PATTERN.matcher(rightHandSide).matches();
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(set) ; it.nextRange() ; ) {
|
||||
if (it.codepoint != UnicodeSetIterator.IS_STRING) {
|
||||
if (numericValue) {
|
||||
for (int cp = it.codepoint ; cp <= it.codepointEnd ; ++cp) {
|
||||
builder.append(String.format("%04X", cp)).append('>');
|
||||
builder.append(String.format("%04X", 0x30 + UCharacter.getNumericValue(cp)));
|
||||
builder.append(" # ").append(UCharacter.getName(cp));
|
||||
builder.append("\n");
|
||||
}
|
||||
} else {
|
||||
builder.append(String.format("%04X", it.codepoint));
|
||||
if (it.codepointEnd > it.codepoint) {
|
||||
builder.append("..").append(String.format("%04X", it.codepointEnd));
|
||||
}
|
||||
builder.append('>').append(rightHandSide).append("\n");
|
||||
}
|
||||
} else {
|
||||
System.err.println("ERROR: String '" + it.getString() + "' found in UnicodeSet");
|
||||
System.exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -64,7 +64,7 @@
|
|||
</target>
|
||||
|
||||
<path id="tools.dependencies">
|
||||
<fileset dir="../icu/lib" includes="icu4j-4.8.1.1.jar"/>
|
||||
<fileset dir="../icu/lib" includes="icu4j-49.1.jar"/>
|
||||
</path>
|
||||
|
||||
<path id="tools.classpath">
|
||||
|
|
|
@ -150,7 +150,7 @@
|
|||
<pathelement path="${analyzers-common.jar}"/>
|
||||
<pathelement path="${queryparser.jar}"/>
|
||||
<pathelement path="${facet.jar}"/>
|
||||
<fileset dir="${common.dir}/analysis/icu/lib" includes="icu4j-4.8.1.1.jar"/>
|
||||
<fileset dir="${common.dir}/analysis/icu/lib" includes="icu4j-49.1.jar"/>
|
||||
<path refid="base.classpath"/>
|
||||
<fileset dir="lib">
|
||||
<include name="commons-compress-1.2.jar"/>
|
||||
|
@ -208,7 +208,7 @@
|
|||
<path id="collation.runtime.classpath">
|
||||
<path refid="run.classpath"/>
|
||||
<pathelement path="${analyzers-icu.jar}"/>
|
||||
<fileset dir="${common.dir}/analysis/icu/lib" includes="icu4j-4.8.1.1.jar"/>
|
||||
<fileset dir="${common.dir}/analysis/icu/lib" includes="icu4j-49.1.jar"/>
|
||||
</path>
|
||||
|
||||
<target name="collation" depends="compile,jar-analyzers-icu,top-100k-wiki-word-files">
|
||||
|
|
|
@ -352,7 +352,6 @@ public abstract class LuceneTestCase extends Assert {
|
|||
.around(new TestRuleNoStaticHooksShadowing())
|
||||
.around(new TestRuleNoInstanceHooksOverrides())
|
||||
.around(new SystemPropertiesInvariantRule(IGNORED_INVARIANT_PROPERTIES))
|
||||
.around(new TestRuleIcuHack())
|
||||
.around(classNameRule = new TestRuleStoreClassName())
|
||||
.around(new TestRuleReportUncaughtExceptions())
|
||||
.around(classEnvRule = new TestRuleSetupAndRestoreClassEnv());
|
||||
|
|
|
@ -1,55 +0,0 @@
|
|||
package org.apache.lucene.util;
|
||||
|
||||
import java.util.Locale;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
import org.junit.rules.TestRule;
|
||||
import org.junit.runner.Description;
|
||||
import org.junit.runners.model.Statement;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
final class TestRuleIcuHack implements TestRule {
|
||||
/** Globally only check hack once. */
|
||||
private static volatile AtomicBoolean icuTested = new AtomicBoolean(false);
|
||||
|
||||
@Override
|
||||
public Statement apply(final Statement s, Description d) {
|
||||
return new Statement() {
|
||||
@Override
|
||||
public void evaluate() throws Throwable {
|
||||
// START hack to init ICU safely before we randomize locales.
|
||||
// ICU fails during classloading when a special Java7-only locale is the default
|
||||
// see: http://bugs.icu-project.org/trac/ticket/8734
|
||||
if (!icuTested.getAndSet(true)) {
|
||||
Locale previous = Locale.getDefault();
|
||||
try {
|
||||
Locale.setDefault(Locale.ROOT);
|
||||
Class.forName("com.ibm.icu.util.ULocale");
|
||||
} catch (ClassNotFoundException cnfe) {
|
||||
// ignore if no ICU is in classpath
|
||||
} finally {
|
||||
Locale.setDefault(previous);
|
||||
}
|
||||
}
|
||||
|
||||
s.evaluate();
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
|
@ -19,7 +19,7 @@
|
|||
<ivy-module version="2.0">
|
||||
<info organisation="org.apache.solr" module="analysis-extras"/>
|
||||
<dependencies>
|
||||
<dependency org="com.ibm.icu" name="icu4j" rev="4.8.1.1" transitive="false"/>
|
||||
<dependency org="com.ibm.icu" name="icu4j" rev="49.1" transitive="false"/>
|
||||
<dependency org="org.carrot2" name="morfologik-polish" rev="1.5.3" transitive="false"/>
|
||||
<dependency org="org.carrot2" name="morfologik-fsa" rev="1.5.3" transitive="false"/>
|
||||
<dependency org="org.carrot2" name="morfologik-stemming" rev="1.5.3" transitive="false"/>
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
654b7021e7bb6c5b7b35c88d23cac1022c7b3d6b
|
|
@ -0,0 +1 @@
|
|||
fbf7a438e6bf3660e0da2fd77dd1df1635fe503c
|
|
@ -2,7 +2,7 @@ ICU License - ICU 1.8.1 and later
|
|||
|
||||
COPYRIGHT AND PERMISSION NOTICE
|
||||
|
||||
Copyright (c) 1995-2008 International Business Machines Corporation and others
|
||||
Copyright (c) 1995-2012 International Business Machines Corporation and others
|
||||
|
||||
All rights reserved.
|
||||
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
ICU4J, (under contrib/icu) is licensed under an MIT styles license
|
||||
(contrib/icu/lib/ICU-LICENSE.txt) and Copyright (c) 1995-2008
|
||||
ICU4J, (under modules/analysis/icu) is licensed under an MIT style license
|
||||
(modules/analysis/icu/lib/icu4j-LICENSE-BSD_LIKE.txt) and Copyright (c) 1995-2012
|
||||
International Business Machines Corporation and others
|
|
@ -52,7 +52,7 @@
|
|||
<dependency org="rome" name="rome" rev="0.9" transitive="false"/>
|
||||
<dependency org="jdom" name="jdom" rev="1.0" transitive="false"/>
|
||||
<!-- Other ExtractingRequestHandler dependencies -->
|
||||
<dependency org="com.ibm.icu" name="icu4j" rev="4.8.1.1" transitive="false"/>
|
||||
<dependency org="com.ibm.icu" name="icu4j" rev="49.1" transitive="false"/>
|
||||
<dependency org="xerces" name="xercesImpl" rev="2.8.1" transitive="false"/>
|
||||
<exclude org="*" ext="*" matcher="regexp" type="${ivy.exclude.types}"/>
|
||||
</dependencies>
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
654b7021e7bb6c5b7b35c88d23cac1022c7b3d6b
|
|
@ -0,0 +1 @@
|
|||
fbf7a438e6bf3660e0da2fd77dd1df1635fe503c
|
|
@ -2,7 +2,7 @@ ICU License - ICU 1.8.1 and later
|
|||
|
||||
COPYRIGHT AND PERMISSION NOTICE
|
||||
|
||||
Copyright (c) 1995-2008 International Business Machines Corporation and others
|
||||
Copyright (c) 1995-2012 International Business Machines Corporation and others
|
||||
|
||||
All rights reserved.
|
||||
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
ICU4J, (under contrib/icu) is licensed under an MIT styles license
|
||||
(contrib/icu/lib/ICU-LICENSE.txt) and Copyright (c) 1995-2008
|
||||
ICU4J, (under modules/analysis/icu) is licensed under an MIT style license
|
||||
(modules/analysis/icu/lib/icu4j-LICENSE-BSD_LIKE.txt) and Copyright (c) 1995-2012
|
||||
International Business Machines Corporation and others
|
Loading…
Reference in New Issue