LUCENE-5770: Upgrade to JFlex 1.6, which has direct support for supplementary code points - as a result, ICU4J is no longer used to generate surrogate pairs to augment JFlex scanner specifications.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1608134 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Steven Rowe 2014-07-05 21:12:09 +00:00
parent 35d479f1b7
commit b207188421
16 changed files with 36420 additions and 40495 deletions

View File

@ -136,6 +136,13 @@ Test Framework
* LUCENE-5786: Unflushed/truncated events file (hung testing subprocess).
(Dawid Weiss)
Build
* LUCENE-5770: Upgrade to JFlex 1.6, which has direct support for
supplementary code points - as a result, ICU4J is no longer used
to generate surrogate pairs to augment JFlex scanner specifications.
(Steve Rowe)
======================= Lucene 4.9.0 =======================
Changes in Runtime Behavior

View File

@ -29,16 +29,9 @@
<import file="../analysis-module-build.xml"/>
<target name="jflex" depends="-install-jflex,clean-jflex,-gen-uax29-supp-macros,
-jflex-StandardAnalyzer,-jflex-UAX29URLEmailTokenizer,
<target name="jflex" depends="-install-jflex,clean-jflex,-jflex-StandardAnalyzer,-jflex-UAX29URLEmailTokenizer,
-jflex-wiki-tokenizer,-jflex-HTMLStripCharFilter"/>
<target name="-gen-uax29-supp-macros">
<subant target="gen-uax29-supp-macros">
<fileset dir="../icu" includes="build.xml"/>
</subant>
</target>
<target name="-jflex-HTMLStripCharFilter"
depends="init,generate-jflex-html-char-entities">
<jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"

View File

@ -1,64 +0,0 @@
/*
* Copyright 2010 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Generated using ICU4J 52.1.0.0
// by org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros
ID_Start_Supp = (
[\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
| [\uD81A][\uDC00-\uDE38]
| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB]
| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
| [\uD82C][\uDC00\uDC01]
| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
| [\uD81B][\uDF00-\uDF44\uDF50\uDF93-\uDF9F]
| [\uD87E][\uDC00-\uDE1D]
| [\uD804][\uDC03-\uDC37\uDC83-\uDCAF\uDCD0-\uDCE8\uDD03-\uDD26\uDD83-\uDDB2\uDDC1-\uDDC4]
| [\uD83B][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB]
| [\uD809][\uDC00-\uDC62]
| [\uD808][\uDC00-\uDF6E]
| [\uD803][\uDC00-\uDC48]
| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
| [\uD80D][\uDC00-\uDC2E]
| [\uD805][\uDE80-\uDEAA]
| [\uD86E][\uDC00-\uDC1D]
| [\uD801][\uDC00-\uDC9D]
)
ID_Continue_Supp = (
[\uD81A][\uDC00-\uDE38]
| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
| [\uD82C][\uDC00\uDC01]
| [\uD81B][\uDF00-\uDF44\uDF50-\uDF7E\uDF8F-\uDF9F]
| [\uD801][\uDC00-\uDC9D\uDCA0-\uDCA9]
| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
| [\uD87E][\uDC00-\uDE1D]
| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00-\uDE03\uDE05\uDE06\uDE0C-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE38-\uDE3A\uDE3F\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
| [\uD805][\uDE80-\uDEB7\uDEC0-\uDEC9]
| [\uD83B][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB]
| [\uD809][\uDC00-\uDC62]
| [\uD808][\uDC00-\uDF6E]
| [\uD803][\uDC00-\uDC48]
| [\uD80D][\uDC00-\uDC2E]
| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDDFD\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
| [\uD804][\uDC00-\uDC46\uDC66-\uDC6F\uDC80-\uDCBA\uDCD0-\uDCE8\uDCF0-\uDCF9\uDD00-\uDD34\uDD36-\uDD3F\uDD80-\uDDC4\uDDD0-\uDDD9]
| [\uD86E][\uDC00-\uDC1D]
| [\uDB40][\uDD00-\uDDEF]
| [\uD834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44]
| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB\uDFCE-\uDFFF]
)

View File

@ -64,7 +64,7 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
//
// <identifier> := <ID_Start> <ID_Continue>*
//
Name = ( ( [:_\p{ID_Start}] | {ID_Start_Supp} ) ( [-.:_\p{ID_Continue}] | {ID_Continue_Supp} )* )
Name = [:_\p{ID_Start}] [-.:_\p{ID_Continue}]*
// From Apache httpd mod_include documentation
// <http://httpd.apache.org/docs/current/mod/mod_include.html>:
@ -141,8 +141,6 @@ InlineElment = ( [aAbBiIqQsSuU] |
%include HTMLCharacterEntities.jflex
%include HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
%{
private static final int INITIAL_INPUT_SEGMENT_SIZE = 1024;
private static final char BLOCK_LEVEL_START_TAG_REPLACEMENT = '\n';
@ -309,7 +307,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
cumulativeDiff += inputSegment.length() - outputSegment.length();
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
eofReturnValue = outputSegment.length() > 0 ? outputSegment.nextChar() : -1;
eofReturnValue = ( ! outputSegment.isRead()) ? outputSegment.nextChar() : -1;
break;
}
case BANG:
@ -322,7 +320,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
case LEFT_ANGLE_BRACKET_SLASH:
case LEFT_ANGLE_BRACKET_SPACE: { // Include
outputSegment = inputSegment;
eofReturnValue = outputSegment.length() > 0 ? outputSegment.nextChar() : -1;
eofReturnValue = ( ! outputSegment.isRead()) ? outputSegment.nextChar() : -1;
break;
}
default: {
@ -789,7 +787,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
}
}
[^] {
inputSegment.append(zzBuffer[zzStartRead]);
inputSegment.append(yytext());
}
}
@ -801,7 +799,13 @@ InlineElment = ( [aAbBiIqQsSuU] |
addOffCorrectMap(outputCharCount, cumulativeDiff);
yybegin(YYINITIAL);
}
[^] { return zzBuffer[zzStartRead]; }
[^] {
if (yylength() == 1) {
return zzBuffer[zzStartRead];
} else {
outputSegment.append(yytext()); return outputSegment.nextChar();
}
}
}
<COMMENT> {
@ -916,7 +920,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
<AMPERSAND,NUMERIC_CHARACTER,CHARACTER_REFERENCE_TAIL,LEFT_ANGLE_BRACKET_SLASH,END_TAG_TAIL_INCLUDE,END_TAG_TAIL_EXCLUDE,END_TAG_TAIL_SUBSTITUTE,LEFT_ANGLE_BRACKET,LEFT_ANGLE_BRACKET_SPACE,START_TAG_TAIL_INCLUDE,START_TAG_TAIL_EXCLUDE,START_TAG_TAIL_SUBSTITUTE,BANG> {
[^] {
yypushback(1);
yypushback(yylength());
outputSegment = inputSegment;
outputSegment.restart();
yybegin(YYINITIAL);
@ -924,4 +928,10 @@ InlineElment = ( [aAbBiIqQsSuU] |
}
}
[^] { return zzBuffer[zzStartRead]; }
[^] {
if (yylength() == 1) {
return zzBuffer[zzStartRead];
} else {
outputSegment.append(yytext()); return outputSegment.nextChar();
}
}

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.1 */
/* The following code was generated by JFlex 1.6.0 */
package org.apache.lucene.analysis.standard;
@ -114,7 +114,7 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
"\1\0\2\12\1\0\2\12\1\0\154\12\41\0\u016b\12\22\0\100\12"+
"\2\0\66\12\50\0\14\12\164\0\3\12\1\0\1\12\1\0\207\12"+
"\23\0\12\2\7\0\32\12\6\0\32\12\12\0\1\13\72\13\37\12"+
"\3\0\6\12\2\0\6\12\2\0\6\12\2\0\3\12\43\0";
"\3\0\6\12\2\0\6\12\2\0\6\12\2\0\3\12\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\63\0";
/**
* Translates characters to character classes
@ -339,6 +339,14 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
/** denotes if the user-EOF-code has already been executed */
private boolean zzEOFDone;
/**
* The number of occupied positions in zzBuffer beyond zzEndRead.
* When a lead/high surrogate has been read from the input stream
* into the final zzBuffer position, this will have a value of 1;
* otherwise, it will have a value of 0.
*/
private int zzFinalHighSurrogate = 0;
/* user code: */
@ -385,10 +393,10 @@ public final void getText(CharTermAttribute t) {
* @return the unpacked character translation table
*/
private static char [] zzUnpackCMap(String packed) {
char [] map = new char[0x10000];
char [] map = new char[0x110000];
int i = 0; /* index in packed string */
int j = 0; /* index in unpacked array */
while (i < 1138) {
while (i < 1170) {
int count = packed.charAt(i++);
char value = packed.charAt(i++);
do map[j++] = value; while (--count > 0);
@ -408,6 +416,8 @@ public final void getText(CharTermAttribute t) {
/* first: make room (if you can) */
if (zzStartRead > 0) {
zzEndRead += zzFinalHighSurrogate;
zzFinalHighSurrogate = 0;
System.arraycopy(zzBuffer, zzStartRead,
zzBuffer, 0,
zzEndRead-zzStartRead);
@ -420,33 +430,38 @@ public final void getText(CharTermAttribute t) {
}
/* is the buffer big enough? */
if (zzCurrentPos >= zzBuffer.length) {
if (zzCurrentPos >= zzBuffer.length - zzFinalHighSurrogate) {
/* if not: blow it up */
char newBuffer[] = new char[zzCurrentPos*2];
char newBuffer[] = new char[zzBuffer.length*2];
System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
zzBuffer = newBuffer;
zzEndRead += zzFinalHighSurrogate;
zzFinalHighSurrogate = 0;
}
/* finally: fill the buffer with new input */
int numRead = zzReader.read(zzBuffer, zzEndRead,
zzBuffer.length-zzEndRead);
/* fill the buffer with new input */
int requested = zzBuffer.length - zzEndRead;
int totalRead = 0;
while (totalRead < requested) {
int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
if (numRead == -1) {
break;
}
totalRead += numRead;
}
if (numRead > 0) {
zzEndRead+= numRead;
if (totalRead > 0) {
zzEndRead += totalRead;
if (totalRead == requested) { /* possibly more input available */
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
--zzEndRead;
zzFinalHighSurrogate = 1;
}
}
return false;
}
// unlikely but not impossible: read 0 characters, but not at end of stream
if (numRead == 0) {
int c = zzReader.read();
if (c == -1) {
return true;
} else {
zzBuffer[zzEndRead++] = (char) c;
return false;
}
}
// numRead < 0
// totalRead = 0: End of stream
return true;
}
@ -482,6 +497,7 @@ public final void getText(CharTermAttribute t) {
zzEOFDone = false;
zzEndRead = zzStartRead = 0;
zzCurrentPos = zzMarkedPos = 0;
zzFinalHighSurrogate = 0;
yyline = yychar = yycolumn = 0;
zzLexicalState = YYINITIAL;
if (zzBuffer.length > ZZ_BUFFERSIZE)
@ -625,8 +641,10 @@ public final void getText(CharTermAttribute t) {
zzForAction: {
while (true) {
if (zzCurrentPosL < zzEndReadL)
zzInput = zzBufferL[zzCurrentPosL++];
if (zzCurrentPosL < zzEndReadL) {
zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
zzCurrentPosL += Character.charCount(zzInput);
}
else if (zzAtEOF) {
zzInput = YYEOF;
break zzForAction;
@ -646,7 +664,8 @@ public final void getText(CharTermAttribute t) {
break zzForAction;
}
else {
zzInput = zzBufferL[zzCurrentPosL++];
zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
zzCurrentPosL += Character.charCount(zzInput);
}
}
int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];

View File

@ -1,143 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Generated using ICU4J 52.1.0.0
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
ALetterSupp = (
([\ud83b][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB])
| ([\ud81a][\uDC00-\uDE38])
| ([\ud81b][\uDF00-\uDF44\uDF50\uDF93-\uDF9F])
| ([\ud835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB])
| ([\ud80d][\uDC00-\uDC2E])
| ([\ud80c][\uDC00-\uDFFF])
| ([\ud809][\uDC00-\uDC62])
| ([\ud808][\uDC00-\uDF6E])
| ([\ud805][\uDE80-\uDEAA])
| ([\ud804][\uDC03-\uDC37\uDC83-\uDCAF\uDCD0-\uDCE8\uDD03-\uDD26\uDD83-\uDDB2\uDDC1-\uDDC4])
| ([\ud801][\uDC00-\uDC9D])
| ([\ud800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5])
| ([\ud803][\uDC00-\uDC48])
| ([\ud802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72])
)
FormatSupp = (
([\ud804][\uDCBD])
| ([\ud834][\uDD73-\uDD7A])
| ([\udb40][\uDC01\uDC20-\uDC7F])
)
NumericSupp = (
([\ud805][\uDEC0-\uDEC9])
| ([\ud804][\uDC66-\uDC6F\uDCF0-\uDCF9\uDD36-\uDD3F\uDDD0-\uDDD9])
| ([\ud835][\uDFCE-\uDFFF])
| ([\ud801][\uDCA0-\uDCA9])
)
ExtendSupp = (
([\ud81b][\uDF51-\uDF7E\uDF8F-\uDF92])
| ([\ud805][\uDEAB-\uDEB7])
| ([\ud804][\uDC00-\uDC02\uDC38-\uDC46\uDC80-\uDC82\uDCB0-\uDCBA\uDD00-\uDD02\uDD27-\uDD34\uDD80-\uDD82\uDDB3-\uDDC0])
| ([\ud834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44])
| ([\ud800][\uDDFD])
| ([\udb40][\uDD00-\uDDEF])
| ([\ud802][\uDE01-\uDE03\uDE05\uDE06\uDE0C-\uDE0F\uDE38-\uDE3A\uDE3F])
)
KatakanaSupp = (
([\ud82c][\uDC00])
)
MidLetterSupp = (
[]
)
MidNumSupp = (
[]
)
MidNumLetSupp = (
[]
)
ExtendNumLetSupp = (
[]
)
ExtendNumLetSupp = (
[]
)
ComplexContextSupp = (
[]
)
HanSupp = (
([\ud87e][\uDC00-\uDE1D])
| ([\ud86b][\uDC00-\uDFFF])
| ([\ud86a][\uDC00-\uDFFF])
| ([\ud869][\uDC00-\uDED6\uDF00-\uDFFF])
| ([\ud868][\uDC00-\uDFFF])
| ([\ud86e][\uDC00-\uDC1D])
| ([\ud86d][\uDC00-\uDF34\uDF40-\uDFFF])
| ([\ud86c][\uDC00-\uDFFF])
| ([\ud863][\uDC00-\uDFFF])
| ([\ud862][\uDC00-\uDFFF])
| ([\ud861][\uDC00-\uDFFF])
| ([\ud860][\uDC00-\uDFFF])
| ([\ud867][\uDC00-\uDFFF])
| ([\ud866][\uDC00-\uDFFF])
| ([\ud865][\uDC00-\uDFFF])
| ([\ud864][\uDC00-\uDFFF])
| ([\ud858][\uDC00-\uDFFF])
| ([\ud859][\uDC00-\uDFFF])
| ([\ud85a][\uDC00-\uDFFF])
| ([\ud85b][\uDC00-\uDFFF])
| ([\ud85c][\uDC00-\uDFFF])
| ([\ud85d][\uDC00-\uDFFF])
| ([\ud85e][\uDC00-\uDFFF])
| ([\ud85f][\uDC00-\uDFFF])
| ([\ud850][\uDC00-\uDFFF])
| ([\ud851][\uDC00-\uDFFF])
| ([\ud852][\uDC00-\uDFFF])
| ([\ud853][\uDC00-\uDFFF])
| ([\ud854][\uDC00-\uDFFF])
| ([\ud855][\uDC00-\uDFFF])
| ([\ud856][\uDC00-\uDFFF])
| ([\ud857][\uDC00-\uDFFF])
| ([\ud849][\uDC00-\uDFFF])
| ([\ud848][\uDC00-\uDFFF])
| ([\ud84b][\uDC00-\uDFFF])
| ([\ud84a][\uDC00-\uDFFF])
| ([\ud84d][\uDC00-\uDFFF])
| ([\ud84c][\uDC00-\uDFFF])
| ([\ud84f][\uDC00-\uDFFF])
| ([\ud84e][\uDC00-\uDFFF])
| ([\ud841][\uDC00-\uDFFF])
| ([\ud840][\uDC00-\uDFFF])
| ([\ud843][\uDC00-\uDFFF])
| ([\ud842][\uDC00-\uDFFF])
| ([\ud845][\uDC00-\uDFFF])
| ([\ud844][\uDC00-\uDFFF])
| ([\ud847][\uDC00-\uDFFF])
| ([\ud846][\uDC00-\uDFFF])
)
HiraganaSupp = (
([\ud83c][\uDE00])
| ([\ud82c][\uDC01])
)
SingleQuoteSupp = (
[]
)
DoubleQuoteSupp = (
[]
)
HebrewLetterSupp = (
[]
)
RegionalIndicatorSupp = (
([\ud83c][\uDDE6-\uDDFF])
)

View File

@ -48,41 +48,22 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%char
%buffer 4096
%include SUPPLEMENTARY.jflex-macro
ALetter = (\p{WB:ALetter} | {ALetterSupp})
Format = (\p{WB:Format} | {FormatSupp})
Numeric = ([\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] | {NumericSupp})
Extend = (\p{WB:Extend} | {ExtendSupp})
Katakana = (\p{WB:Katakana} | {KatakanaSupp})
MidLetter = (\p{WB:MidLetter} | {MidLetterSupp})
MidNum = (\p{WB:MidNum} | {MidNumSupp})
MidNumLet = (\p{WB:MidNumLet} | {MidNumLetSupp})
ExtendNumLet = (\p{WB:ExtendNumLet} | {ExtendNumLetSupp})
ComplexContext = (\p{LB:Complex_Context} | {ComplexContextSupp})
Han = (\p{Script:Han} | {HanSupp})
Hiragana = (\p{Script:Hiragana} | {HiraganaSupp})
SingleQuote = (\p{WB:Single_Quote} | {SingleQuoteSupp})
DoubleQuote = (\p{WB:Double_Quote} | {DoubleQuoteSupp})
HebrewLetter = (\p{WB:Hebrew_Letter} | {HebrewLetterSupp})
RegionalIndicator = (\p{WB:Regional_Indicator} | {RegionalIndicatorSupp})
HebrewOrALetter = ({HebrewLetter} | {ALetter})
// UAX#29 WB4. X (Extend | Format)* --> X
//
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] ({Format} | {Extend})*
HebrewOrALetterEx = {HebrewOrALetter} ({Format} | {Extend})*
NumericEx = {Numeric} ({Format} | {Extend})*
KatakanaEx = {Katakana} ({Format} | {Extend})*
MidLetterEx = ({MidLetter} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
HanEx = {Han} ({Format} | {Extend})*
HiraganaEx = {Hiragana} ({Format} | {Extend})*
SingleQuoteEx = {SingleQuote} ({Format} | {Extend})*
DoubleQuoteEx = {DoubleQuote} ({Format} | {Extend})*
HebrewLetterEx = {HebrewLetter} ({Format} | {Extend})*
RegionalIndicatorEx = {RegionalIndicator} ({Format} | {Extend})*
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
HebrewOrALetterEx = [\p{WB:HebrewLetter}\p{WB:ALetter}] [\p{WB:Format}\p{WB:Extend}]*
NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] [\p{WB:Format}\p{WB:Extend}]*
KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
HanEx = \p{Script:Han} [\p{WB:Format}\p{WB:Extend}]*
HiraganaEx = \p{Script:Hiragana} [\p{WB:Format}\p{WB:Extend}]*
SingleQuoteEx = \p{WB:Single_Quote} [\p{WB:Format}\p{WB:Extend}]*
DoubleQuoteEx = \p{WB:Double_Quote} [\p{WB:Format}\p{WB:Extend}]*
HebrewLetterEx = \p{WB:Hebrew_Letter} [\p{WB:Format}\p{WB:Extend}]*
RegionalIndicatorEx = \p{WB:RegionalIndicator} [\p{WB:Format}\p{WB:Extend}]*
ComplexContextEx = \p{LB:Complex_Context} [\p{WB:Format}\p{WB:Extend}]*
%{
/** Alphanumeric sequences */
@ -193,7 +174,7 @@ RegionalIndicatorEx = {RegionalIndicator}
//
// http://www.unicode.org/reports/tr14/#SA
//
{ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; }
{ComplexContextEx}+ { return SOUTH_EAST_ASIAN_TYPE; }
// UAX#29 WB14. Any ÷ Any
//

View File

@ -52,40 +52,22 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%xstate AVOID_BAD_URL
%buffer 4096
%include SUPPLEMENTARY.jflex-macro
ALetter = (\p{WB:ALetter} | {ALetterSupp})
Format = (\p{WB:Format} | {FormatSupp})
Numeric = ([\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] | {NumericSupp})
Extend = (\p{WB:Extend} | {ExtendSupp})
Katakana = (\p{WB:Katakana} | {KatakanaSupp})
MidLetter = (\p{WB:MidLetter} | {MidLetterSupp})
MidNum = (\p{WB:MidNum} | {MidNumSupp})
MidNumLet = (\p{WB:MidNumLet} | {MidNumLetSupp})
ExtendNumLet = (\p{WB:ExtendNumLet} | {ExtendNumLetSupp})
ComplexContext = (\p{LB:Complex_Context} | {ComplexContextSupp})
Han = (\p{Script:Han} | {HanSupp})
Hiragana = (\p{Script:Hiragana} | {HiraganaSupp})
SingleQuote = (\p{WB:Single_Quote} | {SingleQuoteSupp})
DoubleQuote = (\p{WB:Double_Quote} | {DoubleQuoteSupp})
HebrewLetter = (\p{WB:Hebrew_Letter} | {HebrewLetterSupp})
RegionalIndicator = (\p{WB:Regional_Indicator} | {RegionalIndicatorSupp})
HebrewOrALetter = ({HebrewLetter} | {ALetter})
// UAX#29 WB4. X (Extend | Format)* --> X
//
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] ({Format} | {Extend})*
HebrewOrALetterEx = {HebrewOrALetter} ({Format} | {Extend})*
NumericEx = {Numeric} ({Format} | {Extend})*
KatakanaEx = {Katakana} ({Format} | {Extend})*
MidLetterEx = ({MidLetter} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
HanEx = {Han} ({Format} | {Extend})*
HiraganaEx = {Hiragana} ({Format} | {Extend})*
SingleQuoteEx = {SingleQuote} ({Format} | {Extend})*
DoubleQuoteEx = {DoubleQuote} ({Format} | {Extend})*
HebrewLetterEx = {HebrewLetter} ({Format} | {Extend})*
RegionalIndicatorEx = {RegionalIndicator} ({Format} | {Extend})*
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
HebrewOrALetterEx = [\p{WB:HebrewLetter}\p{WB:ALetter}] [\p{WB:Format}\p{WB:Extend}]*
NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] [\p{WB:Format}\p{WB:Extend}]*
KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
HanEx = \p{Script:Han} [\p{WB:Format}\p{WB:Extend}]*
HiraganaEx = \p{Script:Hiragana} [\p{WB:Format}\p{WB:Extend}]*
SingleQuoteEx = \p{WB:Single_Quote} [\p{WB:Format}\p{WB:Extend}]*
DoubleQuoteEx = \p{WB:Double_Quote} [\p{WB:Format}\p{WB:Extend}]*
HebrewLetterEx = \p{WB:Hebrew_Letter} [\p{WB:Format}\p{WB:Extend}]*
RegionalIndicatorEx = \p{WB:RegionalIndicator} [\p{WB:Format}\p{WB:Extend}]*
ComplexContextEx = \p{LB:Complex_Context} [\p{WB:Format}\p{WB:Extend}]*
// URL and E-mail syntax specifications:
//
@ -304,7 +286,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
//
// http://www.unicode.org/reports/tr14/#SA
//
{ComplexContext}+ { yybegin(YYINITIAL); return SOUTH_EAST_ASIAN_TYPE; }
{ComplexContextEx}+ { yybegin(YYINITIAL); return SOUTH_EAST_ASIAN_TYPE; }
// UAX#29 WB14. Any ÷ Any
//

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.1 */
/* The following code was generated by JFlex 1.6.0 */
package org.apache.lucene.analysis.wikipedia;
@ -71,7 +71,7 @@ class WikipediaTokenizerImpl {
"\167\15\11\17\166\15\12\17\166\15\12\17\166\15\12\17\340\15\12\17"+
"\166\15\12\17\u0166\15\12\17\266\15\u0100\15\u0e00\15\u1040\0\u0150\21\140\0"+
"\20\21\u0100\0\200\21\200\0\u19c0\21\100\0\u5200\21\u0c00\0\u2bb0\20\u2150\0"+
"\u0200\21\u0465\0\73\21\75\15\43\0";
"\u0200\21\u0465\0\73\21\75\15\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\63\0";
/**
* Translates characters to character classes
@ -427,6 +427,14 @@ class WikipediaTokenizerImpl {
/** denotes if the user-EOF-code has already been executed */
private boolean zzEOFDone;
/**
* The number of occupied positions in zzBuffer beyond zzEndRead.
* When a lead/high surrogate has been read from the input stream
* into the final zzBuffer position, this will have a value of 1;
* otherwise, it will have a value of 0.
*/
private int zzFinalHighSurrogate = 0;
/* user code: */
@ -519,10 +527,10 @@ final void reset() {
* @return the unpacked character translation table
*/
private static char [] zzUnpackCMap(String packed) {
char [] map = new char[0x10000];
char [] map = new char[0x110000];
int i = 0; /* index in packed string */
int j = 0; /* index in unpacked array */
while (i < 230) {
while (i < 262) {
int count = packed.charAt(i++);
char value = packed.charAt(i++);
do map[j++] = value; while (--count > 0);
@ -542,6 +550,8 @@ final void reset() {
/* first: make room (if you can) */
if (zzStartRead > 0) {
zzEndRead += zzFinalHighSurrogate;
zzFinalHighSurrogate = 0;
System.arraycopy(zzBuffer, zzStartRead,
zzBuffer, 0,
zzEndRead-zzStartRead);
@ -554,33 +564,38 @@ final void reset() {
}
/* is the buffer big enough? */
if (zzCurrentPos >= zzBuffer.length) {
if (zzCurrentPos >= zzBuffer.length - zzFinalHighSurrogate) {
/* if not: blow it up */
char newBuffer[] = new char[zzCurrentPos*2];
char newBuffer[] = new char[zzBuffer.length*2];
System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
zzBuffer = newBuffer;
zzEndRead += zzFinalHighSurrogate;
zzFinalHighSurrogate = 0;
}
/* finally: fill the buffer with new input */
int numRead = zzReader.read(zzBuffer, zzEndRead,
zzBuffer.length-zzEndRead);
/* fill the buffer with new input */
int requested = zzBuffer.length - zzEndRead;
int totalRead = 0;
while (totalRead < requested) {
int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
if (numRead == -1) {
break;
}
totalRead += numRead;
}
if (numRead > 0) {
zzEndRead+= numRead;
if (totalRead > 0) {
zzEndRead += totalRead;
if (totalRead == requested) { /* possibly more input available */
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
--zzEndRead;
zzFinalHighSurrogate = 1;
}
}
return false;
}
// unlikely but not impossible: read 0 characters, but not at end of stream
if (numRead == 0) {
int c = zzReader.read();
if (c == -1) {
return true;
} else {
zzBuffer[zzEndRead++] = (char) c;
return false;
}
}
// numRead < 0
// totalRead = 0: End of stream
return true;
}
@ -616,6 +631,7 @@ final void reset() {
zzEOFDone = false;
zzEndRead = zzStartRead = 0;
zzCurrentPos = zzMarkedPos = 0;
zzFinalHighSurrogate = 0;
yyline = yychar = yycolumn = 0;
zzLexicalState = YYINITIAL;
if (zzBuffer.length > ZZ_BUFFERSIZE)
@ -759,8 +775,10 @@ final void reset() {
zzForAction: {
while (true) {
if (zzCurrentPosL < zzEndReadL)
zzInput = zzBufferL[zzCurrentPosL++];
if (zzCurrentPosL < zzEndReadL) {
zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
zzCurrentPosL += Character.charCount(zzInput);
}
else if (zzAtEOF) {
zzInput = YYEOF;
break zzForAction;
@ -780,7 +798,8 @@ final void reset() {
break zzForAction;
}
else {
zzInput = zzBufferL[zzCurrentPosL++];
zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
zzCurrentPosL += Character.charCount(zzInput);
}
}
int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];

View File

@ -104,43 +104,6 @@ are part of the ICU4C package. See http://site.icu-project.org/ </echo>
<arg value="${rbbi.dst.dir}"/>
</java>
</target>
<property name="uax29.supp.macros.output.file"
location="../common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro"/>
<target name="gen-uax29-supp-macros" depends="compile-tools">
<java
classname="org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros"
dir="."
fork="true"
failonerror="true"
output="${uax29.supp.macros.output.file}">
<classpath>
<path refid="icujar"/>
<pathelement location="${build.dir}/classes/tools"/>
</classpath>
<assertions>
<enable package="org.apache.lucene"/>
</assertions>
</java>
</target>
<property name="html.strip.charfilter.supp.macros.output.file"
location="../common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro"/>
<target name="gen-html-strip-charfilter-supp-macros" depends="compile-tools">
<java
classname="org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros"
dir="."
fork="true"
failonerror="true"
output="${html.strip.charfilter.supp.macros.output.file}">
<classpath>
<path refid="icujar"/>
<pathelement location="${build.dir}/classes/tools"/>
</classpath>
</java>
</target>
<target name="compile-tools" depends="init,common.compile-tools">
<compile
@ -150,6 +113,6 @@ are part of the ICU4C package. See http://site.icu-project.org/ </echo>
</compile>
</target>
<target name="regenerate" depends="gen-html-strip-charfilter-supp-macros,gen-uax29-supp-macros,gen-utr30-data-files,gennorm2,genrbbi"/>
<target name="regenerate" depends="gen-utr30-data-files,gennorm2,genrbbi"/>
</project>

View File

@ -1,109 +0,0 @@
package org.apache.lucene.analysis.icu;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.DateFormat;
import java.util.*;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.VersionInfo;
/**
 * Command-line tool that prints JFlex macro definitions covering the
 * supplementary (above-BMP) members of the ID_Start and ID_Continue Unicode
 * sets, to augment JFlex's Unicode support for code points &gt; BMP.
 * <p>
 * Everything is written to standard output: an Apache license header, a
 * "generated by" banner, and one macro per Unicode set. Each macro is an
 * alternation of UTF-16 surrogate-pair character-class expressions.
 */
public class GenerateHTMLStripCharFilterSupplementaryMacros {
  /** All BMP code points; removed from every set so only supplementary ones remain. */
  private static final UnicodeSet BMP = new UnicodeSet("[\u0000-\uFFFF]");
  private static final String NL = System.getProperty("line.separator");

  private static final String APACHE_LICENSE
      = "/*" + NL
      + " * Copyright 2010 The Apache Software Foundation." + NL
      + " *" + NL
      + " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
      + " * you may not use this file except in compliance with the License." + NL
      + " * You may obtain a copy of the License at" + NL
      + " *" + NL
      + " * http://www.apache.org/licenses/LICENSE-2.0" + NL
      + " *" + NL
      + " * Unless required by applicable law or agreed to in writing, software" + NL
      + " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
      + " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
      + " * See the License for the specific language governing permissions and" + NL
      + " * limitations under the License." + NL
      + " */" + NL + NL;

  /**
   * Prints the header followed by the ID_Start_Supp and ID_Continue_Supp macros.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    outputHeader();
    outputMacro("ID_Start_Supp", "[:ID_Start:]");
    outputMacro("ID_Continue_Supp", "[:ID_Continue:]");
  }

  /** Prints the license text and a banner naming the ICU4J version and this generator class. */
  static void outputHeader() {
    System.out.print(APACHE_LICENSE);
    System.out.println("// Generated using ICU4J " + VersionInfo.ICU_VERSION.toString());
    System.out.println("// by " + GenerateHTMLStripCharFilterSupplementaryMacros.class.getName());
    System.out.print(NL + NL);
  }

  /**
   * Prints one macro of the form {@code name = ( alt | alt | ... )} where each
   * alternative is a lead-surrogate class followed by a trail-surrogate class.
   * <p>
   * Alternatives are emitted in sorted order of their trail-surrogate
   * expression, so regenerating with the same ICU4J data yields the same text.
   *
   * @param name    macro name to define
   * @param pattern ICU {@link UnicodeSet} pattern; its BMP members are discarded
   */
  // we have to carefully output the possibilities as compact utf-16
  // range expressions, or jflex will OOM!
  static void outputMacro(String name, String pattern) {
    UnicodeSet set = new UnicodeSet(pattern);
    set.removeAll(BMP);
    System.out.println(name + " = (");
    // if the set is empty, we have to do this or jflex will barf
    if (set.isEmpty()) {
      System.out.println("\t  []");
    }

    // Step 1: for every lead surrogate, collect the trail surrogates it pairs with.
    // Sorted maps are used (rather than hash maps) purely so output order is stable.
    Map<Character,UnicodeSet> utf16ByLead = new TreeMap<>();
    for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) {
      char[] utf16 = Character.toChars(it.codepoint);
      UnicodeSet trails = utf16ByLead.get(utf16[0]);
      if (trails == null) {
        trails = new UnicodeSet();
        utf16ByLead.put(utf16[0], trails);
      }
      trails.add(utf16[1]);
    }

    // Step 2: merge lead surrogates that share an identical trail expression,
    // so they can be written as a single compact alternative.
    Map<String,UnicodeSet> utf16ByTrail = new TreeMap<>();
    for (Map.Entry<Character,UnicodeSet> entry : utf16ByLead.entrySet()) {
      String trail = entry.getValue().getRegexEquivalent();
      UnicodeSet leads = utf16ByTrail.get(trail);
      if (leads == null) {
        leads = new UnicodeSet();
        utf16ByTrail.put(trail, leads);
      }
      leads.add(entry.getKey());
    }

    // Step 3: print "<leads><trails>" alternatives, '|'-separated after the first.
    boolean isFirst = true;
    for (Map.Entry<String,UnicodeSet> entry : utf16ByTrail.entrySet()) {
      System.out.print( isFirst ? "\t  " : "\t| ");
      isFirst = false;
      System.out.println(entry.getValue().getRegexEquivalent() + entry.getKey());
    }
    System.out.println(")");
  }
}

View File

@ -1,118 +0,0 @@
package org.apache.lucene.analysis.icu;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.DateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.TimeZone;
import java.util.TreeMap;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.VersionInfo;
/**
 * Command-line generator that prints JFlex macro definitions augmenting
 * JFlex's Unicode word-break support for code points above the BMP.
 * <p>
 * For each requested Unicode property the supplementary code points are
 * written to standard output as an alternation of UTF-16 surrogate-pair
 * expressions, one alternative per lead surrogate. The output starts with a
 * license header and a note naming the ICU4J version and this class.
 */
public class GenerateJFlexSupplementaryMacros {
  /** All BMP code points; removed from every property set so only supplementary ones remain. */
  private static final UnicodeSet BMP = new UnicodeSet("[\u0000-\uFFFF]");

  /** Platform line separator used when assembling the license header. */
  private static final String NL = System.getProperty("line.separator");

  /** License header emitted verbatim at the top of the generated output. */
  private static final String APACHE_LICENSE
      = "/*" + NL
      + " * Licensed to the Apache Software Foundation (ASF) under one or more" + NL
      + " * contributor license agreements. See the NOTICE file distributed with" + NL
      + " * this work for additional information regarding copyright ownership." + NL
      + " * The ASF licenses this file to You under the Apache License, Version 2.0" + NL
      + " * (the \"License\"); you may not use this file except in compliance with" + NL
      + " * the License. You may obtain a copy of the License at" + NL
      + " *" + NL
      + " * http://www.apache.org/licenses/LICENSE-2.0" + NL
      + " *" + NL
      + " * Unless required by applicable law or agreed to in writing, software" + NL
      + " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
      + " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
      + " * See the License for the specific language governing permissions and" + NL
      + " * limitations under the License." + NL
      + " */" + NL;

  /**
   * Prints the header followed by one supplementary macro per word-break,
   * line-break and script property listed below. Each macro name is emitted
   * exactly once.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    outputHeader();
    outputMacro("ALetterSupp", "[:WordBreak=ALetter:]");
    outputMacro("FormatSupp", "[:WordBreak=Format:]");
    outputMacro("NumericSupp", "[:WordBreak=Numeric:]");
    outputMacro("ExtendSupp", "[:WordBreak=Extend:]");
    outputMacro("KatakanaSupp", "[:WordBreak=Katakana:]");
    outputMacro("MidLetterSupp", "[:WordBreak=MidLetter:]");
    outputMacro("MidNumSupp", "[:WordBreak=MidNum:]");
    outputMacro("MidNumLetSupp", "[:WordBreak=MidNumLet:]");
    outputMacro("ExtendNumLetSupp", "[:WordBreak=ExtendNumLet:]");
    outputMacro("ComplexContextSupp", "[:LineBreak=Complex_Context:]");
    outputMacro("HanSupp", "[:Script=Han:]");
    outputMacro("HiraganaSupp", "[:Script=Hiragana:]");
    outputMacro("SingleQuoteSupp", "[:WordBreak=Single_Quote:]");
    outputMacro("DoubleQuoteSupp", "[:WordBreak=Double_Quote:]");
    outputMacro("HebrewLetterSupp", "[:WordBreak=Hebrew_Letter:]");
    outputMacro("RegionalIndicatorSupp", "[:WordBreak=Regional_Indicator:]");
  }

  /**
   * Prints the license text, the ICU4J version used for generation, the name
   * of this generator class, and two blank separator lines.
   */
  static void outputHeader() {
    System.out.print(APACHE_LICENSE);
    System.out.println("// Generated using ICU4J " + VersionInfo.ICU_VERSION.toString());
    System.out.println("// by " + GenerateJFlexSupplementaryMacros.class.getName());
    System.out.print(NL + NL);
  }

  /**
   * Prints one macro definition of the form {@code name = ( alt | alt ... )}.
   * <p>
   * We have to carefully output the possibilities as compact UTF-16 range
   * expressions, or jflex will OOM: the code points are grouped by lead
   * surrogate and each group becomes one alternative consisting of the lead
   * followed by the set of its trail surrogates. Alternatives are emitted in
   * ascending lead-surrogate order so repeated runs produce identical output.
   *
   * @param name    macro name to define
   * @param pattern ICU {@link UnicodeSet} pattern selecting the code points;
   *                BMP code points in it are discarded
   */
  static void outputMacro(String name, String pattern) {
    UnicodeSet set = new UnicodeSet(pattern);
    set.removeAll(BMP);
    System.out.println(name + " = (");
    // if the set is empty, we have to do this or jflex will barf
    if (set.isEmpty()) {
      System.out.println("\t []");
    }

    // lead surrogate -> set of trail surrogates seen with that lead;
    // sorted so the printed alternatives come out in a stable order
    Map<Character,UnicodeSet> utf16ByLead = new TreeMap<>();
    for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) {
      char utf16[] = Character.toChars(it.codepoint);
      UnicodeSet trails = utf16ByLead.get(utf16[0]);
      if (trails == null) {
        trails = new UnicodeSet();
        utf16ByLead.put(utf16[0], trails);
      }
      trails.add(utf16[1]);
    }

    boolean isFirst = true;
    for (Map.Entry<Character,UnicodeSet> entry : utf16ByLead.entrySet()) {
      char lead = entry.getKey();
      UnicodeSet trail = entry.getValue();
      System.out.print(isFirst ? "\t " : "\t| ");
      isFirst = false;
      // the lead is written as a JFlex unicode escape inside a one-character class
      System.out.println("([\\u" + Integer.toHexString(lead) + "]" + trail.getRegexEquivalent() + ")");
    }
    System.out.println(")");
  }
}

View File

@ -2196,7 +2196,7 @@ ${ant.project.name}.test.dependencies=${test.classpath.list}
<!-- JFlex task -->
<target name="-install-jflex" unless="jflex.loaded" depends="ivy-availability-check,ivy-configure">
<ivy:cachepath organisation="de.jflex" module="jflex" revision="1.5.1"
<ivy:cachepath organisation="de.jflex" module="jflex" revision="1.6.0"
inline="true" conf="default" transitive="true" pathid="jflex.classpath"/>
<taskdef name="jflex" classname="jflex.anttask.JFlexTask" classpathref="jflex.classpath"/>
<property name="jflex.loaded" value="true"/>