mirror of https://github.com/apache/lucene.git
LUCENE-5770: Upgrade to JFlex 1.6, which has direct support for supplementary code points - as a result, ICU4J is no longer used to generate surrogate pairs to augment JFlex scanner specifications.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1608134 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
35d479f1b7
commit
b207188421
|
@ -136,6 +136,13 @@ Test Framework
|
|||
* LUCENE-5786: Unflushed/truncated events file (hung testing subprocess).
|
||||
(Dawid Weiss)
|
||||
|
||||
Build
|
||||
|
||||
* LUCENE-5770: Upgrade to JFlex 1.6, which has direct support for
|
||||
supplementary code points - as a result, ICU4J is no longer used
|
||||
to generate surrogate pairs to augment JFlex scanner specifications.
|
||||
(Steve Rowe)
|
||||
|
||||
======================= Lucene 4.9.0 =======================
|
||||
|
||||
Changes in Runtime Behavior
|
||||
|
|
|
@ -29,16 +29,9 @@
|
|||
|
||||
<import file="../analysis-module-build.xml"/>
|
||||
|
||||
<target name="jflex" depends="-install-jflex,clean-jflex,-gen-uax29-supp-macros,
|
||||
-jflex-StandardAnalyzer,-jflex-UAX29URLEmailTokenizer,
|
||||
<target name="jflex" depends="-install-jflex,clean-jflex,-jflex-StandardAnalyzer,-jflex-UAX29URLEmailTokenizer,
|
||||
-jflex-wiki-tokenizer,-jflex-HTMLStripCharFilter"/>
|
||||
|
||||
<target name="-gen-uax29-supp-macros">
|
||||
<subant target="gen-uax29-supp-macros">
|
||||
<fileset dir="../icu" includes="build.xml"/>
|
||||
</subant>
|
||||
</target>
|
||||
|
||||
<target name="-jflex-HTMLStripCharFilter"
|
||||
depends="init,generate-jflex-html-char-entities">
|
||||
<jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"
|
||||
|
|
|
@ -1,64 +0,0 @@
|
|||
/*
|
||||
* Copyright 2010 The Apache Software Foundation.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// Generated using ICU4J 52.1.0.0
|
||||
// by org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros
|
||||
|
||||
|
||||
ID_Start_Supp = (
|
||||
[\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
|
||||
| [\uD81A][\uDC00-\uDE38]
|
||||
| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
|
||||
| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB]
|
||||
| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
|
||||
| [\uD82C][\uDC00\uDC01]
|
||||
| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
|
||||
| [\uD81B][\uDF00-\uDF44\uDF50\uDF93-\uDF9F]
|
||||
| [\uD87E][\uDC00-\uDE1D]
|
||||
| [\uD804][\uDC03-\uDC37\uDC83-\uDCAF\uDCD0-\uDCE8\uDD03-\uDD26\uDD83-\uDDB2\uDDC1-\uDDC4]
|
||||
| [\uD83B][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB]
|
||||
| [\uD809][\uDC00-\uDC62]
|
||||
| [\uD808][\uDC00-\uDF6E]
|
||||
| [\uD803][\uDC00-\uDC48]
|
||||
| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
|
||||
| [\uD80D][\uDC00-\uDC2E]
|
||||
| [\uD805][\uDE80-\uDEAA]
|
||||
| [\uD86E][\uDC00-\uDC1D]
|
||||
| [\uD801][\uDC00-\uDC9D]
|
||||
)
|
||||
ID_Continue_Supp = (
|
||||
[\uD81A][\uDC00-\uDE38]
|
||||
| [\uD869][\uDC00-\uDED6\uDF00-\uDFFF]
|
||||
| [\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]
|
||||
| [\uD82C][\uDC00\uDC01]
|
||||
| [\uD81B][\uDF00-\uDF44\uDF50-\uDF7E\uDF8F-\uDF9F]
|
||||
| [\uD801][\uDC00-\uDC9D\uDCA0-\uDCA9]
|
||||
| [\uD86D][\uDC00-\uDF34\uDF40-\uDFFF]
|
||||
| [\uD87E][\uDC00-\uDE1D]
|
||||
| [\uD802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00-\uDE03\uDE05\uDE06\uDE0C-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE38-\uDE3A\uDE3F\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]
|
||||
| [\uD805][\uDE80-\uDEB7\uDEC0-\uDEC9]
|
||||
| [\uD83B][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB]
|
||||
| [\uD809][\uDC00-\uDC62]
|
||||
| [\uD808][\uDC00-\uDF6E]
|
||||
| [\uD803][\uDC00-\uDC48]
|
||||
| [\uD80D][\uDC00-\uDC2E]
|
||||
| [\uD800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDDFD\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5]
|
||||
| [\uD804][\uDC00-\uDC46\uDC66-\uDC6F\uDC80-\uDCBA\uDCD0-\uDCE8\uDCF0-\uDCF9\uDD00-\uDD34\uDD36-\uDD3F\uDD80-\uDDC4\uDDD0-\uDDD9]
|
||||
| [\uD86E][\uDC00-\uDC1D]
|
||||
| [\uDB40][\uDD00-\uDDEF]
|
||||
| [\uD834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44]
|
||||
| [\uD835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB\uDFCE-\uDFFF]
|
||||
)
|
File diff suppressed because it is too large
Load Diff
|
@ -64,7 +64,7 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
|
|||
//
|
||||
// <identifier> := <ID_Start> <ID_Continue>*
|
||||
//
|
||||
Name = ( ( [:_\p{ID_Start}] | {ID_Start_Supp} ) ( [-.:_\p{ID_Continue}] | {ID_Continue_Supp} )* )
|
||||
Name = [:_\p{ID_Start}] [-.:_\p{ID_Continue}]*
|
||||
|
||||
// From Apache httpd mod_include documentation
|
||||
// <http://httpd.apache.org/docs/current/mod/mod_include.html>:
|
||||
|
@ -141,8 +141,6 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
|
||||
%include HTMLCharacterEntities.jflex
|
||||
|
||||
%include HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
|
||||
|
||||
%{
|
||||
private static final int INITIAL_INPUT_SEGMENT_SIZE = 1024;
|
||||
private static final char BLOCK_LEVEL_START_TAG_REPLACEMENT = '\n';
|
||||
|
@ -309,7 +307,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
cumulativeDiff += inputSegment.length() - outputSegment.length();
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
||||
eofReturnValue = outputSegment.length() > 0 ? outputSegment.nextChar() : -1;
|
||||
eofReturnValue = ( ! outputSegment.isRead()) ? outputSegment.nextChar() : -1;
|
||||
break;
|
||||
}
|
||||
case BANG:
|
||||
|
@ -322,7 +320,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
case LEFT_ANGLE_BRACKET_SLASH:
|
||||
case LEFT_ANGLE_BRACKET_SPACE: { // Include
|
||||
outputSegment = inputSegment;
|
||||
eofReturnValue = outputSegment.length() > 0 ? outputSegment.nextChar() : -1;
|
||||
eofReturnValue = ( ! outputSegment.isRead()) ? outputSegment.nextChar() : -1;
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
|
@ -789,7 +787,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
}
|
||||
}
|
||||
[^] {
|
||||
inputSegment.append(zzBuffer[zzStartRead]);
|
||||
inputSegment.append(yytext());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -801,7 +799,13 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
[^] { return zzBuffer[zzStartRead]; }
|
||||
[^] {
|
||||
if (yylength() == 1) {
|
||||
return zzBuffer[zzStartRead];
|
||||
} else {
|
||||
outputSegment.append(yytext()); return outputSegment.nextChar();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
<COMMENT> {
|
||||
|
@ -916,7 +920,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
|
||||
<AMPERSAND,NUMERIC_CHARACTER,CHARACTER_REFERENCE_TAIL,LEFT_ANGLE_BRACKET_SLASH,END_TAG_TAIL_INCLUDE,END_TAG_TAIL_EXCLUDE,END_TAG_TAIL_SUBSTITUTE,LEFT_ANGLE_BRACKET,LEFT_ANGLE_BRACKET_SPACE,START_TAG_TAIL_INCLUDE,START_TAG_TAIL_EXCLUDE,START_TAG_TAIL_SUBSTITUTE,BANG> {
|
||||
[^] {
|
||||
yypushback(1);
|
||||
yypushback(yylength());
|
||||
outputSegment = inputSegment;
|
||||
outputSegment.restart();
|
||||
yybegin(YYINITIAL);
|
||||
|
@ -924,4 +928,10 @@ InlineElment = ( [aAbBiIqQsSuU] |
|
|||
}
|
||||
}
|
||||
|
||||
[^] { return zzBuffer[zzStartRead]; }
|
||||
[^] {
|
||||
if (yylength() == 1) {
|
||||
return zzBuffer[zzStartRead];
|
||||
} else {
|
||||
outputSegment.append(yytext()); return outputSegment.nextChar();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* The following code was generated by JFlex 1.5.1 */
|
||||
/* The following code was generated by JFlex 1.6.0 */
|
||||
|
||||
package org.apache.lucene.analysis.standard;
|
||||
|
||||
|
@ -114,7 +114,7 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
|
|||
"\1\0\2\12\1\0\2\12\1\0\154\12\41\0\u016b\12\22\0\100\12"+
|
||||
"\2\0\66\12\50\0\14\12\164\0\3\12\1\0\1\12\1\0\207\12"+
|
||||
"\23\0\12\2\7\0\32\12\6\0\32\12\12\0\1\13\72\13\37\12"+
|
||||
"\3\0\6\12\2\0\6\12\2\0\6\12\2\0\3\12\43\0";
|
||||
"\3\0\6\12\2\0\6\12\2\0\6\12\2\0\3\12\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\63\0";
|
||||
|
||||
/**
|
||||
* Translates characters to character classes
|
||||
|
@ -339,6 +339,14 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
|
|||
|
||||
/** denotes if the user-EOF-code has already been executed */
|
||||
private boolean zzEOFDone;
|
||||
|
||||
/**
|
||||
* The number of occupied positions in zzBuffer beyond zzEndRead.
|
||||
* When a lead/high surrogate has been read from the input stream
|
||||
* into the final zzBuffer position, this will have a value of 1;
|
||||
* otherwise, it will have a value of 0.
|
||||
*/
|
||||
private int zzFinalHighSurrogate = 0;
|
||||
|
||||
/* user code: */
|
||||
|
||||
|
@ -385,10 +393,10 @@ public final void getText(CharTermAttribute t) {
|
|||
* @return the unpacked character translation table
|
||||
*/
|
||||
private static char [] zzUnpackCMap(String packed) {
|
||||
char [] map = new char[0x10000];
|
||||
char [] map = new char[0x110000];
|
||||
int i = 0; /* index in packed string */
|
||||
int j = 0; /* index in unpacked array */
|
||||
while (i < 1138) {
|
||||
while (i < 1170) {
|
||||
int count = packed.charAt(i++);
|
||||
char value = packed.charAt(i++);
|
||||
do map[j++] = value; while (--count > 0);
|
||||
|
@ -408,6 +416,8 @@ public final void getText(CharTermAttribute t) {
|
|||
|
||||
/* first: make room (if you can) */
|
||||
if (zzStartRead > 0) {
|
||||
zzEndRead += zzFinalHighSurrogate;
|
||||
zzFinalHighSurrogate = 0;
|
||||
System.arraycopy(zzBuffer, zzStartRead,
|
||||
zzBuffer, 0,
|
||||
zzEndRead-zzStartRead);
|
||||
|
@ -420,33 +430,38 @@ public final void getText(CharTermAttribute t) {
|
|||
}
|
||||
|
||||
/* is the buffer big enough? */
|
||||
if (zzCurrentPos >= zzBuffer.length) {
|
||||
if (zzCurrentPos >= zzBuffer.length - zzFinalHighSurrogate) {
|
||||
/* if not: blow it up */
|
||||
char newBuffer[] = new char[zzCurrentPos*2];
|
||||
char newBuffer[] = new char[zzBuffer.length*2];
|
||||
System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
|
||||
zzBuffer = newBuffer;
|
||||
zzEndRead += zzFinalHighSurrogate;
|
||||
zzFinalHighSurrogate = 0;
|
||||
}
|
||||
|
||||
/* finally: fill the buffer with new input */
|
||||
int numRead = zzReader.read(zzBuffer, zzEndRead,
|
||||
zzBuffer.length-zzEndRead);
|
||||
/* fill the buffer with new input */
|
||||
int requested = zzBuffer.length - zzEndRead;
|
||||
int totalRead = 0;
|
||||
while (totalRead < requested) {
|
||||
int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
|
||||
if (numRead == -1) {
|
||||
break;
|
||||
}
|
||||
totalRead += numRead;
|
||||
}
|
||||
|
||||
if (numRead > 0) {
|
||||
zzEndRead+= numRead;
|
||||
if (totalRead > 0) {
|
||||
zzEndRead += totalRead;
|
||||
if (totalRead == requested) { /* possibly more input available */
|
||||
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
|
||||
--zzEndRead;
|
||||
zzFinalHighSurrogate = 1;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
// unlikely but not impossible: read 0 characters, but not at end of stream
|
||||
if (numRead == 0) {
|
||||
int c = zzReader.read();
|
||||
if (c == -1) {
|
||||
return true;
|
||||
} else {
|
||||
zzBuffer[zzEndRead++] = (char) c;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// numRead < 0
|
||||
// totalRead = 0: End of stream
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -482,6 +497,7 @@ public final void getText(CharTermAttribute t) {
|
|||
zzEOFDone = false;
|
||||
zzEndRead = zzStartRead = 0;
|
||||
zzCurrentPos = zzMarkedPos = 0;
|
||||
zzFinalHighSurrogate = 0;
|
||||
yyline = yychar = yycolumn = 0;
|
||||
zzLexicalState = YYINITIAL;
|
||||
if (zzBuffer.length > ZZ_BUFFERSIZE)
|
||||
|
@ -625,8 +641,10 @@ public final void getText(CharTermAttribute t) {
|
|||
zzForAction: {
|
||||
while (true) {
|
||||
|
||||
if (zzCurrentPosL < zzEndReadL)
|
||||
zzInput = zzBufferL[zzCurrentPosL++];
|
||||
if (zzCurrentPosL < zzEndReadL) {
|
||||
zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
|
||||
zzCurrentPosL += Character.charCount(zzInput);
|
||||
}
|
||||
else if (zzAtEOF) {
|
||||
zzInput = YYEOF;
|
||||
break zzForAction;
|
||||
|
@ -646,7 +664,8 @@ public final void getText(CharTermAttribute t) {
|
|||
break zzForAction;
|
||||
}
|
||||
else {
|
||||
zzInput = zzBufferL[zzCurrentPosL++];
|
||||
zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
|
||||
zzCurrentPosL += Character.charCount(zzInput);
|
||||
}
|
||||
}
|
||||
int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
|
||||
|
|
|
@ -1,143 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
// Generated using ICU4J 52.1.0.0
|
||||
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
|
||||
|
||||
|
||||
ALetterSupp = (
|
||||
([\ud83b][\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB])
|
||||
| ([\ud81a][\uDC00-\uDE38])
|
||||
| ([\ud81b][\uDF00-\uDF44\uDF50\uDF93-\uDF9F])
|
||||
| ([\ud835][\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB])
|
||||
| ([\ud80d][\uDC00-\uDC2E])
|
||||
| ([\ud80c][\uDC00-\uDFFF])
|
||||
| ([\ud809][\uDC00-\uDC62])
|
||||
| ([\ud808][\uDC00-\uDF6E])
|
||||
| ([\ud805][\uDE80-\uDEAA])
|
||||
| ([\ud804][\uDC03-\uDC37\uDC83-\uDCAF\uDCD0-\uDCE8\uDD03-\uDD26\uDD83-\uDDB2\uDDC1-\uDDC4])
|
||||
| ([\ud801][\uDC00-\uDC9D])
|
||||
| ([\ud800][\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDD40-\uDD74\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF4A\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF\uDFD1-\uDFD5])
|
||||
| ([\ud803][\uDC00-\uDC48])
|
||||
| ([\ud802][\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72])
|
||||
)
|
||||
FormatSupp = (
|
||||
([\ud804][\uDCBD])
|
||||
| ([\ud834][\uDD73-\uDD7A])
|
||||
| ([\udb40][\uDC01\uDC20-\uDC7F])
|
||||
)
|
||||
NumericSupp = (
|
||||
([\ud805][\uDEC0-\uDEC9])
|
||||
| ([\ud804][\uDC66-\uDC6F\uDCF0-\uDCF9\uDD36-\uDD3F\uDDD0-\uDDD9])
|
||||
| ([\ud835][\uDFCE-\uDFFF])
|
||||
| ([\ud801][\uDCA0-\uDCA9])
|
||||
)
|
||||
ExtendSupp = (
|
||||
([\ud81b][\uDF51-\uDF7E\uDF8F-\uDF92])
|
||||
| ([\ud805][\uDEAB-\uDEB7])
|
||||
| ([\ud804][\uDC00-\uDC02\uDC38-\uDC46\uDC80-\uDC82\uDCB0-\uDCBA\uDD00-\uDD02\uDD27-\uDD34\uDD80-\uDD82\uDDB3-\uDDC0])
|
||||
| ([\ud834][\uDD65-\uDD69\uDD6D-\uDD72\uDD7B-\uDD82\uDD85-\uDD8B\uDDAA-\uDDAD\uDE42-\uDE44])
|
||||
| ([\ud800][\uDDFD])
|
||||
| ([\udb40][\uDD00-\uDDEF])
|
||||
| ([\ud802][\uDE01-\uDE03\uDE05\uDE06\uDE0C-\uDE0F\uDE38-\uDE3A\uDE3F])
|
||||
)
|
||||
KatakanaSupp = (
|
||||
([\ud82c][\uDC00])
|
||||
)
|
||||
MidLetterSupp = (
|
||||
[]
|
||||
)
|
||||
MidNumSupp = (
|
||||
[]
|
||||
)
|
||||
MidNumLetSupp = (
|
||||
[]
|
||||
)
|
||||
ExtendNumLetSupp = (
|
||||
[]
|
||||
)
|
||||
ExtendNumLetSupp = (
|
||||
[]
|
||||
)
|
||||
ComplexContextSupp = (
|
||||
[]
|
||||
)
|
||||
HanSupp = (
|
||||
([\ud87e][\uDC00-\uDE1D])
|
||||
| ([\ud86b][\uDC00-\uDFFF])
|
||||
| ([\ud86a][\uDC00-\uDFFF])
|
||||
| ([\ud869][\uDC00-\uDED6\uDF00-\uDFFF])
|
||||
| ([\ud868][\uDC00-\uDFFF])
|
||||
| ([\ud86e][\uDC00-\uDC1D])
|
||||
| ([\ud86d][\uDC00-\uDF34\uDF40-\uDFFF])
|
||||
| ([\ud86c][\uDC00-\uDFFF])
|
||||
| ([\ud863][\uDC00-\uDFFF])
|
||||
| ([\ud862][\uDC00-\uDFFF])
|
||||
| ([\ud861][\uDC00-\uDFFF])
|
||||
| ([\ud860][\uDC00-\uDFFF])
|
||||
| ([\ud867][\uDC00-\uDFFF])
|
||||
| ([\ud866][\uDC00-\uDFFF])
|
||||
| ([\ud865][\uDC00-\uDFFF])
|
||||
| ([\ud864][\uDC00-\uDFFF])
|
||||
| ([\ud858][\uDC00-\uDFFF])
|
||||
| ([\ud859][\uDC00-\uDFFF])
|
||||
| ([\ud85a][\uDC00-\uDFFF])
|
||||
| ([\ud85b][\uDC00-\uDFFF])
|
||||
| ([\ud85c][\uDC00-\uDFFF])
|
||||
| ([\ud85d][\uDC00-\uDFFF])
|
||||
| ([\ud85e][\uDC00-\uDFFF])
|
||||
| ([\ud85f][\uDC00-\uDFFF])
|
||||
| ([\ud850][\uDC00-\uDFFF])
|
||||
| ([\ud851][\uDC00-\uDFFF])
|
||||
| ([\ud852][\uDC00-\uDFFF])
|
||||
| ([\ud853][\uDC00-\uDFFF])
|
||||
| ([\ud854][\uDC00-\uDFFF])
|
||||
| ([\ud855][\uDC00-\uDFFF])
|
||||
| ([\ud856][\uDC00-\uDFFF])
|
||||
| ([\ud857][\uDC00-\uDFFF])
|
||||
| ([\ud849][\uDC00-\uDFFF])
|
||||
| ([\ud848][\uDC00-\uDFFF])
|
||||
| ([\ud84b][\uDC00-\uDFFF])
|
||||
| ([\ud84a][\uDC00-\uDFFF])
|
||||
| ([\ud84d][\uDC00-\uDFFF])
|
||||
| ([\ud84c][\uDC00-\uDFFF])
|
||||
| ([\ud84f][\uDC00-\uDFFF])
|
||||
| ([\ud84e][\uDC00-\uDFFF])
|
||||
| ([\ud841][\uDC00-\uDFFF])
|
||||
| ([\ud840][\uDC00-\uDFFF])
|
||||
| ([\ud843][\uDC00-\uDFFF])
|
||||
| ([\ud842][\uDC00-\uDFFF])
|
||||
| ([\ud845][\uDC00-\uDFFF])
|
||||
| ([\ud844][\uDC00-\uDFFF])
|
||||
| ([\ud847][\uDC00-\uDFFF])
|
||||
| ([\ud846][\uDC00-\uDFFF])
|
||||
)
|
||||
HiraganaSupp = (
|
||||
([\ud83c][\uDE00])
|
||||
| ([\ud82c][\uDC01])
|
||||
)
|
||||
SingleQuoteSupp = (
|
||||
[]
|
||||
)
|
||||
DoubleQuoteSupp = (
|
||||
[]
|
||||
)
|
||||
HebrewLetterSupp = (
|
||||
[]
|
||||
)
|
||||
RegionalIndicatorSupp = (
|
||||
([\ud83c][\uDDE6-\uDDFF])
|
||||
)
|
File diff suppressed because it is too large
Load Diff
|
@ -48,41 +48,22 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
%char
|
||||
%buffer 4096
|
||||
|
||||
%include SUPPLEMENTARY.jflex-macro
|
||||
ALetter = (\p{WB:ALetter} | {ALetterSupp})
|
||||
Format = (\p{WB:Format} | {FormatSupp})
|
||||
Numeric = ([\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] | {NumericSupp})
|
||||
Extend = (\p{WB:Extend} | {ExtendSupp})
|
||||
Katakana = (\p{WB:Katakana} | {KatakanaSupp})
|
||||
MidLetter = (\p{WB:MidLetter} | {MidLetterSupp})
|
||||
MidNum = (\p{WB:MidNum} | {MidNumSupp})
|
||||
MidNumLet = (\p{WB:MidNumLet} | {MidNumLetSupp})
|
||||
ExtendNumLet = (\p{WB:ExtendNumLet} | {ExtendNumLetSupp})
|
||||
ComplexContext = (\p{LB:Complex_Context} | {ComplexContextSupp})
|
||||
Han = (\p{Script:Han} | {HanSupp})
|
||||
Hiragana = (\p{Script:Hiragana} | {HiraganaSupp})
|
||||
SingleQuote = (\p{WB:Single_Quote} | {SingleQuoteSupp})
|
||||
DoubleQuote = (\p{WB:Double_Quote} | {DoubleQuoteSupp})
|
||||
HebrewLetter = (\p{WB:Hebrew_Letter} | {HebrewLetterSupp})
|
||||
RegionalIndicator = (\p{WB:Regional_Indicator} | {RegionalIndicatorSupp})
|
||||
HebrewOrALetter = ({HebrewLetter} | {ALetter})
|
||||
|
||||
// UAX#29 WB4. X (Extend | Format)* --> X
|
||||
//
|
||||
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] ({Format} | {Extend})*
|
||||
HebrewOrALetterEx = {HebrewOrALetter} ({Format} | {Extend})*
|
||||
NumericEx = {Numeric} ({Format} | {Extend})*
|
||||
KatakanaEx = {Katakana} ({Format} | {Extend})*
|
||||
MidLetterEx = ({MidLetter} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
|
||||
MidNumericEx = ({MidNum} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
|
||||
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
|
||||
HanEx = {Han} ({Format} | {Extend})*
|
||||
HiraganaEx = {Hiragana} ({Format} | {Extend})*
|
||||
SingleQuoteEx = {SingleQuote} ({Format} | {Extend})*
|
||||
DoubleQuoteEx = {DoubleQuote} ({Format} | {Extend})*
|
||||
HebrewLetterEx = {HebrewLetter} ({Format} | {Extend})*
|
||||
RegionalIndicatorEx = {RegionalIndicator} ({Format} | {Extend})*
|
||||
|
||||
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
|
||||
HebrewOrALetterEx = [\p{WB:HebrewLetter}\p{WB:ALetter}] [\p{WB:Format}\p{WB:Extend}]*
|
||||
NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] [\p{WB:Format}\p{WB:Extend}]*
|
||||
KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
|
||||
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
|
||||
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
|
||||
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
|
||||
HanEx = \p{Script:Han} [\p{WB:Format}\p{WB:Extend}]*
|
||||
HiraganaEx = \p{Script:Hiragana} [\p{WB:Format}\p{WB:Extend}]*
|
||||
SingleQuoteEx = \p{WB:Single_Quote} [\p{WB:Format}\p{WB:Extend}]*
|
||||
DoubleQuoteEx = \p{WB:Double_Quote} [\p{WB:Format}\p{WB:Extend}]*
|
||||
HebrewLetterEx = \p{WB:Hebrew_Letter} [\p{WB:Format}\p{WB:Extend}]*
|
||||
RegionalIndicatorEx = \p{WB:RegionalIndicator} [\p{WB:Format}\p{WB:Extend}]*
|
||||
ComplexContextEx = \p{LB:Complex_Context} [\p{WB:Format}\p{WB:Extend}]*
|
||||
|
||||
%{
|
||||
/** Alphanumeric sequences */
|
||||
|
@ -193,7 +174,7 @@ RegionalIndicatorEx = {RegionalIndicator}
|
|||
//
|
||||
// http://www.unicode.org/reports/tr14/#SA
|
||||
//
|
||||
{ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; }
|
||||
{ComplexContextEx}+ { return SOUTH_EAST_ASIAN_TYPE; }
|
||||
|
||||
// UAX#29 WB14. Any ÷ Any
|
||||
//
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -52,40 +52,22 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
%xstate AVOID_BAD_URL
|
||||
%buffer 4096
|
||||
|
||||
%include SUPPLEMENTARY.jflex-macro
|
||||
ALetter = (\p{WB:ALetter} | {ALetterSupp})
|
||||
Format = (\p{WB:Format} | {FormatSupp})
|
||||
Numeric = ([\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] | {NumericSupp})
|
||||
Extend = (\p{WB:Extend} | {ExtendSupp})
|
||||
Katakana = (\p{WB:Katakana} | {KatakanaSupp})
|
||||
MidLetter = (\p{WB:MidLetter} | {MidLetterSupp})
|
||||
MidNum = (\p{WB:MidNum} | {MidNumSupp})
|
||||
MidNumLet = (\p{WB:MidNumLet} | {MidNumLetSupp})
|
||||
ExtendNumLet = (\p{WB:ExtendNumLet} | {ExtendNumLetSupp})
|
||||
ComplexContext = (\p{LB:Complex_Context} | {ComplexContextSupp})
|
||||
Han = (\p{Script:Han} | {HanSupp})
|
||||
Hiragana = (\p{Script:Hiragana} | {HiraganaSupp})
|
||||
SingleQuote = (\p{WB:Single_Quote} | {SingleQuoteSupp})
|
||||
DoubleQuote = (\p{WB:Double_Quote} | {DoubleQuoteSupp})
|
||||
HebrewLetter = (\p{WB:Hebrew_Letter} | {HebrewLetterSupp})
|
||||
RegionalIndicator = (\p{WB:Regional_Indicator} | {RegionalIndicatorSupp})
|
||||
HebrewOrALetter = ({HebrewLetter} | {ALetter})
|
||||
|
||||
// UAX#29 WB4. X (Extend | Format)* --> X
|
||||
//
|
||||
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] ({Format} | {Extend})*
|
||||
HebrewOrALetterEx = {HebrewOrALetter} ({Format} | {Extend})*
|
||||
NumericEx = {Numeric} ({Format} | {Extend})*
|
||||
KatakanaEx = {Katakana} ({Format} | {Extend})*
|
||||
MidLetterEx = ({MidLetter} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
|
||||
MidNumericEx = ({MidNum} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
|
||||
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
|
||||
HanEx = {Han} ({Format} | {Extend})*
|
||||
HiraganaEx = {Hiragana} ({Format} | {Extend})*
|
||||
SingleQuoteEx = {SingleQuote} ({Format} | {Extend})*
|
||||
DoubleQuoteEx = {DoubleQuote} ({Format} | {Extend})*
|
||||
HebrewLetterEx = {HebrewLetter} ({Format} | {Extend})*
|
||||
RegionalIndicatorEx = {RegionalIndicator} ({Format} | {Extend})*
|
||||
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] [\p{WB:Format}\p{WB:Extend}]*
|
||||
HebrewOrALetterEx = [\p{WB:HebrewLetter}\p{WB:ALetter}] [\p{WB:Format}\p{WB:Extend}]*
|
||||
NumericEx = [\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] [\p{WB:Format}\p{WB:Extend}]*
|
||||
KatakanaEx = \p{WB:Katakana} [\p{WB:Format}\p{WB:Extend}]*
|
||||
MidLetterEx = [\p{WB:MidLetter}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
|
||||
MidNumericEx = [\p{WB:MidNum}\p{WB:MidNumLet}\p{WB:SingleQuote}] [\p{WB:Format}\p{WB:Extend}]*
|
||||
ExtendNumLetEx = \p{WB:ExtendNumLet} [\p{WB:Format}\p{WB:Extend}]*
|
||||
HanEx = \p{Script:Han} [\p{WB:Format}\p{WB:Extend}]*
|
||||
HiraganaEx = \p{Script:Hiragana} [\p{WB:Format}\p{WB:Extend}]*
|
||||
SingleQuoteEx = \p{WB:Single_Quote} [\p{WB:Format}\p{WB:Extend}]*
|
||||
DoubleQuoteEx = \p{WB:Double_Quote} [\p{WB:Format}\p{WB:Extend}]*
|
||||
HebrewLetterEx = \p{WB:Hebrew_Letter} [\p{WB:Format}\p{WB:Extend}]*
|
||||
RegionalIndicatorEx = \p{WB:RegionalIndicator} [\p{WB:Format}\p{WB:Extend}]*
|
||||
ComplexContextEx = \p{LB:Complex_Context} [\p{WB:Format}\p{WB:Extend}]*
|
||||
|
||||
// URL and E-mail syntax specifications:
|
||||
//
|
||||
|
@ -304,7 +286,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
|
|||
//
|
||||
// http://www.unicode.org/reports/tr14/#SA
|
||||
//
|
||||
{ComplexContext}+ { yybegin(YYINITIAL); return SOUTH_EAST_ASIAN_TYPE; }
|
||||
{ComplexContextEx}+ { yybegin(YYINITIAL); return SOUTH_EAST_ASIAN_TYPE; }
|
||||
|
||||
// UAX#29 WB14. Any ÷ Any
|
||||
//
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* The following code was generated by JFlex 1.5.1 */
|
||||
/* The following code was generated by JFlex 1.6.0 */
|
||||
|
||||
package org.apache.lucene.analysis.wikipedia;
|
||||
|
||||
|
@ -71,7 +71,7 @@ class WikipediaTokenizerImpl {
|
|||
"\167\15\11\17\166\15\12\17\166\15\12\17\166\15\12\17\340\15\12\17"+
|
||||
"\166\15\12\17\u0166\15\12\17\266\15\u0100\15\u0e00\15\u1040\0\u0150\21\140\0"+
|
||||
"\20\21\u0100\0\200\21\200\0\u19c0\21\100\0\u5200\21\u0c00\0\u2bb0\20\u2150\0"+
|
||||
"\u0200\21\u0465\0\73\21\75\15\43\0";
|
||||
"\u0200\21\u0465\0\73\21\75\15\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\uffff\0\63\0";
|
||||
|
||||
/**
|
||||
* Translates characters to character classes
|
||||
|
@ -427,6 +427,14 @@ class WikipediaTokenizerImpl {
|
|||
|
||||
/** denotes if the user-EOF-code has already been executed */
|
||||
private boolean zzEOFDone;
|
||||
|
||||
/**
|
||||
* The number of occupied positions in zzBuffer beyond zzEndRead.
|
||||
* When a lead/high surrogate has been read from the input stream
|
||||
* into the final zzBuffer position, this will have a value of 1;
|
||||
* otherwise, it will have a value of 0.
|
||||
*/
|
||||
private int zzFinalHighSurrogate = 0;
|
||||
|
||||
/* user code: */
|
||||
|
||||
|
@ -519,10 +527,10 @@ final void reset() {
|
|||
* @return the unpacked character translation table
|
||||
*/
|
||||
private static char [] zzUnpackCMap(String packed) {
|
||||
char [] map = new char[0x10000];
|
||||
char [] map = new char[0x110000];
|
||||
int i = 0; /* index in packed string */
|
||||
int j = 0; /* index in unpacked array */
|
||||
while (i < 230) {
|
||||
while (i < 262) {
|
||||
int count = packed.charAt(i++);
|
||||
char value = packed.charAt(i++);
|
||||
do map[j++] = value; while (--count > 0);
|
||||
|
@ -542,6 +550,8 @@ final void reset() {
|
|||
|
||||
/* first: make room (if you can) */
|
||||
if (zzStartRead > 0) {
|
||||
zzEndRead += zzFinalHighSurrogate;
|
||||
zzFinalHighSurrogate = 0;
|
||||
System.arraycopy(zzBuffer, zzStartRead,
|
||||
zzBuffer, 0,
|
||||
zzEndRead-zzStartRead);
|
||||
|
@ -554,33 +564,38 @@ final void reset() {
|
|||
}
|
||||
|
||||
/* is the buffer big enough? */
|
||||
if (zzCurrentPos >= zzBuffer.length) {
|
||||
if (zzCurrentPos >= zzBuffer.length - zzFinalHighSurrogate) {
|
||||
/* if not: blow it up */
|
||||
char newBuffer[] = new char[zzCurrentPos*2];
|
||||
char newBuffer[] = new char[zzBuffer.length*2];
|
||||
System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
|
||||
zzBuffer = newBuffer;
|
||||
zzEndRead += zzFinalHighSurrogate;
|
||||
zzFinalHighSurrogate = 0;
|
||||
}
|
||||
|
||||
/* finally: fill the buffer with new input */
|
||||
int numRead = zzReader.read(zzBuffer, zzEndRead,
|
||||
zzBuffer.length-zzEndRead);
|
||||
/* fill the buffer with new input */
|
||||
int requested = zzBuffer.length - zzEndRead;
|
||||
int totalRead = 0;
|
||||
while (totalRead < requested) {
|
||||
int numRead = zzReader.read(zzBuffer, zzEndRead + totalRead, requested - totalRead);
|
||||
if (numRead == -1) {
|
||||
break;
|
||||
}
|
||||
totalRead += numRead;
|
||||
}
|
||||
|
||||
if (numRead > 0) {
|
||||
zzEndRead+= numRead;
|
||||
if (totalRead > 0) {
|
||||
zzEndRead += totalRead;
|
||||
if (totalRead == requested) { /* possibly more input available */
|
||||
if (Character.isHighSurrogate(zzBuffer[zzEndRead - 1])) {
|
||||
--zzEndRead;
|
||||
zzFinalHighSurrogate = 1;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
// unlikely but not impossible: read 0 characters, but not at end of stream
|
||||
if (numRead == 0) {
|
||||
int c = zzReader.read();
|
||||
if (c == -1) {
|
||||
return true;
|
||||
} else {
|
||||
zzBuffer[zzEndRead++] = (char) c;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// numRead < 0
|
||||
// totalRead = 0: End of stream
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -616,6 +631,7 @@ final void reset() {
|
|||
zzEOFDone = false;
|
||||
zzEndRead = zzStartRead = 0;
|
||||
zzCurrentPos = zzMarkedPos = 0;
|
||||
zzFinalHighSurrogate = 0;
|
||||
yyline = yychar = yycolumn = 0;
|
||||
zzLexicalState = YYINITIAL;
|
||||
if (zzBuffer.length > ZZ_BUFFERSIZE)
|
||||
|
@ -759,8 +775,10 @@ final void reset() {
|
|||
zzForAction: {
|
||||
while (true) {
|
||||
|
||||
if (zzCurrentPosL < zzEndReadL)
|
||||
zzInput = zzBufferL[zzCurrentPosL++];
|
||||
if (zzCurrentPosL < zzEndReadL) {
|
||||
zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
|
||||
zzCurrentPosL += Character.charCount(zzInput);
|
||||
}
|
||||
else if (zzAtEOF) {
|
||||
zzInput = YYEOF;
|
||||
break zzForAction;
|
||||
|
@ -780,7 +798,8 @@ final void reset() {
|
|||
break zzForAction;
|
||||
}
|
||||
else {
|
||||
zzInput = zzBufferL[zzCurrentPosL++];
|
||||
zzInput = Character.codePointAt(zzBufferL, zzCurrentPosL, zzEndReadL);
|
||||
zzCurrentPosL += Character.charCount(zzInput);
|
||||
}
|
||||
}
|
||||
int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
|
||||
|
|
|
@ -104,43 +104,6 @@ are part of the ICU4C package. See http://site.icu-project.org/ </echo>
|
|||
<arg value="${rbbi.dst.dir}"/>
|
||||
</java>
|
||||
</target>
|
||||
|
||||
<property name="uax29.supp.macros.output.file"
|
||||
location="../common/src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro"/>
|
||||
|
||||
<target name="gen-uax29-supp-macros" depends="compile-tools">
|
||||
<java
|
||||
classname="org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros"
|
||||
dir="."
|
||||
fork="true"
|
||||
failonerror="true"
|
||||
output="${uax29.supp.macros.output.file}">
|
||||
<classpath>
|
||||
<path refid="icujar"/>
|
||||
<pathelement location="${build.dir}/classes/tools"/>
|
||||
</classpath>
|
||||
<assertions>
|
||||
<enable package="org.apache.lucene"/>
|
||||
</assertions>
|
||||
</java>
|
||||
</target>
|
||||
|
||||
<property name="html.strip.charfilter.supp.macros.output.file"
|
||||
location="../common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro"/>
|
||||
|
||||
<target name="gen-html-strip-charfilter-supp-macros" depends="compile-tools">
|
||||
<java
|
||||
classname="org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros"
|
||||
dir="."
|
||||
fork="true"
|
||||
failonerror="true"
|
||||
output="${html.strip.charfilter.supp.macros.output.file}">
|
||||
<classpath>
|
||||
<path refid="icujar"/>
|
||||
<pathelement location="${build.dir}/classes/tools"/>
|
||||
</classpath>
|
||||
</java>
|
||||
</target>
|
||||
|
||||
<target name="compile-tools" depends="init,common.compile-tools">
|
||||
<compile
|
||||
|
@ -150,6 +113,6 @@ are part of the ICU4C package. See http://site.icu-project.org/ </echo>
|
|||
</compile>
|
||||
</target>
|
||||
|
||||
<target name="regenerate" depends="gen-html-strip-charfilter-supp-macros,gen-uax29-supp-macros,gen-utr30-data-files,gennorm2,genrbbi"/>
|
||||
<target name="regenerate" depends="gen-utr30-data-files,gennorm2,genrbbi"/>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -1,109 +0,0 @@
|
|||
package org.apache.lucene.analysis.icu;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.text.DateFormat;
|
||||
import java.util.*;
|
||||
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
import com.ibm.icu.util.VersionInfo;
|
||||
|
||||
/** creates a macro to augment jflex's unicode support for > BMP */
|
||||
public class GenerateHTMLStripCharFilterSupplementaryMacros {
|
||||
private static final UnicodeSet BMP = new UnicodeSet("[\u0000-\uFFFF]");
|
||||
private static final String NL = System.getProperty("line.separator");
|
||||
private static final DateFormat DATE_FORMAT = DateFormat.getDateTimeInstance
|
||||
(DateFormat.FULL, DateFormat.FULL, Locale.ROOT);
|
||||
static {
|
||||
DATE_FORMAT.setTimeZone(TimeZone.getTimeZone("UTC"));
|
||||
}
|
||||
|
||||
private static final String APACHE_LICENSE
|
||||
= "/*" + NL
|
||||
+ " * Copyright 2010 The Apache Software Foundation." + NL
|
||||
+ " *" + NL
|
||||
+ " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
|
||||
+ " * you may not use this file except in compliance with the License." + NL
|
||||
+ " * You may obtain a copy of the License at" + NL
|
||||
+ " *" + NL
|
||||
+ " * http://www.apache.org/licenses/LICENSE-2.0" + NL
|
||||
+ " *" + NL
|
||||
+ " * Unless required by applicable law or agreed to in writing, software" + NL
|
||||
+ " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
|
||||
+ " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
|
||||
+ " * See the License for the specific language governing permissions and" + NL
|
||||
+ " * limitations under the License." + NL
|
||||
+ " */" + NL + NL;
|
||||
|
||||
|
||||
public static void main(String args[]) {
|
||||
outputHeader();
|
||||
outputMacro("ID_Start_Supp", "[:ID_Start:]");
|
||||
outputMacro("ID_Continue_Supp", "[:ID_Continue:]");
|
||||
}
|
||||
|
||||
static void outputHeader() {
|
||||
System.out.print(APACHE_LICENSE);
|
||||
System.out.println("// Generated using ICU4J " + VersionInfo.ICU_VERSION.toString());
|
||||
System.out.println("// by " + GenerateHTMLStripCharFilterSupplementaryMacros.class.getName());
|
||||
System.out.print(NL + NL);
|
||||
}
|
||||
|
||||
// we have to carefully output the possibilities as compact utf-16
|
||||
// range expressions, or jflex will OOM!
|
||||
static void outputMacro(String name, String pattern) {
|
||||
UnicodeSet set = new UnicodeSet(pattern);
|
||||
set.removeAll(BMP);
|
||||
System.out.println(name + " = (");
|
||||
// if the set is empty, we have to do this or jflex will barf
|
||||
if (set.isEmpty()) {
|
||||
System.out.println("\t []");
|
||||
}
|
||||
|
||||
HashMap<Character,UnicodeSet> utf16ByLead = new HashMap<>();
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) {
|
||||
char utf16[] = Character.toChars(it.codepoint);
|
||||
UnicodeSet trails = utf16ByLead.get(utf16[0]);
|
||||
if (trails == null) {
|
||||
trails = new UnicodeSet();
|
||||
utf16ByLead.put(utf16[0], trails);
|
||||
}
|
||||
trails.add(utf16[1]);
|
||||
}
|
||||
|
||||
Map<String,UnicodeSet> utf16ByTrail = new HashMap<>();
|
||||
for (Map.Entry<Character,UnicodeSet> entry : utf16ByLead.entrySet()) {
|
||||
String trail = entry.getValue().getRegexEquivalent();
|
||||
UnicodeSet leads = utf16ByTrail.get(trail);
|
||||
if (leads == null) {
|
||||
leads = new UnicodeSet();
|
||||
utf16ByTrail.put(trail, leads);
|
||||
}
|
||||
leads.add(entry.getKey());
|
||||
}
|
||||
|
||||
boolean isFirst = true;
|
||||
for (Map.Entry<String,UnicodeSet> entry : utf16ByTrail.entrySet()) {
|
||||
System.out.print( isFirst ? "\t " : "\t| ");
|
||||
isFirst = false;
|
||||
System.out.println(entry.getValue().getRegexEquivalent() + entry.getKey());
|
||||
}
|
||||
System.out.println(")");
|
||||
}
|
||||
}
|
|
@ -1,118 +0,0 @@
|
|||
package org.apache.lucene.analysis.icu;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.text.DateFormat;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.Locale;
|
||||
import java.util.TimeZone;
|
||||
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
import com.ibm.icu.util.VersionInfo;
|
||||
|
||||
/** creates a macro to augment jflex's unicode wordbreak support for > BMP */
|
||||
public class GenerateJFlexSupplementaryMacros {
|
||||
private static final UnicodeSet BMP = new UnicodeSet("[\u0000-\uFFFF]");
|
||||
private static final String NL = System.getProperty("line.separator");
|
||||
private static final DateFormat DATE_FORMAT = DateFormat.getDateTimeInstance
|
||||
(DateFormat.FULL, DateFormat.FULL, Locale.ROOT);
|
||||
static {
|
||||
DATE_FORMAT.setTimeZone(TimeZone.getTimeZone("UTC"));
|
||||
}
|
||||
|
||||
private static final String APACHE_LICENSE
|
||||
= "/*" + NL
|
||||
+ " * Licensed to the Apache Software Foundation (ASF) under one or more" + NL
|
||||
+ " * contributor license agreements. See the NOTICE file distributed with" + NL
|
||||
+ " * this work for additional information regarding copyright ownership." + NL
|
||||
+ " * The ASF licenses this file to You under the Apache License, Version 2.0" + NL
|
||||
+ " * (the \"License\"); you may not use this file except in compliance with" + NL
|
||||
+ " * the License. You may obtain a copy of the License at" + NL
|
||||
+ " *" + NL
|
||||
+ " * http://www.apache.org/licenses/LICENSE-2.0" + NL
|
||||
+ " *" + NL
|
||||
+ " * Unless required by applicable law or agreed to in writing, software" + NL
|
||||
+ " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
|
||||
+ " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
|
||||
+ " * See the License for the specific language governing permissions and" + NL
|
||||
+ " * limitations under the License." + NL
|
||||
+ " */" + NL;
|
||||
|
||||
|
||||
public static void main(String args[]) {
|
||||
outputHeader();
|
||||
outputMacro("ALetterSupp", "[:WordBreak=ALetter:]");
|
||||
outputMacro("FormatSupp", "[:WordBreak=Format:]");
|
||||
outputMacro("NumericSupp", "[:WordBreak=Numeric:]");
|
||||
outputMacro("ExtendSupp", "[:WordBreak=Extend:]");
|
||||
outputMacro("KatakanaSupp", "[:WordBreak=Katakana:]");
|
||||
outputMacro("MidLetterSupp", "[:WordBreak=MidLetter:]");
|
||||
outputMacro("MidNumSupp", "[:WordBreak=MidNum:]");
|
||||
outputMacro("MidNumLetSupp", "[:WordBreak=MidNumLet:]");
|
||||
outputMacro("ExtendNumLetSupp", "[:WordBreak=ExtendNumLet:]");
|
||||
outputMacro("ExtendNumLetSupp", "[:WordBreak=ExtendNumLet:]");
|
||||
outputMacro("ComplexContextSupp", "[:LineBreak=Complex_Context:]");
|
||||
outputMacro("HanSupp", "[:Script=Han:]");
|
||||
outputMacro("HiraganaSupp", "[:Script=Hiragana:]");
|
||||
outputMacro("SingleQuoteSupp", "[:WordBreak=Single_Quote:]");
|
||||
outputMacro("DoubleQuoteSupp", "[:WordBreak=Double_Quote:]");
|
||||
outputMacro("HebrewLetterSupp", "[:WordBreak=Hebrew_Letter:]");
|
||||
outputMacro("RegionalIndicatorSupp", "[:WordBreak=Regional_Indicator:]");
|
||||
}
|
||||
|
||||
static void outputHeader() {
|
||||
System.out.print(APACHE_LICENSE);
|
||||
System.out.println("// Generated using ICU4J " + VersionInfo.ICU_VERSION.toString());
|
||||
System.out.println("// by " + GenerateJFlexSupplementaryMacros.class.getName());
|
||||
System.out.print(NL + NL);
|
||||
}
|
||||
|
||||
// we have to carefully output the possibilities as compact utf-16
|
||||
// range expressions, or jflex will OOM!
|
||||
static void outputMacro(String name, String pattern) {
|
||||
UnicodeSet set = new UnicodeSet(pattern);
|
||||
set.removeAll(BMP);
|
||||
System.out.println(name + " = (");
|
||||
// if the set is empty, we have to do this or jflex will barf
|
||||
if (set.isEmpty()) {
|
||||
System.out.println("\t []");
|
||||
}
|
||||
|
||||
HashMap<Character,UnicodeSet> utf16ByLead = new HashMap<>();
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) {
|
||||
char utf16[] = Character.toChars(it.codepoint);
|
||||
UnicodeSet trails = utf16ByLead.get(utf16[0]);
|
||||
if (trails == null) {
|
||||
trails = new UnicodeSet();
|
||||
utf16ByLead.put(utf16[0], trails);
|
||||
}
|
||||
trails.add(utf16[1]);
|
||||
}
|
||||
|
||||
boolean isFirst = true;
|
||||
for (Character c : utf16ByLead.keySet()) {
|
||||
UnicodeSet trail = utf16ByLead.get(c);
|
||||
System.out.print( isFirst ? "\t " : "\t| ");
|
||||
isFirst = false;
|
||||
System.out.println("([\\u" + Integer.toHexString(c) + "]" + trail.getRegexEquivalent() + ")");
|
||||
}
|
||||
System.out.println(")");
|
||||
}
|
||||
}
|
|
@ -2196,7 +2196,7 @@ ${ant.project.name}.test.dependencies=${test.classpath.list}
|
|||
|
||||
<!-- JFlex task -->
|
||||
<target name="-install-jflex" unless="jflex.loaded" depends="ivy-availability-check,ivy-configure">
|
||||
<ivy:cachepath organisation="de.jflex" module="jflex" revision="1.5.1"
|
||||
<ivy:cachepath organisation="de.jflex" module="jflex" revision="1.6.0"
|
||||
inline="true" conf="default" transitive="true" pathid="jflex.classpath"/>
|
||||
<taskdef name="jflex" classname="jflex.anttask.JFlexTask" classpathref="jflex.classpath"/>
|
||||
<property name="jflex.loaded" value="true"/>
|
||||
|
|
Loading…
Reference in New Issue