mirror of https://github.com/apache/lucene.git
LUCENE-1103: The link token's position is now incremented by 1, but the next token in the link is not incremented. This way, the link is not associated with the previous term. Instead, it is associated with the next term in the link, which would be the display tokens. If there are no display tokens, then it will take its proper place in the token chain.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@608978 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d86944d06f
commit
b18f6ae959
|
@ -1,4 +1,4 @@
|
|||
/* The following code was generated by JFlex 1.4.1 on 1/3/08 10:05 PM */
|
||||
/* The following code was generated by JFlex 1.4.1 on 1/4/08 3:07 PM */
|
||||
|
||||
package org.apache.lucene.wikipedia.analysis;
|
||||
|
||||
|
@ -25,7 +25,7 @@ import org.apache.lucene.analysis.Token;
|
|||
/**
|
||||
* This class is a scanner generated by
|
||||
* <a href="http://www.jflex.de/">JFlex</a> 1.4.1
|
||||
* on 1/3/08 10:05 PM from the specification file
|
||||
* on 1/4/08 3:07 PM from the specification file
|
||||
* <tt>/Volumes/User/grantingersoll/projects/lucene/Lucene-Trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex</tt>
|
||||
*/
|
||||
class WikipediaTokenizerImpl {
|
||||
|
@ -53,11 +53,11 @@ class WikipediaTokenizerImpl {
|
|||
*/
|
||||
private static final String ZZ_CMAP_PACKED =
|
||||
"\11\0\1\24\1\23\1\0\1\24\1\22\22\0\1\24\1\0\1\12"+
|
||||
"\1\52\2\0\1\3\1\1\4\0\1\14\1\5\1\2\1\10\12\16"+
|
||||
"\1\27\1\0\1\7\1\11\1\13\1\52\1\4\2\15\1\30\5\15"+
|
||||
"\1\53\2\0\1\3\1\1\4\0\1\14\1\5\1\2\1\10\12\16"+
|
||||
"\1\27\1\0\1\7\1\11\1\13\1\53\1\4\2\15\1\30\5\15"+
|
||||
"\1\41\21\15\1\25\1\0\1\26\1\0\1\6\1\0\1\31\1\43"+
|
||||
"\2\15\1\33\1\40\1\34\1\50\1\41\4\15\1\42\1\35\1\51"+
|
||||
"\1\15\1\36\1\15\1\32\3\15\1\44\1\37\1\15\1\45\1\47"+
|
||||
"\1\15\1\36\1\52\1\32\3\15\1\44\1\37\1\15\1\45\1\47"+
|
||||
"\1\46\102\0\27\15\1\0\37\15\1\0\u0568\15\12\17\206\15\12\17"+
|
||||
"\u026c\15\12\17\166\15\12\17\166\15\12\17\166\15\12\17\166\15\12\17"+
|
||||
"\167\15\11\17\166\15\12\17\166\15\12\17\166\15\12\17\340\15\12\17"+
|
||||
|
@ -77,18 +77,19 @@ class WikipediaTokenizerImpl {
|
|||
|
||||
private static final String ZZ_ACTION_PACKED_0 =
|
||||
"\11\0\4\1\4\2\1\3\1\1\1\4\2\1\1\5"+
|
||||
"\1\1\1\6\2\7\1\10\1\11\1\10\1\12\1\13"+
|
||||
"\1\7\1\14\1\15\1\16\1\17\1\7\1\20\1\7"+
|
||||
"\4\21\1\22\1\21\1\23\1\24\1\25\3\0\1\26"+
|
||||
"\14\0\1\27\1\30\1\10\1\0\1\31\1\0\1\32"+
|
||||
"\1\0\1\33\3\0\1\34\1\35\2\36\1\35\2\37"+
|
||||
"\2\0\1\36\1\0\14\36\1\35\3\0\1\10\1\40"+
|
||||
"\3\0\1\41\1\42\5\0\1\43\4\0\1\43\2\0"+
|
||||
"\2\43\2\0\1\10\5\0\1\30\1\35\1\36\1\44"+
|
||||
"\5\0\1\45\30\0\1\46\2\0\1\47\1\50\1\51";
|
||||
"\1\1\1\6\1\1\2\7\1\10\1\11\1\10\1\12"+
|
||||
"\1\13\1\7\1\14\1\15\1\16\1\17\1\7\1\20"+
|
||||
"\1\7\4\21\1\22\1\21\1\23\1\24\1\25\3\0"+
|
||||
"\1\26\14\0\1\27\1\30\1\31\1\32\1\10\1\0"+
|
||||
"\1\33\1\0\1\34\1\0\1\35\3\0\1\36\1\37"+
|
||||
"\2\40\1\37\2\41\2\0\1\40\1\0\14\40\1\37"+
|
||||
"\3\0\1\10\1\42\3\0\1\43\1\44\5\0\1\45"+
|
||||
"\4\0\1\45\2\0\2\45\2\0\1\10\5\0\1\30"+
|
||||
"\1\37\1\40\1\46\3\0\1\10\2\0\1\47\30\0"+
|
||||
"\1\50\2\0\1\51\1\52\1\53";
|
||||
|
||||
private static int [] zzUnpackAction() {
|
||||
int [] result = new int[174];
|
||||
int [] result = new int[178];
|
||||
int offset = 0;
|
||||
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
|
||||
return result;
|
||||
|
@ -113,31 +114,32 @@ class WikipediaTokenizerImpl {
|
|||
private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
|
||||
|
||||
private static final String ZZ_ROWMAP_PACKED_0 =
|
||||
"\0\0\0\53\0\126\0\201\0\254\0\327\0\u0102\0\u012d"+
|
||||
"\0\u0158\0\u0183\0\u01ae\0\u01d9\0\u0204\0\u022f\0\u025a\0\u0285"+
|
||||
"\0\u02b0\0\u0183\0\u02db\0\u0306\0\u0331\0\u035c\0\u0387\0\u03b2"+
|
||||
"\0\u03dd\0\u0183\0\u035c\0\u0408\0\u0183\0\u0433\0\u045e\0\u0489"+
|
||||
"\0\u04b4\0\u04df\0\u050a\0\u0535\0\u0560\0\u058b\0\u05b6\0\u05e1"+
|
||||
"\0\u0183\0\u060c\0\u035c\0\u0637\0\u0662\0\u068d\0\u06b8\0\u0183"+
|
||||
"\0\u0183\0\u06e3\0\u070e\0\u0739\0\u0183\0\u0764\0\u078f\0\u07ba"+
|
||||
"\0\u07e5\0\u0810\0\u083b\0\u0866\0\u0891\0\u08bc\0\u08e7\0\u0912"+
|
||||
"\0\u093d\0\u0968\0\u0993\0\u09be\0\u09e9\0\u0a14\0\u0a3f\0\u0a6a"+
|
||||
"\0\u0a95\0\u0ac0\0\u0aeb\0\u0b16\0\u0b41\0\u0b6c\0\u0b97\0\u0bc2"+
|
||||
"\0\u0bed\0\u0c18\0\u07ba\0\u0c43\0\u0c6e\0\u0c99\0\u0cc4\0\u0cef"+
|
||||
"\0\u0d1a\0\u0d45\0\u0d70\0\u0d9b\0\u0dc6\0\u0df1\0\u0e1c\0\u0e47"+
|
||||
"\0\u0e72\0\u0e9d\0\u0ec8\0\u0ef3\0\u0f1e\0\u0f49\0\u0f74\0\u0f9f"+
|
||||
"\0\u0fca\0\u0183\0\u0ff5\0\u1020\0\u104b\0\u1076\0\u0183\0\u10a1"+
|
||||
"\0\u10cc\0\u10f7\0\u1122\0\u114d\0\u1178\0\u11a3\0\u11ce\0\u11f9"+
|
||||
"\0\u1224\0\u124f\0\u127a\0\u12a5\0\u078f\0\u0912\0\u12d0\0\u12fb"+
|
||||
"\0\u1326\0\u1351\0\u137c\0\u13a7\0\u13d2\0\u13fd\0\u0183\0\u1428"+
|
||||
"\0\u1453\0\u147e\0\u14a9\0\u14d4\0\u14ff\0\u152a\0\u1555\0\u0183"+
|
||||
"\0\u1580\0\u15ab\0\u15d6\0\u1601\0\u162c\0\u1657\0\u1682\0\u16ad"+
|
||||
"\0\u16d8\0\u1703\0\u172e\0\u1759\0\u1784\0\u17af\0\u17da\0\u1805"+
|
||||
"\0\u1830\0\u185b\0\u1886\0\u18b1\0\u18dc\0\u1907\0\u1932\0\u195d"+
|
||||
"\0\u1988\0\u19b3\0\u19de\0\u0183\0\u0183\0\u0183";
|
||||
"\0\0\0\54\0\130\0\204\0\260\0\334\0\u0108\0\u0134"+
|
||||
"\0\u0160\0\u018c\0\u01b8\0\u01e4\0\u0210\0\u023c\0\u0268\0\u0294"+
|
||||
"\0\u02c0\0\u018c\0\u02ec\0\u0318\0\u0344\0\u0370\0\u039c\0\u03c8"+
|
||||
"\0\u03f4\0\u0420\0\u018c\0\u0370\0\u044c\0\u018c\0\u0478\0\u04a4"+
|
||||
"\0\u04d0\0\u04fc\0\u0528\0\u0554\0\u0580\0\u05ac\0\u05d8\0\u0604"+
|
||||
"\0\u0630\0\u018c\0\u065c\0\u0370\0\u0688\0\u06b4\0\u06e0\0\u070c"+
|
||||
"\0\u018c\0\u018c\0\u0738\0\u0764\0\u0790\0\u018c\0\u07bc\0\u07e8"+
|
||||
"\0\u0814\0\u0840\0\u086c\0\u0898\0\u08c4\0\u08f0\0\u091c\0\u0948"+
|
||||
"\0\u0974\0\u09a0\0\u09cc\0\u09f8\0\u018c\0\u018c\0\u0a24\0\u0a50"+
|
||||
"\0\u0a7c\0\u0aa8\0\u0ad4\0\u0b00\0\u0b2c\0\u0b58\0\u0b84\0\u0bb0"+
|
||||
"\0\u0bdc\0\u0c08\0\u0c34\0\u0c60\0\u0c8c\0\u0814\0\u0cb8\0\u0ce4"+
|
||||
"\0\u0d10\0\u0d3c\0\u0d68\0\u0d94\0\u0dc0\0\u0dec\0\u0e18\0\u0e44"+
|
||||
"\0\u0e70\0\u0e9c\0\u0ec8\0\u0ef4\0\u0f20\0\u0f4c\0\u0f78\0\u0fa4"+
|
||||
"\0\u0fd0\0\u0ffc\0\u1028\0\u1054\0\u018c\0\u1080\0\u10ac\0\u10d8"+
|
||||
"\0\u1104\0\u018c\0\u1130\0\u115c\0\u1188\0\u11b4\0\u11e0\0\u120c"+
|
||||
"\0\u1238\0\u1264\0\u1290\0\u12bc\0\u12e8\0\u1314\0\u1340\0\u07e8"+
|
||||
"\0\u0974\0\u136c\0\u1398\0\u13c4\0\u13f0\0\u141c\0\u1448\0\u1474"+
|
||||
"\0\u14a0\0\u018c\0\u14cc\0\u14f8\0\u1524\0\u1550\0\u157c\0\u15a8"+
|
||||
"\0\u15d4\0\u1600\0\u162c\0\u018c\0\u1658\0\u1684\0\u16b0\0\u16dc"+
|
||||
"\0\u1708\0\u1734\0\u1760\0\u178c\0\u17b8\0\u17e4\0\u1810\0\u183c"+
|
||||
"\0\u1868\0\u1894\0\u18c0\0\u18ec\0\u1918\0\u1944\0\u1970\0\u199c"+
|
||||
"\0\u19c8\0\u19f4\0\u1a20\0\u1a4c\0\u1a78\0\u1aa4\0\u1ad0\0\u018c"+
|
||||
"\0\u018c\0\u018c";
|
||||
|
||||
private static int [] zzUnpackRowMap() {
|
||||
int [] result = new int[174];
|
||||
int [] result = new int[178];
|
||||
int offset = 0;
|
||||
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
|
||||
return result;
|
||||
|
@ -162,147 +164,149 @@ class WikipediaTokenizerImpl {
|
|||
private static final String ZZ_TRANS_PACKED_0 =
|
||||
"\1\12\1\13\5\12\1\14\1\12\1\15\3\12\1\16"+
|
||||
"\1\17\1\20\1\21\1\22\1\23\2\12\1\24\2\12"+
|
||||
"\15\16\1\25\2\12\2\16\10\12\1\26\5\12\4\27"+
|
||||
"\1\12\1\23\3\12\1\30\1\12\15\27\3\12\2\27"+
|
||||
"\10\12\1\26\5\12\4\31\1\12\1\23\3\12\1\30"+
|
||||
"\1\12\15\31\3\12\2\31\1\12\7\32\1\33\5\32"+
|
||||
"\4\34\1\32\1\23\2\12\1\32\1\35\1\32\15\34"+
|
||||
"\3\32\1\36\1\34\2\32\1\37\5\32\1\33\5\32"+
|
||||
"\4\40\1\32\1\41\2\32\1\42\2\32\15\40\3\32"+
|
||||
"\2\40\10\32\1\33\5\32\4\43\1\32\1\41\2\32"+
|
||||
"\1\42\2\32\15\43\3\32\2\43\10\32\1\33\1\32"+
|
||||
"\1\44\3\32\4\45\1\32\1\41\5\32\15\45\3\32"+
|
||||
"\2\45\10\32\1\46\5\32\4\47\1\32\1\41\5\32"+
|
||||
"\15\47\1\32\1\50\1\32\2\47\1\32\1\51\1\52"+
|
||||
"\5\51\1\53\1\51\1\54\3\51\4\55\1\51\1\56"+
|
||||
"\2\51\1\57\2\51\15\55\2\51\1\60\2\55\1\51"+
|
||||
"\54\0\1\61\61\0\1\62\4\0\4\63\7\0\6\63"+
|
||||
"\1\64\6\63\3\0\2\63\12\0\1\65\42\0\1\66"+
|
||||
"\1\67\1\70\1\71\2\72\1\0\1\73\3\0\1\73"+
|
||||
"\1\16\1\17\1\20\1\21\7\0\15\16\3\0\2\16"+
|
||||
"\3\0\1\74\1\0\1\75\2\76\1\0\1\77\3\0"+
|
||||
"\1\77\3\17\1\21\7\0\15\17\3\0\2\17\2\0"+
|
||||
"\1\66\1\100\1\70\1\71\2\76\1\0\1\77\3\0"+
|
||||
"\1\77\1\20\1\17\1\20\1\21\7\0\15\20\3\0"+
|
||||
"\2\20\3\0\1\101\1\0\1\75\2\72\1\0\1\73"+
|
||||
"\3\0\1\73\4\21\7\0\15\21\3\0\2\21\24\0"+
|
||||
"\1\12\54\0\1\102\72\0\1\103\15\0\1\62\4\0"+
|
||||
"\4\63\7\0\15\63\3\0\2\63\16\0\4\27\7\0"+
|
||||
"\15\27\3\0\2\27\27\0\1\35\41\0\4\31\7\0"+
|
||||
"\15\31\3\0\2\31\16\0\4\34\7\0\15\34\3\0"+
|
||||
"\2\34\16\0\4\34\7\0\2\34\1\104\12\34\3\0"+
|
||||
"\2\34\2\0\1\105\66\0\4\40\7\0\15\40\3\0"+
|
||||
"\2\40\24\0\1\32\54\0\1\106\42\0\4\43\7\0"+
|
||||
"\15\43\3\0\2\43\12\0\1\35\56\0\4\45\7\0"+
|
||||
"\15\45\3\0\2\45\11\0\1\107\4\0\4\63\7\0"+
|
||||
"\15\63\3\0\2\63\16\0\4\47\7\0\15\47\3\0"+
|
||||
"\2\47\47\0\1\35\5\0\1\110\62\0\1\111\56\0"+
|
||||
"\4\55\7\0\15\55\3\0\2\55\24\0\1\51\54\0"+
|
||||
"\1\112\42\0\4\63\7\0\15\63\3\0\2\63\14\0"+
|
||||
"\1\32\1\0\4\113\1\0\3\114\3\0\15\113\3\0"+
|
||||
"\2\113\14\0\1\32\1\0\4\113\1\0\3\114\3\0"+
|
||||
"\3\113\1\115\11\113\3\0\2\113\16\0\1\116\1\0"+
|
||||
"\1\116\10\0\15\116\3\0\2\116\16\0\1\117\1\120"+
|
||||
"\1\121\1\122\7\0\15\117\3\0\2\117\16\0\1\123"+
|
||||
"\1\0\1\123\10\0\15\123\3\0\2\123\16\0\1\124"+
|
||||
"\1\125\1\124\1\125\7\0\15\124\3\0\2\124\16\0"+
|
||||
"\1\126\2\127\1\130\7\0\15\126\3\0\2\126\16\0"+
|
||||
"\1\73\2\131\10\0\15\73\3\0\2\73\16\0\1\132"+
|
||||
"\2\133\1\134\7\0\15\132\3\0\2\132\16\0\4\125"+
|
||||
"\7\0\15\125\3\0\2\125\16\0\1\135\2\136\1\137"+
|
||||
"\7\0\15\135\3\0\2\135\16\0\1\140\2\141\1\142"+
|
||||
"\7\0\15\140\3\0\2\140\16\0\1\143\1\133\1\144"+
|
||||
"\1\134\7\0\15\143\3\0\2\143\16\0\1\145\2\120"+
|
||||
"\1\122\7\0\15\145\3\0\2\145\30\0\1\146\1\147"+
|
||||
"\63\0\1\150\26\0\4\34\7\0\2\34\1\151\12\34"+
|
||||
"\3\0\2\34\2\0\1\152\100\0\1\153\1\154\37\0"+
|
||||
"\4\63\7\0\6\63\1\155\6\63\3\0\2\63\2\0"+
|
||||
"\1\156\62\0\1\157\70\0\1\160\1\161\33\0\1\162"+
|
||||
"\1\0\1\32\1\0\4\113\1\0\3\114\3\0\15\113"+
|
||||
"\3\0\2\113\16\0\4\163\1\0\3\114\3\0\15\163"+
|
||||
"\3\0\2\163\12\0\1\162\1\0\1\32\1\0\4\113"+
|
||||
"\1\0\3\114\3\0\10\113\1\164\4\113\3\0\2\113"+
|
||||
"\2\0\1\66\13\0\1\116\1\0\1\116\10\0\15\116"+
|
||||
"\3\0\2\116\3\0\1\165\1\0\1\75\2\166\6\0"+
|
||||
"\1\117\1\120\1\121\1\122\7\0\15\117\3\0\2\117"+
|
||||
"\3\0\1\167\1\0\1\75\2\170\1\0\1\171\3\0"+
|
||||
"\1\171\3\120\1\122\7\0\15\120\3\0\2\120\3\0"+
|
||||
"\1\172\1\0\1\75\2\170\1\0\1\171\3\0\1\171"+
|
||||
"\1\121\1\120\1\121\1\122\7\0\15\121\3\0\2\121"+
|
||||
"\3\0\1\173\1\0\1\75\2\166\6\0\4\122\7\0"+
|
||||
"\15\122\3\0\2\122\3\0\1\174\2\0\1\174\7\0"+
|
||||
"\1\124\1\125\1\124\1\125\7\0\15\124\3\0\2\124"+
|
||||
"\3\0\1\174\2\0\1\174\7\0\4\125\7\0\15\125"+
|
||||
"\3\0\2\125\3\0\1\166\1\0\1\75\2\166\6\0"+
|
||||
"\1\126\2\127\1\130\7\0\15\126\3\0\2\126\3\0"+
|
||||
"\1\170\1\0\1\75\2\170\1\0\1\171\3\0\1\171"+
|
||||
"\3\127\1\130\7\0\15\127\3\0\2\127\3\0\1\166"+
|
||||
"\1\0\1\75\2\166\6\0\4\130\7\0\15\130\3\0"+
|
||||
"\2\130\3\0\1\171\2\0\2\171\1\0\1\171\3\0"+
|
||||
"\1\171\3\131\10\0\15\131\3\0\2\131\3\0\1\101"+
|
||||
"\1\0\1\75\2\72\1\0\1\73\3\0\1\73\1\132"+
|
||||
"\2\133\1\134\7\0\15\132\3\0\2\132\3\0\1\74"+
|
||||
"\1\0\1\75\2\76\1\0\1\77\3\0\1\77\3\133"+
|
||||
"\1\134\7\0\15\133\3\0\2\133\3\0\1\101\1\0"+
|
||||
"\1\75\2\72\1\0\1\73\3\0\1\73\4\134\7\0"+
|
||||
"\15\134\3\0\2\134\3\0\1\72\1\0\1\75\2\72"+
|
||||
"\1\0\1\73\3\0\1\73\1\135\2\136\1\137\7\0"+
|
||||
"\15\135\3\0\2\135\3\0\1\76\1\0\1\75\2\76"+
|
||||
"\1\0\1\77\3\0\1\77\3\136\1\137\7\0\15\136"+
|
||||
"\3\0\2\136\3\0\1\72\1\0\1\75\2\72\1\0"+
|
||||
"\1\73\3\0\1\73\4\137\7\0\15\137\3\0\2\137"+
|
||||
"\3\0\1\73\2\0\2\73\1\0\1\73\3\0\1\73"+
|
||||
"\1\140\2\141\1\142\7\0\15\140\3\0\2\140\3\0"+
|
||||
"\1\77\2\0\2\77\1\0\1\77\3\0\1\77\3\141"+
|
||||
"\1\142\7\0\15\141\3\0\2\141\3\0\1\73\2\0"+
|
||||
"\2\73\1\0\1\73\3\0\1\73\4\142\7\0\15\142"+
|
||||
"\3\0\2\142\3\0\1\175\1\0\1\75\2\72\1\0"+
|
||||
"\1\73\3\0\1\73\1\143\1\133\1\144\1\134\7\0"+
|
||||
"\15\143\3\0\2\143\3\0\1\176\1\0\1\75\2\76"+
|
||||
"\1\0\1\77\3\0\1\77\1\144\1\133\1\144\1\134"+
|
||||
"\7\0\15\144\3\0\2\144\3\0\1\173\1\0\1\75"+
|
||||
"\2\166\6\0\1\145\2\120\1\122\7\0\15\145\3\0"+
|
||||
"\2\145\31\0\1\147\53\0\1\177\63\0\1\200\25\0"+
|
||||
"\4\34\7\0\15\34\3\0\1\34\1\201\31\0\1\154"+
|
||||
"\53\0\1\202\34\0\1\32\1\0\4\113\1\0\3\114"+
|
||||
"\3\0\3\113\1\203\11\113\3\0\2\113\2\0\1\204"+
|
||||
"\101\0\1\161\53\0\1\205\33\0\1\206\51\0\1\162"+
|
||||
"\3\0\4\163\7\0\15\163\3\0\2\163\12\0\1\162"+
|
||||
"\1\0\1\207\1\0\4\113\1\0\3\114\3\0\15\113"+
|
||||
"\3\0\2\113\16\0\1\210\1\122\1\210\1\122\7\0"+
|
||||
"\15\210\3\0\2\210\16\0\4\130\7\0\15\130\3\0"+
|
||||
"\2\130\16\0\4\134\7\0\15\134\3\0\2\134\16\0"+
|
||||
"\4\137\7\0\15\137\3\0\2\137\16\0\4\142\7\0"+
|
||||
"\15\142\3\0\2\142\16\0\1\211\1\134\1\211\1\134"+
|
||||
"\7\0\15\211\3\0\2\211\16\0\4\122\7\0\15\122"+
|
||||
"\3\0\2\122\16\0\4\212\7\0\15\212\3\0\2\212"+
|
||||
"\33\0\1\213\60\0\1\214\27\0\4\34\6\0\1\215"+
|
||||
"\15\34\3\0\2\34\33\0\1\216\31\0\1\162\1\0"+
|
||||
"\1\32\1\0\4\113\1\0\3\114\3\0\10\113\1\217"+
|
||||
"\4\113\3\0\2\113\2\0\1\220\103\0\1\221\35\0"+
|
||||
"\4\222\7\0\15\222\3\0\2\222\3\0\1\165\1\0"+
|
||||
"\1\75\2\166\6\0\1\210\1\122\1\210\1\122\7\0"+
|
||||
"\15\210\3\0\2\210\3\0\1\175\1\0\1\75\2\72"+
|
||||
"\1\0\1\73\3\0\1\73\1\211\1\134\1\211\1\134"+
|
||||
"\7\0\15\211\3\0\2\211\3\0\1\174\2\0\1\174"+
|
||||
"\7\0\4\212\7\0\15\212\3\0\2\212\34\0\1\223"+
|
||||
"\54\0\1\224\25\0\1\225\75\0\1\226\30\0\1\162"+
|
||||
"\1\0\1\35\1\0\4\113\1\0\3\114\3\0\15\113"+
|
||||
"\3\0\2\113\34\0\1\227\31\0\1\230\2\0\4\222"+
|
||||
"\7\0\15\222\3\0\2\222\35\0\1\231\61\0\1\232"+
|
||||
"\17\0\1\233\76\0\1\234\52\0\1\235\31\0\1\32"+
|
||||
"\1\0\4\163\1\0\3\114\3\0\15\163\3\0\2\163"+
|
||||
"\36\0\1\236\52\0\1\237\32\0\4\240\7\0\15\240"+
|
||||
"\3\0\2\240\36\0\1\241\52\0\1\242\53\0\1\243"+
|
||||
"\60\0\1\244\10\0\1\245\12\0\4\240\7\0\15\240"+
|
||||
"\3\0\2\240\37\0\1\246\52\0\1\247\53\0\1\250"+
|
||||
"\21\0\1\12\61\0\4\251\7\0\15\251\3\0\2\251"+
|
||||
"\40\0\1\252\52\0\1\253\42\0\1\254\25\0\2\251"+
|
||||
"\1\0\2\251\1\0\2\251\2\0\5\251\7\0\15\251"+
|
||||
"\3\0\3\251\27\0\1\255\52\0\1\256\23\0";
|
||||
"\15\16\1\25\2\12\3\16\10\12\1\26\5\12\4\27"+
|
||||
"\1\12\1\23\3\12\1\30\1\12\15\27\3\12\3\27"+
|
||||
"\10\12\1\26\5\12\4\31\1\12\1\23\3\12\1\32"+
|
||||
"\1\12\15\31\3\12\3\31\1\12\7\33\1\34\5\33"+
|
||||
"\4\35\1\33\1\23\2\12\1\33\1\36\1\33\15\35"+
|
||||
"\3\33\1\37\2\35\2\33\1\40\5\33\1\34\5\33"+
|
||||
"\4\41\1\33\1\42\2\33\1\43\2\33\15\41\3\33"+
|
||||
"\3\41\10\33\1\34\5\33\4\44\1\33\1\42\2\33"+
|
||||
"\1\43\2\33\15\44\3\33\3\44\10\33\1\34\1\33"+
|
||||
"\1\45\3\33\4\46\1\33\1\42\5\33\15\46\3\33"+
|
||||
"\3\46\10\33\1\47\5\33\4\50\1\33\1\42\5\33"+
|
||||
"\15\50\1\33\1\51\1\33\3\50\1\33\1\52\1\53"+
|
||||
"\5\52\1\54\1\52\1\55\3\52\4\56\1\52\1\57"+
|
||||
"\2\52\1\60\2\52\15\56\2\52\1\61\3\56\1\52"+
|
||||
"\55\0\1\62\62\0\1\63\4\0\4\64\7\0\6\64"+
|
||||
"\1\65\6\64\3\0\3\64\12\0\1\66\43\0\1\67"+
|
||||
"\1\70\1\71\1\72\2\73\1\0\1\74\3\0\1\74"+
|
||||
"\1\16\1\17\1\20\1\21\7\0\15\16\3\0\3\16"+
|
||||
"\3\0\1\75\1\0\1\76\2\77\1\0\1\100\3\0"+
|
||||
"\1\100\3\17\1\21\7\0\15\17\3\0\3\17\2\0"+
|
||||
"\1\67\1\101\1\71\1\72\2\77\1\0\1\100\3\0"+
|
||||
"\1\100\1\20\1\17\1\20\1\21\7\0\15\20\3\0"+
|
||||
"\3\20\3\0\1\102\1\0\1\76\2\73\1\0\1\74"+
|
||||
"\3\0\1\74\4\21\7\0\15\21\3\0\3\21\24\0"+
|
||||
"\1\12\55\0\1\103\73\0\1\104\16\0\1\63\4\0"+
|
||||
"\4\64\7\0\15\64\3\0\3\64\16\0\4\27\7\0"+
|
||||
"\15\27\3\0\3\27\27\0\1\105\42\0\4\31\7\0"+
|
||||
"\15\31\3\0\3\31\27\0\1\106\42\0\4\35\7\0"+
|
||||
"\15\35\3\0\3\35\16\0\4\35\7\0\2\35\1\107"+
|
||||
"\12\35\3\0\3\35\2\0\1\110\67\0\4\41\7\0"+
|
||||
"\15\41\3\0\3\41\24\0\1\33\55\0\1\111\43\0"+
|
||||
"\4\44\7\0\15\44\3\0\3\44\12\0\1\105\57\0"+
|
||||
"\4\46\7\0\15\46\3\0\3\46\11\0\1\112\4\0"+
|
||||
"\4\64\7\0\15\64\3\0\3\64\16\0\4\50\7\0"+
|
||||
"\15\50\3\0\3\50\47\0\1\105\6\0\1\113\63\0"+
|
||||
"\1\114\57\0\4\56\7\0\15\56\3\0\3\56\24\0"+
|
||||
"\1\52\55\0\1\115\43\0\4\64\7\0\15\64\3\0"+
|
||||
"\3\64\14\0\1\33\1\0\4\116\1\0\3\117\3\0"+
|
||||
"\15\116\3\0\3\116\14\0\1\33\1\0\4\116\1\0"+
|
||||
"\3\117\3\0\3\116\1\120\11\116\3\0\3\116\16\0"+
|
||||
"\1\121\1\0\1\121\10\0\15\121\3\0\3\121\16\0"+
|
||||
"\1\122\1\123\1\124\1\125\7\0\15\122\3\0\3\122"+
|
||||
"\16\0\1\126\1\0\1\126\10\0\15\126\3\0\3\126"+
|
||||
"\16\0\1\127\1\130\1\127\1\130\7\0\15\127\3\0"+
|
||||
"\3\127\16\0\1\131\2\132\1\133\7\0\15\131\3\0"+
|
||||
"\3\131\16\0\1\74\2\134\10\0\15\74\3\0\3\74"+
|
||||
"\16\0\1\135\2\136\1\137\7\0\15\135\3\0\3\135"+
|
||||
"\16\0\4\130\7\0\15\130\3\0\3\130\16\0\1\140"+
|
||||
"\2\141\1\142\7\0\15\140\3\0\3\140\16\0\1\143"+
|
||||
"\2\144\1\145\7\0\15\143\3\0\3\143\16\0\1\146"+
|
||||
"\1\136\1\147\1\137\7\0\15\146\3\0\3\146\16\0"+
|
||||
"\1\150\2\123\1\125\7\0\15\150\3\0\3\150\30\0"+
|
||||
"\1\151\1\152\64\0\1\153\27\0\4\35\7\0\2\35"+
|
||||
"\1\154\12\35\3\0\3\35\2\0\1\155\101\0\1\156"+
|
||||
"\1\157\40\0\4\64\7\0\6\64\1\160\6\64\3\0"+
|
||||
"\3\64\2\0\1\161\63\0\1\162\71\0\1\163\1\164"+
|
||||
"\34\0\1\165\1\0\1\33\1\0\4\116\1\0\3\117"+
|
||||
"\3\0\15\116\3\0\3\116\16\0\4\166\1\0\3\117"+
|
||||
"\3\0\15\166\3\0\3\166\12\0\1\165\1\0\1\33"+
|
||||
"\1\0\4\116\1\0\3\117\3\0\10\116\1\167\4\116"+
|
||||
"\3\0\3\116\2\0\1\67\13\0\1\121\1\0\1\121"+
|
||||
"\10\0\15\121\3\0\3\121\3\0\1\170\1\0\1\76"+
|
||||
"\2\171\6\0\1\122\1\123\1\124\1\125\7\0\15\122"+
|
||||
"\3\0\3\122\3\0\1\172\1\0\1\76\2\173\1\0"+
|
||||
"\1\174\3\0\1\174\3\123\1\125\7\0\15\123\3\0"+
|
||||
"\3\123\3\0\1\175\1\0\1\76\2\173\1\0\1\174"+
|
||||
"\3\0\1\174\1\124\1\123\1\124\1\125\7\0\15\124"+
|
||||
"\3\0\3\124\3\0\1\176\1\0\1\76\2\171\6\0"+
|
||||
"\4\125\7\0\15\125\3\0\3\125\3\0\1\177\2\0"+
|
||||
"\1\177\7\0\1\127\1\130\1\127\1\130\7\0\15\127"+
|
||||
"\3\0\3\127\3\0\1\177\2\0\1\177\7\0\4\130"+
|
||||
"\7\0\15\130\3\0\3\130\3\0\1\171\1\0\1\76"+
|
||||
"\2\171\6\0\1\131\2\132\1\133\7\0\15\131\3\0"+
|
||||
"\3\131\3\0\1\173\1\0\1\76\2\173\1\0\1\174"+
|
||||
"\3\0\1\174\3\132\1\133\7\0\15\132\3\0\3\132"+
|
||||
"\3\0\1\171\1\0\1\76\2\171\6\0\4\133\7\0"+
|
||||
"\15\133\3\0\3\133\3\0\1\174\2\0\2\174\1\0"+
|
||||
"\1\174\3\0\1\174\3\134\10\0\15\134\3\0\3\134"+
|
||||
"\3\0\1\102\1\0\1\76\2\73\1\0\1\74\3\0"+
|
||||
"\1\74\1\135\2\136\1\137\7\0\15\135\3\0\3\135"+
|
||||
"\3\0\1\75\1\0\1\76\2\77\1\0\1\100\3\0"+
|
||||
"\1\100\3\136\1\137\7\0\15\136\3\0\3\136\3\0"+
|
||||
"\1\102\1\0\1\76\2\73\1\0\1\74\3\0\1\74"+
|
||||
"\4\137\7\0\15\137\3\0\3\137\3\0\1\73\1\0"+
|
||||
"\1\76\2\73\1\0\1\74\3\0\1\74\1\140\2\141"+
|
||||
"\1\142\7\0\15\140\3\0\3\140\3\0\1\77\1\0"+
|
||||
"\1\76\2\77\1\0\1\100\3\0\1\100\3\141\1\142"+
|
||||
"\7\0\15\141\3\0\3\141\3\0\1\73\1\0\1\76"+
|
||||
"\2\73\1\0\1\74\3\0\1\74\4\142\7\0\15\142"+
|
||||
"\3\0\3\142\3\0\1\74\2\0\2\74\1\0\1\74"+
|
||||
"\3\0\1\74\1\143\2\144\1\145\7\0\15\143\3\0"+
|
||||
"\3\143\3\0\1\100\2\0\2\100\1\0\1\100\3\0"+
|
||||
"\1\100\3\144\1\145\7\0\15\144\3\0\3\144\3\0"+
|
||||
"\1\74\2\0\2\74\1\0\1\74\3\0\1\74\4\145"+
|
||||
"\7\0\15\145\3\0\3\145\3\0\1\200\1\0\1\76"+
|
||||
"\2\73\1\0\1\74\3\0\1\74\1\146\1\136\1\147"+
|
||||
"\1\137\7\0\15\146\3\0\3\146\3\0\1\201\1\0"+
|
||||
"\1\76\2\77\1\0\1\100\3\0\1\100\1\147\1\136"+
|
||||
"\1\147\1\137\7\0\15\147\3\0\3\147\3\0\1\176"+
|
||||
"\1\0\1\76\2\171\6\0\1\150\2\123\1\125\7\0"+
|
||||
"\15\150\3\0\3\150\31\0\1\152\54\0\1\202\64\0"+
|
||||
"\1\203\26\0\4\35\7\0\15\35\3\0\1\35\1\204"+
|
||||
"\1\35\31\0\1\157\54\0\1\205\35\0\1\33\1\0"+
|
||||
"\4\116\1\0\3\117\3\0\3\116\1\206\11\116\3\0"+
|
||||
"\3\116\2\0\1\207\102\0\1\164\54\0\1\210\34\0"+
|
||||
"\1\211\52\0\1\165\3\0\4\166\7\0\15\166\3\0"+
|
||||
"\3\166\12\0\1\165\1\0\1\212\1\0\4\116\1\0"+
|
||||
"\3\117\3\0\15\116\3\0\3\116\16\0\1\213\1\125"+
|
||||
"\1\213\1\125\7\0\15\213\3\0\3\213\16\0\4\133"+
|
||||
"\7\0\15\133\3\0\3\133\16\0\4\137\7\0\15\137"+
|
||||
"\3\0\3\137\16\0\4\142\7\0\15\142\3\0\3\142"+
|
||||
"\16\0\4\145\7\0\15\145\3\0\3\145\16\0\1\214"+
|
||||
"\1\137\1\214\1\137\7\0\15\214\3\0\3\214\16\0"+
|
||||
"\4\125\7\0\15\125\3\0\3\125\16\0\4\215\7\0"+
|
||||
"\15\215\3\0\3\215\33\0\1\216\61\0\1\217\30\0"+
|
||||
"\4\35\6\0\1\220\15\35\3\0\2\35\1\221\33\0"+
|
||||
"\1\222\32\0\1\165\1\0\1\33\1\0\4\116\1\0"+
|
||||
"\3\117\3\0\10\116\1\223\4\116\3\0\3\116\2\0"+
|
||||
"\1\224\104\0\1\225\36\0\4\226\7\0\15\226\3\0"+
|
||||
"\3\226\3\0\1\170\1\0\1\76\2\171\6\0\1\213"+
|
||||
"\1\125\1\213\1\125\7\0\15\213\3\0\3\213\3\0"+
|
||||
"\1\200\1\0\1\76\2\73\1\0\1\74\3\0\1\74"+
|
||||
"\1\214\1\137\1\214\1\137\7\0\15\214\3\0\3\214"+
|
||||
"\3\0\1\177\2\0\1\177\7\0\4\215\7\0\15\215"+
|
||||
"\3\0\3\215\34\0\1\227\55\0\1\230\26\0\1\231"+
|
||||
"\60\0\4\35\6\0\1\220\15\35\3\0\3\35\34\0"+
|
||||
"\1\232\31\0\1\165\1\0\1\105\1\0\4\116\1\0"+
|
||||
"\3\117\3\0\15\116\3\0\3\116\34\0\1\233\32\0"+
|
||||
"\1\234\2\0\4\226\7\0\15\226\3\0\3\226\35\0"+
|
||||
"\1\235\62\0\1\236\20\0\1\237\77\0\1\240\53\0"+
|
||||
"\1\241\32\0\1\33\1\0\4\166\1\0\3\117\3\0"+
|
||||
"\15\166\3\0\3\166\36\0\1\242\53\0\1\243\33\0"+
|
||||
"\4\244\7\0\15\244\3\0\3\244\36\0\1\245\53\0"+
|
||||
"\1\246\54\0\1\247\61\0\1\250\11\0\1\251\12\0"+
|
||||
"\4\244\7\0\15\244\3\0\3\244\37\0\1\252\53\0"+
|
||||
"\1\253\54\0\1\254\22\0\1\12\62\0\4\255\7\0"+
|
||||
"\15\255\3\0\3\255\40\0\1\256\53\0\1\257\43\0"+
|
||||
"\1\260\26\0\2\255\1\0\2\255\1\0\2\255\2\0"+
|
||||
"\5\255\7\0\15\255\3\0\4\255\27\0\1\261\53\0"+
|
||||
"\1\262\24\0";
|
||||
|
||||
private static int [] zzUnpackTrans() {
|
||||
int [] result = new int[6665];
|
||||
int [] result = new int[6908];
|
||||
int offset = 0;
|
||||
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
|
||||
return result;
|
||||
|
@ -340,16 +344,16 @@ class WikipediaTokenizerImpl {
|
|||
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
|
||||
|
||||
private static final String ZZ_ATTRIBUTE_PACKED_0 =
|
||||
"\11\0\1\11\7\1\1\11\7\1\1\11\2\1\1\11"+
|
||||
"\13\1\1\11\6\1\2\11\3\0\1\11\14\0\3\1"+
|
||||
"\1\0\1\1\1\0\1\1\1\0\1\1\3\0\7\1"+
|
||||
"\2\0\1\1\1\0\15\1\3\0\1\1\1\11\3\0"+
|
||||
"\1\1\1\11\5\0\1\1\4\0\1\1\2\0\2\1"+
|
||||
"\2\0\1\1\5\0\1\11\3\1\5\0\1\11\30\0"+
|
||||
"\1\1\2\0\3\11";
|
||||
"\11\0\1\11\7\1\1\11\10\1\1\11\2\1\1\11"+
|
||||
"\13\1\1\11\6\1\2\11\3\0\1\11\14\0\2\1"+
|
||||
"\2\11\1\1\1\0\1\1\1\0\1\1\1\0\1\1"+
|
||||
"\3\0\7\1\2\0\1\1\1\0\15\1\3\0\1\1"+
|
||||
"\1\11\3\0\1\1\1\11\5\0\1\1\4\0\1\1"+
|
||||
"\2\0\2\1\2\0\1\1\5\0\1\11\3\1\3\0"+
|
||||
"\1\1\2\0\1\11\30\0\1\1\2\0\3\11";
|
||||
|
||||
private static int [] zzUnpackAttribute() {
|
||||
int [] result = new int[174];
|
||||
int [] result = new int[178];
|
||||
int offset = 0;
|
||||
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
|
||||
return result;
|
||||
|
@ -441,6 +445,7 @@ public static final int EXTERNAL_LINK_URL = 17;
|
|||
private int currentTokType;
|
||||
private int numBalanced = 0;
|
||||
private int positionInc = 1;
|
||||
private int numLinkToks = 0;
|
||||
|
||||
public static final String [] TOKEN_TYPES = new String [] {
|
||||
"<ALPHANUM>",
|
||||
|
@ -772,167 +777,175 @@ final void getText(Token t, int tokType) {
|
|||
case 7:
|
||||
{ /* ignore */
|
||||
}
|
||||
case 42: break;
|
||||
case 44: break;
|
||||
case 3:
|
||||
{ positionInc = 1; return CJ;
|
||||
}
|
||||
case 43: break;
|
||||
case 26:
|
||||
case 45: break;
|
||||
case 28:
|
||||
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/
|
||||
}
|
||||
case 44: break;
|
||||
case 37:
|
||||
case 46: break;
|
||||
case 9:
|
||||
{ numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL);
|
||||
}
|
||||
case 47: break;
|
||||
case 39:
|
||||
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/
|
||||
}
|
||||
case 45: break;
|
||||
case 48: break;
|
||||
case 11:
|
||||
{ currentTokType = ITALICS; yybegin(STRING); return currentTokType;/*italics*/
|
||||
}
|
||||
case 46: break;
|
||||
case 49: break;
|
||||
case 23:
|
||||
{ positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);
|
||||
}
|
||||
case 50: break;
|
||||
case 5:
|
||||
{ yybegin(CATEGORY_STATE); return currentTokType;
|
||||
}
|
||||
case 47: break;
|
||||
case 34:
|
||||
case 51: break;
|
||||
case 36:
|
||||
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/
|
||||
}
|
||||
case 48: break;
|
||||
case 52: break;
|
||||
case 8:
|
||||
{ if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
|
||||
}
|
||||
case 53: break;
|
||||
case 24:
|
||||
{ positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);
|
||||
}
|
||||
case 49: break;
|
||||
case 54: break;
|
||||
case 22:
|
||||
{ positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);
|
||||
}
|
||||
case 50: break;
|
||||
case 39:
|
||||
case 55: break;
|
||||
case 41:
|
||||
{ positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);
|
||||
}
|
||||
case 51: break;
|
||||
case 56: break;
|
||||
case 18:
|
||||
{ yybegin(STRING); return currentTokType;/* STRING ALPHANUM*/
|
||||
}
|
||||
case 52: break;
|
||||
case 57: break;
|
||||
case 21:
|
||||
{ positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}
|
||||
}
|
||||
case 53: break;
|
||||
case 58: break;
|
||||
case 1:
|
||||
{ positionInc = 1;
|
||||
}
|
||||
case 54: break;
|
||||
case 41:
|
||||
case 59: break;
|
||||
case 43:
|
||||
{ numBalanced = 0;currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
|
||||
}
|
||||
case 55: break;
|
||||
case 9:
|
||||
case 60: break;
|
||||
case 25:
|
||||
{ yybegin(YYINITIAL);
|
||||
}
|
||||
case 56: break;
|
||||
case 61: break;
|
||||
case 40:
|
||||
{ positionInc = 1; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
|
||||
}
|
||||
case 62: break;
|
||||
case 19:
|
||||
{ numBalanced = 0;currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);
|
||||
}
|
||||
case 57: break;
|
||||
case 63: break;
|
||||
case 13:
|
||||
{ yybegin(STRING);return currentTokType;
|
||||
}
|
||||
case 58: break;
|
||||
case 36:
|
||||
case 64: break;
|
||||
case 38:
|
||||
{ positionInc = 1; return EMAIL;
|
||||
}
|
||||
case 59: break;
|
||||
case 35:
|
||||
case 65: break;
|
||||
case 37:
|
||||
{ positionInc = 1; return ACRONYM;
|
||||
}
|
||||
case 60: break;
|
||||
case 66: break;
|
||||
case 4:
|
||||
{ positionInc = 1;currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);
|
||||
}
|
||||
case 61: break;
|
||||
case 67: break;
|
||||
case 17:
|
||||
{ /* ignore STRING */
|
||||
}
|
||||
case 62: break;
|
||||
case 40:
|
||||
case 68: break;
|
||||
case 42:
|
||||
{ currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
|
||||
}
|
||||
case 63: break;
|
||||
case 69: break;
|
||||
case 20:
|
||||
{ yybegin(STRING); return currentTokType;/*pipe*/
|
||||
}
|
||||
case 64: break;
|
||||
case 70: break;
|
||||
case 12:
|
||||
{ currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);
|
||||
}
|
||||
case 65: break;
|
||||
case 27:
|
||||
case 71: break;
|
||||
case 29:
|
||||
{ numBalanced = 0;currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
|
||||
}
|
||||
case 66: break;
|
||||
case 33:
|
||||
case 72: break;
|
||||
case 35:
|
||||
{ numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/
|
||||
}
|
||||
case 67: break;
|
||||
case 73: break;
|
||||
case 16:
|
||||
{ yybegin(DOUBLE_BRACE_STATE); return currentTokType;
|
||||
}
|
||||
case 68: break;
|
||||
case 29:
|
||||
case 74: break;
|
||||
case 31:
|
||||
{ positionInc = 1; return HOST;
|
||||
}
|
||||
case 69: break;
|
||||
case 32:
|
||||
case 75: break;
|
||||
case 34:
|
||||
{ currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE);
|
||||
}
|
||||
case 70: break;
|
||||
case 25:
|
||||
case 76: break;
|
||||
case 27:
|
||||
{ currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
|
||||
}
|
||||
case 71: break;
|
||||
case 23:
|
||||
{ positionInc = 0; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);
|
||||
}
|
||||
case 72: break;
|
||||
case 77: break;
|
||||
case 14:
|
||||
{ currentTokType = SUB_HEADING; yybegin(STRING);
|
||||
}
|
||||
case 73: break;
|
||||
case 28:
|
||||
case 78: break;
|
||||
case 30:
|
||||
{ positionInc = 1; return APOSTROPHE;
|
||||
}
|
||||
case 74: break;
|
||||
case 30:
|
||||
case 79: break;
|
||||
case 32:
|
||||
{ positionInc = 1; return NUM;
|
||||
}
|
||||
case 75: break;
|
||||
case 80: break;
|
||||
case 15:
|
||||
{ currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); return currentTokType;
|
||||
}
|
||||
case 76: break;
|
||||
case 6:
|
||||
{ yybegin(INTERNAL_LINK_STATE); return currentTokType;
|
||||
}
|
||||
case 77: break;
|
||||
case 81: break;
|
||||
case 2:
|
||||
{ positionInc = 1; return ALPHANUM;
|
||||
}
|
||||
case 78: break;
|
||||
case 31:
|
||||
case 82: break;
|
||||
case 33:
|
||||
{ positionInc = 1; return COMPANY;
|
||||
}
|
||||
case 79: break;
|
||||
case 83: break;
|
||||
case 10:
|
||||
{ currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE);
|
||||
}
|
||||
case 80: break;
|
||||
case 8:
|
||||
{ positionInc = 1; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE);return currentTokType;
|
||||
case 84: break;
|
||||
case 6:
|
||||
{ if (numLinkToks == 1){positionInc = 0;} else{positionInc = 1;} yybegin(INTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
|
||||
}
|
||||
case 81: break;
|
||||
case 38:
|
||||
{ positionInc = 0; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
|
||||
case 85: break;
|
||||
case 26:
|
||||
{ numLinkToks = 0; yybegin(YYINITIAL);
|
||||
}
|
||||
case 82: break;
|
||||
case 86: break;
|
||||
default:
|
||||
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
|
||||
zzAtEOF = true;
|
||||
|
|
|
@ -53,6 +53,7 @@ public static final int EXTERNAL_LINK_URL = 17;
|
|||
private int currentTokType;
|
||||
private int numBalanced = 0;
|
||||
private int positionInc = 1;
|
||||
private int numLinkToks = 0;
|
||||
|
||||
public static final String [] TOKEN_TYPES = new String [] {
|
||||
"<ALPHANUM>",
|
||||
|
@ -187,8 +188,10 @@ DOUBLE_EQUALS = "="{2}
|
|||
|
||||
//wikipedia
|
||||
<YYINITIAL>{
|
||||
//First {ALPHANUM} is always the link, set position to 0 for double bracket
|
||||
{DOUBLE_BRACKET} {positionInc = 0; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);}
|
||||
//First {ALPHANUM} is always the link, set positionInc to 1 for double bracket, but then inside the internal link state
|
||||
//set it to 0 for the next token, such that the link and the first token are in the same position, but then subsequent
|
||||
//tokens within the link are incremented
|
||||
{DOUBLE_BRACKET} {positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);}
|
||||
{DOUBLE_BRACKET_CAT} {positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);}
|
||||
{EXTERNAL_LINK} {positionInc = 1;currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);}
|
||||
{TWO_SINGLE_QUOTES} {positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}}
|
||||
|
@ -201,16 +204,18 @@ DOUBLE_EQUALS = "="{2}
|
|||
|
||||
<INTERNAL_LINK_STATE>{
|
||||
//First {ALPHANUM} is always the link, set position to 0 for these
|
||||
{ALPHANUM} {yybegin(INTERNAL_LINK_STATE); return currentTokType;}
|
||||
{DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);}
|
||||
//This is slightly different from EXTERNAL_LINK_STATE because that one has an explicit grammar for capturing the URL
|
||||
{ALPHANUM} {if (numLinkToks == 1){positionInc = 0;} else{positionInc = 1;} yybegin(INTERNAL_LINK_STATE); numLinkToks++; return currentTokType;}
|
||||
{DOUBLE_BRACKET_CLOSE} {numLinkToks = 0; yybegin(YYINITIAL);}
|
||||
//ignore
|
||||
. | {WHITESPACE} { positionInc = 1; }
|
||||
}
|
||||
|
||||
<EXTERNAL_LINK_STATE>{
|
||||
"http://"{HOST}("/"?({ALPHANUM}|{P}|\?|"&"|"="|"#")*)* {positionInc = 0; yybegin(EXTERNAL_LINK_STATE); return currentTokType;}
|
||||
{ALPHANUM} {positionInc = 1; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE);return currentTokType;}
|
||||
"]" {yybegin(YYINITIAL);}
|
||||
//increment the link (URL) token, but don't increment the first token after it, so that both share a position; later tokens still in the link are incremented
|
||||
("http://"|"https://"){HOST}("/"?({ALPHANUM}|{P}|\?|"&"|"="|"#")*)* {positionInc = 1; yybegin(EXTERNAL_LINK_STATE); return currentTokType;}
|
||||
{ALPHANUM} {if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;}
|
||||
"]" {numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL);}
|
||||
{WHITESPACE} { positionInc = 1; }
|
||||
}
|
||||
|
||||
|
|
|
@ -21,10 +21,9 @@ package org.apache.lucene.wikipedia.analysis;
|
|||
import junit.framework.TestCase;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.StringReader;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -156,7 +155,7 @@ public class WikipediaTokenizerTest extends TestCase {
|
|||
}
|
||||
|
||||
public void testLinkPhrases() throws Exception {
|
||||
String test = "click [[link here]] click [http://lucene.apache.org here]";
|
||||
String test = "click [[link here again]] click [http://lucene.apache.org here again]";
|
||||
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
|
||||
Token token = new Token();
|
||||
token = tf.next(token);
|
||||
|
@ -166,11 +165,17 @@ public class WikipediaTokenizerTest extends TestCase {
|
|||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link", new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
|
||||
//The link and "here" should be at the same position for phrases to work
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "again",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("again") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
|
||||
token = tf.next(token);
|
||||
|
@ -183,18 +188,24 @@ public class WikipediaTokenizerTest extends TestCase {
|
|||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
|
||||
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "again",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("again") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
|
||||
}
|
||||
|
||||
public void testLinks() throws Exception {
|
||||
String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here]";
|
||||
String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here] [https://lucene.apache.org/java/docs/index.html?b=c here]";
|
||||
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
|
||||
Token token = new Token();
|
||||
token = tf.next(token);
|
||||
|
@ -204,10 +215,15 @@ public class WikipediaTokenizerTest extends TestCase {
|
|||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
||||
tf.next(token);//skip here
|
||||
token = tf.next(token);
|
||||
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org/java/docs/index.html?b=c",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org/java/docs/index.html?b=c") == true);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
||||
tf.next(token);//skip here
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "https://lucene.apache.org/java/docs/index.html?b=c",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("https://lucene.apache.org/java/docs/index.html?b=c") == true);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue