LUCENE-1133: Adds ability to keep certain strings as single tokens

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@614895 13f79535-47bb-0310-9956-ffa450edef68
Grant Ingersoll 2008-01-24 15:05:53 +00:00
parent 1f0e88f186
commit 305c47f500
4 changed files with 842 additions and 329 deletions
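For context, here is a minimal, hypothetical usage sketch (not part of this commit) of the new constructor and output modes added below. The WikipediaTokenizer constructor and the TOKENS_ONLY, UNTOKENIZED_ONLY, BOTH, and UNTOKENIZED_TOKEN_FLAG constants are taken from the diff and its tests; the driver class, its name, and the sample text are illustrative only.

import org.apache.lucene.analysis.Token;
import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;

import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;

public class KeepAsSingleTokenDemo {
  public static void main(String[] args) throws Exception {
    // Token types that should be kept as single (untokenized) strings.
    Set untoks = new HashSet();
    untoks.add(WikipediaTokenizer.CATEGORY);
    untoks.add(WikipediaTokenizer.ITALICS);

    String wiki = "[[Category:a b c d]] ''italics here'' something";

    // UNTOKENIZED_ONLY collapses each category/italics run into one token;
    // BOTH emits the collapsed token (flagged with UNTOKENIZED_TOKEN_FLAG)
    // followed by the individual tokens; TOKENS_ONLY keeps the old behavior.
    WikipediaTokenizer tf =
        new WikipediaTokenizer(new StringReader(wiki), WikipediaTokenizer.UNTOKENIZED_ONLY, untoks);
    for (Token t = tf.next(new Token()); t != null; t = tf.next(new Token())) {
      System.out.println(new String(t.termBuffer(), 0, t.termLength()) + " [" + t.type() + "]");
    }
  }
}

With UNTOKENIZED_ONLY, the category above comes back as the single token "a b c d", matching the assertions in the new testLucene1133 test at the bottom of this commit.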


@ -22,17 +22,17 @@ import org.apache.lucene.analysis.Tokenizer;
import java.io.IOException;
import java.io.Reader;
import java.util.*;
/**
* Extension of StandardTokenizer that is aware of Wikipedia syntax. It is based on the
* Wikipedia tutorial available at http://en.wikipedia.org/wiki/Wikipedia:Tutorial, but it may not be complete.
*
* <p/>
* <p/>
* EXPERIMENTAL !!!!!!!!!
* NOTE: This Tokenizer is considered experimental and the grammar is subject to change in the trunk and in follow up releases.
*
**/
*/
public class WikipediaTokenizer extends Tokenizer {
public static final String INTERNAL_LINK = "il";
public static final String EXTERNAL_LINK = "el";
@ -45,11 +45,21 @@ public class WikipediaTokenizer extends Tokenizer {
public static final String BOLD_ITALICS = "bi";
public static final String HEADING = "h";
public static final String SUB_HEADING = "sh";
public static final int TOKENS_ONLY = 0;
public static final int UNTOKENIZED_ONLY = 1;
public static final int BOTH = 2;
public static final int UNTOKENIZED_TOKEN_FLAG = 1;
/**
* A private instance of the JFlex-constructed scanner
*/
private final WikipediaTokenizerImpl scanner;
private int tokenOutput = TOKENS_ONLY;
private Set untokenizedTypes = Collections.EMPTY_SET;
private Iterator tokens = null;
void setInput(Reader reader) {
this.input = reader;
}
@ -57,11 +67,19 @@ public class WikipediaTokenizer extends Tokenizer {
/**
* Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
* <code>input</code> to a newly created JFlex scanner.
*
* @param input The Input Reader
*/
public WikipediaTokenizer(Reader input) {
this.input = input;
this(input, TOKENS_ONLY, Collections.EMPTY_SET);
}
public WikipediaTokenizer(Reader input, int tokenOutput, Set untokenizedTypes) {
super(input);
this.tokenOutput = tokenOutput;
this.scanner = new WikipediaTokenizerImpl(input);
this.untokenizedTypes = untokenizedTypes;
}
/*
@ -70,19 +88,116 @@ public class WikipediaTokenizer extends Tokenizer {
* @see org.apache.lucene.analysis.TokenStream#next()
*/
public Token next(Token result) throws IOException {
if (tokens != null && tokens.hasNext()){
return (Token)tokens.next();
}
int tokenType = scanner.getNextToken();
if (tokenType == WikipediaTokenizerImpl.YYEOF) {
return null;
}
String type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType];
if (tokenOutput == TOKENS_ONLY || untokenizedTypes.contains(type) == false){
setupToken(result);
} else if (tokenOutput == UNTOKENIZED_ONLY && untokenizedTypes.contains(type) == true){
collapseTokens(result, tokenType);
scanner.getText(result, tokenType);
}
else if (tokenOutput == BOTH){
//collapse into a single token, add it to tokens AND output the individual tokens
//output the untokenized Token first
collapseAndSaveTokens(result, tokenType, type);
}
result.setPositionIncrement(scanner.getPositionIncrement());
result.setType(type);
return result;
}
private void collapseAndSaveTokens(Token result, int tokenType, String type) throws IOException {
//collapse
StringBuffer buffer = new StringBuffer(32);
int numAdded = scanner.setText(buffer);
//TODO: how to know how much whitespace to add
int theStart = scanner.yychar();
int lastPos = theStart + numAdded;
int tmpTokType;
int numSeen = 0;
List tmp = new ArrayList();
Token saved = new Token();
setupSavedToken(saved, 0, type);
tmp.add(saved);
//keep collapsing while we can get a token, the token is the same type, and we have not transitioned to a new wiki item of the same type
while ((tmpTokType = scanner.getNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.getNumWikiTokensSeen() > numSeen){
int currPos = scanner.yychar();
//append whitespace
for (int i = 0; i < (currPos - lastPos); i++){
buffer.append(' ');
}
numAdded = scanner.setText(buffer);
saved = new Token();
setupSavedToken(saved, scanner.getPositionIncrement(), type);
tmp.add(saved);
numSeen++;
lastPos = currPos + numAdded;
}
//trim the buffer
String s = buffer.toString().trim();
result.setTermBuffer(s.toCharArray(), 0, s.length());
result.setStartOffset(theStart);
result.setEndOffset(theStart + s.length());
result.setFlags(UNTOKENIZED_TOKEN_FLAG);
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
scanner.yypushback(scanner.yylength());
}
tokens = tmp.iterator();
}
private void setupSavedToken(Token saved, int positionInc, String type){
setupToken(saved);
saved.setPositionIncrement(positionInc);
saved.setType(type);
}
private void collapseTokens(Token result, int tokenType) throws IOException {
//collapse
StringBuffer buffer = new StringBuffer(32);
int numAdded = scanner.setText(buffer);
//TODO: how to know how much whitespace to add
int theStart = scanner.yychar();
int lastPos = theStart + numAdded;
int tmpTokType;
int numSeen = 0;
//keep collapsing while we can get a token, the token is the same type, and we have not transitioned to a new wiki item of the same type
while ((tmpTokType = scanner.getNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.getNumWikiTokensSeen() > numSeen){
int currPos = scanner.yychar();
//append whitespace
for (int i = 0; i < (currPos - lastPos); i++){
buffer.append(' ');
}
numAdded = scanner.setText(buffer);
numSeen++;
lastPos = currPos + numAdded;
}
//trim the buffer
String s = buffer.toString().trim();
result.setTermBuffer(s.toCharArray(), 0, s.length());
result.setStartOffset(theStart);
result.setEndOffset(theStart + s.length());
result.setFlags(UNTOKENIZED_TOKEN_FLAG);
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
scanner.yypushback(scanner.yylength());
} else {
tokens = null;
}
}
private void setupToken(Token result) {
scanner.getText(result);
final int start = scanner.yychar();
result.setStartOffset(start);
result.setEndOffset(start + result.termLength());
result.setPositionIncrement(scanner.getPositionIncrement());
result.setType(WikipediaTokenizerImpl.TOKEN_TYPES[tokenType]);
return result;
}
/*


@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.4.1 on 1/4/08 3:30 PM */
/* The following code was generated by JFlex 1.4.1 on 1/16/08 10:31 AM */
package org.apache.lucene.wikipedia.analysis;
@ -25,7 +25,7 @@ import org.apache.lucene.analysis.Token;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.4.1
* on 1/4/08 3:30 PM from the specification file
* on 1/16/08 10:31 AM from the specification file
* <tt>/Volumes/User/grantingersoll/projects/lucene/Lucene-Trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex</tt>
*/
class WikipediaTokenizerImpl {
@ -37,14 +37,14 @@ class WikipediaTokenizerImpl {
private static final int ZZ_BUFFERSIZE = 16384;
/** lexical states */
public static final int DOUBLE_BRACE_STATE = 7;
public static final int DOUBLE_BRACE_STATE = 8;
public static final int INTERNAL_LINK_STATE = 2;
public static final int TWO_SINGLE_QUOTES_STATE = 4;
public static final int CATEGORY_STATE = 1;
public static final int FIVE_SINGLE_QUOTES_STATE = 5;
public static final int STRING = 8;
public static final int FIVE_SINGLE_QUOTES_STATE = 6;
public static final int STRING = 9;
public static final int YYINITIAL = 0;
public static final int DOUBLE_EQUALS_STATE = 6;
public static final int DOUBLE_EQUALS_STATE = 7;
public static final int THREE_SINGLE_QUOTES_STATE = 5;
public static final int EXTERNAL_LINK_STATE = 3;
@ -76,20 +76,20 @@ class WikipediaTokenizerImpl {
private static final int [] ZZ_ACTION = zzUnpackAction();
private static final String ZZ_ACTION_PACKED_0 =
"\11\0\4\1\4\2\1\3\1\1\1\4\2\1\1\5"+
"\1\1\1\6\1\1\2\7\1\10\1\11\1\10\1\12"+
"\1\13\1\7\1\14\1\15\1\16\1\17\1\7\1\20"+
"\1\7\4\21\1\22\1\21\1\23\1\24\1\25\3\0"+
"\1\26\14\0\1\27\1\30\1\31\1\32\1\10\1\0"+
"\1\33\1\0\1\34\1\0\1\35\3\0\1\36\1\37"+
"\2\40\1\37\2\41\2\0\1\40\1\0\14\40\1\37"+
"\3\0\1\10\1\42\3\0\1\43\1\44\5\0\1\45"+
"\4\0\1\45\2\0\2\45\2\0\1\10\5\0\1\30"+
"\1\37\1\40\1\46\3\0\1\10\2\0\1\47\30\0"+
"\1\50\2\0\1\51\1\52\1\53";
"\12\0\4\1\4\2\1\3\1\1\1\4\1\1\2\5"+
"\1\6\2\5\1\7\1\5\2\10\1\11\1\12\1\11"+
"\1\13\1\14\1\10\1\15\1\16\1\15\1\17\1\20"+
"\1\10\1\21\1\10\4\22\1\23\1\22\1\24\1\25"+
"\1\26\3\0\1\27\14\0\1\30\1\31\1\32\1\33"+
"\1\11\1\0\1\34\1\35\1\0\1\36\1\0\1\37"+
"\3\0\1\40\1\41\2\42\1\41\2\43\2\0\1\42"+
"\1\0\14\42\1\41\3\0\1\11\1\44\3\0\1\45"+
"\1\46\5\0\1\47\4\0\1\47\2\0\2\47\2\0"+
"\1\11\5\0\1\31\1\41\1\42\1\50\3\0\1\11"+
"\2\0\1\51\30\0\1\52\2\0\1\53\1\54\1\55";
private static int [] zzUnpackAction() {
int [] result = new int[178];
int [] result = new int[183];
int offset = 0;
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
return result;
@ -116,30 +116,30 @@ class WikipediaTokenizerImpl {
private static final String ZZ_ROWMAP_PACKED_0 =
"\0\0\0\54\0\130\0\204\0\260\0\334\0\u0108\0\u0134"+
"\0\u0160\0\u018c\0\u01b8\0\u01e4\0\u0210\0\u023c\0\u0268\0\u0294"+
"\0\u02c0\0\u018c\0\u02ec\0\u0318\0\u0344\0\u0370\0\u039c\0\u03c8"+
"\0\u03f4\0\u0420\0\u018c\0\u0370\0\u044c\0\u018c\0\u0478\0\u04a4"+
"\0\u04d0\0\u04fc\0\u0528\0\u0554\0\u0580\0\u05ac\0\u05d8\0\u0604"+
"\0\u0630\0\u018c\0\u065c\0\u0370\0\u0688\0\u06b4\0\u06e0\0\u070c"+
"\0\u018c\0\u018c\0\u0738\0\u0764\0\u0790\0\u018c\0\u07bc\0\u07e8"+
"\0\u0814\0\u0840\0\u086c\0\u0898\0\u08c4\0\u08f0\0\u091c\0\u0948"+
"\0\u0974\0\u09a0\0\u09cc\0\u09f8\0\u018c\0\u018c\0\u0a24\0\u0a50"+
"\0\u0a7c\0\u0aa8\0\u0ad4\0\u0b00\0\u0b2c\0\u0b58\0\u0b84\0\u0bb0"+
"\0\u0bdc\0\u0c08\0\u0c34\0\u0c60\0\u0c8c\0\u0814\0\u0cb8\0\u0ce4"+
"\0\u0d10\0\u0d3c\0\u0d68\0\u0d94\0\u0dc0\0\u0dec\0\u0e18\0\u0e44"+
"\0\u0e70\0\u0e9c\0\u0ec8\0\u0ef4\0\u0f20\0\u0f4c\0\u0f78\0\u0fa4"+
"\0\u0fd0\0\u0ffc\0\u1028\0\u1054\0\u018c\0\u1080\0\u10ac\0\u10d8"+
"\0\u1104\0\u018c\0\u1130\0\u115c\0\u1188\0\u11b4\0\u11e0\0\u120c"+
"\0\u1238\0\u1264\0\u1290\0\u12bc\0\u12e8\0\u1314\0\u1340\0\u07e8"+
"\0\u0974\0\u136c\0\u1398\0\u13c4\0\u13f0\0\u141c\0\u1448\0\u1474"+
"\0\u14a0\0\u018c\0\u14cc\0\u14f8\0\u1524\0\u1550\0\u157c\0\u15a8"+
"\0\u15d4\0\u1600\0\u162c\0\u018c\0\u1658\0\u1684\0\u16b0\0\u16dc"+
"\0\u1708\0\u1734\0\u1760\0\u178c\0\u17b8\0\u17e4\0\u1810\0\u183c"+
"\0\u1868\0\u1894\0\u18c0\0\u18ec\0\u1918\0\u1944\0\u1970\0\u199c"+
"\0\u19c8\0\u19f4\0\u1a20\0\u1a4c\0\u1a78\0\u1aa4\0\u1ad0\0\u018c"+
"\0\u018c\0\u018c";
"\0\u02c0\0\u02ec\0\u01b8\0\u0318\0\u0344\0\u0370\0\u01b8\0\u039c"+
"\0\u03c8\0\u03f4\0\u0420\0\u044c\0\u0478\0\u01b8\0\u039c\0\u04a4"+
"\0\u01b8\0\u04d0\0\u04fc\0\u0528\0\u0554\0\u0580\0\u05ac\0\u05d8"+
"\0\u0604\0\u0630\0\u065c\0\u0688\0\u06b4\0\u01b8\0\u06e0\0\u039c"+
"\0\u070c\0\u0738\0\u0764\0\u0790\0\u01b8\0\u01b8\0\u07bc\0\u07e8"+
"\0\u0814\0\u01b8\0\u0840\0\u086c\0\u0898\0\u08c4\0\u08f0\0\u091c"+
"\0\u0948\0\u0974\0\u09a0\0\u09cc\0\u09f8\0\u0a24\0\u0a50\0\u0a7c"+
"\0\u01b8\0\u01b8\0\u0aa8\0\u0ad4\0\u0b00\0\u0b00\0\u0b2c\0\u0b58"+
"\0\u0b84\0\u0bb0\0\u0bdc\0\u0c08\0\u0c34\0\u0c60\0\u0c8c\0\u0cb8"+
"\0\u0ce4\0\u0d10\0\u0898\0\u0d3c\0\u0d68\0\u0d94\0\u0dc0\0\u0dec"+
"\0\u0e18\0\u0e44\0\u0e70\0\u0e9c\0\u0ec8\0\u0ef4\0\u0f20\0\u0f4c"+
"\0\u0f78\0\u0fa4\0\u0fd0\0\u0ffc\0\u1028\0\u1054\0\u1080\0\u10ac"+
"\0\u10d8\0\u01b8\0\u1104\0\u1130\0\u115c\0\u1188\0\u01b8\0\u11b4"+
"\0\u11e0\0\u120c\0\u1238\0\u1264\0\u1290\0\u12bc\0\u12e8\0\u1314"+
"\0\u1340\0\u136c\0\u1398\0\u13c4\0\u086c\0\u09f8\0\u13f0\0\u141c"+
"\0\u1448\0\u1474\0\u14a0\0\u14cc\0\u14f8\0\u1524\0\u01b8\0\u1550"+
"\0\u157c\0\u15a8\0\u15d4\0\u1600\0\u162c\0\u1658\0\u1684\0\u16b0"+
"\0\u01b8\0\u16dc\0\u1708\0\u1734\0\u1760\0\u178c\0\u17b8\0\u17e4"+
"\0\u1810\0\u183c\0\u1868\0\u1894\0\u18c0\0\u18ec\0\u1918\0\u1944"+
"\0\u1970\0\u199c\0\u19c8\0\u19f4\0\u1a20\0\u1a4c\0\u1a78\0\u1aa4"+
"\0\u1ad0\0\u1afc\0\u1b28\0\u1b54\0\u01b8\0\u01b8\0\u01b8";
private static int [] zzUnpackRowMap() {
int [] result = new int[178];
int [] result = new int[183];
int offset = 0;
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
return result;
@ -162,151 +162,153 @@ class WikipediaTokenizerImpl {
private static final int [] ZZ_TRANS = zzUnpackTrans();
private static final String ZZ_TRANS_PACKED_0 =
"\1\12\1\13\5\12\1\14\1\12\1\15\3\12\1\16"+
"\1\17\1\20\1\21\1\22\1\23\2\12\1\24\2\12"+
"\15\16\1\25\2\12\3\16\10\12\1\26\5\12\4\27"+
"\1\12\1\23\3\12\1\30\1\12\15\27\3\12\3\27"+
"\10\12\1\26\5\12\4\31\1\12\1\23\3\12\1\32"+
"\1\12\15\31\3\12\3\31\1\12\7\33\1\34\5\33"+
"\4\35\1\33\1\23\2\12\1\33\1\36\1\33\15\35"+
"\3\33\1\37\2\35\2\33\1\40\5\33\1\34\5\33"+
"\4\41\1\33\1\42\2\33\1\43\2\33\15\41\3\33"+
"\3\41\10\33\1\34\5\33\4\44\1\33\1\42\2\33"+
"\1\43\2\33\15\44\3\33\3\44\10\33\1\34\1\33"+
"\1\45\3\33\4\46\1\33\1\42\5\33\15\46\3\33"+
"\3\46\10\33\1\47\5\33\4\50\1\33\1\42\5\33"+
"\15\50\1\33\1\51\1\33\3\50\1\33\1\52\1\53"+
"\5\52\1\54\1\52\1\55\3\52\4\56\1\52\1\57"+
"\2\52\1\60\2\52\15\56\2\52\1\61\3\56\1\52"+
"\55\0\1\62\62\0\1\63\4\0\4\64\7\0\6\64"+
"\1\65\6\64\3\0\3\64\12\0\1\66\43\0\1\67"+
"\1\70\1\71\1\72\2\73\1\0\1\74\3\0\1\74"+
"\1\16\1\17\1\20\1\21\7\0\15\16\3\0\3\16"+
"\3\0\1\75\1\0\1\76\2\77\1\0\1\100\3\0"+
"\1\100\3\17\1\21\7\0\15\17\3\0\3\17\2\0"+
"\1\67\1\101\1\71\1\72\2\77\1\0\1\100\3\0"+
"\1\100\1\20\1\17\1\20\1\21\7\0\15\20\3\0"+
"\3\20\3\0\1\102\1\0\1\76\2\73\1\0\1\74"+
"\3\0\1\74\4\21\7\0\15\21\3\0\3\21\24\0"+
"\1\12\55\0\1\103\73\0\1\104\16\0\1\63\4\0"+
"\4\64\7\0\15\64\3\0\3\64\16\0\4\27\7\0"+
"\15\27\3\0\3\27\27\0\1\105\42\0\4\31\7\0"+
"\15\31\3\0\3\31\27\0\1\106\42\0\4\35\7\0"+
"\15\35\3\0\3\35\16\0\4\35\7\0\2\35\1\107"+
"\12\35\3\0\3\35\2\0\1\110\67\0\4\41\7\0"+
"\15\41\3\0\3\41\24\0\1\33\55\0\1\111\43\0"+
"\4\44\7\0\15\44\3\0\3\44\12\0\1\105\57\0"+
"\4\46\7\0\15\46\3\0\3\46\11\0\1\112\4\0"+
"\4\64\7\0\15\64\3\0\3\64\16\0\4\50\7\0"+
"\15\50\3\0\3\50\47\0\1\105\6\0\1\113\63\0"+
"\1\114\57\0\4\56\7\0\15\56\3\0\3\56\24\0"+
"\1\52\55\0\1\115\43\0\4\64\7\0\15\64\3\0"+
"\3\64\14\0\1\33\1\0\4\116\1\0\3\117\3\0"+
"\15\116\3\0\3\116\14\0\1\33\1\0\4\116\1\0"+
"\3\117\3\0\3\116\1\120\11\116\3\0\3\116\16\0"+
"\1\121\1\0\1\121\10\0\15\121\3\0\3\121\16\0"+
"\1\122\1\123\1\124\1\125\7\0\15\122\3\0\3\122"+
"\1\13\1\14\5\13\1\15\1\13\1\16\3\13\1\17"+
"\1\20\1\21\1\22\1\23\1\24\2\13\1\25\2\13"+
"\15\17\1\26\2\13\3\17\1\13\7\27\1\30\5\27"+
"\4\31\1\27\1\32\3\27\1\33\1\27\15\31\3\27"+
"\3\31\10\27\1\30\5\27\4\34\1\27\1\32\3\27"+
"\1\35\1\27\15\34\3\27\3\34\1\27\7\36\1\37"+
"\5\36\4\40\1\36\1\32\2\27\1\36\1\41\1\36"+
"\15\40\3\36\1\42\2\40\2\36\1\43\5\36\1\37"+
"\5\36\4\44\1\36\1\45\2\36\1\46\2\36\15\44"+
"\3\36\3\44\10\36\1\37\5\36\4\47\1\36\1\45"+
"\2\36\1\46\2\36\15\47\3\36\3\47\10\36\1\37"+
"\5\36\4\47\1\36\1\45\2\36\1\50\2\36\15\47"+
"\3\36\3\47\10\36\1\37\1\36\1\51\3\36\4\52"+
"\1\36\1\45\5\36\15\52\3\36\3\52\10\36\1\53"+
"\5\36\4\54\1\36\1\45\5\36\15\54\1\36\1\55"+
"\1\36\3\54\1\36\1\56\1\57\5\56\1\60\1\56"+
"\1\61\3\56\4\62\1\56\1\63\2\56\1\64\2\56"+
"\15\62\2\56\1\65\3\62\1\56\55\0\1\66\62\0"+
"\1\67\4\0\4\70\7\0\6\70\1\71\6\70\3\0"+
"\3\70\12\0\1\72\43\0\1\73\1\74\1\75\1\76"+
"\2\77\1\0\1\100\3\0\1\100\1\17\1\20\1\21"+
"\1\22\7\0\15\17\3\0\3\17\3\0\1\101\1\0"+
"\1\102\2\103\1\0\1\104\3\0\1\104\3\20\1\22"+
"\7\0\15\20\3\0\3\20\2\0\1\73\1\105\1\75"+
"\1\76\2\103\1\0\1\104\3\0\1\104\1\21\1\20"+
"\1\21\1\22\7\0\15\21\3\0\3\21\3\0\1\106"+
"\1\0\1\102\2\77\1\0\1\100\3\0\1\100\4\22"+
"\7\0\15\22\3\0\3\22\24\0\1\13\55\0\1\107"+
"\73\0\1\110\16\0\1\67\4\0\4\70\7\0\15\70"+
"\3\0\3\70\16\0\4\31\7\0\15\31\3\0\3\31"+
"\24\0\1\27\56\0\1\111\42\0\4\34\7\0\15\34"+
"\3\0\3\34\27\0\1\112\42\0\4\40\7\0\15\40"+
"\3\0\3\40\16\0\4\40\7\0\2\40\1\113\12\40"+
"\3\0\3\40\2\0\1\114\67\0\4\44\7\0\15\44"+
"\3\0\3\44\24\0\1\36\55\0\1\115\43\0\4\47"+
"\7\0\15\47\3\0\3\47\26\0\1\116\37\0\1\111"+
"\57\0\4\52\7\0\15\52\3\0\3\52\11\0\1\117"+
"\4\0\4\70\7\0\15\70\3\0\3\70\16\0\4\54"+
"\7\0\15\54\3\0\3\54\47\0\1\111\6\0\1\120"+
"\63\0\1\121\57\0\4\62\7\0\15\62\3\0\3\62"+
"\24\0\1\56\55\0\1\122\43\0\4\70\7\0\15\70"+
"\3\0\3\70\14\0\1\36\1\0\4\123\1\0\3\124"+
"\3\0\15\123\3\0\3\123\14\0\1\36\1\0\4\123"+
"\1\0\3\124\3\0\3\123\1\125\11\123\3\0\3\123"+
"\16\0\1\126\1\0\1\126\10\0\15\126\3\0\3\126"+
"\16\0\1\127\1\130\1\127\1\130\7\0\15\127\3\0"+
"\3\127\16\0\1\131\2\132\1\133\7\0\15\131\3\0"+
"\3\131\16\0\1\74\2\134\10\0\15\74\3\0\3\74"+
"\16\0\1\135\2\136\1\137\7\0\15\135\3\0\3\135"+
"\16\0\4\130\7\0\15\130\3\0\3\130\16\0\1\140"+
"\2\141\1\142\7\0\15\140\3\0\3\140\16\0\1\143"+
"\2\144\1\145\7\0\15\143\3\0\3\143\16\0\1\146"+
"\1\136\1\147\1\137\7\0\15\146\3\0\3\146\16\0"+
"\1\150\2\123\1\125\7\0\15\150\3\0\3\150\30\0"+
"\1\151\1\152\64\0\1\153\27\0\4\35\7\0\2\35"+
"\1\154\12\35\3\0\3\35\2\0\1\155\101\0\1\156"+
"\1\157\40\0\4\64\7\0\6\64\1\160\6\64\3\0"+
"\3\64\2\0\1\161\63\0\1\162\71\0\1\163\1\164"+
"\34\0\1\165\1\0\1\33\1\0\4\116\1\0\3\117"+
"\3\0\15\116\3\0\3\116\16\0\4\166\1\0\3\117"+
"\3\0\15\166\3\0\3\166\12\0\1\165\1\0\1\33"+
"\1\0\4\116\1\0\3\117\3\0\10\116\1\167\4\116"+
"\3\0\3\116\2\0\1\67\13\0\1\121\1\0\1\121"+
"\10\0\15\121\3\0\3\121\3\0\1\170\1\0\1\76"+
"\2\171\6\0\1\122\1\123\1\124\1\125\7\0\15\122"+
"\3\0\3\122\3\0\1\172\1\0\1\76\2\173\1\0"+
"\1\174\3\0\1\174\3\123\1\125\7\0\15\123\3\0"+
"\3\123\3\0\1\175\1\0\1\76\2\173\1\0\1\174"+
"\3\0\1\174\1\124\1\123\1\124\1\125\7\0\15\124"+
"\3\0\3\124\3\0\1\176\1\0\1\76\2\171\6\0"+
"\4\125\7\0\15\125\3\0\3\125\3\0\1\177\2\0"+
"\1\177\7\0\1\127\1\130\1\127\1\130\7\0\15\127"+
"\3\0\3\127\3\0\1\177\2\0\1\177\7\0\4\130"+
"\7\0\15\130\3\0\3\130\3\0\1\171\1\0\1\76"+
"\2\171\6\0\1\131\2\132\1\133\7\0\15\131\3\0"+
"\3\131\3\0\1\173\1\0\1\76\2\173\1\0\1\174"+
"\3\0\1\174\3\132\1\133\7\0\15\132\3\0\3\132"+
"\3\0\1\171\1\0\1\76\2\171\6\0\4\133\7\0"+
"\15\133\3\0\3\133\3\0\1\174\2\0\2\174\1\0"+
"\1\174\3\0\1\174\3\134\10\0\15\134\3\0\3\134"+
"\3\0\1\102\1\0\1\76\2\73\1\0\1\74\3\0"+
"\1\74\1\135\2\136\1\137\7\0\15\135\3\0\3\135"+
"\3\0\1\75\1\0\1\76\2\77\1\0\1\100\3\0"+
"\1\100\3\136\1\137\7\0\15\136\3\0\3\136\3\0"+
"\1\102\1\0\1\76\2\73\1\0\1\74\3\0\1\74"+
"\4\137\7\0\15\137\3\0\3\137\3\0\1\73\1\0"+
"\1\76\2\73\1\0\1\74\3\0\1\74\1\140\2\141"+
"\1\142\7\0\15\140\3\0\3\140\3\0\1\77\1\0"+
"\1\76\2\77\1\0\1\100\3\0\1\100\3\141\1\142"+
"\7\0\15\141\3\0\3\141\3\0\1\73\1\0\1\76"+
"\2\73\1\0\1\74\3\0\1\74\4\142\7\0\15\142"+
"\3\0\3\142\3\0\1\74\2\0\2\74\1\0\1\74"+
"\3\0\1\74\1\143\2\144\1\145\7\0\15\143\3\0"+
"\3\143\3\0\1\100\2\0\2\100\1\0\1\100\3\0"+
"\1\100\3\144\1\145\7\0\15\144\3\0\3\144\3\0"+
"\1\74\2\0\2\74\1\0\1\74\3\0\1\74\4\145"+
"\7\0\15\145\3\0\3\145\3\0\1\200\1\0\1\76"+
"\2\73\1\0\1\74\3\0\1\74\1\146\1\136\1\147"+
"\1\137\7\0\15\146\3\0\3\146\3\0\1\201\1\0"+
"\1\76\2\77\1\0\1\100\3\0\1\100\1\147\1\136"+
"\1\147\1\137\7\0\15\147\3\0\3\147\3\0\1\176"+
"\1\0\1\76\2\171\6\0\1\150\2\123\1\125\7\0"+
"\15\150\3\0\3\150\31\0\1\152\54\0\1\202\64\0"+
"\1\203\26\0\4\35\7\0\15\35\3\0\1\35\1\204"+
"\1\35\31\0\1\157\54\0\1\205\35\0\1\33\1\0"+
"\4\116\1\0\3\117\3\0\3\116\1\206\11\116\3\0"+
"\3\116\2\0\1\207\102\0\1\164\54\0\1\210\34\0"+
"\1\211\52\0\1\165\3\0\4\166\7\0\15\166\3\0"+
"\3\166\12\0\1\165\1\0\1\212\1\0\4\116\1\0"+
"\3\117\3\0\15\116\3\0\3\116\16\0\1\213\1\125"+
"\1\213\1\125\7\0\15\213\3\0\3\213\16\0\4\133"+
"\7\0\15\133\3\0\3\133\16\0\4\137\7\0\15\137"+
"\3\0\3\137\16\0\4\142\7\0\15\142\3\0\3\142"+
"\16\0\4\145\7\0\15\145\3\0\3\145\16\0\1\214"+
"\1\137\1\214\1\137\7\0\15\214\3\0\3\214\16\0"+
"\4\125\7\0\15\125\3\0\3\125\16\0\4\215\7\0"+
"\15\215\3\0\3\215\33\0\1\216\61\0\1\217\30\0"+
"\4\35\6\0\1\220\15\35\3\0\2\35\1\221\33\0"+
"\1\222\32\0\1\165\1\0\1\33\1\0\4\116\1\0"+
"\3\117\3\0\10\116\1\223\4\116\3\0\3\116\2\0"+
"\1\224\104\0\1\225\36\0\4\226\7\0\15\226\3\0"+
"\3\226\3\0\1\170\1\0\1\76\2\171\6\0\1\213"+
"\1\125\1\213\1\125\7\0\15\213\3\0\3\213\3\0"+
"\1\200\1\0\1\76\2\73\1\0\1\74\3\0\1\74"+
"\1\214\1\137\1\214\1\137\7\0\15\214\3\0\3\214"+
"\3\0\1\177\2\0\1\177\7\0\4\215\7\0\15\215"+
"\3\0\3\215\34\0\1\227\55\0\1\230\26\0\1\231"+
"\60\0\4\35\6\0\1\220\15\35\3\0\3\35\34\0"+
"\1\232\31\0\1\165\1\0\1\105\1\0\4\116\1\0"+
"\3\117\3\0\15\116\3\0\3\116\34\0\1\233\32\0"+
"\1\234\2\0\4\226\7\0\15\226\3\0\3\226\35\0"+
"\1\235\62\0\1\236\20\0\1\237\77\0\1\240\53\0"+
"\1\241\32\0\1\33\1\0\4\166\1\0\3\117\3\0"+
"\15\166\3\0\3\166\36\0\1\242\53\0\1\243\33\0"+
"\4\244\7\0\15\244\3\0\3\244\36\0\1\245\53\0"+
"\1\246\54\0\1\247\61\0\1\250\11\0\1\251\12\0"+
"\4\244\7\0\15\244\3\0\3\244\37\0\1\252\53\0"+
"\1\253\54\0\1\254\22\0\1\12\62\0\4\255\7\0"+
"\15\255\3\0\3\255\40\0\1\256\53\0\1\257\43\0"+
"\1\260\26\0\2\255\1\0\2\255\1\0\2\255\2\0"+
"\5\255\7\0\15\255\3\0\4\255\27\0\1\261\53\0"+
"\1\262\24\0";
"\16\0\1\127\1\130\1\131\1\132\7\0\15\127\3\0"+
"\3\127\16\0\1\133\1\0\1\133\10\0\15\133\3\0"+
"\3\133\16\0\1\134\1\135\1\134\1\135\7\0\15\134"+
"\3\0\3\134\16\0\1\136\2\137\1\140\7\0\15\136"+
"\3\0\3\136\16\0\1\100\2\141\10\0\15\100\3\0"+
"\3\100\16\0\1\142\2\143\1\144\7\0\15\142\3\0"+
"\3\142\16\0\4\135\7\0\15\135\3\0\3\135\16\0"+
"\1\145\2\146\1\147\7\0\15\145\3\0\3\145\16\0"+
"\1\150\2\151\1\152\7\0\15\150\3\0\3\150\16\0"+
"\1\153\1\143\1\154\1\144\7\0\15\153\3\0\3\153"+
"\16\0\1\155\2\130\1\132\7\0\15\155\3\0\3\155"+
"\30\0\1\156\1\157\64\0\1\160\27\0\4\40\7\0"+
"\2\40\1\161\12\40\3\0\3\40\2\0\1\162\101\0"+
"\1\163\1\164\40\0\4\70\7\0\6\70\1\165\6\70"+
"\3\0\3\70\2\0\1\166\63\0\1\167\71\0\1\170"+
"\1\171\34\0\1\172\1\0\1\36\1\0\4\123\1\0"+
"\3\124\3\0\15\123\3\0\3\123\16\0\4\173\1\0"+
"\3\124\3\0\15\173\3\0\3\173\12\0\1\172\1\0"+
"\1\36\1\0\4\123\1\0\3\124\3\0\10\123\1\174"+
"\4\123\3\0\3\123\2\0\1\73\13\0\1\126\1\0"+
"\1\126\10\0\15\126\3\0\3\126\3\0\1\175\1\0"+
"\1\102\2\176\6\0\1\127\1\130\1\131\1\132\7\0"+
"\15\127\3\0\3\127\3\0\1\177\1\0\1\102\2\200"+
"\1\0\1\201\3\0\1\201\3\130\1\132\7\0\15\130"+
"\3\0\3\130\3\0\1\202\1\0\1\102\2\200\1\0"+
"\1\201\3\0\1\201\1\131\1\130\1\131\1\132\7\0"+
"\15\131\3\0\3\131\3\0\1\203\1\0\1\102\2\176"+
"\6\0\4\132\7\0\15\132\3\0\3\132\3\0\1\204"+
"\2\0\1\204\7\0\1\134\1\135\1\134\1\135\7\0"+
"\15\134\3\0\3\134\3\0\1\204\2\0\1\204\7\0"+
"\4\135\7\0\15\135\3\0\3\135\3\0\1\176\1\0"+
"\1\102\2\176\6\0\1\136\2\137\1\140\7\0\15\136"+
"\3\0\3\136\3\0\1\200\1\0\1\102\2\200\1\0"+
"\1\201\3\0\1\201\3\137\1\140\7\0\15\137\3\0"+
"\3\137\3\0\1\176\1\0\1\102\2\176\6\0\4\140"+
"\7\0\15\140\3\0\3\140\3\0\1\201\2\0\2\201"+
"\1\0\1\201\3\0\1\201\3\141\10\0\15\141\3\0"+
"\3\141\3\0\1\106\1\0\1\102\2\77\1\0\1\100"+
"\3\0\1\100\1\142\2\143\1\144\7\0\15\142\3\0"+
"\3\142\3\0\1\101\1\0\1\102\2\103\1\0\1\104"+
"\3\0\1\104\3\143\1\144\7\0\15\143\3\0\3\143"+
"\3\0\1\106\1\0\1\102\2\77\1\0\1\100\3\0"+
"\1\100\4\144\7\0\15\144\3\0\3\144\3\0\1\77"+
"\1\0\1\102\2\77\1\0\1\100\3\0\1\100\1\145"+
"\2\146\1\147\7\0\15\145\3\0\3\145\3\0\1\103"+
"\1\0\1\102\2\103\1\0\1\104\3\0\1\104\3\146"+
"\1\147\7\0\15\146\3\0\3\146\3\0\1\77\1\0"+
"\1\102\2\77\1\0\1\100\3\0\1\100\4\147\7\0"+
"\15\147\3\0\3\147\3\0\1\100\2\0\2\100\1\0"+
"\1\100\3\0\1\100\1\150\2\151\1\152\7\0\15\150"+
"\3\0\3\150\3\0\1\104\2\0\2\104\1\0\1\104"+
"\3\0\1\104\3\151\1\152\7\0\15\151\3\0\3\151"+
"\3\0\1\100\2\0\2\100\1\0\1\100\3\0\1\100"+
"\4\152\7\0\15\152\3\0\3\152\3\0\1\205\1\0"+
"\1\102\2\77\1\0\1\100\3\0\1\100\1\153\1\143"+
"\1\154\1\144\7\0\15\153\3\0\3\153\3\0\1\206"+
"\1\0\1\102\2\103\1\0\1\104\3\0\1\104\1\154"+
"\1\143\1\154\1\144\7\0\15\154\3\0\3\154\3\0"+
"\1\203\1\0\1\102\2\176\6\0\1\155\2\130\1\132"+
"\7\0\15\155\3\0\3\155\31\0\1\157\54\0\1\207"+
"\64\0\1\210\26\0\4\40\7\0\15\40\3\0\1\40"+
"\1\211\1\40\31\0\1\164\54\0\1\212\35\0\1\36"+
"\1\0\4\123\1\0\3\124\3\0\3\123\1\213\11\123"+
"\3\0\3\123\2\0\1\214\102\0\1\171\54\0\1\215"+
"\34\0\1\216\52\0\1\172\3\0\4\173\7\0\15\173"+
"\3\0\3\173\12\0\1\172\1\0\1\217\1\0\4\123"+
"\1\0\3\124\3\0\15\123\3\0\3\123\16\0\1\220"+
"\1\132\1\220\1\132\7\0\15\220\3\0\3\220\16\0"+
"\4\140\7\0\15\140\3\0\3\140\16\0\4\144\7\0"+
"\15\144\3\0\3\144\16\0\4\147\7\0\15\147\3\0"+
"\3\147\16\0\4\152\7\0\15\152\3\0\3\152\16\0"+
"\1\221\1\144\1\221\1\144\7\0\15\221\3\0\3\221"+
"\16\0\4\132\7\0\15\132\3\0\3\132\16\0\4\222"+
"\7\0\15\222\3\0\3\222\33\0\1\223\61\0\1\224"+
"\30\0\4\40\6\0\1\225\15\40\3\0\2\40\1\226"+
"\33\0\1\227\32\0\1\172\1\0\1\36\1\0\4\123"+
"\1\0\3\124\3\0\10\123\1\230\4\123\3\0\3\123"+
"\2\0\1\231\104\0\1\232\36\0\4\233\7\0\15\233"+
"\3\0\3\233\3\0\1\175\1\0\1\102\2\176\6\0"+
"\1\220\1\132\1\220\1\132\7\0\15\220\3\0\3\220"+
"\3\0\1\205\1\0\1\102\2\77\1\0\1\100\3\0"+
"\1\100\1\221\1\144\1\221\1\144\7\0\15\221\3\0"+
"\3\221\3\0\1\204\2\0\1\204\7\0\4\222\7\0"+
"\15\222\3\0\3\222\34\0\1\234\55\0\1\235\26\0"+
"\1\236\60\0\4\40\6\0\1\225\15\40\3\0\3\40"+
"\34\0\1\237\31\0\1\172\1\0\1\111\1\0\4\123"+
"\1\0\3\124\3\0\15\123\3\0\3\123\34\0\1\240"+
"\32\0\1\241\2\0\4\233\7\0\15\233\3\0\3\233"+
"\35\0\1\242\62\0\1\243\20\0\1\244\77\0\1\245"+
"\53\0\1\246\32\0\1\36\1\0\4\173\1\0\3\124"+
"\3\0\15\173\3\0\3\173\36\0\1\247\53\0\1\250"+
"\33\0\4\251\7\0\15\251\3\0\3\251\36\0\1\252"+
"\53\0\1\253\54\0\1\254\61\0\1\255\11\0\1\256"+
"\12\0\4\251\7\0\15\251\3\0\3\251\37\0\1\257"+
"\53\0\1\260\54\0\1\261\22\0\1\13\62\0\4\262"+
"\7\0\15\262\3\0\3\262\40\0\1\263\53\0\1\264"+
"\43\0\1\265\26\0\2\262\1\0\2\262\1\0\2\262"+
"\2\0\5\262\7\0\15\262\3\0\4\262\27\0\1\266"+
"\53\0\1\267\24\0";
private static int [] zzUnpackTrans() {
int [] result = new int[6908];
int [] result = new int[7040];
int offset = 0;
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
return result;
@ -344,16 +346,17 @@ class WikipediaTokenizerImpl {
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
private static final String ZZ_ATTRIBUTE_PACKED_0 =
"\11\0\1\11\7\1\1\11\10\1\1\11\2\1\1\11"+
"\13\1\1\11\6\1\2\11\3\0\1\11\14\0\2\1"+
"\2\11\1\1\1\0\1\1\1\0\1\1\1\0\1\1"+
"\3\0\7\1\2\0\1\1\1\0\15\1\3\0\1\1"+
"\1\11\3\0\1\1\1\11\5\0\1\1\4\0\1\1"+
"\2\0\2\1\2\0\1\1\5\0\1\11\3\1\3\0"+
"\1\1\2\0\1\11\30\0\1\1\2\0\3\11";
"\12\0\1\11\7\1\1\11\3\1\1\11\6\1\1\11"+
"\2\1\1\11\14\1\1\11\6\1\2\11\3\0\1\11"+
"\14\0\2\1\2\11\1\1\1\0\2\1\1\0\1\1"+
"\1\0\1\1\3\0\7\1\2\0\1\1\1\0\15\1"+
"\3\0\1\1\1\11\3\0\1\1\1\11\5\0\1\1"+
"\4\0\1\1\2\0\2\1\2\0\1\1\5\0\1\11"+
"\3\1\3\0\1\1\2\0\1\11\30\0\1\1\2\0"+
"\3\11";
private static int [] zzUnpackAttribute() {
int [] result = new int[178];
int [] result = new int[183];
int offset = 0;
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
return result;
@ -446,6 +449,10 @@ private int currentTokType;
private int numBalanced = 0;
private int positionInc = 1;
private int numLinkToks = 0;
//Any time we start a new Wiki reserved token (category, link, etc.) this value will be 0; otherwise it will be the number of tokens seen
//this can be useful for detecting when a new reserved token is encountered
//see https://issues.apache.org/jira/browse/LUCENE-1133
private int numWikiTokensSeen = 0;
public static final String [] TOKEN_TYPES = new String [] {
"<ALPHANUM>",
@ -468,6 +475,14 @@ public static final String [] TOKEN_TYPES = new String [] {
WikipediaTokenizer.EXTERNAL_LINK_URL
};
/**
Returns the number of tokens seen inside a category or link, etc.
@return the number of tokens seen inside the context of wiki syntax.
**/
public final int getNumWikiTokensSeen(){
return numWikiTokensSeen;
}
public final int yychar()
{
return yychar;
@ -480,10 +495,18 @@ public final int getPositionIncrement(){
/**
* Fills Lucene token with the current token text.
*/
final void getText(Token t, int tokType) {
final void getText(Token t) {
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
final int setText(StringBuffer buffer){
int length = zzMarkedPos - zzStartRead;
buffer.append(zzBuffer, zzStartRead, length);
return length;
}
/**
* Creates a new scanner
@ -774,178 +797,186 @@ final void getText(Token t, int tokType) {
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 7:
case 8:
{ /* ignore */
}
case 44: break;
case 46: break;
case 28:
{ currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);
}
case 47: break;
case 3:
{ positionInc = 1; return CJ;
}
case 45: break;
case 28:
case 48: break;
case 30:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/
}
case 46: break;
case 9:
case 49: break;
case 10:
{ numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL);
}
case 47: break;
case 4:
{ positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);
}
case 48: break;
case 39:
case 50: break;
case 41:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/
}
case 49: break;
case 11:
{ currentTokType = ITALICS; yybegin(STRING); return currentTokType;/*italics*/
}
case 50: break;
case 23:
{ positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);
}
case 51: break;
case 5:
{ yybegin(CATEGORY_STATE); return currentTokType;
case 7:
{ yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
}
case 52: break;
case 36:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/
case 23:
{ numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);
}
case 53: break;
case 8:
{ if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
case 38:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/
}
case 54: break;
case 24:
{ positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);
case 17:
{ yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;
}
case 55: break;
case 22:
{ positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);
case 24:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);
}
case 56: break;
case 41:
{ positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);
case 14:
{ yybegin(STRING); numWikiTokensSeen++; return currentTokType;
}
case 57: break;
case 18:
{ yybegin(STRING); return currentTokType;/* STRING ALPHANUM*/
}
case 58: break;
case 21:
{ positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}
}
case 59: break;
case 1:
case 5:
{ positionInc = 1;
}
case 60: break;
case 58: break;
case 43:
{ numBalanced = 0;currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);
}
case 59: break;
case 26:
{ yybegin(YYINITIAL);
}
case 60: break;
case 20:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);
}
case 61: break;
case 25:
{ yybegin(YYINITIAL);
case 1:
{ numWikiTokensSeen = 0; positionInc = 1;
}
case 62: break;
case 40:
{ positionInc = 1; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
}
case 63: break;
case 19:
{ numBalanced = 0;currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);
}
case 64: break;
case 13:
{ yybegin(STRING);return currentTokType;
}
case 65: break;
case 38:
{ positionInc = 1; return EMAIL;
}
case 66: break;
case 37:
case 63: break;
case 25:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);
}
case 64: break;
case 39:
{ positionInc = 1; return ACRONYM;
}
case 65: break;
case 9:
{ if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
}
case 66: break;
case 22:
{ numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}
}
case 67: break;
case 17:
{ /* ignore STRING */
case 31:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
}
case 68: break;
case 42:
{ currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
case 15:
{ currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING);
}
case 69: break;
case 20:
{ yybegin(STRING); return currentTokType;/*pipe*/
case 18:
{ /* ignore STRING */
}
case 70: break;
case 12:
{ currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);
case 42:
{ positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
}
case 71: break;
case 29:
{ numBalanced = 0;currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
case 21:
{ yybegin(STRING); return currentTokType;/*pipe*/
}
case 72: break;
case 35:
case 37:
{ numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/
}
case 73: break;
case 16:
{ yybegin(DOUBLE_BRACE_STATE); return currentTokType;
}
case 74: break;
case 31:
case 33:
{ positionInc = 1; return HOST;
}
case 74: break;
case 45:
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
}
case 75: break;
case 34:
case 36:
{ currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE);
}
case 76: break;
case 27:
{ currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
case 13:
{ currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);
}
case 77: break;
case 14:
{ currentTokType = SUB_HEADING; yybegin(STRING);
case 16:
{ currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;
}
case 78: break;
case 30:
{ positionInc = 1; return APOSTROPHE;
case 12:
{ currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/
}
case 79: break;
case 32:
{ positionInc = 1; return NUM;
case 6:
{ yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
}
case 80: break;
case 15:
{ currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); return currentTokType;
case 32:
{ positionInc = 1; return APOSTROPHE;
}
case 81: break;
case 6:
{ yybegin(INTERNAL_LINK_STATE); return currentTokType;
case 19:
{ yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/
}
case 82: break;
case 34:
{ positionInc = 1; return NUM;
}
case 83: break;
case 44:
{ currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);
}
case 84: break;
case 2:
{ positionInc = 1; return ALPHANUM;
}
case 83: break;
case 33:
case 85: break;
case 35:
{ positionInc = 1; return COMPANY;
}
case 84: break;
case 10:
case 86: break;
case 11:
{ currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE);
}
case 85: break;
case 26:
case 87: break;
case 29:
{ currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);
}
case 88: break;
case 4:
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);
}
case 89: break;
case 27:
{ numLinkToks = 0; yybegin(YYINITIAL);
}
case 86: break;
case 90: break;
default:
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;


@ -54,6 +54,10 @@ private int currentTokType;
private int numBalanced = 0;
private int positionInc = 1;
private int numLinkToks = 0;
//Any time we start a new Wiki reserved token (category, link, etc.) this value will be 0; otherwise it will be the number of tokens seen
//this can be useful for detecting when a new reserved token is encountered
//see https://issues.apache.org/jira/browse/LUCENE-1133
private int numWikiTokensSeen = 0;
public static final String [] TOKEN_TYPES = new String [] {
"<ALPHANUM>",
@ -76,6 +80,14 @@ public static final String [] TOKEN_TYPES = new String [] {
WikipediaTokenizer.EXTERNAL_LINK_URL
};
/**
Returns the number of tokens seen inside a category or link, etc.
@return the number of tokens seen inside the context of wiki syntax.
**/
public final int getNumWikiTokensSeen(){
return numWikiTokensSeen;
}
public final int yychar()
{
return yychar;
@ -88,9 +100,17 @@ public final int getPositionIncrement(){
/**
* Fills Lucene token with the current token text.
*/
final void getText(Token t, int tokType) {
final void getText(Token t) {
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
final int setText(StringBuffer buffer){
int length = zzMarkedPos - zzStartRead;
buffer.append(zzBuffer, zzStartRead, length);
return length;
}
%}
// basic word: a sequence of digits & letters
@ -191,21 +211,21 @@ DOUBLE_EQUALS = "="{2}
//The first {ALPHANUM} is always the link. Set positionInc to 1 for the double bracket, but then, inside the internal link state,
//set it to 0 for the next token, so that the link and the first token share the same position, while subsequent
//tokens within the link are incremented (see the usage sketch after this block)
{DOUBLE_BRACKET} {positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);}
{DOUBLE_BRACKET_CAT} {positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);}
{EXTERNAL_LINK} {positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);}
{TWO_SINGLE_QUOTES} {positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}}
{DOUBLE_EQUALS} {positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);}
{DOUBLE_BRACE} {positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
{CITATION} {positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
{DOUBLE_BRACKET} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);}
{DOUBLE_BRACKET_CAT} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);}
{EXTERNAL_LINK} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);}
{TWO_SINGLE_QUOTES} {numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}}
{DOUBLE_EQUALS} {numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);}
{DOUBLE_BRACE} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
{CITATION} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
//ignore
. | {WHITESPACE} |{INFOBOX} { positionInc = 1; }
. | {WHITESPACE} |{INFOBOX} {numWikiTokensSeen = 0; positionInc = 1; }
}
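As an illustration of the position-increment behavior described in the comments above, a small, hypothetical sketch (assuming the same imports as the sketch near the top of this commit) that feeds one of the test's link phrases through the tokenizer and prints each token's term, type, and position increment:

// "click [[link here again]]" is taken from the LINK_PHRASES test string below;
// the loop and printing are illustrative only.
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader("click [[link here again]]"));
for (Token t = tf.next(new Token()); t != null; t = tf.next(new Token())) {
  System.out.println(new String(t.termBuffer(), 0, t.termLength())
      + " type=" + t.type() + " posInc=" + t.getPositionIncrement());
}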
<INTERNAL_LINK_STATE>{
//First {ALPHANUM} is always the link, set position to 0 for these
//This is slightly different from EXTERNAL_LINK_STATE because that one has an explicit grammar for capturing the URL
{ALPHANUM} {yybegin(INTERNAL_LINK_STATE); return currentTokType;}
{ALPHANUM} {yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;}
{DOUBLE_BRACKET_CLOSE} {numLinkToks = 0; yybegin(YYINITIAL);}
//ignore
. | {WHITESPACE} { positionInc = 1; }
@ -213,14 +233,14 @@ DOUBLE_EQUALS = "="{2}
<EXTERNAL_LINK_STATE>{
//increment the link token, but then don't increment the tokens after that which are still in the link
("http://"|"https://"){HOST}("/"?({ALPHANUM}|{P}|\?|"&"|"="|"#")*)* {positionInc = 1; yybegin(EXTERNAL_LINK_STATE); return currentTokType;}
{ALPHANUM} {if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;}
("http://"|"https://"){HOST}("/"?({ALPHANUM}|{P}|\?|"&"|"="|"#")*)* {positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;}
{ALPHANUM} {if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;}
"]" {numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL);}
{WHITESPACE} { positionInc = 1; }
}
<CATEGORY_STATE>{
{ALPHANUM} {yybegin(CATEGORY_STATE); return currentTokType;}
{ALPHANUM} {yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;}
{DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);}
//ignore
. | {WHITESPACE} { positionInc = 1; }
@ -229,22 +249,22 @@ DOUBLE_EQUALS = "="{2}
<TWO_SINGLE_QUOTES_STATE>{
"'" {currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE);}
"'''" {currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE);}
{ALPHANUM} {currentTokType = ITALICS; yybegin(STRING); return currentTokType;/*italics*/}
{ALPHANUM} {currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/}
//we can have links inside, let those override
{DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
{DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
{DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);}
{DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);}
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);}
//ignore
. | {WHITESPACE} { /* ignore */ }
}
//bold
<THREE_SINGLE_QUOTES_STATE>{
{ALPHANUM} {yybegin(STRING);return currentTokType;}
{ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;}
//we can have links inside, let those override
{DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
{DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
{DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);}
{DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);}
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);}
//ignore
. | {WHITESPACE} { /* ignore */ }
@ -252,26 +272,26 @@ DOUBLE_EQUALS = "="{2}
}
//bold italics
<FIVE_SINGLE_QUOTES_STATE>{
{ALPHANUM} {yybegin(STRING);return currentTokType;}
{ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;}
//we can have links inside, let those override
{DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
{DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
{DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);}
{DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);}
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);}
//ignore
. | {WHITESPACE} { /* ignore */ }
}
<DOUBLE_EQUALS_STATE>{
"=" {currentTokType = SUB_HEADING; yybegin(STRING);}
{ALPHANUM} {currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); return currentTokType;}
"=" {currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING);}
{ALPHANUM} {currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;}
{DOUBLE_EQUALS} {yybegin(YYINITIAL);}
//ignore
. | {WHITESPACE} { /* ignore */ }
}
<DOUBLE_BRACE_STATE>{
{ALPHANUM} {yybegin(DOUBLE_BRACE_STATE); return currentTokType;}
{ALPHANUM} {yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;}
{DOUBLE_BRACE_CLOSE} {yybegin(YYINITIAL);}
{CITATION_CLOSE} {yybegin(YYINITIAL);}
//ignore
@ -283,11 +303,11 @@ DOUBLE_EQUALS = "="{2}
"'''" {numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/}
"''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/}
"===" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/}
{ALPHANUM} {yybegin(STRING); return currentTokType;/* STRING ALPHANUM*/}
{ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/}
//we can have links inside, let those override
{DOUBLE_BRACKET} {numBalanced = 0;currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
{DOUBLE_BRACKET_CAT} {numBalanced = 0;currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
{EXTERNAL_LINK} {numBalanced = 0;currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
{DOUBLE_BRACKET} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
{DOUBLE_BRACKET_CAT} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
{EXTERNAL_LINK} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
{PIPE} {yybegin(STRING); return currentTokType;/*pipe*/}


@ -22,8 +22,11 @@ import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import java.io.StringReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.HashSet;
/**
@ -31,6 +34,7 @@ import java.util.Map;
*
**/
public class WikipediaTokenizerTest extends TestCase {
protected static final String LINK_PHRASES = "click [[link here again]] click [http://lucene.apache.org here again] [[Category:a b c d]]";
public WikipediaTokenizerTest(String s) {
@ -155,8 +159,13 @@ public class WikipediaTokenizerTest extends TestCase {
}
public void testLinkPhrases() throws Exception {
String test = "click [[link here again]] click [http://lucene.apache.org here again]";
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(LINK_PHRASES));
checkLinkPhrases(tf);
}
private void checkLinkPhrases(WikipediaTokenizer tf) throws IOException {
Token token = new Token();
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
@ -201,7 +210,33 @@ public class WikipediaTokenizerTest extends TestCase {
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "again",
new String(token.termBuffer(), 0, token.termLength()).equals("again") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a",
new String(token.termBuffer(), 0, token.termLength()).equals("a") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "b",
new String(token.termBuffer(), 0, token.termLength()).equals("b") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "c",
new String(token.termBuffer(), 0, token.termLength()).equals("c") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "d",
new String(token.termBuffer(), 0, token.termLength()).equals("d") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next();
assertTrue("token is not null and it should be", token == null);
}
public void testLinks() throws Exception {
@ -225,5 +260,317 @@ public class WikipediaTokenizerTest extends TestCase {
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "https://lucene.apache.org/java/docs/index.html?b=c",
new String(token.termBuffer(), 0, token.termLength()).equals("https://lucene.apache.org/java/docs/index.html?b=c") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
token = tf.next();
assertTrue("token is not null and it should be", token == null);
}
public void testLucene1133() throws Exception {
Set untoks = new HashSet();
untoks.add(WikipediaTokenizer.CATEGORY);
untoks.add(WikipediaTokenizer.ITALICS);
//should be exactly the same, regardless of untoks
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(LINK_PHRASES), WikipediaTokenizer.TOKENS_ONLY, untoks);
checkLinkPhrases(tf);
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.UNTOKENIZED_ONLY, untoks);
Token token;
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a b c d",
new String(token.termBuffer(), 0, token.termLength()).equals("a b c d") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11);
assertTrue(token.endOffset() + " does not equal: " + 18, token.endOffset() == 18);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e f g",
new String(token.termBuffer(), 0, token.termLength()).equals("e f g") == true);
assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32);
assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
assertTrue(token.startOffset() + " does not equal: " + 42, token.startOffset() == 42);
assertTrue(token.endOffset() + " does not equal: " + 46, token.endOffset() == 46);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
assertTrue(token.startOffset() + " does not equal: " + 47, token.startOffset() == 47);
assertTrue(token.endOffset() + " does not equal: " + 51, token.endOffset() == 51);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
assertTrue(token.startOffset() + " does not equal: " + 56, token.startOffset() == 56);
assertTrue(token.endOffset() + " does not equal: " + 60, token.endOffset() == 60);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "there",
new String(token.termBuffer(), 0, token.termLength()).equals("there") == true);
assertTrue(token.startOffset() + " does not equal: " + 61, token.startOffset() == 61);
assertTrue(token.endOffset() + " does not equal: " + 66, token.endOffset() == 66);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics here",
new String(token.termBuffer(), 0, token.termLength()).equals("italics here") == true);
assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71);
assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "something",
new String(token.termBuffer(), 0, token.termLength()).equals("something") == true);
assertTrue(token.startOffset() + " does not equal: " + 86, token.startOffset() == 86);
assertTrue(token.endOffset() + " does not equal: " + 95, token.endOffset() == 95);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more italics",
new String(token.termBuffer(), 0, token.termLength()).equals("more italics") == true);
assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98);
assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h i j",
new String(token.termBuffer(), 0, token.termLength()).equals("h i j") == true);
assertTrue(token.startOffset() + " does not equal: " + 124, token.startOffset() == 124);
assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133);
token = tf.next();
assertTrue("token is not null and it should be", token == null);
}
public void testBoth() throws Exception {
Set untoks = new HashSet();
untoks.add(WikipediaTokenizer.CATEGORY);
untoks.add(WikipediaTokenizer.ITALICS);
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
//should output all the individual tokens plus the untokenized tokens as well; untokenized tokens are flagged with UNTOKENIZED_TOKEN_FLAG
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks);
Token token;
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a b c d",
new String(token.termBuffer(), 0, token.termLength()).equals("a b c d") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11);
assertTrue(token.endOffset() + " does not equal: " + 18, token.endOffset() == 18);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a",
new String(token.termBuffer(), 0, token.termLength()).equals("a") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getFlags() + " equals: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG + " and it shouldn't", token.getFlags() != WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11);
assertTrue(token.endOffset() + " does not equal: " + 12, token.endOffset() == 12);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "b",
new String(token.termBuffer(), 0, token.termLength()).equals("b") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.startOffset() + " does not equal: " + 13, token.startOffset() == 13);
assertTrue(token.endOffset() + " does not equal: " + 14, token.endOffset() == 14);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "c",
new String(token.termBuffer(), 0, token.termLength()).equals("c") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.startOffset() + " does not equal: " + 15, token.startOffset() == 15);
assertTrue(token.endOffset() + " does not equal: " + 16, token.endOffset() == 16);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "d",
new String(token.termBuffer(), 0, token.termLength()).equals("d") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.startOffset() + " does not equal: " + 17, token.startOffset() == 17);
assertTrue(token.endOffset() + " does not equal: " + 18, token.endOffset() == 18);
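//second category: the collapsed "e f g" token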
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e f g",
new String(token.termBuffer(), 0, token.termLength()).equals("e f g") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32);
assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37);
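//then its pieces: "e" (increment 0), "f", "g"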
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e",
new String(token.termBuffer(), 0, token.termLength()).equals("e") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32);
assertTrue(token.endOffset() + " does not equal: " + 33, token.endOffset() == 33);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "f",
new String(token.termBuffer(), 0, token.termLength()).equals("f") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.startOffset() + " does not equal: " + 34, token.startOffset() == 34);
assertTrue(token.endOffset() + " does not equal: " + 35, token.endOffset() == 35);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "g",
new String(token.termBuffer(), 0, token.termLength()).equals("g") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.startOffset() + " does not equal: " + 36, token.startOffset() == 36);
assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37);
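//internal links are not in the untokenized set, so only the individual tokens are produced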
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
assertTrue(token.startOffset() + " does not equal: " + 42, token.startOffset() == 42);
assertTrue(token.endOffset() + " does not equal: " + 46, token.endOffset() == 46);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
assertTrue(token.startOffset() + " does not equal: " + 47, token.startOffset() == 47);
assertTrue(token.endOffset() + " does not equal: " + 51, token.endOffset() == 51);
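//second internal link: "link", "there"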
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
assertTrue(token.startOffset() + " does not equal: " + 56, token.startOffset() == 56);
assertTrue(token.endOffset() + " does not equal: " + 60, token.endOffset() == 60);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "there",
new String(token.termBuffer(), 0, token.termLength()).equals("there") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
assertTrue(token.startOffset() + " does not equal: " + 61, token.startOffset() == 61);
assertTrue(token.endOffset() + " does not equal: " + 66, token.endOffset() == 66);
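//italics are in the untokenized set, so the collapsed "italics here" token comes first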
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics here",
new String(token.termBuffer(), 0, token.termLength()).equals("italics here") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71);
assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83);
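//then the individual italics tokens, the first one overlapping at increment 0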
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics",
new String(token.termBuffer(), 0, token.termLength()).equals("italics") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71);
assertTrue(token.endOffset() + " does not equal: " + 78, token.endOffset() == 78);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(token.startOffset() + " does not equal: " + 79, token.startOffset() == 79);
assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83);
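//plain text outside any wiki markup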
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "something",
new String(token.termBuffer(), 0, token.termLength()).equals("something") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.startOffset() + " does not equal: " + 86, token.startOffset() == 86);
assertTrue(token.endOffset() + " does not equal: " + 95, token.endOffset() == 95);
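//second italics run: collapsed "more italics", then "more" (increment 0) and "italics"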
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more italics",
new String(token.termBuffer(), 0, token.termLength()).equals("more italics") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98);
assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more",
new String(token.termBuffer(), 0, token.termLength()).equals("more") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98);
assertTrue(token.endOffset() + " does not equal: " + 102, token.endOffset() == 102);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics",
new String(token.termBuffer(), 0, token.termLength()).equals("italics") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(token.startOffset() + " does not equal: " + 103, token.startOffset() == 103);
assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110);
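//last category: the collapsed token keeps the original spacing between "h", "i" and "j"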
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h   i   j",
new String(token.termBuffer(), 0, token.termLength()).equals("h   i   j") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(token.startOffset() + " does not equal: " + 124, token.startOffset() == 124);
assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133);
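//and its pieces: "h" (increment 0), "i", "j"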
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h",
new String(token.termBuffer(), 0, token.termLength()).equals("h") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.startOffset() + " does not equal: " + 124, token.startOffset() == 124);
assertTrue(token.endOffset() + " does not equal: " + 125, token.endOffset() == 125);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "i",
new String(token.termBuffer(), 0, token.termLength()).equals("i") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.startOffset() + " does not equal: " + 128, token.startOffset() == 128);
assertTrue(token.endOffset() + " does not equal: " + 129, token.endOffset() == 129);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "j",
new String(token.termBuffer(), 0, token.termLength()).equals("j") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.startOffset() + " does not equal: " + 132, token.startOffset() == 132);
assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133);
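//end of stream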
token = tf.next();
assertTrue("token is not null and it should be", token == null);
}
}