mirror of https://github.com/apache/lucene.git
LUCENE-1133: Adds ability to keep certain strings as single tokens
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@614895 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1f0e88f186
commit
305c47f500
|
@ -22,17 +22,17 @@ import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Extension of StandardTokenizer that is aware of Wikipedia syntax. It is based off of the
|
* Extension of StandardTokenizer that is aware of Wikipedia syntax. It is based off of the
|
||||||
* Wikipedia tutorial available at http://en.wikipedia.org/wiki/Wikipedia:Tutorial, but it may not be complete.
|
* Wikipedia tutorial available at http://en.wikipedia.org/wiki/Wikipedia:Tutorial, but it may not be complete.
|
||||||
*
|
* <p/>
|
||||||
* <p/>
|
* <p/>
|
||||||
* EXPERIMENTAL !!!!!!!!!
|
* EXPERIMENTAL !!!!!!!!!
|
||||||
* NOTE: This Tokenizer is considered experimental and the grammar is subject to change in the trunk and in follow up releases.
|
* NOTE: This Tokenizer is considered experimental and the grammar is subject to change in the trunk and in follow up releases.
|
||||||
*
|
*/
|
||||||
**/
|
|
||||||
public class WikipediaTokenizer extends Tokenizer {
|
public class WikipediaTokenizer extends Tokenizer {
|
||||||
public static final String INTERNAL_LINK = "il";
|
public static final String INTERNAL_LINK = "il";
|
||||||
public static final String EXTERNAL_LINK = "el";
|
public static final String EXTERNAL_LINK = "el";
|
||||||
|
@ -45,11 +45,21 @@ public class WikipediaTokenizer extends Tokenizer {
|
||||||
public static final String BOLD_ITALICS = "bi";
|
public static final String BOLD_ITALICS = "bi";
|
||||||
public static final String HEADING = "h";
|
public static final String HEADING = "h";
|
||||||
public static final String SUB_HEADING = "sh";
|
public static final String SUB_HEADING = "sh";
|
||||||
|
|
||||||
|
/**
 * Output mode: every token returned by the scanner is emitted on its own.
 * This is the mode used by the single-argument constructor.
 */
public static final int TOKENS_ONLY = 0;
|
||||||

/**
 * Output mode: for token types contained in the configured set of untokenized types,
 * a run of consecutive scanner tokens is collapsed into one token; tokens of any other
 * type are still emitted individually.
 */
public static final int UNTOKENIZED_ONLY = 1;
|
||||||

/**
 * Output mode: for token types contained in the configured set of untokenized types,
 * the collapsed token is returned first and the individual tokens that make it up are
 * returned by the following calls to next(Token).
 */
public static final int BOTH = 2;
|
||||||
|

||||||
|
/**
 * Value passed to Token.setFlags(int) for tokens that were produced by collapsing
 * several scanner tokens into one.
 */
public static final int UNTOKENIZED_TOKEN_FLAG = 1;
|
||||||
/**
|
/**
|
||||||
* A private instance of the JFlex-constructed scanner
|
* A private instance of the JFlex-constructed scanner
|
||||||
*/
|
*/
|
||||||
private final WikipediaTokenizerImpl scanner;
|
private final WikipediaTokenizerImpl scanner;
|
||||||
|
|
||||||
|
/** Selected output mode; compared against TOKENS_ONLY, UNTOKENIZED_ONLY and BOTH in next(Token). */
private int tokenOutput = TOKENS_ONLY;
|
||||||

/** Token type strings whose consecutive tokens are collapsed; queried with contains(type) in next(Token). */
private Set untokenizedTypes = Collections.EMPTY_SET;
|
||||||

/**
 * Pending individual tokens queued by collapseAndSaveTokens. next(Token) drains this
 * iterator before asking the scanner for more input. Starts out null.
 */
private Iterator tokens = null;
|
||||||
|
|
||||||
void setInput(Reader reader) {
|
void setInput(Reader reader) {
|
||||||
this.input = reader;
|
this.input = reader;
|
||||||
}
|
}
|
||||||
|
@ -57,11 +67,19 @@ public class WikipediaTokenizer extends Tokenizer {
|
||||||
/**
|
/**
|
||||||
* Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
|
* Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
|
||||||
* <code>input</code> to a newly created JFlex scanner.
|
* <code>input</code> to a newly created JFlex scanner.
|
||||||
|
*
|
||||||
* @param input The Input Reader
|
* @param input The Input Reader
|
||||||
*/
|
*/
|
||||||
public WikipediaTokenizer(Reader input) {
|
public WikipediaTokenizer(Reader input) {
|
||||||
this.input = input;
|
this(input, TOKENS_ONLY, Collections.EMPTY_SET);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public WikipediaTokenizer(Reader input, int tokenOutput, Set untokenizedTypes) {
|
||||||
|
super(input);
|
||||||
|
this.tokenOutput = tokenOutput;
|
||||||
this.scanner = new WikipediaTokenizerImpl(input);
|
this.scanner = new WikipediaTokenizerImpl(input);
|
||||||
|
this.untokenizedTypes = untokenizedTypes;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -70,19 +88,116 @@ public class WikipediaTokenizer extends Tokenizer {
|
||||||
* @see org.apache.lucene.analysis.TokenStream#next()
|
* @see org.apache.lucene.analysis.TokenStream#next()
|
||||||
*/
|
*/
|
||||||
public Token next(Token result) throws IOException {
|
public Token next(Token result) throws IOException {
|
||||||
|
if (tokens != null && tokens.hasNext()){
|
||||||
|
return (Token)tokens.next();
|
||||||
|
}
|
||||||
int tokenType = scanner.getNextToken();
|
int tokenType = scanner.getNextToken();
|
||||||
|
|
||||||
if (tokenType == WikipediaTokenizerImpl.YYEOF) {
|
if (tokenType == WikipediaTokenizerImpl.YYEOF) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
String type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType];
|
||||||
|
if (tokenOutput == TOKENS_ONLY || untokenizedTypes.contains(type) == false){
|
||||||
|
setupToken(result);
|
||||||
|
} else if (tokenOutput == UNTOKENIZED_ONLY && untokenizedTypes.contains(type) == true){
|
||||||
|
collapseTokens(result, tokenType);
|
||||||
|
|
||||||
scanner.getText(result, tokenType);
|
}
|
||||||
|
else if (tokenOutput == BOTH){
|
||||||
|
//collapse into a single token, add it to tokens AND output the individual tokens
|
||||||
|
//output the untokenized Token first
|
||||||
|
collapseAndSaveTokens(result, tokenType, type);
|
||||||
|
}
|
||||||
|
result.setPositionIncrement(scanner.getPositionIncrement());
|
||||||
|
result.setType(type);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Builds one collapsed ("untokenized") token in <code>result</code> from a run of consecutive
 * scanner tokens, and queues a copy of every individual token of that run in {@link #tokens}
 * so that later calls to next(Token) return them one by one.
 * <p/>
 * The run starts with the token the scanner currently holds and continues while the scanner
 * keeps returning <code>tokenType</code> and <code>scanner.getNumWikiTokensSeen()</code> stays
 * above the number of tokens consumed here. The first queued token gets a position increment
 * of 0; the following ones use the scanner's position increment.
 *
 * @param result    token that receives the collapsed text, offsets and UNTOKENIZED_TOKEN_FLAG
 * @param tokenType scanner token type of the run being collapsed
 * @param type      type string assigned to each queued individual token
 * @throws IOException declared because scanner.getNextToken() is called here
 */
private void collapseAndSaveTokens(Token result, int tokenType, String type) throws IOException {
|
||||||

//collapse
|
||||||

StringBuffer buffer = new StringBuffer(32);
|
||||||

// NOTE(review): setText is defined outside this view; its return value is used below as the
// number of characters it appended to the buffer - confirm against WikipediaTokenizerImpl.
int numAdded = scanner.setText(buffer);
|
||||||

//TODO: how to know how much whitespace to add
|
||||||

int theStart = scanner.yychar();
|
||||||

int lastPos = theStart + numAdded;
|
||||||

int tmpTokType;
|
||||||

int numSeen = 0;
|
||||||

List tmp = new ArrayList();
|
||||||

Token saved = new Token();
|
||||||

// the first individual token is queued with a position increment of 0
setupSavedToken(saved, 0, type);
|
||||||

tmp.add(saved);
|
||||||

//while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
|
||||||

while ((tmpTokType = scanner.getNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.getNumWikiTokensSeen() > numSeen){
|
||||||

int currPos = scanner.yychar();
|
||||||

//append whitespace: one space for every position between the end of the previous token and the start of this one
|
||||||

for (int i = 0; i < (currPos - lastPos); i++){
|
||||||

buffer.append(' ');
|
||||||

}
|
||||||

numAdded = scanner.setText(buffer);
|
||||||

saved = new Token();
|
||||||

setupSavedToken(saved, scanner.getPositionIncrement(), type);
|
||||||

tmp.add(saved);
|
||||||

numSeen++;
|
||||||

lastPos = currPos + numAdded;
|
||||||

}
|
||||||

//trim the buffer
|
||||||

String s = buffer.toString().trim();
|
||||||

result.setTermBuffer(s.toCharArray(), 0, s.length());
|
||||||

result.setStartOffset(theStart);
|
||||||

result.setEndOffset(theStart + s.length());
|
||||||

result.setFlags(UNTOKENIZED_TOKEN_FLAG);
|
||||||

//The loop stops only after it has already read one token too many. Unless that was EOF, push the whole match (yylength() characters) back so it is scanned again later.
|
||||||

if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
|
||||||

scanner.yypushback(scanner.yylength());
|
||||||

}
|
||||||

tokens = tmp.iterator();
|
||||||

}
|
||||||
|
|
||||||
|
/**
 * Fills <code>saved</code> from the scanner's current token via setupToken and then
 * assigns the given position increment and type.
 *
 * @param saved       token to populate
 * @param positionInc position increment to store on the token
 * @param type        type string to store on the token
 */
private void setupSavedToken(Token saved, int positionInc, String type){
|
||||||

setupToken(saved);
|
||||||

saved.setPositionIncrement(positionInc);
|
||||||

saved.setType(type);
|
||||||

}
|
||||||
|
|
||||||
|
/**
 * Builds one collapsed ("untokenized") token in <code>result</code> from a run of consecutive
 * scanner tokens. Unlike collapseAndSaveTokens, the individual tokens are not kept.
 * <p/>
 * The run starts with the token the scanner currently holds and continues while the scanner
 * keeps returning <code>tokenType</code> and <code>scanner.getNumWikiTokensSeen()</code> stays
 * above the number of tokens consumed here.
 *
 * @param result    token that receives the collapsed text, offsets and UNTOKENIZED_TOKEN_FLAG
 * @param tokenType scanner token type of the run being collapsed
 * @throws IOException declared because scanner.getNextToken() is called here
 */
private void collapseTokens(Token result, int tokenType) throws IOException {
|
||||||

//collapse
|
||||||

StringBuffer buffer = new StringBuffer(32);
|
||||||

// NOTE(review): setText is defined outside this view; its return value is used below as the
// number of characters it appended to the buffer - confirm against WikipediaTokenizerImpl.
int numAdded = scanner.setText(buffer);
|
||||||

//TODO: how to know how much whitespace to add
|
||||||

int theStart = scanner.yychar();
|
||||||

int lastPos = theStart + numAdded;
|
||||||

int tmpTokType;
|
||||||

int numSeen = 0;
|
||||||

//while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
|
||||||

while ((tmpTokType = scanner.getNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.getNumWikiTokensSeen() > numSeen){
|
||||||

int currPos = scanner.yychar();
|
||||||

//append whitespace: one space for every position between the end of the previous token and the start of this one
|
||||||

for (int i = 0; i < (currPos - lastPos); i++){
|
||||||

buffer.append(' ');
|
||||||

}
|
||||||

numAdded = scanner.setText(buffer);
|
||||||

numSeen++;
|
||||||

lastPos = currPos + numAdded;
|
||||||

}
|
||||||

//trim the buffer
|
||||||

String s = buffer.toString().trim();
|
||||||

result.setTermBuffer(s.toCharArray(), 0, s.length());
|
||||||

result.setStartOffset(theStart);
|
||||||

result.setEndOffset(theStart + s.length());
|
||||||

result.setFlags(UNTOKENIZED_TOKEN_FLAG);
|
||||||

//The loop stops only after it has already read one token too many. Unless that was EOF, push the whole match (yylength() characters) back so it is scanned again later.
|
||||||

if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
|
||||||

scanner.yypushback(scanner.yylength());
|
||||||

} else {
|
||||||

// end of input reached: drop any queued individual tokens
tokens = null;
|
||||||

}
|
||||||

}
|
||||||
|
|
||||||
|
private void setupToken(Token result) {
|
||||||
|
scanner.getText(result);
|
||||||
final int start = scanner.yychar();
|
final int start = scanner.yychar();
|
||||||
result.setStartOffset(start);
|
result.setStartOffset(start);
|
||||||
result.setEndOffset(start + result.termLength());
|
result.setEndOffset(start + result.termLength());
|
||||||
result.setPositionIncrement(scanner.getPositionIncrement());
|
|
||||||
result.setType(WikipediaTokenizerImpl.TOKEN_TYPES[tokenType]);
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
/* The following code was generated by JFlex 1.4.1 on 1/4/08 3:30 PM */
|
/* The following code was generated by JFlex 1.4.1 on 1/16/08 10:31 AM */
|
||||||
|
|
||||||
package org.apache.lucene.wikipedia.analysis;
|
package org.apache.lucene.wikipedia.analysis;
|
||||||
|
|
||||||
|
@ -25,7 +25,7 @@ import org.apache.lucene.analysis.Token;
|
||||||
/**
|
/**
|
||||||
* This class is a scanner generated by
|
* This class is a scanner generated by
|
||||||
* <a href="http://www.jflex.de/">JFlex</a> 1.4.1
|
* <a href="http://www.jflex.de/">JFlex</a> 1.4.1
|
||||||
* on 1/4/08 3:30 PM from the specification file
|
* on 1/16/08 10:31 AM from the specification file
|
||||||
* <tt>/Volumes/User/grantingersoll/projects/lucene/Lucene-Trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex</tt>
|
* <tt>/Volumes/User/grantingersoll/projects/lucene/Lucene-Trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex</tt>
|
||||||
*/
|
*/
|
||||||
class WikipediaTokenizerImpl {
|
class WikipediaTokenizerImpl {
|
||||||
|
@ -37,14 +37,14 @@ class WikipediaTokenizerImpl {
|
||||||
private static final int ZZ_BUFFERSIZE = 16384;
|
private static final int ZZ_BUFFERSIZE = 16384;
|
||||||
|
|
||||||
/** lexical states */
|
/** lexical states */
|
||||||
public static final int DOUBLE_BRACE_STATE = 7;
|
public static final int DOUBLE_BRACE_STATE = 8;
|
||||||
public static final int INTERNAL_LINK_STATE = 2;
|
public static final int INTERNAL_LINK_STATE = 2;
|
||||||
public static final int TWO_SINGLE_QUOTES_STATE = 4;
|
public static final int TWO_SINGLE_QUOTES_STATE = 4;
|
||||||
public static final int CATEGORY_STATE = 1;
|
public static final int CATEGORY_STATE = 1;
|
||||||
public static final int FIVE_SINGLE_QUOTES_STATE = 5;
|
public static final int FIVE_SINGLE_QUOTES_STATE = 6;
|
||||||
public static final int STRING = 8;
|
public static final int STRING = 9;
|
||||||
public static final int YYINITIAL = 0;
|
public static final int YYINITIAL = 0;
|
||||||
public static final int DOUBLE_EQUALS_STATE = 6;
|
public static final int DOUBLE_EQUALS_STATE = 7;
|
||||||
public static final int THREE_SINGLE_QUOTES_STATE = 5;
|
public static final int THREE_SINGLE_QUOTES_STATE = 5;
|
||||||
public static final int EXTERNAL_LINK_STATE = 3;
|
public static final int EXTERNAL_LINK_STATE = 3;
|
||||||
|
|
||||||
|
@ -76,20 +76,20 @@ class WikipediaTokenizerImpl {
|
||||||
private static final int [] ZZ_ACTION = zzUnpackAction();
|
private static final int [] ZZ_ACTION = zzUnpackAction();
|
||||||
|
|
||||||
private static final String ZZ_ACTION_PACKED_0 =
|
private static final String ZZ_ACTION_PACKED_0 =
|
||||||
"\11\0\4\1\4\2\1\3\1\1\1\4\2\1\1\5"+
|
"\12\0\4\1\4\2\1\3\1\1\1\4\1\1\2\5"+
|
||||||
"\1\1\1\6\1\1\2\7\1\10\1\11\1\10\1\12"+
|
"\1\6\2\5\1\7\1\5\2\10\1\11\1\12\1\11"+
|
||||||
"\1\13\1\7\1\14\1\15\1\16\1\17\1\7\1\20"+
|
"\1\13\1\14\1\10\1\15\1\16\1\15\1\17\1\20"+
|
||||||
"\1\7\4\21\1\22\1\21\1\23\1\24\1\25\3\0"+
|
"\1\10\1\21\1\10\4\22\1\23\1\22\1\24\1\25"+
|
||||||
"\1\26\14\0\1\27\1\30\1\31\1\32\1\10\1\0"+
|
"\1\26\3\0\1\27\14\0\1\30\1\31\1\32\1\33"+
|
||||||
"\1\33\1\0\1\34\1\0\1\35\3\0\1\36\1\37"+
|
"\1\11\1\0\1\34\1\35\1\0\1\36\1\0\1\37"+
|
||||||
"\2\40\1\37\2\41\2\0\1\40\1\0\14\40\1\37"+
|
"\3\0\1\40\1\41\2\42\1\41\2\43\2\0\1\42"+
|
||||||
"\3\0\1\10\1\42\3\0\1\43\1\44\5\0\1\45"+
|
"\1\0\14\42\1\41\3\0\1\11\1\44\3\0\1\45"+
|
||||||
"\4\0\1\45\2\0\2\45\2\0\1\10\5\0\1\30"+
|
"\1\46\5\0\1\47\4\0\1\47\2\0\2\47\2\0"+
|
||||||
"\1\37\1\40\1\46\3\0\1\10\2\0\1\47\30\0"+
|
"\1\11\5\0\1\31\1\41\1\42\1\50\3\0\1\11"+
|
||||||
"\1\50\2\0\1\51\1\52\1\53";
|
"\2\0\1\51\30\0\1\52\2\0\1\53\1\54\1\55";
|
||||||
|
|
||||||
private static int [] zzUnpackAction() {
|
private static int [] zzUnpackAction() {
|
||||||
int [] result = new int[178];
|
int [] result = new int[183];
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
|
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
|
||||||
return result;
|
return result;
|
||||||
|
@ -116,30 +116,30 @@ class WikipediaTokenizerImpl {
|
||||||
private static final String ZZ_ROWMAP_PACKED_0 =
|
private static final String ZZ_ROWMAP_PACKED_0 =
|
||||||
"\0\0\0\54\0\130\0\204\0\260\0\334\0\u0108\0\u0134"+
|
"\0\0\0\54\0\130\0\204\0\260\0\334\0\u0108\0\u0134"+
|
||||||
"\0\u0160\0\u018c\0\u01b8\0\u01e4\0\u0210\0\u023c\0\u0268\0\u0294"+
|
"\0\u0160\0\u018c\0\u01b8\0\u01e4\0\u0210\0\u023c\0\u0268\0\u0294"+
|
||||||
"\0\u02c0\0\u018c\0\u02ec\0\u0318\0\u0344\0\u0370\0\u039c\0\u03c8"+
|
"\0\u02c0\0\u02ec\0\u01b8\0\u0318\0\u0344\0\u0370\0\u01b8\0\u039c"+
|
||||||
"\0\u03f4\0\u0420\0\u018c\0\u0370\0\u044c\0\u018c\0\u0478\0\u04a4"+
|
"\0\u03c8\0\u03f4\0\u0420\0\u044c\0\u0478\0\u01b8\0\u039c\0\u04a4"+
|
||||||
"\0\u04d0\0\u04fc\0\u0528\0\u0554\0\u0580\0\u05ac\0\u05d8\0\u0604"+
|
"\0\u01b8\0\u04d0\0\u04fc\0\u0528\0\u0554\0\u0580\0\u05ac\0\u05d8"+
|
||||||
"\0\u0630\0\u018c\0\u065c\0\u0370\0\u0688\0\u06b4\0\u06e0\0\u070c"+
|
"\0\u0604\0\u0630\0\u065c\0\u0688\0\u06b4\0\u01b8\0\u06e0\0\u039c"+
|
||||||
"\0\u018c\0\u018c\0\u0738\0\u0764\0\u0790\0\u018c\0\u07bc\0\u07e8"+
|
"\0\u070c\0\u0738\0\u0764\0\u0790\0\u01b8\0\u01b8\0\u07bc\0\u07e8"+
|
||||||
"\0\u0814\0\u0840\0\u086c\0\u0898\0\u08c4\0\u08f0\0\u091c\0\u0948"+
|
"\0\u0814\0\u01b8\0\u0840\0\u086c\0\u0898\0\u08c4\0\u08f0\0\u091c"+
|
||||||
"\0\u0974\0\u09a0\0\u09cc\0\u09f8\0\u018c\0\u018c\0\u0a24\0\u0a50"+
|
"\0\u0948\0\u0974\0\u09a0\0\u09cc\0\u09f8\0\u0a24\0\u0a50\0\u0a7c"+
|
||||||
"\0\u0a7c\0\u0aa8\0\u0ad4\0\u0b00\0\u0b2c\0\u0b58\0\u0b84\0\u0bb0"+
|
"\0\u01b8\0\u01b8\0\u0aa8\0\u0ad4\0\u0b00\0\u0b00\0\u0b2c\0\u0b58"+
|
||||||
"\0\u0bdc\0\u0c08\0\u0c34\0\u0c60\0\u0c8c\0\u0814\0\u0cb8\0\u0ce4"+
|
"\0\u0b84\0\u0bb0\0\u0bdc\0\u0c08\0\u0c34\0\u0c60\0\u0c8c\0\u0cb8"+
|
||||||
"\0\u0d10\0\u0d3c\0\u0d68\0\u0d94\0\u0dc0\0\u0dec\0\u0e18\0\u0e44"+
|
"\0\u0ce4\0\u0d10\0\u0898\0\u0d3c\0\u0d68\0\u0d94\0\u0dc0\0\u0dec"+
|
||||||
"\0\u0e70\0\u0e9c\0\u0ec8\0\u0ef4\0\u0f20\0\u0f4c\0\u0f78\0\u0fa4"+
|
"\0\u0e18\0\u0e44\0\u0e70\0\u0e9c\0\u0ec8\0\u0ef4\0\u0f20\0\u0f4c"+
|
||||||
"\0\u0fd0\0\u0ffc\0\u1028\0\u1054\0\u018c\0\u1080\0\u10ac\0\u10d8"+
|
"\0\u0f78\0\u0fa4\0\u0fd0\0\u0ffc\0\u1028\0\u1054\0\u1080\0\u10ac"+
|
||||||
"\0\u1104\0\u018c\0\u1130\0\u115c\0\u1188\0\u11b4\0\u11e0\0\u120c"+
|
"\0\u10d8\0\u01b8\0\u1104\0\u1130\0\u115c\0\u1188\0\u01b8\0\u11b4"+
|
||||||
"\0\u1238\0\u1264\0\u1290\0\u12bc\0\u12e8\0\u1314\0\u1340\0\u07e8"+
|
"\0\u11e0\0\u120c\0\u1238\0\u1264\0\u1290\0\u12bc\0\u12e8\0\u1314"+
|
||||||
"\0\u0974\0\u136c\0\u1398\0\u13c4\0\u13f0\0\u141c\0\u1448\0\u1474"+
|
"\0\u1340\0\u136c\0\u1398\0\u13c4\0\u086c\0\u09f8\0\u13f0\0\u141c"+
|
||||||
"\0\u14a0\0\u018c\0\u14cc\0\u14f8\0\u1524\0\u1550\0\u157c\0\u15a8"+
|
"\0\u1448\0\u1474\0\u14a0\0\u14cc\0\u14f8\0\u1524\0\u01b8\0\u1550"+
|
||||||
"\0\u15d4\0\u1600\0\u162c\0\u018c\0\u1658\0\u1684\0\u16b0\0\u16dc"+
|
"\0\u157c\0\u15a8\0\u15d4\0\u1600\0\u162c\0\u1658\0\u1684\0\u16b0"+
|
||||||
"\0\u1708\0\u1734\0\u1760\0\u178c\0\u17b8\0\u17e4\0\u1810\0\u183c"+
|
"\0\u01b8\0\u16dc\0\u1708\0\u1734\0\u1760\0\u178c\0\u17b8\0\u17e4"+
|
||||||
"\0\u1868\0\u1894\0\u18c0\0\u18ec\0\u1918\0\u1944\0\u1970\0\u199c"+
|
"\0\u1810\0\u183c\0\u1868\0\u1894\0\u18c0\0\u18ec\0\u1918\0\u1944"+
|
||||||
"\0\u19c8\0\u19f4\0\u1a20\0\u1a4c\0\u1a78\0\u1aa4\0\u1ad0\0\u018c"+
|
"\0\u1970\0\u199c\0\u19c8\0\u19f4\0\u1a20\0\u1a4c\0\u1a78\0\u1aa4"+
|
||||||
"\0\u018c\0\u018c";
|
"\0\u1ad0\0\u1afc\0\u1b28\0\u1b54\0\u01b8\0\u01b8\0\u01b8";
|
||||||
|
|
||||||
private static int [] zzUnpackRowMap() {
|
private static int [] zzUnpackRowMap() {
|
||||||
int [] result = new int[178];
|
int [] result = new int[183];
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
|
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
|
||||||
return result;
|
return result;
|
||||||
|
@ -162,151 +162,153 @@ class WikipediaTokenizerImpl {
|
||||||
private static final int [] ZZ_TRANS = zzUnpackTrans();
|
private static final int [] ZZ_TRANS = zzUnpackTrans();
|
||||||
|
|
||||||
private static final String ZZ_TRANS_PACKED_0 =
|
private static final String ZZ_TRANS_PACKED_0 =
|
||||||
"\1\12\1\13\5\12\1\14\1\12\1\15\3\12\1\16"+
|
"\1\13\1\14\5\13\1\15\1\13\1\16\3\13\1\17"+
|
||||||
"\1\17\1\20\1\21\1\22\1\23\2\12\1\24\2\12"+
|
"\1\20\1\21\1\22\1\23\1\24\2\13\1\25\2\13"+
|
||||||
"\15\16\1\25\2\12\3\16\10\12\1\26\5\12\4\27"+
|
"\15\17\1\26\2\13\3\17\1\13\7\27\1\30\5\27"+
|
||||||
"\1\12\1\23\3\12\1\30\1\12\15\27\3\12\3\27"+
|
"\4\31\1\27\1\32\3\27\1\33\1\27\15\31\3\27"+
|
||||||
"\10\12\1\26\5\12\4\31\1\12\1\23\3\12\1\32"+
|
"\3\31\10\27\1\30\5\27\4\34\1\27\1\32\3\27"+
|
||||||
"\1\12\15\31\3\12\3\31\1\12\7\33\1\34\5\33"+
|
"\1\35\1\27\15\34\3\27\3\34\1\27\7\36\1\37"+
|
||||||
"\4\35\1\33\1\23\2\12\1\33\1\36\1\33\15\35"+
|
"\5\36\4\40\1\36\1\32\2\27\1\36\1\41\1\36"+
|
||||||
"\3\33\1\37\2\35\2\33\1\40\5\33\1\34\5\33"+
|
"\15\40\3\36\1\42\2\40\2\36\1\43\5\36\1\37"+
|
||||||
"\4\41\1\33\1\42\2\33\1\43\2\33\15\41\3\33"+
|
"\5\36\4\44\1\36\1\45\2\36\1\46\2\36\15\44"+
|
||||||
"\3\41\10\33\1\34\5\33\4\44\1\33\1\42\2\33"+
|
"\3\36\3\44\10\36\1\37\5\36\4\47\1\36\1\45"+
|
||||||
"\1\43\2\33\15\44\3\33\3\44\10\33\1\34\1\33"+
|
"\2\36\1\46\2\36\15\47\3\36\3\47\10\36\1\37"+
|
||||||
"\1\45\3\33\4\46\1\33\1\42\5\33\15\46\3\33"+
|
"\5\36\4\47\1\36\1\45\2\36\1\50\2\36\15\47"+
|
||||||
"\3\46\10\33\1\47\5\33\4\50\1\33\1\42\5\33"+
|
"\3\36\3\47\10\36\1\37\1\36\1\51\3\36\4\52"+
|
||||||
"\15\50\1\33\1\51\1\33\3\50\1\33\1\52\1\53"+
|
"\1\36\1\45\5\36\15\52\3\36\3\52\10\36\1\53"+
|
||||||
"\5\52\1\54\1\52\1\55\3\52\4\56\1\52\1\57"+
|
"\5\36\4\54\1\36\1\45\5\36\15\54\1\36\1\55"+
|
||||||
"\2\52\1\60\2\52\15\56\2\52\1\61\3\56\1\52"+
|
"\1\36\3\54\1\36\1\56\1\57\5\56\1\60\1\56"+
|
||||||
"\55\0\1\62\62\0\1\63\4\0\4\64\7\0\6\64"+
|
"\1\61\3\56\4\62\1\56\1\63\2\56\1\64\2\56"+
|
||||||
"\1\65\6\64\3\0\3\64\12\0\1\66\43\0\1\67"+
|
"\15\62\2\56\1\65\3\62\1\56\55\0\1\66\62\0"+
|
||||||
"\1\70\1\71\1\72\2\73\1\0\1\74\3\0\1\74"+
|
"\1\67\4\0\4\70\7\0\6\70\1\71\6\70\3\0"+
|
||||||
"\1\16\1\17\1\20\1\21\7\0\15\16\3\0\3\16"+
|
"\3\70\12\0\1\72\43\0\1\73\1\74\1\75\1\76"+
|
||||||
"\3\0\1\75\1\0\1\76\2\77\1\0\1\100\3\0"+
|
"\2\77\1\0\1\100\3\0\1\100\1\17\1\20\1\21"+
|
||||||
"\1\100\3\17\1\21\7\0\15\17\3\0\3\17\2\0"+
|
"\1\22\7\0\15\17\3\0\3\17\3\0\1\101\1\0"+
|
||||||
"\1\67\1\101\1\71\1\72\2\77\1\0\1\100\3\0"+
|
"\1\102\2\103\1\0\1\104\3\0\1\104\3\20\1\22"+
|
||||||
"\1\100\1\20\1\17\1\20\1\21\7\0\15\20\3\0"+
|
"\7\0\15\20\3\0\3\20\2\0\1\73\1\105\1\75"+
|
||||||
"\3\20\3\0\1\102\1\0\1\76\2\73\1\0\1\74"+
|
"\1\76\2\103\1\0\1\104\3\0\1\104\1\21\1\20"+
|
||||||
"\3\0\1\74\4\21\7\0\15\21\3\0\3\21\24\0"+
|
"\1\21\1\22\7\0\15\21\3\0\3\21\3\0\1\106"+
|
||||||
"\1\12\55\0\1\103\73\0\1\104\16\0\1\63\4\0"+
|
"\1\0\1\102\2\77\1\0\1\100\3\0\1\100\4\22"+
|
||||||
"\4\64\7\0\15\64\3\0\3\64\16\0\4\27\7\0"+
|
"\7\0\15\22\3\0\3\22\24\0\1\13\55\0\1\107"+
|
||||||
"\15\27\3\0\3\27\27\0\1\105\42\0\4\31\7\0"+
|
"\73\0\1\110\16\0\1\67\4\0\4\70\7\0\15\70"+
|
||||||
"\15\31\3\0\3\31\27\0\1\106\42\0\4\35\7\0"+
|
"\3\0\3\70\16\0\4\31\7\0\15\31\3\0\3\31"+
|
||||||
"\15\35\3\0\3\35\16\0\4\35\7\0\2\35\1\107"+
|
"\24\0\1\27\56\0\1\111\42\0\4\34\7\0\15\34"+
|
||||||
"\12\35\3\0\3\35\2\0\1\110\67\0\4\41\7\0"+
|
"\3\0\3\34\27\0\1\112\42\0\4\40\7\0\15\40"+
|
||||||
"\15\41\3\0\3\41\24\0\1\33\55\0\1\111\43\0"+
|
"\3\0\3\40\16\0\4\40\7\0\2\40\1\113\12\40"+
|
||||||
"\4\44\7\0\15\44\3\0\3\44\12\0\1\105\57\0"+
|
"\3\0\3\40\2\0\1\114\67\0\4\44\7\0\15\44"+
|
||||||
"\4\46\7\0\15\46\3\0\3\46\11\0\1\112\4\0"+
|
"\3\0\3\44\24\0\1\36\55\0\1\115\43\0\4\47"+
|
||||||
"\4\64\7\0\15\64\3\0\3\64\16\0\4\50\7\0"+
|
"\7\0\15\47\3\0\3\47\26\0\1\116\37\0\1\111"+
|
||||||
"\15\50\3\0\3\50\47\0\1\105\6\0\1\113\63\0"+
|
"\57\0\4\52\7\0\15\52\3\0\3\52\11\0\1\117"+
|
||||||
"\1\114\57\0\4\56\7\0\15\56\3\0\3\56\24\0"+
|
"\4\0\4\70\7\0\15\70\3\0\3\70\16\0\4\54"+
|
||||||
"\1\52\55\0\1\115\43\0\4\64\7\0\15\64\3\0"+
|
"\7\0\15\54\3\0\3\54\47\0\1\111\6\0\1\120"+
|
||||||
"\3\64\14\0\1\33\1\0\4\116\1\0\3\117\3\0"+
|
"\63\0\1\121\57\0\4\62\7\0\15\62\3\0\3\62"+
|
||||||
"\15\116\3\0\3\116\14\0\1\33\1\0\4\116\1\0"+
|
"\24\0\1\56\55\0\1\122\43\0\4\70\7\0\15\70"+
|
||||||
"\3\117\3\0\3\116\1\120\11\116\3\0\3\116\16\0"+
|
"\3\0\3\70\14\0\1\36\1\0\4\123\1\0\3\124"+
|
||||||
"\1\121\1\0\1\121\10\0\15\121\3\0\3\121\16\0"+
|
"\3\0\15\123\3\0\3\123\14\0\1\36\1\0\4\123"+
|
||||||
"\1\122\1\123\1\124\1\125\7\0\15\122\3\0\3\122"+
|
"\1\0\3\124\3\0\3\123\1\125\11\123\3\0\3\123"+
|
||||||
"\16\0\1\126\1\0\1\126\10\0\15\126\3\0\3\126"+
|
"\16\0\1\126\1\0\1\126\10\0\15\126\3\0\3\126"+
|
||||||
"\16\0\1\127\1\130\1\127\1\130\7\0\15\127\3\0"+
|
"\16\0\1\127\1\130\1\131\1\132\7\0\15\127\3\0"+
|
||||||
"\3\127\16\0\1\131\2\132\1\133\7\0\15\131\3\0"+
|
"\3\127\16\0\1\133\1\0\1\133\10\0\15\133\3\0"+
|
||||||
"\3\131\16\0\1\74\2\134\10\0\15\74\3\0\3\74"+
|
"\3\133\16\0\1\134\1\135\1\134\1\135\7\0\15\134"+
|
||||||
"\16\0\1\135\2\136\1\137\7\0\15\135\3\0\3\135"+
|
"\3\0\3\134\16\0\1\136\2\137\1\140\7\0\15\136"+
|
||||||
"\16\0\4\130\7\0\15\130\3\0\3\130\16\0\1\140"+
|
"\3\0\3\136\16\0\1\100\2\141\10\0\15\100\3\0"+
|
||||||
"\2\141\1\142\7\0\15\140\3\0\3\140\16\0\1\143"+
|
"\3\100\16\0\1\142\2\143\1\144\7\0\15\142\3\0"+
|
||||||
"\2\144\1\145\7\0\15\143\3\0\3\143\16\0\1\146"+
|
"\3\142\16\0\4\135\7\0\15\135\3\0\3\135\16\0"+
|
||||||
"\1\136\1\147\1\137\7\0\15\146\3\0\3\146\16\0"+
|
"\1\145\2\146\1\147\7\0\15\145\3\0\3\145\16\0"+
|
||||||
"\1\150\2\123\1\125\7\0\15\150\3\0\3\150\30\0"+
|
"\1\150\2\151\1\152\7\0\15\150\3\0\3\150\16\0"+
|
||||||
"\1\151\1\152\64\0\1\153\27\0\4\35\7\0\2\35"+
|
"\1\153\1\143\1\154\1\144\7\0\15\153\3\0\3\153"+
|
||||||
"\1\154\12\35\3\0\3\35\2\0\1\155\101\0\1\156"+
|
"\16\0\1\155\2\130\1\132\7\0\15\155\3\0\3\155"+
|
||||||
"\1\157\40\0\4\64\7\0\6\64\1\160\6\64\3\0"+
|
"\30\0\1\156\1\157\64\0\1\160\27\0\4\40\7\0"+
|
||||||
"\3\64\2\0\1\161\63\0\1\162\71\0\1\163\1\164"+
|
"\2\40\1\161\12\40\3\0\3\40\2\0\1\162\101\0"+
|
||||||
"\34\0\1\165\1\0\1\33\1\0\4\116\1\0\3\117"+
|
"\1\163\1\164\40\0\4\70\7\0\6\70\1\165\6\70"+
|
||||||
"\3\0\15\116\3\0\3\116\16\0\4\166\1\0\3\117"+
|
"\3\0\3\70\2\0\1\166\63\0\1\167\71\0\1\170"+
|
||||||
"\3\0\15\166\3\0\3\166\12\0\1\165\1\0\1\33"+
|
"\1\171\34\0\1\172\1\0\1\36\1\0\4\123\1\0"+
|
||||||
"\1\0\4\116\1\0\3\117\3\0\10\116\1\167\4\116"+
|
"\3\124\3\0\15\123\3\0\3\123\16\0\4\173\1\0"+
|
||||||
"\3\0\3\116\2\0\1\67\13\0\1\121\1\0\1\121"+
|
"\3\124\3\0\15\173\3\0\3\173\12\0\1\172\1\0"+
|
||||||
"\10\0\15\121\3\0\3\121\3\0\1\170\1\0\1\76"+
|
"\1\36\1\0\4\123\1\0\3\124\3\0\10\123\1\174"+
|
||||||
"\2\171\6\0\1\122\1\123\1\124\1\125\7\0\15\122"+
|
"\4\123\3\0\3\123\2\0\1\73\13\0\1\126\1\0"+
|
||||||
"\3\0\3\122\3\0\1\172\1\0\1\76\2\173\1\0"+
|
"\1\126\10\0\15\126\3\0\3\126\3\0\1\175\1\0"+
|
||||||
"\1\174\3\0\1\174\3\123\1\125\7\0\15\123\3\0"+
|
"\1\102\2\176\6\0\1\127\1\130\1\131\1\132\7\0"+
|
||||||
"\3\123\3\0\1\175\1\0\1\76\2\173\1\0\1\174"+
|
"\15\127\3\0\3\127\3\0\1\177\1\0\1\102\2\200"+
|
||||||
"\3\0\1\174\1\124\1\123\1\124\1\125\7\0\15\124"+
|
"\1\0\1\201\3\0\1\201\3\130\1\132\7\0\15\130"+
|
||||||
"\3\0\3\124\3\0\1\176\1\0\1\76\2\171\6\0"+
|
"\3\0\3\130\3\0\1\202\1\0\1\102\2\200\1\0"+
|
||||||
"\4\125\7\0\15\125\3\0\3\125\3\0\1\177\2\0"+
|
"\1\201\3\0\1\201\1\131\1\130\1\131\1\132\7\0"+
|
||||||
"\1\177\7\0\1\127\1\130\1\127\1\130\7\0\15\127"+
|
"\15\131\3\0\3\131\3\0\1\203\1\0\1\102\2\176"+
|
||||||
"\3\0\3\127\3\0\1\177\2\0\1\177\7\0\4\130"+
|
"\6\0\4\132\7\0\15\132\3\0\3\132\3\0\1\204"+
|
||||||
"\7\0\15\130\3\0\3\130\3\0\1\171\1\0\1\76"+
|
"\2\0\1\204\7\0\1\134\1\135\1\134\1\135\7\0"+
|
||||||
"\2\171\6\0\1\131\2\132\1\133\7\0\15\131\3\0"+
|
"\15\134\3\0\3\134\3\0\1\204\2\0\1\204\7\0"+
|
||||||
"\3\131\3\0\1\173\1\0\1\76\2\173\1\0\1\174"+
|
"\4\135\7\0\15\135\3\0\3\135\3\0\1\176\1\0"+
|
||||||
"\3\0\1\174\3\132\1\133\7\0\15\132\3\0\3\132"+
|
"\1\102\2\176\6\0\1\136\2\137\1\140\7\0\15\136"+
|
||||||
"\3\0\1\171\1\0\1\76\2\171\6\0\4\133\7\0"+
|
"\3\0\3\136\3\0\1\200\1\0\1\102\2\200\1\0"+
|
||||||
"\15\133\3\0\3\133\3\0\1\174\2\0\2\174\1\0"+
|
"\1\201\3\0\1\201\3\137\1\140\7\0\15\137\3\0"+
|
||||||
"\1\174\3\0\1\174\3\134\10\0\15\134\3\0\3\134"+
|
"\3\137\3\0\1\176\1\0\1\102\2\176\6\0\4\140"+
|
||||||
"\3\0\1\102\1\0\1\76\2\73\1\0\1\74\3\0"+
|
"\7\0\15\140\3\0\3\140\3\0\1\201\2\0\2\201"+
|
||||||
"\1\74\1\135\2\136\1\137\7\0\15\135\3\0\3\135"+
|
"\1\0\1\201\3\0\1\201\3\141\10\0\15\141\3\0"+
|
||||||
"\3\0\1\75\1\0\1\76\2\77\1\0\1\100\3\0"+
|
"\3\141\3\0\1\106\1\0\1\102\2\77\1\0\1\100"+
|
||||||
"\1\100\3\136\1\137\7\0\15\136\3\0\3\136\3\0"+
|
"\3\0\1\100\1\142\2\143\1\144\7\0\15\142\3\0"+
|
||||||
"\1\102\1\0\1\76\2\73\1\0\1\74\3\0\1\74"+
|
"\3\142\3\0\1\101\1\0\1\102\2\103\1\0\1\104"+
|
||||||
"\4\137\7\0\15\137\3\0\3\137\3\0\1\73\1\0"+
|
"\3\0\1\104\3\143\1\144\7\0\15\143\3\0\3\143"+
|
||||||
"\1\76\2\73\1\0\1\74\3\0\1\74\1\140\2\141"+
|
"\3\0\1\106\1\0\1\102\2\77\1\0\1\100\3\0"+
|
||||||
"\1\142\7\0\15\140\3\0\3\140\3\0\1\77\1\0"+
|
"\1\100\4\144\7\0\15\144\3\0\3\144\3\0\1\77"+
|
||||||
"\1\76\2\77\1\0\1\100\3\0\1\100\3\141\1\142"+
|
"\1\0\1\102\2\77\1\0\1\100\3\0\1\100\1\145"+
|
||||||
"\7\0\15\141\3\0\3\141\3\0\1\73\1\0\1\76"+
|
"\2\146\1\147\7\0\15\145\3\0\3\145\3\0\1\103"+
|
||||||
"\2\73\1\0\1\74\3\0\1\74\4\142\7\0\15\142"+
|
"\1\0\1\102\2\103\1\0\1\104\3\0\1\104\3\146"+
|
||||||
"\3\0\3\142\3\0\1\74\2\0\2\74\1\0\1\74"+
|
"\1\147\7\0\15\146\3\0\3\146\3\0\1\77\1\0"+
|
||||||
"\3\0\1\74\1\143\2\144\1\145\7\0\15\143\3\0"+
|
"\1\102\2\77\1\0\1\100\3\0\1\100\4\147\7\0"+
|
||||||
"\3\143\3\0\1\100\2\0\2\100\1\0\1\100\3\0"+
|
"\15\147\3\0\3\147\3\0\1\100\2\0\2\100\1\0"+
|
||||||
"\1\100\3\144\1\145\7\0\15\144\3\0\3\144\3\0"+
|
"\1\100\3\0\1\100\1\150\2\151\1\152\7\0\15\150"+
|
||||||
"\1\74\2\0\2\74\1\0\1\74\3\0\1\74\4\145"+
|
"\3\0\3\150\3\0\1\104\2\0\2\104\1\0\1\104"+
|
||||||
"\7\0\15\145\3\0\3\145\3\0\1\200\1\0\1\76"+
|
"\3\0\1\104\3\151\1\152\7\0\15\151\3\0\3\151"+
|
||||||
"\2\73\1\0\1\74\3\0\1\74\1\146\1\136\1\147"+
|
"\3\0\1\100\2\0\2\100\1\0\1\100\3\0\1\100"+
|
||||||
"\1\137\7\0\15\146\3\0\3\146\3\0\1\201\1\0"+
|
"\4\152\7\0\15\152\3\0\3\152\3\0\1\205\1\0"+
|
||||||
"\1\76\2\77\1\0\1\100\3\0\1\100\1\147\1\136"+
|
"\1\102\2\77\1\0\1\100\3\0\1\100\1\153\1\143"+
|
||||||
"\1\147\1\137\7\0\15\147\3\0\3\147\3\0\1\176"+
|
"\1\154\1\144\7\0\15\153\3\0\3\153\3\0\1\206"+
|
||||||
"\1\0\1\76\2\171\6\0\1\150\2\123\1\125\7\0"+
|
"\1\0\1\102\2\103\1\0\1\104\3\0\1\104\1\154"+
|
||||||
"\15\150\3\0\3\150\31\0\1\152\54\0\1\202\64\0"+
|
"\1\143\1\154\1\144\7\0\15\154\3\0\3\154\3\0"+
|
||||||
"\1\203\26\0\4\35\7\0\15\35\3\0\1\35\1\204"+
|
"\1\203\1\0\1\102\2\176\6\0\1\155\2\130\1\132"+
|
||||||
"\1\35\31\0\1\157\54\0\1\205\35\0\1\33\1\0"+
|
"\7\0\15\155\3\0\3\155\31\0\1\157\54\0\1\207"+
|
||||||
"\4\116\1\0\3\117\3\0\3\116\1\206\11\116\3\0"+
|
"\64\0\1\210\26\0\4\40\7\0\15\40\3\0\1\40"+
|
||||||
"\3\116\2\0\1\207\102\0\1\164\54\0\1\210\34\0"+
|
"\1\211\1\40\31\0\1\164\54\0\1\212\35\0\1\36"+
|
||||||
"\1\211\52\0\1\165\3\0\4\166\7\0\15\166\3\0"+
|
"\1\0\4\123\1\0\3\124\3\0\3\123\1\213\11\123"+
|
||||||
"\3\166\12\0\1\165\1\0\1\212\1\0\4\116\1\0"+
|
"\3\0\3\123\2\0\1\214\102\0\1\171\54\0\1\215"+
|
||||||
"\3\117\3\0\15\116\3\0\3\116\16\0\1\213\1\125"+
|
"\34\0\1\216\52\0\1\172\3\0\4\173\7\0\15\173"+
|
||||||
"\1\213\1\125\7\0\15\213\3\0\3\213\16\0\4\133"+
|
"\3\0\3\173\12\0\1\172\1\0\1\217\1\0\4\123"+
|
||||||
"\7\0\15\133\3\0\3\133\16\0\4\137\7\0\15\137"+
|
"\1\0\3\124\3\0\15\123\3\0\3\123\16\0\1\220"+
|
||||||
"\3\0\3\137\16\0\4\142\7\0\15\142\3\0\3\142"+
|
"\1\132\1\220\1\132\7\0\15\220\3\0\3\220\16\0"+
|
||||||
"\16\0\4\145\7\0\15\145\3\0\3\145\16\0\1\214"+
|
"\4\140\7\0\15\140\3\0\3\140\16\0\4\144\7\0"+
|
||||||
"\1\137\1\214\1\137\7\0\15\214\3\0\3\214\16\0"+
|
"\15\144\3\0\3\144\16\0\4\147\7\0\15\147\3\0"+
|
||||||
"\4\125\7\0\15\125\3\0\3\125\16\0\4\215\7\0"+
|
"\3\147\16\0\4\152\7\0\15\152\3\0\3\152\16\0"+
|
||||||
"\15\215\3\0\3\215\33\0\1\216\61\0\1\217\30\0"+
|
"\1\221\1\144\1\221\1\144\7\0\15\221\3\0\3\221"+
|
||||||
"\4\35\6\0\1\220\15\35\3\0\2\35\1\221\33\0"+
|
"\16\0\4\132\7\0\15\132\3\0\3\132\16\0\4\222"+
|
||||||
"\1\222\32\0\1\165\1\0\1\33\1\0\4\116\1\0"+
|
"\7\0\15\222\3\0\3\222\33\0\1\223\61\0\1\224"+
|
||||||
"\3\117\3\0\10\116\1\223\4\116\3\0\3\116\2\0"+
|
"\30\0\4\40\6\0\1\225\15\40\3\0\2\40\1\226"+
|
||||||
"\1\224\104\0\1\225\36\0\4\226\7\0\15\226\3\0"+
|
"\33\0\1\227\32\0\1\172\1\0\1\36\1\0\4\123"+
|
||||||
"\3\226\3\0\1\170\1\0\1\76\2\171\6\0\1\213"+
|
"\1\0\3\124\3\0\10\123\1\230\4\123\3\0\3\123"+
|
||||||
"\1\125\1\213\1\125\7\0\15\213\3\0\3\213\3\0"+
|
"\2\0\1\231\104\0\1\232\36\0\4\233\7\0\15\233"+
|
||||||
"\1\200\1\0\1\76\2\73\1\0\1\74\3\0\1\74"+
|
"\3\0\3\233\3\0\1\175\1\0\1\102\2\176\6\0"+
|
||||||
"\1\214\1\137\1\214\1\137\7\0\15\214\3\0\3\214"+
|
"\1\220\1\132\1\220\1\132\7\0\15\220\3\0\3\220"+
|
||||||
"\3\0\1\177\2\0\1\177\7\0\4\215\7\0\15\215"+
|
"\3\0\1\205\1\0\1\102\2\77\1\0\1\100\3\0"+
|
||||||
"\3\0\3\215\34\0\1\227\55\0\1\230\26\0\1\231"+
|
"\1\100\1\221\1\144\1\221\1\144\7\0\15\221\3\0"+
|
||||||
"\60\0\4\35\6\0\1\220\15\35\3\0\3\35\34\0"+
|
"\3\221\3\0\1\204\2\0\1\204\7\0\4\222\7\0"+
|
||||||
"\1\232\31\0\1\165\1\0\1\105\1\0\4\116\1\0"+
|
"\15\222\3\0\3\222\34\0\1\234\55\0\1\235\26\0"+
|
||||||
"\3\117\3\0\15\116\3\0\3\116\34\0\1\233\32\0"+
|
"\1\236\60\0\4\40\6\0\1\225\15\40\3\0\3\40"+
|
||||||
"\1\234\2\0\4\226\7\0\15\226\3\0\3\226\35\0"+
|
"\34\0\1\237\31\0\1\172\1\0\1\111\1\0\4\123"+
|
||||||
"\1\235\62\0\1\236\20\0\1\237\77\0\1\240\53\0"+
|
"\1\0\3\124\3\0\15\123\3\0\3\123\34\0\1\240"+
|
||||||
"\1\241\32\0\1\33\1\0\4\166\1\0\3\117\3\0"+
|
"\32\0\1\241\2\0\4\233\7\0\15\233\3\0\3\233"+
|
||||||
"\15\166\3\0\3\166\36\0\1\242\53\0\1\243\33\0"+
|
"\35\0\1\242\62\0\1\243\20\0\1\244\77\0\1\245"+
|
||||||
"\4\244\7\0\15\244\3\0\3\244\36\0\1\245\53\0"+
|
"\53\0\1\246\32\0\1\36\1\0\4\173\1\0\3\124"+
|
||||||
"\1\246\54\0\1\247\61\0\1\250\11\0\1\251\12\0"+
|
"\3\0\15\173\3\0\3\173\36\0\1\247\53\0\1\250"+
|
||||||
"\4\244\7\0\15\244\3\0\3\244\37\0\1\252\53\0"+
|
"\33\0\4\251\7\0\15\251\3\0\3\251\36\0\1\252"+
|
||||||
"\1\253\54\0\1\254\22\0\1\12\62\0\4\255\7\0"+
|
"\53\0\1\253\54\0\1\254\61\0\1\255\11\0\1\256"+
|
||||||
"\15\255\3\0\3\255\40\0\1\256\53\0\1\257\43\0"+
|
"\12\0\4\251\7\0\15\251\3\0\3\251\37\0\1\257"+
|
||||||
"\1\260\26\0\2\255\1\0\2\255\1\0\2\255\2\0"+
|
"\53\0\1\260\54\0\1\261\22\0\1\13\62\0\4\262"+
|
||||||
"\5\255\7\0\15\255\3\0\4\255\27\0\1\261\53\0"+
|
"\7\0\15\262\3\0\3\262\40\0\1\263\53\0\1\264"+
|
||||||
"\1\262\24\0";
|
"\43\0\1\265\26\0\2\262\1\0\2\262\1\0\2\262"+
|
||||||
|
"\2\0\5\262\7\0\15\262\3\0\4\262\27\0\1\266"+
|
||||||
|
"\53\0\1\267\24\0";
|
||||||
|
|
||||||
private static int [] zzUnpackTrans() {
|
private static int [] zzUnpackTrans() {
|
||||||
int [] result = new int[6908];
|
int [] result = new int[7040];
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
|
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
|
||||||
return result;
|
return result;
|
||||||
|
@ -344,16 +346,17 @@ class WikipediaTokenizerImpl {
|
||||||
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
|
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
|
||||||
|
|
||||||
private static final String ZZ_ATTRIBUTE_PACKED_0 =
|
private static final String ZZ_ATTRIBUTE_PACKED_0 =
|
||||||
"\11\0\1\11\7\1\1\11\10\1\1\11\2\1\1\11"+
|
"\12\0\1\11\7\1\1\11\3\1\1\11\6\1\1\11"+
|
||||||
"\13\1\1\11\6\1\2\11\3\0\1\11\14\0\2\1"+
|
"\2\1\1\11\14\1\1\11\6\1\2\11\3\0\1\11"+
|
||||||
"\2\11\1\1\1\0\1\1\1\0\1\1\1\0\1\1"+
|
"\14\0\2\1\2\11\1\1\1\0\2\1\1\0\1\1"+
|
||||||
"\3\0\7\1\2\0\1\1\1\0\15\1\3\0\1\1"+
|
"\1\0\1\1\3\0\7\1\2\0\1\1\1\0\15\1"+
|
||||||
"\1\11\3\0\1\1\1\11\5\0\1\1\4\0\1\1"+
|
"\3\0\1\1\1\11\3\0\1\1\1\11\5\0\1\1"+
|
||||||
"\2\0\2\1\2\0\1\1\5\0\1\11\3\1\3\0"+
|
"\4\0\1\1\2\0\2\1\2\0\1\1\5\0\1\11"+
|
||||||
"\1\1\2\0\1\11\30\0\1\1\2\0\3\11";
|
"\3\1\3\0\1\1\2\0\1\11\30\0\1\1\2\0"+
|
||||||
|
"\3\11";
|
||||||
|
|
||||||
private static int [] zzUnpackAttribute() {
|
private static int [] zzUnpackAttribute() {
|
||||||
int [] result = new int[178];
|
int [] result = new int[183];
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
|
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
|
||||||
return result;
|
return result;
|
||||||
|
@ -446,6 +449,10 @@ private int currentTokType;
|
||||||
private int numBalanced = 0;
|
private int numBalanced = 0;
|
||||||
private int positionInc = 1;
|
private int positionInc = 1;
|
||||||
private int numLinkToks = 0;
|
private int numLinkToks = 0;
|
||||||
|
//Any time we start on a new Wiki reserved token (category, link, etc.) this value will be 0; otherwise it will be the number of tokens seen
|
||||||
|
//this can be useful for detecting when a new reserved token is encountered
|
||||||
|
//see https://issues.apache.org/jira/browse/LUCENE-1133
|
||||||
|
private int numWikiTokensSeen = 0;
|
||||||
|
|
||||||
public static final String [] TOKEN_TYPES = new String [] {
|
public static final String [] TOKEN_TYPES = new String [] {
|
||||||
"<ALPHANUM>",
|
"<ALPHANUM>",
|
||||||
|
@ -468,6 +475,14 @@ public static final String [] TOKEN_TYPES = new String [] {
|
||||||
WikipediaTokenizer.EXTERNAL_LINK_URL
|
WikipediaTokenizer.EXTERNAL_LINK_URL
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
Returns the number of tokens seen inside a category or link, etc.
|
||||||
|
@return the number of tokens seen inside the context of wiki syntax.
|
||||||
|
**/
|
||||||
|
public final int getNumWikiTokensSeen(){
|
||||||
|
return numWikiTokensSeen;
|
||||||
|
}
|
||||||
|
|
||||||
public final int yychar()
|
public final int yychar()
|
||||||
{
|
{
|
||||||
return yychar;
|
return yychar;
|
||||||
|
@ -480,10 +495,18 @@ public final int getPositionIncrement(){
|
||||||
/**
|
/**
|
||||||
* Fills Lucene token with the current token text.
|
* Fills Lucene token with the current token text.
|
||||||
*/
|
*/
|
||||||
final void getText(Token t, int tokType) {
|
final void getText(Token t) {
|
||||||
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final int setText(StringBuffer buffer){
|
||||||
|
int length = zzMarkedPos - zzStartRead;
|
||||||
|
buffer.append(zzBuffer, zzStartRead, length);
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a new scanner
|
* Creates a new scanner
|
||||||
|
@ -774,178 +797,186 @@ final void getText(Token t, int tokType) {
|
||||||
zzMarkedPos = zzMarkedPosL;
|
zzMarkedPos = zzMarkedPosL;
|
||||||
|
|
||||||
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
|
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
|
||||||
case 7:
|
case 8:
|
||||||
{ /* ignore */
|
{ /* ignore */
|
||||||
}
|
}
|
||||||
case 44: break;
|
case 46: break;
|
||||||
|
case 28:
|
||||||
|
{ currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);
|
||||||
|
}
|
||||||
|
case 47: break;
|
||||||
case 3:
|
case 3:
|
||||||
{ positionInc = 1; return CJ;
|
{ positionInc = 1; return CJ;
|
||||||
}
|
}
|
||||||
case 45: break;
|
case 48: break;
|
||||||
case 28:
|
case 30:
|
||||||
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/
|
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/
|
||||||
}
|
}
|
||||||
case 46: break;
|
case 49: break;
|
||||||
case 9:
|
case 10:
|
||||||
{ numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL);
|
{ numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL);
|
||||||
}
|
}
|
||||||
case 47: break;
|
case 50: break;
|
||||||
case 4:
|
case 41:
|
||||||
{ positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);
|
|
||||||
}
|
|
||||||
case 48: break;
|
|
||||||
case 39:
|
|
||||||
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/
|
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/
|
||||||
}
|
}
|
||||||
case 49: break;
|
|
||||||
case 11:
|
|
||||||
{ currentTokType = ITALICS; yybegin(STRING); return currentTokType;/*italics*/
|
|
||||||
}
|
|
||||||
case 50: break;
|
|
||||||
case 23:
|
|
||||||
{ positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);
|
|
||||||
}
|
|
||||||
case 51: break;
|
case 51: break;
|
||||||
case 5:
|
case 7:
|
||||||
{ yybegin(CATEGORY_STATE); return currentTokType;
|
{ yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
|
||||||
}
|
}
|
||||||
case 52: break;
|
case 52: break;
|
||||||
case 36:
|
case 23:
|
||||||
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/
|
{ numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);
|
||||||
}
|
}
|
||||||
case 53: break;
|
case 53: break;
|
||||||
case 8:
|
case 38:
|
||||||
{ if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
|
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/
|
||||||
}
|
}
|
||||||
case 54: break;
|
case 54: break;
|
||||||
case 24:
|
case 17:
|
||||||
{ positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);
|
{ yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;
|
||||||
}
|
}
|
||||||
case 55: break;
|
case 55: break;
|
||||||
case 22:
|
case 24:
|
||||||
{ positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);
|
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);
|
||||||
}
|
}
|
||||||
case 56: break;
|
case 56: break;
|
||||||
case 41:
|
case 14:
|
||||||
{ positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);
|
{ yybegin(STRING); numWikiTokensSeen++; return currentTokType;
|
||||||
}
|
}
|
||||||
case 57: break;
|
case 57: break;
|
||||||
case 18:
|
case 5:
|
||||||
{ yybegin(STRING); return currentTokType;/* STRING ALPHANUM*/
|
|
||||||
}
|
|
||||||
case 58: break;
|
|
||||||
case 21:
|
|
||||||
{ positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}
|
|
||||||
}
|
|
||||||
case 59: break;
|
|
||||||
case 1:
|
|
||||||
{ positionInc = 1;
|
{ positionInc = 1;
|
||||||
}
|
}
|
||||||
case 60: break;
|
case 58: break;
|
||||||
case 43:
|
case 43:
|
||||||
{ numBalanced = 0;currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
|
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);
|
||||||
|
}
|
||||||
|
case 59: break;
|
||||||
|
case 26:
|
||||||
|
{ yybegin(YYINITIAL);
|
||||||
|
}
|
||||||
|
case 60: break;
|
||||||
|
case 20:
|
||||||
|
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);
|
||||||
}
|
}
|
||||||
case 61: break;
|
case 61: break;
|
||||||
case 25:
|
case 1:
|
||||||
{ yybegin(YYINITIAL);
|
{ numWikiTokensSeen = 0; positionInc = 1;
|
||||||
}
|
}
|
||||||
case 62: break;
|
case 62: break;
|
||||||
case 40:
|
case 40:
|
||||||
{ positionInc = 1; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
|
|
||||||
}
|
|
||||||
case 63: break;
|
|
||||||
case 19:
|
|
||||||
{ numBalanced = 0;currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);
|
|
||||||
}
|
|
||||||
case 64: break;
|
|
||||||
case 13:
|
|
||||||
{ yybegin(STRING);return currentTokType;
|
|
||||||
}
|
|
||||||
case 65: break;
|
|
||||||
case 38:
|
|
||||||
{ positionInc = 1; return EMAIL;
|
{ positionInc = 1; return EMAIL;
|
||||||
}
|
}
|
||||||
case 66: break;
|
case 63: break;
|
||||||
case 37:
|
case 25:
|
||||||
|
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);
|
||||||
|
}
|
||||||
|
case 64: break;
|
||||||
|
case 39:
|
||||||
{ positionInc = 1; return ACRONYM;
|
{ positionInc = 1; return ACRONYM;
|
||||||
}
|
}
|
||||||
|
case 65: break;
|
||||||
|
case 9:
|
||||||
|
{ if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
|
||||||
|
}
|
||||||
|
case 66: break;
|
||||||
|
case 22:
|
||||||
|
{ numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}
|
||||||
|
}
|
||||||
case 67: break;
|
case 67: break;
|
||||||
case 17:
|
case 31:
|
||||||
{ /* ignore STRING */
|
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
|
||||||
}
|
}
|
||||||
case 68: break;
|
case 68: break;
|
||||||
case 42:
|
case 15:
|
||||||
{ currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
|
{ currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING);
|
||||||
}
|
}
|
||||||
case 69: break;
|
case 69: break;
|
||||||
case 20:
|
case 18:
|
||||||
{ yybegin(STRING); return currentTokType;/*pipe*/
|
{ /* ignore STRING */
|
||||||
}
|
}
|
||||||
case 70: break;
|
case 70: break;
|
||||||
case 12:
|
case 42:
|
||||||
{ currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);
|
{ positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
|
||||||
}
|
}
|
||||||
case 71: break;
|
case 71: break;
|
||||||
case 29:
|
case 21:
|
||||||
{ numBalanced = 0;currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
|
{ yybegin(STRING); return currentTokType;/*pipe*/
|
||||||
}
|
}
|
||||||
case 72: break;
|
case 72: break;
|
||||||
case 35:
|
case 37:
|
||||||
{ numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/
|
{ numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/
|
||||||
}
|
}
|
||||||
case 73: break;
|
case 73: break;
|
||||||
case 16:
|
case 33:
|
||||||
{ yybegin(DOUBLE_BRACE_STATE); return currentTokType;
|
|
||||||
}
|
|
||||||
case 74: break;
|
|
||||||
case 31:
|
|
||||||
{ positionInc = 1; return HOST;
|
{ positionInc = 1; return HOST;
|
||||||
}
|
}
|
||||||
|
case 74: break;
|
||||||
|
case 45:
|
||||||
|
{ numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
|
||||||
|
}
|
||||||
case 75: break;
|
case 75: break;
|
||||||
case 34:
|
case 36:
|
||||||
{ currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE);
|
{ currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE);
|
||||||
}
|
}
|
||||||
case 76: break;
|
case 76: break;
|
||||||
case 27:
|
case 13:
|
||||||
{ currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
|
{ currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);
|
||||||
}
|
}
|
||||||
case 77: break;
|
case 77: break;
|
||||||
case 14:
|
case 16:
|
||||||
{ currentTokType = SUB_HEADING; yybegin(STRING);
|
{ currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;
|
||||||
}
|
}
|
||||||
case 78: break;
|
case 78: break;
|
||||||
case 30:
|
case 12:
|
||||||
{ positionInc = 1; return APOSTROPHE;
|
{ currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/
|
||||||
}
|
}
|
||||||
case 79: break;
|
case 79: break;
|
||||||
case 32:
|
case 6:
|
||||||
{ positionInc = 1; return NUM;
|
{ yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
|
||||||
}
|
}
|
||||||
case 80: break;
|
case 80: break;
|
||||||
case 15:
|
case 32:
|
||||||
{ currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); return currentTokType;
|
{ positionInc = 1; return APOSTROPHE;
|
||||||
}
|
}
|
||||||
case 81: break;
|
case 81: break;
|
||||||
case 6:
|
case 19:
|
||||||
{ yybegin(INTERNAL_LINK_STATE); return currentTokType;
|
{ yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/
|
||||||
}
|
}
|
||||||
case 82: break;
|
case 82: break;
|
||||||
|
case 34:
|
||||||
|
{ positionInc = 1; return NUM;
|
||||||
|
}
|
||||||
|
case 83: break;
|
||||||
|
case 44:
|
||||||
|
{ currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);
|
||||||
|
}
|
||||||
|
case 84: break;
|
||||||
case 2:
|
case 2:
|
||||||
{ positionInc = 1; return ALPHANUM;
|
{ positionInc = 1; return ALPHANUM;
|
||||||
}
|
}
|
||||||
case 83: break;
|
case 85: break;
|
||||||
case 33:
|
case 35:
|
||||||
{ positionInc = 1; return COMPANY;
|
{ positionInc = 1; return COMPANY;
|
||||||
}
|
}
|
||||||
case 84: break;
|
case 86: break;
|
||||||
case 10:
|
case 11:
|
||||||
{ currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE);
|
{ currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE);
|
||||||
}
|
}
|
||||||
case 85: break;
|
case 87: break;
|
||||||
case 26:
|
case 29:
|
||||||
|
{ currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);
|
||||||
|
}
|
||||||
|
case 88: break;
|
||||||
|
case 4:
|
||||||
|
{ numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);
|
||||||
|
}
|
||||||
|
case 89: break;
|
||||||
|
case 27:
|
||||||
{ numLinkToks = 0; yybegin(YYINITIAL);
|
{ numLinkToks = 0; yybegin(YYINITIAL);
|
||||||
}
|
}
|
||||||
case 86: break;
|
case 90: break;
|
||||||
default:
|
default:
|
||||||
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
|
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
|
||||||
zzAtEOF = true;
|
zzAtEOF = true;
|
||||||
|
|
|
@ -54,6 +54,10 @@ private int currentTokType;
|
||||||
private int numBalanced = 0;
|
private int numBalanced = 0;
|
||||||
private int positionInc = 1;
|
private int positionInc = 1;
|
||||||
private int numLinkToks = 0;
|
private int numLinkToks = 0;
|
||||||
|
//Any time we start on a new Wiki reserved token (category, link, etc.) this value will be 0; otherwise it will be the number of tokens seen
|
||||||
|
//this can be useful for detecting when a new reserved token is encountered
|
||||||
|
//see https://issues.apache.org/jira/browse/LUCENE-1133
|
||||||
|
private int numWikiTokensSeen = 0;
|
||||||
|
|
||||||
public static final String [] TOKEN_TYPES = new String [] {
|
public static final String [] TOKEN_TYPES = new String [] {
|
||||||
"<ALPHANUM>",
|
"<ALPHANUM>",
|
||||||
|
@ -76,6 +80,14 @@ public static final String [] TOKEN_TYPES = new String [] {
|
||||||
WikipediaTokenizer.EXTERNAL_LINK_URL
|
WikipediaTokenizer.EXTERNAL_LINK_URL
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
Returns the number of tokens seen inside a category or link, etc.
|
||||||
|
@return the number of tokens seen inside the context of wiki syntax.
|
||||||
|
**/
|
||||||
|
public final int getNumWikiTokensSeen(){
|
||||||
|
return numWikiTokensSeen;
|
||||||
|
}
|
||||||
|
|
||||||
public final int yychar()
|
public final int yychar()
|
||||||
{
|
{
|
||||||
return yychar;
|
return yychar;
|
||||||
|
@ -88,9 +100,17 @@ public final int getPositionIncrement(){
|
||||||
/**
|
/**
|
||||||
* Fills Lucene token with the current token text.
|
* Fills Lucene token with the current token text.
|
||||||
*/
|
*/
|
||||||
final void getText(Token t, int tokType) {
|
final void getText(Token t) {
|
||||||
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final int setText(StringBuffer buffer){
|
||||||
|
int length = zzMarkedPos - zzStartRead;
|
||||||
|
buffer.append(zzBuffer, zzStartRead, length);
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
%}
|
%}
|
||||||
|
|
||||||
// basic word: a sequence of digits & letters
|
// basic word: a sequence of digits & letters
|
||||||
|
@ -191,21 +211,21 @@ DOUBLE_EQUALS = "="{2}
|
||||||
//First {ALPHANUM} is always the link, set positioninc to 1 for double bracket, but then inside the internal link state
|
//First {ALPHANUM} is always the link, set positioninc to 1 for double bracket, but then inside the internal link state
|
||||||
//set it to 0 for the next token, such that the link and the first token are in the same position, but then subsequent
|
//set it to 0 for the next token, such that the link and the first token are in the same position, but then subsequent
|
||||||
//tokens within the link are incremented
|
//tokens within the link are incremented
|
||||||
{DOUBLE_BRACKET} {positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);}
|
{DOUBLE_BRACKET} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);}
|
||||||
{DOUBLE_BRACKET_CAT} {positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);}
|
{DOUBLE_BRACKET_CAT} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);}
|
||||||
{EXTERNAL_LINK} {positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);}
|
{EXTERNAL_LINK} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);}
|
||||||
{TWO_SINGLE_QUOTES} {positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}}
|
{TWO_SINGLE_QUOTES} {numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}}
|
||||||
{DOUBLE_EQUALS} {positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);}
|
{DOUBLE_EQUALS} {numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);}
|
||||||
{DOUBLE_BRACE} {positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
|
{DOUBLE_BRACE} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
|
||||||
{CITATION} {positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
|
{CITATION} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
|
||||||
//ignore
|
//ignore
|
||||||
. | {WHITESPACE} |{INFOBOX} { positionInc = 1; }
|
. | {WHITESPACE} |{INFOBOX} {numWikiTokensSeen = 0; positionInc = 1; }
|
||||||
}
|
}
|
||||||
|
|
||||||
<INTERNAL_LINK_STATE>{
|
<INTERNAL_LINK_STATE>{
|
||||||
//First {ALPHANUM} is always the link, set position to 0 for these
|
//First {ALPHANUM} is always the link, set position to 0 for these
|
||||||
//This is slightly different from EXTERNAL_LINK_STATE because that one has an explicit grammar for capturing the URL
|
//This is slightly different from EXTERNAL_LINK_STATE because that one has an explicit grammar for capturing the URL
|
||||||
{ALPHANUM} {yybegin(INTERNAL_LINK_STATE); return currentTokType;}
|
{ALPHANUM} {yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;}
|
||||||
{DOUBLE_BRACKET_CLOSE} {numLinkToks = 0; yybegin(YYINITIAL);}
|
{DOUBLE_BRACKET_CLOSE} {numLinkToks = 0; yybegin(YYINITIAL);}
|
||||||
//ignore
|
//ignore
|
||||||
. | {WHITESPACE} { positionInc = 1; }
|
. | {WHITESPACE} { positionInc = 1; }
|
||||||
|
@ -213,14 +233,14 @@ DOUBLE_EQUALS = "="{2}
|
||||||
|
|
||||||
<EXTERNAL_LINK_STATE>{
|
<EXTERNAL_LINK_STATE>{
|
||||||
//increment the link token, but then don't increment the tokens after that which are still in the link
|
//increment the link token, but then don't increment the tokens after that which are still in the link
|
||||||
("http://"|"https://"){HOST}("/"?({ALPHANUM}|{P}|\?|"&"|"="|"#")*)* {positionInc = 1; yybegin(EXTERNAL_LINK_STATE); return currentTokType;}
|
("http://"|"https://"){HOST}("/"?({ALPHANUM}|{P}|\?|"&"|"="|"#")*)* {positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;}
|
||||||
{ALPHANUM} {if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;}
|
{ALPHANUM} {if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;}
|
||||||
"]" {numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL);}
|
"]" {numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL);}
|
||||||
{WHITESPACE} { positionInc = 1; }
|
{WHITESPACE} { positionInc = 1; }
|
||||||
}
|
}
|
||||||
|
|
||||||
<CATEGORY_STATE>{
|
<CATEGORY_STATE>{
|
||||||
{ALPHANUM} {yybegin(CATEGORY_STATE); return currentTokType;}
|
{ALPHANUM} {yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;}
|
||||||
{DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);}
|
{DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);}
|
||||||
//ignore
|
//ignore
|
||||||
. | {WHITESPACE} { positionInc = 1; }
|
. | {WHITESPACE} { positionInc = 1; }
|
||||||
|
@ -229,22 +249,22 @@ DOUBLE_EQUALS = "="{2}
|
||||||
<TWO_SINGLE_QUOTES_STATE>{
|
<TWO_SINGLE_QUOTES_STATE>{
|
||||||
"'" {currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE);}
|
"'" {currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE);}
|
||||||
"'''" {currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE);}
|
"'''" {currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE);}
|
||||||
{ALPHANUM} {currentTokType = ITALICS; yybegin(STRING); return currentTokType;/*italics*/}
|
{ALPHANUM} {currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/}
|
||||||
//we can have links inside, let those override
|
//we can have links inside, let those override
|
||||||
{DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
|
{DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);}
|
||||||
{DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
|
{DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);}
|
||||||
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
|
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);}
|
||||||
|
|
||||||
//ignore
|
//ignore
|
||||||
. | {WHITESPACE} { /* ignore */ }
|
. | {WHITESPACE} { /* ignore */ }
|
||||||
}
|
}
|
||||||
//bold
|
//bold
|
||||||
<THREE_SINGLE_QUOTES_STATE>{
|
<THREE_SINGLE_QUOTES_STATE>{
|
||||||
{ALPHANUM} {yybegin(STRING);return currentTokType;}
|
{ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;}
|
||||||
//we can have links inside, let those override
|
//we can have links inside, let those override
|
||||||
{DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
|
{DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);}
|
||||||
{DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
|
{DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);}
|
||||||
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
|
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);}
|
||||||
|
|
||||||
//ignore
|
//ignore
|
||||||
. | {WHITESPACE} { /* ignore */ }
|
. | {WHITESPACE} { /* ignore */ }
|
||||||
|
@ -252,26 +272,26 @@ DOUBLE_EQUALS = "="{2}
|
||||||
}
|
}
|
||||||
//bold italics
|
//bold italics
|
||||||
<FIVE_SINGLE_QUOTES_STATE>{
|
<FIVE_SINGLE_QUOTES_STATE>{
|
||||||
{ALPHANUM} {yybegin(STRING);return currentTokType;}
|
{ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;}
|
||||||
//we can have links inside, let those override
|
//we can have links inside, let those override
|
||||||
{DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
|
{DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);}
|
||||||
{DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
|
{DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);}
|
||||||
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
|
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);}
|
||||||
|
|
||||||
//ignore
|
//ignore
|
||||||
. | {WHITESPACE} { /* ignore */ }
|
. | {WHITESPACE} { /* ignore */ }
|
||||||
}
|
}
|
||||||
|
|
||||||
<DOUBLE_EQUALS_STATE>{
|
<DOUBLE_EQUALS_STATE>{
|
||||||
"=" {currentTokType = SUB_HEADING; yybegin(STRING);}
|
"=" {currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING);}
|
||||||
{ALPHANUM} {currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); return currentTokType;}
|
{ALPHANUM} {currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;}
|
||||||
{DOUBLE_EQUALS} {yybegin(YYINITIAL);}
|
{DOUBLE_EQUALS} {yybegin(YYINITIAL);}
|
||||||
//ignore
|
//ignore
|
||||||
. | {WHITESPACE} { /* ignore */ }
|
. | {WHITESPACE} { /* ignore */ }
|
||||||
}
|
}
|
||||||
|
|
||||||
<DOUBLE_BRACE_STATE>{
|
<DOUBLE_BRACE_STATE>{
|
||||||
{ALPHANUM} {yybegin(DOUBLE_BRACE_STATE); return currentTokType;}
|
{ALPHANUM} {yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;}
|
||||||
{DOUBLE_BRACE_CLOSE} {yybegin(YYINITIAL);}
|
{DOUBLE_BRACE_CLOSE} {yybegin(YYINITIAL);}
|
||||||
{CITATION_CLOSE} {yybegin(YYINITIAL);}
|
{CITATION_CLOSE} {yybegin(YYINITIAL);}
|
||||||
//ignore
|
//ignore
|
||||||
|
@ -283,11 +303,11 @@ DOUBLE_EQUALS = "="{2}
|
||||||
"'''" {numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/}
|
"'''" {numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/}
|
||||||
"''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/}
|
"''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/}
|
||||||
"===" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/}
|
"===" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/}
|
||||||
{ALPHANUM} {yybegin(STRING); return currentTokType;/* STRING ALPHANUM*/}
|
{ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/}
|
||||||
//we can have links inside, let those override
|
//we can have links inside, let those override
|
||||||
{DOUBLE_BRACKET} {numBalanced = 0;currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
|
{DOUBLE_BRACKET} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
|
||||||
{DOUBLE_BRACKET_CAT} {numBalanced = 0;currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
|
{DOUBLE_BRACKET_CAT} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
|
||||||
{EXTERNAL_LINK} {numBalanced = 0;currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
|
{EXTERNAL_LINK} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
|
||||||
|
|
||||||
|
|
||||||
{PIPE} {yybegin(STRING); return currentTokType;/*pipe*/}
|
{PIPE} {yybegin(STRING); return currentTokType;/*pipe*/}
|
||||||
|
|
|
@ -22,8 +22,11 @@ import junit.framework.TestCase;
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Token;
|
||||||
|
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
import java.io.IOException;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.HashSet;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -31,6 +34,7 @@ import java.util.Map;
|
||||||
*
|
*
|
||||||
**/
|
**/
|
||||||
public class WikipediaTokenizerTest extends TestCase {
|
public class WikipediaTokenizerTest extends TestCase {
|
||||||
|
protected static final String LINK_PHRASES = "click [[link here again]] click [http://lucene.apache.org here again] [[Category:a b c d]]";
|
||||||
|
|
||||||
|
|
||||||
public WikipediaTokenizerTest(String s) {
|
public WikipediaTokenizerTest(String s) {
|
||||||
|
@ -155,8 +159,13 @@ public class WikipediaTokenizerTest extends TestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testLinkPhrases() throws Exception {
|
public void testLinkPhrases() throws Exception {
|
||||||
String test = "click [[link here again]] click [http://lucene.apache.org here again]";
|
|
||||||
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
|
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(LINK_PHRASES));
|
||||||
|
checkLinkPhrases(tf);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private void checkLinkPhrases(WikipediaTokenizer tf) throws IOException {
|
||||||
Token token = new Token();
|
Token token = new Token();
|
||||||
token = tf.next(token);
|
token = tf.next(token);
|
||||||
assertTrue("token is null and it shouldn't be", token != null);
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
@ -201,7 +210,33 @@ public class WikipediaTokenizerTest extends TestCase {
|
||||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "again",
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "again",
|
||||||
new String(token.termBuffer(), 0, token.termLength()).equals("again") == true);
|
new String(token.termBuffer(), 0, token.termLength()).equals("again") == true);
|
||||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
|
||||||
|
token = tf.next(token);
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("a") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
|
||||||
|
token = tf.next(token);
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "b",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("b") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
|
||||||
|
token = tf.next(token);
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "c",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("c") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
|
||||||
|
token = tf.next(token);
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "d",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("d") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is not null and it should be", token == null);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testLinks() throws Exception {
|
public void testLinks() throws Exception {
|
||||||
|
@ -225,5 +260,317 @@ public class WikipediaTokenizerTest extends TestCase {
|
||||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "https://lucene.apache.org/java/docs/index.html?b=c",
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "https://lucene.apache.org/java/docs/index.html?b=c",
|
||||||
new String(token.termBuffer(), 0, token.termLength()).equals("https://lucene.apache.org/java/docs/index.html?b=c") == true);
|
new String(token.termBuffer(), 0, token.termLength()).equals("https://lucene.apache.org/java/docs/index.html?b=c") == true);
|
||||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is not null and it should be", token == null);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testLucene1133() throws Exception {
|
||||||
|
Set untoks = new HashSet();
|
||||||
|
untoks.add(WikipediaTokenizer.CATEGORY);
|
||||||
|
untoks.add(WikipediaTokenizer.ITALICS);
|
||||||
|
//should be exactly the same, regardless of untoks
|
||||||
|
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(LINK_PHRASES), WikipediaTokenizer.TOKENS_ONLY, untoks);
|
||||||
|
checkLinkPhrases(tf);
|
||||||
|
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
|
||||||
|
tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.UNTOKENIZED_ONLY, untoks);
|
||||||
|
Token token;
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a b c d",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("a b c d") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 18, token.endOffset() == 18);
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e f g",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("e f g") == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37);
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 42, token.startOffset() == 42);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 46, token.endOffset() == 46);
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 47, token.startOffset() == 47);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 51, token.endOffset() == 51);
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 56, token.startOffset() == 56);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 60, token.endOffset() == 60);
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "there",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("there") == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 61, token.startOffset() == 61);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 66, token.endOffset() == 66);
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics here",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("italics here") == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83);
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "something",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("something") == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 86, token.startOffset() == 86);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 95, token.endOffset() == 95);
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more italics",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("more italics") == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110);
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h i j",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("h i j") == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 124, token.startOffset() == 124);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133);
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is not null and it should be", token == null);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testBoth() throws Exception {
|
||||||
|
Set untoks = new HashSet();
|
||||||
|
untoks.add(WikipediaTokenizer.CATEGORY);
|
||||||
|
untoks.add(WikipediaTokenizer.ITALICS);
|
||||||
|
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
|
||||||
|
//should output all the indivual tokens plus the untokenized tokens as well. Untokenized tokens
|
||||||
|
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks);
|
||||||
|
Token token;
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a b c d",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("a b c d") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||||
|
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 18, token.endOffset() == 18);
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("a") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||||
|
assertTrue(token.getFlags() + " equals: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG + " and it shouldn't", token.getFlags() != WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 12, token.endOffset() == 12);
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "b",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("b") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 13, token.startOffset() == 13);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 14, token.endOffset() == 14);
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "c",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("c") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 15, token.startOffset() == 15);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 16, token.endOffset() == 16);
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "d",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("d") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 17, token.startOffset() == 17);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 18, token.endOffset() == 18);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e f g",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("e f g") == true);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||||
|
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37);
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("e") == true);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 33, token.endOffset() == 33);
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "f",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("f") == true);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 34, token.startOffset() == 34);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 35, token.endOffset() == 35);
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "g",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("g") == true);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 36, token.startOffset() == 36);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37);
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 42, token.startOffset() == 42);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 46, token.endOffset() == 46);
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 47, token.startOffset() == 47);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 51, token.endOffset() == 51);
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 56, token.startOffset() == 56);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 60, token.endOffset() == 60);
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "there",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("there") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 61, token.startOffset() == 61);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 66, token.endOffset() == 66);
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics here",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("italics here") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
|
||||||
|
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83);
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("italics") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 78, token.endOffset() == 78);
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 79, token.startOffset() == 79);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83);
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "something",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("something") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 86, token.startOffset() == 86);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 95, token.endOffset() == 95);
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more italics",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("more italics") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
|
||||||
|
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110);
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("more") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 102, token.endOffset() == 102);
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("italics") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
|
||||||
|
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 103, token.startOffset() == 103);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110);
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h i j",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("h i j") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||||
|
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 124, token.startOffset() == 124);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133);
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("h") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 124, token.startOffset() == 124);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 125, token.endOffset() == 125);
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "i",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("i") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 128, token.startOffset() == 128);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 129, token.endOffset() == 129);
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is null and it shouldn't be", token != null);
|
||||||
|
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "j",
|
||||||
|
new String(token.termBuffer(), 0, token.termLength()).equals("j") == true);
|
||||||
|
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||||
|
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||||
|
assertTrue(token.startOffset() + " does not equal: " + 132, token.startOffset() == 132);
|
||||||
|
assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133);
|
||||||
|
|
||||||
|
token = tf.next();
|
||||||
|
assertTrue("token is not null and it should be", token == null);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue