mirror of https://github.com/apache/lucene.git
LUCENE-1103
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@608852 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2d633f98a2
commit
f715fc6031
|
@ -250,6 +250,7 @@
|
|||
<packageset dir="contrib/spellchecker/src/java"/>
|
||||
<packageset dir="contrib/surround/src/java"/>
|
||||
<packageset dir="contrib/swing/src/java"/>
|
||||
<packageset dir="contrib/wikipedia/src/java"/>
|
||||
<packageset dir="contrib/wordnet/src/java"/>
|
||||
<packageset dir="contrib/xml-query-parser/src/java"/>
|
||||
<!-- end alpha sort -->
|
||||
|
@ -279,6 +280,7 @@
|
|||
<group title="contrib: SpellChecker" packages="org.apache.lucene.search.spell*"/>
|
||||
<group title="contrib: Surround Parser" packages="org.apache.lucene.queryParser.surround*"/>
|
||||
<group title="contrib: Swing" packages="org.apache.lucene.swing*"/>
|
||||
<group title="contrib: Wikipedia" packages="org.apache.lucene.wikipedia*"/>
|
||||
<group title="contrib: WordNet" packages="org.apache.lucene.wordnet*"/>
|
||||
<group title="contrib: XML Query Parser" packages="org.apache.lucene.xmlparser*"/>
|
||||
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
<?xml version="1.0"?>
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<project name="wikipedia" default="default">

  <description>
    Tools for working with Wikipedia
  </description>

  <import file="../contrib-build.xml"/>

  <!-- Regenerates the scanner: removes the old generated source, then runs JFlex. -->
  <target name="jflex" depends="clean-jflex,jflex-wiki-tokenizer"/>

  <!-- Runs JFlex on the tokenizer grammar; skipped unless jflex.present is set. -->
  <target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
    <taskdef classname="JFlex.anttask.JFlexTask" name="jflex">
      <classpath location="${jflex.home}/lib/JFlex.jar"/>
    </taskdef>

    <jflex file="src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex"
           outdir="src/java/org/apache/lucene/wikipedia/analysis"
           nobak="on"/>
  </target>

  <!--
    Deletes JFlex-generated sources. The fileset must point at the same
    directory the jflex task writes to (see "outdir" above); "*.java" only
    matches files directly inside "dir", so the parent directory would never
    match the generated scanner.
  -->
  <target name="clean-jflex">
    <delete>
      <fileset dir="src/java/org/apache/lucene/wikipedia/analysis" includes="*.java">
        <containsregexp expression="generated.*by.*JFlex"/>
      </fileset>
    </delete>
  </target>
</project>
|
|
@ -0,0 +1,43 @@
|
|||
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
-->
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-contrib</artifactId>
|
||||
<version>@version@</version>
|
||||
</parent>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-wikipedia</artifactId>
|
||||
<name>Lucene Wikipedia Tools</name>
|
||||
<version>@version@</version>
|
||||
<description>Lucene Wikipedia Contributions</description>
|
||||
<packaging>jar</packaging>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.lucene</groupId>
|
||||
<artifactId>lucene-benchmark</artifactId>
|
||||
<version>@version@</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
|
@ -0,0 +1,100 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.wikipedia.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.IOException;
|
||||
|
||||
|
||||
/**
|
||||
* A {@link Tokenizer} that is aware of Wikipedia syntax. It is based on the
|
||||
* Wikipedia tutorial available at http://en.wikipedia.org/wiki/Wikipedia:Tutorial, but it may not be complete.
|
||||
*
|
||||
*
|
||||
**/
|
||||
public class WikipediaTokenizer extends Tokenizer {
|
||||
public static final String INTERNAL_LINK = "il";
|
||||
public static final String EXTERNAL_LINK = "el";
|
||||
//The URL part of the link, i.e. the first token
|
||||
public static final String EXTERNAL_LINK_URL = "elu";
|
||||
public static final String CITATION = "ci";
|
||||
public static final String CATEGORY = "c";
|
||||
public static final String BOLD = "b";
|
||||
public static final String ITALICS = "i";
|
||||
public static final String BOLD_ITALICS = "bi";
|
||||
public static final String HEADING = "h";
|
||||
public static final String SUB_HEADING = "sh";
|
||||
/**
|
||||
* A private instance of the JFlex-constructed scanner
|
||||
*/
|
||||
private final WikipediaTokenizerImpl scanner;
|
||||
|
||||
void setInput(Reader reader) {
|
||||
this.input = reader;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
|
||||
* <code>input</code> to a newly created JFlex scanner.
|
||||
* @param input The Input Reader
|
||||
*/
|
||||
public WikipediaTokenizer(Reader input) {
|
||||
this.input = input;
|
||||
this.scanner = new WikipediaTokenizerImpl(input);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see org.apache.lucene.analysis.TokenStream#next()
|
||||
*/
|
||||
public Token next(Token result) throws IOException {
|
||||
int tokenType = scanner.getNextToken();
|
||||
|
||||
if (tokenType == WikipediaTokenizerImpl.YYEOF) {
|
||||
return null;
|
||||
}
|
||||
|
||||
scanner.getText(result, tokenType);
|
||||
final int start = scanner.yychar();
|
||||
result.setStartOffset(start);
|
||||
result.setEndOffset(start + result.termLength());
|
||||
result.setPositionIncrement(scanner.getPositionIncrement());
|
||||
result.setType(WikipediaTokenizerImpl.TOKEN_TYPES[tokenType]);
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see org.apache.lucene.analysis.TokenStream#reset()
|
||||
*/
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
scanner.yyreset(input);
|
||||
}
|
||||
|
||||
public void reset(Reader reader) throws IOException {
|
||||
input = reader;
|
||||
reset();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,949 @@
|
|||
/* The following code was generated by JFlex 1.4.1 on 1/3/08 10:05 PM */
|
||||
|
||||
package org.apache.lucene.wikipedia.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
|
||||
/**
|
||||
* This class is a scanner generated by
|
||||
* <a href="http://www.jflex.de/">JFlex</a> 1.4.1
|
||||
* on 1/3/08 10:05 PM from the specification file
|
||||
* <tt>/Volumes/User/grantingersoll/projects/lucene/Lucene-Trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex</tt>
|
||||
*/
|
||||
class WikipediaTokenizerImpl {
|
||||
|
||||
/** This character denotes the end of file */
|
||||
public static final int YYEOF = -1;
|
||||
|
||||
/** initial size of the lookahead buffer */
|
||||
private static final int ZZ_BUFFERSIZE = 16384;
|
||||
|
||||
/** lexical states */
|
||||
public static final int DOUBLE_BRACE_STATE = 7;
|
||||
public static final int INTERNAL_LINK_STATE = 2;
|
||||
public static final int TWO_SINGLE_QUOTES_STATE = 4;
|
||||
public static final int CATEGORY_STATE = 1;
|
||||
public static final int FIVE_SINGLE_QUOTES_STATE = 5;
|
||||
public static final int STRING = 8;
|
||||
public static final int YYINITIAL = 0;
|
||||
public static final int DOUBLE_EQUALS_STATE = 6;
|
||||
public static final int THREE_SINGLE_QUOTES_STATE = 5;
|
||||
public static final int EXTERNAL_LINK_STATE = 3;
|
||||
|
||||
/**
|
||||
* Translates characters to character classes
|
||||
*/
|
||||
private static final String ZZ_CMAP_PACKED =
|
||||
"\11\0\1\24\1\23\1\0\1\24\1\22\22\0\1\24\1\0\1\12"+
|
||||
"\1\52\2\0\1\3\1\1\4\0\1\14\1\5\1\2\1\10\12\16"+
|
||||
"\1\27\1\0\1\7\1\11\1\13\1\52\1\4\2\15\1\30\5\15"+
|
||||
"\1\41\21\15\1\25\1\0\1\26\1\0\1\6\1\0\1\31\1\43"+
|
||||
"\2\15\1\33\1\40\1\34\1\50\1\41\4\15\1\42\1\35\1\51"+
|
||||
"\1\15\1\36\1\15\1\32\3\15\1\44\1\37\1\15\1\45\1\47"+
|
||||
"\1\46\102\0\27\15\1\0\37\15\1\0\u0568\15\12\17\206\15\12\17"+
|
||||
"\u026c\15\12\17\166\15\12\17\166\15\12\17\166\15\12\17\166\15\12\17"+
|
||||
"\167\15\11\17\166\15\12\17\166\15\12\17\166\15\12\17\340\15\12\17"+
|
||||
"\166\15\12\17\u0166\15\12\17\266\15\u0100\15\u0e00\15\u1040\0\u0150\21\140\0"+
|
||||
"\20\21\u0100\0\200\21\200\0\u19c0\21\100\0\u5200\21\u0c00\0\u2bb0\20\u2150\0"+
|
||||
"\u0200\21\u0465\0\73\21\75\15\43\0";
|
||||
|
||||
/**
|
||||
* Translates characters to character classes
|
||||
*/
|
||||
private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
|
||||
|
||||
/**
|
||||
* Translates DFA states to action switch labels.
|
||||
*/
|
||||
private static final int [] ZZ_ACTION = zzUnpackAction();
|
||||
|
||||
private static final String ZZ_ACTION_PACKED_0 =
|
||||
"\11\0\4\1\4\2\1\3\1\1\1\4\2\1\1\5"+
|
||||
"\1\1\1\6\2\7\1\10\1\11\1\10\1\12\1\13"+
|
||||
"\1\7\1\14\1\15\1\16\1\17\1\7\1\20\1\7"+
|
||||
"\4\21\1\22\1\21\1\23\1\24\1\25\3\0\1\26"+
|
||||
"\14\0\1\27\1\30\1\10\1\0\1\31\1\0\1\32"+
|
||||
"\1\0\1\33\3\0\1\34\1\35\2\36\1\35\2\37"+
|
||||
"\2\0\1\36\1\0\14\36\1\35\3\0\1\10\1\40"+
|
||||
"\3\0\1\41\1\42\5\0\1\43\4\0\1\43\2\0"+
|
||||
"\2\43\2\0\1\10\5\0\1\30\1\35\1\36\1\44"+
|
||||
"\5\0\1\45\30\0\1\46\2\0\1\47\1\50\1\51";
|
||||
|
||||
private static int [] zzUnpackAction() {
|
||||
int [] result = new int[174];
|
||||
int offset = 0;
|
||||
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
private static int zzUnpackAction(String packed, int offset, int [] result) {
|
||||
int i = 0; /* index in packed string */
|
||||
int j = offset; /* index in unpacked array */
|
||||
int l = packed.length();
|
||||
while (i < l) {
|
||||
int count = packed.charAt(i++);
|
||||
int value = packed.charAt(i++);
|
||||
do result[j++] = value; while (--count > 0);
|
||||
}
|
||||
return j;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Translates a state to a row index in the transition table
|
||||
*/
|
||||
private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
|
||||
|
||||
private static final String ZZ_ROWMAP_PACKED_0 =
|
||||
"\0\0\0\53\0\126\0\201\0\254\0\327\0\u0102\0\u012d"+
|
||||
"\0\u0158\0\u0183\0\u01ae\0\u01d9\0\u0204\0\u022f\0\u025a\0\u0285"+
|
||||
"\0\u02b0\0\u0183\0\u02db\0\u0306\0\u0331\0\u035c\0\u0387\0\u03b2"+
|
||||
"\0\u03dd\0\u0183\0\u035c\0\u0408\0\u0183\0\u0433\0\u045e\0\u0489"+
|
||||
"\0\u04b4\0\u04df\0\u050a\0\u0535\0\u0560\0\u058b\0\u05b6\0\u05e1"+
|
||||
"\0\u0183\0\u060c\0\u035c\0\u0637\0\u0662\0\u068d\0\u06b8\0\u0183"+
|
||||
"\0\u0183\0\u06e3\0\u070e\0\u0739\0\u0183\0\u0764\0\u078f\0\u07ba"+
|
||||
"\0\u07e5\0\u0810\0\u083b\0\u0866\0\u0891\0\u08bc\0\u08e7\0\u0912"+
|
||||
"\0\u093d\0\u0968\0\u0993\0\u09be\0\u09e9\0\u0a14\0\u0a3f\0\u0a6a"+
|
||||
"\0\u0a95\0\u0ac0\0\u0aeb\0\u0b16\0\u0b41\0\u0b6c\0\u0b97\0\u0bc2"+
|
||||
"\0\u0bed\0\u0c18\0\u07ba\0\u0c43\0\u0c6e\0\u0c99\0\u0cc4\0\u0cef"+
|
||||
"\0\u0d1a\0\u0d45\0\u0d70\0\u0d9b\0\u0dc6\0\u0df1\0\u0e1c\0\u0e47"+
|
||||
"\0\u0e72\0\u0e9d\0\u0ec8\0\u0ef3\0\u0f1e\0\u0f49\0\u0f74\0\u0f9f"+
|
||||
"\0\u0fca\0\u0183\0\u0ff5\0\u1020\0\u104b\0\u1076\0\u0183\0\u10a1"+
|
||||
"\0\u10cc\0\u10f7\0\u1122\0\u114d\0\u1178\0\u11a3\0\u11ce\0\u11f9"+
|
||||
"\0\u1224\0\u124f\0\u127a\0\u12a5\0\u078f\0\u0912\0\u12d0\0\u12fb"+
|
||||
"\0\u1326\0\u1351\0\u137c\0\u13a7\0\u13d2\0\u13fd\0\u0183\0\u1428"+
|
||||
"\0\u1453\0\u147e\0\u14a9\0\u14d4\0\u14ff\0\u152a\0\u1555\0\u0183"+
|
||||
"\0\u1580\0\u15ab\0\u15d6\0\u1601\0\u162c\0\u1657\0\u1682\0\u16ad"+
|
||||
"\0\u16d8\0\u1703\0\u172e\0\u1759\0\u1784\0\u17af\0\u17da\0\u1805"+
|
||||
"\0\u1830\0\u185b\0\u1886\0\u18b1\0\u18dc\0\u1907\0\u1932\0\u195d"+
|
||||
"\0\u1988\0\u19b3\0\u19de\0\u0183\0\u0183\0\u0183";
|
||||
|
||||
private static int [] zzUnpackRowMap() {
|
||||
int [] result = new int[174];
|
||||
int offset = 0;
|
||||
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
private static int zzUnpackRowMap(String packed, int offset, int [] result) {
|
||||
int i = 0; /* index in packed string */
|
||||
int j = offset; /* index in unpacked array */
|
||||
int l = packed.length();
|
||||
while (i < l) {
|
||||
int high = packed.charAt(i++) << 16;
|
||||
result[j++] = high | packed.charAt(i++);
|
||||
}
|
||||
return j;
|
||||
}
|
||||
|
||||
/**
|
||||
* The transition table of the DFA
|
||||
*/
|
||||
private static final int [] ZZ_TRANS = zzUnpackTrans();
|
||||
|
||||
private static final String ZZ_TRANS_PACKED_0 =
|
||||
"\1\12\1\13\5\12\1\14\1\12\1\15\3\12\1\16"+
|
||||
"\1\17\1\20\1\21\1\22\1\23\2\12\1\24\2\12"+
|
||||
"\15\16\1\25\2\12\2\16\10\12\1\26\5\12\4\27"+
|
||||
"\1\12\1\23\3\12\1\30\1\12\15\27\3\12\2\27"+
|
||||
"\10\12\1\26\5\12\4\31\1\12\1\23\3\12\1\30"+
|
||||
"\1\12\15\31\3\12\2\31\1\12\7\32\1\33\5\32"+
|
||||
"\4\34\1\32\1\23\2\12\1\32\1\35\1\32\15\34"+
|
||||
"\3\32\1\36\1\34\2\32\1\37\5\32\1\33\5\32"+
|
||||
"\4\40\1\32\1\41\2\32\1\42\2\32\15\40\3\32"+
|
||||
"\2\40\10\32\1\33\5\32\4\43\1\32\1\41\2\32"+
|
||||
"\1\42\2\32\15\43\3\32\2\43\10\32\1\33\1\32"+
|
||||
"\1\44\3\32\4\45\1\32\1\41\5\32\15\45\3\32"+
|
||||
"\2\45\10\32\1\46\5\32\4\47\1\32\1\41\5\32"+
|
||||
"\15\47\1\32\1\50\1\32\2\47\1\32\1\51\1\52"+
|
||||
"\5\51\1\53\1\51\1\54\3\51\4\55\1\51\1\56"+
|
||||
"\2\51\1\57\2\51\15\55\2\51\1\60\2\55\1\51"+
|
||||
"\54\0\1\61\61\0\1\62\4\0\4\63\7\0\6\63"+
|
||||
"\1\64\6\63\3\0\2\63\12\0\1\65\42\0\1\66"+
|
||||
"\1\67\1\70\1\71\2\72\1\0\1\73\3\0\1\73"+
|
||||
"\1\16\1\17\1\20\1\21\7\0\15\16\3\0\2\16"+
|
||||
"\3\0\1\74\1\0\1\75\2\76\1\0\1\77\3\0"+
|
||||
"\1\77\3\17\1\21\7\0\15\17\3\0\2\17\2\0"+
|
||||
"\1\66\1\100\1\70\1\71\2\76\1\0\1\77\3\0"+
|
||||
"\1\77\1\20\1\17\1\20\1\21\7\0\15\20\3\0"+
|
||||
"\2\20\3\0\1\101\1\0\1\75\2\72\1\0\1\73"+
|
||||
"\3\0\1\73\4\21\7\0\15\21\3\0\2\21\24\0"+
|
||||
"\1\12\54\0\1\102\72\0\1\103\15\0\1\62\4\0"+
|
||||
"\4\63\7\0\15\63\3\0\2\63\16\0\4\27\7\0"+
|
||||
"\15\27\3\0\2\27\27\0\1\35\41\0\4\31\7\0"+
|
||||
"\15\31\3\0\2\31\16\0\4\34\7\0\15\34\3\0"+
|
||||
"\2\34\16\0\4\34\7\0\2\34\1\104\12\34\3\0"+
|
||||
"\2\34\2\0\1\105\66\0\4\40\7\0\15\40\3\0"+
|
||||
"\2\40\24\0\1\32\54\0\1\106\42\0\4\43\7\0"+
|
||||
"\15\43\3\0\2\43\12\0\1\35\56\0\4\45\7\0"+
|
||||
"\15\45\3\0\2\45\11\0\1\107\4\0\4\63\7\0"+
|
||||
"\15\63\3\0\2\63\16\0\4\47\7\0\15\47\3\0"+
|
||||
"\2\47\47\0\1\35\5\0\1\110\62\0\1\111\56\0"+
|
||||
"\4\55\7\0\15\55\3\0\2\55\24\0\1\51\54\0"+
|
||||
"\1\112\42\0\4\63\7\0\15\63\3\0\2\63\14\0"+
|
||||
"\1\32\1\0\4\113\1\0\3\114\3\0\15\113\3\0"+
|
||||
"\2\113\14\0\1\32\1\0\4\113\1\0\3\114\3\0"+
|
||||
"\3\113\1\115\11\113\3\0\2\113\16\0\1\116\1\0"+
|
||||
"\1\116\10\0\15\116\3\0\2\116\16\0\1\117\1\120"+
|
||||
"\1\121\1\122\7\0\15\117\3\0\2\117\16\0\1\123"+
|
||||
"\1\0\1\123\10\0\15\123\3\0\2\123\16\0\1\124"+
|
||||
"\1\125\1\124\1\125\7\0\15\124\3\0\2\124\16\0"+
|
||||
"\1\126\2\127\1\130\7\0\15\126\3\0\2\126\16\0"+
|
||||
"\1\73\2\131\10\0\15\73\3\0\2\73\16\0\1\132"+
|
||||
"\2\133\1\134\7\0\15\132\3\0\2\132\16\0\4\125"+
|
||||
"\7\0\15\125\3\0\2\125\16\0\1\135\2\136\1\137"+
|
||||
"\7\0\15\135\3\0\2\135\16\0\1\140\2\141\1\142"+
|
||||
"\7\0\15\140\3\0\2\140\16\0\1\143\1\133\1\144"+
|
||||
"\1\134\7\0\15\143\3\0\2\143\16\0\1\145\2\120"+
|
||||
"\1\122\7\0\15\145\3\0\2\145\30\0\1\146\1\147"+
|
||||
"\63\0\1\150\26\0\4\34\7\0\2\34\1\151\12\34"+
|
||||
"\3\0\2\34\2\0\1\152\100\0\1\153\1\154\37\0"+
|
||||
"\4\63\7\0\6\63\1\155\6\63\3\0\2\63\2\0"+
|
||||
"\1\156\62\0\1\157\70\0\1\160\1\161\33\0\1\162"+
|
||||
"\1\0\1\32\1\0\4\113\1\0\3\114\3\0\15\113"+
|
||||
"\3\0\2\113\16\0\4\163\1\0\3\114\3\0\15\163"+
|
||||
"\3\0\2\163\12\0\1\162\1\0\1\32\1\0\4\113"+
|
||||
"\1\0\3\114\3\0\10\113\1\164\4\113\3\0\2\113"+
|
||||
"\2\0\1\66\13\0\1\116\1\0\1\116\10\0\15\116"+
|
||||
"\3\0\2\116\3\0\1\165\1\0\1\75\2\166\6\0"+
|
||||
"\1\117\1\120\1\121\1\122\7\0\15\117\3\0\2\117"+
|
||||
"\3\0\1\167\1\0\1\75\2\170\1\0\1\171\3\0"+
|
||||
"\1\171\3\120\1\122\7\0\15\120\3\0\2\120\3\0"+
|
||||
"\1\172\1\0\1\75\2\170\1\0\1\171\3\0\1\171"+
|
||||
"\1\121\1\120\1\121\1\122\7\0\15\121\3\0\2\121"+
|
||||
"\3\0\1\173\1\0\1\75\2\166\6\0\4\122\7\0"+
|
||||
"\15\122\3\0\2\122\3\0\1\174\2\0\1\174\7\0"+
|
||||
"\1\124\1\125\1\124\1\125\7\0\15\124\3\0\2\124"+
|
||||
"\3\0\1\174\2\0\1\174\7\0\4\125\7\0\15\125"+
|
||||
"\3\0\2\125\3\0\1\166\1\0\1\75\2\166\6\0"+
|
||||
"\1\126\2\127\1\130\7\0\15\126\3\0\2\126\3\0"+
|
||||
"\1\170\1\0\1\75\2\170\1\0\1\171\3\0\1\171"+
|
||||
"\3\127\1\130\7\0\15\127\3\0\2\127\3\0\1\166"+
|
||||
"\1\0\1\75\2\166\6\0\4\130\7\0\15\130\3\0"+
|
||||
"\2\130\3\0\1\171\2\0\2\171\1\0\1\171\3\0"+
|
||||
"\1\171\3\131\10\0\15\131\3\0\2\131\3\0\1\101"+
|
||||
"\1\0\1\75\2\72\1\0\1\73\3\0\1\73\1\132"+
|
||||
"\2\133\1\134\7\0\15\132\3\0\2\132\3\0\1\74"+
|
||||
"\1\0\1\75\2\76\1\0\1\77\3\0\1\77\3\133"+
|
||||
"\1\134\7\0\15\133\3\0\2\133\3\0\1\101\1\0"+
|
||||
"\1\75\2\72\1\0\1\73\3\0\1\73\4\134\7\0"+
|
||||
"\15\134\3\0\2\134\3\0\1\72\1\0\1\75\2\72"+
|
||||
"\1\0\1\73\3\0\1\73\1\135\2\136\1\137\7\0"+
|
||||
"\15\135\3\0\2\135\3\0\1\76\1\0\1\75\2\76"+
|
||||
"\1\0\1\77\3\0\1\77\3\136\1\137\7\0\15\136"+
|
||||
"\3\0\2\136\3\0\1\72\1\0\1\75\2\72\1\0"+
|
||||
"\1\73\3\0\1\73\4\137\7\0\15\137\3\0\2\137"+
|
||||
"\3\0\1\73\2\0\2\73\1\0\1\73\3\0\1\73"+
|
||||
"\1\140\2\141\1\142\7\0\15\140\3\0\2\140\3\0"+
|
||||
"\1\77\2\0\2\77\1\0\1\77\3\0\1\77\3\141"+
|
||||
"\1\142\7\0\15\141\3\0\2\141\3\0\1\73\2\0"+
|
||||
"\2\73\1\0\1\73\3\0\1\73\4\142\7\0\15\142"+
|
||||
"\3\0\2\142\3\0\1\175\1\0\1\75\2\72\1\0"+
|
||||
"\1\73\3\0\1\73\1\143\1\133\1\144\1\134\7\0"+
|
||||
"\15\143\3\0\2\143\3\0\1\176\1\0\1\75\2\76"+
|
||||
"\1\0\1\77\3\0\1\77\1\144\1\133\1\144\1\134"+
|
||||
"\7\0\15\144\3\0\2\144\3\0\1\173\1\0\1\75"+
|
||||
"\2\166\6\0\1\145\2\120\1\122\7\0\15\145\3\0"+
|
||||
"\2\145\31\0\1\147\53\0\1\177\63\0\1\200\25\0"+
|
||||
"\4\34\7\0\15\34\3\0\1\34\1\201\31\0\1\154"+
|
||||
"\53\0\1\202\34\0\1\32\1\0\4\113\1\0\3\114"+
|
||||
"\3\0\3\113\1\203\11\113\3\0\2\113\2\0\1\204"+
|
||||
"\101\0\1\161\53\0\1\205\33\0\1\206\51\0\1\162"+
|
||||
"\3\0\4\163\7\0\15\163\3\0\2\163\12\0\1\162"+
|
||||
"\1\0\1\207\1\0\4\113\1\0\3\114\3\0\15\113"+
|
||||
"\3\0\2\113\16\0\1\210\1\122\1\210\1\122\7\0"+
|
||||
"\15\210\3\0\2\210\16\0\4\130\7\0\15\130\3\0"+
|
||||
"\2\130\16\0\4\134\7\0\15\134\3\0\2\134\16\0"+
|
||||
"\4\137\7\0\15\137\3\0\2\137\16\0\4\142\7\0"+
|
||||
"\15\142\3\0\2\142\16\0\1\211\1\134\1\211\1\134"+
|
||||
"\7\0\15\211\3\0\2\211\16\0\4\122\7\0\15\122"+
|
||||
"\3\0\2\122\16\0\4\212\7\0\15\212\3\0\2\212"+
|
||||
"\33\0\1\213\60\0\1\214\27\0\4\34\6\0\1\215"+
|
||||
"\15\34\3\0\2\34\33\0\1\216\31\0\1\162\1\0"+
|
||||
"\1\32\1\0\4\113\1\0\3\114\3\0\10\113\1\217"+
|
||||
"\4\113\3\0\2\113\2\0\1\220\103\0\1\221\35\0"+
|
||||
"\4\222\7\0\15\222\3\0\2\222\3\0\1\165\1\0"+
|
||||
"\1\75\2\166\6\0\1\210\1\122\1\210\1\122\7\0"+
|
||||
"\15\210\3\0\2\210\3\0\1\175\1\0\1\75\2\72"+
|
||||
"\1\0\1\73\3\0\1\73\1\211\1\134\1\211\1\134"+
|
||||
"\7\0\15\211\3\0\2\211\3\0\1\174\2\0\1\174"+
|
||||
"\7\0\4\212\7\0\15\212\3\0\2\212\34\0\1\223"+
|
||||
"\54\0\1\224\25\0\1\225\75\0\1\226\30\0\1\162"+
|
||||
"\1\0\1\35\1\0\4\113\1\0\3\114\3\0\15\113"+
|
||||
"\3\0\2\113\34\0\1\227\31\0\1\230\2\0\4\222"+
|
||||
"\7\0\15\222\3\0\2\222\35\0\1\231\61\0\1\232"+
|
||||
"\17\0\1\233\76\0\1\234\52\0\1\235\31\0\1\32"+
|
||||
"\1\0\4\163\1\0\3\114\3\0\15\163\3\0\2\163"+
|
||||
"\36\0\1\236\52\0\1\237\32\0\4\240\7\0\15\240"+
|
||||
"\3\0\2\240\36\0\1\241\52\0\1\242\53\0\1\243"+
|
||||
"\60\0\1\244\10\0\1\245\12\0\4\240\7\0\15\240"+
|
||||
"\3\0\2\240\37\0\1\246\52\0\1\247\53\0\1\250"+
|
||||
"\21\0\1\12\61\0\4\251\7\0\15\251\3\0\2\251"+
|
||||
"\40\0\1\252\52\0\1\253\42\0\1\254\25\0\2\251"+
|
||||
"\1\0\2\251\1\0\2\251\2\0\5\251\7\0\15\251"+
|
||||
"\3\0\3\251\27\0\1\255\52\0\1\256\23\0";
|
||||
|
||||
private static int [] zzUnpackTrans() {
|
||||
int [] result = new int[6665];
|
||||
int offset = 0;
|
||||
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
private static int zzUnpackTrans(String packed, int offset, int [] result) {
|
||||
int i = 0; /* index in packed string */
|
||||
int j = offset; /* index in unpacked array */
|
||||
int l = packed.length();
|
||||
while (i < l) {
|
||||
int count = packed.charAt(i++);
|
||||
int value = packed.charAt(i++);
|
||||
value--;
|
||||
do result[j++] = value; while (--count > 0);
|
||||
}
|
||||
return j;
|
||||
}
|
||||
|
||||
|
||||
/* error codes */
|
||||
private static final int ZZ_UNKNOWN_ERROR = 0;
|
||||
private static final int ZZ_NO_MATCH = 1;
|
||||
private static final int ZZ_PUSHBACK_2BIG = 2;
|
||||
|
||||
/* error messages for the codes above */
|
||||
private static final String ZZ_ERROR_MSG[] = {
|
||||
"Unkown internal scanner error",
|
||||
"Error: could not match input",
|
||||
"Error: pushback value was too large"
|
||||
};
|
||||
|
||||
/**
|
||||
* ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
|
||||
*/
|
||||
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
|
||||
|
||||
private static final String ZZ_ATTRIBUTE_PACKED_0 =
|
||||
"\11\0\1\11\7\1\1\11\7\1\1\11\2\1\1\11"+
|
||||
"\13\1\1\11\6\1\2\11\3\0\1\11\14\0\3\1"+
|
||||
"\1\0\1\1\1\0\1\1\1\0\1\1\3\0\7\1"+
|
||||
"\2\0\1\1\1\0\15\1\3\0\1\1\1\11\3\0"+
|
||||
"\1\1\1\11\5\0\1\1\4\0\1\1\2\0\2\1"+
|
||||
"\2\0\1\1\5\0\1\11\3\1\5\0\1\11\30\0"+
|
||||
"\1\1\2\0\3\11";
|
||||
|
||||
private static int [] zzUnpackAttribute() {
|
||||
int [] result = new int[174];
|
||||
int offset = 0;
|
||||
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
private static int zzUnpackAttribute(String packed, int offset, int [] result) {
|
||||
int i = 0; /* index in packed string */
|
||||
int j = offset; /* index in unpacked array */
|
||||
int l = packed.length();
|
||||
while (i < l) {
|
||||
int count = packed.charAt(i++);
|
||||
int value = packed.charAt(i++);
|
||||
do result[j++] = value; while (--count > 0);
|
||||
}
|
||||
return j;
|
||||
}
|
||||
|
||||
/** the input device */
|
||||
private java.io.Reader zzReader;
|
||||
|
||||
/** the current state of the DFA */
|
||||
private int zzState;
|
||||
|
||||
/** the current lexical state */
|
||||
private int zzLexicalState = YYINITIAL;
|
||||
|
||||
/** this buffer contains the current text to be matched and is
|
||||
the source of the yytext() string */
|
||||
private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
|
||||
|
||||
/** the textposition at the last accepting state */
|
||||
private int zzMarkedPos;
|
||||
|
||||
/** the textposition at the last state to be included in yytext */
|
||||
private int zzPushbackPos;
|
||||
|
||||
/** the current text position in the buffer */
|
||||
private int zzCurrentPos;
|
||||
|
||||
/** startRead marks the beginning of the yytext() string in the buffer */
|
||||
private int zzStartRead;
|
||||
|
||||
/** endRead marks the last character in the buffer, that has been read
|
||||
from input */
|
||||
private int zzEndRead;
|
||||
|
||||
/** number of newlines encountered up to the start of the matched text */
|
||||
private int yyline;
|
||||
|
||||
/** the number of characters up to the start of the matched text */
|
||||
private int yychar;
|
||||
|
||||
/**
|
||||
* the number of characters from the last newline up to the start of the
|
||||
* matched text
|
||||
*/
|
||||
private int yycolumn;
|
||||
|
||||
/**
|
||||
* zzAtBOL == true <=> the scanner is currently at the beginning of a line
|
||||
*/
|
||||
private boolean zzAtBOL = true;
|
||||
|
||||
/** zzAtEOF == true <=> the scanner is at the EOF */
|
||||
private boolean zzAtEOF;
|
||||
|
||||
/* user code: */
|
||||
|
||||
public static final int ALPHANUM = 0;
|
||||
public static final int APOSTROPHE = 1;
|
||||
public static final int ACRONYM = 2;
|
||||
public static final int COMPANY = 3;
|
||||
public static final int EMAIL = 4;
|
||||
public static final int HOST = 5;
|
||||
public static final int NUM = 6;
|
||||
public static final int CJ = 7;
|
||||
public static final int INTERNAL_LINK = 8;
|
||||
public static final int EXTERNAL_LINK = 9;
|
||||
public static final int CITATION = 10;
|
||||
public static final int CATEGORY = 11;
|
||||
public static final int BOLD = 12;
|
||||
public static final int ITALICS = 13;
|
||||
public static final int BOLD_ITALICS = 14;
|
||||
public static final int HEADING = 15;
|
||||
public static final int SUB_HEADING = 16;
|
||||
public static final int EXTERNAL_LINK_URL = 17;
|
||||
|
||||
|
||||
private int currentTokType;
|
||||
private int numBalanced = 0;
|
||||
private int positionInc = 1;
|
||||
|
||||
public static final String [] TOKEN_TYPES = new String [] {
|
||||
"<ALPHANUM>",
|
||||
"<APOSTROPHE>",
|
||||
"<ACRONYM>",
|
||||
"<COMPANY>",
|
||||
"<EMAIL>",
|
||||
"<HOST>",
|
||||
"<NUM>",
|
||||
"<CJ>",
|
||||
WikipediaTokenizer.INTERNAL_LINK,
|
||||
WikipediaTokenizer.EXTERNAL_LINK,
|
||||
WikipediaTokenizer.CITATION,
|
||||
WikipediaTokenizer.CATEGORY,
|
||||
WikipediaTokenizer.BOLD,
|
||||
WikipediaTokenizer.ITALICS,
|
||||
WikipediaTokenizer.BOLD_ITALICS,
|
||||
WikipediaTokenizer.HEADING,
|
||||
WikipediaTokenizer.SUB_HEADING,
|
||||
WikipediaTokenizer.EXTERNAL_LINK_URL
|
||||
};
|
||||
|
||||
public final int yychar()
|
||||
{
|
||||
return yychar;
|
||||
}
|
||||
|
||||
public final int getPositionIncrement(){
|
||||
return positionInc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills Lucene token with the current token text.
|
||||
*/
|
||||
final void getText(Token t, int tokType) {
|
||||
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Creates a new scanner
|
||||
* There is also a java.io.InputStream version of this constructor.
|
||||
*
|
||||
* @param in the java.io.Reader to read input from.
|
||||
*/
|
||||
WikipediaTokenizerImpl(java.io.Reader in) {
  // Only stores the reader; nothing is read until the buffer is first refilled.
  zzReader = in;
}
|
||||
|
||||
/**
|
||||
* Creates a new scanner.
|
||||
* There is also java.io.Reader version of this constructor.
|
||||
*
|
||||
* @param in the java.io.Inputstream to read input from.
|
||||
*/
|
||||
WikipediaTokenizerImpl(java.io.InputStream in) {
  // Delegates to the Reader constructor. InputStreamReader(InputStream)
  // decodes bytes with the platform default charset.
  this(new java.io.InputStreamReader(in));
}
|
||||
|
||||
/**
|
||||
* Unpacks the compressed character translation table.
|
||||
*
|
||||
* @param packed the packed character translation table
|
||||
* @return the unpacked character translation table
|
||||
*/
|
||||
private static char [] zzUnpackCMap(String packed) {
|
||||
char [] map = new char[0x10000];
|
||||
int i = 0; /* index in packed string */
|
||||
int j = 0; /* index in unpacked array */
|
||||
while (i < 230) {
|
||||
int count = packed.charAt(i++);
|
||||
char value = packed.charAt(i++);
|
||||
do map[j++] = value; while (--count > 0);
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Refills the input buffer.
|
||||
*
|
||||
* @return <code>false</code>, iff there was new input.
|
||||
*
|
||||
* @exception java.io.IOException if any I/O-Error occurs
|
||||
*/
|
||||
private boolean zzRefill() throws java.io.IOException {
|
||||
|
||||
/* first: make room (if you can) */
|
||||
if (zzStartRead > 0) {
|
||||
System.arraycopy(zzBuffer, zzStartRead,
|
||||
zzBuffer, 0,
|
||||
zzEndRead-zzStartRead);
|
||||
|
||||
/* translate stored positions */
|
||||
zzEndRead-= zzStartRead;
|
||||
zzCurrentPos-= zzStartRead;
|
||||
zzMarkedPos-= zzStartRead;
|
||||
zzPushbackPos-= zzStartRead;
|
||||
zzStartRead = 0;
|
||||
}
|
||||
|
||||
/* is the buffer big enough? */
|
||||
if (zzCurrentPos >= zzBuffer.length) {
|
||||
/* if not: blow it up */
|
||||
char newBuffer[] = new char[zzCurrentPos*2];
|
||||
System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
|
||||
zzBuffer = newBuffer;
|
||||
}
|
||||
|
||||
/* finally: fill the buffer with new input */
|
||||
int numRead = zzReader.read(zzBuffer, zzEndRead,
|
||||
zzBuffer.length-zzEndRead);
|
||||
|
||||
if (numRead < 0) {
|
||||
return true;
|
||||
}
|
||||
else {
|
||||
zzEndRead+= numRead;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Closes the input stream.
|
||||
*/
|
||||
public final void yyclose() throws java.io.IOException {
|
||||
zzAtEOF = true; /* indicate end of file */
|
||||
zzEndRead = zzStartRead; /* invalidate buffer */
|
||||
|
||||
if (zzReader != null)
|
||||
zzReader.close();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Resets the scanner to read from a new input stream.
|
||||
* Does not close the old reader.
|
||||
*
|
||||
* All internal variables are reset, the old input stream
|
||||
* <b>cannot</b> be reused (internal buffer is discarded and lost).
|
||||
* Lexical state is set to <tt>YYINITIAL</tt>.
|
||||
*
|
||||
* @param reader the new input stream
|
||||
*/
|
||||
public final void yyreset(java.io.Reader reader) {
|
||||
zzReader = reader;
|
||||
zzAtBOL = true;
|
||||
zzAtEOF = false;
|
||||
zzEndRead = zzStartRead = 0;
|
||||
zzCurrentPos = zzMarkedPos = zzPushbackPos = 0;
|
||||
yyline = yychar = yycolumn = 0;
|
||||
zzLexicalState = YYINITIAL;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the current lexical state.
|
||||
*/
|
||||
public final int yystate() {
|
||||
return zzLexicalState;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Enters a new lexical state
|
||||
*
|
||||
* @param newState the new lexical state
|
||||
*/
|
||||
public final void yybegin(int newState) {
|
||||
zzLexicalState = newState;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the text matched by the current regular expression.
|
||||
*/
|
||||
public final String yytext() {
|
||||
return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the character at position <tt>pos</tt> from the
|
||||
* matched text.
|
||||
*
|
||||
* It is equivalent to yytext().charAt(pos), but faster
|
||||
*
|
||||
* @param pos the position of the character to fetch.
|
||||
* A value from 0 to yylength()-1.
|
||||
*
|
||||
* @return the character at position pos
|
||||
*/
|
||||
public final char yycharat(int pos) {
|
||||
return zzBuffer[zzStartRead+pos];
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the length of the matched text region.
|
||||
*/
|
||||
public final int yylength() {
|
||||
return zzMarkedPos-zzStartRead;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Reports an error that occured while scanning.
|
||||
*
|
||||
* In a wellformed scanner (no or only correct usage of
|
||||
* yypushback(int) and a match-all fallback rule) this method
|
||||
* will only be called with things that "Can't Possibly Happen".
|
||||
* If this method is called, something is seriously wrong
|
||||
* (e.g. a JFlex bug producing a faulty scanner etc.).
|
||||
*
|
||||
* Usual syntax/scanner level error handling should be done
|
||||
* in error fallback rules.
|
||||
*
|
||||
* @param errorCode the code of the errormessage to display
|
||||
*/
|
||||
private void zzScanError(int errorCode) {
|
||||
String message;
|
||||
try {
|
||||
message = ZZ_ERROR_MSG[errorCode];
|
||||
}
|
||||
catch (ArrayIndexOutOfBoundsException e) {
|
||||
message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
|
||||
}
|
||||
|
||||
throw new Error(message);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Pushes the specified amount of characters back into the input stream.
|
||||
*
|
||||
* They will be read again by then next call of the scanning method
|
||||
*
|
||||
* @param number the number of characters to be read again.
|
||||
* This number must not be greater than yylength()!
|
||||
*/
|
||||
public void yypushback(int number) {
|
||||
if ( number > yylength() )
|
||||
zzScanError(ZZ_PUSHBACK_2BIG);
|
||||
|
||||
zzMarkedPos -= number;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Resumes scanning until the next regular expression is matched,
|
||||
* the end of input is encountered or an I/O-Error occurs.
|
||||
*
|
||||
* @return the next token
|
||||
* @exception java.io.IOException if any I/O-Error occurs
|
||||
*/
|
||||
public int getNextToken() throws java.io.IOException {
|
||||
int zzInput;
|
||||
int zzAction;
|
||||
|
||||
// cached fields:
|
||||
int zzCurrentPosL;
|
||||
int zzMarkedPosL;
|
||||
int zzEndReadL = zzEndRead;
|
||||
char [] zzBufferL = zzBuffer;
|
||||
char [] zzCMapL = ZZ_CMAP;
|
||||
|
||||
int [] zzTransL = ZZ_TRANS;
|
||||
int [] zzRowMapL = ZZ_ROWMAP;
|
||||
int [] zzAttrL = ZZ_ATTRIBUTE;
|
||||
|
||||
while (true) {
|
||||
zzMarkedPosL = zzMarkedPos;
|
||||
|
||||
yychar+= zzMarkedPosL-zzStartRead;
|
||||
|
||||
zzAction = -1;
|
||||
|
||||
zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
|
||||
|
||||
zzState = zzLexicalState;
|
||||
|
||||
|
||||
zzForAction: {
|
||||
while (true) {
|
||||
|
||||
if (zzCurrentPosL < zzEndReadL)
|
||||
zzInput = zzBufferL[zzCurrentPosL++];
|
||||
else if (zzAtEOF) {
|
||||
zzInput = YYEOF;
|
||||
break zzForAction;
|
||||
}
|
||||
else {
|
||||
// store back cached positions
|
||||
zzCurrentPos = zzCurrentPosL;
|
||||
zzMarkedPos = zzMarkedPosL;
|
||||
boolean eof = zzRefill();
|
||||
// get translated positions and possibly new buffer
|
||||
zzCurrentPosL = zzCurrentPos;
|
||||
zzMarkedPosL = zzMarkedPos;
|
||||
zzBufferL = zzBuffer;
|
||||
zzEndReadL = zzEndRead;
|
||||
if (eof) {
|
||||
zzInput = YYEOF;
|
||||
break zzForAction;
|
||||
}
|
||||
else {
|
||||
zzInput = zzBufferL[zzCurrentPosL++];
|
||||
}
|
||||
}
|
||||
int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
|
||||
if (zzNext == -1) break zzForAction;
|
||||
zzState = zzNext;
|
||||
|
||||
int zzAttributes = zzAttrL[zzState];
|
||||
if ( (zzAttributes & 1) == 1 ) {
|
||||
zzAction = zzState;
|
||||
zzMarkedPosL = zzCurrentPosL;
|
||||
if ( (zzAttributes & 8) == 8 ) break zzForAction;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// store back cached position
|
||||
zzMarkedPos = zzMarkedPosL;
|
||||
|
||||
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
|
||||
case 7:
|
||||
{ /* ignore */
|
||||
}
|
||||
case 42: break;
|
||||
case 3:
|
||||
{ positionInc = 1; return CJ;
|
||||
}
|
||||
case 43: break;
|
||||
case 26:
|
||||
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/
|
||||
}
|
||||
case 44: break;
|
||||
case 37:
|
||||
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/
|
||||
}
|
||||
case 45: break;
|
||||
case 11:
|
||||
{ currentTokType = ITALICS; yybegin(STRING); return currentTokType;/*italics*/
|
||||
}
|
||||
case 46: break;
|
||||
case 5:
|
||||
{ yybegin(CATEGORY_STATE); return currentTokType;
|
||||
}
|
||||
case 47: break;
|
||||
case 34:
|
||||
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/
|
||||
}
|
||||
case 48: break;
|
||||
case 24:
|
||||
{ positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);
|
||||
}
|
||||
case 49: break;
|
||||
case 22:
|
||||
{ positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);
|
||||
}
|
||||
case 50: break;
|
||||
case 39:
|
||||
{ positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);
|
||||
}
|
||||
case 51: break;
|
||||
case 18:
|
||||
{ yybegin(STRING); return currentTokType;/* STRING ALPHANUM*/
|
||||
}
|
||||
case 52: break;
|
||||
case 21:
|
||||
{ positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}
|
||||
}
|
||||
case 53: break;
|
||||
case 1:
|
||||
{ positionInc = 1;
|
||||
}
|
||||
case 54: break;
|
||||
case 41:
|
||||
{ numBalanced = 0;currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
|
||||
}
|
||||
case 55: break;
|
||||
case 9:
|
||||
{ yybegin(YYINITIAL);
|
||||
}
|
||||
case 56: break;
|
||||
case 19:
|
||||
{ numBalanced = 0;currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);
|
||||
}
|
||||
case 57: break;
|
||||
case 13:
|
||||
{ yybegin(STRING);return currentTokType;
|
||||
}
|
||||
case 58: break;
|
||||
case 36:
|
||||
{ positionInc = 1; return EMAIL;
|
||||
}
|
||||
case 59: break;
|
||||
case 35:
|
||||
{ positionInc = 1; return ACRONYM;
|
||||
}
|
||||
case 60: break;
|
||||
case 4:
|
||||
{ positionInc = 1;currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);
|
||||
}
|
||||
case 61: break;
|
||||
case 17:
|
||||
{ /* ignore STRING */
|
||||
}
|
||||
case 62: break;
|
||||
case 40:
|
||||
{ currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
|
||||
}
|
||||
case 63: break;
|
||||
case 20:
|
||||
{ yybegin(STRING); return currentTokType;/*pipe*/
|
||||
}
|
||||
case 64: break;
|
||||
case 12:
|
||||
{ currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);
|
||||
}
|
||||
case 65: break;
|
||||
case 27:
|
||||
{ numBalanced = 0;currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
|
||||
}
|
||||
case 66: break;
|
||||
case 33:
|
||||
{ numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/
|
||||
}
|
||||
case 67: break;
|
||||
case 16:
|
||||
{ yybegin(DOUBLE_BRACE_STATE); return currentTokType;
|
||||
}
|
||||
case 68: break;
|
||||
case 29:
|
||||
{ positionInc = 1; return HOST;
|
||||
}
|
||||
case 69: break;
|
||||
case 32:
|
||||
{ currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE);
|
||||
}
|
||||
case 70: break;
|
||||
case 25:
|
||||
{ currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
|
||||
}
|
||||
case 71: break;
|
||||
case 23:
|
||||
{ positionInc = 0; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);
|
||||
}
|
||||
case 72: break;
|
||||
case 14:
|
||||
{ currentTokType = SUB_HEADING; yybegin(STRING);
|
||||
}
|
||||
case 73: break;
|
||||
case 28:
|
||||
{ positionInc = 1; return APOSTROPHE;
|
||||
}
|
||||
case 74: break;
|
||||
case 30:
|
||||
{ positionInc = 1; return NUM;
|
||||
}
|
||||
case 75: break;
|
||||
case 15:
|
||||
{ currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); return currentTokType;
|
||||
}
|
||||
case 76: break;
|
||||
case 6:
|
||||
{ yybegin(INTERNAL_LINK_STATE); return currentTokType;
|
||||
}
|
||||
case 77: break;
|
||||
case 2:
|
||||
{ positionInc = 1; return ALPHANUM;
|
||||
}
|
||||
case 78: break;
|
||||
case 31:
|
||||
{ positionInc = 1; return COMPANY;
|
||||
}
|
||||
case 79: break;
|
||||
case 10:
|
||||
{ currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE);
|
||||
}
|
||||
case 80: break;
|
||||
case 8:
|
||||
{ positionInc = 1; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE);return currentTokType;
|
||||
}
|
||||
case 81: break;
|
||||
case 38:
|
||||
{ positionInc = 0; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
|
||||
}
|
||||
case 82: break;
|
||||
default:
|
||||
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
|
||||
zzAtEOF = true;
|
||||
return YYEOF;
|
||||
}
|
||||
else {
|
||||
zzScanError(ZZ_NO_MATCH);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,324 @@
|
|||
package org.apache.lucene.wikipedia.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
%%
|
||||
|
||||
%class WikipediaTokenizerImpl
|
||||
%unicode
|
||||
%integer
|
||||
%function getNextToken
|
||||
%pack
|
||||
%char
|
||||
|
||||
%{
|
||||
|
||||
// Token type codes returned by getNextToken(); each code is also the
// index of the matching label in TOKEN_TYPES below.
public static final int ALPHANUM          = 0;
public static final int APOSTROPHE        = 1;
public static final int ACRONYM           = 2;
public static final int COMPANY           = 3;
public static final int EMAIL             = 4;
public static final int HOST              = 5;
public static final int NUM               = 6;
public static final int CJ                = 7;
public static final int INTERNAL_LINK     = 8;
public static final int EXTERNAL_LINK     = 9;
public static final int CITATION          = 10;
public static final int CATEGORY          = 11;
public static final int BOLD              = 12;
public static final int ITALICS           = 13;
public static final int BOLD_ITALICS      = 14;
public static final int HEADING           = 15;
public static final int SUB_HEADING       = 16;
public static final int EXTERNAL_LINK_URL = 17;

// type reported for words while inside a wiki markup construct
private int currentTokType;
// toggled by the '' rule and cleared by the rules that leave the STRING state
private int numBalanced = 0;
// position increment for the most recently returned token
private int positionInc = 1;

public static final String [] TOKEN_TYPES = {
  "<ALPHANUM>",                          // 0
  "<APOSTROPHE>",                        // 1
  "<ACRONYM>",                           // 2
  "<COMPANY>",                           // 3
  "<EMAIL>",                             // 4
  "<HOST>",                              // 5
  "<NUM>",                               // 6
  "<CJ>",                                // 7
  WikipediaTokenizer.INTERNAL_LINK,      // 8
  WikipediaTokenizer.EXTERNAL_LINK,      // 9
  WikipediaTokenizer.CITATION,           // 10
  WikipediaTokenizer.CATEGORY,           // 11
  WikipediaTokenizer.BOLD,               // 12
  WikipediaTokenizer.ITALICS,            // 13
  WikipediaTokenizer.BOLD_ITALICS,       // 14
  WikipediaTokenizer.HEADING,            // 15
  WikipediaTokenizer.SUB_HEADING,        // 16
  WikipediaTokenizer.EXTERNAL_LINK_URL   // 17
};
|
||||
|
||||
/**
 * Exposes the scanner's character counter.
 *
 * @return the current value of <code>yychar</code>
 */
public final int yychar() {
  return this.yychar;
}
|
||||
|
||||
/**
 * Reports the position increment recorded by the scanner actions.
 *
 * @return the current value of <code>positionInc</code>
 */
public final int getPositionIncrement() {
  return this.positionInc;
}
|
||||
|
||||
/**
 * Copies the text of the current match into the term buffer of the
 * given Lucene token.
 *
 * @param t        the token to fill
 * @param tokType  the token type; not used by this method
 */
final void getText(Token t, int tokType) {
  final int length = zzMarkedPos - zzStartRead;
  t.setTermBuffer(zzBuffer, zzStartRead, length);
}
|
||||
%}
|
||||
|
||||
// ---------------------------------------------------------------------
// Plain-text token macros
// ---------------------------------------------------------------------

// basic word: one or more letters, digits or Korean characters
ALPHANUM   = ({LETTER}|{DIGIT}|{KOREAN})+

// words with internal apostrophes: O'Reilly, you're, O'Reilly's
// (possessives are expected to be removed by a post-filter)
APOSTROPHE = {ALPHA} ("'" {ALPHA})+

// dotted acronyms: U.S.A., I.B.M., etc.
// (dots are expected to be removed by a post-filter)
ACRONYM    = {ALPHA} "." ({ALPHA} ".")+

// company names such as AT&T and Excite@Home
COMPANY    = {ALPHA} ("&"|"@") {ALPHA}

// email addresses
EMAIL      = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+

// host names: words joined by dots
HOST       = {ALPHANUM} ((".") {ALPHANUM})+

// floating point, serial, model numbers, ip addresses, etc.:
// punctuation-separated segments in which every other segment
// must contain at least one digit
NUM        = ({ALPHANUM} {P} {HAS_DIGIT}
           | {DIGIT}+ {P} {DIGIT}+
           | {HAS_DIGIT} {P} {ALPHANUM}
           | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
           | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
           | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
           | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)

// an opening or closing markup tag with optional name="value" attributes
TAGS       = "<"\/?{ALPHANUM}({WHITESPACE}*{ALPHANUM}=\"{ALPHANUM}\")*">"

// punctuation allowed between the segments of a NUM
P          = ("_"|"-"|"/"|"."|",")

// letters and digits containing at least one digit
HAS_DIGIT  =
    ({LETTER}|{DIGIT})*
    {DIGIT}
    ({LETTER}|{DIGIT})*

// one or more letters
ALPHA      = ({LETTER})+

// ---------------------------------------------------------------------
// Character classes
// ---------------------------------------------------------------------

LETTER     = [\u0041-\u005a\u0061-\u007a\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff\u0100-\u1fff\uffa0-\uffdc]

DIGIT      = [\u0030-\u0039\u0660-\u0669\u06f0-\u06f9\u0966-\u096f\u09e6-\u09ef\u0a66-\u0a6f\u0ae6-\u0aef\u0b66-\u0b6f\u0be7-\u0bef\u0c66-\u0c6f\u0ce6-\u0cef\u0d66-\u0d6f\u0e50-\u0e59\u0ed0-\u0ed9\u1040-\u1049]

KOREAN     = [\uac00-\ud7af\u1100-\u11ff]

// Chinese, Japanese
CJ         = [\u3040-\u318f\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]

WHITESPACE = \r\n | [ \r\n\t\f]

// ---------------------------------------------------------------------
// Wikipedia markup delimiters
// ---------------------------------------------------------------------

DOUBLE_BRACKET       = "["{2}
DOUBLE_BRACKET_CLOSE = "]"{2}
// "[[Category:" with an optional leading colon, as in "[[:Category:"
DOUBLE_BRACKET_CAT   = "["{2}":"?"Category:"
EXTERNAL_LINK        = "["
TWO_SINGLE_QUOTES    = "'"{2}
CITATION             = "<ref>"
CITATION_CLOSE       = "</ref>"
// "{{Infobox_" or "{{infobox_"
INFOBOX              = {DOUBLE_BRACE}("I"|"i")nfobox_

DOUBLE_BRACE         = "{"{2}
DOUBLE_BRACE_CLOSE   = "}"{2}
PIPE                 = "|"
DOUBLE_EQUALS        = "="{2}
|
||||
|
||||
|
||||
// inside [[Category:...]] / [[...]] / [...]
%state CATEGORY_STATE
%state INTERNAL_LINK_STATE
%state EXTERNAL_LINK_STATE

// entered after '' has been seen; the following quotes pick bold / bold italics
%state TWO_SINGLE_QUOTES_STATE
%state THREE_SINGLE_QUOTES_STATE
%state FIVE_SINGLE_QUOTES_STATE
// entered after == has been seen
%state DOUBLE_EQUALS_STATE
// inside {{...}} or <ref>...</ref>
%state DOUBLE_BRACE_STATE
// inside emphasised or sub-heading text, until its closing marker
%state STRING
|
||||
|
||||
%%
|
||||
|
||||
// Plain-text tokens: each one advances the position by one.
<YYINITIAL> {
  {ALPHANUM}    { positionInc = 1; return ALPHANUM; }
  {APOSTROPHE}  { positionInc = 1; return APOSTROPHE; }
  {ACRONYM}     { positionInc = 1; return ACRONYM; }
  {COMPANY}     { positionInc = 1; return COMPANY; }
  {EMAIL}       { positionInc = 1; return EMAIL; }
  {NUM}         { positionInc = 1; return NUM; }
  {HOST}        { positionInc = 1; return HOST; }
  {CJ}          { positionInc = 1; return CJ; }
}
|
||||
|
||||
// Wikipedia markup openers: none of these rules returns a token; each one
// records the type for the words that follow and switches state.
<YYINITIAL> {
  // the first {ALPHANUM} after "[[" is the link itself, so its position increment is 0
  {DOUBLE_BRACKET} {
    positionInc = 0;
    currentTokType = INTERNAL_LINK;
    yybegin(INTERNAL_LINK_STATE);
  }
  {DOUBLE_BRACKET_CAT} {
    positionInc = 1;
    currentTokType = CATEGORY;
    yybegin(CATEGORY_STATE);
  }
  {EXTERNAL_LINK} {
    positionInc = 1;
    currentTokType = EXTERNAL_LINK_URL;
    yybegin(EXTERNAL_LINK_STATE);
  }
  {TWO_SINGLE_QUOTES} {
    positionInc = 1;
    if (numBalanced == 0) {
      numBalanced++;
      yybegin(TWO_SINGLE_QUOTES_STATE);
    } else {
      numBalanced = 0;
    }
  }
  {DOUBLE_EQUALS} {
    positionInc = 1;
    yybegin(DOUBLE_EQUALS_STATE);
  }
  {DOUBLE_BRACE} {
    positionInc = 1;
    currentTokType = CITATION;
    yybegin(DOUBLE_BRACE_STATE);
  }
  {CITATION} {
    positionInc = 1;
    currentTokType = CITATION;
    yybegin(DOUBLE_BRACE_STATE);
  }
  // anything else (including an infobox opener) is skipped
  . | {WHITESPACE} |{INFOBOX} { positionInc = 1; }
}
|
||||
|
||||
// Inside [[ ... ]]: words are returned with the type chosen by the opening rule.
<INTERNAL_LINK_STATE> {
  // the first word keeps the position increment set by the opener;
  // skipped characters below reset it to 1 for the words that follow
  {ALPHANUM}              { yybegin(INTERNAL_LINK_STATE); return currentTokType; }
  {DOUBLE_BRACKET_CLOSE}  { yybegin(YYINITIAL); }
  // everything else is skipped
  . | {WHITESPACE}        { positionInc = 1; }
}
|
||||
|
||||
// Inside [ ... ]: the URL is returned first, then the words of the link text.
<EXTERNAL_LINK_STATE> {
  // the URL keeps the type set by the opening rule and gets position increment 0
  "http://"{HOST}("/"?({ALPHANUM}|{P}|\?|"&"|"="|"#")*)* {
    positionInc = 0;
    yybegin(EXTERNAL_LINK_STATE);
    return currentTokType;
  }
  // words of the link text are typed as EXTERNAL_LINK
  {ALPHANUM} {
    positionInc = 1;
    currentTokType = EXTERNAL_LINK;
    yybegin(EXTERNAL_LINK_STATE);
    return currentTokType;
  }
  "]"           { yybegin(YYINITIAL); }
  {WHITESPACE}  { positionInc = 1; }
}
|
||||
|
||||
// Inside [[Category: ... ]]: words are returned with the current token type.
<CATEGORY_STATE> {
  {ALPHANUM}              { yybegin(CATEGORY_STATE); return currentTokType; }
  {DOUBLE_BRACKET_CLOSE}  { yybegin(YYINITIAL); }
  // everything else is skipped
  . | {WHITESPACE}        { positionInc = 1; }
}
|
||||
// After '': further quotes upgrade to bold or bold italics, a word starts italics.
<TWO_SINGLE_QUOTES_STATE> {
  // a third quote: bold
  "'"                   { currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE); }
  // three more quotes: bold italics
  "'''"                 { currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE); }
  // a word directly after '': italics
  {ALPHANUM}            { currentTokType = ITALICS; yybegin(STRING); return currentTokType; }
  // links may appear inside; they override the emphasis type
  {DOUBLE_BRACKET}      { currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE); }
  {DOUBLE_BRACKET_CAT}  { currentTokType = CATEGORY; yybegin(CATEGORY_STATE); }
  {EXTERNAL_LINK}       { currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); }

  // everything else is skipped
  . | {WHITESPACE}      { /* ignore */ }
}
|
||||
// After ''': the first word is returned with the current (bold) type.
<THREE_SINGLE_QUOTES_STATE> {
  {ALPHANUM}            { yybegin(STRING); return currentTokType; }
  // links may appear inside; they override the emphasis type
  {DOUBLE_BRACKET}      { currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE); }
  {DOUBLE_BRACKET_CAT}  { currentTokType = CATEGORY; yybegin(CATEGORY_STATE); }
  {EXTERNAL_LINK}       { currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); }

  // everything else is skipped
  . | {WHITESPACE}      { /* ignore */ }
}
|
||||
// After ''''': the first word is returned with the current (bold italics) type.
<FIVE_SINGLE_QUOTES_STATE> {
  {ALPHANUM}            { yybegin(STRING); return currentTokType; }
  // links may appear inside; they override the emphasis type
  {DOUBLE_BRACKET}      { currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE); }
  {DOUBLE_BRACKET_CAT}  { currentTokType = CATEGORY; yybegin(CATEGORY_STATE); }
  {EXTERNAL_LINK}       { currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); }

  // everything else is skipped
  . | {WHITESPACE}      { /* ignore */ }
}
|
||||
|
||||
// After ==: a third "=" starts a sub heading, words are heading tokens.
<DOUBLE_EQUALS_STATE> {
  "="               { currentTokType = SUB_HEADING; yybegin(STRING); }
  {ALPHANUM}        { currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); return currentTokType; }
  // the closing == returns to plain text
  {DOUBLE_EQUALS}   { yybegin(YYINITIAL); }
  // everything else is skipped
  . | {WHITESPACE}  { /* ignore */ }
}
|
||||
|
||||
// Inside {{ ... }} or <ref> ... </ref>: words are returned with the current type.
<DOUBLE_BRACE_STATE> {
  {ALPHANUM}            { yybegin(DOUBLE_BRACE_STATE); return currentTokType; }
  // either closing marker returns to plain text
  {DOUBLE_BRACE_CLOSE}  { yybegin(YYINITIAL); }
  {CITATION_CLOSE}      { yybegin(YYINITIAL); }
  // everything else is skipped
  . | {WHITESPACE}      { /* ignore */ }
}
|
||||
|
||||
// Inside emphasised or sub-heading text: words keep the current type until
// a closing marker resets the type to ALPHANUM and returns to plain text.
<STRING> {
  /*end bold italics*/
  "'''''" { numBalanced = 0; currentTokType = ALPHANUM; yybegin(YYINITIAL); }
  /*end bold*/
  "'''"   { numBalanced = 0; currentTokType = ALPHANUM; yybegin(YYINITIAL); }
  /*end italics*/
  "''"    { numBalanced = 0; currentTokType = ALPHANUM; yybegin(YYINITIAL); }
  /*end sub header*/
  "==="   { numBalanced = 0; currentTokType = ALPHANUM; yybegin(YYINITIAL); }

  /* STRING ALPHANUM*/
  {ALPHANUM} { yybegin(STRING); return currentTokType; }

  // links may appear inside; they override the emphasis type
  {DOUBLE_BRACKET}      { numBalanced = 0; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE); }
  {DOUBLE_BRACKET_CAT}  { numBalanced = 0; currentTokType = CATEGORY; yybegin(CATEGORY_STATE); }
  {EXTERNAL_LINK}       { numBalanced = 0; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); }

  /*pipe*/
  {PIPE} { yybegin(STRING); return currentTokType; }

  .|{WHITESPACE} { /* ignore STRING */ }
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
{INTERNAL_LINK} { return currentTokType; }
|
||||
|
||||
{CITATION} { return currentTokType; }
|
||||
{CATEGORY} { return currentTokType; }
|
||||
|
||||
{BOLD} { return currentTokType; }
|
||||
{ITALICS} { return currentTokType; }
|
||||
{BOLD_ITALICS} { return currentTokType; }
|
||||
{HEADING} { return currentTokType; }
|
||||
{SUB_HEADING} { return currentTokType; }
|
||||
|
||||
*/
|
||||
//end wikipedia
|
||||
|
||||
/**
 * Fallback for every state: any single character, whitespace or
 * markup tag not handled by an earlier rule is dropped.
 */
. | {WHITESPACE}|{TAGS} {
  /* ignore */
}
|
||||
|
||||
|
||||
//INTERNAL_LINK = "["{2}({ALPHANUM}+{WHITESPACE}*)+"]"{2}
|
||||
//EXTERNAL_LINK = "["http://"{HOST}.*?"]"
|
||||
//CITATION = "{"{2}({ALPHANUM}+{WHITESPACE}*)+"}"{2}
|
||||
//CATEGORY = "["{2}"Category:"({ALPHANUM}+{WHITESPACE}*)+"]"{2}
|
||||
//CATEGORY_COLON = "["{2}":Category:"({ALPHANUM}+{WHITESPACE}*)+"]"{2}
|
||||
//BOLD = '''({ALPHANUM}+{WHITESPACE}*)+'''
|
||||
//ITALICS = ''({ALPHANUM}+{WHITESPACE}*)+''
|
||||
//BOLD_ITALICS = '''''({ALPHANUM}+{WHITESPACE}*)+'''''
|
||||
//HEADING = "="{2}({ALPHANUM}+{WHITESPACE}*)+"="{2}
|
||||
//SUB_HEADING ="="{3}({ALPHANUM}+{WHITESPACE}*)+"="{3}
|
|
@ -0,0 +1,35 @@
|
|||
<!--
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
-->
|
||||
|
||||
<HTML>
|
||||
<!--
|
||||
*
|
||||
--><HEAD>
|
||||
<TITLE>org.apache.lucene.wikipedia</TITLE>
|
||||
</HEAD>
|
||||
<BODY>
|
||||
<DIV>Tools for working with <a href="http://www.wikipedia.org">Wikipedia</a> content.
|
||||
</DIV>
|
||||
<DIV> </DIV>
|
||||
<DIV align="center">
|
||||
Copyright © 2007 <A HREF="http://www.apache.org">Apache Software Foundation</A>
|
||||
</DIV>
|
||||
</BODY>
|
||||
</HTML>
|
|
@ -0,0 +1,213 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
package org.apache.lucene.wikipedia.analysis;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.StringReader;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
**/
|
||||
public class WikipediaTokenizerTest extends TestCase {
|
||||
|
||||
|
||||
/**
 * Creates the test case.
 *
 * @param s the value handed to the <code>TestCase</code> constructor
 */
public WikipediaTokenizerTest(String s) {
  super(s);
}
|
||||
|
||||
protected void setUp() {
|
||||
}
|
||||
|
||||
protected void tearDown() {
|
||||
|
||||
}
|
||||
|
||||
|
||||
public void testHandwritten() throws Exception {
|
||||
//make sure all tokens are in only one type
|
||||
String test = "[[link]] This is a [[Category:foo]] Category This is a linked [[:Category:bar none withstanding]] " +
|
||||
"Category This is (parens) This is a [[link]] This is an external URL [http://lucene.apache.org] " +
|
||||
"Here is ''italics'' and ''more italics'', '''bold''' and '''''five quotes''''' " +
|
||||
" This is a [[link|display info]] This is a period. Here is $3.25 and here is 3.50. Here's Johnny. " +
|
||||
"==heading== ===sub head=== followed by some text [[Category:blah| ]] " +
|
||||
"''[[Category:ital_cat]]'' here is some that is ''italics [[Category:foo]] but is never closed." +
|
||||
"'''same [[Category:foo]] goes for this '''''and2 [[Category:foo]] and this" +
|
||||
" [http://foo.boo.com/test/test/ Test Test] [http://foo.boo.com/test/test/test.html Test Test]" +
|
||||
" [http://foo.boo.com/test/test/test.html?g=b&c=d Test Test] <ref>Citation</ref> <sup>martian</sup> <span class=\"glue\">code</span>";
|
||||
Map tcm = new HashMap();//map tokens to types
|
||||
tcm.put("link", WikipediaTokenizer.INTERNAL_LINK);
|
||||
tcm.put("display", WikipediaTokenizer.INTERNAL_LINK);
|
||||
tcm.put("info", WikipediaTokenizer.INTERNAL_LINK);
|
||||
|
||||
tcm.put("http://lucene.apache.org", WikipediaTokenizer.EXTERNAL_LINK_URL);
|
||||
tcm.put("http://foo.boo.com/test/test/", WikipediaTokenizer.EXTERNAL_LINK_URL);
|
||||
tcm.put("http://foo.boo.com/test/test/test.html", WikipediaTokenizer.EXTERNAL_LINK_URL);
|
||||
tcm.put("http://foo.boo.com/test/test/test.html?g=b&c=d", WikipediaTokenizer.EXTERNAL_LINK_URL);
|
||||
tcm.put("Test", WikipediaTokenizer.EXTERNAL_LINK);
|
||||
|
||||
//alphanums
|
||||
tcm.put("This", "<ALPHANUM>");
|
||||
tcm.put("is", "<ALPHANUM>");
|
||||
tcm.put("a", "<ALPHANUM>");
|
||||
tcm.put("Category", "<ALPHANUM>");
|
||||
tcm.put("linked", "<ALPHANUM>");
|
||||
tcm.put("parens", "<ALPHANUM>");
|
||||
tcm.put("external", "<ALPHANUM>");
|
||||
tcm.put("URL", "<ALPHANUM>");
|
||||
tcm.put("and", "<ALPHANUM>");
|
||||
tcm.put("period", "<ALPHANUM>");
|
||||
tcm.put("Here", "<ALPHANUM>");
|
||||
tcm.put("Here's", "<APOSTROPHE>");
|
||||
tcm.put("here", "<ALPHANUM>");
|
||||
tcm.put("Johnny", "<ALPHANUM>");
|
||||
tcm.put("followed", "<ALPHANUM>");
|
||||
tcm.put("by", "<ALPHANUM>");
|
||||
tcm.put("text", "<ALPHANUM>");
|
||||
tcm.put("that", "<ALPHANUM>");
|
||||
tcm.put("but", "<ALPHANUM>");
|
||||
tcm.put("never", "<ALPHANUM>");
|
||||
tcm.put("closed", "<ALPHANUM>");
|
||||
tcm.put("goes", "<ALPHANUM>");
|
||||
tcm.put("for", "<ALPHANUM>");
|
||||
tcm.put("this", "<ALPHANUM>");
|
||||
tcm.put("an", "<ALPHANUM>");
|
||||
tcm.put("some", "<ALPHANUM>");
|
||||
tcm.put("martian", "<ALPHANUM>");
|
||||
tcm.put("code", "<ALPHANUM>");
|
||||
|
||||
tcm.put("foo", WikipediaTokenizer.CATEGORY);
|
||||
tcm.put("bar", WikipediaTokenizer.CATEGORY);
|
||||
tcm.put("none", WikipediaTokenizer.CATEGORY);
|
||||
tcm.put("withstanding", WikipediaTokenizer.CATEGORY);
|
||||
tcm.put("blah", WikipediaTokenizer.CATEGORY);
|
||||
tcm.put("ital", WikipediaTokenizer.CATEGORY);
|
||||
tcm.put("cat", WikipediaTokenizer.CATEGORY);
|
||||
|
||||
tcm.put("italics", WikipediaTokenizer.ITALICS);
|
||||
tcm.put("more", WikipediaTokenizer.ITALICS);
|
||||
tcm.put("bold", WikipediaTokenizer.BOLD);
|
||||
tcm.put("same", WikipediaTokenizer.BOLD);
|
||||
tcm.put("five", WikipediaTokenizer.BOLD_ITALICS);
|
||||
tcm.put("and2", WikipediaTokenizer.BOLD_ITALICS);
|
||||
tcm.put("quotes", WikipediaTokenizer.BOLD_ITALICS);
|
||||
|
||||
tcm.put("heading", WikipediaTokenizer.HEADING);
|
||||
tcm.put("sub", WikipediaTokenizer.SUB_HEADING);
|
||||
tcm.put("head", WikipediaTokenizer.SUB_HEADING);
|
||||
|
||||
tcm.put("Citation", WikipediaTokenizer.CITATION);
|
||||
|
||||
tcm.put("3.25", "<NUM>");
|
||||
tcm.put("3.50", "<NUM>");
|
||||
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
|
||||
Token token = new Token();
|
||||
int count = 0;
|
||||
int numItalics = 0;
|
||||
int numBoldItalics = 0;
|
||||
int numCategory = 0;
|
||||
int numCitation = 0;
|
||||
while ((token = tf.next(token)) != null) {
|
||||
String tokText = token.termText();
|
||||
//System.out.println("Text: " + tokText + " Type: " + token.type());
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
String expectedType = (String) tcm.get(tokText);
|
||||
assertTrue("expectedType is null and it shouldn't be for: " + token, expectedType != null);
|
||||
assertTrue(token.type() + " is not equal to " + expectedType + " for " + token, token.type().equals(expectedType) == true);
|
||||
count++;
|
||||
if (token.type().equals(WikipediaTokenizer.ITALICS) == true){
|
||||
numItalics++;
|
||||
} else if (token.type().equals(WikipediaTokenizer.BOLD_ITALICS) == true){
|
||||
numBoldItalics++;
|
||||
} else if (token.type().equals(WikipediaTokenizer.CATEGORY) == true){
|
||||
numCategory++;
|
||||
}
|
||||
else if (token.type().equals(WikipediaTokenizer.CITATION) == true){
|
||||
numCitation++;
|
||||
}
|
||||
}
|
||||
assertTrue("We have not seen enough tokens: " + count + " is not >= " + tcm.size(), count >= tcm.size());
|
||||
assertTrue(numItalics + " does not equal: " + 4 + " for numItalics", numItalics == 4);
|
||||
assertTrue(numBoldItalics + " does not equal: " + 3 + " for numBoldItalics", numBoldItalics == 3);
|
||||
assertTrue(numCategory + " does not equal: " + 10 + " for numCategory", numCategory == 10);
|
||||
assertTrue(numCitation + " does not equal: " + 1 + " for numCitation", numCitation == 1);
|
||||
}
|
||||
|
||||
public void testLinkPhrases() throws Exception {
|
||||
String test = "click [[link here]] click [http://lucene.apache.org here]";
|
||||
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
|
||||
Token token = new Token();
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "click", new String(token.termBuffer(), 0, token.termLength()).equals("click") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link", new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "click",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("click") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
|
||||
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
|
||||
}
|
||||
|
||||
public void testLinks() throws Exception {
|
||||
String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here]";
|
||||
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
|
||||
Token token = new Token();
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org/java/docs/index.html#news",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org/java/docs/index.html#news") == true);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
||||
tf.next(token);//skip here
|
||||
token = tf.next(token);
|
||||
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org/java/docs/index.html?b=c",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org/java/docs/index.html?b=c") == true);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
||||
}
|
||||
}
|
|
@ -367,6 +367,10 @@ document.write("Last Published: " + document.lastModified);
|
|||
<li>
|
||||
<a href="api/contrib-swing/index.html">Swing</a>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<a href="api/contrib-wikipedia/index.html">Wikipedia</a>
|
||||
</li>
|
||||
|
||||
<li>
|
||||
<a href="api/contrib-wordnet/index.html">Wordnet</a>
|
||||
|
@ -383,11 +387,11 @@ document.write("Last Published: " + document.lastModified);
|
|||
</p>
|
||||
</div>
|
||||
|
||||
<a name="N10097"></a><a name="Downloads"></a>
|
||||
<a name="N1009C"></a><a name="Downloads"></a>
|
||||
<h2 class="boxed">Downloads</h2>
|
||||
<div class="section">
|
||||
<p>System Requirements are detailed <a href="systemrequirements.html">here</a>.</p>
|
||||
<a name="N100A3"></a><a name="Clover"></a>
|
||||
<a name="N100A8"></a><a name="Clover"></a>
|
||||
<h3 class="boxed">Clover Test Coverage Reports</h3>
|
||||
<p>
|
||||
|
||||
|
@ -396,7 +400,7 @@ document.write("Last Published: " + document.lastModified);
|
|||
<a href="http://lucene.zones.apache.org:8080/hudson/job/Lucene-Nightly/lastSuccessfulBuild/artifact/trunk/build/test/clover/reports/index.html">here</a>
|
||||
for the nightly build.
|
||||
</p>
|
||||
<a name="N100B4"></a><a name="Hudson"></a>
|
||||
<a name="N100B9"></a><a name="Hudson"></a>
|
||||
<h3 class="boxed">Hudson</h3>
|
||||
<p>
|
||||
|
||||
|
@ -404,13 +408,13 @@ document.write("Last Published: " + document.lastModified);
|
|||
project. It is responsible for running nightly builds, code coverage reports as well as building the nightly version
|
||||
of the website.
|
||||
</p>
|
||||
<a name="N100C1"></a><a name="Nightly"></a>
|
||||
<a name="N100C6"></a><a name="Nightly"></a>
|
||||
<h3 class="boxed">Nightly Build Download</h3>
|
||||
<p>Nightly builds are based on the trunk version of the code checked into
|
||||
<a href="https://svn.apache.org/repos/asf/lucene/java/trunk">SVN</a>
|
||||
|
||||
</p>
|
||||
<a href="http://lucene.zones.apache.org:8080/hudson/job/Lucene-Nightly/">Download via Hudson</a><a name="N100D3"></a><a name="source"></a>
|
||||
<a href="http://lucene.zones.apache.org:8080/hudson/job/Lucene-Nightly/">Download via Hudson</a><a name="N100D8"></a><a name="source"></a>
|
||||
<h3 class="boxed">Source Code</h3>
|
||||
<p>The source files are now stored using Subversion (see http://subversion.tigris.org/ and http://svnbook.red-bean.com/)
|
||||
</p>
|
||||
|
|
|
@ -5,10 +5,10 @@
|
|||
/Producer (FOP 0.20.5) >>
|
||||
endobj
|
||||
5 0 obj
|
||||
<< /Length 677 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
<< /Length 680 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
>>
|
||||
stream
|
||||
Gaua=966RV&:j6I$6AM$<'>LHA#F%1Bfjds>FL0G11<e;/&7SNq]Z`9S4S?[.)cF,`Nm,5C0/h1%GkAg5Sum3h$(mtAH;j$J-"qb(aN]GA-gU;#3ep&]/ni-lp7ej)2[APh<K'JakDs)h!dWOA>Sut];]T!W(q!2=bAet?M3_@^r3fKo#YAMEQkN1%pIJb1\.LtUi'.39h$=WfkX&D$ptq:,H82VL,c_h[8K(Fc;`WB\og;YJC1VbN(fJ+7PK\#4mZt!5et;#^@iB5UT=jd;Rk\opTD^:Xt<Ye#/_-[k36A0O](e$'q*+_$&kI,a?727rO"0tg?QdGDn<6sSTlFj,$.pR.)3TrnR(3!%/D[jS]CK30@XK;.?Q:,pGt&Qj>:DrCFl?ENl^_7Os^t7,Q2JU%^u-!)Q0lTrMo<nMSe]#CUmNXrPp?BCX+O3BaG&*`)?`o/:*cc:]Fpk2bUXP+.*qSbiC$](/Fa-^RXo>T)MuiO.ELK<fCB"NTEpua[nU3=VS/eE_l^I38"-1cfP12HW0*DWq2ANI2`#%cM-:B`-lBp*b'pA6,J2=?l+1u2We=2VAp`rPDF5I>(AVg8]X1aV_)eep.sBn^,r7-'98VO&.4tmqKIa^GD[?ICEJ9(AhI(bkJEaNd/2&B!(LDdh0?5AUcuj>3qV;OUY?7~>
|
||||
Gaua=c#T:-&:j43KoYjMWr@QZm5/Z9D.f;V97'K#-;5k/"ZV^T07n`8N#UP>.6^iaQJML?]95'DnE,T0_B8p%/jT+`%1pS_$rH]9+LZ?C+\5$)P+UZMol`";#IOaP,`K<YX&!T4.Smct_b5,KV;<`_VJ]A8>Ec(5%DIJU\oWo5-A21?T00t%7r7\jhT2^:7Qsk`1:8aXa[c[k4(Uff.&[uA,kA\9j5fpa9(1OpeQ&BE`Wa<^7`Y3VZ;GW\_DiP%@:/m8!q&;I[6MhV7?os$dLE[VG>m;+9qeUFkbm>o:D\YCPhHADa?2YH@JC7!k4(V%01GlePYVd8dhZNW`+sX[R83':q(4dirW40MaIe%I4MI_AW?80=bA:=qe&[3(o8mgmZ%t_-,/[JTSl!LhoXCY\%)"m,EMDa<YPR[`be9%L;1!/KJ".^pk'Z)O0*-Yl4=/q9U)t$+l$mBQHKgap"#Igs1BGO]H^^@ZC#A3AF.V(6T54)(':-atoo$\lgs7)jdBmI^aQCT-rS.<Zo*[ZZs)7:AS[5<@8+Ljqn>ee2qr894W<kYB5$e?u#=a8Mn28kNHNBOTKO@27`k?-gjrDBmK:#D<gfEm9N*UgT:4FK.4,G(*S<WemLmN=C>J4enV=A0#FB`>)Srh=rbh`=hU%_i>ZDs0MW!k`Zh]eWG%f~>
|
||||
endstream
|
||||
endobj
|
||||
6 0 obj
|
||||
|
@ -102,10 +102,10 @@ endobj
|
|||
>>
|
||||
endobj
|
||||
22 0 obj
|
||||
<< /Length 1268 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
<< /Length 1260 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
>>
|
||||
stream
|
||||
Gatm;?'!Gq&:O#Nn0:K*nOK)[hWQR">-S@A]`RH86^kT'Zu0-=UqOIVr_AoXEJf1PLfL90PNI,8S/HZ#e+r(\I.D>FE1Fk"UZOM.[.tF^M%MLYjT"i14dmW!',6<(^&ubf?VuA'&!2oe1&3Smc6of6,=J"+k,SeSlNsWm+,X68^u=epc*up*(DOVP=7!<<Mu3-$0@kcH%[YeYW(aq(X'?God18(KCSRQ#`SYm0G_"g.lrelsjs1LL/Z1<F7WB!IF?\9D/EGa`0&9__o$S_[%dDBHQ.BI=.r>J8ZhTljLAS:3aXg5Q3a,jIiO[`FA5XXNbG/AQf1V[b>?Op>irF91NM@aRAmLWW.6T;tk)[&#KVbcu>hT)a=+%t*gBm_o1)"YdK-@)k>A0dN724^BYVog"("oL4]r>hYVff/.j,AVZV74T8Qse4m\]Q=G>+#[?E71*O?\Mggng<r\cY*39hKff7>_]L#LB4el)10G0\kr+&lA_i:7)3ckPPi^n!!K,MjK2<T#YVDk5d#5s_h"L)B^!IkaK9VpLimN=?nRi_WB1#/9mNuRKfoEk<h'ST\3te7].m1HeRa%_F^mc+'a??nGOO<@Gf$B'<#4/!S!li+P"/i45!Zf[VIlWD@i>#u!f*Mmh'a\#T.K_u7rrdhGS5:e\^/Icnc7uPESqSo9.`jjh>-1FouchJfCC"B8jluOB)*59PK>=NGX:7fAQJ$4,5l(#F[/^8]$,=qWGEfc7o'=LNP4o,C1D?k/DWQ8]_]h^Vk3$9Z%RGrq5V]FOHgb$`EmPuh1$D>W$\@lq=Zg+jZ2npBItb@+TV65XbH9VP;$l!i)XX(d>('QFQr67N(#]-(;3M\YTVn'\d&<e6/U"#a0Lk/B6]M&leKQn!%9u^T^CUal4=nKjE%bERKQ_pZ%7"K?g>`RTi'IoK(a6="6$Z9HYJH^F'2Q;9F--FCME+=e#)]Z:Z3!(J`nQ"grH"\4b\NY#"d#-P#F6tPl8J@aKek'ZoC*fP8DONp4Nb1=YgD;3ZmGZNJqn<R>C^(g>ge?H<Hh0pt)t`_0fnUH,=Pd"c?\rau[f)maA0G*N$Cn$5p,Rp%,;dO`_oC;X%JVWq5$O=mF^OE`=:.)bS;C_"R-T,nNBVm&CS^??-drOiJG-iQ"ecolHJtZU\WH8kCNK8Bi=iG'Ga9c#=L,"C$G_DD,8\h9e&N5-5X1F,loG`<ZbTCYgL^1+^C>p.VZi6W\R#S_L"`oWJX8Qr8QVnm,G-KEHHPm1T:,5i+<s~>
|
||||
Gatm;9on$e&A@sBn<a2Y7O1ENoG6n-'.gFTZ?D?3D%Ep7[Y:PHLQ_&9^V1pcZ7(JZ[V0'af#W0-2fFG@d<%Qt5G*=q">i]65`k+eA?>Tl6(/?of#7UOdg%X"!ud7lLA2e3k5'g(b!\c/ON$;HA3-?q]oNJ8*gj,X!Ei3;"oL(&6:6<b7qB=[^V=7M27_]/AR7N3cUGAcltj2EeR`TT`P*sHq3q4Qjpu;Cm(E+KNfs`_&k+r6[)SU]@D(\Yk//&Gj-IugI1@P3INPkJ`M0:^EpB$_/6cgOB"uc+0QN)$ok(]EeIM-o;sIn=.>R58A!+kAA<oa!)QoDfhf#3iZc3%5'H0IZ,a<LMc2]sZ^(ntJ-\Neq=eT[3V(99#:$i)^cSC\U6GjtCs&s*2E+L$<.,\sTga]VU`HkJ;E_lF`>)kD!C0'1@P>Or+i0PqRJjqCB1FXVEqm-70e>lTsADZGDU;,)[:SQ"t0G`\&YZ<=8J=4Rrb.^mUii3V/EZ4Zts57-+K!?*^:+7QDVc7sb\%89l^p")%FCX`HX.>Wn3]=,pf1a@]Sc/gl?*2"`&)a.HUdH?/Oe:fS9H8>#RGLZoQq2IU6-RTkHT)A*n7a\41>cn(oB\r(fW3@hX;BF:$lCo=bLe0L*f?t:UKHH>jUq)t:/70IA$7YK7?q4%?l^^']u#0QK.4U,D\4e//5NL)^YC4THNffP>MN#OW'0*nTi[L^WK2u(LRloq"nV"7gOW8u#4cD&"pIZ4o$$gIk^C!`!s@f<B6HE31^Ao9bc=Og7<Ymd1,gs`]^gI!Am%If`^S=Vc>Sc9L9n!2#4#gtP*tU0g!=>(>R1ags7`D/3QD.HBd40i;nCGu;X&#^mlDF)PfTTjM-D-fi[&Gs#E85f&^F-O%9?,IK(&TR5VkE"T&.l'IBZnu"q:FR*mYHg`=Q4Jgf8#9Vf7=/E+9:,X(b@m[n7aXRU,&Peb1\RIqEK#FkCD,qpf>(P`&B3'QIOA0@?AFfP/40AU)J!*c]>Pi+/M7.rST2/\T8aB6JX<kg3u7m6)gVF4mgS<7$^u&n;[on%s@ZbOTgVS@$^(ikub,0=Pggef>_ql!:@,jmEd]H>dg@aks`gZl0-'NI0ahb%2gqWY_74F-XgD,_Die.M-f'^N@%n4E;rQ(:8q9]&YTCG^*A"c+k)dJpWLIY0:eVmUR77cKceDllk%inK<VlCYgL^2Cuhmp.V[TA!b^=c@'b'pETS>VUT"DI92^DGdJK!C$irj!3?kVBE~>
|
||||
endstream
|
||||
endobj
|
||||
23 0 obj
|
||||
|
@ -117,10 +117,10 @@ endobj
|
|||
>>
|
||||
endobj
|
||||
24 0 obj
|
||||
<< /Length 1000 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
<< /Length 1033 /Filter [ /ASCII85Decode /FlateDecode ]
|
||||
>>
|
||||
stream
|
||||
Gatm:997OU&AIm?pp_iZ'0EJji5RB#%j;'l`J*E8Zu4K3Ll:Z-g4,0Q7m/<'!!uYB11P[nS!e1W]q3PVc"B7RN3*!PorOfO8XcOUQ](h><Ql;u1aPVSCA)@brPo"&"2$_,'njPT@D82XmL]%57f%`]L6Q^6s$=A*]P3^52j`B4BBV\8n@gW'X-Tbg#Lu.":,T.kd5KN7\Mg9_fZp5HPa/86;\Eqi)SJAV+M;V/ZB5gPhrV^@d(FL"OPe.Wh,.EK!ITjL8A!9/<E5$[Y-\LeBq&]I>GKa[Y'p_8W`A`?5/Yc$-'l!fP\['LEheU]nWrBe-gUX$9'9ggAWiMbXqMu!WTVA7Z:ACP2:/B[(3d'ib"3FDG2/gR(HG42rlH&456WWZ6Hkmg?i9hj$uD^gn<&]h!u8q2Nu;@tcc8TWU.?G9E?@FI#qG_*j)SDndhK`eHIpZX>iHkQ%3k?_&tY)U&e<T_(7@M33#$u;OF@;ZErsn>TY+=%PmAP9`?bK/F71i>Ip]]1!^=]E,Bi94&N/4sQOcH$U0Ddf<fo\2%'bP??H$((!E3P@`gbu&;+>L/BrK?BO="BY6nsjZ,LYB3egsXTDRtLl,=n>T-k<=QF3+B=+H-BnO`^$S\30K+qb#E0**4&l6O0/?\<*h`>R,Iioc$Eh0<@S(*h:tIO!8r'#u7U^ZCoH'64fUIWI'<q2Cuk07T8'=deXP<YcCB^%4q.4kl1-<WT&#CGU=OP9mI]7G=hsCFmFW+GQL)^r-F-AIuE$Z[&^.nVF'pggm6@71FKE)Z*30FnD&la2C>uLTT7E3SmX)jW:Zh%cnR_:,"lE.QO^$rM=a\`AZ[o>9XLF':B`go(XKe?.AIbk@\=`q=R92")Hk*U!\HCmfmr\S0Q[fm&elXe_q9`,cF?-WT*YNe.S.j5a@";2kfo=Ehf5@9mpj1c'N:G)Bjc9@TQK:Dn#bA_i-;H9eSVYZrYPVrerRXRk^6S[9.S7f0%"A/"j0OmBE~>
|
||||
GatU29lo&3&A@sBE(jj^";9pn[jJ*'Ful0Pnat9%+PGBo;1o=uT)AJ[0=-Q':-'8RD&*9CmI&^tQa=fSg#UJ4U6YW49JV2+MW<@f1>[sC6:Pt91Sb1(-9iiDr<@l&/0tmCrY_QW5:9TkOC"X0MmkXDLZ$BD:cIMK5R=4nU6/ZK/LR?*7bicnDC^Wa7RK"Z6GLei*9)$0Z9@F<;nXlb3_-s4m)q,u,*"IF%'V;!nHb@()Kqc`^bE]sdgg>I(a,[Fj,I)D\WG@a:X)mNAQ>cL4=2R/<][]&Shn\OGp,gpV#ZQ["ib?:NN`"[<+6"/+QaHXij0YhCh#6oMe"dcr6iGu<+TT;JlA".HUHCb2=dl4X0jOOrBsN+EadfjmhZ%/>G:X'R*fB<Zj$Hk@([.aH$lI-),V9jB$]eiAY-l.%PBh$As<D*4E9@U'Ll8WTsENW!'H*XjL+d/&fA7=MR0;&?N(O-q6u[s\SM?/c"j_f8sM&)('s(P%Pp#)MAT)p;]qU8`nug)!JsPd+Hm^3[`FF.$t2%`)6ckaO5>84l<[##/`r**d+'<gZ+HXcqUbpLHG\8f_-tdhl2u@kk#u4LpVpW'HE1^-`SBY>5=@lo7M66+@6cYnWn)5&U8p1[-Nq;l#]iH?!ca+Hf*I&+"B3<%p_cbdH%E\oV)Y<:9'sD`X#*L\D+4unXtC2@Edq[(3,WU#k'[f.rKGmVQ'MV)]a=anC2p>=0NX"^i6lSo[ORMf`/`J=+n>udePgF:^6_fB,!(\CA2q`&iNQhOjf3(]B06fM(PZ.4ff"9."M%Y>AU?'5&,A*oqO0F]4`V@m!-i6K0F^m:f`BjG$&jUes7t#^BGI/t/;Z`=1[V(m7b5!"MGe;E*mNQ!:'tT4/l$aF-m]0pGV;/<qHp#/je+Y\P\B3J;UY17hHr?k\-R=N8cPZ=e$-rHJ)?pS]eIAEV9HL*h\oR2o;nE&kNi%0jO@BW+gJM&[<Df\=5)3]ddDF@G;E/t=,Ze&!/#.E7G&J0[K3fj6>sgE>Y]bL-t,tW~>
|
||||
endstream
|
||||
endobj
|
||||
25 0 obj
|
||||
|
@ -260,31 +260,31 @@ endobj
|
|||
13 0 obj
|
||||
<<
|
||||
/S /GoTo
|
||||
/D [23 0 R /XYZ 85.0 251.932 null]
|
||||
/D [23 0 R /XYZ 85.0 238.732 null]
|
||||
>>
|
||||
endobj
|
||||
15 0 obj
|
||||
<<
|
||||
/S /GoTo
|
||||
/D [23 0 R /XYZ 85.0 199.598 null]
|
||||
/D [23 0 R /XYZ 85.0 186.398 null]
|
||||
>>
|
||||
endobj
|
||||
17 0 obj
|
||||
<<
|
||||
/S /GoTo
|
||||
/D [23 0 R /XYZ 85.0 148.345 null]
|
||||
/D [25 0 R /XYZ 85.0 659.0 null]
|
||||
>>
|
||||
endobj
|
||||
19 0 obj
|
||||
<<
|
||||
/S /GoTo
|
||||
/D [25 0 R /XYZ 85.0 611.4 null]
|
||||
/D [25 0 R /XYZ 85.0 581.347 null]
|
||||
>>
|
||||
endobj
|
||||
21 0 obj
|
||||
<<
|
||||
/S /GoTo
|
||||
/D [25 0 R /XYZ 85.0 546.647 null]
|
||||
/D [25 0 R /XYZ 85.0 516.594 null]
|
||||
>>
|
||||
endobj
|
||||
26 0 obj
|
||||
|
@ -295,45 +295,45 @@ endobj
|
|||
xref
|
||||
0 40
|
||||
0000000000 65535 f
|
||||
0000006782 00000 n
|
||||
0000006854 00000 n
|
||||
0000006946 00000 n
|
||||
0000006810 00000 n
|
||||
0000006882 00000 n
|
||||
0000006974 00000 n
|
||||
0000000015 00000 n
|
||||
0000000071 00000 n
|
||||
0000000839 00000 n
|
||||
0000000959 00000 n
|
||||
0000001026 00000 n
|
||||
0000007080 00000 n
|
||||
0000001161 00000 n
|
||||
0000007143 00000 n
|
||||
0000001298 00000 n
|
||||
0000007209 00000 n
|
||||
0000001434 00000 n
|
||||
0000007275 00000 n
|
||||
0000001571 00000 n
|
||||
0000007341 00000 n
|
||||
0000001708 00000 n
|
||||
0000007407 00000 n
|
||||
0000001844 00000 n
|
||||
0000007471 00000 n
|
||||
0000001981 00000 n
|
||||
0000003342 00000 n
|
||||
0000003450 00000 n
|
||||
0000004543 00000 n
|
||||
0000007537 00000 n
|
||||
0000004651 00000 n
|
||||
0000004866 00000 n
|
||||
0000005102 00000 n
|
||||
0000005288 00000 n
|
||||
0000005555 00000 n
|
||||
0000005707 00000 n
|
||||
0000005953 00000 n
|
||||
0000006120 00000 n
|
||||
0000006233 00000 n
|
||||
0000006343 00000 n
|
||||
0000006451 00000 n
|
||||
0000006557 00000 n
|
||||
0000006673 00000 n
|
||||
0000000842 00000 n
|
||||
0000000962 00000 n
|
||||
0000001029 00000 n
|
||||
0000007108 00000 n
|
||||
0000001164 00000 n
|
||||
0000007171 00000 n
|
||||
0000001301 00000 n
|
||||
0000007237 00000 n
|
||||
0000001437 00000 n
|
||||
0000007303 00000 n
|
||||
0000001574 00000 n
|
||||
0000007369 00000 n
|
||||
0000001711 00000 n
|
||||
0000007433 00000 n
|
||||
0000001847 00000 n
|
||||
0000007499 00000 n
|
||||
0000001984 00000 n
|
||||
0000003337 00000 n
|
||||
0000003445 00000 n
|
||||
0000004571 00000 n
|
||||
0000007565 00000 n
|
||||
0000004679 00000 n
|
||||
0000004894 00000 n
|
||||
0000005130 00000 n
|
||||
0000005316 00000 n
|
||||
0000005583 00000 n
|
||||
0000005735 00000 n
|
||||
0000005981 00000 n
|
||||
0000006148 00000 n
|
||||
0000006261 00000 n
|
||||
0000006371 00000 n
|
||||
0000006479 00000 n
|
||||
0000006585 00000 n
|
||||
0000006701 00000 n
|
||||
trailer
|
||||
<<
|
||||
/Size 40
|
||||
|
@ -341,5 +341,5 @@ trailer
|
|||
/Info 4 0 R
|
||||
>>
|
||||
startxref
|
||||
7588
|
||||
7616
|
||||
%%EOF
|
||||
|
|
|
@ -35,6 +35,7 @@
|
|||
<li><a href="api/contrib-spellchecker/index.html">Spellchecker</a></li>
|
||||
<li><a href="api/contrib-surround/index.html">Surround</a></li>
|
||||
<li><a href="api/contrib-swing/index.html">Swing</a></li>
|
||||
<li><a href="api/contrib-wikipedia/index.html">Wikipedia</a></li>
|
||||
<li><a href="api/contrib-wordnet/index.html">Wordnet</a></li>
|
||||
<li><a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a></li></ul></li>
|
||||
</ul>
|
||||
|
|
Loading…
Reference in New Issue