LUCENE-1103

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@608852 13f79535-47bb-0310-9956-ffa450edef68
Grant Ingersoll 2008-01-04 14:29:15 +00:00
parent 2d633f98a2
commit f715fc6031
11 changed files with 1774 additions and 54 deletions

View File

@ -250,6 +250,7 @@
<packageset dir="contrib/spellchecker/src/java"/>
<packageset dir="contrib/surround/src/java"/>
<packageset dir="contrib/swing/src/java"/>
<packageset dir="contrib/wikipedia/src/java"/>
<packageset dir="contrib/wordnet/src/java"/>
<packageset dir="contrib/xml-query-parser/src/java"/>
<!-- end alpha sort -->
@ -279,6 +280,7 @@
<group title="contrib: SpellChecker" packages="org.apache.lucene.search.spell*"/>
<group title="contrib: Surround Parser" packages="org.apache.lucene.queryParser.surround*"/>
<group title="contrib: Swing" packages="org.apache.lucene.swing*"/>
<group title="contrib: Wikipedia" packages="org.apache.lucene.wikipedia*"/>
<group title="contrib: WordNet" packages="org.apache.lucene.wordnet*"/>
<group title="contrib: XML Query Parser" packages="org.apache.lucene.xmlparser*"/>

View File

@ -0,0 +1,49 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="wikipedia" default="default">
<description>
Tools for working with Wikipedia
</description>
<import file="../contrib-build.xml"/>
<target name="jflex" depends="clean-jflex,jflex-wiki-tokenizer"/>
<target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
<taskdef classname="JFlex.anttask.JFlexTask" name="jflex">
<classpath location="${jflex.home}/lib/JFlex.jar"/>
</taskdef>
<jflex file="src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex"
outdir="src/java/org/apache/lucene/wikipedia/analysis"
nobak="on"/>
</target>
<target name="clean-jflex">
<delete>
<fileset dir="src/java/org/apache/lucene/wikipedia" includes="*.java">
<containsregexp expression="generated.*by.*JFlex"/>
</fileset>
</delete>
</target>
</project>

View File

@ -0,0 +1,43 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-contrib</artifactId>
<version>@version@</version>
</parent>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-wikipedia</artifactId>
<name>Lucene Wikipedia Tools</name>
<version>@version@</version>
<description>Lucene Wikipedia Contributions</description>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-benchmark</artifactId>
<version>@version@</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,100 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.wikipedia.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import java.io.Reader;
import java.io.IOException;
/**
* Extension of StandardTokenizer that is aware of Wikipedia syntax. It is based on the
* Wikipedia tutorial available at http://en.wikipedia.org/wiki/Wikipedia:Tutorial, but it may not be complete.
* A short usage sketch follows this file.
**/
public class WikipediaTokenizer extends Tokenizer {
public static final String INTERNAL_LINK = "il";
public static final String EXTERNAL_LINK = "el";
//The URL part of the link, i.e. the first token
public static final String EXTERNAL_LINK_URL = "elu";
public static final String CITATION = "ci";
public static final String CATEGORY = "c";
public static final String BOLD = "b";
public static final String ITALICS = "i";
public static final String BOLD_ITALICS = "bi";
public static final String HEADING = "h";
public static final String SUB_HEADING = "sh";
/**
* A private instance of the JFlex-constructed scanner
*/
private final WikipediaTokenizerImpl scanner;
void setInput(Reader reader) {
this.input = reader;
}
/**
* Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
* <code>input</code> to a newly created JFlex scanner.
* @param input The Input Reader
*/
public WikipediaTokenizer(Reader input) {
this.input = input;
this.scanner = new WikipediaTokenizerImpl(input);
}
/*
* (non-Javadoc)
*
* @see org.apache.lucene.analysis.TokenStream#next()
*/
public Token next(Token result) throws IOException {
int tokenType = scanner.getNextToken();
if (tokenType == WikipediaTokenizerImpl.YYEOF) {
return null;
}
scanner.getText(result, tokenType);
final int start = scanner.yychar();
result.setStartOffset(start);
result.setEndOffset(start + result.termLength());
result.setPositionIncrement(scanner.getPositionIncrement());
result.setType(WikipediaTokenizerImpl.TOKEN_TYPES[tokenType]);
return result;
}
/*
* (non-Javadoc)
*
* @see org.apache.lucene.analysis.TokenStream#reset()
*/
public void reset() throws IOException {
super.reset();
scanner.yyreset(input);
}
public void reset(Reader reader) throws IOException {
input = reader;
reset();
}
}
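
A minimal usage sketch for the tokenizer above, following the same iteration pattern as WikipediaTokenizerTest later in this commit. The class name WikipediaTokenizerUsageSketch and the input string are made up for illustration; the tokenizer and Token calls are the ones shown in this change.

import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;

public class WikipediaTokenizerUsageSketch {
  public static void main(String[] args) throws Exception {
    // Made-up input mixing plain text with wiki markup.
    String text = "Some text with a [[link]] and ''italics''.";
    WikipediaTokenizer tokenizer = new WikipediaTokenizer(new StringReader(text));
    Token token = new Token();
    // next(Token) reuses the passed-in Token and returns null at end of input,
    // per the Lucene 2.3-era TokenStream contract used throughout this commit.
    while ((token = tokenizer.next(token)) != null) {
      String term = new String(token.termBuffer(), 0, token.termLength());
      // type() is "<ALPHANUM>" for plain words, or one of the constants defined
      // above, e.g. WikipediaTokenizer.INTERNAL_LINK ("il") or ITALICS ("i").
      System.out.println(term + "\t" + token.type() + "\t" + token.getPositionIncrement());
    }
  }
}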

View File

@ -0,0 +1,949 @@
/* The following code was generated by JFlex 1.4.1 on 1/3/08 10:05 PM */
package org.apache.lucene.wikipedia.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.4.1
* on 1/3/08 10:05 PM from the specification file
* <tt>/Volumes/User/grantingersoll/projects/lucene/Lucene-Trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex</tt>
*/
class WikipediaTokenizerImpl {
/** This character denotes the end of file */
public static final int YYEOF = -1;
/** initial size of the lookahead buffer */
private static final int ZZ_BUFFERSIZE = 16384;
/** lexical states */
public static final int DOUBLE_BRACE_STATE = 7;
public static final int INTERNAL_LINK_STATE = 2;
public static final int TWO_SINGLE_QUOTES_STATE = 4;
public static final int CATEGORY_STATE = 1;
public static final int FIVE_SINGLE_QUOTES_STATE = 5;
public static final int STRING = 8;
public static final int YYINITIAL = 0;
public static final int DOUBLE_EQUALS_STATE = 6;
public static final int THREE_SINGLE_QUOTES_STATE = 5;
public static final int EXTERNAL_LINK_STATE = 3;
/**
* Translates characters to character classes
*/
private static final String ZZ_CMAP_PACKED =
"\11\0\1\24\1\23\1\0\1\24\1\22\22\0\1\24\1\0\1\12"+
"\1\52\2\0\1\3\1\1\4\0\1\14\1\5\1\2\1\10\12\16"+
"\1\27\1\0\1\7\1\11\1\13\1\52\1\4\2\15\1\30\5\15"+
"\1\41\21\15\1\25\1\0\1\26\1\0\1\6\1\0\1\31\1\43"+
"\2\15\1\33\1\40\1\34\1\50\1\41\4\15\1\42\1\35\1\51"+
"\1\15\1\36\1\15\1\32\3\15\1\44\1\37\1\15\1\45\1\47"+
"\1\46\102\0\27\15\1\0\37\15\1\0\u0568\15\12\17\206\15\12\17"+
"\u026c\15\12\17\166\15\12\17\166\15\12\17\166\15\12\17\166\15\12\17"+
"\167\15\11\17\166\15\12\17\166\15\12\17\166\15\12\17\340\15\12\17"+
"\166\15\12\17\u0166\15\12\17\266\15\u0100\15\u0e00\15\u1040\0\u0150\21\140\0"+
"\20\21\u0100\0\200\21\200\0\u19c0\21\100\0\u5200\21\u0c00\0\u2bb0\20\u2150\0"+
"\u0200\21\u0465\0\73\21\75\15\43\0";
/**
* Translates characters to character classes
*/
private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
/**
* Translates DFA states to action switch labels.
*/
private static final int [] ZZ_ACTION = zzUnpackAction();
private static final String ZZ_ACTION_PACKED_0 =
"\11\0\4\1\4\2\1\3\1\1\1\4\2\1\1\5"+
"\1\1\1\6\2\7\1\10\1\11\1\10\1\12\1\13"+
"\1\7\1\14\1\15\1\16\1\17\1\7\1\20\1\7"+
"\4\21\1\22\1\21\1\23\1\24\1\25\3\0\1\26"+
"\14\0\1\27\1\30\1\10\1\0\1\31\1\0\1\32"+
"\1\0\1\33\3\0\1\34\1\35\2\36\1\35\2\37"+
"\2\0\1\36\1\0\14\36\1\35\3\0\1\10\1\40"+
"\3\0\1\41\1\42\5\0\1\43\4\0\1\43\2\0"+
"\2\43\2\0\1\10\5\0\1\30\1\35\1\36\1\44"+
"\5\0\1\45\30\0\1\46\2\0\1\47\1\50\1\51";
private static int [] zzUnpackAction() {
int [] result = new int[174];
int offset = 0;
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
return result;
}
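/*
* The ZZ_*_PACKED strings are run-length encoded: each pair of characters in the
* packed string is a (count, value) pair, and the loop below writes `value` into
* the result array `count` times. For example, the leading "\11\0\4\1" of
* ZZ_ACTION_PACKED_0 above expands to nine 0 entries followed by four 1 entries.
*/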
private static int zzUnpackAction(String packed, int offset, int [] result) {
int i = 0; /* index in packed string */
int j = offset; /* index in unpacked array */
int l = packed.length();
while (i < l) {
int count = packed.charAt(i++);
int value = packed.charAt(i++);
do result[j++] = value; while (--count > 0);
}
return j;
}
/**
* Translates a state to a row index in the transition table
*/
private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
private static final String ZZ_ROWMAP_PACKED_0 =
"\0\0\0\53\0\126\0\201\0\254\0\327\0\u0102\0\u012d"+
"\0\u0158\0\u0183\0\u01ae\0\u01d9\0\u0204\0\u022f\0\u025a\0\u0285"+
"\0\u02b0\0\u0183\0\u02db\0\u0306\0\u0331\0\u035c\0\u0387\0\u03b2"+
"\0\u03dd\0\u0183\0\u035c\0\u0408\0\u0183\0\u0433\0\u045e\0\u0489"+
"\0\u04b4\0\u04df\0\u050a\0\u0535\0\u0560\0\u058b\0\u05b6\0\u05e1"+
"\0\u0183\0\u060c\0\u035c\0\u0637\0\u0662\0\u068d\0\u06b8\0\u0183"+
"\0\u0183\0\u06e3\0\u070e\0\u0739\0\u0183\0\u0764\0\u078f\0\u07ba"+
"\0\u07e5\0\u0810\0\u083b\0\u0866\0\u0891\0\u08bc\0\u08e7\0\u0912"+
"\0\u093d\0\u0968\0\u0993\0\u09be\0\u09e9\0\u0a14\0\u0a3f\0\u0a6a"+
"\0\u0a95\0\u0ac0\0\u0aeb\0\u0b16\0\u0b41\0\u0b6c\0\u0b97\0\u0bc2"+
"\0\u0bed\0\u0c18\0\u07ba\0\u0c43\0\u0c6e\0\u0c99\0\u0cc4\0\u0cef"+
"\0\u0d1a\0\u0d45\0\u0d70\0\u0d9b\0\u0dc6\0\u0df1\0\u0e1c\0\u0e47"+
"\0\u0e72\0\u0e9d\0\u0ec8\0\u0ef3\0\u0f1e\0\u0f49\0\u0f74\0\u0f9f"+
"\0\u0fca\0\u0183\0\u0ff5\0\u1020\0\u104b\0\u1076\0\u0183\0\u10a1"+
"\0\u10cc\0\u10f7\0\u1122\0\u114d\0\u1178\0\u11a3\0\u11ce\0\u11f9"+
"\0\u1224\0\u124f\0\u127a\0\u12a5\0\u078f\0\u0912\0\u12d0\0\u12fb"+
"\0\u1326\0\u1351\0\u137c\0\u13a7\0\u13d2\0\u13fd\0\u0183\0\u1428"+
"\0\u1453\0\u147e\0\u14a9\0\u14d4\0\u14ff\0\u152a\0\u1555\0\u0183"+
"\0\u1580\0\u15ab\0\u15d6\0\u1601\0\u162c\0\u1657\0\u1682\0\u16ad"+
"\0\u16d8\0\u1703\0\u172e\0\u1759\0\u1784\0\u17af\0\u17da\0\u1805"+
"\0\u1830\0\u185b\0\u1886\0\u18b1\0\u18dc\0\u1907\0\u1932\0\u195d"+
"\0\u1988\0\u19b3\0\u19de\0\u0183\0\u0183\0\u0183";
private static int [] zzUnpackRowMap() {
int [] result = new int[174];
int offset = 0;
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
return result;
}
private static int zzUnpackRowMap(String packed, int offset, int [] result) {
int i = 0; /* index in packed string */
int j = offset; /* index in unpacked array */
int l = packed.length();
while (i < l) {
int high = packed.charAt(i++) << 16;
result[j++] = high | packed.charAt(i++);
}
return j;
}
/**
* The transition table of the DFA
*/
private static final int [] ZZ_TRANS = zzUnpackTrans();
private static final String ZZ_TRANS_PACKED_0 =
"\1\12\1\13\5\12\1\14\1\12\1\15\3\12\1\16"+
"\1\17\1\20\1\21\1\22\1\23\2\12\1\24\2\12"+
"\15\16\1\25\2\12\2\16\10\12\1\26\5\12\4\27"+
"\1\12\1\23\3\12\1\30\1\12\15\27\3\12\2\27"+
"\10\12\1\26\5\12\4\31\1\12\1\23\3\12\1\30"+
"\1\12\15\31\3\12\2\31\1\12\7\32\1\33\5\32"+
"\4\34\1\32\1\23\2\12\1\32\1\35\1\32\15\34"+
"\3\32\1\36\1\34\2\32\1\37\5\32\1\33\5\32"+
"\4\40\1\32\1\41\2\32\1\42\2\32\15\40\3\32"+
"\2\40\10\32\1\33\5\32\4\43\1\32\1\41\2\32"+
"\1\42\2\32\15\43\3\32\2\43\10\32\1\33\1\32"+
"\1\44\3\32\4\45\1\32\1\41\5\32\15\45\3\32"+
"\2\45\10\32\1\46\5\32\4\47\1\32\1\41\5\32"+
"\15\47\1\32\1\50\1\32\2\47\1\32\1\51\1\52"+
"\5\51\1\53\1\51\1\54\3\51\4\55\1\51\1\56"+
"\2\51\1\57\2\51\15\55\2\51\1\60\2\55\1\51"+
"\54\0\1\61\61\0\1\62\4\0\4\63\7\0\6\63"+
"\1\64\6\63\3\0\2\63\12\0\1\65\42\0\1\66"+
"\1\67\1\70\1\71\2\72\1\0\1\73\3\0\1\73"+
"\1\16\1\17\1\20\1\21\7\0\15\16\3\0\2\16"+
"\3\0\1\74\1\0\1\75\2\76\1\0\1\77\3\0"+
"\1\77\3\17\1\21\7\0\15\17\3\0\2\17\2\0"+
"\1\66\1\100\1\70\1\71\2\76\1\0\1\77\3\0"+
"\1\77\1\20\1\17\1\20\1\21\7\0\15\20\3\0"+
"\2\20\3\0\1\101\1\0\1\75\2\72\1\0\1\73"+
"\3\0\1\73\4\21\7\0\15\21\3\0\2\21\24\0"+
"\1\12\54\0\1\102\72\0\1\103\15\0\1\62\4\0"+
"\4\63\7\0\15\63\3\0\2\63\16\0\4\27\7\0"+
"\15\27\3\0\2\27\27\0\1\35\41\0\4\31\7\0"+
"\15\31\3\0\2\31\16\0\4\34\7\0\15\34\3\0"+
"\2\34\16\0\4\34\7\0\2\34\1\104\12\34\3\0"+
"\2\34\2\0\1\105\66\0\4\40\7\0\15\40\3\0"+
"\2\40\24\0\1\32\54\0\1\106\42\0\4\43\7\0"+
"\15\43\3\0\2\43\12\0\1\35\56\0\4\45\7\0"+
"\15\45\3\0\2\45\11\0\1\107\4\0\4\63\7\0"+
"\15\63\3\0\2\63\16\0\4\47\7\0\15\47\3\0"+
"\2\47\47\0\1\35\5\0\1\110\62\0\1\111\56\0"+
"\4\55\7\0\15\55\3\0\2\55\24\0\1\51\54\0"+
"\1\112\42\0\4\63\7\0\15\63\3\0\2\63\14\0"+
"\1\32\1\0\4\113\1\0\3\114\3\0\15\113\3\0"+
"\2\113\14\0\1\32\1\0\4\113\1\0\3\114\3\0"+
"\3\113\1\115\11\113\3\0\2\113\16\0\1\116\1\0"+
"\1\116\10\0\15\116\3\0\2\116\16\0\1\117\1\120"+
"\1\121\1\122\7\0\15\117\3\0\2\117\16\0\1\123"+
"\1\0\1\123\10\0\15\123\3\0\2\123\16\0\1\124"+
"\1\125\1\124\1\125\7\0\15\124\3\0\2\124\16\0"+
"\1\126\2\127\1\130\7\0\15\126\3\0\2\126\16\0"+
"\1\73\2\131\10\0\15\73\3\0\2\73\16\0\1\132"+
"\2\133\1\134\7\0\15\132\3\0\2\132\16\0\4\125"+
"\7\0\15\125\3\0\2\125\16\0\1\135\2\136\1\137"+
"\7\0\15\135\3\0\2\135\16\0\1\140\2\141\1\142"+
"\7\0\15\140\3\0\2\140\16\0\1\143\1\133\1\144"+
"\1\134\7\0\15\143\3\0\2\143\16\0\1\145\2\120"+
"\1\122\7\0\15\145\3\0\2\145\30\0\1\146\1\147"+
"\63\0\1\150\26\0\4\34\7\0\2\34\1\151\12\34"+
"\3\0\2\34\2\0\1\152\100\0\1\153\1\154\37\0"+
"\4\63\7\0\6\63\1\155\6\63\3\0\2\63\2\0"+
"\1\156\62\0\1\157\70\0\1\160\1\161\33\0\1\162"+
"\1\0\1\32\1\0\4\113\1\0\3\114\3\0\15\113"+
"\3\0\2\113\16\0\4\163\1\0\3\114\3\0\15\163"+
"\3\0\2\163\12\0\1\162\1\0\1\32\1\0\4\113"+
"\1\0\3\114\3\0\10\113\1\164\4\113\3\0\2\113"+
"\2\0\1\66\13\0\1\116\1\0\1\116\10\0\15\116"+
"\3\0\2\116\3\0\1\165\1\0\1\75\2\166\6\0"+
"\1\117\1\120\1\121\1\122\7\0\15\117\3\0\2\117"+
"\3\0\1\167\1\0\1\75\2\170\1\0\1\171\3\0"+
"\1\171\3\120\1\122\7\0\15\120\3\0\2\120\3\0"+
"\1\172\1\0\1\75\2\170\1\0\1\171\3\0\1\171"+
"\1\121\1\120\1\121\1\122\7\0\15\121\3\0\2\121"+
"\3\0\1\173\1\0\1\75\2\166\6\0\4\122\7\0"+
"\15\122\3\0\2\122\3\0\1\174\2\0\1\174\7\0"+
"\1\124\1\125\1\124\1\125\7\0\15\124\3\0\2\124"+
"\3\0\1\174\2\0\1\174\7\0\4\125\7\0\15\125"+
"\3\0\2\125\3\0\1\166\1\0\1\75\2\166\6\0"+
"\1\126\2\127\1\130\7\0\15\126\3\0\2\126\3\0"+
"\1\170\1\0\1\75\2\170\1\0\1\171\3\0\1\171"+
"\3\127\1\130\7\0\15\127\3\0\2\127\3\0\1\166"+
"\1\0\1\75\2\166\6\0\4\130\7\0\15\130\3\0"+
"\2\130\3\0\1\171\2\0\2\171\1\0\1\171\3\0"+
"\1\171\3\131\10\0\15\131\3\0\2\131\3\0\1\101"+
"\1\0\1\75\2\72\1\0\1\73\3\0\1\73\1\132"+
"\2\133\1\134\7\0\15\132\3\0\2\132\3\0\1\74"+
"\1\0\1\75\2\76\1\0\1\77\3\0\1\77\3\133"+
"\1\134\7\0\15\133\3\0\2\133\3\0\1\101\1\0"+
"\1\75\2\72\1\0\1\73\3\0\1\73\4\134\7\0"+
"\15\134\3\0\2\134\3\0\1\72\1\0\1\75\2\72"+
"\1\0\1\73\3\0\1\73\1\135\2\136\1\137\7\0"+
"\15\135\3\0\2\135\3\0\1\76\1\0\1\75\2\76"+
"\1\0\1\77\3\0\1\77\3\136\1\137\7\0\15\136"+
"\3\0\2\136\3\0\1\72\1\0\1\75\2\72\1\0"+
"\1\73\3\0\1\73\4\137\7\0\15\137\3\0\2\137"+
"\3\0\1\73\2\0\2\73\1\0\1\73\3\0\1\73"+
"\1\140\2\141\1\142\7\0\15\140\3\0\2\140\3\0"+
"\1\77\2\0\2\77\1\0\1\77\3\0\1\77\3\141"+
"\1\142\7\0\15\141\3\0\2\141\3\0\1\73\2\0"+
"\2\73\1\0\1\73\3\0\1\73\4\142\7\0\15\142"+
"\3\0\2\142\3\0\1\175\1\0\1\75\2\72\1\0"+
"\1\73\3\0\1\73\1\143\1\133\1\144\1\134\7\0"+
"\15\143\3\0\2\143\3\0\1\176\1\0\1\75\2\76"+
"\1\0\1\77\3\0\1\77\1\144\1\133\1\144\1\134"+
"\7\0\15\144\3\0\2\144\3\0\1\173\1\0\1\75"+
"\2\166\6\0\1\145\2\120\1\122\7\0\15\145\3\0"+
"\2\145\31\0\1\147\53\0\1\177\63\0\1\200\25\0"+
"\4\34\7\0\15\34\3\0\1\34\1\201\31\0\1\154"+
"\53\0\1\202\34\0\1\32\1\0\4\113\1\0\3\114"+
"\3\0\3\113\1\203\11\113\3\0\2\113\2\0\1\204"+
"\101\0\1\161\53\0\1\205\33\0\1\206\51\0\1\162"+
"\3\0\4\163\7\0\15\163\3\0\2\163\12\0\1\162"+
"\1\0\1\207\1\0\4\113\1\0\3\114\3\0\15\113"+
"\3\0\2\113\16\0\1\210\1\122\1\210\1\122\7\0"+
"\15\210\3\0\2\210\16\0\4\130\7\0\15\130\3\0"+
"\2\130\16\0\4\134\7\0\15\134\3\0\2\134\16\0"+
"\4\137\7\0\15\137\3\0\2\137\16\0\4\142\7\0"+
"\15\142\3\0\2\142\16\0\1\211\1\134\1\211\1\134"+
"\7\0\15\211\3\0\2\211\16\0\4\122\7\0\15\122"+
"\3\0\2\122\16\0\4\212\7\0\15\212\3\0\2\212"+
"\33\0\1\213\60\0\1\214\27\0\4\34\6\0\1\215"+
"\15\34\3\0\2\34\33\0\1\216\31\0\1\162\1\0"+
"\1\32\1\0\4\113\1\0\3\114\3\0\10\113\1\217"+
"\4\113\3\0\2\113\2\0\1\220\103\0\1\221\35\0"+
"\4\222\7\0\15\222\3\0\2\222\3\0\1\165\1\0"+
"\1\75\2\166\6\0\1\210\1\122\1\210\1\122\7\0"+
"\15\210\3\0\2\210\3\0\1\175\1\0\1\75\2\72"+
"\1\0\1\73\3\0\1\73\1\211\1\134\1\211\1\134"+
"\7\0\15\211\3\0\2\211\3\0\1\174\2\0\1\174"+
"\7\0\4\212\7\0\15\212\3\0\2\212\34\0\1\223"+
"\54\0\1\224\25\0\1\225\75\0\1\226\30\0\1\162"+
"\1\0\1\35\1\0\4\113\1\0\3\114\3\0\15\113"+
"\3\0\2\113\34\0\1\227\31\0\1\230\2\0\4\222"+
"\7\0\15\222\3\0\2\222\35\0\1\231\61\0\1\232"+
"\17\0\1\233\76\0\1\234\52\0\1\235\31\0\1\32"+
"\1\0\4\163\1\0\3\114\3\0\15\163\3\0\2\163"+
"\36\0\1\236\52\0\1\237\32\0\4\240\7\0\15\240"+
"\3\0\2\240\36\0\1\241\52\0\1\242\53\0\1\243"+
"\60\0\1\244\10\0\1\245\12\0\4\240\7\0\15\240"+
"\3\0\2\240\37\0\1\246\52\0\1\247\53\0\1\250"+
"\21\0\1\12\61\0\4\251\7\0\15\251\3\0\2\251"+
"\40\0\1\252\52\0\1\253\42\0\1\254\25\0\2\251"+
"\1\0\2\251\1\0\2\251\2\0\5\251\7\0\15\251"+
"\3\0\3\251\27\0\1\255\52\0\1\256\23\0";
private static int [] zzUnpackTrans() {
int [] result = new int[6665];
int offset = 0;
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
return result;
}
private static int zzUnpackTrans(String packed, int offset, int [] result) {
int i = 0; /* index in packed string */
int j = offset; /* index in unpacked array */
int l = packed.length();
while (i < l) {
int count = packed.charAt(i++);
int value = packed.charAt(i++);
value--;
do result[j++] = value; while (--count > 0);
}
return j;
}
/* error codes */
private static final int ZZ_UNKNOWN_ERROR = 0;
private static final int ZZ_NO_MATCH = 1;
private static final int ZZ_PUSHBACK_2BIG = 2;
/* error messages for the codes above */
private static final String ZZ_ERROR_MSG[] = {
"Unknown internal scanner error",
"Error: could not match input",
"Error: pushback value was too large"
};
/**
* ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
*/
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
private static final String ZZ_ATTRIBUTE_PACKED_0 =
"\11\0\1\11\7\1\1\11\7\1\1\11\2\1\1\11"+
"\13\1\1\11\6\1\2\11\3\0\1\11\14\0\3\1"+
"\1\0\1\1\1\0\1\1\1\0\1\1\3\0\7\1"+
"\2\0\1\1\1\0\15\1\3\0\1\1\1\11\3\0"+
"\1\1\1\11\5\0\1\1\4\0\1\1\2\0\2\1"+
"\2\0\1\1\5\0\1\11\3\1\5\0\1\11\30\0"+
"\1\1\2\0\3\11";
private static int [] zzUnpackAttribute() {
int [] result = new int[174];
int offset = 0;
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
return result;
}
private static int zzUnpackAttribute(String packed, int offset, int [] result) {
int i = 0; /* index in packed string */
int j = offset; /* index in unpacked array */
int l = packed.length();
while (i < l) {
int count = packed.charAt(i++);
int value = packed.charAt(i++);
do result[j++] = value; while (--count > 0);
}
return j;
}
/** the input device */
private java.io.Reader zzReader;
/** the current state of the DFA */
private int zzState;
/** the current lexical state */
private int zzLexicalState = YYINITIAL;
/** this buffer contains the current text to be matched and is
the source of the yytext() string */
private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
/** the text position at the last accepting state */
private int zzMarkedPos;
/** the text position at the last state to be included in yytext */
private int zzPushbackPos;
/** the current text position in the buffer */
private int zzCurrentPos;
/** startRead marks the beginning of the yytext() string in the buffer */
private int zzStartRead;
/** endRead marks the last character in the buffer that has been read
from input */
private int zzEndRead;
/** number of newlines encountered up to the start of the matched text */
private int yyline;
/** the number of characters up to the start of the matched text */
private int yychar;
/**
* the number of characters from the last newline up to the start of the
* matched text
*/
private int yycolumn;
/**
* zzAtBOL == true <=> the scanner is currently at the beginning of a line
*/
private boolean zzAtBOL = true;
/** zzAtEOF == true <=> the scanner is at the EOF */
private boolean zzAtEOF;
/* user code: */
public static final int ALPHANUM = 0;
public static final int APOSTROPHE = 1;
public static final int ACRONYM = 2;
public static final int COMPANY = 3;
public static final int EMAIL = 4;
public static final int HOST = 5;
public static final int NUM = 6;
public static final int CJ = 7;
public static final int INTERNAL_LINK = 8;
public static final int EXTERNAL_LINK = 9;
public static final int CITATION = 10;
public static final int CATEGORY = 11;
public static final int BOLD = 12;
public static final int ITALICS = 13;
public static final int BOLD_ITALICS = 14;
public static final int HEADING = 15;
public static final int SUB_HEADING = 16;
public static final int EXTERNAL_LINK_URL = 17;
private int currentTokType;
private int numBalanced = 0;
private int positionInc = 1;
public static final String [] TOKEN_TYPES = new String [] {
"<ALPHANUM>",
"<APOSTROPHE>",
"<ACRONYM>",
"<COMPANY>",
"<EMAIL>",
"<HOST>",
"<NUM>",
"<CJ>",
WikipediaTokenizer.INTERNAL_LINK,
WikipediaTokenizer.EXTERNAL_LINK,
WikipediaTokenizer.CITATION,
WikipediaTokenizer.CATEGORY,
WikipediaTokenizer.BOLD,
WikipediaTokenizer.ITALICS,
WikipediaTokenizer.BOLD_ITALICS,
WikipediaTokenizer.HEADING,
WikipediaTokenizer.SUB_HEADING,
WikipediaTokenizer.EXTERNAL_LINK_URL
};
public final int yychar()
{
return yychar;
}
public final int getPositionIncrement(){
return positionInc;
}
/**
* Fills Lucene token with the current token text.
*/
final void getText(Token t, int tokType) {
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
/**
* Creates a new scanner
* There is also a java.io.InputStream version of this constructor.
*
* @param in the java.io.Reader to read input from.
*/
WikipediaTokenizerImpl(java.io.Reader in) {
this.zzReader = in;
}
/**
* Creates a new scanner.
* There is also a java.io.Reader version of this constructor.
*
* @param in the java.io.InputStream to read input from.
*/
WikipediaTokenizerImpl(java.io.InputStream in) {
this(new java.io.InputStreamReader(in));
}
/**
* Unpacks the compressed character translation table.
*
* @param packed the packed character translation table
* @return the unpacked character translation table
*/
private static char [] zzUnpackCMap(String packed) {
char [] map = new char[0x10000];
int i = 0; /* index in packed string */
int j = 0; /* index in unpacked array */
while (i < 230) {
int count = packed.charAt(i++);
char value = packed.charAt(i++);
do map[j++] = value; while (--count > 0);
}
return map;
}
/**
* Refills the input buffer.
*
* @return <code>false</code>, iff there was new input.
*
* @exception java.io.IOException if any I/O-Error occurs
*/
private boolean zzRefill() throws java.io.IOException {
/* first: make room (if you can) */
if (zzStartRead > 0) {
System.arraycopy(zzBuffer, zzStartRead,
zzBuffer, 0,
zzEndRead-zzStartRead);
/* translate stored positions */
zzEndRead-= zzStartRead;
zzCurrentPos-= zzStartRead;
zzMarkedPos-= zzStartRead;
zzPushbackPos-= zzStartRead;
zzStartRead = 0;
}
/* is the buffer big enough? */
if (zzCurrentPos >= zzBuffer.length) {
/* if not: blow it up */
char newBuffer[] = new char[zzCurrentPos*2];
System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
zzBuffer = newBuffer;
}
/* finally: fill the buffer with new input */
int numRead = zzReader.read(zzBuffer, zzEndRead,
zzBuffer.length-zzEndRead);
if (numRead < 0) {
return true;
}
else {
zzEndRead+= numRead;
return false;
}
}
/**
* Closes the input stream.
*/
public final void yyclose() throws java.io.IOException {
zzAtEOF = true; /* indicate end of file */
zzEndRead = zzStartRead; /* invalidate buffer */
if (zzReader != null)
zzReader.close();
}
/**
* Resets the scanner to read from a new input stream.
* Does not close the old reader.
*
* All internal variables are reset, the old input stream
* <b>cannot</b> be reused (internal buffer is discarded and lost).
* Lexical state is set to <tt>ZZ_INITIAL</tt>.
*
* @param reader the new input stream
*/
public final void yyreset(java.io.Reader reader) {
zzReader = reader;
zzAtBOL = true;
zzAtEOF = false;
zzEndRead = zzStartRead = 0;
zzCurrentPos = zzMarkedPos = zzPushbackPos = 0;
yyline = yychar = yycolumn = 0;
zzLexicalState = YYINITIAL;
}
/**
* Returns the current lexical state.
*/
public final int yystate() {
return zzLexicalState;
}
/**
* Enters a new lexical state
*
* @param newState the new lexical state
*/
public final void yybegin(int newState) {
zzLexicalState = newState;
}
/**
* Returns the text matched by the current regular expression.
*/
public final String yytext() {
return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
}
/**
* Returns the character at position <tt>pos</tt> from the
* matched text.
*
* It is equivalent to yytext().charAt(pos), but faster
*
* @param pos the position of the character to fetch.
* A value from 0 to yylength()-1.
*
* @return the character at position pos
*/
public final char yycharat(int pos) {
return zzBuffer[zzStartRead+pos];
}
/**
* Returns the length of the matched text region.
*/
public final int yylength() {
return zzMarkedPos-zzStartRead;
}
/**
* Reports an error that occurred while scanning.
*
* In a well-formed scanner (no or only correct usage of
* yypushback(int) and a match-all fallback rule) this method
* will only be called with things that "Can't Possibly Happen".
* If this method is called, something is seriously wrong
* (e.g. a JFlex bug producing a faulty scanner etc.).
*
* Usual syntax/scanner level error handling should be done
* in error fallback rules.
*
* @param errorCode the code of the errormessage to display
*/
private void zzScanError(int errorCode) {
String message;
try {
message = ZZ_ERROR_MSG[errorCode];
}
catch (ArrayIndexOutOfBoundsException e) {
message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
}
throw new Error(message);
}
/**
* Pushes the specified amount of characters back into the input stream.
*
* They will be read again by the next call of the scanning method
*
* @param number the number of characters to be read again.
* This number must not be greater than yylength()!
*/
public void yypushback(int number) {
if ( number > yylength() )
zzScanError(ZZ_PUSHBACK_2BIG);
zzMarkedPos -= number;
}
/**
* Resumes scanning until the next regular expression is matched,
* the end of input is encountered or an I/O-Error occurs.
*
* @return the next token
* @exception java.io.IOException if any I/O-Error occurs
*/
public int getNextToken() throws java.io.IOException {
int zzInput;
int zzAction;
// cached fields:
int zzCurrentPosL;
int zzMarkedPosL;
int zzEndReadL = zzEndRead;
char [] zzBufferL = zzBuffer;
char [] zzCMapL = ZZ_CMAP;
int [] zzTransL = ZZ_TRANS;
int [] zzRowMapL = ZZ_ROWMAP;
int [] zzAttrL = ZZ_ATTRIBUTE;
while (true) {
zzMarkedPosL = zzMarkedPos;
yychar+= zzMarkedPosL-zzStartRead;
zzAction = -1;
zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
zzState = zzLexicalState;
zzForAction: {
while (true) {
if (zzCurrentPosL < zzEndReadL)
zzInput = zzBufferL[zzCurrentPosL++];
else if (zzAtEOF) {
zzInput = YYEOF;
break zzForAction;
}
else {
// store back cached positions
zzCurrentPos = zzCurrentPosL;
zzMarkedPos = zzMarkedPosL;
boolean eof = zzRefill();
// get translated positions and possibly new buffer
zzCurrentPosL = zzCurrentPos;
zzMarkedPosL = zzMarkedPos;
zzBufferL = zzBuffer;
zzEndReadL = zzEndRead;
if (eof) {
zzInput = YYEOF;
break zzForAction;
}
else {
zzInput = zzBufferL[zzCurrentPosL++];
}
}
int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
if (zzNext == -1) break zzForAction;
zzState = zzNext;
int zzAttributes = zzAttrL[zzState];
if ( (zzAttributes & 1) == 1 ) {
zzAction = zzState;
zzMarkedPosL = zzCurrentPosL;
if ( (zzAttributes & 8) == 8 ) break zzForAction;
}
}
}
// store back cached position
zzMarkedPos = zzMarkedPosL;
switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
case 7:
{ /* ignore */
}
case 42: break;
case 3:
{ positionInc = 1; return CJ;
}
case 43: break;
case 26:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/
}
case 44: break;
case 37:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/
}
case 45: break;
case 11:
{ currentTokType = ITALICS; yybegin(STRING); return currentTokType;/*italics*/
}
case 46: break;
case 5:
{ yybegin(CATEGORY_STATE); return currentTokType;
}
case 47: break;
case 34:
{ numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/
}
case 48: break;
case 24:
{ positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);
}
case 49: break;
case 22:
{ positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);
}
case 50: break;
case 39:
{ positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);
}
case 51: break;
case 18:
{ yybegin(STRING); return currentTokType;/* STRING ALPHANUM*/
}
case 52: break;
case 21:
{ positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}
}
case 53: break;
case 1:
{ positionInc = 1;
}
case 54: break;
case 41:
{ numBalanced = 0;currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
}
case 55: break;
case 9:
{ yybegin(YYINITIAL);
}
case 56: break;
case 19:
{ numBalanced = 0;currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);
}
case 57: break;
case 13:
{ yybegin(STRING);return currentTokType;
}
case 58: break;
case 36:
{ positionInc = 1; return EMAIL;
}
case 59: break;
case 35:
{ positionInc = 1; return ACRONYM;
}
case 60: break;
case 4:
{ positionInc = 1;currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);
}
case 61: break;
case 17:
{ /* ignore STRING */
}
case 62: break;
case 40:
{ currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
}
case 63: break;
case 20:
{ yybegin(STRING); return currentTokType;/*pipe*/
}
case 64: break;
case 12:
{ currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);
}
case 65: break;
case 27:
{ numBalanced = 0;currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
}
case 66: break;
case 33:
{ numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/
}
case 67: break;
case 16:
{ yybegin(DOUBLE_BRACE_STATE); return currentTokType;
}
case 68: break;
case 29:
{ positionInc = 1; return HOST;
}
case 69: break;
case 32:
{ currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE);
}
case 70: break;
case 25:
{ currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
}
case 71: break;
case 23:
{ positionInc = 0; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);
}
case 72: break;
case 14:
{ currentTokType = SUB_HEADING; yybegin(STRING);
}
case 73: break;
case 28:
{ positionInc = 1; return APOSTROPHE;
}
case 74: break;
case 30:
{ positionInc = 1; return NUM;
}
case 75: break;
case 15:
{ currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); return currentTokType;
}
case 76: break;
case 6:
{ yybegin(INTERNAL_LINK_STATE); return currentTokType;
}
case 77: break;
case 2:
{ positionInc = 1; return ALPHANUM;
}
case 78: break;
case 31:
{ positionInc = 1; return COMPANY;
}
case 79: break;
case 10:
{ currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE);
}
case 80: break;
case 8:
{ positionInc = 1; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE);return currentTokType;
}
case 81: break;
case 38:
{ positionInc = 0; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
}
case 82: break;
default:
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
zzAtEOF = true;
return YYEOF;
}
else {
zzScanError(ZZ_NO_MATCH);
}
}
}
}
}

View File

@ -0,0 +1,324 @@
package org.apache.lucene.wikipedia.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Token;
%%
%class WikipediaTokenizerImpl
%unicode
%integer
%function getNextToken
%pack
%char
%{
public static final int ALPHANUM = 0;
public static final int APOSTROPHE = 1;
public static final int ACRONYM = 2;
public static final int COMPANY = 3;
public static final int EMAIL = 4;
public static final int HOST = 5;
public static final int NUM = 6;
public static final int CJ = 7;
public static final int INTERNAL_LINK = 8;
public static final int EXTERNAL_LINK = 9;
public static final int CITATION = 10;
public static final int CATEGORY = 11;
public static final int BOLD = 12;
public static final int ITALICS = 13;
public static final int BOLD_ITALICS = 14;
public static final int HEADING = 15;
public static final int SUB_HEADING = 16;
public static final int EXTERNAL_LINK_URL = 17;
private int currentTokType;
private int numBalanced = 0;
private int positionInc = 1;
public static final String [] TOKEN_TYPES = new String [] {
"<ALPHANUM>",
"<APOSTROPHE>",
"<ACRONYM>",
"<COMPANY>",
"<EMAIL>",
"<HOST>",
"<NUM>",
"<CJ>",
WikipediaTokenizer.INTERNAL_LINK,
WikipediaTokenizer.EXTERNAL_LINK,
WikipediaTokenizer.CITATION,
WikipediaTokenizer.CATEGORY,
WikipediaTokenizer.BOLD,
WikipediaTokenizer.ITALICS,
WikipediaTokenizer.BOLD_ITALICS,
WikipediaTokenizer.HEADING,
WikipediaTokenizer.SUB_HEADING,
WikipediaTokenizer.EXTERNAL_LINK_URL
};
public final int yychar()
{
return yychar;
}
public final int getPositionIncrement(){
return positionInc;
}
/**
* Fills Lucene token with the current token text.
*/
final void getText(Token t, int tokType) {
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
%}
// basic word: a sequence of digits & letters
ALPHANUM = ({LETTER}|{DIGIT}|{KOREAN})+
// internal apostrophes: O'Reilly, you're, O'Reilly's
// use a post-filter to remove possesives
APOSTROPHE = {ALPHA} ("'" {ALPHA})+
// acronyms: U.S.A., I.B.M., etc.
// use a post-filter to remove dots
ACRONYM = {ALPHA} "." ({ALPHA} ".")+
// company names like AT&T and Excite@Home.
COMPANY = {ALPHA} ("&"|"@") {ALPHA}
// email addresses
EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
// hostname
HOST = {ALPHANUM} ((".") {ALPHANUM})+
// floating point, serial, model numbers, ip addresses, etc.
// every other segment must have at least one digit
NUM = ({ALPHANUM} {P} {HAS_DIGIT}
| {DIGIT}+ {P} {DIGIT}+
| {HAS_DIGIT} {P} {ALPHANUM}
| {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
| {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
| {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
| {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
TAGS = "<"\/?{ALPHANUM}({WHITESPACE}*{ALPHANUM}=\"{ALPHANUM}\")*">"
// punctuation
P = ("_"|"-"|"/"|"."|",")
// at least one digit
HAS_DIGIT =
({LETTER}|{DIGIT})*
{DIGIT}
({LETTER}|{DIGIT})*
ALPHA = ({LETTER})+
LETTER = [\u0041-\u005a\u0061-\u007a\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff\u0100-\u1fff\uffa0-\uffdc]
DIGIT = [\u0030-\u0039\u0660-\u0669\u06f0-\u06f9\u0966-\u096f\u09e6-\u09ef\u0a66-\u0a6f\u0ae6-\u0aef\u0b66-\u0b6f\u0be7-\u0bef\u0c66-\u0c6f\u0ce6-\u0cef\u0d66-\u0d6f\u0e50-\u0e59\u0ed0-\u0ed9\u1040-\u1049]
KOREAN = [\uac00-\ud7af\u1100-\u11ff]
// Chinese, Japanese
CJ = [\u3040-\u318f\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
WHITESPACE = \r\n | [ \r\n\t\f]
//Wikipedia
DOUBLE_BRACKET = "["{2}
DOUBLE_BRACKET_CLOSE = "]"{2}
DOUBLE_BRACKET_CAT = "["{2}":"?"Category:"
EXTERNAL_LINK = "["
TWO_SINGLE_QUOTES = "'"{2}
CITATION = "<ref>"
CITATION_CLOSE = "</ref>"
INFOBOX = {DOUBLE_BRACE}("I"|"i")nfobox_
DOUBLE_BRACE = "{"{2}
DOUBLE_BRACE_CLOSE = "}"{2}
PIPE = "|"
DOUBLE_EQUALS = "="{2}
%state CATEGORY_STATE
%state INTERNAL_LINK_STATE
%state EXTERNAL_LINK_STATE
%state TWO_SINGLE_QUOTES_STATE
%state THREE_SINGLE_QUOTES_STATE
%state FIVE_SINGLE_QUOTES_STATE
%state DOUBLE_EQUALS_STATE
%state DOUBLE_BRACE_STATE
%state STRING
%%
<YYINITIAL>{ALPHANUM} {positionInc = 1; return ALPHANUM; }
<YYINITIAL>{APOSTROPHE} {positionInc = 1; return APOSTROPHE; }
<YYINITIAL>{ACRONYM} {positionInc = 1; return ACRONYM; }
<YYINITIAL>{COMPANY} {positionInc = 1; return COMPANY; }
<YYINITIAL>{EMAIL} {positionInc = 1; return EMAIL; }
<YYINITIAL>{NUM} {positionInc = 1; return NUM; }
<YYINITIAL>{HOST} {positionInc = 1; return HOST; }
<YYINITIAL>{CJ} {positionInc = 1; return CJ; }
//wikipedia
<YYINITIAL>{
//First {ALPHANUM} is always the link, set position to 0 for double bracket
{DOUBLE_BRACKET} {positionInc = 0; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);}
{DOUBLE_BRACKET_CAT} {positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);}
{EXTERNAL_LINK} {positionInc = 1;currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);}
{TWO_SINGLE_QUOTES} {positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}}
{DOUBLE_EQUALS} {positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);}
{DOUBLE_BRACE} {positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
{CITATION} {positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
//ignore
. | {WHITESPACE} |{INFOBOX} { positionInc = 1; }
}
<INTERNAL_LINK_STATE>{
//First {ALPHANUM} is always the link, set position to 0 for these
{ALPHANUM} {yybegin(INTERNAL_LINK_STATE); return currentTokType;}
{DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);}
//ignore
. | {WHITESPACE} { positionInc = 1; }
}
<EXTERNAL_LINK_STATE>{
"http://"{HOST}("/"?({ALPHANUM}|{P}|\?|"&"|"="|"#")*)* {positionInc = 0; yybegin(EXTERNAL_LINK_STATE); return currentTokType;}
{ALPHANUM} {positionInc = 1; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE);return currentTokType;}
"]" {yybegin(YYINITIAL);}
{WHITESPACE} { positionInc = 1; }
}
<CATEGORY_STATE>{
{ALPHANUM} {yybegin(CATEGORY_STATE); return currentTokType;}
{DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);}
//ignore
. | {WHITESPACE} { positionInc = 1; }
}
//italics
<TWO_SINGLE_QUOTES_STATE>{
"'" {currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE);}
"'''" {currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE);}
{ALPHANUM} {currentTokType = ITALICS; yybegin(STRING); return currentTokType;/*italics*/}
//we can have links inside, let those override
{DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
{DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
//ignore
. | {WHITESPACE} { /* ignore */ }
}
//bold
<THREE_SINGLE_QUOTES_STATE>{
{ALPHANUM} {yybegin(STRING);return currentTokType;}
//we can have links inside, let those override
{DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
{DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
//ignore
. | {WHITESPACE} { /* ignore */ }
}
//bold italics
<FIVE_SINGLE_QUOTES_STATE>{
{ALPHANUM} {yybegin(STRING);return currentTokType;}
//we can have links inside, let those override
{DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
{DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
//ignore
. | {WHITESPACE} { /* ignore */ }
}
<DOUBLE_EQUALS_STATE>{
"=" {currentTokType = SUB_HEADING; yybegin(STRING);}
{ALPHANUM} {currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); return currentTokType;}
{DOUBLE_EQUALS} {yybegin(YYINITIAL);}
//ignore
. | {WHITESPACE} { /* ignore */ }
}
<DOUBLE_BRACE_STATE>{
{ALPHANUM} {yybegin(DOUBLE_BRACE_STATE); return currentTokType;}
{DOUBLE_BRACE_CLOSE} {yybegin(YYINITIAL);}
{CITATION_CLOSE} {yybegin(YYINITIAL);}
//ignore
. | {WHITESPACE} { /* ignore */ }
}
<STRING> {
"'''''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/}
"'''" {numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/}
"''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/}
"===" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/}
{ALPHANUM} {yybegin(STRING); return currentTokType;/* STRING ALPHANUM*/}
//we can have links inside, let those override
{DOUBLE_BRACKET} {numBalanced = 0;currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
{DOUBLE_BRACKET_CAT} {numBalanced = 0;currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
{EXTERNAL_LINK} {numBalanced = 0;currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
{PIPE} {yybegin(STRING); return currentTokType;/*pipe*/}
.|{WHITESPACE} { /* ignore STRING */ }
}
/*
{INTERNAL_LINK} { return currentTokType; }
{CITATION} { return currentTokType; }
{CATEGORY} { return currentTokType; }
{BOLD} { return currentTokType; }
{ITALICS} { return currentTokType; }
{BOLD_ITALICS} { return currentTokType; }
{HEADING} { return currentTokType; }
{SUB_HEADING} { return currentTokType; }
*/
//end wikipedia
/** Ignore the rest */
. | {WHITESPACE}|{TAGS} { /* ignore */ }
//INTERNAL_LINK = "["{2}({ALPHANUM}+{WHITESPACE}*)+"]"{2}
//EXTERNAL_LINK = "["http://"{HOST}.*?"]"
//CITATION = "{"{2}({ALPHANUM}+{WHITESPACE}*)+"}"{2}
//CATEGORY = "["{2}"Category:"({ALPHANUM}+{WHITESPACE}*)+"]"{2}
//CATEGORY_COLON = "["{2}":Category:"({ALPHANUM}+{WHITESPACE}*)+"]"{2}
//BOLD = '''({ALPHANUM}+{WHITESPACE}*)+'''
//ITALICS = ''({ALPHANUM}+{WHITESPACE}*)+''
//BOLD_ITALICS = '''''({ALPHANUM}+{WHITESPACE}*)+'''''
//HEADING = "="{2}({ALPHANUM}+{WHITESPACE}*)+"="{2}
//SUB_HEADING ="="{3}({ALPHANUM}+{WHITESPACE}*)+"="{3}
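
A worked illustration of how the link rules above combine with positionInc, grounded in the assertions of testLinkPhrases in WikipediaTokenizerTest later in this commit. For the input "click [[link here]] click [http://lucene.apache.org here]" the scanner emits the following term / type / positionIncrement sequence (the increments are asserted by that test; the types are what the rules above imply):

click                      <ALPHANUM>                                    1
link                       WikipediaTokenizer.INTERNAL_LINK ("il")       0
here                       WikipediaTokenizer.INTERNAL_LINK ("il")       1
click                      <ALPHANUM>                                    1
http://lucene.apache.org   WikipediaTokenizer.EXTERNAL_LINK_URL ("elu")  0
here                       WikipediaTokenizer.EXTERNAL_LINK ("el")       1

A position increment of 0 places a term at the same position as the token before it, so the first link term and the URL overlap the preceding word, which appears to be the intent of the "set position to 0" comments in the rules above.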

View File

@ -0,0 +1,35 @@
<!--
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-->
<HTML>
<!--
*
--><HEAD>
<TITLE>org.apache.lucene.wikipedia</TITLE>
</HEAD>
<BODY>
<DIV>Tools for working with <a href="http://www.wikipedia.org">Wikipedia</a> content.
</DIV>
<DIV>&nbsp;</DIV>
<DIV align="center">
Copyright &copy; 2007 <A HREF="http://www.apache.org">Apache Software Foundation</A>
</DIV>
</BODY>
</HTML>

View File

@ -0,0 +1,213 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.wikipedia.analysis;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import java.io.File;
import java.io.StringReader;
import java.util.Map;
import java.util.HashMap;
/**
*
*
**/
public class WikipediaTokenizerTest extends TestCase {
public WikipediaTokenizerTest(String s) {
super(s);
}
protected void setUp() {
}
protected void tearDown() {
}
public void testHandwritten() throws Exception {
//make sure all tokens are in only one type
String test = "[[link]] This is a [[Category:foo]] Category This is a linked [[:Category:bar none withstanding]] " +
"Category This is (parens) This is a [[link]] This is an external URL [http://lucene.apache.org] " +
"Here is ''italics'' and ''more italics'', '''bold''' and '''''five quotes''''' " +
" This is a [[link|display info]] This is a period. Here is $3.25 and here is 3.50. Here's Johnny. " +
"==heading== ===sub head=== followed by some text [[Category:blah| ]] " +
"''[[Category:ital_cat]]'' here is some that is ''italics [[Category:foo]] but is never closed." +
"'''same [[Category:foo]] goes for this '''''and2 [[Category:foo]] and this" +
" [http://foo.boo.com/test/test/ Test Test] [http://foo.boo.com/test/test/test.html Test Test]" +
" [http://foo.boo.com/test/test/test.html?g=b&c=d Test Test] <ref>Citation</ref> <sup>martian</sup> <span class=\"glue\">code</span>";
Map tcm = new HashMap();//map tokens to types
tcm.put("link", WikipediaTokenizer.INTERNAL_LINK);
tcm.put("display", WikipediaTokenizer.INTERNAL_LINK);
tcm.put("info", WikipediaTokenizer.INTERNAL_LINK);
tcm.put("http://lucene.apache.org", WikipediaTokenizer.EXTERNAL_LINK_URL);
tcm.put("http://foo.boo.com/test/test/", WikipediaTokenizer.EXTERNAL_LINK_URL);
tcm.put("http://foo.boo.com/test/test/test.html", WikipediaTokenizer.EXTERNAL_LINK_URL);
tcm.put("http://foo.boo.com/test/test/test.html?g=b&c=d", WikipediaTokenizer.EXTERNAL_LINK_URL);
tcm.put("Test", WikipediaTokenizer.EXTERNAL_LINK);
//alphanums
tcm.put("This", "<ALPHANUM>");
tcm.put("is", "<ALPHANUM>");
tcm.put("a", "<ALPHANUM>");
tcm.put("Category", "<ALPHANUM>");
tcm.put("linked", "<ALPHANUM>");
tcm.put("parens", "<ALPHANUM>");
tcm.put("external", "<ALPHANUM>");
tcm.put("URL", "<ALPHANUM>");
tcm.put("and", "<ALPHANUM>");
tcm.put("period", "<ALPHANUM>");
tcm.put("Here", "<ALPHANUM>");
tcm.put("Here's", "<APOSTROPHE>");
tcm.put("here", "<ALPHANUM>");
tcm.put("Johnny", "<ALPHANUM>");
tcm.put("followed", "<ALPHANUM>");
tcm.put("by", "<ALPHANUM>");
tcm.put("text", "<ALPHANUM>");
tcm.put("that", "<ALPHANUM>");
tcm.put("but", "<ALPHANUM>");
tcm.put("never", "<ALPHANUM>");
tcm.put("closed", "<ALPHANUM>");
tcm.put("goes", "<ALPHANUM>");
tcm.put("for", "<ALPHANUM>");
tcm.put("this", "<ALPHANUM>");
tcm.put("an", "<ALPHANUM>");
tcm.put("some", "<ALPHANUM>");
tcm.put("martian", "<ALPHANUM>");
tcm.put("code", "<ALPHANUM>");
tcm.put("foo", WikipediaTokenizer.CATEGORY);
tcm.put("bar", WikipediaTokenizer.CATEGORY);
tcm.put("none", WikipediaTokenizer.CATEGORY);
tcm.put("withstanding", WikipediaTokenizer.CATEGORY);
tcm.put("blah", WikipediaTokenizer.CATEGORY);
tcm.put("ital", WikipediaTokenizer.CATEGORY);
tcm.put("cat", WikipediaTokenizer.CATEGORY);
tcm.put("italics", WikipediaTokenizer.ITALICS);
tcm.put("more", WikipediaTokenizer.ITALICS);
tcm.put("bold", WikipediaTokenizer.BOLD);
tcm.put("same", WikipediaTokenizer.BOLD);
tcm.put("five", WikipediaTokenizer.BOLD_ITALICS);
tcm.put("and2", WikipediaTokenizer.BOLD_ITALICS);
tcm.put("quotes", WikipediaTokenizer.BOLD_ITALICS);
tcm.put("heading", WikipediaTokenizer.HEADING);
tcm.put("sub", WikipediaTokenizer.SUB_HEADING);
tcm.put("head", WikipediaTokenizer.SUB_HEADING);
tcm.put("Citation", WikipediaTokenizer.CITATION);
tcm.put("3.25", "<NUM>");
tcm.put("3.50", "<NUM>");
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
Token token = new Token();
int count = 0;
int numItalics = 0;
int numBoldItalics = 0;
int numCategory = 0;
int numCitation = 0;
while ((token = tf.next(token)) != null) {
String tokText = token.termText();
//System.out.println("Text: " + tokText + " Type: " + token.type());
assertTrue("token is null and it shouldn't be", token != null);
String expectedType = (String) tcm.get(tokText);
assertTrue("expectedType is null and it shouldn't be for: " + token, expectedType != null);
assertTrue(token.type() + " is not equal to " + expectedType + " for " + token, token.type().equals(expectedType) == true);
count++;
if (token.type().equals(WikipediaTokenizer.ITALICS) == true){
numItalics++;
} else if (token.type().equals(WikipediaTokenizer.BOLD_ITALICS) == true){
numBoldItalics++;
} else if (token.type().equals(WikipediaTokenizer.CATEGORY) == true){
numCategory++;
}
else if (token.type().equals(WikipediaTokenizer.CITATION) == true){
numCitation++;
}
}
assertTrue("We have not seen enough tokens: " + count + " is not >= " + tcm.size(), count >= tcm.size());
assertTrue(numItalics + " does not equal: " + 4 + " for numItalics", numItalics == 4);
assertTrue(numBoldItalics + " does not equal: " + 3 + " for numBoldItalics", numBoldItalics == 3);
assertTrue(numCategory + " does not equal: " + 10 + " for numCategory", numCategory == 10);
assertTrue(numCitation + " does not equal: " + 1 + " for numCitation", numCitation == 1);
}
public void testLinkPhrases() throws Exception {
String test = "click [[link here]] click [http://lucene.apache.org here]";
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
Token token = new Token();
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "click", new String(token.termBuffer(), 0, token.termLength()).equals("click") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link", new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "click",
new String(token.termBuffer(), 0, token.termLength()).equals("click") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org",
new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
}
public void testLinks() throws Exception {
String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here]";
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
Token token = new Token();
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org/java/docs/index.html#news",
new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org/java/docs/index.html#news") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
tf.next(token);//skip here
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org/java/docs/index.html?b=c",
new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org/java/docs/index.html?b=c") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
}
}

View File

@ -367,6 +367,10 @@ document.write("Last Published: " + document.lastModified);
<li>
<a href="api/contrib-swing/index.html">Swing</a>
</li>
<li>
<a href="api/contrib-wikipedia/index.html">Wikipedia</a>
</li>
<li>
<a href="api/contrib-wordnet/index.html">Wordnet</a>
@ -383,11 +387,11 @@ document.write("Last Published: " + document.lastModified);
</p>
</div>
<a name="N10097"></a><a name="Downloads"></a>
<a name="N1009C"></a><a name="Downloads"></a>
<h2 class="boxed">Downloads</h2>
<div class="section">
<p>System Requirements are detailed <a href="systemrequirements.html">here</a>.</p>
<a name="N100A3"></a><a name="Clover"></a>
<a name="N100A8"></a><a name="Clover"></a>
<h3 class="boxed">Clover Test Coverage Reports</h3>
<p>
@ -396,7 +400,7 @@ document.write("Last Published: " + document.lastModified);
<a href="http://lucene.zones.apache.org:8080/hudson/job/Lucene-Nightly/lastSuccessfulBuild/artifact/trunk/build/test/clover/reports/index.html">here</a>
for the nightly build.
</p>
<a name="N100B4"></a><a name="Hudson"></a>
<a name="N100B9"></a><a name="Hudson"></a>
<h3 class="boxed">Hudson</h3>
<p>
@ -404,13 +408,13 @@ document.write("Last Published: " + document.lastModified);
project. It is responsible for running nightly builds, code coverage reports as well as building the nightly version
of the website.
</p>
<a name="N100C1"></a><a name="Nightly"></a>
<a name="N100C6"></a><a name="Nightly"></a>
<h3 class="boxed">Nightly Build Download</h3>
<p>Nightly builds are based on the trunk version of the code checked into
<a href="https://svn.apache.org/repos/asf/lucene/java/trunk">SVN</a>
</p>
<a href="http://lucene.zones.apache.org:8080/hudson/job/Lucene-Nightly/">Download via Hudson</a><a name="N100D3"></a><a name="source"></a>
<a href="http://lucene.zones.apache.org:8080/hudson/job/Lucene-Nightly/">Download via Hudson</a><a name="N100D8"></a><a name="source"></a>
<h3 class="boxed">Source Code</h3>
<p>The source files are now stored using Subversion (see http://subversion.tigris.org/ and http://svnbook.red-bean.com/)
</p>

View File

[Generated PDF documentation regenerated along with the HTML changes above: ASCII85-encoded content streams, stream /Length values, internal GoTo link destinations, xref offsets, and the startxref pointer all changed; the encoded binary diff carries no readable content.]

View File

@ -35,6 +35,7 @@
<li><a href="api/contrib-spellchecker/index.html">Spellchecker</a></li>
<li><a href="api/contrib-surround/index.html">Surround</a></li>
<li><a href="api/contrib-swing/index.html">Swing</a></li>
<li><a href="api/contrib-wikipedia/index.html">Wikipedia</a></li>
<li><a href="api/contrib-wordnet/index.html">Wordnet</a></li>
<li><a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a></li></ul></li>
</ul>