LUCENE-5339: merge trunk

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5339@1552377 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2013-12-19 17:48:47 +00:00
commit 0699ac7d42
1350 changed files with 149288 additions and 8431 deletions

.gitignore
View File

@ -23,6 +23,8 @@
/bin
/bin.*
/pom.xml
/nbproject
/nb-build
# ./lucene

View File

@ -36,10 +36,7 @@
depends="check-svn-working-copy,validate,documentation-lint"/>
<target name="test" description="Test both Lucene and Solr">
<subant target="test" inheritall="false" failonerror="true">
<fileset dir="lucene" includes="build.xml" />
<fileset dir="solr" includes="build.xml" />
</subant>
<subant buildpath="." antfile="extra-targets.xml" target="-run-test" inheritall="false" failonerror="true" />
</target>
<target name="pitest" description="Run PITest on both Lucene and Solr">
@ -194,6 +191,39 @@
</delete>
</target>
<target name="netbeans" depends="resolve" description="Setup Netbeans configuration">
<pathconvert property="netbeans.fileset.sourcefolders" pathsep="|" dirsep="/">
<dirset dir="${basedir}/lucene" includes="**/src/java, **/src/examples, **/src/test, **/src/resources"
excludes="tools/**, build/**, backwards/**" />
<dirset dir="${basedir}/solr" includes="**/src/java, **/src/examples, **/src/test, **/src/resources"
excludes="build/**" />
<map from="${basedir}/" to=""/>
</pathconvert>
<!-- TODO: find a better way to exclude duplicate JAR files & fix the servlet-api mess! -->
<pathconvert property="netbeans.path.libs" pathsep=":" dirsep="/">
<fileset dir="${basedir}/lucene" includes="**/lib/*.jar"
excludes="**/*servlet-api*.jar, analysis/uima/**, tools/**, build/**"/>
<fileset dir="${basedir}/solr" includes="**/test-lib/*.jar,**/lib/*.jar"
excludes="core/test-lib/*servlet-api*.jar, contrib/analysis-extras/**, test-framework/lib/junit*, test-framework/lib/ant*, test-framework/lib/randomizedtesting*, build/**, dist/**, package/**, example/solr-webapp/**" />
<map from="${basedir}/" to=""/>
</pathconvert>
<mkdir dir="nbproject"/>
<copy todir="nbproject" overwrite="true">
<fileset dir="dev-tools/netbeans/nbproject"/>
</copy>
<xslt in="${ant.file}" out="nbproject/project.xml" style="dev-tools/netbeans/nb-project.xsl" force="true">
<outputproperty name="indent" value="yes"/>
<param name="netbeans.fileset.sourcefolders" expression="${netbeans.fileset.sourcefolders}"/>
<param name="netbeans.path.libs" expression="${netbeans.path.libs}"/>
<param name="netbeans.source-level" expression="1.7"/>
</xslt>
</target>
<target name="clean-netbeans" description="Removes all Netbeans configuration files">
<delete dir="nbproject" failonerror="true"/>
<delete dir="nb-build" failonerror="true"/>
</target>
<target name="eclipse" depends="resolve" description="Setup Eclipse configuration">
<basename file="${basedir}" property="eclipseprojectname"/>
<copy file="dev-tools/eclipse/dot.project" tofile=".project" overwrite="false" encoding="UTF-8">

View File

@ -45,6 +45,9 @@
<buildFile url="file://$PROJECT_DIR$/solr/contrib/dataimporthandler/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/extraction/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/langid/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/morphlines-cell/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/morphlines-core/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/map-reduce/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/uima/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/velocity/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/solrj/build.xml" />

View File

@ -0,0 +1,10 @@
<component name="libraryTable">
<library name="Solr morphlines cell library">
<CLASSES>
<root url="file://$PROJECT_DIR$/solr/contrib/morphlines-cell/lib" />
</CLASSES>
<JAVADOC />
<SOURCES />
<jarDirectory url="file://$PROJECT_DIR$/solr/contrib/morphlines-cell/lib" recursive="false" />
</library>
</component>

View File

@ -0,0 +1,10 @@
<component name="libraryTable">
<library name="Solr morphlines core library">
<CLASSES>
<root url="file://$PROJECT_DIR$/solr/contrib/morphlines-core/lib" />
</CLASSES>
<JAVADOC />
<SOURCES />
<jarDirectory url="file://$PROJECT_DIR$/solr/contrib/morphlines-core/lib" recursive="false" />
</library>
</component>

View File

@ -0,0 +1,10 @@
<component name="libraryTable">
<library name="Solr morphlines core test library">
<CLASSES>
<root url="file://$PROJECT_DIR$/solr/contrib/morphlines-core/test-lib" />
</CLASSES>
<JAVADOC />
<SOURCES />
<jarDirectory url="file://$PROJECT_DIR$/solr/contrib/morphlines-core/test-lib" recursive="false" />
</library>
</component>

View File

@ -49,6 +49,9 @@
<module filepath="$PROJECT_DIR$/solr/contrib/dataimporthandler/dataimporthandler.iml" />
<module filepath="$PROJECT_DIR$/solr/contrib/extraction/extraction.iml" />
<module filepath="$PROJECT_DIR$/solr/contrib/langid/langid.iml" />
<module filepath="$PROJECT_DIR$/solr/contrib/morphlines-cell/morphlines-cell.iml" />
<module filepath="$PROJECT_DIR$/solr/contrib/morphlines-core/morphlines-core.iml" />
<module filepath="$PROJECT_DIR$/solr/contrib/map-reduce/map-reduce.iml" />
<module filepath="$PROJECT_DIR$/solr/contrib/uima/uima.iml" />
<module filepath="$PROJECT_DIR$/solr/contrib/velocity/velocity.iml" />
<module filepath="$PROJECT_DIR$/solr/solrj/src/java/solrj.iml" />

View File

@ -235,6 +235,27 @@
<option name="VM_PARAMETERS" value="-ea -DtempDir=temp -Djetty.testMode=1 -Djetty.insecurerandom=1 -Dsolr.directoryFactory=org.apache.solr.core.MockDirectoryFactory" />
<option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
</configuration>
<configuration default="false" name="Solr morphlines-cell contrib" type="JUnit" factoryName="JUnit">
<module name="morphlines-cell" />
<option name="TEST_OBJECT" value="package" />
<option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$/idea-build/solr/contrib/morphlines-cell" />
<option name="VM_PARAMETERS" value="-ea -DtempDir=temp -Djetty.testMode=1 -Djetty.insecurerandom=1 -Dsolr.directoryFactory=org.apache.solr.core.MockDirectoryFactory" />
<option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
</configuration>
<configuration default="false" name="Solr morphlines-core contrib" type="JUnit" factoryName="JUnit">
<module name="morphlines-core" />
<option name="TEST_OBJECT" value="package" />
<option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$/idea-build/solr/contrib/morphlines-core" />
<option name="VM_PARAMETERS" value="-ea -DtempDir=temp -Djetty.testMode=1 -Djetty.insecurerandom=1 -Dsolr.directoryFactory=org.apache.solr.core.MockDirectoryFactory" />
<option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
</configuration>
<configuration default="false" name="Solr mr (map-reduce) contrib" type="JUnit" factoryName="JUnit">
<module name="map-reduce" />
<option name="TEST_OBJECT" value="package" />
<option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$/idea-build/solr/contrib/map-reduce" />
<option name="VM_PARAMETERS" value="-ea -DtempDir=temp -Djetty.testMode=1 -Djetty.insecurerandom=1 -Dsolr.directoryFactory=org.apache.solr.core.MockDirectoryFactory" />
<option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
</configuration>
<configuration default="false" name="Solr uima contrib" type="JUnit" factoryName="JUnit">
<module name="uima" />
<option name="TEST_OBJECT" value="package" />
@ -249,7 +270,7 @@
<option name="VM_PARAMETERS" value="-ea" />
<option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
</configuration>
<list size="35">
<list size="38">
<item index="0" class="java.lang.String" itemvalue="JUnit.Lucene core" />
<item index="1" class="java.lang.String" itemvalue="JUnit.Module analyzers-common" />
<item index="2" class="java.lang.String" itemvalue="JUnit.Module analyzers-icu" />
@ -281,10 +302,13 @@
<item index="28" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler contrib" />
<item index="29" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler-extras contrib" />
<item index="30" class="java.lang.String" itemvalue="JUnit.Solr extraction contrib" />
<item index="31" class="java.lang.String" itemvalue="JUnit.Solr langid contrib" />
<item index="32" class="java.lang.String" itemvalue="JUnit.Solr uima contrib" />
<item index="33" class="java.lang.String" itemvalue="JUnit.Solr velocity contrib" />
<item index="34" class="java.lang.String" itemvalue="JUnit.Solrj" />
<item index="31" class="java.lang.String" itemvalue="JUnit.Solr morphlines-cell contrib" />
<item index="32" class="java.lang.String" itemvalue="JUnit.Solr morphlines-core contrib" />
<item index="33" class="java.lang.String" itemvalue="JUnit.Solr mr (map-reduce) contrib" />
<item index="34" class="java.lang.String" itemvalue="JUnit.Solr langid contrib" />
<item index="35" class="java.lang.String" itemvalue="JUnit.Solr uima contrib" />
<item index="36" class="java.lang.String" itemvalue="JUnit.Solr velocity contrib" />
<item index="37" class="java.lang.String" itemvalue="JUnit.Solrj" />
</list>
</component>
</project>

View File

@ -33,5 +33,6 @@
<orderEntry type="module" module-name="analysis-common" />
<orderEntry type="module" module-name="lucene-core" />
<orderEntry type="module" module-name="queryparser" />
<orderEntry type="module" module-name="queries" />
</component>
</module>

View File

@ -26,5 +26,7 @@
<orderEntry type="module" module-name="lucene-core" />
<orderEntry type="module" module-name="facet" />
<orderEntry type="module" module-name="queryparser" />
<orderEntry type="module" module-name="queries" />
<orderEntry type="module" module-name="expressions" />
</component>
</module>

View File

@ -15,6 +15,7 @@
<orderEntry type="library" scope="TEST" name="JUnit" level="project" />
<orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
<orderEntry type="module" module-name="analysis-common" />
<orderEntry type="module" module-name="queries" />
<orderEntry type="module" module-name="lucene-core" />
</component>
</module>

View File

@ -0,0 +1,40 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/map-reduce/classes/java" />
<output-test url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/map-reduce/classes/test" />
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
<sourceFolder url="file://$MODULE_DIR$/src/test-files" isTestSource="true" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" scope="TEST" name="JUnit" level="project" />
<orderEntry type="library" name="Solr core library" level="project" />
<orderEntry type="library" name="Solrj library" level="project" />
<orderEntry type="library" name="Solr extraction library" level="project" />
<orderEntry type="library" name="Solr morphlines core library" level="project" />
<orderEntry type="library" name="Solr morphlines cell library" level="project" />
<orderEntry type="library" scope="TEST" name="Solr morphlines core test library" level="project" />
<orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
<orderEntry type="module" scope="TEST" module-name="solr-test-framework" />
<orderEntry type="module" module-name="solr-core" />
<orderEntry type="module" module-name="solrj" />
<orderEntry type="module" module-name="misc" />
<orderEntry type="module" module-name="extraction" />
<orderEntry type="module" module-name="lucene-core" />
<orderEntry type="module" module-name="morphlines-core" />
<orderEntry type="module-library">
<library>
<CLASSES>
<root url="file://$MODULE_DIR$/lib" />
</CLASSES>
<JAVADOC />
<SOURCES />
<jarDirectory url="file://$MODULE_DIR$/lib" recursive="false" />
</library>
</orderEntry>
</component>
</module>

View File

@ -0,0 +1,28 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/morphlines-cell/classes/java" />
<output-test url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/morphlines-cell/classes/test" />
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
<sourceFolder url="file://$MODULE_DIR$/src/test-files" isTestSource="true" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" scope="TEST" name="JUnit" level="project" />
<orderEntry type="library" name="Solr core library" level="project" />
<orderEntry type="library" name="Solrj library" level="project" />
<orderEntry type="library" name="Solr extraction library" level="project" />
<orderEntry type="library" name="Solr morphlines core library" level="project" />
<orderEntry type="library" name="Solr morphlines cell library" level="project" />
<orderEntry type="library" scope="TEST" name="Solr morphlines core test library" level="project" />
<orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
<orderEntry type="module" scope="TEST" module-name="solr-test-framework" />
<orderEntry type="module" module-name="solr-core" />
<orderEntry type="module" module-name="solrj" />
<orderEntry type="module" module-name="extraction" />
<orderEntry type="module" module-name="morphlines-core" />
</component>
</module>

View File

@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/morphlines-core/classes/java" />
<output-test url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/morphlines-core/classes/test" />
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
<sourceFolder url="file://$MODULE_DIR$/src/test-files" isTestSource="true" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" scope="TEST" name="JUnit" level="project" />
<orderEntry type="library" name="Solr core library" level="project" />
<orderEntry type="library" name="Solrj library" level="project" />
<orderEntry type="library" name="Solr extraction library" level="project" />
<orderEntry type="library" name="Solr morphlines core library" level="project" />
<orderEntry type="library" scope="TEST" name="Solr morphlines core test library" level="project" />
<orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
<orderEntry type="module" scope="TEST" module-name="solr-test-framework" />
<orderEntry type="module" module-name="solr-core" />
<orderEntry type="module" module-name="solrj" />
<orderEntry type="module" module-name="lucene-core" />
<orderEntry type="module" module-name="analysis-common" />
</component>
</module>

View File

@ -159,7 +159,7 @@
<plugin>
<groupId>de.thetaphi</groupId>
<artifactId>forbiddenapis</artifactId>
<version>1.3</version>
<version>1.4</version>
<configuration>
<!--
This is the default setting, we don't support too new Java versions.

View File

@ -58,10 +58,10 @@
<artifactId>solr-test-framework</artifactId>
<scope>test</scope>
</dependency>
@solr-extraction.internal.dependencies@
@solr-extraction.external.dependencies@
@solr-extraction.internal.test.dependencies@
@solr-extraction.external.test.dependencies@
@solr-cell.internal.dependencies@
@solr-cell.external.dependencies@
@solr-cell.internal.test.dependencies@
@solr-cell.external.test.dependencies@
</dependencies>
<build>
<sourceDirectory>${module-path}/src/java</sourceDirectory>

View File

@ -0,0 +1,97 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.solr</groupId>
<artifactId>solr-parent</artifactId>
<version>@version@</version>
<relativePath>../../pom.xml</relativePath>
</parent>
<groupId>org.apache.solr</groupId>
<artifactId>solr-map-reduce</artifactId>
<packaging>jar</packaging>
<name>Apache Solr map-reduce index construction</name>
<description>Apache Solr - map-reduce index construction</description>
<properties>
<module-directory>solr/contrib/map-reduce</module-directory>
<relative-top-level>../../../..</relative-top-level>
<module-path>${relative-top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>
<!-- lucene-test-framework dependency must be declared before lucene-core -->
<!-- This dependency cannot be put into solr-parent, because local -->
<!-- dependencies are always ordered before inherited dependencies. -->
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-test-framework</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-test-framework</artifactId>
<scope>test</scope>
</dependency>
@solr-map-reduce.internal.dependencies@
@solr-map-reduce.external.dependencies@
@solr-map-reduce.internal.test.dependencies@
@solr-map-reduce.external.test.dependencies@
</dependencies>
<build>
<sourceDirectory>${module-path}/src/java</sourceDirectory>
<testSourceDirectory>${module-path}/src/test</testSourceDirectory>
<testResources>
<testResource>
<directory>${module-path}/src/test-files</directory>
</testResource>
<testResource>
<directory>${top-level}/dev-tools/maven/solr</directory>
<includes>
<include>maven.testlogging.properties</include>
</includes>
</testResource>
</testResources>
<plugins>
<plugin>
<groupId>de.thetaphi</groupId>
<artifactId>forbiddenapis</artifactId>
<executions>
<execution>
<id>test-check-forbidden-servlet-api</id>
<configuration>
<signaturesFiles>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/servlet-api.txt</signaturesFile>
</signaturesFiles>
</configuration>
<goals>
<goal>testCheck</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,104 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.solr</groupId>
<artifactId>solr-parent</artifactId>
<version>@version@</version>
<relativePath>../../pom.xml</relativePath>
</parent>
<groupId>org.apache.solr</groupId>
<artifactId>solr-morphlines-cell</artifactId>
<packaging>jar</packaging>
<name>Apache Solr Cell Morphlines</name>
<description>Apache Solr - Cell Morphlines</description>
<properties>
<module-directory>solr/contrib/morphlines-cell</module-directory>
<relative-top-level>../../../..</relative-top-level>
<module-path>${relative-top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>
<!-- lucene-test-framework dependency must be declared before lucene-core -->
<!-- This dependency cannot be put into solr-parent, because local -->
<!-- dependencies are always ordered before inherited dependencies. -->
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-test-framework</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-test-framework</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-morphlines-core</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
@solr-morphlines-cell.internal.dependencies@
@solr-morphlines-cell.external.dependencies@
@solr-morphlines-cell.internal.test.dependencies@
@solr-morphlines-cell.external.test.dependencies@
</dependencies>
<build>
<sourceDirectory>${module-path}/src/java</sourceDirectory>
<testSourceDirectory>${module-path}/src/test</testSourceDirectory>
<testResources>
<testResource>
<directory>${module-path}/src/test-files</directory>
</testResource>
<testResource>
<directory>${top-level}/dev-tools/maven/solr</directory>
<includes>
<include>maven.testlogging.properties</include>
</includes>
</testResource>
</testResources>
<plugins>
<plugin>
<groupId>de.thetaphi</groupId>
<artifactId>forbiddenapis</artifactId>
<executions>
<execution>
<id>test-check-forbidden-servlet-api</id>
<configuration>
<signaturesFiles>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/servlet-api.txt</signaturesFile>
</signaturesFiles>
</configuration>
<goals>
<goal>testCheck</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,108 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.solr</groupId>
<artifactId>solr-parent</artifactId>
<version>@version@</version>
<relativePath>../../pom.xml</relativePath>
</parent>
<groupId>org.apache.solr</groupId>
<artifactId>solr-morphlines-core</artifactId>
<packaging>jar</packaging>
<name>Apache Solr Morphlines Core</name>
<description>Apache Solr - Morphlines Core</description>
<properties>
<module-directory>solr/contrib/morphlines-core</module-directory>
<relative-top-level>../../../..</relative-top-level>
<module-path>${relative-top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>
<!-- lucene-test-framework dependency must be declared before lucene-core -->
<!-- This dependency cannot be put into solr-parent, because local -->
<!-- dependencies are always ordered before inherited dependencies. -->
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-test-framework</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-test-framework</artifactId>
<scope>test</scope>
</dependency>
@solr-morphlines-core.internal.dependencies@
@solr-morphlines-core.external.dependencies@
@solr-morphlines-core.internal.test.dependencies@
@solr-morphlines-core.external.test.dependencies@
</dependencies>
<build>
<sourceDirectory>${module-path}/src/java</sourceDirectory>
<testSourceDirectory>${module-path}/src/test</testSourceDirectory>
<testResources>
<testResource>
<directory>${module-path}/src/test-files</directory>
</testResource>
<testResource>
<directory>${top-level}/dev-tools/maven/solr</directory>
<includes>
<include>maven.testlogging.properties</include>
</includes>
</testResource>
</testResources>
<plugins>
<plugin>
<groupId>de.thetaphi</groupId>
<artifactId>forbiddenapis</artifactId>
<executions>
<execution>
<id>test-check-forbidden-servlet-api</id>
<configuration>
<signaturesFiles>
<signaturesFile>${top-level}/lucene/tools/forbiddenApis/servlet-api.txt</signaturesFile>
</signaturesFiles>
</configuration>
<goals>
<goal>testCheck</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>test-jar</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@ -37,6 +37,9 @@
<module>dataimporthandler-extras</module>
<module>extraction</module>
<module>langid</module>
<module>morphlines-cell</module>
<module>morphlines-core</module>
<module>map-reduce</module>
<module>uima</module>
<module>velocity</module>
</modules>

View File

@ -81,6 +81,11 @@
<name>Public online Restlet repository</name>
<url>http://maven.restlet.org</url>
</repository>
<repository>
<id>releases.cloudera.com</id>
<name>Cloudera Releases</name>
<url>https://repository.cloudera.com/artifactory/libs-release</url>
</repository>
</repositories>
<build>
<pluginManagement>

View File

@ -0,0 +1,165 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:str="http://exslt.org/strings"
xmlns:common="http://exslt.org/common"
extension-element-prefixes="str common">
<xsl:param name="netbeans.fileset.sourcefolders"/>
<xsl:param name="netbeans.path.libs"/>
<xsl:param name="netbeans.source-level"/>
<xsl:variable name="netbeans.fileset.sourcefolders.sortedfrag">
<xsl:for-each select="str:split($netbeans.fileset.sourcefolders,'|')">
<!-- hack to sort **/src/java before **/src/test before **/src/resources : contains() returns "true" which sorts before "false" if descending: -->
<xsl:sort select="string(contains(text(), '/src/java'))" order="descending" lang="en"/>
<xsl:sort select="string(contains(text(), '/src/test'))" order="descending" lang="en"/>
<xsl:sort select="string(contains(text(), '/src/resources'))" order="descending" lang="en"/>
<!-- hack to sort the list, starts-with() returns "true" which sorts before "false" if descending: -->
<xsl:sort select="string(starts-with(text(), 'lucene/core/'))" order="descending" lang="en"/>
<xsl:sort select="string(starts-with(text(), 'lucene/test-framework/'))" order="descending" lang="en"/>
<xsl:sort select="string(starts-with(text(), 'lucene/'))" order="descending" lang="en"/>
<xsl:sort select="string(starts-with(text(), 'solr/core/'))" order="descending" lang="en"/>
<xsl:sort select="string(starts-with(text(), 'solr/solrj/'))" order="descending" lang="en"/>
<xsl:sort select="string(starts-with(text(), 'solr/test-framework/'))" order="descending" lang="en"/>
<xsl:sort select="string(starts-with(text(), 'solr/'))" order="descending" lang="en"/>
<!-- all others in one group above are sorted by path name: -->
<xsl:sort select="text()" order="ascending" lang="en"/>
<xsl:copy-of select="."/>
</xsl:for-each>
</xsl:variable>
<xsl:variable name="netbeans.fileset.sourcefolders.sorted" select="common:node-set($netbeans.fileset.sourcefolders.sortedfrag)/*"/>
<xsl:variable name="netbeans.full.classpath.frag">
<classpath mode="compile" xmlns="http://www.netbeans.org/ns/freeform-project-java/3">
<xsl:value-of select="$netbeans.path.libs"/>
<xsl:for-each select="$netbeans.fileset.sourcefolders.sorted[contains(text(), '/src/java')]">
<xsl:text>:</xsl:text>
<xsl:value-of select="."/>
</xsl:for-each>
</classpath>
</xsl:variable>
<!--
NOTE: This template matches the root element of any given input XML document!
The XSL input file is ignored completely.
-->
<xsl:template match="/">
<project xmlns="http://www.netbeans.org/ns/project/1">
<type>org.netbeans.modules.ant.freeform</type>
<configuration>
<general-data xmlns="http://www.netbeans.org/ns/freeform-project/1">
<name>lucene</name>
<properties/>
<folders>
<xsl:for-each select="$netbeans.fileset.sourcefolders.sorted">
<source-folder>
<label>
<xsl:value-of select="."/>
</label>
<xsl:if test="contains(text(), '/src/java') or contains(text(), '/src/test')">
<type>java</type>
</xsl:if>
<location>
<xsl:value-of select="."/>
</location>
</source-folder>
</xsl:for-each>
</folders>
<ide-actions>
<action name="build">
<target>compile</target>
</action>
<action name="clean">
<target>clean</target>
</action>
<action name="javadoc">
<target>documentation</target>
</action>
<action name="test">
<target>test</target>
</action>
<action name="rebuild">
<target>clean</target>
<target>compile</target>
</action>
</ide-actions>
<view>
<items>
<xsl:for-each select="$netbeans.fileset.sourcefolders.sorted">
<source-folder>
<xsl:attribute name="style">
<xsl:choose>
<xsl:when test="contains(text(), '/src/java') or contains(text(), '/src/test')">packages</xsl:when>
<xsl:otherwise>tree</xsl:otherwise>
</xsl:choose>
</xsl:attribute>
<label>
<xsl:value-of select="."/>
</label>
<location>
<xsl:value-of select="."/>
</location>
</source-folder>
</xsl:for-each>
<source-file>
<label>Project Build Script</label>
<location>build.xml</location>
</source-file>
</items>
<context-menu>
<ide-action name="build"/>
<ide-action name="rebuild"/>
<ide-action name="clean"/>
<ide-action name="javadoc"/>
<ide-action name="test"/>
</context-menu>
</view>
<subprojects/>
</general-data>
<java-data xmlns="http://www.netbeans.org/ns/freeform-project-java/3">
<compilation-unit>
<xsl:for-each select="$netbeans.fileset.sourcefolders.sorted[contains(text(), '/src/java')]">
<package-root>
<xsl:value-of select="."/>
</package-root>
</xsl:for-each>
<xsl:copy-of select="$netbeans.full.classpath.frag"/>
<built-to>nb-build/classes</built-to>
<source-level>
<xsl:value-of select="$netbeans.source-level"/>
</source-level>
</compilation-unit>
<compilation-unit>
<xsl:for-each select="$netbeans.fileset.sourcefolders.sorted[contains(text(), '/src/test')]">
<package-root>
<xsl:value-of select="."/>
</package-root>
</xsl:for-each>
<unit-tests/>
<xsl:copy-of select="$netbeans.full.classpath.frag"/>
<built-to>nb-build/test-classes</built-to>
<source-level>
<xsl:value-of select="$netbeans.source-level"/>
</source-level>
</compilation-unit>
</java-data>
</configuration>
</project>
</xsl:template>
</xsl:stylesheet>

View File

@ -0,0 +1,9 @@
auxiliary.org-netbeans-modules-editor-indent.CodeStyle.project.expand-tabs=true
auxiliary.org-netbeans-modules-editor-indent.CodeStyle.project.indent-shift-width=2
auxiliary.org-netbeans-modules-editor-indent.CodeStyle.project.spaces-per-tab=2
auxiliary.org-netbeans-modules-editor-indent.CodeStyle.project.tab-size=2
auxiliary.org-netbeans-modules-editor-indent.CodeStyle.project.text-limit-width=80
auxiliary.org-netbeans-modules-editor-indent.CodeStyle.project.text-line-wrap=none
auxiliary.org-netbeans-modules-editor-indent.CodeStyle.usedProfile=project
auxiliary.org-netbeans-modules-editor-indent.text.x-java.CodeStyle.project.continuationIndentSize=4
auxiliary.org-netbeans-modules-editor-indent.text.x-java.CodeStyle.project.spaceAfterTypeCast=false

View File

@ -25,6 +25,27 @@
<import file="lucene/common-build.xml"/>
<target name="-run-test">
<mkdir dir="lucene/build" />
<tempfile property="tests.totals.tmpfile"
destdir="lucene/build"
prefix=".test-totals-"
suffix=".tmp"
deleteonexit="true"
createfile="true" />
<subant target="test" inheritall="false" failonerror="true">
<fileset dir="lucene" includes="build.xml" />
<fileset dir="solr" includes="build.xml" />
<propertyset>
<propertyref name="tests.totals.tmpfile" />
</propertyset>
</subant>
<property name="tests.totals.toplevel" value="true" />
<antcall target="-check-totals" />
</target>
<!--
Run after Junit tests.
@ -70,11 +91,13 @@
<target name="-check-after-regeneration" depends="ivy-availability-check,ivy-fail,ivy-configure,resolve-groovy">
<svn-checker failonmodifications="true"/>
</target>
<property name="svnkit.version" value="1.7.8"/>
<macrodef xmlns:ivy="antlib:org.apache.ivy.ant" name="svn-checker">
<attribute name="failonmodifications" default="true"/> <!-- false if file modifications are allowed -->
<sequential>
<ivy:cachepath organisation="org.tmatesoft.svnkit" module="svnkit" revision="1.7.8"
<ivy:cachepath organisation="org.tmatesoft.svnkit" module="svnkit" revision="${svnkit.version}"
inline="true" conf="default" transitive="true" pathid="svnkit.classpath"/>
<local name="svn.checkprops.failed"/>
<local name="svn.unversioned.failed"/>

View File

@ -68,6 +68,14 @@ New Features
* LUCENE-5336: Add SimpleQueryParser: parser for human-entered queries.
(Jack Conradson via Robert Muir)
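A minimal usage sketch, assuming the Lucene 4.x API in
org.apache.lucene.queryparser.simple (the field name, analyzer, and query
string here are illustrative, not part of this commit):

  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  import org.apache.lucene.queryparser.simple.SimpleQueryParser;
  import org.apache.lucene.search.Query;
  import org.apache.lucene.util.Version;

  public class SimpleQueryParserDemo {
    public static void main(String[] args) {
      Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
      // Parse a human-entered query against the illustrative "body" field;
      // unlike the classic QueryParser, malformed syntax degrades gracefully
      // instead of throwing a ParseException.
      SimpleQueryParser parser = new SimpleQueryParser(analyzer, "body");
      Query query = parser.parse("lucene +fast -slow \"exact phrase\"");
      System.out.println(query);
    }
  }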
* LUCENE-5329: suggest: DocumentDictionary and
DocumentExpressionDictionary are now lenient for dirty documents
(missing the term, weight or payload). (Areek Zillur via
Mike McCandless)
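A hedged sketch of what the leniency enables, assuming the 4.x suggest API
(DocumentDictionary and Lookup.build(Dictionary)); the field names
"suggest_text" and "suggest_weight" are hypothetical:

  import java.io.IOException;

  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.index.DirectoryReader;
  import org.apache.lucene.index.IndexReader;
  import org.apache.lucene.search.suggest.DocumentDictionary;
  import org.apache.lucene.search.suggest.Lookup;
  import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;
  import org.apache.lucene.store.Directory;

  public class LenientDictionaryDemo {
    /** Builds a suggester from indexed documents; documents missing the
     *  term or weight field are now skipped instead of failing the build. */
    static Lookup buildSuggester(Directory dir, Analyzer analyzer) throws IOException {
      IndexReader reader = DirectoryReader.open(dir);
      try {
        DocumentDictionary dict =
            new DocumentDictionary(reader, "suggest_text", "suggest_weight");
        AnalyzingSuggester suggester = new AnalyzingSuggester(analyzer);
        suggester.build(dict);
        return suggester;
      } finally {
        reader.close();
      }
    }
  }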
* SOLR-1871: The RangeMapFloatFunction accepts an arbitrary ValueSource
as target and default values. (Chris Harris, shalin)
* LUCENE-5371: Speed up Lucene range faceting from O(N) per hit to
O(log(N)) per hit using segment trees; this only really starts to
matter in practice if the number of ranges is over 10 or so. (Mike
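The committed code uses a segment tree; the sketch below, which is not
Lucene's implementation, gets the same O(log(N))-per-hit behavior with a
plain binary search over the elementary intervals induced by the range
endpoints:

  import java.util.ArrayList;
  import java.util.Arrays;
  import java.util.List;
  import java.util.SortedSet;
  import java.util.TreeSet;

  /** Counts, per hit value, which of N inclusive long ranges contain it. */
  class RangeCounter {
    private final long[] boundaries;      // sorted starts of elementary intervals
    private final int[][] rangesPerSlot;  // ids of ranges covering each interval
    private final int[] counts;

    /** Range i is the inclusive interval [mins[i], maxs[i]]. */
    RangeCounter(long[] mins, long[] maxs) {
      SortedSet<Long> cuts = new TreeSet<Long>();
      for (int i = 0; i < mins.length; i++) {
        cuts.add(mins[i]);
        cuts.add(maxs[i] + 1);  // first value past a range starts a new interval
      }
      boundaries = new long[cuts.size()];
      int b = 0;
      for (long cut : cuts) {
        boundaries[b++] = cut;
      }
      rangesPerSlot = new int[boundaries.length][];
      for (int s = 0; s < boundaries.length; s++) {
        List<Integer> ids = new ArrayList<Integer>();
        for (int i = 0; i < mins.length; i++) {
          // An elementary interval never straddles a range endpoint, so
          // testing its start point decides coverage for the whole interval.
          if (mins[i] <= boundaries[s] && boundaries[s] <= maxs[i]) {
            ids.add(i);
          }
        }
        rangesPerSlot[s] = new int[ids.size()];
        for (int j = 0; j < ids.size(); j++) {
          rangesPerSlot[s][j] = ids.get(j);
        }
      }
      counts = new int[mins.length];
    }

    /** One O(log(N)) binary search per hit, then one bump per matching range. */
    void collect(long value) {
      int slot = Arrays.binarySearch(boundaries, value);
      if (slot < 0) {
        slot = -slot - 2;  // index of the interval whose start is <= value
      }
      if (slot < 0) {
        return;            // value lies below every range
      }
      for (int id : rangesPerSlot[slot]) {
        counts[id]++;
      }
    }

    int count(int rangeId) {
      return counts[rangeId];
    }
  }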
@ -83,6 +91,30 @@ Build
* LUCENE-5322: Clean up / simplify Maven-related Ant targets.
(Steve Rowe)
* LUCENE-5347: Upgrade forbidden-apis checker to version 1.4.
(Uwe Schindler)
* LUCENE-4381: Upgrade analysis/icu to 52.1. (Robert Muir)
* LUCENE-5357: Upgrade StandardTokenizer and UAX29URLEmailTokenizer to
Unicode 6.3; update UAX29URLEmailTokenizer's recognized top level
domains in URLs and Emails from the IANA Root Zone Database.
(Steve Rowe)
* LUCENE-5360: Add support for developing in Netbeans IDE.
(Michal Hlavac, Uwe Schindler, Steve Rowe)
Bug fixes
* LUCENE-5285: Improved highlighting of multi-valued fields with
FastVectorHighlighter. (Nik Everett via Adrien Grand)
Changes in Runtime Behavior
* LUCENE-5362: IndexReader and SegmentCoreReaders now throw
AlreadyClosedException if the refCount is incremented but
is less than 1. (Simon Willnauer)
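A minimal sketch of the reference-counting contract this guards (standard
IndexReader incRef/decRef; the failing call at the end shows the new
behavior):

  import java.io.IOException;

  import org.apache.lucene.index.DirectoryReader;
  import org.apache.lucene.store.AlreadyClosedException;

  public class RefCountDemo {
    /** Shares a reader via incRef/decRef; once the refCount falls below 1,
     *  a further incRef now throws AlreadyClosedException instead of
     *  silently resurrecting a closed reader. */
    static void demo(DirectoryReader reader) throws IOException {
      reader.incRef();      // refCount 1 -> 2: safe to hand to another thread
      try {
        // ... run searches against the shared reader ...
      } finally {
        reader.decRef();    // refCount 2 -> 1
      }
      reader.close();       // refCount 1 -> 0: resources are released
      try {
        reader.incRef();    // refCount is 0: throws AlreadyClosedException
      } catch (AlreadyClosedException expected) {
        // the reader cannot be revived once fully released
      }
    }
  }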
======================= Lucene 4.6.0 =======================
New Features
@ -176,39 +208,9 @@ New Features
Bug Fixes
* LUCENE-4998: Fixed a few places to pass IOContext.READONCE instead
of IOContext.READ (Shikhar Bhushan via Mike McCandless)
* LUCENE-5242: DirectoryTaxonomyWriter.replaceTaxonomy did not fully reset
its state, which could result in exceptions being thrown, as well as
incorrect ordinals returned from getParent. (Shai Erera)
* LUCENE-5254: Fixed bounded memory leak, where objects like live
docs bitset were not freed from a starting reader after reopening
to a new reader and closing the original one. (Shai Erera, Mike
McCandless)
* LUCENE-5262: Fixed file handle leaks when multiple attempts to open an
NRT reader hit exceptions. (Shai Erera)
* LUCENE-5263: Transient IOExceptions, e.g. due to disk full or file
descriptor exhaustion, hit at unlucky times inside IndexWriter could
lead to silently losing deletions. (Shai Erera, Mike McCandless)
* LUCENE-5264: CommonTermsQuery ignored minMustMatch if only high-frequent
terms were present in the query and the high-frequent operator was set
to SHOULD. (Simon Willnauer)
* LUCENE-5269: Fix bug in NGramTokenFilter where it would sometimes count
unicode characters incorrectly. (Mike McCandless, Robert Muir)
* LUCENE-5272: OpenBitSet.ensureCapacity did not modify numBits, causing
false assertion errors in fastSet. (Shai Erera)
* LUCENE-5289: IndexWriter.hasUncommittedChanges was returning false
when there were buffered delete-by-Term. (Shalin Shekhar Mangar,
Mike McCandless)
* LUCENE-5303: OrdinalsCache did not use coreCacheKey, resulting in
over caching across multiple threads. (Mike McCandless, Shai Erera)
@ -221,7 +223,11 @@ Bug Fixes
deleted at a later point in time. This could cause short-term disk
pollution or OOM if in-memory directories are used. (Simon Willnauer)
API Changes:
* LUCENE-5342: Fixed bulk-merge issue in CompressingStoredFieldsFormat which
created corrupted segments when mixing chunk sizes.
Lucene41StoredFieldsFormat is not impacted. (Adrien Grand, Robert Muir)
API Changes
* LUCENE-5222: Add SortField.needsScores(). Previously it was not possible
for a custom Sort that makes use of the relevance score to work correctly
@ -314,6 +320,40 @@ Tests
is either a "word" character or not), but now it gives a general longest-match
behavior. (Nik Everett via Robert Muir)
======================= Lucene 4.5.1 =======================
Bug Fixes
* LUCENE-4998: Fixed a few places to pass IOContext.READONCE instead
of IOContext.READ (Shikhar Bhushan via Mike McCandless)
* LUCENE-5242: DirectoryTaxonomyWriter.replaceTaxonomy did not fully reset
its state, which could result in exceptions being thrown, as well as
incorrect ordinals returned from getParent. (Shai Erera)
* LUCENE-5254: Fixed bounded memory leak, where objects like live
docs bitset were not freed from a starting reader after reopening
to a new reader and closing the original one. (Shai Erera, Mike
McCandless)
* LUCENE-5262: Fixed file handle leaks when multiple attempts to open an
NRT reader hit exceptions. (Shai Erera)
* LUCENE-5263: Transient IOExceptions, e.g. due to disk full or file
descriptor exhaustion, hit at unlucky times inside IndexWriter could
lead to silently losing deletions. (Shai Erera, Mike McCandless)
* LUCENE-5264: CommonTermsQuery ignored minMustMatch if only high-frequent
terms were present in the query and the high-frequent operator was set
to SHOULD. (Simon Willnauer)
* LUCENE-5269: Fix bug in NGramTokenFilter where it would sometimes count
unicode characters incorrectly. (Mike McCandless, Robert Muir)
* LUCENE-5289: IndexWriter.hasUncommittedChanges was returning false
when there were buffered delete-by-Term. (Shalin Shekhar Mangar,
Mike McCandless)
======================= Lucene 4.5.0 =======================
New Features

View File

@ -45,17 +45,13 @@
<taskdef classname="jflex.anttask.JFlexTask" name="jflex">
<classpath refid="jflex.classpath"/>
</taskdef>
<!-- this logic below looks duplicated with run-jflex, but it's not, the regexp is different! -->
<jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"
outdir="src/java/org/apache/lucene/analysis/charfilter"
nobak="on"/>
<!-- Remove the inappropriate JFlex-generated constructors -->
nobak="on" inputstreamctor="false"/>
<!-- Remove the inappropriate JFlex-generated constructor -->
<replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
match="/\*\*\s*\*\s*Creates a new scanner.*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
replace="" flags="sg"/>
<replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
match="\/\*\s*The following code was generated by JFlex.*"
replace="\/\* The following code was generated by JFlex. \*\/" flags=""/>
match="/\*\*\s*\*\s*Creates a new scanner\s*\*\s*\*\s*@param\s*in\s*the java.io.Reader to read input from\.\s*\*/\s*public HTMLStripCharFilter\(java\.io\.Reader in\)\s*\{\s*this.zzReader = in;\s*\}"
replace="" flags="s"/>
</target>
<target name="generate-jflex-html-char-entities">
@ -96,15 +92,7 @@
<attribute name="dir"/>
<attribute name="name"/>
<sequential>
<jflex file="@{dir}/@{name}.jflex"
outdir="@{dir}"
nobak="on" />
<replaceregexp file="@{dir}/@{name}.java"
match="/\*\*\s*\*\s*Creates a new scanner\..*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
replace="" flags="sg"/>
<replaceregexp file="@{dir}/@{name}.java"
match="\/\*\s*The following code was generated by JFlex.*"
replace="\/\* The following code was generated by JFlex. \*\/" flags=""/>
<jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on" inputstreamctor="false"/>
</sequential>
</macrodef>

View File

@ -73,7 +73,7 @@ CharacterEntities = ( "AElig" | "Aacute" | "Acirc" | "Agrave" | "Alpha"
upperCaseVariantsAccepted.put("amp", "AMP");
}
private static final CharArrayMap<Character> entityValues
= new CharArrayMap<Character>(Version.LUCENE_40, 253, false);
= new CharArrayMap<Character>(Version.LUCENE_CURRENT, 253, false);
static {
String[] entities = {
"AElig", "\u00C6", "Aacute", "\u00C1", "Acirc", "\u00C2",

View File

@ -14,7 +14,7 @@
* limitations under the License.
*/
// Generated using ICU4J 49.1.0.0
// Generated using ICU4J 52.1.0.0
// by org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex. */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT */
package org.apache.lucene.analysis.charfilter;
@ -152,77 +152,77 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
"\21\1\1\41\32\1\5\0\113\1\3\0\3\1\17\0\15\1\1\0"+
"\4\1\3\2\13\0\22\1\3\2\13\0\22\1\2\2\14\0\15\1"+
"\1\0\3\1\1\0\2\2\14\0\64\1\40\2\3\0\1\1\4\0"+
"\1\1\1\2\2\0\12\274\41\0\3\2\1\41\1\0\12\274\6\0"+
"\130\1\10\0\51\1\1\2\1\1\5\0\106\1\12\0\35\1\3\0"+
"\14\2\4\0\14\2\12\0\12\274\36\1\2\0\5\1\13\0\54\1"+
"\4\0\21\2\7\1\2\2\6\0\12\274\1\2\45\0\27\1\5\2"+
"\4\0\65\1\12\2\1\0\35\2\2\0\1\2\12\274\6\0\12\274"+
"\15\0\1\1\130\0\5\2\57\1\21\2\7\1\4\0\12\274\21\0"+
"\11\2\14\0\3\2\36\1\15\2\2\1\12\274\54\1\16\2\14\0"+
"\44\1\24\2\10\0\12\274\3\0\3\1\12\274\44\1\122\0\3\2"+
"\1\0\25\2\4\1\1\2\4\1\3\2\2\1\11\0\300\1\47\2"+
"\25\0\4\2\u0116\1\2\0\6\1\2\0\46\1\2\0\6\1\2\0"+
"\10\1\1\0\1\1\1\0\1\1\1\0\1\1\1\0\37\1\2\0"+
"\65\1\1\0\7\1\1\0\1\1\3\0\3\1\1\0\7\1\3\0"+
"\4\1\2\0\6\1\4\0\15\1\5\0\3\1\1\0\7\1\3\0"+
"\13\41\35\0\2\41\5\0\1\41\17\0\2\2\23\0\1\2\12\0"+
"\1\41\21\0\1\1\15\0\1\1\20\0\15\1\63\0\15\2\4\0"+
"\1\2\3\0\14\2\21\0\1\1\4\0\1\1\2\0\12\1\1\0"+
"\1\1\2\0\6\1\6\0\1\1\1\0\1\1\1\0\1\1\1\0"+
"\20\1\2\0\4\1\5\0\5\1\4\0\1\1\21\0\51\1\u0a77\0"+
"\57\1\1\0\57\1\1\0\205\1\6\0\4\1\3\2\2\1\14\0"+
"\46\1\1\0\1\1\5\0\1\1\2\0\70\1\7\0\1\1\17\0"+
"\1\2\27\1\11\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1"+
"\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\40\2"+
"\u0200\0\1\41\4\0\3\1\31\0\11\1\6\2\1\0\5\1\2\0"+
"\5\1\4\0\126\1\2\0\2\2\5\1\1\0\132\1\1\0\4\1"+
"\5\0\51\1\3\0\136\1\21\0\33\1\65\0\20\1\u0200\0\u19b6\1"+
"\112\0\u51cd\1\63\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1"+
"\12\274\2\1\24\0\57\1\1\2\4\0\12\2\1\0\31\1\7\0"+
"\1\2\120\1\2\2\45\0\11\1\2\0\147\1\2\0\4\1\1\0"+
"\4\1\14\0\13\1\115\0\12\1\1\2\3\1\1\2\4\1\1\2"+
"\27\1\5\2\30\0\64\1\14\0\2\2\62\1\21\2\13\0\12\274"+
"\6\0\22\2\6\1\3\0\1\1\4\0\12\274\34\1\10\2\2\0"+
"\27\1\15\2\14\0\35\1\3\0\4\2\57\1\16\2\16\0\1\1"+
"\12\274\46\0\51\1\16\2\11\0\3\1\1\2\10\1\2\2\2\0"+
"\12\274\6\0\27\1\3\0\1\1\1\2\4\0\60\1\1\2\1\1"+
"\3\2\2\1\2\2\5\1\2\2\1\1\1\2\1\1\30\0\3\1"+
"\2\0\13\1\5\2\2\0\3\1\2\2\12\0\6\1\2\0\6\1"+
"\2\0\6\1\11\0\7\1\1\0\7\1\221\0\43\1\10\2\1\0"+
"\2\2\2\0\12\274\6\0\u2ba4\1\14\0\27\1\4\0\61\1\4\0"+
"\1\170\1\223\1\103\1\165\1\136\1\214\2\0\1\160\1\153\2\0"+
"\1\120\1\210\14\0\1\105\1\127\20\0\1\122\7\0\1\256\1\112"+
"\5\0\1\143\4\0\51\120\1\110\3\120\1\124\1\220\17\0\1\133"+
"\u02c1\0\1\252\277\0\2\123\1\212\3\222\2\211\1\222\1\211\2\222"+
"\1\221\21\222\11\213\1\157\7\213\7\204\1\156\1\204\1\246\2\207"+
"\1\166\1\246\1\207\1\166\10\246\2\167\5\203\2\155\5\203\1\107"+
"\10\202\5\154\3\224\12\251\20\224\3\225\32\227\1\226\2\200\2\234"+
"\1\235\2\234\2\235\2\234\1\235\3\200\1\177\2\200\12\250\1\247"+
"\1\176\1\171\7\176\1\171\13\176\31\200\7\176\12\250\1\176\5\134"+
"\3\245\3\142\1\140\4\142\2\140\10\142\1\140\7\141\1\137\2\141"+
"\7\142\16\245\1\135\4\245\1\106\4\244\1\106\5\255\1\254\1\255"+
"\3\254\7\255\1\254\23\255\5\264\3\255\6\264\2\255\6\253\5\263"+
"\3\262\2\142\7\257\36\142\4\257\5\142\5\245\6\244\2\245\1\244"+
"\4\141\13\253\12\244\26\253\15\134\1\243\2\134\1\152\3\237\1\134"+
"\2\237\5\151\4\237\4\152\1\151\3\152\1\151\5\152\2\147\1\116"+
"\2\147\1\116\1\147\2\116\1\147\1\116\12\147\1\116\4\146\1\115"+
"\1\236\1\240\1\150\3\164\1\240\2\164\1\260\2\261\2\164\1\150"+
"\1\164\1\150\1\164\1\150\1\164\3\150\1\164\2\150\1\164\1\150"+
"\2\164\1\150\1\164\1\150\1\164\1\150\1\164\1\150\1\164\1\150"+
"\1\162\2\145\1\162\1\145\2\162\4\145\1\162\7\145\1\162\4\145"+
"\1\162\4\145\1\164\1\150\1\164\12\216\1\217\21\216\1\217\3\215"+
"\1\217\3\216\1\217\1\216\2\144\2\216\1\217\15\241\4\201\4\206"+
"\1\242\1\161\10\242\7\206\6\164\4\113\1\121\37\113\1\121\4\113"+
"\25\174\1\131\11\174\21\130\5\174\1\104\12\117\5\174\6\205\4\162"+
"\1\163\1\130\5\231\12\232\17\231\1\125\3\114\14\230\1\126\11\173"+
"\1\172\5\173\4\233\13\175\2\132\11\173\1\172\31\173\1\172\4\126"+
"\4\173\2\172\2\265\1\111\5\265\52\111\u1900\0\u016e\1\2\0\152\1"+
"\46\0\7\1\14\0\5\1\5\0\1\1\1\2\12\1\1\0\15\1"+
"\1\0\5\1\1\0\1\1\1\0\2\1\1\0\2\1\1\0\154\1"+
"\41\0\u016b\1\22\0\100\1\2\0\66\1\50\0\14\1\4\0\20\2"+
"\20\0\7\2\14\0\2\2\30\0\3\2\40\0\5\1\1\0\207\1"+
"\23\0\12\274\7\0\32\1\4\0\1\2\1\0\32\1\13\0\131\1"+
"\3\0\6\1\2\0\6\1\2\0\6\1\2\0\3\1\43\0";
"\1\1\1\2\2\0\12\274\41\0\3\2\2\0\12\274\6\0\130\1"+
"\10\0\51\1\1\2\1\1\5\0\106\1\12\0\35\1\3\0\14\2"+
"\4\0\14\2\12\0\12\274\36\1\2\0\5\1\13\0\54\1\4\0"+
"\21\2\7\1\2\2\6\0\12\274\1\2\45\0\27\1\5\2\4\0"+
"\65\1\12\2\1\0\35\2\2\0\1\2\12\274\6\0\12\274\15\0"+
"\1\1\130\0\5\2\57\1\21\2\7\1\4\0\12\274\21\0\11\2"+
"\14\0\3\2\36\1\15\2\2\1\12\274\54\1\16\2\14\0\44\1"+
"\24\2\10\0\12\274\3\0\3\1\12\274\44\1\122\0\3\2\1\0"+
"\25\2\4\1\1\2\4\1\3\2\2\1\11\0\300\1\47\2\25\0"+
"\4\2\u0116\1\2\0\6\1\2\0\46\1\2\0\6\1\2\0\10\1"+
"\1\0\1\1\1\0\1\1\1\0\1\1\1\0\37\1\2\0\65\1"+
"\1\0\7\1\1\0\1\1\3\0\3\1\1\0\7\1\3\0\4\1"+
"\2\0\6\1\4\0\15\1\5\0\3\1\1\0\7\1\3\0\13\41"+
"\35\0\2\41\5\0\1\41\17\0\2\2\23\0\1\2\12\0\1\41"+
"\21\0\1\1\15\0\1\1\20\0\15\1\63\0\15\2\4\0\1\2"+
"\3\0\14\2\21\0\1\1\4\0\1\1\2\0\12\1\1\0\1\1"+
"\2\0\6\1\6\0\1\1\1\0\1\1\1\0\1\1\1\0\20\1"+
"\2\0\4\1\5\0\5\1\4\0\1\1\21\0\51\1\u0a77\0\57\1"+
"\1\0\57\1\1\0\205\1\6\0\4\1\3\2\2\1\14\0\46\1"+
"\1\0\1\1\5\0\1\1\2\0\70\1\7\0\1\1\17\0\1\2"+
"\27\1\11\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0"+
"\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\40\2\u0200\0"+
"\1\41\4\0\3\1\31\0\11\1\6\2\1\0\5\1\2\0\5\1"+
"\4\0\126\1\2\0\2\2\5\1\1\0\132\1\1\0\4\1\5\0"+
"\51\1\3\0\136\1\21\0\33\1\65\0\20\1\u0200\0\u19b6\1\112\0"+
"\u51cd\1\63\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\274"+
"\2\1\24\0\57\1\1\2\4\0\12\2\1\0\31\1\7\0\1\2"+
"\120\1\2\2\45\0\11\1\2\0\147\1\2\0\4\1\1\0\4\1"+
"\14\0\13\1\115\0\12\1\1\2\3\1\1\2\4\1\1\2\27\1"+
"\5\2\30\0\64\1\14\0\2\2\62\1\21\2\13\0\12\274\6\0"+
"\22\2\6\1\3\0\1\1\4\0\12\274\34\1\10\2\2\0\27\1"+
"\15\2\14\0\35\1\3\0\4\2\57\1\16\2\16\0\1\1\12\274"+
"\46\0\51\1\16\2\11\0\3\1\1\2\10\1\2\2\2\0\12\274"+
"\6\0\27\1\3\0\1\1\1\2\4\0\60\1\1\2\1\1\3\2"+
"\2\1\2\2\5\1\2\2\1\1\1\2\1\1\30\0\3\1\2\0"+
"\13\1\5\2\2\0\3\1\2\2\12\0\6\1\2\0\6\1\2\0"+
"\6\1\11\0\7\1\1\0\7\1\221\0\43\1\10\2\1\0\2\2"+
"\2\0\12\274\6\0\u2ba4\1\14\0\27\1\4\0\61\1\4\0\1\170"+
"\1\223\1\103\1\165\1\136\1\214\2\0\1\160\1\153\2\0\1\120"+
"\1\210\14\0\1\105\1\127\20\0\1\122\7\0\1\256\1\112\5\0"+
"\1\143\4\0\51\120\1\110\3\120\1\124\1\220\17\0\1\133\u02c1\0"+
"\1\252\277\0\2\123\1\212\3\222\2\211\1\222\1\211\2\222\1\221"+
"\21\222\11\213\1\157\7\213\7\204\1\156\1\204\1\246\2\207\1\166"+
"\1\246\1\207\1\166\10\246\2\167\5\203\2\155\5\203\1\107\10\202"+
"\5\154\3\224\12\251\20\224\3\225\32\227\1\226\2\200\2\234\1\235"+
"\2\234\2\235\2\234\1\235\3\200\1\177\2\200\12\250\1\247\1\176"+
"\1\171\7\176\1\171\13\176\31\200\7\176\12\250\1\176\5\134\3\245"+
"\3\142\1\140\4\142\2\140\10\142\1\140\7\141\1\137\2\141\7\142"+
"\16\245\1\135\4\245\1\106\4\244\1\106\5\255\1\254\1\255\3\254"+
"\7\255\1\254\23\255\5\264\3\255\6\264\2\255\6\253\5\263\3\262"+
"\2\142\7\257\36\142\4\257\5\142\5\245\6\244\2\245\1\244\4\141"+
"\13\253\12\244\26\253\15\134\1\243\2\134\1\152\3\237\1\134\2\237"+
"\5\151\4\237\4\152\1\151\3\152\1\151\5\152\2\147\1\116\2\147"+
"\1\116\1\147\2\116\1\147\1\116\12\147\1\116\4\146\1\115\1\236"+
"\1\240\1\150\3\164\1\240\2\164\1\260\2\261\2\164\1\150\1\164"+
"\1\150\1\164\1\150\1\164\3\150\1\164\2\150\1\164\1\150\2\164"+
"\1\150\1\164\1\150\1\164\1\150\1\164\1\150\1\164\1\150\1\162"+
"\2\145\1\162\1\145\2\162\4\145\1\162\7\145\1\162\4\145\1\162"+
"\4\145\1\164\1\150\1\164\12\216\1\217\21\216\1\217\3\215\1\217"+
"\3\216\1\217\1\216\2\144\2\216\1\217\15\241\4\201\4\206\1\242"+
"\1\161\10\242\7\206\6\164\4\113\1\121\37\113\1\121\4\113\25\174"+
"\1\131\11\174\21\130\5\174\1\104\12\117\5\174\6\205\4\162\1\163"+
"\1\130\5\231\12\232\17\231\1\125\3\114\14\230\1\126\11\173\1\172"+
"\5\173\4\233\13\175\2\132\11\173\1\172\31\173\1\172\4\126\4\173"+
"\2\172\2\265\1\111\5\265\52\111\u1900\0\u016e\1\2\0\152\1\46\0"+
"\7\1\14\0\5\1\5\0\1\1\1\2\12\1\1\0\15\1\1\0"+
"\5\1\1\0\1\1\1\0\2\1\1\0\2\1\1\0\154\1\41\0"+
"\u016b\1\22\0\100\1\2\0\66\1\50\0\14\1\4\0\20\2\20\0"+
"\7\2\14\0\2\2\30\0\3\2\40\0\5\1\1\0\207\1\23\0"+
"\12\274\7\0\32\1\4\0\1\2\1\0\32\1\13\0\131\1\3\0"+
"\6\1\2\0\6\1\2\0\6\1\2\0\3\1\43\0";
/**
* Translates characters to character classes
@ -30673,7 +30673,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
upperCaseVariantsAccepted.put("amp", "AMP");
}
private static final CharArrayMap<Character> entityValues
= new CharArrayMap<Character>(Version.LUCENE_40, 253, false);
= new CharArrayMap<Character>(Version.LUCENE_CURRENT, 253, false);
static {
String[] entities = {
"AElig", "\u00C6", "Aacute", "\u00C1", "Acirc", "\u00C2",
@ -30812,7 +30812,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
escapeSTYLE = true;
} else {
if (null == this.escapedTags) {
this.escapedTags = new CharArraySet(Version.LUCENE_40, 16, true);
this.escapedTags = new CharArraySet(Version.LUCENE_CURRENT, 16, true);
}
this.escapedTags.add(tag);
}
@ -30895,6 +30895,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
/**
* Unpacks the compressed character translation table.
*
@ -30905,7 +30906,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
char [] map = new char[0x10000];
int i = 0; /* index in packed string */
int j = 0; /* index in unpacked array */
while (i < 2778) {
while (i < 2776) {
int count = packed.charAt(i++);
char value = packed.charAt(i++);
do map[j++] = value; while (--count > 0);

View File

@ -34,7 +34,7 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
*/
%%
%unicode 6.1
%unicode 6.3
%apiprivate
%type int
%final
@ -197,7 +197,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
escapeSTYLE = true;
} else {
if (null == this.escapedTags) {
this.escapedTags = new CharArraySet(Version.LUCENE_40, 16, true);
this.escapedTags = new CharArraySet(Version.LUCENE_CURRENT, 16, true);
}
this.escapedTags.add(tag);
}

View File

@ -61,7 +61,7 @@ def main():
print ' upperCaseVariantsAccepted.put("amp", "AMP");'
print ' }'
print ' private static final CharArrayMap<Character> entityValues'
print ' = new CharArrayMap<Character>(Version.LUCENE_40, %i, false);' % len(keys)
print ' = new CharArrayMap<Character>(Version.LUCENE_CURRENT, %i, false);' % len(keys)
print ' static {'
print ' String[] entities = {'
output_line = ' '

View File

@ -196,7 +196,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc4 = new CharArraySet(Version.LUCENE_50,
private static final CharArraySet exc4 = new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList("θ", "δ", "ελ", "γαλ", "ν", "π", "ιδ", "παρ"),
false);
@ -222,7 +222,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc6 = new CharArraySet(Version.LUCENE_50,
private static final CharArraySet exc6 = new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList("αλ", "αδ", "ενδ", "αμαν", "αμμοχαλ", "ηθ", "ανηθ",
"αντιδ", "φυσ", "βρωμ", "γερ", "εξωδ", "καλπ", "καλλιν", "καταδ",
"μουλ", "μπαν", "μπαγιατ", "μπολ", "μποσ", "νιτ", "ξικ", "συνομηλ",
@ -247,7 +247,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc7 = new CharArraySet(Version.LUCENE_50,
private static final CharArraySet exc7 = new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList("αναπ", "αποθ", "αποκ", "αποστ", "βουβ", "ξεθ", "ουλ",
"πεθ", "πικρ", "ποτ", "σιχ", "χ"),
false);
@ -274,11 +274,11 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc8a = new CharArraySet(Version.LUCENE_50,
private static final CharArraySet exc8a = new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList("τρ", "τσ"),
false);
private static final CharArraySet exc8b = new CharArraySet(Version.LUCENE_50,
private static final CharArraySet exc8b = new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList("βετερ", "βουλκ", "βραχμ", "γ", "δραδουμ", "θ", "καλπουζ",
"καστελ", "κορμορ", "λαοπλ", "μωαμεθ", "μ", "μουσουλμ", "ν", "ουλ",
"π", "πελεκ", "πλ", "πολισ", "πορτολ", "σαρακατσ", "σουλτ",
@ -337,7 +337,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc9 = new CharArraySet(Version.LUCENE_50,
private static final CharArraySet exc9 = new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList("αβαρ", "βεν", "εναρ", "αβρ", "αδ", "αθ", "αν", "απλ",
"βαρον", "ντρ", "σκ", "κοπ", "μπορ", "νιφ", "παγ", "παρακαλ", "σερπ",
"σκελ", "συρφ", "τοκ", "υ", "δ", "εμ", "θαρρ", "θ"),
@ -425,11 +425,11 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc12a = new CharArraySet(Version.LUCENE_50,
private static final CharArraySet exc12a = new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList("π", "απ", "συμπ", "ασυμπ", "ακαταπ", "αμεταμφ"),
false);
private static final CharArraySet exc12b = new CharArraySet(Version.LUCENE_50,
private static final CharArraySet exc12b = new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList("αλ", "αρ", "εκτελ", "ζ", "μ", "ξ", "παρακαλ", "αρ", "προ", "νισ"),
false);
@ -449,7 +449,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc13 = new CharArraySet(Version.LUCENE_50,
private static final CharArraySet exc13 = new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList("διαθ", "θ", "παρακαταθ", "προσθ", "συνθ"),
false);
@ -483,7 +483,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc14 = new CharArraySet(Version.LUCENE_50,
private static final CharArraySet exc14 = new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList("φαρμακ", "χαδ", "αγκ", "αναρρ", "βρομ", "εκλιπ", "λαμπιδ",
"λεχ", "μ", "πατ", "ρ", "λ", "μεδ", "μεσαζ", "υποτειν", "αμ", "αιθ",
"ανηκ", "δεσποζ", "ενδιαφερ", "δε", "δευτερευ", "καθαρευ", "πλε",
@ -521,7 +521,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc15a = new CharArraySet(Version.LUCENE_50,
private static final CharArraySet exc15a = new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList("αβαστ", "πολυφ", "αδηφ", "παμφ", "ρ", "ασπ", "αφ", "αμαλ",
"αμαλλι", "ανυστ", "απερ", "ασπαρ", "αχαρ", "δερβεν", "δροσοπ",
"ξεφ", "νεοπ", "νομοτ", "ολοπ", "ομοτ", "προστ", "προσωποπ", "συμπ",
@ -530,7 +530,7 @@ public class GreekStemmer {
"ουλαμ", "ουρ", "π", "τρ", "μ"),
false);
private static final CharArraySet exc15b = new CharArraySet(Version.LUCENE_50,
private static final CharArraySet exc15b = new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList("ψοφ", "ναυλοχ"),
false);
@ -567,7 +567,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc16 = new CharArraySet(Version.LUCENE_50,
private static final CharArraySet exc16 = new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList("ν", "χερσον", "δωδεκαν", "ερημον", "μεγαλον", "επταν"),
false);
@ -587,7 +587,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc17 = new CharArraySet(Version.LUCENE_50,
private static final CharArraySet exc17 = new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList("ασβ", "σβ", "αχρ", "χρ", "απλ", "αειμν", "δυσχρ", "ευχρ", "κοινοχρ", "παλιμψ"),
false);
@ -601,7 +601,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc18 = new CharArraySet(Version.LUCENE_50,
private static final CharArraySet exc18 = new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList("ν", "ρ", "σπι", "στραβομουτσ", "κακομουτσ", "εξων"),
false);
@ -625,7 +625,7 @@ public class GreekStemmer {
return len;
}
private static final CharArraySet exc19 = new CharArraySet(Version.LUCENE_50,
private static final CharArraySet exc19 = new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList("παρασουσ", "φ", "χ", "ωριοπλ", "αζ", "αλλοσουσ", "ασουσ"),
false);
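
All of the exc* sets above follow one pattern; here is a minimal standalone sketch of that pattern (illustrative, not part of the patch — it assumes the Lucene 4.x CharArraySet API) showing why swapping the fixed LUCENE_50 for LUCENE_CURRENT is harmless for these case-sensitive internal sets:

import java.util.Arrays;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class ExcSetDemo {
  public static void main(String[] args) {
    // matchVersion only changes legacy (pre-3.1) lowercasing behavior, and
    // these sets are built with ignoreCase == false, so LUCENE_CURRENT
    // behaves identically to the previously hard-coded LUCENE_50.
    CharArraySet exc = new CharArraySet(Version.LUCENE_CURRENT,
        Arrays.asList("θ", "δ", "ελ"), false /* ignoreCase */);
    System.out.println(exc.contains("ελ")); // true
    System.out.println(exc.contains("ΕΛ")); // false: lookups stay case-sensitive
  }
}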

View File

@ -280,10 +280,7 @@ public class KStemmer {
DictEntry defaultEntry;
DictEntry entry;
CharArrayMap<DictEntry> d = new CharArrayMap<DictEntry>(
Version.LUCENE_50, 1000, false);
d = new CharArrayMap<DictEntry>(Version.LUCENE_50, 1000, false);
CharArrayMap<DictEntry> d = new CharArrayMap<DictEntry>(Version.LUCENE_CURRENT, 1000, false);
for (int i = 0; i < exceptionWords.length; i++) {
if (!d.containsKey(exceptionWords[i])) {
entry = new DictEntry(exceptionWords[i], true);

View File

@ -34,7 +34,7 @@ public class HunspellStemmer {
private final int recursionCap;
private final HunspellDictionary dictionary;
private final StringBuilder segment = new StringBuilder();
private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_40);
private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_CURRENT);
/**
* Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems. Uses the
@ -324,7 +324,8 @@ public class HunspellStemmer {
InputStream affixInputStream = new FileInputStream(args[offset++]);
InputStream dicInputStream = new FileInputStream(args[offset++]);
HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_40, ignoreCase);
// :Post-Release-Update-Version.LUCENE_XY:
HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_50, ignoreCase);
affixInputStream.close();
dicInputStream.close();

View File

@ -35,7 +35,7 @@ public final class RemoveDuplicatesTokenFilter extends TokenFilter {
private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
// use a fixed version, as we don't care about case sensitivity.
private final CharArraySet previous = new CharArraySet(Version.LUCENE_50, 8, false);
private final CharArraySet previous = new CharArraySet(Version.LUCENE_CURRENT, 8, false);
/**
* Creates a new RemoveDuplicatesTokenFilter

View File

@ -134,7 +134,7 @@ public abstract class RSLPStemmerBase {
if (!exceptions[i].endsWith(suffix))
throw new RuntimeException("useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
}
this.exceptions = new CharArraySet(Version.LUCENE_50,
this.exceptions = new CharArraySet(Version.LUCENE_CURRENT,
Arrays.asList(exceptions), false);
}

View File

@ -1,11 +1,12 @@
/*
* Copyright 2001-2005 The Apache Software Foundation.
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@ -13,10 +14,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
// file version from Saturday, July 14, 2012 4:34:14 AM UTC
// generated on Sunday, July 15, 2012 12:59:44 AM UTC
// file version from Friday, December 6, 2013 4:34:10 AM UTC
// generated on Friday, December 6, 2013 3:21:59 PM UTC
// by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
ASCIITLD = "." (
@ -49,6 +49,7 @@ ASCIITLD = "." (
| [bB][gG]
| [bB][hH]
| [bB][iI]
| [bB][iI][kK][eE]
| [bB][iI][zZ]
| [bB][jJ]
| [bB][mM]
@ -62,6 +63,7 @@ ASCIITLD = "." (
| [bB][yY]
| [bB][zZ]
| [cC][aA]
| [cC][aA][mM][eE][rR][aA]
| [cC][aA][tT]
| [cC][cC]
| [cC][dD]
@ -71,10 +73,13 @@ ASCIITLD = "." (
| [cC][iI]
| [cC][kK]
| [cC][lL]
| [cC][lL][oO][tT][hH][iI][nN][gG]
| [cC][mM]
| [cC][nN]
| [cC][oO]
| [cC][oO][mM]
| [cC][oO][nN][sS][tT][rR][uU][cC][tT][iI][oO][nN]
| [cC][oO][nN][tT][rR][aA][cC][tT][oO][rR][sS]
| [cC][oO][oO][pP]
| [cC][rR]
| [cC][uU]
@ -84,6 +89,8 @@ ASCIITLD = "." (
| [cC][yY]
| [cC][zZ]
| [dD][eE]
| [dD][iI][aA][mM][oO][nN][dD][sS]
| [dD][iI][rR][eE][cC][tT][oO][rR][yY]
| [dD][jJ]
| [dD][kK]
| [dD][mM]
@ -93,8 +100,11 @@ ASCIITLD = "." (
| [eE][dD][uU]
| [eE][eE]
| [eE][gG]
| [eE][nN][tT][eE][rR][pP][rR][iI][sS][eE][sS]
| [eE][qQ][uU][iI][pP][mM][eE][nN][tT]
| [eE][rR]
| [eE][sS]
| [eE][sS][tT][aA][tT][eE]
| [eE][tT]
| [eE][uU]
| [fF][iI]
@ -104,6 +114,7 @@ ASCIITLD = "." (
| [fF][oO]
| [fF][rR]
| [gG][aA]
| [gG][aA][lL][lL][eE][rR][yY]
| [gG][bB]
| [gG][dD]
| [gG][eE]
@ -118,14 +129,17 @@ ASCIITLD = "." (
| [gG][pP]
| [gG][qQ]
| [gG][rR]
| [gG][rR][aA][pP][hH][iI][cC][sS]
| [gG][sS]
| [gG][tT]
| [gG][uU]
| [gG][uU][rR][uU]
| [gG][wW]
| [gG][yY]
| [hH][kK]
| [hH][mM]
| [hH][nN]
| [hH][oO][lL][dD][iI][nN][gG][sS]
| [hH][rR]
| [hH][tT]
| [hH][uU]
@ -150,6 +164,7 @@ ASCIITLD = "." (
| [kK][gG]
| [kK][hH]
| [kK][iI]
| [kK][iI][tT][cC][hH][eE][nN]
| [kK][mM]
| [kK][nN]
| [kK][pP]
@ -158,9 +173,11 @@ ASCIITLD = "." (
| [kK][yY]
| [kK][zZ]
| [lL][aA]
| [lL][aA][nN][dD]
| [lL][bB]
| [lL][cC]
| [lL][iI]
| [lL][iI][gG][hH][tT][iI][nN][gG]
| [lL][kK]
| [lL][rR]
| [lL][sS]
@ -172,6 +189,7 @@ ASCIITLD = "." (
| [mM][cC]
| [mM][dD]
| [mM][eE]
| [mM][eE][nN][uU]
| [mM][gG]
| [mM][hH]
| [mM][iI][lL]
@ -214,10 +232,13 @@ ASCIITLD = "." (
| [pP][fF]
| [pP][gG]
| [pP][hH]
| [pP][hH][oO][tT][oO][gG][rR][aA][pP][hH][yY]
| [pP][kK]
| [pP][lL]
| [pP][lL][uU][mM][bB][iI][nN][gG]
| [pP][mM]
| [pP][nN]
| [pP][oO][sS][tT]
| [pP][rR]
| [pP][rR][oO]
| [pP][sS]
@ -235,9 +256,11 @@ ASCIITLD = "." (
| [sS][cC]
| [sS][dD]
| [sS][eE]
| [sS][eE][xX][yY]
| [sS][gG]
| [sS][hH]
| [sS][iI]
| [sS][iI][nN][gG][lL][eE][sS]
| [sS][jJ]
| [sS][kK]
| [sS][lL]
@ -251,18 +274,22 @@ ASCIITLD = "." (
| [sS][xX]
| [sS][yY]
| [sS][zZ]
| [tT][aA][tT][tT][oO][oO]
| [tT][cC]
| [tT][dD]
| [tT][eE][cC][hH][nN][oO][lL][oO][gG][yY]
| [tT][eE][lL]
| [tT][fF]
| [tT][gG]
| [tT][hH]
| [tT][iI][pP][sS]
| [tT][jJ]
| [tT][kK]
| [tT][lL]
| [tT][mM]
| [tT][nN]
| [tT][oO]
| [tT][oO][dD][aA][yY]
| [tT][pP]
| [tT][rR]
| [tT][rR][aA][vV][eE][lL]
@ -273,61 +300,62 @@ ASCIITLD = "." (
| [uU][aA]
| [uU][gG]
| [uU][kK]
| [uU][nN][oO]
| [uU][sS]
| [uU][yY]
| [uU][zZ]
| [vV][aA]
| [vV][cC]
| [vV][eE]
| [vV][eE][nN][tT][uU][rR][eE][sS]
| [vV][gG]
| [vV][iI]
| [vV][nN]
| [vV][oO][yY][aA][gG][eE]
| [vV][uU]
| [wW][fF]
| [wW][sS]
| [xX][nN]--0[zZ][wW][mM]56[dD]
| [xX][nN]--11[bB]5[bB][sS]3[aA]9[aA][jJ]6[gG]
| [xX][nN]--3[eE]0[bB]707[eE]
| [xX][nN]--45[bB][rR][jJ]9[cC]
| [xX][nN]--80[aA][kK][hH][bB][yY][kK][nN][jJ]4[fF]
| [xX][nN]--80[aA][oO]21[aA]
| [xX][nN]--80[aA][sS][eE][hH][dD][bB]
| [xX][nN]--80[aA][sS][wW][gG]
| [xX][nN]--90[aA]3[aA][cC]
| [xX][nN]--9[tT]4[bB]11[yY][iI]5[aA]
| [xX][nN]--[cC][lL][cC][hH][cC]0[eE][aA]0[bB]2[gG]2[aA]9[gG][cC][dD]
| [xX][nN]--[dD][eE][bB][aA]0[aA][dD]
| [xX][nN]--[fF][iI][qQ][sS]8[sS]
| [xX][nN]--[fF][iI][qQ][zZ]9[sS]
| [xX][nN]--[fF][pP][cC][rR][jJ]9[cC]3[dD]
| [xX][nN]--[fF][zZ][cC]2[cC]9[eE]2[cC]
| [xX][nN]--[gG]6[wW]251[dD]
| [xX][nN]--[gG][eE][cC][rR][jJ]9[cC]
| [xX][nN]--[hH]2[bB][rR][jJ]9[cC]
| [xX][nN]--[hH][gG][bB][kK]6[aA][jJ]7[fF]53[bB][bB][aA]
| [xX][nN]--[hH][lL][cC][jJ]6[aA][yY][aA]9[eE][sS][cC]7[aA]
| [xX][nN]--[jJ]1[aA][mM][hH]
| [xX][nN]--[jJ]6[wW]193[gG]
| [xX][nN]--[jJ][xX][aA][lL][pP][dD][lL][pP]
| [xX][nN]--[kK][gG][bB][eE][cC][hH][tT][vV]
| [xX][nN]--[kK][pP][rR][wW]13[dD]
| [xX][nN]--[kK][pP][rR][yY]57[dD]
| [xX][nN]--[lL]1[aA][cC][cC]
| [xX][nN]--[lL][gG][bB][bB][aA][tT]1[aA][dD]8[jJ]
| [xX][nN]--[mM][gG][bB]9[aA][wW][bB][fF]
| [xX][nN]--[mM][gG][bB][aA]3[aA]4[fF]16[aA]
| [xX][nN]--[mM][gG][bB][aA][aA][mM]7[aA]8[hH]
| [xX][nN]--[mM][gG][bB][aA][yY][hH]7[gG][pP][aA]
| [xX][nN]--[mM][gG][bB][bB][hH]1[aA]71[eE]
| [xX][nN]--[mM][gG][bB][cC]0[aA]9[aA][zZ][cC][gG]
| [xX][nN]--[mM][gG][bB][eE][rR][pP]4[aA]5[dD]4[aA][rR]
| [xX][nN]--[mM][gG][bB][xX]4[cC][dD]0[aA][bB]
| [xX][nN]--[nN][gG][bB][cC]5[aA][zZ][dD]
| [xX][nN]--[oO]3[cC][wW]4[hH]
| [xX][nN]--[oO][gG][bB][pP][fF]8[fF][lL]
| [xX][nN]--[pP]1[aA][iI]
| [xX][nN]--[pP][gG][bB][sS]0[dD][hH]
| [xX][nN]--[qQ]9[jJ][yY][bB]4[cC]
| [xX][nN]--[sS]9[bB][rR][jJ]9[cC]
| [xX][nN]--[uU][nN][uU][pP]4[yY]
| [xX][nN]--[wW][gG][bB][hH]1[cC]
| [xX][nN]--[wW][gG][bB][lL]6[aA]
| [xX][nN]--[xX][kK][cC]2[aA][lL]3[hH][yY][eE]2[aA]
| [xX][nN]--[xX][kK][cC]2[dD][lL]3[aA]5[eE][eE]0[hH]
| [xX][nN]--[yY][fF][rR][oO]4[iI]67[oO]
| [xX][nN]--[yY][gG][bB][iI]2[aA][mM][mM][xX]
| [xX][nN]--[zZ][cC][kK][zZ][aA][hH]
| [xX][xX][xX]
| [yY][eE]
| [yY][tT]

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex. */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT */
package org.apache.lucene.analysis.standard;
@ -58,64 +58,63 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
* Translates characters to character classes
*/
private static final String ZZ_CMAP_PACKED =
"\11\0\1\0\1\15\1\0\1\0\1\14\22\0\1\0\5\0\1\5"+
"\1\3\4\0\1\11\1\7\1\4\1\11\12\2\6\0\1\6\32\12"+
"\4\0\1\10\1\0\32\12\57\0\1\12\12\0\1\12\4\0\1\12"+
"\5\0\27\12\1\0\37\12\1\0\u0128\12\2\0\22\12\34\0\136\12"+
"\2\0\11\12\2\0\7\12\16\0\2\12\16\0\5\12\11\0\1\12"+
"\213\0\1\12\13\0\1\12\1\0\3\12\1\0\1\12\1\0\24\12"+
"\1\0\54\12\1\0\10\12\2\0\32\12\14\0\202\12\12\0\71\12"+
"\2\0\2\12\2\0\2\12\3\0\46\12\2\0\2\12\67\0\46\12"+
"\2\0\1\12\7\0\47\12\110\0\33\12\5\0\3\12\56\0\32\12"+
"\5\0\13\12\25\0\12\2\7\0\143\12\1\0\1\12\17\0\2\12"+
"\11\0\12\2\3\12\23\0\1\12\1\0\33\12\123\0\46\12\u015f\0"+
"\65\12\3\0\1\12\22\0\1\12\7\0\12\12\4\0\12\2\25\0"+
"\10\12\2\0\2\12\2\0\26\12\1\0\7\12\1\0\1\12\3\0"+
"\4\12\42\0\2\12\1\0\3\12\4\0\12\2\2\12\23\0\6\12"+
"\4\0\2\12\2\0\26\12\1\0\7\12\1\0\2\12\1\0\2\12"+
"\1\0\2\12\37\0\4\12\1\0\1\12\7\0\12\2\2\0\3\12"+
"\20\0\7\12\1\0\1\12\1\0\3\12\1\0\26\12\1\0\7\12"+
"\1\0\2\12\1\0\5\12\3\0\1\12\22\0\1\12\17\0\1\12"+
"\5\0\12\2\25\0\10\12\2\0\2\12\2\0\26\12\1\0\7\12"+
"\1\0\2\12\2\0\4\12\3\0\1\12\36\0\2\12\1\0\3\12"+
"\4\0\12\2\25\0\6\12\3\0\3\12\1\0\4\12\3\0\2\12"+
"\1\0\1\12\1\0\2\12\3\0\2\12\3\0\3\12\3\0\10\12"+
"\1\0\3\12\55\0\11\2\25\0\10\12\1\0\3\12\1\0\27\12"+
"\1\0\12\12\1\0\5\12\46\0\2\12\4\0\12\2\25\0\10\12"+
"\1\0\3\12\1\0\27\12\1\0\12\12\1\0\5\12\44\0\1\12"+
"\1\0\2\12\4\0\12\2\25\0\10\12\1\0\3\12\1\0\27\12"+
"\1\0\20\12\46\0\2\12\4\0\12\2\25\0\22\12\3\0\30\12"+
"\1\0\11\12\1\0\1\12\2\0\7\12\71\0\1\1\60\12\1\1"+
"\2\12\14\1\7\12\11\1\12\2\47\0\2\12\1\0\1\12\2\0"+
"\2\12\1\0\1\12\2\0\1\12\6\0\4\12\1\0\7\12\1\0"+
"\3\12\1\0\1\12\1\0\1\12\2\0\2\12\1\0\4\12\1\0"+
"\2\12\11\0\1\12\2\0\5\12\1\0\1\12\11\0\12\2\2\0"+
"\2\12\42\0\1\12\37\0\12\2\26\0\10\12\1\0\42\12\35\0"+
"\4\12\164\0\42\12\1\0\5\12\1\0\2\12\25\0\12\2\6\0"+
"\6\12\112\0\46\12\12\0\47\12\11\0\132\12\5\0\104\12\5\0"+
"\122\12\6\0\7\12\1\0\77\12\1\0\1\12\1\0\4\12\2\0"+
"\7\12\1\0\1\12\1\0\4\12\2\0\47\12\1\0\1\12\1\0"+
"\4\12\2\0\37\12\1\0\1\12\1\0\4\12\2\0\7\12\1\0"+
"\1\12\1\0\4\12\2\0\7\12\1\0\7\12\1\0\27\12\1\0"+
"\37\12\1\0\1\12\1\0\4\12\2\0\7\12\1\0\47\12\1\0"+
"\23\12\16\0\11\2\56\0\125\12\14\0\u026c\12\2\0\10\12\12\0"+
"\32\12\5\0\113\12\225\0\64\12\54\0\12\2\46\0\12\2\6\0"+
"\130\12\10\0\51\12\u0557\0\234\12\4\0\132\12\6\0\26\12\2\0"+
"\6\12\2\0\46\12\2\0\6\12\2\0\10\12\1\0\1\12\1\0"+
"\1\12\1\0\1\12\1\0\37\12\2\0\65\12\1\0\7\12\1\0"+
"\1\12\3\0\3\12\1\0\7\12\3\0\4\12\2\0\6\12\4\0"+
"\15\12\5\0\3\12\1\0\7\12\202\0\1\12\202\0\1\12\4\0"+
"\1\12\2\0\12\12\1\0\1\12\3\0\5\12\6\0\1\12\1\0"+
"\1\12\1\0\1\12\1\0\4\12\1\0\3\12\1\0\7\12\u0ecb\0"+
"\2\12\52\0\5\12\12\0\1\13\124\13\10\13\2\13\2\13\132\13"+
"\1\13\3\13\6\13\50\13\3\13\1\0\136\12\21\0\30\12\70\0"+
"\20\13\u0100\0\200\13\200\0\u19b6\13\12\13\100\0\u51a6\13\132\13\u048d\12"+
"\u0773\0\u2ba4\12\u215c\0\u012e\13\322\13\7\12\14\0\5\12\5\0\1\12"+
"\1\0\12\12\1\0\15\12\1\0\5\12\1\0\1\12\1\0\2\12"+
"\1\0\2\12\1\0\154\12\41\0\u016b\12\22\0\100\12\2\0\66\12"+
"\50\0\14\12\164\0\3\12\1\0\1\12\1\0\207\12\23\0\12\2"+
"\7\0\32\12\6\0\32\12\12\0\1\13\72\13\37\12\3\0\6\12"+
"\2\0\6\12\2\0\6\12\2\0\3\12\43\0";
"\46\0\1\5\1\3\4\0\1\11\1\7\1\4\1\11\12\2\6\0"+
"\1\6\32\12\4\0\1\10\1\0\32\12\57\0\1\12\12\0\1\12"+
"\4\0\1\12\5\0\27\12\1\0\37\12\1\0\u0128\12\2\0\22\12"+
"\34\0\136\12\2\0\11\12\2\0\7\12\16\0\2\12\16\0\5\12"+
"\11\0\1\12\213\0\1\12\13\0\1\12\1\0\3\12\1\0\1\12"+
"\1\0\24\12\1\0\54\12\1\0\10\12\2\0\32\12\14\0\202\12"+
"\12\0\71\12\2\0\2\12\2\0\2\12\3\0\46\12\2\0\2\12"+
"\67\0\46\12\2\0\1\12\7\0\47\12\110\0\33\12\5\0\3\12"+
"\56\0\32\12\5\0\13\12\25\0\12\2\7\0\143\12\1\0\1\12"+
"\17\0\2\12\11\0\12\2\3\12\23\0\1\12\1\0\33\12\123\0"+
"\46\12\u015f\0\65\12\3\0\1\12\22\0\1\12\7\0\12\12\4\0"+
"\12\2\25\0\10\12\2\0\2\12\2\0\26\12\1\0\7\12\1\0"+
"\1\12\3\0\4\12\42\0\2\12\1\0\3\12\4\0\12\2\2\12"+
"\23\0\6\12\4\0\2\12\2\0\26\12\1\0\7\12\1\0\2\12"+
"\1\0\2\12\1\0\2\12\37\0\4\12\1\0\1\12\7\0\12\2"+
"\2\0\3\12\20\0\7\12\1\0\1\12\1\0\3\12\1\0\26\12"+
"\1\0\7\12\1\0\2\12\1\0\5\12\3\0\1\12\22\0\1\12"+
"\17\0\1\12\5\0\12\2\25\0\10\12\2\0\2\12\2\0\26\12"+
"\1\0\7\12\1\0\2\12\2\0\4\12\3\0\1\12\36\0\2\12"+
"\1\0\3\12\4\0\12\2\25\0\6\12\3\0\3\12\1\0\4\12"+
"\3\0\2\12\1\0\1\12\1\0\2\12\3\0\2\12\3\0\3\12"+
"\3\0\10\12\1\0\3\12\55\0\11\2\25\0\10\12\1\0\3\12"+
"\1\0\27\12\1\0\12\12\1\0\5\12\46\0\2\12\4\0\12\2"+
"\25\0\10\12\1\0\3\12\1\0\27\12\1\0\12\12\1\0\5\12"+
"\44\0\1\12\1\0\2\12\4\0\12\2\25\0\10\12\1\0\3\12"+
"\1\0\27\12\1\0\20\12\46\0\2\12\4\0\12\2\25\0\22\12"+
"\3\0\30\12\1\0\11\12\1\0\1\12\2\0\7\12\71\0\1\1"+
"\60\12\1\1\2\12\14\1\7\12\11\1\12\2\47\0\2\12\1\0"+
"\1\12\2\0\2\12\1\0\1\12\2\0\1\12\6\0\4\12\1\0"+
"\7\12\1\0\3\12\1\0\1\12\1\0\1\12\2\0\2\12\1\0"+
"\4\12\1\0\2\12\11\0\1\12\2\0\5\12\1\0\1\12\11\0"+
"\12\2\2\0\2\12\42\0\1\12\37\0\12\2\26\0\10\12\1\0"+
"\42\12\35\0\4\12\164\0\42\12\1\0\5\12\1\0\2\12\25\0"+
"\12\2\6\0\6\12\112\0\46\12\12\0\47\12\11\0\132\12\5\0"+
"\104\12\5\0\122\12\6\0\7\12\1\0\77\12\1\0\1\12\1\0"+
"\4\12\2\0\7\12\1\0\1\12\1\0\4\12\2\0\47\12\1\0"+
"\1\12\1\0\4\12\2\0\37\12\1\0\1\12\1\0\4\12\2\0"+
"\7\12\1\0\1\12\1\0\4\12\2\0\7\12\1\0\7\12\1\0"+
"\27\12\1\0\37\12\1\0\1\12\1\0\4\12\2\0\7\12\1\0"+
"\47\12\1\0\23\12\16\0\11\2\56\0\125\12\14\0\u026c\12\2\0"+
"\10\12\12\0\32\12\5\0\113\12\225\0\64\12\54\0\12\2\46\0"+
"\12\2\6\0\130\12\10\0\51\12\u0557\0\234\12\4\0\132\12\6\0"+
"\26\12\2\0\6\12\2\0\46\12\2\0\6\12\2\0\10\12\1\0"+
"\1\12\1\0\1\12\1\0\1\12\1\0\37\12\2\0\65\12\1\0"+
"\7\12\1\0\1\12\3\0\3\12\1\0\7\12\3\0\4\12\2\0"+
"\6\12\4\0\15\12\5\0\3\12\1\0\7\12\202\0\1\12\202\0"+
"\1\12\4\0\1\12\2\0\12\12\1\0\1\12\3\0\5\12\6\0"+
"\1\12\1\0\1\12\1\0\1\12\1\0\4\12\1\0\3\12\1\0"+
"\7\12\u0ecb\0\2\12\52\0\5\12\12\0\1\13\124\13\10\13\2\13"+
"\2\13\132\13\1\13\3\13\6\13\50\13\3\13\1\0\136\12\21\0"+
"\30\12\70\0\20\13\u0100\0\200\13\200\0\u19b6\13\12\13\100\0\u51a6\13"+
"\132\13\u048d\12\u0773\0\u2ba4\12\u215c\0\u012e\13\322\13\7\12\14\0\5\12"+
"\5\0\1\12\1\0\12\12\1\0\15\12\1\0\5\12\1\0\1\12"+
"\1\0\2\12\1\0\2\12\1\0\154\12\41\0\u016b\12\22\0\100\12"+
"\2\0\66\12\50\0\14\12\164\0\3\12\1\0\1\12\1\0\207\12"+
"\23\0\12\2\7\0\32\12\6\0\32\12\12\0\1\13\72\13\37\12"+
"\3\0\6\12\2\0\6\12\2\0\6\12\2\0\3\12\43\0";
/**
* Translates characters to character classes
@ -128,13 +127,12 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
private static final int [] ZZ_ACTION = zzUnpackAction();
private static final String ZZ_ACTION_PACKED_0 =
"\1\0\1\1\3\2\1\3\1\1\13\0\1\2\3\4"+
"\2\0\1\5\1\0\1\5\3\4\6\5\1\6\1\4"+
"\2\7\1\10\1\0\1\10\3\0\2\10\1\11\1\12"+
"\1\4";
"\1\0\1\1\3\2\1\3\13\0\1\2\3\4\2\0"+
"\1\5\1\0\1\5\3\4\6\5\1\6\1\4\2\7"+
"\1\10\1\0\1\10\3\0\2\10\1\11\1\12\1\4";
private static int [] zzUnpackAction() {
int [] result = new int[51];
int [] result = new int[50];
int offset = 0;
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
return result;
@ -159,16 +157,16 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
private static final String ZZ_ROWMAP_PACKED_0 =
"\0\0\0\16\0\34\0\52\0\70\0\16\0\106\0\124"+
"\0\142\0\160\0\176\0\214\0\232\0\250\0\266\0\304"+
"\0\322\0\340\0\356\0\374\0\u010a\0\u0118\0\u0126\0\u0134"+
"\0\u0142\0\u0150\0\u015e\0\u016c\0\u017a\0\u0188\0\u0196\0\u01a4"+
"\0\u01b2\0\u01c0\0\u01ce\0\u01dc\0\u01ea\0\u01f8\0\322\0\u0206"+
"\0\u0214\0\u0222\0\u0230\0\u023e\0\u024c\0\u025a\0\124\0\214"+
"\0\u0268\0\u0276\0\u0284";
"\0\0\0\14\0\30\0\44\0\60\0\14\0\74\0\110"+
"\0\124\0\140\0\154\0\170\0\204\0\220\0\234\0\250"+
"\0\264\0\300\0\314\0\330\0\344\0\360\0\374\0\u0108"+
"\0\u0114\0\u0120\0\u012c\0\u0138\0\u0144\0\u0150\0\u015c\0\u0168"+
"\0\u0174\0\u0180\0\u018c\0\u0198\0\u01a4\0\250\0\u01b0\0\u01bc"+
"\0\u01c8\0\u01d4\0\u01e0\0\u01ec\0\u01f8\0\74\0\154\0\u0204"+
"\0\u0210\0\u021c";
private static int [] zzUnpackRowMap() {
int [] result = new int[51];
int [] result = new int[50];
int offset = 0;
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
return result;
@ -191,49 +189,49 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
private static final int [] ZZ_TRANS = zzUnpackTrans();
private static final String ZZ_TRANS_PACKED_0 =
"\1\2\1\3\1\4\7\2\1\5\1\6\1\7\1\2"+
"\17\0\2\3\1\0\1\10\1\0\1\11\2\12\1\13"+
"\1\3\4\0\1\3\1\4\1\0\1\14\1\0\1\11"+
"\2\15\1\16\1\4\4\0\1\3\1\4\1\17\1\20"+
"\1\21\1\22\2\12\1\13\1\23\20\0\1\2\1\0"+
"\1\24\1\25\7\0\1\26\4\0\2\27\7\0\1\27"+
"\4\0\1\30\1\31\7\0\1\32\5\0\1\33\7\0"+
"\1\13\4\0\1\34\1\35\7\0\1\36\4\0\1\37"+
"\1\40\7\0\1\41\4\0\1\42\1\43\7\0\1\44"+
"\15\0\1\45\4\0\1\24\1\25\7\0\1\46\15\0"+
"\1\47\4\0\2\27\7\0\1\50\4\0\1\3\1\4"+
"\1\17\1\10\1\21\1\22\2\12\1\13\1\23\4\0"+
"\2\24\1\0\1\51\1\0\1\11\2\52\1\0\1\24"+
"\4\0\1\24\1\25\1\0\1\53\1\0\1\11\2\54"+
"\1\55\1\25\4\0\1\24\1\25\1\0\1\51\1\0"+
"\1\11\2\52\1\0\1\26\4\0\2\27\1\0\1\56"+
"\2\0\1\56\2\0\1\27\4\0\2\30\1\0\1\52"+
"\1\0\1\11\2\52\1\0\1\30\4\0\1\30\1\31"+
"\1\0\1\54\1\0\1\11\2\54\1\55\1\31\4\0"+
"\1\30\1\31\1\0\1\52\1\0\1\11\2\52\1\0"+
"\1\32\5\0\1\33\1\0\1\55\2\0\3\55\1\33"+
"\4\0\2\34\1\0\1\57\1\0\1\11\2\12\1\13"+
"\1\34\4\0\1\34\1\35\1\0\1\60\1\0\1\11"+
"\2\15\1\16\1\35\4\0\1\34\1\35\1\0\1\57"+
"\1\0\1\11\2\12\1\13\1\36\4\0\2\37\1\0"+
"\1\12\1\0\1\11\2\12\1\13\1\37\4\0\1\37"+
"\1\40\1\0\1\15\1\0\1\11\2\15\1\16\1\40"+
"\4\0\1\37\1\40\1\0\1\12\1\0\1\11\2\12"+
"\1\13\1\41\4\0\2\42\1\0\1\13\2\0\3\13"+
"\1\42\4\0\1\42\1\43\1\0\1\16\2\0\3\16"+
"\1\43\4\0\1\42\1\43\1\0\1\13\2\0\3\13"+
"\1\44\6\0\1\17\6\0\1\45\4\0\1\24\1\25"+
"\1\0\1\61\1\0\1\11\2\52\1\0\1\26\4\0"+
"\2\27\1\0\1\56\2\0\1\56\2\0\1\50\4\0"+
"\2\24\7\0\1\24\4\0\2\30\7\0\1\30\4\0"+
"\2\34\7\0\1\34\4\0\2\37\7\0\1\37\4\0"+
"\2\42\7\0\1\42\4\0\2\62\7\0\1\62\4\0"+
"\2\24\7\0\1\63\4\0\2\62\1\0\1\56\2\0"+
"\1\56\2\0\1\62\4\0\2\24\1\0\1\61\1\0"+
"\1\11\2\52\1\0\1\24\3\0";
"\1\2\1\3\1\4\7\2\1\5\1\6\15\0\2\3"+
"\1\0\1\7\1\0\1\10\2\11\1\12\1\3\2\0"+
"\1\3\1\4\1\0\1\13\1\0\1\10\2\14\1\15"+
"\1\4\2\0\1\3\1\4\1\16\1\17\1\20\1\21"+
"\2\11\1\12\1\22\2\0\1\23\1\24\7\0\1\25"+
"\2\0\2\26\7\0\1\26\2\0\1\27\1\30\7\0"+
"\1\31\3\0\1\32\7\0\1\12\2\0\1\33\1\34"+
"\7\0\1\35\2\0\1\36\1\37\7\0\1\40\2\0"+
"\1\41\1\42\7\0\1\43\13\0\1\44\2\0\1\23"+
"\1\24\7\0\1\45\13\0\1\46\2\0\2\26\7\0"+
"\1\47\2\0\1\3\1\4\1\16\1\7\1\20\1\21"+
"\2\11\1\12\1\22\2\0\2\23\1\0\1\50\1\0"+
"\1\10\2\51\1\0\1\23\2\0\1\23\1\24\1\0"+
"\1\52\1\0\1\10\2\53\1\54\1\24\2\0\1\23"+
"\1\24\1\0\1\50\1\0\1\10\2\51\1\0\1\25"+
"\2\0\2\26\1\0\1\55\2\0\1\55\2\0\1\26"+
"\2\0\2\27\1\0\1\51\1\0\1\10\2\51\1\0"+
"\1\27\2\0\1\27\1\30\1\0\1\53\1\0\1\10"+
"\2\53\1\54\1\30\2\0\1\27\1\30\1\0\1\51"+
"\1\0\1\10\2\51\1\0\1\31\3\0\1\32\1\0"+
"\1\54\2\0\3\54\1\32\2\0\2\33\1\0\1\56"+
"\1\0\1\10\2\11\1\12\1\33\2\0\1\33\1\34"+
"\1\0\1\57\1\0\1\10\2\14\1\15\1\34\2\0"+
"\1\33\1\34\1\0\1\56\1\0\1\10\2\11\1\12"+
"\1\35\2\0\2\36\1\0\1\11\1\0\1\10\2\11"+
"\1\12\1\36\2\0\1\36\1\37\1\0\1\14\1\0"+
"\1\10\2\14\1\15\1\37\2\0\1\36\1\37\1\0"+
"\1\11\1\0\1\10\2\11\1\12\1\40\2\0\2\41"+
"\1\0\1\12\2\0\3\12\1\41\2\0\1\41\1\42"+
"\1\0\1\15\2\0\3\15\1\42\2\0\1\41\1\42"+
"\1\0\1\12\2\0\3\12\1\43\4\0\1\16\6\0"+
"\1\44\2\0\1\23\1\24\1\0\1\60\1\0\1\10"+
"\2\51\1\0\1\25\2\0\2\26\1\0\1\55\2\0"+
"\1\55\2\0\1\47\2\0\2\23\7\0\1\23\2\0"+
"\2\27\7\0\1\27\2\0\2\33\7\0\1\33\2\0"+
"\2\36\7\0\1\36\2\0\2\41\7\0\1\41\2\0"+
"\2\61\7\0\1\61\2\0\2\23\7\0\1\62\2\0"+
"\2\61\1\0\1\55\2\0\1\55\2\0\1\61\2\0"+
"\2\23\1\0\1\60\1\0\1\10\2\51\1\0\1\23"+
"\1\0";
private static int [] zzUnpackTrans() {
int [] result = new int[658];
int [] result = new int[552];
int offset = 0;
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
return result;
@ -271,11 +269,11 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
private static final String ZZ_ATTRIBUTE_PACKED_0 =
"\1\0\1\11\3\1\1\11\1\1\13\0\4\1\2\0"+
"\1\1\1\0\17\1\1\0\1\1\3\0\5\1";
"\1\0\1\11\3\1\1\11\13\0\4\1\2\0\1\1"+
"\1\0\17\1\1\0\1\1\3\0\5\1";
private static int [] zzUnpackAttribute() {
int [] result = new int[51];
int [] result = new int[50];
int offset = 0;
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
return result;
@ -372,7 +370,6 @@ public final void getText(CharTermAttribute t) {
/**
* Creates a new scanner
* There is also a java.io.InputStream version of this constructor.
*
* @param in the java.io.Reader to read input from.
*/
@ -380,7 +377,6 @@ public final void getText(CharTermAttribute t) {
this.zzReader = in;
}
/**
* Unpacks the compressed character translation table.
@ -392,7 +388,7 @@ public final void getText(CharTermAttribute t) {
char [] map = new char[0x10000];
int i = 0; /* index in packed string */
int j = 0; /* index in unpacked array */
while (i < 1154) {
while (i < 1138) {
int count = packed.charAt(i++);
char value = packed.charAt(i++);
do map[j++] = value; while (--count > 0);
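
The bookkeeping changes above (arrays resized from 51 to 50, loop bound 1154 to 1138) all fall out of the same run-length encoding. A hedged standalone sketch of that format, mirroring the generated unpack loop in this hunk:

// Sketch of the JFlex packed-string format used by ZZ_CMAP_PACKED and the
// other ZZ_* tables: the string is a sequence of (count, value) char pairs,
// so "\11\0" expands to nine 0s and "\1\15" to a single 13 (octal escapes).
static char[] unpack(String packed, int unpackedLength) {
  char[] map = new char[unpackedLength];
  int i = 0; // index into the packed string
  int j = 0; // index into the unpacked array
  while (i < packed.length()) {
    int count = packed.charAt(i++);  // run length
    char value = packed.charAt(i++); // value to repeat
    do { map[j++] = value; } while (--count > 0);
  }
  return map;
}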

View File

@ -116,8 +116,6 @@ LETTER = !(![:letter:]|{CJ})
// Chinese and Japanese (but NOT Korean, which is included in [:letter:])
CJ = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
WHITESPACE = \r\n | [ \r\n\t\f]
%%
{ALPHANUM} { return ALPHANUM; }
@ -131,4 +129,4 @@ WHITESPACE = \r\n | [ \r\n\t\f]
{ACRONYM_DEP} { return ACRONYM_DEP; }
/** Ignore the rest */
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }

View File

@ -18,4 +18,4 @@
WARNING: if you change StandardTokenizerImpl*.jflex or UAX29URLEmailTokenizer
and need to regenerate the tokenizer, only use the trunk version
of JFlex 1.5 (with a minimum SVN revision 607) at the moment!
of JFlex 1.5 (with a minimum SVN revision 722) at the moment!

View File

@ -1,11 +1,12 @@
/*
* Copyright 2010 The Apache Software Foundation.
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@ -13,8 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Generated using ICU4J 49.1.0.0
// Generated using ICU4J 52.1.0.0
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
@ -39,6 +39,12 @@ FormatSupp = (
| ([\ud834][\uDD73-\uDD7A])
| ([\udb40][\uDC01\uDC20-\uDC7F])
)
NumericSupp = (
([\ud805][\uDEC0-\uDEC9])
| ([\ud804][\uDC66-\uDC6F\uDCF0-\uDCF9\uDD36-\uDD3F\uDDD0-\uDDD9])
| ([\ud835][\uDFCE-\uDFFF])
| ([\ud801][\uDCA0-\uDCA9])
)
ExtendSupp = (
([\ud81b][\uDF51-\uDF7E\uDF8F-\uDF92])
| ([\ud805][\uDEAB-\uDEB7])
@ -48,12 +54,6 @@ ExtendSupp = (
| ([\udb40][\uDD00-\uDDEF])
| ([\ud802][\uDE01-\uDE03\uDE05\uDE06\uDE0C-\uDE0F\uDE38-\uDE3A\uDE3F])
)
NumericSupp = (
([\ud805][\uDEC0-\uDEC9])
| ([\ud804][\uDC66-\uDC6F\uDCF0-\uDCF9\uDD36-\uDD3F\uDDD0-\uDDD9])
| ([\ud835][\uDFCE-\uDFFF])
| ([\ud801][\uDCA0-\uDCA9])
)
KatakanaSupp = (
([\ud82c][\uDC00])
)
@ -129,3 +129,15 @@ HiraganaSupp = (
([\ud83c][\uDE00])
| ([\ud82c][\uDC01])
)
SingleQuoteSupp = (
[]
)
DoubleQuoteSupp = (
[]
)
HebrewLetterSupp = (
[]
)
RegionalIndicatorSupp = (
([\ud83c][\uDDE6-\uDDFF])
)
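
Every *Supp macro matches supplementary code points as UTF-16 surrogate pairs, because JFlex scans 16-bit char units. An illustrative check (not from the patch) of the pair behind the RegionalIndicatorSupp range above:

public class SurrogateDemo {
  public static void main(String[] args) {
    // U+1F1E6 REGIONAL INDICATOR SYMBOL LETTER A encodes as the surrogate
    // pair \uD83C \uDDE6 -- exactly the first code point matched by
    // ([\ud83c][\uDDE6-\uDDFF]).
    char[] units = Character.toChars(0x1F1E6);
    System.out.printf("%04X %04X%n", (int) units[0], (int) units[1]); // D83C DDE6
  }
}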

View File

@ -32,11 +32,13 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
* <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* <li>&lt;KATAKANA&gt;: A sequence of katakana characters</li>
* <li>&lt;HANGUL&gt;: A sequence of Hangul characters</li>
* </ul>
*/
%%
%unicode 6.1
%unicode 6.3
%integer
%final
%public
@ -47,33 +49,40 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%buffer 4096
%include SUPPLEMENTARY.jflex-macro
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
Format = ([\p{WB:Format}] | {FormatSupp})
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
Extend = ([\p{WB:Extend}] | {ExtendSupp})
Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
Han = ([\p{Script:Han}] | {HanSupp})
Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
ALetter = (\p{WB:ALetter} | {ALetterSupp})
Format = (\p{WB:Format} | {FormatSupp})
Numeric = ([\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] | {NumericSupp})
Extend = (\p{WB:Extend} | {ExtendSupp})
Katakana = (\p{WB:Katakana} | {KatakanaSupp})
MidLetter = (\p{WB:MidLetter} | {MidLetterSupp})
MidNum = (\p{WB:MidNum} | {MidNumSupp})
MidNumLet = (\p{WB:MidNumLet} | {MidNumLetSupp})
ExtendNumLet = (\p{WB:ExtendNumLet} | {ExtendNumLetSupp})
ComplexContext = (\p{LB:Complex_Context} | {ComplexContextSupp})
Han = (\p{Script:Han} | {HanSupp})
Hiragana = (\p{Script:Hiragana} | {HiraganaSupp})
SingleQuote = (\p{WB:Single_Quote} | {SingleQuoteSupp})
DoubleQuote = (\p{WB:Double_Quote} | {DoubleQuoteSupp})
HebrewLetter = (\p{WB:Hebrew_Letter} | {HebrewLetterSupp})
RegionalIndicator = (\p{WB:Regional_Indicator} | {RegionalIndicatorSupp})
HebrewOrALetter = ({HebrewLetter} | {ALetter})
// Script=Hangul & Aletter
HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
// UAX#29 WB4. X (Extend | Format)* --> X
//
ALetterEx = {ALetter} ({Format} | {Extend})*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
KatakanaEx = {Katakana} ({Format} | {Extend})*
MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] ({Format} | {Extend})*
HebrewOrALetterEx = {HebrewOrALetter} ({Format} | {Extend})*
NumericEx = {Numeric} ({Format} | {Extend})*
KatakanaEx = {Katakana} ({Format} | {Extend})*
MidLetterEx = ({MidLetter} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
HanEx = {Han} ({Format} | {Extend})*
HiraganaEx = {Hiragana} ({Format} | {Extend})*
SingleQuoteEx = {SingleQuote} ({Format} | {Extend})*
DoubleQuoteEx = {DoubleQuote} ({Format} | {Extend})*
HebrewLetterEx = {HebrewLetter} ({Format} | {Extend})*
RegionalIndicatorEx = {RegionalIndicator} ({Format} | {Extend})*
HanEx = {Han} ({Format} | {Extend})*
HiraganaEx = {Hiragana} ({Format} | {Extend})*
%{
/** Alphanumeric sequences */
@ -121,15 +130,12 @@ HiraganaEx = {Hiragana} ({Format} | {Extend})*
<<EOF>> { return StandardTokenizerInterface.YYEOF; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet) × Numeric
// WB12. Numeric × (MidNum | MidNumLet) Numeric
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
// WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
// WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
| {MidNumericEx} {NumericEx}
| {NumericEx})*
{ExtendNumLetEx}*
{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
{ return NUMERIC_TYPE; }
// subset of the below for typing purposes only!
@ -139,22 +145,32 @@ HiraganaEx = {Hiragana} ({Format} | {Extend})*
{KatakanaEx}+
{ return KATAKANA_TYPE; }
// UAX#29 WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
// WB9. ALetter × Numeric
// WB10. Numeric × ALetter
// UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
// WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
// WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
// WB7a. Hebrew_Letter × Single_Quote
// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
// WB9. (ALetter | Hebrew_Letter) × Numeric
// WB10. Numeric × (ALetter | Hebrew_Letter)
// WB13. Katakana × Katakana
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
//
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
{ExtendNumLetEx}*
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
)+
)
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
)+
)
)*
{ExtendNumLetEx}*
{ return WORD_TYPE; }
@ -166,7 +182,7 @@ HiraganaEx = {Hiragana} ({Format} | {Extend})*
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
// In Unicode 6.1, only one character has the \p{Line_Break = Contingent_Break}
// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC ( ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
@ -188,6 +204,8 @@ HiraganaEx = {Hiragana} ({Format} | {Extend})*
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB13c. Regional_Indicator × Regional_Indicator
// WB14. Any ÷ Any
//
[^] { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
{ /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
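
A hedged usage sketch — Lucene 4.x-era API assumed, not code from this patch — that surfaces the token types produced by the rules above, including the new Hebrew quote handling (WB7b/WB7c) and the numeric rules (WB11/WB12):

import java.io.StringReader;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

public class TypeDemo {
  public static void main(String[] args) throws Exception {
    StandardTokenizer ts = new StandardTokenizer(
        Version.LUCENE_CURRENT, new StringReader("3.14 דג\"ים"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // WB11/WB12 keep "3.14" a single <NUM> token; WB7b/WB7c keep the
      // gershayim inside the Hebrew word, so it stays one <ALPHANUM> token.
      System.out.println(term + "\t" + type.type());
    }
    ts.end();
    ts.close();
  }
}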

View File

@ -35,11 +35,13 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
* <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
* <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
* <li>&lt;KATAKANA&gt;: A sequence of katakana characters</li>
* <li>&lt;HANGUL&gt;: A sequence of Hangul characters</li>
* </ul>
*/
%%
%unicode 6.1
%unicode 6.3
%integer
%final
%public
@ -50,33 +52,39 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%buffer 4096
%include SUPPLEMENTARY.jflex-macro
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
Format = ([\p{WB:Format}] | {FormatSupp})
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
Extend = ([\p{WB:Extend}] | {ExtendSupp})
Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
Han = ([\p{Script:Han}] | {HanSupp})
Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
ALetter = (\p{WB:ALetter} | {ALetterSupp})
Format = (\p{WB:Format} | {FormatSupp})
Numeric = ([\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] | {NumericSupp})
Extend = (\p{WB:Extend} | {ExtendSupp})
Katakana = (\p{WB:Katakana} | {KatakanaSupp})
MidLetter = (\p{WB:MidLetter} | {MidLetterSupp})
MidNum = (\p{WB:MidNum} | {MidNumSupp})
MidNumLet = (\p{WB:MidNumLet} | {MidNumLetSupp})
ExtendNumLet = (\p{WB:ExtendNumLet} | {ExtendNumLetSupp})
ComplexContext = (\p{LB:Complex_Context} | {ComplexContextSupp})
Han = (\p{Script:Han} | {HanSupp})
Hiragana = (\p{Script:Hiragana} | {HiraganaSupp})
SingleQuote = (\p{WB:Single_Quote} | {SingleQuoteSupp})
DoubleQuote = (\p{WB:Double_Quote} | {DoubleQuoteSupp})
HebrewLetter = (\p{WB:Hebrew_Letter} | {HebrewLetterSupp})
RegionalIndicator = (\p{WB:Regional_Indicator} | {RegionalIndicatorSupp})
HebrewOrALetter = ({HebrewLetter} | {ALetter})
// Script=Hangul & Aletter
HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
// UAX#29 WB4. X (Extend | Format)* --> X
//
ALetterEx = {ALetter} ({Format} | {Extend})*
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
KatakanaEx = {Katakana} ({Format} | {Extend})*
MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
HanEx = {Han} ({Format} | {Extend})*
HiraganaEx = {Hiragana} ({Format} | {Extend})*
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] ({Format} | {Extend})*
HebrewOrALetterEx = {HebrewOrALetter} ({Format} | {Extend})*
NumericEx = {Numeric} ({Format} | {Extend})*
KatakanaEx = {Katakana} ({Format} | {Extend})*
MidLetterEx = ({MidLetter} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
MidNumericEx = ({MidNum} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
HanEx = {Han} ({Format} | {Extend})*
HiraganaEx = {Hiragana} ({Format} | {Extend})*
SingleQuoteEx = {SingleQuote} ({Format} | {Extend})*
DoubleQuoteEx = {DoubleQuote} ({Format} | {Extend})*
HebrewLetterEx = {HebrewLetter} ({Format} | {Extend})*
RegionalIndicatorEx = {RegionalIndicator} ({Format} | {Extend})*
// URL and E-mail syntax specifications:
//
@ -213,40 +221,47 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
{EMAIL} { return EMAIL_TYPE; }
// UAX#29 WB8. Numeric × Numeric
// WB11. Numeric (MidNum | MidNumLet) × Numeric
// WB12. Numeric × (MidNum | MidNumLet) Numeric
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
// WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
// WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
//
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
| {MidNumericEx} {NumericEx}
| {NumericEx})*
{ExtendNumLetEx}*
{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
{ return NUMERIC_TYPE; }
// subset of the below for typing purposes only!
{HangulEx}+
{ return HANGUL_TYPE; }
{KatakanaEx}+
{ return KATAKANA_TYPE; }
// UAX#29 WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
// WB9. ALetter × Numeric
// WB10. Numeric × ALetter
// UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
// WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
// WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
// WB7a. Hebrew_Letter × Single_Quote
// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
// WB9. (ALetter | Hebrew_Letter) × Numeric
// WB10. Numeric × (ALetter | Hebrew_Letter)
// WB13. Katakana × Katakana
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
//
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
{ExtendNumLetEx}*
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
)+
)
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
)+
)
)*
{ExtendNumLetEx}*
{ return WORD_TYPE; }
@ -258,7 +273,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
// In Unicode 6.1, only one character has the \p{Line_Break = Contingent_Break}
// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC ( ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
@ -280,6 +295,8 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB13c. Regional_Indicator × Regional_Indicator
// WB14. Any ÷ Any
//
[^] { /* Break so we don't hit fall-through warning: */ break;/* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
{ /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
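
The same Unicode 6.3 word-break changes land here on top of the URL and e-mail rules; an illustrative sketch (again assuming the 4.x-era API) of the extra token types this grammar emits:

import java.io.StringReader;
import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

public class UrlEmailDemo {
  public static void main(String[] args) throws Exception {
    UAX29URLEmailTokenizer ts = new UAX29URLEmailTokenizer(
        Version.LUCENE_CURRENT,
        new StringReader("see http://lucene.apache.org or dev@lucene.apache.org"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // The two addresses come back typed <URL> and <EMAIL> respectively.
      System.out.println(term + "\t" + type.type());
    }
    ts.end();
    ts.close();
  }
}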

View File

@ -133,8 +133,8 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_50, reader) : factory.create(reader);
TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_50, tokenizer) : tokenizer;
Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader) : factory.create(reader);
TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer) : tokenizer;
return new TokenStreamComponents(tokenizer, stream);
}
};
@ -201,7 +201,7 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
private Analyzer loadAnalyzer(ResourceLoader loader, String cname) throws IOException {
Class<? extends Analyzer> clazz = loader.findClass(cname, Analyzer.class);
try {
Analyzer analyzer = clazz.getConstructor(Version.class).newInstance(Version.LUCENE_50);
Analyzer analyzer = clazz.getConstructor(Version.class).newInstance(Version.LUCENE_CURRENT);
if (analyzer instanceof ResourceLoaderAware) {
((ResourceLoaderAware) analyzer).inform(loader);
}
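
For reference, a minimal sketch of the reflective construction used in loadAnalyzer above; SimpleAnalyzer stands in here for any analyzer exposing the conventional single-argument (Version) constructor:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.util.Version;

public class LoadAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    // Resolve the class, then invoke its (Version) constructor reflectively,
    // as SynonymFilterFactory does for the "analyzer" init argument.
    Class<? extends Analyzer> clazz = SimpleAnalyzer.class;
    Analyzer analyzer =
        clazz.getConstructor(Version.class).newInstance(Version.LUCENE_CURRENT);
    System.out.println(analyzer.getClass().getSimpleName());
  }
}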

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex. */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT */
package org.apache.lucene.analysis.wikipedia;
@ -84,21 +84,20 @@ class WikipediaTokenizerImpl {
private static final int [] ZZ_ACTION = zzUnpackAction();
private static final String ZZ_ACTION_PACKED_0 =
"\12\0\4\1\4\2\1\3\1\1\1\4\1\1\2\5"+
"\1\6\2\5\1\7\1\5\2\10\1\11\1\12\1\11"+
"\1\13\1\14\1\10\1\15\1\16\1\15\1\17\1\20"+
"\1\10\1\21\1\10\4\22\1\23\1\22\1\24\1\25"+
"\1\26\3\0\1\27\14\0\1\30\1\31\1\32\1\33"+
"\1\11\1\0\1\34\1\35\1\36\1\0\1\37\1\0"+
"\1\40\3\0\1\41\1\42\2\43\1\42\2\44\2\0"+
"\1\43\1\0\14\43\1\42\3\0\1\11\1\45\3\0"+
"\1\46\1\47\5\0\1\50\4\0\1\50\2\0\2\50"+
"\2\0\1\11\5\0\1\31\1\42\1\43\1\51\3\0"+
"\1\11\2\0\1\52\30\0\1\53\2\0\1\54\1\55"+
"\1\56";
"\12\0\4\1\4\2\1\3\1\4\1\1\2\5\1\6"+
"\1\5\1\7\1\5\2\10\1\11\1\5\1\12\1\11"+
"\1\13\1\14\1\15\1\16\1\15\1\17\1\20\1\10"+
"\1\21\1\10\4\22\1\23\1\24\1\25\1\26\3\0"+
"\1\27\14\0\1\30\1\31\1\32\1\33\1\11\1\0"+
"\1\34\1\35\1\36\1\0\1\37\1\0\1\40\3\0"+
"\1\41\1\42\2\43\1\42\2\44\2\0\1\43\1\0"+
"\14\43\1\42\3\0\1\11\1\45\3\0\1\46\1\47"+
"\5\0\1\50\4\0\1\50\2\0\2\50\2\0\1\11"+
"\5\0\1\31\1\42\1\43\1\51\3\0\1\11\2\0"+
"\1\52\30\0\1\53\2\0\1\54\1\55\1\56";
private static int [] zzUnpackAction() {
int [] result = new int[184];
int [] result = new int[181];
int offset = 0;
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
return result;
@ -125,30 +124,30 @@ class WikipediaTokenizerImpl {
private static final String ZZ_ROWMAP_PACKED_0 =
"\0\0\0\54\0\130\0\204\0\260\0\334\0\u0108\0\u0134"+
"\0\u0160\0\u018c\0\u01b8\0\u01e4\0\u0210\0\u023c\0\u0268\0\u0294"+
"\0\u02c0\0\u02ec\0\u01b8\0\u0318\0\u0344\0\u0370\0\u01b8\0\u039c"+
"\0\u03c8\0\u03f4\0\u0420\0\u044c\0\u0478\0\u01b8\0\u039c\0\u04a4"+
"\0\u01b8\0\u04d0\0\u04fc\0\u0528\0\u0554\0\u0580\0\u05ac\0\u05d8"+
"\0\u0604\0\u0630\0\u065c\0\u0688\0\u06b4\0\u01b8\0\u06e0\0\u039c"+
"\0\u070c\0\u0738\0\u0764\0\u0790\0\u01b8\0\u01b8\0\u07bc\0\u07e8"+
"\0\u0814\0\u01b8\0\u0840\0\u086c\0\u0898\0\u08c4\0\u08f0\0\u091c"+
"\0\u0948\0\u0974\0\u09a0\0\u09cc\0\u09f8\0\u0a24\0\u0a50\0\u0a7c"+
"\0\u01b8\0\u01b8\0\u0aa8\0\u0ad4\0\u0b00\0\u0b00\0\u01b8\0\u0b2c"+
"\0\u02c0\0\u02ec\0\u01b8\0\u0318\0\u0344\0\u01b8\0\u0370\0\u039c"+
"\0\u03c8\0\u03f4\0\u0420\0\u01b8\0\u0370\0\u044c\0\u0478\0\u01b8"+
"\0\u04a4\0\u04d0\0\u04fc\0\u0528\0\u0554\0\u0580\0\u05ac\0\u05d8"+
"\0\u0604\0\u0630\0\u065c\0\u01b8\0\u0688\0\u0370\0\u06b4\0\u06e0"+
"\0\u070c\0\u01b8\0\u01b8\0\u0738\0\u0764\0\u0790\0\u01b8\0\u07bc"+
"\0\u07e8\0\u0814\0\u0840\0\u086c\0\u0898\0\u08c4\0\u08f0\0\u091c"+
"\0\u0948\0\u0974\0\u09a0\0\u09cc\0\u09f8\0\u01b8\0\u01b8\0\u0a24"+
"\0\u0a50\0\u0a7c\0\u0a7c\0\u01b8\0\u0aa8\0\u0ad4\0\u0b00\0\u0b2c"+
"\0\u0b58\0\u0b84\0\u0bb0\0\u0bdc\0\u0c08\0\u0c34\0\u0c60\0\u0c8c"+
"\0\u0cb8\0\u0ce4\0\u0d10\0\u0898\0\u0d3c\0\u0d68\0\u0d94\0\u0dc0"+
"\0\u0814\0\u0cb8\0\u0ce4\0\u0d10\0\u0d3c\0\u0d68\0\u0d94\0\u0dc0"+
"\0\u0dec\0\u0e18\0\u0e44\0\u0e70\0\u0e9c\0\u0ec8\0\u0ef4\0\u0f20"+
"\0\u0f4c\0\u0f78\0\u0fa4\0\u0fd0\0\u0ffc\0\u1028\0\u1054\0\u1080"+
"\0\u10ac\0\u10d8\0\u01b8\0\u1104\0\u1130\0\u115c\0\u1188\0\u01b8"+
"\0\u0f4c\0\u0f78\0\u0fa4\0\u0fd0\0\u0ffc\0\u1028\0\u1054\0\u01b8"+
"\0\u1080\0\u10ac\0\u10d8\0\u1104\0\u01b8\0\u1130\0\u115c\0\u1188"+
"\0\u11b4\0\u11e0\0\u120c\0\u1238\0\u1264\0\u1290\0\u12bc\0\u12e8"+
"\0\u1314\0\u1340\0\u136c\0\u1398\0\u13c4\0\u086c\0\u09f8\0\u13f0"+
"\0\u141c\0\u1448\0\u1474\0\u14a0\0\u14cc\0\u14f8\0\u1524\0\u01b8"+
"\0\u1550\0\u157c\0\u15a8\0\u15d4\0\u1600\0\u162c\0\u1658\0\u1684"+
"\0\u16b0\0\u01b8\0\u16dc\0\u1708\0\u1734\0\u1760\0\u178c\0\u17b8"+
"\0\u1314\0\u1340\0\u07e8\0\u0974\0\u136c\0\u1398\0\u13c4\0\u13f0"+
"\0\u141c\0\u1448\0\u1474\0\u14a0\0\u01b8\0\u14cc\0\u14f8\0\u1524"+
"\0\u1550\0\u157c\0\u15a8\0\u15d4\0\u1600\0\u162c\0\u01b8\0\u1658"+
"\0\u1684\0\u16b0\0\u16dc\0\u1708\0\u1734\0\u1760\0\u178c\0\u17b8"+
"\0\u17e4\0\u1810\0\u183c\0\u1868\0\u1894\0\u18c0\0\u18ec\0\u1918"+
"\0\u1944\0\u1970\0\u199c\0\u19c8\0\u19f4\0\u1a20\0\u1a4c\0\u1a78"+
"\0\u1aa4\0\u1ad0\0\u1afc\0\u1b28\0\u1b54\0\u01b8\0\u01b8\0\u01b8";
"\0\u1aa4\0\u1ad0\0\u01b8\0\u01b8\0\u01b8";
private static int [] zzUnpackRowMap() {
int [] result = new int[184];
int [] result = new int[181];
int offset = 0;
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
return result;
@ -172,152 +171,149 @@ class WikipediaTokenizerImpl {
private static final String ZZ_TRANS_PACKED_0 =
"\1\13\1\14\5\13\1\15\1\13\1\16\3\13\1\17"+
"\1\20\1\21\1\22\1\23\1\24\2\13\1\25\2\13"+
"\15\17\1\26\2\13\3\17\1\13\7\27\1\30\5\27"+
"\4\31\1\27\1\32\3\27\1\33\1\27\15\31\3\27"+
"\3\31\10\27\1\30\5\27\4\34\1\27\1\32\3\27"+
"\1\35\1\27\15\34\3\27\3\34\1\27\7\36\1\37"+
"\5\36\4\40\1\36\1\32\2\27\1\36\1\41\1\36"+
"\15\40\3\36\1\42\2\40\2\36\1\43\5\36\1\37"+
"\5\36\4\44\1\36\1\45\2\36\1\46\2\36\15\44"+
"\3\36\3\44\10\36\1\37\5\36\4\47\1\36\1\45"+
"\2\36\1\46\2\36\15\47\3\36\3\47\10\36\1\37"+
"\5\36\4\47\1\36\1\45\2\36\1\50\2\36\15\47"+
"\3\36\3\47\10\36\1\37\1\36\1\51\3\36\4\52"+
"\1\36\1\45\5\36\15\52\3\36\3\52\10\36\1\53"+
"\5\36\4\54\1\36\1\45\5\36\15\54\1\36\1\55"+
"\1\36\3\54\1\36\1\56\1\57\5\56\1\60\1\56"+
"\1\61\3\56\4\62\1\56\1\63\2\56\1\64\2\56"+
"\15\62\2\56\1\65\3\62\1\56\55\0\1\66\62\0"+
"\1\67\4\0\4\70\7\0\6\70\1\71\6\70\3\0"+
"\3\70\12\0\1\72\43\0\1\73\1\74\1\75\1\76"+
"\2\77\1\0\1\100\3\0\1\100\1\17\1\20\1\21"+
"\1\22\7\0\15\17\3\0\3\17\3\0\1\101\1\0"+
"\1\102\2\103\1\0\1\104\3\0\1\104\3\20\1\22"+
"\7\0\15\20\3\0\3\20\2\0\1\73\1\105\1\75"+
"\1\76\2\103\1\0\1\104\3\0\1\104\1\21\1\20"+
"\1\21\1\22\7\0\15\21\3\0\3\21\3\0\1\106"+
"\1\0\1\102\2\77\1\0\1\100\3\0\1\100\4\22"+
"\7\0\15\22\3\0\3\22\24\0\1\13\55\0\1\107"+
"\73\0\1\110\16\0\1\67\4\0\4\70\7\0\15\70"+
"\3\0\3\70\16\0\4\31\7\0\15\31\3\0\3\31"+
"\24\0\1\27\56\0\1\111\42\0\4\34\7\0\15\34"+
"\3\0\3\34\27\0\1\112\42\0\4\40\7\0\15\40"+
"\3\0\3\40\16\0\4\40\7\0\2\40\1\113\12\40"+
"\3\0\3\40\2\0\1\114\67\0\4\44\7\0\15\44"+
"\3\0\3\44\24\0\1\36\55\0\1\115\43\0\4\47"+
"\7\0\15\47\3\0\3\47\26\0\1\116\37\0\1\117"+
"\57\0\4\52\7\0\15\52\3\0\3\52\11\0\1\120"+
"\4\0\4\70\7\0\15\70\3\0\3\70\16\0\4\54"+
"\7\0\15\54\3\0\3\54\47\0\1\117\6\0\1\121"+
"\63\0\1\122\57\0\4\62\7\0\15\62\3\0\3\62"+
"\24\0\1\56\55\0\1\123\43\0\4\70\7\0\15\70"+
"\3\0\3\70\14\0\1\36\1\0\4\124\1\0\3\125"+
"\3\0\15\124\3\0\3\124\14\0\1\36\1\0\4\124"+
"\1\0\3\125\3\0\3\124\1\126\11\124\3\0\3\124"+
"\16\0\1\127\1\0\1\127\10\0\15\127\3\0\3\127"+
"\16\0\1\130\1\131\1\132\1\133\7\0\15\130\3\0"+
"\3\130\16\0\1\134\1\0\1\134\10\0\15\134\3\0"+
"\3\134\16\0\1\135\1\136\1\135\1\136\7\0\15\135"+
"\3\0\3\135\16\0\1\137\2\140\1\141\7\0\15\137"+
"\3\0\3\137\16\0\1\100\2\142\10\0\15\100\3\0"+
"\3\100\16\0\1\143\2\144\1\145\7\0\15\143\3\0"+
"\3\143\16\0\4\136\7\0\15\136\3\0\3\136\16\0"+
"\1\146\2\147\1\150\7\0\15\146\3\0\3\146\16\0"+
"\1\151\2\152\1\153\7\0\15\151\3\0\3\151\16\0"+
"\1\154\1\144\1\155\1\145\7\0\15\154\3\0\3\154"+
"\16\0\1\156\2\131\1\133\7\0\15\156\3\0\3\156"+
"\30\0\1\157\1\160\64\0\1\161\27\0\4\40\7\0"+
"\2\40\1\162\12\40\3\0\3\40\2\0\1\163\101\0"+
"\1\164\1\165\40\0\4\70\7\0\6\70\1\166\6\70"+
"\3\0\3\70\2\0\1\167\63\0\1\170\71\0\1\171"+
"\1\172\34\0\1\173\1\0\1\36\1\0\4\124\1\0"+
"\3\125\3\0\15\124\3\0\3\124\16\0\4\174\1\0"+
"\3\125\3\0\15\174\3\0\3\174\12\0\1\173\1\0"+
"\1\36\1\0\4\124\1\0\3\125\3\0\10\124\1\175"+
"\4\124\3\0\3\124\2\0\1\73\13\0\1\127\1\0"+
"\1\127\10\0\15\127\3\0\3\127\3\0\1\176\1\0"+
"\1\102\2\177\6\0\1\130\1\131\1\132\1\133\7\0"+
"\15\130\3\0\3\130\3\0\1\200\1\0\1\102\2\201"+
"\1\0\1\202\3\0\1\202\3\131\1\133\7\0\15\131"+
"\3\0\3\131\3\0\1\203\1\0\1\102\2\201\1\0"+
"\1\202\3\0\1\202\1\132\1\131\1\132\1\133\7\0"+
"\15\132\3\0\3\132\3\0\1\204\1\0\1\102\2\177"+
"\6\0\4\133\7\0\15\133\3\0\3\133\3\0\1\205"+
"\2\0\1\205\7\0\1\135\1\136\1\135\1\136\7\0"+
"\15\135\3\0\3\135\3\0\1\205\2\0\1\205\7\0"+
"\4\136\7\0\15\136\3\0\3\136\3\0\1\177\1\0"+
"\1\102\2\177\6\0\1\137\2\140\1\141\7\0\15\137"+
"\3\0\3\137\3\0\1\201\1\0\1\102\2\201\1\0"+
"\1\202\3\0\1\202\3\140\1\141\7\0\15\140\3\0"+
"\3\140\3\0\1\177\1\0\1\102\2\177\6\0\4\141"+
"\7\0\15\141\3\0\3\141\3\0\1\202\2\0\2\202"+
"\1\0\1\202\3\0\1\202\3\142\10\0\15\142\3\0"+
"\3\142\3\0\1\106\1\0\1\102\2\77\1\0\1\100"+
"\3\0\1\100\1\143\2\144\1\145\7\0\15\143\3\0"+
"\3\143\3\0\1\101\1\0\1\102\2\103\1\0\1\104"+
"\3\0\1\104\3\144\1\145\7\0\15\144\3\0\3\144"+
"\3\0\1\106\1\0\1\102\2\77\1\0\1\100\3\0"+
"\1\100\4\145\7\0\15\145\3\0\3\145\3\0\1\77"+
"\1\0\1\102\2\77\1\0\1\100\3\0\1\100\1\146"+
"\2\147\1\150\7\0\15\146\3\0\3\146\3\0\1\103"+
"\1\0\1\102\2\103\1\0\1\104\3\0\1\104\3\147"+
"\1\150\7\0\15\147\3\0\3\147\3\0\1\77\1\0"+
"\1\102\2\77\1\0\1\100\3\0\1\100\4\150\7\0"+
"\15\150\3\0\3\150\3\0\1\100\2\0\2\100\1\0"+
"\1\100\3\0\1\100\1\151\2\152\1\153\7\0\15\151"+
"\3\0\3\151\3\0\1\104\2\0\2\104\1\0\1\104"+
"\3\0\1\104\3\152\1\153\7\0\15\152\3\0\3\152"+
"\3\0\1\100\2\0\2\100\1\0\1\100\3\0\1\100"+
"\4\153\7\0\15\153\3\0\3\153\3\0\1\206\1\0"+
"\1\102\2\77\1\0\1\100\3\0\1\100\1\154\1\144"+
"\1\155\1\145\7\0\15\154\3\0\3\154\3\0\1\207"+
"\1\0\1\102\2\103\1\0\1\104\3\0\1\104\1\155"+
"\1\144\1\155\1\145\7\0\15\155\3\0\3\155\3\0"+
"\1\204\1\0\1\102\2\177\6\0\1\156\2\131\1\133"+
"\7\0\15\156\3\0\3\156\31\0\1\160\54\0\1\210"+
"\64\0\1\211\26\0\4\40\7\0\15\40\3\0\1\40"+
"\1\212\1\40\31\0\1\165\54\0\1\213\35\0\1\36"+
"\1\0\4\124\1\0\3\125\3\0\3\124\1\214\11\124"+
"\3\0\3\124\2\0\1\215\102\0\1\172\54\0\1\216"+
"\34\0\1\217\52\0\1\173\3\0\4\174\7\0\15\174"+
"\3\0\3\174\12\0\1\173\1\0\1\220\1\0\4\124"+
"\1\0\3\125\3\0\15\124\3\0\3\124\16\0\1\221"+
"\1\133\1\221\1\133\7\0\15\221\3\0\3\221\16\0"+
"\4\141\7\0\15\141\3\0\3\141\16\0\4\145\7\0"+
"\15\145\3\0\3\145\16\0\4\150\7\0\15\150\3\0"+
"\3\150\16\0\4\153\7\0\15\153\3\0\3\153\16\0"+
"\1\222\1\145\1\222\1\145\7\0\15\222\3\0\3\222"+
"\16\0\4\133\7\0\15\133\3\0\3\133\16\0\4\223"+
"\7\0\15\223\3\0\3\223\33\0\1\224\61\0\1\225"+
"\30\0\4\40\6\0\1\226\15\40\3\0\2\40\1\227"+
"\33\0\1\230\32\0\1\173\1\0\1\36\1\0\4\124"+
"\1\0\3\125\3\0\10\124\1\231\4\124\3\0\3\124"+
"\2\0\1\232\104\0\1\233\36\0\4\234\7\0\15\234"+
"\3\0\3\234\3\0\1\176\1\0\1\102\2\177\6\0"+
"\1\221\1\133\1\221\1\133\7\0\15\221\3\0\3\221"+
"\3\0\1\206\1\0\1\102\2\77\1\0\1\100\3\0"+
"\1\100\1\222\1\145\1\222\1\145\7\0\15\222\3\0"+
"\3\222\3\0\1\205\2\0\1\205\7\0\4\223\7\0"+
"\15\223\3\0\3\223\34\0\1\235\55\0\1\236\26\0"+
"\1\237\60\0\4\40\6\0\1\226\15\40\3\0\3\40"+
"\34\0\1\240\31\0\1\173\1\0\1\117\1\0\4\124"+
"\1\0\3\125\3\0\15\124\3\0\3\124\34\0\1\241"+
"\32\0\1\242\2\0\4\234\7\0\15\234\3\0\3\234"+
"\35\0\1\243\62\0\1\244\20\0\1\245\77\0\1\246"+
"\53\0\1\247\32\0\1\36\1\0\4\174\1\0\3\125"+
"\3\0\15\174\3\0\3\174\36\0\1\250\53\0\1\251"+
"\33\0\4\252\7\0\15\252\3\0\3\252\36\0\1\253"+
"\53\0\1\254\54\0\1\255\61\0\1\256\11\0\1\257"+
"\12\0\4\252\7\0\15\252\3\0\3\252\37\0\1\260"+
"\53\0\1\261\54\0\1\262\22\0\1\13\62\0\4\263"+
"\7\0\15\263\3\0\3\263\40\0\1\264\53\0\1\265"+
"\43\0\1\266\26\0\2\263\1\0\2\263\1\0\2\263"+
"\2\0\5\263\7\0\15\263\3\0\4\263\27\0\1\267"+
"\53\0\1\270\24\0";
"\1\20\1\21\1\22\1\23\3\13\1\24\2\13\15\17"+
"\1\25\2\13\3\17\1\13\7\26\1\27\5\26\4\30"+
"\5\26\1\31\1\26\15\30\3\26\3\30\10\26\1\27"+
"\5\26\4\32\5\26\1\33\1\26\15\32\3\26\3\32"+
"\1\26\7\34\1\35\5\34\4\36\1\34\1\37\2\26"+
"\1\34\1\40\1\34\15\36\3\34\1\41\2\36\2\34"+
"\1\42\5\34\1\35\5\34\4\43\4\34\1\44\2\34"+
"\15\43\3\34\3\43\10\34\1\35\5\34\4\45\4\34"+
"\1\44\2\34\15\45\3\34\3\45\10\34\1\35\5\34"+
"\4\45\4\34\1\46\2\34\15\45\3\34\3\45\10\34"+
"\1\35\1\34\1\47\3\34\4\50\7\34\15\50\3\34"+
"\3\50\10\34\1\51\5\34\4\52\7\34\15\52\1\34"+
"\1\53\1\34\3\52\1\34\1\54\1\55\5\54\1\56"+
"\1\54\1\57\3\54\4\60\4\54\1\61\2\54\15\60"+
"\2\54\1\62\3\60\1\54\55\0\1\63\62\0\1\64"+
"\4\0\4\65\7\0\6\65\1\66\6\65\3\0\3\65"+
"\12\0\1\67\43\0\1\70\1\71\1\72\1\73\2\74"+
"\1\0\1\75\3\0\1\75\1\17\1\20\1\21\1\22"+
"\7\0\15\17\3\0\3\17\3\0\1\76\1\0\1\77"+
"\2\100\1\0\1\101\3\0\1\101\3\20\1\22\7\0"+
"\15\20\3\0\3\20\2\0\1\70\1\102\1\72\1\73"+
"\2\100\1\0\1\101\3\0\1\101\1\21\1\20\1\21"+
"\1\22\7\0\15\21\3\0\3\21\3\0\1\103\1\0"+
"\1\77\2\74\1\0\1\75\3\0\1\75\4\22\7\0"+
"\15\22\3\0\3\22\26\0\1\104\73\0\1\105\16\0"+
"\1\64\4\0\4\65\7\0\15\65\3\0\3\65\16\0"+
"\4\30\7\0\15\30\3\0\3\30\27\0\1\106\42\0"+
"\4\32\7\0\15\32\3\0\3\32\27\0\1\107\42\0"+
"\4\36\7\0\15\36\3\0\3\36\24\0\1\26\45\0"+
"\4\36\7\0\2\36\1\110\12\36\3\0\3\36\2\0"+
"\1\111\67\0\4\43\7\0\15\43\3\0\3\43\26\0"+
"\1\112\43\0\4\45\7\0\15\45\3\0\3\45\26\0"+
"\1\113\37\0\1\114\57\0\4\50\7\0\15\50\3\0"+
"\3\50\11\0\1\115\4\0\4\65\7\0\15\65\3\0"+
"\3\65\16\0\4\52\7\0\15\52\3\0\3\52\47\0"+
"\1\114\6\0\1\116\63\0\1\117\57\0\4\60\7\0"+
"\15\60\3\0\3\60\26\0\1\120\43\0\4\65\7\0"+
"\15\65\3\0\3\65\14\0\1\34\1\0\4\121\1\0"+
"\3\122\3\0\15\121\3\0\3\121\14\0\1\34\1\0"+
"\4\121\1\0\3\122\3\0\3\121\1\123\11\121\3\0"+
"\3\121\16\0\1\124\1\0\1\124\10\0\15\124\3\0"+
"\3\124\16\0\1\125\1\126\1\127\1\130\7\0\15\125"+
"\3\0\3\125\16\0\1\131\1\0\1\131\10\0\15\131"+
"\3\0\3\131\16\0\1\132\1\133\1\132\1\133\7\0"+
"\15\132\3\0\3\132\16\0\1\134\2\135\1\136\7\0"+
"\15\134\3\0\3\134\16\0\1\75\2\137\10\0\15\75"+
"\3\0\3\75\16\0\1\140\2\141\1\142\7\0\15\140"+
"\3\0\3\140\16\0\4\133\7\0\15\133\3\0\3\133"+
"\16\0\1\143\2\144\1\145\7\0\15\143\3\0\3\143"+
"\16\0\1\146\2\147\1\150\7\0\15\146\3\0\3\146"+
"\16\0\1\151\1\141\1\152\1\142\7\0\15\151\3\0"+
"\3\151\16\0\1\153\2\126\1\130\7\0\15\153\3\0"+
"\3\153\30\0\1\154\1\155\64\0\1\156\27\0\4\36"+
"\7\0\2\36\1\157\12\36\3\0\3\36\2\0\1\160"+
"\101\0\1\161\1\162\40\0\4\65\7\0\6\65\1\163"+
"\6\65\3\0\3\65\2\0\1\164\63\0\1\165\71\0"+
"\1\166\1\167\34\0\1\170\1\0\1\34\1\0\4\121"+
"\1\0\3\122\3\0\15\121\3\0\3\121\16\0\4\171"+
"\1\0\3\122\3\0\15\171\3\0\3\171\12\0\1\170"+
"\1\0\1\34\1\0\4\121\1\0\3\122\3\0\10\121"+
"\1\172\4\121\3\0\3\121\2\0\1\70\13\0\1\124"+
"\1\0\1\124\10\0\15\124\3\0\3\124\3\0\1\173"+
"\1\0\1\77\2\174\6\0\1\125\1\126\1\127\1\130"+
"\7\0\15\125\3\0\3\125\3\0\1\175\1\0\1\77"+
"\2\176\1\0\1\177\3\0\1\177\3\126\1\130\7\0"+
"\15\126\3\0\3\126\3\0\1\200\1\0\1\77\2\176"+
"\1\0\1\177\3\0\1\177\1\127\1\126\1\127\1\130"+
"\7\0\15\127\3\0\3\127\3\0\1\201\1\0\1\77"+
"\2\174\6\0\4\130\7\0\15\130\3\0\3\130\3\0"+
"\1\202\2\0\1\202\7\0\1\132\1\133\1\132\1\133"+
"\7\0\15\132\3\0\3\132\3\0\1\202\2\0\1\202"+
"\7\0\4\133\7\0\15\133\3\0\3\133\3\0\1\174"+
"\1\0\1\77\2\174\6\0\1\134\2\135\1\136\7\0"+
"\15\134\3\0\3\134\3\0\1\176\1\0\1\77\2\176"+
"\1\0\1\177\3\0\1\177\3\135\1\136\7\0\15\135"+
"\3\0\3\135\3\0\1\174\1\0\1\77\2\174\6\0"+
"\4\136\7\0\15\136\3\0\3\136\3\0\1\177\2\0"+
"\2\177\1\0\1\177\3\0\1\177\3\137\10\0\15\137"+
"\3\0\3\137\3\0\1\103\1\0\1\77\2\74\1\0"+
"\1\75\3\0\1\75\1\140\2\141\1\142\7\0\15\140"+
"\3\0\3\140\3\0\1\76\1\0\1\77\2\100\1\0"+
"\1\101\3\0\1\101\3\141\1\142\7\0\15\141\3\0"+
"\3\141\3\0\1\103\1\0\1\77\2\74\1\0\1\75"+
"\3\0\1\75\4\142\7\0\15\142\3\0\3\142\3\0"+
"\1\74\1\0\1\77\2\74\1\0\1\75\3\0\1\75"+
"\1\143\2\144\1\145\7\0\15\143\3\0\3\143\3\0"+
"\1\100\1\0\1\77\2\100\1\0\1\101\3\0\1\101"+
"\3\144\1\145\7\0\15\144\3\0\3\144\3\0\1\74"+
"\1\0\1\77\2\74\1\0\1\75\3\0\1\75\4\145"+
"\7\0\15\145\3\0\3\145\3\0\1\75\2\0\2\75"+
"\1\0\1\75\3\0\1\75\1\146\2\147\1\150\7\0"+
"\15\146\3\0\3\146\3\0\1\101\2\0\2\101\1\0"+
"\1\101\3\0\1\101\3\147\1\150\7\0\15\147\3\0"+
"\3\147\3\0\1\75\2\0\2\75\1\0\1\75\3\0"+
"\1\75\4\150\7\0\15\150\3\0\3\150\3\0\1\203"+
"\1\0\1\77\2\74\1\0\1\75\3\0\1\75\1\151"+
"\1\141\1\152\1\142\7\0\15\151\3\0\3\151\3\0"+
"\1\204\1\0\1\77\2\100\1\0\1\101\3\0\1\101"+
"\1\152\1\141\1\152\1\142\7\0\15\152\3\0\3\152"+
"\3\0\1\201\1\0\1\77\2\174\6\0\1\153\2\126"+
"\1\130\7\0\15\153\3\0\3\153\31\0\1\155\54\0"+
"\1\205\64\0\1\206\26\0\4\36\7\0\15\36\3\0"+
"\1\36\1\207\1\36\31\0\1\162\54\0\1\210\35\0"+
"\1\34\1\0\4\121\1\0\3\122\3\0\3\121\1\211"+
"\11\121\3\0\3\121\2\0\1\212\102\0\1\167\54\0"+
"\1\213\34\0\1\214\52\0\1\170\3\0\4\171\7\0"+
"\15\171\3\0\3\171\12\0\1\170\1\0\1\215\1\0"+
"\4\121\1\0\3\122\3\0\15\121\3\0\3\121\16\0"+
"\1\216\1\130\1\216\1\130\7\0\15\216\3\0\3\216"+
"\16\0\4\136\7\0\15\136\3\0\3\136\16\0\4\142"+
"\7\0\15\142\3\0\3\142\16\0\4\145\7\0\15\145"+
"\3\0\3\145\16\0\4\150\7\0\15\150\3\0\3\150"+
"\16\0\1\217\1\142\1\217\1\142\7\0\15\217\3\0"+
"\3\217\16\0\4\130\7\0\15\130\3\0\3\130\16\0"+
"\4\220\7\0\15\220\3\0\3\220\33\0\1\221\61\0"+
"\1\222\30\0\4\36\6\0\1\223\15\36\3\0\2\36"+
"\1\224\33\0\1\225\32\0\1\170\1\0\1\34\1\0"+
"\4\121\1\0\3\122\3\0\10\121\1\226\4\121\3\0"+
"\3\121\2\0\1\227\104\0\1\230\36\0\4\231\7\0"+
"\15\231\3\0\3\231\3\0\1\173\1\0\1\77\2\174"+
"\6\0\1\216\1\130\1\216\1\130\7\0\15\216\3\0"+
"\3\216\3\0\1\203\1\0\1\77\2\74\1\0\1\75"+
"\3\0\1\75\1\217\1\142\1\217\1\142\7\0\15\217"+
"\3\0\3\217\3\0\1\202\2\0\1\202\7\0\4\220"+
"\7\0\15\220\3\0\3\220\34\0\1\232\55\0\1\233"+
"\26\0\1\234\60\0\4\36\6\0\1\223\15\36\3\0"+
"\3\36\34\0\1\235\31\0\1\170\1\0\1\114\1\0"+
"\4\121\1\0\3\122\3\0\15\121\3\0\3\121\34\0"+
"\1\236\32\0\1\237\2\0\4\231\7\0\15\231\3\0"+
"\3\231\35\0\1\240\62\0\1\241\20\0\1\242\77\0"+
"\1\243\53\0\1\244\32\0\1\34\1\0\4\171\1\0"+
"\3\122\3\0\15\171\3\0\3\171\36\0\1\245\53\0"+
"\1\246\33\0\4\247\7\0\15\247\3\0\3\247\36\0"+
"\1\250\53\0\1\251\54\0\1\252\61\0\1\253\11\0"+
"\1\254\12\0\4\247\7\0\15\247\3\0\3\247\37\0"+
"\1\255\53\0\1\256\54\0\1\257\22\0\1\13\62\0"+
"\4\260\7\0\15\260\3\0\3\260\40\0\1\261\53\0"+
"\1\262\43\0\1\263\26\0\2\260\1\0\2\260\1\0"+
"\2\260\2\0\5\260\7\0\15\260\3\0\4\260\27\0"+
"\1\264\53\0\1\265\24\0";
private static int [] zzUnpackTrans() {
int [] result = new int[7040];
int [] result = new int[6908];
int offset = 0;
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
return result;
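For context on the 7040-to-6908 change above: JFlex emits its DFA tables as run-length-encoded strings of (count, value) char pairs, so regenerating the scanner changes both the packed data and the expected array size. A minimal sketch of that decoding, assuming the standard layout of JFlex-generated zzUnpack methods (a hypothetical helper, not part of this patch):

// Hypothetical sketch of JFlex's run-length decoding (not from this patch).
// Each pair in the packed string is (repeat count, value); for the transition
// table the stored value is offset by one so that char 0 can encode -1,
// meaning "no transition".
private static int unpackRle(String packed, int offset, int[] result) {
  int i = 0;       // index into the packed string
  int j = offset;  // index into the unpacked array
  while (i < packed.length()) {
    int count = packed.charAt(i++);
    int value = packed.charAt(i++) - 1;
    do { result[j++] = value; } while (--count > 0);
  }
  return j;        // next free offset, mirroring how zzUnpackTrans chains calls
}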
@ -355,8 +351,8 @@ class WikipediaTokenizerImpl {
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
private static final String ZZ_ATTRIBUTE_PACKED_0 =
"\12\0\1\11\7\1\1\11\3\1\1\11\6\1\1\11"+
"\2\1\1\11\14\1\1\11\6\1\2\11\3\0\1\11"+
"\12\0\1\11\7\1\1\11\2\1\1\11\5\1\1\11"+
"\3\1\1\11\13\1\1\11\5\1\2\11\3\0\1\11"+
"\14\0\2\1\2\11\1\1\1\0\2\1\1\11\1\0"+
"\1\1\1\0\1\1\3\0\7\1\2\0\1\1\1\0"+
"\15\1\3\0\1\1\1\11\3\0\1\1\1\11\5\0"+
@ -365,7 +361,7 @@ class WikipediaTokenizerImpl {
"\2\0\3\11";
private static int [] zzUnpackAttribute() {
int [] result = new int[184];
int [] result = new int[181];
int offset = 0;
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
return result;
@ -508,7 +504,6 @@ final void reset() {
/**
* Creates a new scanner
* There is also a java.io.InputStream version of this constructor.
*
* @param in the java.io.Reader to read input from.
*/
@ -516,7 +511,6 @@ final void reset() {
this.zzReader = in;
}
/**
* Unpacks the compressed character translation table.

View File

@ -212,7 +212,7 @@ DOUBLE_EQUALS = "="{2}
{DOUBLE_BRACE} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;}
{CITATION} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} |{INFOBOX} {numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
[^] |{INFOBOX} {numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
}
<INTERNAL_LINK_STATE>{
@ -221,7 +221,7 @@ DOUBLE_EQUALS = "="{2}
{ALPHANUM} {yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;}
{DOUBLE_BRACKET_CLOSE} {numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
[^] { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
}
<EXTERNAL_LINK_STATE>{
@ -236,7 +236,7 @@ DOUBLE_EQUALS = "="{2}
{ALPHANUM} {yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;}
{DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
[^] { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
}
//italics
<TWO_SINGLE_QUOTES_STATE>{
@ -249,7 +249,7 @@ DOUBLE_EQUALS = "="{2}
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
}
//bold
<THREE_SINGLE_QUOTES_STATE>{
@ -260,7 +260,7 @@ DOUBLE_EQUALS = "="{2}
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
}
//bold italics
@ -272,7 +272,7 @@ DOUBLE_EQUALS = "="{2}
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
}
<DOUBLE_EQUALS_STATE>{
@ -280,15 +280,15 @@ DOUBLE_EQUALS = "="{2}
{ALPHANUM} {currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;}
{DOUBLE_EQUALS} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
}
<DOUBLE_BRACE_STATE>{
{ALPHANUM} {yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;}
{DOUBLE_BRACE_CLOSE} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
{CITATION_CLOSE} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
//ignore
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
}
<STRING> {
@ -305,7 +305,7 @@ DOUBLE_EQUALS = "="{2}
{PIPE} {yybegin(STRING); return currentTokType;/*pipe*/}
.|{WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */ }
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */ }
}
@ -327,7 +327,7 @@ DOUBLE_EQUALS = "="{2}
//end wikipedia
/** Ignore the rest */
. | {WHITESPACE}|{TAGS} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
[^] | {TAGS} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
//INTERNAL_LINK = "["{2}({ALPHANUM}+{WHITESPACE}*)+"]"{2}

View File

@ -202,7 +202,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
}
public void testUnicodeWordBreaks() throws Exception {
WordBreakTestUnicode_6_1_0 wordBreakTest = new WordBreakTestUnicode_6_1_0();
WordBreakTestUnicode_6_3_0 wordBreakTest = new WordBreakTestUnicode_6_3_0();
wordBreakTest.test(a);
}
@ -230,6 +230,8 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
checkOneTerm(a, "壹゙", "壹゙"); // ideographic
checkOneTerm(a, "아゙", "아゙"); // hangul
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {

View File

@ -60,7 +60,7 @@ public class TestStopAnalyzer extends BaseTokenStreamTestCase {
public void testStopList() throws IOException {
CharArraySet stopWordsSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("good", "test", "analyzer"), false);
StopAnalyzer newStop = new StopAnalyzer(Version.LUCENE_40, stopWordsSet);
StopAnalyzer newStop = new StopAnalyzer(TEST_VERSION_CURRENT, stopWordsSet);
try (TokenStream stream = newStop.tokenStream("test", "This is a good test of the english stop analyzer")) {
assertNotNull(stream);
CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);

View File

@ -94,7 +94,7 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
// LUCENE-3849: make sure after .end() we see the "ending" posInc
public void testEndStopword() throws Exception {
CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "of");
StopFilter stpf = new StopFilter(Version.LUCENE_40, new MockTokenizer(new StringReader("test of"), MockTokenizer.WHITESPACE, false), stopSet);
StopFilter stpf = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("test of"), MockTokenizer.WHITESPACE, false), stopSet);
assertTokenStreamContents(stpf, new String[] { "test" },
new int[] {0},
new int[] {4},

View File

@ -424,7 +424,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
}
public void testUnicodeWordBreaks() throws Exception {
WordBreakTestUnicode_6_1_0 wordBreakTest = new WordBreakTestUnicode_6_1_0();
WordBreakTestUnicode_6_3_0 wordBreakTest = new WordBreakTestUnicode_6_3_0();
wordBreakTest.test(a);
}

View File

@ -78,13 +78,13 @@ LTLNFsgB@[191.56.104.113]
iT0LOq.jtPW=G06~cETxl2ge@Ah0.4hn72v.tQ.LU
VGLn@z3E2.3an2.MM
TWmfsxn@[112.192.017.029]
2tP07A@2twe6u0d6uw6o.sed7n.109mx.XN--KGBECHTV
2tP07A@2twe6u0d6uw6o.sed7n.109mx.XN--KPRW13D
CjaPC63@['\RDrwk]
Ayydpdoa@tdgypppmen.wf
"gfKP9"@jo3-r0.mz
aTMgDW4@t5gax.XN--0ZWM56D
aTMgDW4@t5gax.XN--3E0B707E
mcDrMO3FQ@nwc21.y5qd45lesryrp.IL
NZqj@v50egeveepk.z290kk.Bc3.xn--jxalpdlp
NZqj@v50egeveepk.z290kk.Bc3.xn--kprw13d
XtAhFnq@[218.214.251.103]
x0S8uos@[109.82.126.233]
ALB4KFavj16pODdd@i206d6s.MM
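A pattern worth noting in this and the following test-data changes (my inference, not stated in the commit): the replaced values such as XN--KGBECHTV and XN--0ZWM56D are IDN test TLDs that were retired from the DNS root zone in late 2013, so regenerating the JFlex TLD macro from the live root zone moves the fixtures onto currently delegated TLDs such as XN--KPRW13D and XN--3E0B707E.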

View File

@ -78,9 +78,10 @@ import org.junit.Ignore;
* \\p{Script = Hiragana}
* \\p{LineBreak = Complex_Context} (From $line_break_url)
* \\p{WordBreak = ALetter} (From $word_break_url)
* \\p{WordBreak = Hebrew_Letter}
* \\p{WordBreak = Katakana}
* \\p{WordBreak = Numeric} (Excludes full-width Arabic digits)
* [\\uFF10-\\uFF19] (Full-width Arabic digits)
* [\\uFF10-\\uFF19] (Full-width Arabic digits)
*/
\@Ignore
public class ${class_name} extends BaseTokenStreamTestCase {
@ -97,7 +98,7 @@ parse_Unicode_data_file($line_break_url, $codepoints, {'sa' => 1});
parse_Unicode_data_file($scripts_url, $codepoints,
{'han' => 1, 'hiragana' => 1});
parse_Unicode_data_file($word_break_url, $codepoints,
{'aletter' => 1, 'katakana' => 1, 'numeric' => 1});
{'aletter' => 1, 'hebrew_letter' => 1, 'katakana' => 1, 'numeric' => 1});
my @tests = split /\r?\n/, get_URL_content($word_break_test_url);
my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
@ -109,25 +110,33 @@ print STDERR "Writing '$output_path'...";
print OUT $header;
for my $line (@tests) {
next if ($line =~ /^\s*\#/);
# ÷ 0001 × 0300 ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING GRAVE ACCENT (Extend_FE) ÷ [0.3]
next if ($line =~ /^\s*(?:|\#.*)$/); # Skip blank or comment-only lines
# Example line: ÷ 0001 × 0300 ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING GRAVE ACCENT (Extend_FE) ÷ [0.3]
my ($sequence) = $line =~ /^(.*?)\s*\#/;
$line =~ s/\t/  /g; # Convert tabs to two spaces (no tabs allowed in Lucene source)
print OUT " // $line\n";
$sequence =~ s/\s*÷\s*$//; # Trim trailing break character
my $test_string = $sequence;
$test_string =~ s/\s*÷\s*/\\u/g;
$test_string =~ s/\s*×\s*/\\u/g;
$test_string =~ s/\\u([0-9A-F]{5,})/join('', map { "\\u$_" } above_BMP_char_to_surrogates($1))/ge;
$test_string =~ s/\\u000A/\\n/g;
$test_string =~ s/\\u000D/\\r/g;
$test_string =~ s/\\u0022/\\\"/g;
$sequence =~ s/^\s*÷\s*//; # Trim leading break character
my @tokens = ();
for my $candidate (split /\s*÷\s*/, $sequence) {
my @chars = ();
my $has_wanted_char = 0;
while ($candidate =~ /([0-9A-F]+)/gi) {
push @chars, $1;
my $hexchar = $1;
if (4 == length($hexchar)) {
push @chars, $hexchar;
} else {
push @chars, above_BMP_char_to_surrogates($hexchar);
}
unless ($has_wanted_char) {
$has_wanted_char = 1 if (defined($codepoints->[hex($1)]));
$has_wanted_char = 1 if (defined($codepoints->[hex($hexchar)]));
}
}
if ($has_wanted_char) {
@ -144,6 +153,21 @@ close OUT;
print STDERR "done.\n";
# sub above_BMP_char_to_surrogates
#
# Converts hex references to chars above the BMP (i.e., greater than 0xFFFF)
# to the corresponding UTF-16 surrogate pair
#
# Assumption: input string is a sequence of more than four hex digits
#
sub above_BMP_char_to_surrogates {
my $ch = hex(shift);
my $high_surrogate = 0xD800 + (($ch - 0x10000) >> 10);
my $low_surrogate = 0xDC00 + ($ch & 0x3FF);
return map { sprintf("%04X", $_) } ($high_surrogate, $low_surrogate);
}
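# Worked check of the arithmetic above (illustrative example, not in the patch):
#   U+10400: high = 0xD800 + ((0x10400 - 0x10000) >> 10) = 0xD801
#            low  = 0xDC00 + (0x10400 & 0x3FF)           = 0xDC00
#   i.e. above_BMP_char_to_surrogates("10400") returns ("D801", "DC00").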
# sub parse_Unicode_data_file
#
# Downloads the specified Unicode data file, parses it, and

View File

@ -121,14 +121,14 @@ Bzzzzzzzz! Bzzzzzzzzzzzzzzz! Tell them "0\!P?".shQVdSerA@2qmqj8ul.hm the leg
of LTLNFsgB@[191.56.104.113] all, until it has read it is
iT0LOq.jtPW=G06~cETxl2ge@Ah0.4hn72v.tQ.LU there. <VGLn@z3E2.3an2.MM> Once
TWmfsxn@[112.192.017.029] Spiros under the place
2tP07A@2twe6u0d6uw6o.sed7n.109mx.XN--KGBECHTV as were not a house of the
2tP07A@2twe6u0d6uw6o.sed7n.109mx.XN--KPRW13D as were not a house of the
rosebushes and the whateverend, feel her waist. She changes everything. We had
decided to do you know CjaPC63@['\RDrwk] this, is what did leave, pray; let us
come to, <Ayydpdoa@tdgypppmen.wf> what history as died. Strange, Spiros with
delight: That night "gfKP9"@jo3-r0.mz and gold case
<aTMgDW4@t5gax.XN--0ZWM56D> is spring: the aeon arising, wherein he returned,
<aTMgDW4@t5gax.XN--3E0B707E> is spring: the aeon arising, wherein he returned,
retraversing the mcDrMO3FQ@nwc21.y5qd45lesryrp.IL gates, first
<NZqj@v50egeveepk.z290kk.Bc3.xn--jxalpdlp> to reach session. Initiating first
<NZqj@v50egeveepk.z290kk.Bc3.xn--kprw13d> to reach session. Initiating first
part of the main hall toward his own spurs. Hes an <XtAhFnq@[218.214.251.103]>
Irifix And older ones who wins? ADAM: x0S8uos@[109.82.126.233] The violin and
reality. The hidden set up to come. ROSE WAKINS: No answer. The

View File

@ -24,7 +24,7 @@ and Joe recited this iron bars with their account, poor elth, and she had been
almost drove me towards evening. At
HTTP://173.202.175.16/Md7tF6lj7r/oioJ9TpL8/x%03PjXgMMBC7C3%BDWzoVMzH the
sergeant and then on the raw
<Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m> afternoon towards
<Https://yu7v33rbt.vC6U3.XN--KPRW13D/y%4fMSzkGFlm/wbDF4m> afternoon towards
the terror, merely wished him as biled
M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb -- a conciliatory air on in
<ftp://evzed8zvv.l2xkky.Dq85qcl1.eu:1184/07eY0/3X1OB7gPUk/J8la5OPUY3/y1oTItIs1HFPPp/5Q02N0cPyDH87hSy/jheYGF8s%F3P/%86PmYhi/ViKHoxsHqM8J>
@ -47,7 +47,7 @@ to live. You didn't know nothing could attend more.' He had been a coming! Get
behind the answer those aids, I saw him in the same appearance of the convict's
file:///%C5=.%8by/uuFXEaW8.%7E4/DRM%33Kh2xb8u%7FHizfLn/aoF06#7srWW%2EKoFf
confession, and bring you see? '
HTTP://yA2O3F.XN--0ZWM56D/qPDTt/MwMXGQq2S7JT/TJ2iCND said my limbs. Joe in an
HTTP://yA2O3F.XN--3E0B707E/qPDTt/MwMXGQq2S7JT/TJ2iCND said my limbs. Joe in an
accusatory manner as well known that Joe Gargery marry her cup. `I wonder and
there was publicly made it was,
<file:///Gdx5CDZYW%6cnzMJ/7HJ/J%63BSZDXtS/yfWXqq6#> as lookers on; me, I
@ -63,7 +63,7 @@ again
FTP://Hi144dz6hctql2n3uom.GE/%1A4OBV%63h/DoA4hpXFmqldOw-MB/PNYoaSDJB2F1k5/Nx%BBEDhrHhcMB
towards evening. At last, and kneaded, and a dead man taking any. There was
publicly made out there?' said I,
ftp://w0yaysrl.XN--9T4B11YI5A/y4FFU%c4F0B/Dh9%D1dGK3bN/EqxueQEsX2p5/xgf4Jxr%D9q/2ubmieRM
ftp://w0yaysrl.XN--CLCHC0EA0B2G2A9GCD/y4FFU%c4F0B/Dh9%D1dGK3bN/EqxueQEsX2p5/xgf4Jxr%D9q/2ubmieRM
glancing http://t9wa4.rjcahbc06qmyk9jkhu3f.ZA/vIwW3sc3Pg/Bwmeo6KAjkRY at the
N54l6e.vu/1m2%8bMFjv/oBdy%36.eL;33/N%d21Qvm/ river wound, twenty miles of the
number called, hears the awful it lights; here and trimmings of Caesar. This
@ -155,7 +155,7 @@ ftp://E1cdf-p.XN--MGBERP4A5D4AR:60510/qMaw4kSSgYM/7jgIuL/gSVW6O91/2bhnsj/kl7R5sg
at me, and that her walking z3ymb.KM/DdnrqoBz=YtxSB away so much of the
grievous circumstances foreshadowed. After receiving the way, that I thought,
if she should go to?' `Good again!' cried the
FTP://7kgip3z.XN--HGBK6AJ7F53BBA:15983/OYEQzIA0 society of a savoury pork pie,
FTP://7kgip3z.XN--KPRY57D:15983/OYEQzIA0 society of a savoury pork pie,
and nezt6awdc.lSZDSU14B1OH.4n6nkmjyyj.cc they challenged, hears nothin' all my
hands in herself, and bring him by hand. `This,' ftp://085.062.055.011/bopfVV/
said he wore ftp://Mbbn8n.6ge03fiivyc7of.PS/mvb/X8VNt/5WrMZpw/flC6Rs a dog of
@ -191,7 +191,7 @@ and tingling, and that I had won of the shoulder. `Excuse me, and we departed
from Richard the furthest end of
http://ch43n.51rkj.rze.mq/pJjrSAiuSv/3x/EK%59ReZM9w both imp and stung by the
bright fire, another look
zQFC1SPO96J.Jy20d8.xn--0zwm56d:863/0OWpT4dpkMURAGe/nFg/LQBUr%3E/af7dO1 over her
zQFC1SPO96J.Jy20d8.xn--3e0b707e:863/0OWpT4dpkMURAGe/nFg/LQBUr%3E/af7dO1 over her
best use asking questions, and feet,
<ftp://Xctk9iigg.cat/u3cX1d/Sx6m3dql/d%46;type=d#0i%3cT1yMkZQ> hanging to try
back was the poker. `It was not warmly. `Seems
@ -204,7 +204,7 @@ kitchen wall,
Ftp://2gifamku.jqv10es.MX/yJ0rhtMYX/Y1Wq%F90RYO1F/NT0%aeAG3/r3Act1 he ate the
house, end with the Ghost in order): Forty-three pence?' To five hundred
Gargerys.' `I say, Pip; stay
7WO6F.XN--11B5BS3A9AJ6G/1L%f9G0NEu/L2lD/mQGNS9UhgCEb out with
7WO6F.XN--45BRJ9C/1L%f9G0NEu/L2lD/mQGNS9UhgCEb out with
ftp://mIMU.t4d24n4lyx39.zURN708MCNGK-TJ42GLLBQRJHVENGPO.bw:59930/KmBYQKHfcjNRe/rK3fUjg%0Ad/.zHeVoCaC5/w%A2%F7up9o7J0Eq/ySBVhB
his shot, and reposing no help to my seat. It was in the kitchen wall, because
I calculated the sounds by giving me by the name for a rush of Joe's forge
@ -299,7 +299,7 @@ She drew the kitchen, carrying file:///Y?GG/BBqMPBJ/nsxX3qP/8P24WdqBxH so low
wooden hut
ftp://7vl2w.jp/b%a5fBYyDR/ZN%62LG9aYpjSwn0yWg/nG97gndK%69XZ#fet%55XXZhslTNrq5T
where it seemed to give Pirrip as
<79wvzk3.24dyfkxg0f4z-hsqgqqzj2p9n59el0a.XN--DEBA0AD/:8epfLrewivg%488s/2ORX8M3/B0KpeeB/2rbuCnnBF/4P6%1cU6fTGNj/o%3aZMIHdO>
<79wvzk3.24dyfkxg0f4z-hsqgqqzj2p9n59el0a.XN--FIQS8S/:8epfLrewivg%488s/2ORX8M3/B0KpeeB/2rbuCnnBF/4P6%1cU6fTGNj/o%3aZMIHdO>
to say, on the guiltily coarse his head, he tried to the
Uow9.sF.GP/sF3FCFSbCRWGNJY%aaU/DVXA5nIOWmjc6S/FQXdiBw/Y7~cVmpypgft/vU1%D4z
remark. `There's one sprinkled all I was possible she beggared me. All these
@ -311,7 +311,7 @@ Http://Ed095eimjy.rlb5698d.kp/_l5uoOO/aA494s?3nSxdIpE=y%79qu+2un1hGR&J%76=8&L%be
he shook her veil so thick nor my milk and would impart all had returned, with
soap-suds, I had FILE:///#F9Bgl just like thin snow. `Enough of his right side
of thenceforth sitting
jyia054.l814D9SNHRRA5RJCCW.kvxga.XN--0ZWM56D/sBbx24%f2Tw2/Sd0Lul0Vg1bbIqW~/lveEw
jyia054.l814D9SNHRRA5RJCCW.kvxga.XN--3E0B707E/sBbx24%f2Tw2/Sd0Lul0Vg1bbIqW~/lveEw
in File:///KKfIe63z/BETB.T%C6sG/RcYgnOycg my soul. I sat down on it, I have
been a spoon that the pie, blacksmith?' asked Estella of it made a mouth wide
open, and so
@ -324,7 +324,7 @@ FTP://7qf.hlj.TN/IXOeaf/t%c52Jxwy#YkcAy2 of the stranger looked at it, I
pointed to Ftp://Gbu5t.HT/xad4fgjaN#GLpU3XQd6%7F(cHIz himself. No glimpse of
file:///A1omJiPzafgAm/addqzG%dc%62/Lw1mamTg herself, I saw that he would have
been there, I was too far and uncomfortable by it.
http://89qw34ksf0qf6iq264of-1nya4ds7qvpixw8c951aw8wcm3.qxk7usa.N8j1frzfgnkbi9y2.XN--9T4B11YI5A/Unwn3/%97gnj0/GQgJC~OFxsdE8ubC7/IWy450/8%7CQVgdI8/soi0BviZt/Zjs%10i5Xh?qi8t9=rBbPok,Si&*Xl=Q+fT&Hx4%D70=84+8W%18+sV2BU6xCDP%47M&Usbms=
http://89qw34ksf0qf6iq264of-1nya4ds7qvpixw8c951aw8wcm3.qxk7usa.N8j1frzfgnkbi9y2.XN--CLCHC0EA0B2G2A9GCD/Unwn3/%97gnj0/GQgJC~OFxsdE8ubC7/IWy450/8%7CQVgdI8/soi0BviZt/Zjs%10i5Xh?qi8t9=rBbPok,Si&*Xl=Q+fT&Hx4%D70=84+8W%18+sV2BU6xCDP%47M&Usbms=
Under the Above,' I rather to become transfixed -- he gave me out of the
kitchen empty-handed, to keep him, I had made a
Z7tid0uh.eZMOI-M1.umlsyksuzovqdw6wozbd.BW/m%e684OhC/ErAhpGiG subject, if he had
@ -468,7 +468,7 @@ hard twist upon his -- `Well, boy,' Uncle Pumblechook: a look at the sermon he
had heard it had hesitated as little window, violently plunging and she had
committed, and had all about the present calling, which the fingers of tea on
Saturdays than this country, gentlemen, but I could see those,
https://nWC9-RIA00RPVL4SSWRICWWX3NH5SMQIA7IPMCK174T30VQBL-M6.XN--0ZWM56D/CwE%e2rWaYZmE?X_coOVl=kqGQ&Pli=MjKg-+wO6Eh+lbbcN&x3M=3kQh99m92mRdf&iiO2wXgQ=qyWVG9G
https://nWC9-RIA00RPVL4SSWRICWWX3NH5SMQIA7IPMCK174T30VQBL-M6.XN--3E0B707E/CwE%e2rWaYZmE?X_coOVl=kqGQ&Pli=MjKg-+wO6Eh+lbbcN&x3M=3kQh99m92mRdf&iiO2wXgQ=qyWVG9G
too, if you remember what stock she told me again. `But I know what
file:///enqvF%EFLOBsZhl8h2z wittles is?' `Yes, ma'am.' `Estella, take me again
and ftp://133.4.130.192/p%b1LgcONfo%bc&kmH/Ibh6Lq%DCJhnswT%1A refractory
@ -493,7 +493,7 @@ right-side
ftp://zxmv98m49669kfvf24o12w3u93wbovfp-1smo6y90e27n133okplcjqrmv-a.CD/JM5RAAY/sJdBntYWuEY4uB7hz/ozRSmFJD/#Xv22:Xvg
flaxen curls and tables, and a foot of the blacksmith's.' `Halloa!' said Joe,
staring at that it had withered like a infunt, and took another look about the
rum <6S8.Crwllo5e3.jmtz.XN--G6W251D/6InlQn/hnhu2f%ac8tX/apq%0D6o/> out at once.
rum <6S8.Crwllo5e3.jmtz.XN--GECRJ9C/6InlQn/hnhu2f%ac8tX/apq%0D6o/> out at once.
Three Jolly Bargemen to think she seemed to tell you were. When we saw the file
coming at my slice. I have mentioned it with the wooden hut where we had got up
trying to file:///gVW/nnRNxPfMXKb%72Aq%4A hand. If ever grateful for. If a
@ -662,7 +662,7 @@ open,' he
https://227.086.128.010:64985/MDKuFInA86qto5/_cK=4S%49Ic/SPp76/TlV%0Arlwfx/
wiped the liquor. He was the bad; and some one
Ftp://171.160.94.43/ALTgS46I4VM/55PbbK/5N%faTSE another
Ftp://3zd7z.etw.XN--JXALPDLP/4UztCuTbW2z/LL%2cDI/dTYSi9 turned to put straws
Ftp://3zd7z.etw.XN--KPRW13D/4UztCuTbW2z/LL%2cDI/dTYSi9 turned to put straws
down by a most powerfully down
t6xfr.wxjz5p2t5.zl8m4.MN/2cbpjk/gsdm/5Mvc-j3rc/16Wb65&c7x to me, and all that
know the window,
@ -993,7 +993,7 @@ upon a door, which was gobbling mincemeat, meatbone, bread, some lace for it
that Joe's blue file:///EYS2nDf%9671qsm34OZeB%e5lUA/rYBDn0DKs0/ eyes, had an
hour longer than at me, and dismal, and gloves, and that's further than I
mpuwl0.BA/MkvAvc?j%11K4=9gE%613&qOOEP0t=g7EXs looked on. `Now, boy!
g6tylc0.daeczh.4q.XN--9T4B11YI5A/1SbCR9cX1%3D/YfP8CpLKn5KzTL8/Kj11z%B7OuqJU;qM4P
g6tylc0.daeczh.4q.XN--CLCHC0EA0B2G2A9GCD/1SbCR9cX1%3D/YfP8CpLKn5KzTL8/Kj11z%B7OuqJU;qM4P
Why, here's a ridiculous old chap. And looked up by hand. `Why don't like
`sulks.' Therefore, I was in such game?' Everybody, myself drifting down his
chest and he had made me worse by-and-by. I was a
@ -1035,7 +1035,7 @@ in every word out again. `You are prison-ships, and they fought
<HTTPS://bF2RA.kw/1TA9pTTBg/nM/VSRo%85Kt?%62mxNfo=HDowgwkM3&9oPOLH2=yKOxIe+YNtt>
for us heavy. `I Bolted, myself, 5.Piba4ac.JE/55M1H/AZXdj and thread, and we
after him, or to inspire confidence. This was brought you spoke all the act, he
couldn't m-k6-ej7x.XN--HLCJ6AYA9ESC7A/suVrNQSIj9/TmRhHbe/o&0dbqR/ keep the fire
couldn't m-k6-ej7x.XN--J6W193G/suVrNQSIj9/TmRhHbe/o&0dbqR/ keep the fire
between the forge was <ftp://242.228.138.8/o%CC_QjILS%17aYH/%caw8CcVZyPRZ/>
busy in it. Until
hGE9YH3D6.SD/m%1EpDJrzO/Tf2Xxqq8L/YJT7BTEY%661PvcMgOr/29ZbuJuWl6q/ she jammed
@ -1329,7 +1329,7 @@ sort Http://w9ys35.wb55p6l.hxl.rs/Y97%58Lp8JjLZw/5L --
FILE://155.24.106.255/3VEZIT7 if it was to him, I might not do not afraid of
report, and looking rather to make nothing of a confidential voice,
d1y8zvhwq40bi3tom.hPCZ.gJ-286X.TG/ayWKrgAvF6tn/L4SgquZT6C/1DmNe/CI69rJ/%f6QrzZGkSQ
as lda5l5wc.XN--HGBK6AJ7F53BBA/pr80SSZ/eNM1%D50lp/Rc%8EimOET if he would be
as lda5l5wc.XN--KPRY57D/pr80SSZ/eNM1%D50lp/Rc%8EimOET if he would be
supposed,' said the wind and so we were read the conversation consisted of it
had so that we saw some bread, some
l13t2t.sk/O%2BmRkw/@0AgGL@NX/wgt&aggDcp#0IYe'C brandy out: no black velvet

View File

@ -10,7 +10,7 @@ http://Rcbu6/Oxc%C0IkGSZ8rO9IUpd/BEvkvw3nWNXZ/P%17tp3gjATN/0ZRzs
file:///2CdsP/U2GCLT
Http://Pzw978uzb.ai/yB;mt/o8hVKG/%231Y/Xb1%bb6v1fhjfdkfkBvxed?8mq~=OvF&STpJJk=ws0ZO&0DRA=
HTTP://173.202.175.16/Md7tF6lj7r/oioJ9TpL8/x%03PjXgMMBC7C3%BDWzoVMzH
Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m
Https://yu7v33rbt.vC6U3.XN--KPRW13D/y%4fMSzkGFlm/wbDF4m
M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb
ftp://evzed8zvv.l2xkky.Dq85qcl1.eu:1184/07eY0/3X1OB7gPUk/J8la5OPUY3/y1oTItIs1HFPPp/5Q02N0cPyDH87hSy/jheYGF8s%F3P/%86PmYhi/ViKHoxsHqM8J
ftp://213.7.210.47/%e5pFkj6e6Jczc/ypJGG/z%663jYR/37IxLQBPr/Ciq50EUIdueyj
@ -23,13 +23,13 @@ Ftp://Xmswrxn8d-1s.pe.gm/dB6C3xTk%D3x/EKOiTmk%7c/API/0cdgpi;Type=a
FILE:///rKnQkS0MAF#tM%53_2%03%d6ZICH
ftp://R5ecjkf1yx4wpskfh.tv0y3m90ak.0R605.se:51297/zpWcRRcG/1woSqw7ZUko/
file:///%C5=.%8by/uuFXEaW8.%7E4/DRM%33Kh2xb8u%7FHizfLn/aoF06#7srWW%2EKoFf
HTTP://yA2O3F.XN--0ZWM56D/qPDTt/MwMXGQq2S7JT/TJ2iCND
HTTP://yA2O3F.XN--3E0B707E/qPDTt/MwMXGQq2S7JT/TJ2iCND
file:///Gdx5CDZYW%6cnzMJ/7HJ/J%63BSZDXtS/yfWXqq6#
http://1qvgjd1.TP/7oq5gWW/Gwqf8fxBXR4/?Br,q=ayMz0&1IO%370N7=;Sl1czc2L+5bRISfD+w&ygP3FhV%E1w36=2Rx
ftp://5SCC6BUYP.Knf1cvlc22z9.1dc3rixt5ugyq4/5OnYTSN/QpCdo/t3zqkI/pn5skT/oJgrGy7
http://2dkbeuwsto3i3e8jaxi6su9wjlmwygtpdp7g65611z-2bbr82uhjqkdv2jrh7.KZ/FiSvI/aaB&dPQ%42kLdM
FTP://Hi144dz6hctql2n3uom.GE/%1A4OBV%63h/DoA4hpXFmqldOw-MB/PNYoaSDJB2F1k5/Nx%BBEDhrHhcMB
ftp://w0yaysrl.XN--9T4B11YI5A/y4FFU%c4F0B/Dh9%D1dGK3bN/EqxueQEsX2p5/xgf4Jxr%D9q/2ubmieRM
ftp://w0yaysrl.XN--CLCHC0EA0B2G2A9GCD/y4FFU%c4F0B/Dh9%D1dGK3bN/EqxueQEsX2p5/xgf4Jxr%D9q/2ubmieRM
http://t9wa4.rjcahbc06qmyk9jkhu3f.ZA/vIwW3sc3Pg/Bwmeo6KAjkRY
N54l6e.vu/1m2%8bMFjv/oBdy%36.eL;33/N%d21Qvm/
http://ah-2d4.ASIA/qmp
@ -75,7 +75,7 @@ http://4u3o/BKdhwRyzG
file:///LdsHfPABFz1vRD1OB6Yl/RS6&1Gmz/mfYul/
ftp://E1cdf-p.XN--MGBERP4A5D4AR:60510/qMaw4kSSgYM/7jgIuL/gSVW6O91/2bhnsj/kl7R5sgn6&X5EiZdZ0WhTX3T/fa%f3Azz
z3ymb.KM/DdnrqoBz=YtxSB
FTP://7kgip3z.XN--HGBK6AJ7F53BBA:15983/OYEQzIA0
FTP://7kgip3z.XN--KPRY57D:15983/OYEQzIA0
nezt6awdc.lSZDSU14B1OH.4n6nkmjyyj.cc
ftp://085.062.055.011/bopfVV/
ftp://Mbbn8n.6ge03fiivyc7of.PS/mvb/X8VNt/5WrMZpw/flC6Rs
@ -93,12 +93,12 @@ https://[3790:ad57:0B63::e5f7:f6ac:164C]/Obax;zcD/Y%48%9a/Z2xcdar
bl60k0jqkc9.oow84o1.BF/Xly5cTna/BzoQuHi3r8e/o5BDNrvT/=6HRdBjH/Mrp5%02/p%e9pT2Ae
ftp://Bs3ceuxd8ii66gt.X8wwdpt.BB:27095/3BfkvfzcmTS/FTffh&S/gIWvJ5Kd/AlOQ%3EnO
http://ch43n.51rkj.rze.mq/pJjrSAiuSv/3x/EK%59ReZM9w
zQFC1SPO96J.Jy20d8.xn--0zwm56d:863/0OWpT4dpkMURAGe/nFg/LQBUr%3E/af7dO1
zQFC1SPO96J.Jy20d8.xn--3e0b707e:863/0OWpT4dpkMURAGe/nFg/LQBUr%3E/af7dO1
ftp://Xctk9iigg.cat/u3cX1d/Sx6m3dql/d%46;type=d#0i%3cT1yMkZQ
HTTPS://56aderic0knmip9lkqdqag14.uk:45885/lELiK:/vF%4C5Enwqy/P5NGJ2b/dD6sg1yMV
ftp://vlt.3g45k63viz2.tcnm3.UA:60664/AJ9iqYk%c1/uKbohn2/K%D1kequ4z8rxFpJ
Ftp://2gifamku.jqv10es.MX/yJ0rhtMYX/Y1Wq%F90RYO1F/NT0%aeAG3/r3Act1
7WO6F.XN--11B5BS3A9AJ6G/1L%f9G0NEu/L2lD/mQGNS9UhgCEb
7WO6F.XN--45BRJ9C/1L%f9G0NEu/L2lD/mQGNS9UhgCEb
ftp://mIMU.t4d24n4lyx39.zURN708MCNGK-TJ42GLLBQRJHVENGPO.bw:59930/KmBYQKHfcjNRe/rK3fUjg%0Ad/.zHeVoCaC5/w%A2%F7up9o7J0Eq/ySBVhB
ftp://lv56pdepzu0b0fo-04qtxv5tt2jc0nsaukrhtz5-e3u1vcb517y3b135zl.e0r1hson.dk/3TVoqjp6%1FCFSkt/006VZfho/gxrWxgDawM3Uk
Ftp://7n977.Niyt.2fgkzfhj.q7-DJ.Ow7a.it/5zfRi3PO8/1zfKT9%421tP/?SazEijJq%710COQKWeLE/TdUc%b2u/2AxBw9%4BUN6Zp4Z/KfUZd1MTdPv/L4m1tI3/WJvcK1
@ -147,20 +147,20 @@ ftp://Lq.es/%B1ZPdTZgB2mNFW/qre92rM
file:///IZ47ESCtX%aatQab1/V553gjR?Me/#9%68qPw
file:///Y?GG/BBqMPBJ/nsxX3qP/8P24WdqBxH
ftp://7vl2w.jp/b%a5fBYyDR/ZN%62LG9aYpjSwn0yWg/nG97gndK%69XZ#fet%55XXZhslTNrq5T
79wvzk3.24dyfkxg0f4z-hsqgqqzj2p9n59el0a.XN--DEBA0AD/:8epfLrewivg%488s/2ORX8M3/B0KpeeB/2rbuCnnBF/4P6%1cU6fTGNj/o%3aZMIHdO
79wvzk3.24dyfkxg0f4z-hsqgqqzj2p9n59el0a.XN--FIQS8S/:8epfLrewivg%488s/2ORX8M3/B0KpeeB/2rbuCnnBF/4P6%1cU6fTGNj/o%3aZMIHdO
Uow9.sF.GP/sF3FCFSbCRWGNJY%aaU/DVXA5nIOWmjc6S/FQXdiBw/Y7~cVmpypgft/vU1%D4z
ftp://[fd77:4982:C37F:a0a1:7651:E09C:117.093.145.017]/2l91g/s%79lJmUiZ/%A5R2qsJ
[62c0::]/d1lmSzoB/5OBVnzn/kOXW%D23
Http://Ed095eimjy.rlb5698d.kp/_l5uoOO/aA494s?3nSxdIpE=y%79qu+2un1hGR&J%76=8&L%bed=uY5hO+s+IKk1S&Q=HHXEC+Gof86QIRHy&35QY5=
FILE:///#F9Bgl
jyia054.l814D9SNHRRA5RJCCW.kvxga.XN--0ZWM56D/sBbx24%f2Tw2/Sd0Lul0Vg1bbIqW~/lveEw
jyia054.l814D9SNHRRA5RJCCW.kvxga.XN--3E0B707E/sBbx24%f2Tw2/Sd0Lul0Vg1bbIqW~/lveEw
File:///KKfIe63z/BETB.T%C6sG/RcYgnOycg
ftp://892f7.oel50j.32.9qj1p-g7lgw.MR:48021/XNKbk2PZQXSvOuGnOAnATDt3/XfHyJtvoC/PW7YrSgf#LmGWJgPw
http://sisas.ua/4CU60ZLK4VgY8AR89
FTP://7qf.hlj.TN/IXOeaf/t%c52Jxwy#YkcAy2
Ftp://Gbu5t.HT/xad4fgjaN#GLpU3XQd6%7F(cHIz
file:///A1omJiPzafgAm/addqzG%dc%62/Lw1mamTg
http://89qw34ksf0qf6iq264of-1nya4ds7qvpixw8c951aw8wcm3.qxk7usa.N8j1frzfgnkbi9y2.XN--9T4B11YI5A/Unwn3/%97gnj0/GQgJC~OFxsdE8ubC7/IWy450/8%7CQVgdI8/soi0BviZt/Zjs%10i5Xh?qi8t9=rBbPok,Si&*Xl=Q+fT&Hx4%D70=84+8W%18+sV2BU6xCDP%47M&Usbms=
http://89qw34ksf0qf6iq264of-1nya4ds7qvpixw8c951aw8wcm3.qxk7usa.N8j1frzfgnkbi9y2.XN--CLCHC0EA0B2G2A9GCD/Unwn3/%97gnj0/GQgJC~OFxsdE8ubC7/IWy450/8%7CQVgdI8/soi0BviZt/Zjs%10i5Xh?qi8t9=rBbPok,Si&*Xl=Q+fT&Hx4%D70=84+8W%18+sV2BU6xCDP%47M&Usbms=
Z7tid0uh.eZMOI-M1.umlsyksuzovqdw6wozbd.BW/m%e684OhC/ErAhpGiG
ftp://tw7d-6yu.im:2055/%66qbqzss/OmPGW;type=d
FTP://zst.tn/QcUpaA/VKvJ2/JN6AKew/iXYIiHm7mfPFmD%21E5/yTQpoiqdbaaS1/LnzOX#VqsobH
@ -228,7 +228,7 @@ file:///UIIGOxv6jvF2%c0/%A8J3%677Gmq8im1zklKhqx/HMhCSY2QcyxvL/
http://Qhk9z.zm/cOGBen/mBsDycEI5V7L1s%84WUj7863/p%5f~okuRD51b0M?b%F2d%67ujGr=oh8PWUtK&j6uX7baX=&sg3RUocA9W=m5IaF&JWH9G=fyiOtnC3+7RJA+ippw96rvu+BxtGg&F6f1=jmPS&3PE0xX5=TGV%5c5J&%fc@NSEynhuvb=&MkRIt33=
Http://[98cc:433d:2C25:62dd:54ba:d10b:63d3:4C40]/YlbNrJod/fdjuN/qYqSdqr5/KAbXYHO%F0m7Ws9
file:///ywFY5HK/XAv@v%66o/M2O4Wlny50hypf5%02A8
https://nWC9-RIA00RPVL4SSWRICWWX3NH5SMQIA7IPMCK174T30VQBL-M6.XN--0ZWM56D/CwE%e2rWaYZmE?X_coOVl=kqGQ&Pli=MjKg-+wO6Eh+lbbcN&x3M=3kQh99m92mRdf&iiO2wXgQ=qyWVG9G
https://nWC9-RIA00RPVL4SSWRICWWX3NH5SMQIA7IPMCK174T30VQBL-M6.XN--3E0B707E/CwE%e2rWaYZmE?X_coOVl=kqGQ&Pli=MjKg-+wO6Eh+lbbcN&x3M=3kQh99m92mRdf&iiO2wXgQ=qyWVG9G
file:///enqvF%EFLOBsZhl8h2z
ftp://133.4.130.192/p%b1LgcONfo%bc&kmH/Ibh6Lq%DCJhnswT%1A
ftp://1xf.ipl4f0y6c4.VA/LHuq~/p2nPbE/0YGGNJB%DEje2psef_B/aKOuMl1Q9
@ -240,7 +240,7 @@ http://nEN5ZN.EG/%0efsf4v30L
file:///19%9947/ksd3Sq7W78%27/2K_Ylzcu2q
r8sht9qzsc1e2wp.ci/8SbPwlW%5ac/qKEqFi0Q
ftp://zxmv98m49669kfvf24o12w3u93wbovfp-1smo6y90e27n133okplcjqrmv-a.CD/JM5RAAY/sJdBntYWuEY4uB7hz/ozRSmFJD/#Xv22:Xvg
6S8.Crwllo5e3.jmtz.XN--G6W251D/6InlQn/hnhu2f%ac8tX/apq%0D6o/
6S8.Crwllo5e3.jmtz.XN--GECRJ9C/6InlQn/hnhu2f%ac8tX/apq%0D6o/
file:///gVW/nnRNxPfMXKb%72Aq%4A
file:///Fzza388TQ
file:///
@ -314,7 +314,7 @@ file:///3%aexrb7UdZ5GpR4ZIfoxwL/vQV%4a2zQxki/QRji6gHpMGgBaM/d%71A2CTpZv-kF0tD/Ig
f5ms.jp/%A1FpERWwTd%BFG/ExC8V5aqx5l2CLJr0mJb5u/DgMvEzAr2U/py9Vg/igr9PzANtw/FFiN1E7
https://227.086.128.010:64985/MDKuFInA86qto5/_cK=4S%49Ic/SPp76/TlV%0Arlwfx/
Ftp://171.160.94.43/ALTgS46I4VM/55PbbK/5N%faTSE
Ftp://3zd7z.etw.XN--JXALPDLP/4UztCuTbW2z/LL%2cDI/dTYSi9
Ftp://3zd7z.etw.XN--KPRW13D/4UztCuTbW2z/LL%2cDI/dTYSi9
t6xfr.wxjz5p2t5.zl8m4.MN/2cbpjk/gsdm/5Mvc-j3rc/16Wb65&c7x
ftp://D02-auxxaeqnv9ve-jlmo3.l10vqu.12jl.2mvjwrsqm.BA/r71QLLNu6oGJjG/HbxrX1Grq8/QR%2agZv4hR
file:///XoCg%EDVf/A3ibJYjU
@ -476,7 +476,7 @@ ftp://53.151.134.240/uZqGXLUIu-J/=%0C2pO/PvL0%19MpQBv/
FILE:///Kywof5D5q/0TRS/zayrkrnENB
file:///EYS2nDf%9671qsm34OZeB%e5lUA/rYBDn0DKs0/
mpuwl0.BA/MkvAvc?j%11K4=9gE%613&qOOEP0t=g7EXs
g6tylc0.daeczh.4q.XN--9T4B11YI5A/1SbCR9cX1%3D/YfP8CpLKn5KzTL8/Kj11z%B7OuqJU;qM4P
g6tylc0.daeczh.4q.XN--CLCHC0EA0B2G2A9GCD/1SbCR9cX1%3D/YfP8CpLKn5KzTL8/Kj11z%B7OuqJU;qM4P
file:///TJa%86AczeCmM5QMhi/Wox~Ajl/WxUF%5eSA:y%0fD%E21/x%cca%d3Qgx/8iWJ5-h%26/fCK%01nQNrK8#ygTTB
file:///~%303cUUVYTEaQU5%5DXbogiPKb/favR2rETEh/9TXM%15u/nYCOZpZgL
file:///mJM%a1/jv5%53QDqE/bFMu0CBp
@ -496,7 +496,7 @@ http://gpu16lz.LS/9e%daJrwQfHEpFvsZ3jx/c4STIJ/CmvEGAUx9f/
file://ij9anjtok86ro.uN-BGDQ855IB.sDXAQR.5kr8kz.3J3M8XRM.18r3s0g-6.4rjsmwue0lwao0og17d-5-1.F1h3qgkul29yw2t4p4se5clomncxhmoy.g6c9tbz7.pa/5LMtmbl/1tfIF/pBOV7Hc
HTTPS://bF2RA.kw/1TA9pTTBg/nM/VSRo%85Kt?%62mxNfo=HDowgwkM3&9oPOLH2=yKOxIe+YNtt
5.Piba4ac.JE/55M1H/AZXdj
m-k6-ej7x.XN--HLCJ6AYA9ESC7A/suVrNQSIj9/TmRhHbe/o&0dbqR/
m-k6-ej7x.XN--J6W193G/suVrNQSIj9/TmRhHbe/o&0dbqR/
ftp://242.228.138.8/o%CC_QjILS%17aYH/%caw8CcVZyPRZ/
hGE9YH3D6.SD/m%1EpDJrzO/Tf2Xxqq8L/YJT7BTEY%661PvcMgOr/29ZbuJuWl6q/
Ftp://mez27g2tpmk.MC/%B8AHk%95etDns%46/gXbsCn%6C-/s8_Jmy/DhmfT~Di6KD
@ -633,7 +633,7 @@ http://047.014.184.200/Z_QdOwjzfBue4Nt/aEn/xuEQD/cXlnoxHIK%7d8h/1%eegEk7E0/8Ejku
Http://w9ys35.wb55p6l.hxl.rs/Y97%58Lp8JjLZw/5L
FILE://155.24.106.255/3VEZIT7
d1y8zvhwq40bi3tom.hPCZ.gJ-286X.TG/ayWKrgAvF6tn/L4SgquZT6C/1DmNe/CI69rJ/%f6QrzZGkSQ
lda5l5wc.XN--HGBK6AJ7F53BBA/pr80SSZ/eNM1%D50lp/Rc%8EimOET
lda5l5wc.XN--KPRY57D/pr80SSZ/eNM1%D50lp/Rc%8EimOET
l13t2t.sk/O%2BmRkw/@0AgGL@NX/wgt&aggDcp#0IYe'C
FILE://a6ys9a4.xj.BY/%99BGXp/F=yJtxc71/gvXuHuB9k
212.072.006.032/6kV8ce%2e/%e7lzm-HB%4artP/zg6tWMW7RIG?U7=HAXw$D3sM%7DyDJ&Gt=

View File

@ -75,7 +75,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
+ " samba Halta gamba "
+ "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R\n"
+ "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb\n"
+ "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m"
+ "Https://yu7v33rbt.vC6U3.XN--KPRW13D/y%4fMSzkGFlm/wbDF4m"
+ " inter Locutio "
+ "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/\n"
+ "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7"
@ -91,7 +91,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
"samba", "Halta", "gamba",
"ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R",
"M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb",
"Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m",
"Https://yu7v33rbt.vC6U3.XN--KPRW13D/y%4fMSzkGFlm/wbDF4m",
"inter", "Locutio",
"[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/",
"file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7",

View File

@ -60,20 +60,21 @@ public class GenerateJflexTLDMacros {
private static final String APACHE_LICENSE
= "/*" + NL
+ " * Copyright 2001-2005 The Apache Software Foundation." + NL
+ " *" + NL
+ " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
+ " * you may not use this file except in compliance with the License." + NL
+ " * You may obtain a copy of the License at" + NL
+ " *" + NL
+ " * http://www.apache.org/licenses/LICENSE-2.0" + NL
+ " *" + NL
+ " * Unless required by applicable law or agreed to in writing, software" + NL
+ " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
+ " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
+ " * See the License for the specific language governing permissions and" + NL
+ " * limitations under the License." + NL
+ " */" + NL + NL;
+ " * Licensed to the Apache Software Foundation (ASF) under one or more" + NL
+ " * contributor license agreements. See the NOTICE file distributed with" + NL
+ " * this work for additional information regarding copyright ownership." + NL
+ " * The ASF licenses this file to You under the Apache License, Version 2.0" + NL
+ " * (the \"License\"); you may not use this file except in compliance with" + NL
+ " * the License. You may obtain a copy of the License at" + NL
+ " *" + NL
+ " * http://www.apache.org/licenses/LICENSE-2.0" + NL
+ " *" + NL
+ " * Unless required by applicable law or agreed to in writing, software" + NL
+ " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
+ " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
+ " * See the License for the specific language governing permissions and" + NL
+ " * limitations under the License." + NL
+ " */" + NL;
private static final Pattern TLD_PATTERN_1
= Pattern.compile("([-A-Za-z0-9]+)\\.\\s+NS\\s+.*");

View File

@ -14,27 +14,52 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Default RBBI rules, based on UAX#29.
# This file is from ICU (with some small modifications, to avoid CJK dictionary break)
#
# Copyright (C) 2002-2013, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word.txt
#
# ICU Word Break Rules
# See Unicode Standard Annex #29.
# These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
#
# Note: Updates to word.txt will usually need to be merged into
# word_POSIX.txt also.
##############################################################################
#
# Character class definitions from TR 29
#
##############################################################################
!!chain;
#
# Character Class Definitions.
#
$CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
$Newline = [\p{Word_Break = Newline}];
$Extend = [\p{Word_Break = Extend}];
$Format = [\p{Word_Break = Format}];
$Katakana = [\p{Word_Break = Katakana}];
$ALetter = [\p{Word_Break = ALetter}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidLetter = [\p{Word_Break = MidLetter}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
$Newline = [\p{Word_Break = Newline}];
$Extend = [\p{Word_Break = Extend}];
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
$Format = [\p{Word_Break = Format}];
$Katakana = [\p{Word_Break = Katakana}];
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
$ALetter = [\p{Word_Break = ALetter}];
$Single_Quote = [\p{Word_Break = Single_Quote}];
$Double_Quote = [\p{Word_Break = Double_Quote}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidLetter = [\p{Word_Break = MidLetter}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$Han = [:Han:];
$Hiragana = [:Hiragana:];
# Dictionary character set, for triggering language-based break engines. Currently
@ -42,24 +67,34 @@ $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
# 5.0 or later as the definition of Complex_Context was corrected to include all
# characters requiring dictionary break.
$dictionary = [:LineBreak = Complex_Context:];
$Control = [\p{Grapheme_Cluster_Break = Control}];
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not
# include the dictionary characters.
$HangulSyllable = [\uac00-\ud7a3];
$ComplexContext = [:LineBreak = Complex_Context:];
$KanaKanji = [$Han $Hiragana $Katakana];
$dictionaryCJK = [$Han $Hiragana $HangulSyllable];
$dictionary = [$ComplexContext];
# leave CJK scripts out of ALetterPlus
$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
#
# Rules 4 Ignore Format and Extend characters,
# except when they appear at the beginning of a region of text.
#
$KatakanaEx = $Katakana ($Extend | $Format)*;
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidLetterEx = $MidLetter ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
$KatakanaEx = $Katakana ($Extend | $Format)*;
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format)*;
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
$Single_QuoteEx = $Single_Quote ($Extend | $Format)*;
$Double_QuoteEx = $Double_Quote ($Extend | $Format)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidLetterEx = $MidLetter ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*;
$Hiragana = [\p{script=Hiragana}];
$Ideographic = [\p{Ideographic}];
$HiraganaEx = $Hiragana ($Extend | $Format)*;
$IdeographicEx = $Ideographic ($Extend | $Format)*;
@ -77,23 +112,31 @@ $CR $LF;
# of a region of Text. The rule here comes into play when the start of text
# begins with a group of Format chars, or with a "word" consisting of a single
# char that is not in any of the listed word break categories followed by
# format char(s).
# format char(s), or is not a CJK dictionary character.
[^$CR $LF $Newline]? ($Extend | $Format)+;
$NumericEx {100};
$ALetterEx {200};
$HangulSyllable {200};
$Hebrew_LetterEx{200};
$KatakanaEx {300}; # note: these status values override those from rule 5
$HiraganaEx {300}; # by virtual of being numerically larger.
$HiraganaEx {300}; # by virtue of being numerically larger.
$IdeographicEx {400}; #
#
# rule 5
# Do not break between most letters.
#
$ALetterEx $ALetterEx {200};
($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200};
# rule 6 and 7
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};
# rule 7a
$Hebrew_LetterEx $Single_QuoteEx {200};
# rule 7b and 7c
$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};
# rule 8
@ -101,27 +144,35 @@ $NumericEx $NumericEx {100};
# rule 9
$ALetterEx $NumericEx {200};
($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};
# rule 10
$NumericEx $ALetterEx {200};
$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
# rule 11 and 12
$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
# rule 13
$KatakanaEx $KatakanaEx {300};
# rule 13a/b
$ALetterEx $ExtendNumLetEx {200}; # (13a)
$NumericEx $ExtendNumLetEx {100}; # (13a)
$KatakanaEx $ExtendNumLetEx {300}; # (13a)
$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
$ALetterEx $ExtendNumLetEx {200}; # (13a)
$Hebrew_LetterEx $ExtendNumLetEx {200}; # (13a)
$NumericEx $ExtendNumLetEx {100}; # (13a)
$KatakanaEx $ExtendNumLetEx {300}; # (13a)
$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)
$ExtendNumLetEx $ALetterEx {200}; # (13b)
$ExtendNumLetEx $NumericEx {100}; # (13b)
$ExtendNumLetEx $KatakanaEx {300}; # (13b)
$ExtendNumLetEx $ALetterEx {200}; # (13b)
$ExtendNumLetEx $Hebrew_Letter {200}; # (13b)
$ExtendNumLetEx $NumericEx {100}; # (13b)
$ExtendNumLetEx $KatakanaEx {300}; # (13b)
# rule 13c
$Regional_IndicatorEx $Regional_IndicatorEx;
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};
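An aside on how the numeric tags in these rules are consumed (a sketch against the public ICU4J API; the exact wiring inside Lucene's tokenizer differs): RuleBasedBreakIterator reports the status of the rule that matched each boundary via getRuleStatus(), so the {100}, {200}, {300} and {400} values become per-token type hints (numeric, letter, kana, ideographic).

// Hypothetical standalone sketch, not code from this patch.
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.util.ULocale;

public class RuleStatusDemo {
  public static void main(String[] args) {
    String text = "abc 123";
    RuleBasedBreakIterator bi =
        (RuleBasedBreakIterator) BreakIterator.getWordInstance(ULocale.ROOT);
    bi.setText(text);
    int start = bi.first();
    for (int end = bi.next(); end != BreakIterator.DONE; start = end, end = bi.next()) {
      // Status values follow the conventions above: 100 numeric, 200 letter,
      // 300 kana, 400 ideographic, 0 for none (e.g. the space).
      System.out.println("\"" + text.substring(start, end) + "\" -> " + bi.getRuleStatus());
    }
  }
}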

View File

@ -1,61 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# This is an example of rule tailoring for Hebrew.
# In this example the single-quote is added to the Extend category
# The double-quote is added to the MidLetter category.
#
!!chain;
$CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
$Newline = [\p{Word_Break = Newline}];
$Extend = [\p{Word_Break = Extend}\u0027];
$Format = [\p{Word_Break = Format}];
$ALetter = [\p{Word_Break = ALetter}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidLetter = [\p{Word_Break = MidLetter}\u0022];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$dictionary = [:LineBreak = Complex_Context:];
$Control = [\p{Grapheme_Cluster_Break = Control}];
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidLetterEx = $MidLetter ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
!!forward;
$CR $LF;
[^$CR $LF $Newline]? ($Extend | $Format)+;
$NumericEx {100};
$ALetterEx {200};
$ALetterEx $ALetterEx {200};
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
$NumericEx $NumericEx {100};
$ALetterEx $NumericEx {200};
$NumericEx $ALetterEx {200};
$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
$ALetterEx $ExtendNumLetEx {200};
$NumericEx $ExtendNumLetEx {100};
$ExtendNumLetEx $ExtendNumLetEx {200};
$ExtendNumLetEx $ALetterEx {200};
$ExtendNumLetEx $NumericEx {100};

View File

@ -1,192 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Parses Lao text, with syllable as token.
#
# The definition of Lao syllable is based from:
#
# Syllabification of Lao Script for Line Breaking
# Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
# Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP
# http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
# http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
#
# NOTE:
# There are some ambiguities in Lao syllabification without additional processing, as mentioned in the paper.
# For this reason, this RBBI grammar really only works with LaoBreakIterator, as it does this additional work.
#
# Syllable structure, where X is the nuclear consonant:
#
# +----+
# | X5 |
# +----+
# | X4 |
# +----+----+----+----+----+----+----+-----+
# | X0 | X1 | X | X6 | X7 | X8 | X9 | X10 |
# +----+----+----+----+----+----+----+-----+
# | X2 |
# +----+
# | X3 |
# +----+
#
# X0 represents a vowel which occurs before the nuclear consonant.
# It can always define the beginning of syllable.
$X0 = [\u0EC0-\u0EC4];
# X1 is a combination consonant which comes before the nuclear consonant,
# but only if nuclear consonant is one of {ງ ຍ ລ ວ ຼ ມ ນ ຣ}
$X1 = [\u0EAB];
# X represents the nuclear consonant.
$X = [\u0E81-\u0EAE\u0EDC\u0EDD];
# X2 is a combination consonant which comes after the nuclear consonant,
# which is placed under or next to the nuclear consonant.
$X2 = [\u0EBC\u0EA3\u0EA7\u0EA5];
# X3 represents a vowel which occurs under the nuclear consonant.
$X3 = [\u0EB8\u0EB9];
# X4 represents a vowel which occurs above the nuclear consonant.
$X4 = [\u0EB4-\u0EB7\u0ECD\u0EBB\u0EB1];
# X5 represents a tone mark which occurs above the nuclear consonant or upper vowel.
$X5 = [\u0EC8-\u0ECB];
# X6 represents a consonant vowel, which occurs after the nuclear consonant.
# It functions when the syllable doesnt have any vowels. And it always exists with X8.
$X6 = [\u0EA7\u0EAD\u0EBD];
# X7 represents a final vowel.
# However X7_1 always represents the end of syllable and it never exists with tone mark.
$X7 = [\u0EB0\u0EB2\u0EB3];
# X8 represents an alternate consonant.
$X8 = [\u0E81\u0E87\u0E8D\u0E94\u0E99\u0EA1\u0E9A\u0EA7];
# X9 represents alternate consonants to pronounce foreign terms, it always exist with X10_3.
$X9 = [\u0E88\u0EAA\u0E8A\u0E9E\u0E9F\u0EA5];
# X10 represents a sign mark.
# It always occurs at the end of a syllable, but mostly people keep it separate from syllable.
$X10 = [\u0EAF\u0EC6\u0ECC];
# Section 1
$X0_1 = [\u0EC0];
$X4_1_2 = [\u0EB4\u0EB5];
$X4_3_4 = [\u0EB6\u0EB7];
$X4_6 = [\u0EBB];
$X4_7 = [\u0EB1];
$X6_2 = [\u0EAD];
$X6_3 = [\u0EBD];
$X7_1 = [\u0EB0];
$X7_2 = [\u0EB2];
$X10_1 = [\u0EAF];
$X10_2 = [\u0EC6];
$X10_3 = [\u0ECC];
$Rule1_1 = $X0_1 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_2 = $X0_1 ($X1)? $X ($X2)? $X4_1_2 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_3 = $X0_1 ($X1)? $X ($X2)? $X4_3_4 ($X5)? $X6_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_4 = $X0_1 ($X1)? $X ($X2)? ($X7_2)? $X7_1;
$Rule1_5 = $X0_1 ($X1)? $X ($X2)? $X4_6 ($X5)? $X7_2 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_6 = $X0_1 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_7 = $X0_1 ($X1)? $X ($X2)? ($X4_7)? ($X5)? $X6_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1 = ($Rule1_1 | $Rule1_2 | $Rule1_3 | $Rule1_4 | $Rule1_5 | $Rule1_6 | $Rule1_7);
# Section 2
$X0_2 = [\u0EC1];
$Rule2_1 = $X0_2 ($X1)? $X ($X2)? ($X5)? ($X6)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule2_2 = $X0_2 ($X1)? $X ($X2)? $X7_1;
$Rule2_3 = $X0_2 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule2 = ($Rule2_1 | $Rule2_2 | $Rule2_3);
# Section 3
$X0_3 = [\u0EC2];
$X8_3 = [\u0E8D];
$X8_8 = [\u0EA7];
$Rule3_1 = $X0_3 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule3_2 = $X0_3 ($X1)? $X ($X2)? $X7_1;
$Rule3_3 = $X0_3 ($X1)? $X ($X2)? $X4_7 ($X5)? ($X8_3 | $X8_8);
$Rule3 = ($Rule3_1 | $Rule3_2 | $Rule3_3);
# Section 4
$X0_4 = [\u0EC4];
$X6_1 = [\u0EA7];
$Rule4 = $X0_4 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 5
$X0_5 = [\u0EC3];
$Rule5 = $X0_5 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 6
$Rule6 = ($X1)? $X ($X2)? $X3 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 7
$X4_1_4 = [\u0EB4-\u0EB7];
$Rule7 = ($X1)? $X ($X2)? $X4_1_4 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 8
$X4_5 = [\u0ECD];
$Rule8 = ($X1)? $X ($X2)? $X4_5 ($X5)? ($X7_2)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 9
$Rule9_1 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule9_2 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X6_1 $X7_1;
$Rule9 = ($Rule9_1 | $Rule9_2);
# Section 10
$Rule10 = ($X1)? $X ($X2)? $X4_7 ($X5)? ($X6_1)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 11
$Rule11 = ($X1)? $X ($X2)? ($X5)? $X6 $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 12
$Rule12 = ($X1)? $X ($X2)? ($X5)? $X7_1;
# Section 13
$Rule13 = ($X1)? $X ($X2)? ($X5)? $X7_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
# Section 14
$X7_3 = [\u0EB3];
$Rule14 = ($X1)? $X ($X2)? ($X5)? $X7_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$LaoSyllableEx = ($Rule1 | $Rule2 | $Rule3 | $Rule4 | $Rule5 | $Rule6 | $Rule7 | $Rule8 | $Rule9 | $Rule10 | $Rule11 | $Rule12 | $Rule13 | $Rule14);
$WordJoin = [:Line_Break=Word_Joiner:];
$LaoJoinedSyllableEx = $LaoSyllableEx ($WordJoin $LaoSyllableEx)*;
#
# default numerical definitions
#
$Extend = [\p{Word_Break = Extend}];
$Format = [\p{Word_Break = Format}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
!!forward;
$LaoJoinedSyllableEx {200};
# default numeric rules
$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};

View File

@ -78,7 +78,6 @@ FF0D>002D
## Space Folding
# Rule: [[:Zs:] - [:Changes_When_NFKC_Casefolded=Yes:] - [\u0020]] > 0020
1680>0020
180E>0020
## Spacing Accents folding (done by kd)

View File

@ -1,4 +1,4 @@
# Copyright (C) 1999-2012, International Business Machines
# Copyright (C) 1999-2013, International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: nfc.txt
@ -7,7 +7,7 @@
#
# Complete data for Unicode NFC normalization.
* Unicode 6.1.0
* Unicode 6.3.0
# Canonical_Combining_Class (ccc) values
0300..0314:230

View File

@ -1,4 +1,4 @@
# Copyright (C) 1999-2012, International Business Machines
# Copyright (C) 1999-2013, International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: nfkc.txt
@ -11,7 +11,7 @@
# to NFKC one-way mappings.
# Use this file as the second gennorm2 input file after nfc.txt.
* Unicode 6.1.0
* Unicode 6.3.0
00A0>0020
00A8>0020 0308

View File

@ -1,5 +1,5 @@
# Unicode Character Database
# Copyright (c) 1991-2012 Unicode, Inc.
# Copyright (c) 1991-2013 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
@ -12,7 +12,7 @@
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.
* Unicode 6.1.0
* Unicode 6.3.0
0041>0061
0042>0062
@ -537,6 +537,7 @@
0555>0585
0556>0586
0587>0565 0582
061C>
0675>0627 0674
0676>0648 0674
0677>06C7 0674
@ -627,7 +628,7 @@
10FC>10DC
115F..1160>
17B4..17B5>
180B..180D>
180B..180E>
1D2C>0061
1D2D>00E6
1D2E>0062

View File

@ -21,7 +21,6 @@ import java.text.CharacterIterator;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.DictionaryBasedBreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16;
@ -60,15 +59,12 @@ abstract class BreakIteratorWrapper {
}
/**
* If its a DictionaryBasedBreakIterator, it doesn't return rulestatus, so
* treat it like a generic BreakIterator If its any other
* RuleBasedBreakIterator, the rule status can be used for token type. If its
* If it's a RuleBasedBreakIterator, the rule status can be used for token type. If it's
* any other BreakIterator, the rulestatus method is not available, so treat
* it like a generic BreakIterator.
*/
static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
if (breakIterator instanceof RuleBasedBreakIterator
&& !(breakIterator instanceof DictionaryBasedBreakIterator))
if (breakIterator instanceof RuleBasedBreakIterator)
return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
else
return new BIWrapper(breakIterator);
View File
@ -41,12 +41,13 @@ final class CompositeBreakIterator {
private final BreakIteratorWrapper wordBreakers[] = new BreakIteratorWrapper[UScript.CODE_LIMIT];
private BreakIteratorWrapper rbbi;
private final ScriptIterator scriptIterator = new ScriptIterator();
private final ScriptIterator scriptIterator;
private char text[];
CompositeBreakIterator(ICUTokenizerConfig config) {
this.config = config;
this.scriptIterator = new ScriptIterator(config.combineCJ());
}
/**
View File
@ -35,12 +35,9 @@ import com.ibm.icu.util.ULocale;
* ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}),
* but with the following tailorings:
* <ul>
* <li>Thai text is broken into words with a
* {@link com.ibm.icu.text.DictionaryBasedBreakIterator}
* <li>Lao, Myanmar, and Khmer text is broken into syllables
* <li>Thai, Lao, and CJK text is broken into words with a dictionary.
* <li>Myanmar and Khmer text is broken into syllables
* based on custom BreakIterator rules.
* <li>Hebrew text has custom tailorings to handle special cases
* involving punctuation.
* </ul>
* @lucene.experimental
*/
@ -62,34 +59,44 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
* the default breakiterators in use. these can be expensive to
* instantiate, cheap to clone.
*/
private static final BreakIterator rootBreakIterator =
// we keep the cjk breaking separate, that's because it cannot be customized (because dictionary
// is only triggered when kind = WORD, but kind = LINE by default and we have no non-evil way to change it)
private static final BreakIterator cjkBreakIterator = BreakIterator.getWordInstance(ULocale.ROOT);
// the same as ROOT, except no dictionary segmentation for cjk
private static final BreakIterator defaultBreakIterator =
readBreakIterator("Default.brk");
private static final BreakIterator thaiBreakIterator =
BreakIterator.getWordInstance(new ULocale("th_TH"));
private static final BreakIterator hebrewBreakIterator =
readBreakIterator("Hebrew.brk");
private static final BreakIterator khmerBreakIterator =
readBreakIterator("Khmer.brk");
private static final BreakIterator laoBreakIterator =
new LaoBreakIterator(readBreakIterator("Lao.brk"));
private static final BreakIterator myanmarBreakIterator =
readBreakIterator("Myanmar.brk");
// TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
private final boolean cjkAsWords;
/**
* Creates a new config. This object is lightweight, but the first
* time the class is referenced, breakiterators will be initialized.
* @param cjkAsWords true if cjk text should undergo dictionary-based segmentation,
* otherwise text will be segmented according to UAX#29 defaults.
* If this is true, all Han+Hiragana+Katakana words will be tagged as
* IDEOGRAPHIC.
*/
public DefaultICUTokenizerConfig() {}
public DefaultICUTokenizerConfig(boolean cjkAsWords) {
this.cjkAsWords = cjkAsWords;
}
@Override
public boolean combineCJ() {
return cjkAsWords;
}
@Override
public BreakIterator getBreakIterator(int script) {
switch(script) {
case UScript.THAI: return (BreakIterator)thaiBreakIterator.clone();
case UScript.HEBREW: return (BreakIterator)hebrewBreakIterator.clone();
case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
case UScript.LAO: return (BreakIterator)laoBreakIterator.clone();
case UScript.MYANMAR: return (BreakIterator)myanmarBreakIterator.clone();
default: return (BreakIterator)rootBreakIterator.clone();
case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
default: return (BreakIterator)defaultBreakIterator.clone();
}
}
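In practice the new flag toggles between these two behaviors; a minimal sketch (not part of the patch; the demo class and input string are illustrative, assuming lucene-analyzers-icu and ICU4J on the classpath):

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig;
import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;

public class CjkAsWordsSketch {
  public static void main(String[] args) {
    // cjkAsWords=true sends Han/Hiragana/Katakana runs through the CJK
    // dictionary, yielding words such as 购买; cjkAsWords=false keeps the
    // old UAX#29 behavior of one token per ideograph
    Tokenizer dictionaryCjk = new ICUTokenizer(
        new StringReader("我购买了道具和服装。"), new DefaultICUTokenizerConfig(true));
    Tokenizer uax29Only = new ICUTokenizer(
        new StringReader("我购买了道具和服装。"), new DefaultICUTokenizerConfig(false));
  }
}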
View File
@ -68,7 +68,7 @@ public final class ICUTokenizer extends Tokenizer {
* @see DefaultICUTokenizerConfig
*/
public ICUTokenizer(Reader input) {
this(input, new DefaultICUTokenizerConfig());
this(input, new DefaultICUTokenizerConfig(true));
}
/**
View File
@ -36,4 +36,6 @@ public abstract class ICUTokenizerConfig {
/** Return a token type value for a given script and BreakIterator
* rule status. */
public abstract String getType(int script, int ruleStatus);
/** true if Han, Hiragana, and Katakana scripts should all be returned as Japanese */
public abstract boolean combineCJ();
}
View File
@ -70,7 +70,7 @@ import com.ibm.icu.text.RuleBasedBreakIterator;
* <pre class="prettyprint" >
* &lt;fieldType name="text_icu_custom" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.ICUTokenizerFactory"
* &lt;tokenizer class="solr.ICUTokenizerFactory" cjkAsWords="true"
* rulefiles="Latn:my.Latin.rules.rbbi,Cyrl:my.Cyrillic.rules.rbbi"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
@ -79,6 +79,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
static final String RULEFILES = "rulefiles";
private final Map<Integer,String> tailored;
private ICUTokenizerConfig config;
private final boolean cjkAsWords;
/** Creates a new ICUTokenizerFactory */
public ICUTokenizerFactory(Map<String,String> args) {
@ -94,6 +95,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
tailored.put(UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode), resourcePath);
}
}
cjkAsWords = getBoolean(args, "cjkAsWords", true);
if (!args.isEmpty()) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
@ -103,7 +105,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
public void inform(ResourceLoader loader) throws IOException {
assert tailored != null : "init must be called first!";
if (tailored.isEmpty()) {
config = new DefaultICUTokenizerConfig();
config = new DefaultICUTokenizerConfig(cjkAsWords);
} else {
final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT];
for (Map.Entry<Integer,String> entry : tailored.entrySet()) {
@ -111,7 +113,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
String resourcePath = entry.getValue();
breakers[code] = parseRules(resourcePath, loader);
}
config = new DefaultICUTokenizerConfig() {
config = new DefaultICUTokenizerConfig(cjkAsWords) {
@Override
public BreakIterator getBreakIterator(int script) {
View File
@ -1,230 +0,0 @@
package org.apache.lucene.analysis.icu.segmentation;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.text.CharacterIterator;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UnicodeSet;
/**
* Syllable iterator for Lao text.
* <p>
* This breaks Lao text into syllables according to:
* <i>Syllabification of Lao Script for Line Breaking</i>
* Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
* Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP.
* <ul>
* <li>http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
* <li>http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
* </ul>
* <p>
* Most work is accomplished with RBBI rules; however, some additional special logic is needed
* that cannot be coded in a grammar, and this is implemented here.
* <p>
* For example, what appears to be a final consonant might instead be part of the next syllable.
* Rules match in a greedy fashion, leaving an illegal sequence that matches no rules.
* <p>
* Take for instance the text ກວ່າດອກ
* The first rule greedily matches ກວ່າດ, but then ອກ is encountered, which is illegal.
* What LaoBreakIterator does, according to the paper:
* <ol>
* <li>backtrack and remove the ດ from the last syllable, placing it on the current syllable.
* <li>verify the modified previous syllable (ກວ່າ) is still legal.
* <li>verify the modified current syllable (ດອກ) is now legal.
* <li>If 2 or 3 fails, then restore the ດ to the last syllable and skip the current character.
* </ol>
* <p>
* Finally, LaoBreakIterator also takes care of the second concern mentioned in the paper.
* This is the issue of combining marks being in the wrong order (typos).
* @lucene.experimental
*/
public class LaoBreakIterator extends BreakIterator {
RuleBasedBreakIterator rules;
CharArrayIterator text;
CharArrayIterator working = new CharArrayIterator();
int workingOffset = 0;
CharArrayIterator verifyText = new CharArrayIterator();
RuleBasedBreakIterator verify;
private static final UnicodeSet laoSet;
static {
laoSet = new UnicodeSet("[:Lao:]");
laoSet.compact();
laoSet.freeze();
}
/**
* Creates a new iterator, performing the backtracking verification
* across the provided <code>rules</code>.
*/
public LaoBreakIterator(RuleBasedBreakIterator rules) {
this.rules = (RuleBasedBreakIterator) rules.clone();
this.verify = (RuleBasedBreakIterator) rules.clone();
}
@Override
public int current() {
int current = rules.current();
return current == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + current;
}
@Override
public int first() {
working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
rules.setText(working);
workingOffset = 0;
int first = rules.first();
return first == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + first;
}
@Override
public int following(int offset) {
throw new UnsupportedOperationException();
}
@Override
public CharacterIterator getText() {
return text;
}
@Override
public int last() {
throw new UnsupportedOperationException();
}
@Override
public int next() {
int current = current();
int next = rules.next();
if (next == BreakIterator.DONE)
return next;
else
next += workingOffset;
char c = working.current();
int following = rules.next(); // lookahead
if (following != BreakIterator.DONE) {
following += workingOffset;
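// the lookahead chunk matched no rule (status 0) and the boundary character
// is Lao: per step 1 of the paper, try giving this syllable's final
// character to the following chunk before accepting the boundary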
if (rules.getRuleStatus() == 0 && laoSet.contains(c) && verifyPushBack(current, next)) {
workingOffset = next - 1;
working.setText(text.getText(), text.getStart() + workingOffset, text.getLength() - workingOffset);
return next - 1;
}
rules.previous(); // undo the lookahead
}
return next;
}
@Override
public int next(int n) {
if (n < 0)
throw new UnsupportedOperationException("Backwards traversal is unsupported");
int result = current();
while (n > 0) {
result = next();
--n;
}
return result;
}
@Override
public int previous() {
throw new UnsupportedOperationException("Backwards traversal is unsupported");
}
@Override
public void setText(CharacterIterator text) {
if (!(text instanceof CharArrayIterator))
throw new UnsupportedOperationException("unsupported CharacterIterator");
this.text = (CharArrayIterator) text;
ccReorder(this.text.getText(), this.text.getStart(), this.text.getLength());
working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
rules.setText(working);
workingOffset = 0;
}
@Override
public void setText(String newText) {
CharArrayIterator ci = new CharArrayIterator();
ci.setText(newText.toCharArray(), 0, newText.length());
setText(ci);
}
private boolean verifyPushBack(int current, int next) {
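// steps 2 and 3 from the paper: the shortened previous syllable and the
// extended current syllable must each still match a rule (nonzero rule
// status) for the pushback to be accepted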
int shortenedSyllable = next - current - 1;
verifyText.setText(text.getText(), text.getStart() + current, shortenedSyllable);
verify.setText(verifyText);
if (verify.next() != shortenedSyllable || verify.getRuleStatus() == 0)
return false;
verifyText.setText(text.getText(), text.getStart() + next - 1, text.getLength() - next + 1);
verify.setText(verifyText);
return (verify.next() != BreakIterator.DONE && verify.getRuleStatus() != 0);
}
// TODO: only bubblesort around runs of combining marks, instead of the entire text.
private void ccReorder(char[] text, int start, int length) {
boolean reordered;
do {
int prevCC = 0;
reordered = false;
for (int i = start; i < start + length; i++) {
final char c = text[i];
final int cc = UCharacter.getCombiningClass(c);
if (cc > 0 && cc < prevCC) {
// swap
text[i] = text[i - 1];
text[i - 1] = c;
reordered = true;
} else {
prevCC = cc;
}
}
} while (reordered == true);
}
/**
* Clone method. Creates another LaoBreakIterator with the same behavior
* and current state as this one.
* @return The clone.
*/
@Override
public LaoBreakIterator clone() {
LaoBreakIterator other = (LaoBreakIterator) super.clone();
other.rules = (RuleBasedBreakIterator) rules.clone();
other.verify = (RuleBasedBreakIterator) verify.clone();
if (text != null)
other.text = text.clone();
if (working != null)
other.working = working.clone();
if (verifyText != null)
other.verifyText = verifyText.clone();
return other;
}
}
View File
@ -59,6 +59,15 @@ final class ScriptIterator {
private int scriptStart;
private int scriptLimit;
private int scriptCode;
private final boolean combineCJ;
/**
* @param combineCJ if true: Han,Hiragana,Katakana will all return as {@link UScript#JAPANESE}
*/
ScriptIterator(boolean combineCJ) {
this.combineCJ = combineCJ;
}
/**
* Get the start of this script run
@ -162,10 +171,24 @@ final class ScriptIterator {
}
/** fast version of UScript.getScript(). Basic Latin is an array lookup */
private static int getScript(int codepoint) {
if (0 <= codepoint && codepoint < basicLatin.length)
private int getScript(int codepoint) {
if (0 <= codepoint && codepoint < basicLatin.length) {
return basicLatin[codepoint];
else
return UScript.getScript(codepoint);
} else {
int script = UScript.getScript(codepoint);
if (combineCJ) {
if (script == UScript.HAN || script == UScript.HIRAGANA || script == UScript.KATAKANA) {
return UScript.JAPANESE;
} else if (codepoint >= 0xFF10 && codepoint <= 0xFF19) {
// when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
// they are treated as punctuation. we currently have no cleaner way to fix this!
return UScript.LATIN;
} else {
return script;
}
} else {
return script;
}
}
}
}
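For intuition, a small sketch (illustrative only, assuming ICU4J on the classpath) of the script values involved in the mapping above:

import com.ibm.icu.lang.UScript;

public class CombineCjSketch {
  public static void main(String[] args) {
    // these three scripts collapse into a single JAPANESE run when combineCJ
    // is enabled, so the dictionary breaker sees them together
    System.out.println(UScript.getScript('中') == UScript.HAN);      // true
    System.out.println(UScript.getScript('か') == UScript.HIRAGANA); // true
    System.out.println(UScript.getScript('カ') == UScript.KATAKANA); // true

    // fullwidth digits (U+FF10..U+FF19) are forced to LATIN instead, so the
    // dictionary never mistakes them for punctuation
    char c = '９';
    System.out.println(c >= 0xFF10 && c <= 0xFF19); // true
  }
}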
View File
@ -84,6 +84,10 @@ public class ScriptAttributeImpl extends AttributeImpl implements ScriptAttribut
@Override
public void reflectWith(AttributeReflector reflector) {
reflector.reflect(ScriptAttribute.class, "script", getName());
// when wordbreaking CJK, we use the ISO 15924 code Japanese (Han+Hiragana+Katakana) to
// mark runs of Chinese/Japanese. our use is correct (for Chinese, Han is a subset),
// but this is just to help prevent confusion.
String name = code == UScript.JAPANESE ? "Chinese/Japanese" : getName();
reflector.reflect(ScriptAttribute.class, "script", name);
}
}
View File
@ -14,6 +14,7 @@
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- :Post-Release-Update-Version.LUCENE_XY: - several mentions in this file -->
<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
@ -114,9 +115,9 @@ algorithm.
<h3>Farsi Range Queries</h3>
<pre class="prettyprint">
Collator collator = Collator.getInstance(new ULocale("ar"));
ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_50, collator);
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_50, analyzer));
Document doc = new Document();
doc.add(new Field("content", "\u0633\u0627\u0628",
Field.Store.YES, Field.Index.ANALYZED));
@ -124,7 +125,7 @@ algorithm.
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
QueryParser aqp = new QueryParser(Version.LUCENE_40, "content", analyzer);
QueryParser aqp = new QueryParser(Version.LUCENE_50, "content", analyzer);
aqp.setAnalyzeRangeTerms(true);
// Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
@ -140,9 +141,9 @@ algorithm.
<h3>Danish Sorting</h3>
<pre class="prettyprint">
Analyzer analyzer
= new ICUCollationKeyAnalyzer(Version.LUCENE_40, Collator.getInstance(new ULocale("da", "dk")));
= new ICUCollationKeyAnalyzer(Version.LUCENE_50, Collator.getInstance(new ULocale("da", "dk")));
RAMDirectory indexStore = new RAMDirectory();
IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(Version.LUCENE_40, analyzer));
IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(Version.LUCENE_50, analyzer));
String[] tracer = new String[] { "A", "B", "C", "D", "E" };
String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
@ -168,15 +169,15 @@ algorithm.
<pre class="prettyprint">
Collator collator = Collator.getInstance(new ULocale("tr", "TR"));
collator.setStrength(Collator.PRIMARY);
Analyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
Analyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_50, collator);
RAMDirectory ramDir = new RAMDirectory();
IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_50, analyzer));
Document doc = new Document();
doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
writer.addDocument(doc);
writer.close();
IndexSearcher is = new IndexSearcher(ramDir, true);
QueryParser parser = new QueryParser(Version.LUCENE_40, "contents", analyzer);
QueryParser parser = new QueryParser(Version.LUCENE_50, "contents", analyzer);
Query query = parser.parse("d\u0131gy"); // U+0131: dotless i
ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
assertEquals("The index Term should be included.", 1, result.length);
@ -353,7 +354,7 @@ and
<h1><a name="backcompat">Backwards Compatibility</a></h1>
<p>
This module exists to provide up-to-date Unicode functionality that supports
the most recent version of Unicode (currently 6.1). However, some users who wish
the most recent version of Unicode (currently 6.3). However, some users who wish
for stronger backwards compatibility can restrict
{@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}.
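For example, a short sketch (not from the patch; the method name and the 6.0 cutoff are illustrative) of pinning normalization to an older Unicode version with FilteredNormalizer2:

import com.ibm.icu.text.FilteredNormalizer2;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.UnicodeSet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;

public class PinnedNormalizationSketch {
  // wraps any tokenizer so that only characters defined up to Unicode 6.0
  // are normalized; newer characters pass through unchanged, keeping index
  // terms stable across ICU upgrades
  static TokenStream pinToUnicode60(Tokenizer tokenizer) {
    Normalizer2 nfkcCf = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
    UnicodeSet upTo60 = new UnicodeSet("[:age=6.0:]").freeze();
    return new ICUNormalizer2Filter(tokenizer, new FilteredNormalizer2(nfkcCf, upTo60));
  }
}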
View File
@ -42,7 +42,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
sb.append(whitespace);
sb.append("testing 1234");
String input = sb.toString();
ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input), new DefaultICUTokenizerConfig(false));
assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
@ -52,7 +52,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
sb.append('a');
}
String input = sb.toString();
ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input), new DefaultICUTokenizerConfig(false));
char token[] = new char[4096];
Arrays.fill(token, 'a');
String expectedToken = new String(token);
@ -69,7 +69,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
Tokenizer tokenizer = new ICUTokenizer(reader);
Tokenizer tokenizer = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
return new TokenStreamComponents(tokenizer, filter);
}
@ -118,6 +118,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
public void testLao() throws Exception {
assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
assertAnalyzesTo(a, "ພາສາລາວ", new String[] { "ພາສາ", "ລາວ"}, new String[] { "<ALPHANUM>", "<ALPHANUM>" });
}
public void testThai() throws Exception {
@ -138,6 +139,13 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
new String[] { "", "", "", "", "", "1234", "tests"});
}
public void testHebrew() throws Exception {
assertAnalyzesTo(a, "דנקנר תקף את הדו\"ח",
new String[] { "דנקנר", "תקף", "את", "הדו\"ח" });
assertAnalyzesTo(a, "חברת בת של מודי'ס",
new String[] { "חברת", "בת", "של", "מודי'ס" });
}
public void testEmpty() throws Exception {
assertAnalyzesTo(a, "", new String[] {});
assertAnalyzesTo(a, ".", new String[] {});
View File
@ -0,0 +1,91 @@
package org.apache.lucene.analysis.icu.segmentation;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
/**
* test ICUTokenizer with dictionary-based CJ segmentation
*/
public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
return new TokenStreamComponents(new ICUTokenizer(reader));
}
};
/**
* test stolen from smartcn
*/
public void testSimpleChinese() throws Exception {
assertAnalyzesTo(a, "我购买了道具和服装。",
new String[] { "", "购买", "", "道具", "", "服装" }
);
}
public void testChineseNumerics() throws Exception {
assertAnalyzesTo(a, "", new String[] { "" });
assertAnalyzesTo(a, "院內分機9483。",
new String[] { "", "", "分機", "" });
assertAnalyzesTo(a, "院內分機9483。",
new String[] { "", "", "分機", "9483" });
}
/**
* test stolen from kuromoji
*/
public void testSimpleJapanese() throws Exception {
assertAnalyzesTo(a, "それはまだ実験段階にあります",
new String[] { "それ", "", "まだ", "実験", "段階", "", "あり", "ます" }
);
}
public void testJapaneseTypes() throws Exception {
assertAnalyzesTo(a, "仮名遣い カタカナ",
new String[] { "仮名遣い", "カタカナ" },
new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
}
public void testKorean() throws Exception {
// Korean words
assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
}
/** make sure that we still tag korean as HANGUL (for further decomposition/ngram/whatever) */
public void testKoreanTypes() throws Exception {
assertAnalyzesTo(a, "훈민정음",
new String[] { "훈민정음" },
new String[] { "<HANGUL>" });
}
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
}
/** blast some random large strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
Random random = random();
checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 8192);
}
}
View File
@ -1,90 +0,0 @@
package org.apache.lucene.analysis.icu.segmentation;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.InputStream;
import org.apache.lucene.util.LuceneTestCase;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16;
/**
* Tests LaoBreakIterator and its RBBI rules
*/
public class TestLaoBreakIterator extends LuceneTestCase {
private BreakIterator wordIterator;
@Override
public void setUp() throws Exception {
super.setUp();
InputStream is = getClass().getResourceAsStream("Lao.brk");
wordIterator = new LaoBreakIterator(RuleBasedBreakIterator.getInstanceFromCompiledRules(is));
is.close();
}
private void assertBreaksTo(BreakIterator iterator, String sourceText, String tokens[]) {
char text[] = sourceText.toCharArray();
CharArrayIterator ci = new CharArrayIterator();
ci.setText(text, 0, text.length);
iterator.setText(ci);
for (int i = 0; i < tokens.length; i++) {
int start, end;
do {
start = iterator.current();
end = iterator.next();
} while (end != BreakIterator.DONE && !isWord(text, start, end));
assertTrue(start != BreakIterator.DONE);
assertTrue(end != BreakIterator.DONE);
assertEquals(tokens[i], new String(text, start, end - start));
}
assertTrue(iterator.next() == BreakIterator.DONE);
}
protected boolean isWord(char text[], int start, int end) {
int codepoint;
for (int i = start; i < end; i += UTF16.getCharCount(codepoint)) {
codepoint = UTF16.charAt(text, 0, end, start);
if (UCharacter.isLetterOrDigit(codepoint))
return true;
}
return false;
}
public void testBasicUsage() throws Exception {
assertBreaksTo(wordIterator, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
assertBreaksTo(wordIterator, "ຜູ້​ເຂົ້າ", new String[] { "ຜູ້", "ເຂົ້າ" });
assertBreaksTo(wordIterator, "", new String[] {});
assertBreaksTo(wordIterator, "ສະບາຍດີ", new String[] { "ສະ", "ບາຍ", "ດີ" });
}
public void testNumerics() throws Exception {
assertBreaksTo(wordIterator, "໐໑໒໓", new String[] { "໐໑໒໓" });
assertBreaksTo(wordIterator, "໐໑໒໓.໕໖", new String[] { "໐໑໒໓.໕໖" });
}
public void testTextAndNumerics() throws Exception {
assertBreaksTo(wordIterator, "ກວ່າດອກ໐໑໒໓", new String[] { "ກວ່າ", "ດອກ", "໐໑໒໓" });
}
}
View File
@ -41,7 +41,7 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new ICUTokenizer(reader);
Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
TokenStream result = new CJKBigramFilter(source);
return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
}
@ -56,7 +56,7 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
private Analyzer analyzer2 = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer source = new ICUTokenizer(reader);
Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
// we put this before the CJKBigramFilter, because the normalization might combine
// some halfwidth katakana forms, which will affect the bigramming.
TokenStream result = new ICUNormalizer2Filter(source);
View File
@ -36,40 +36,45 @@ public class GenerateJFlexSupplementaryMacros {
static {
DATE_FORMAT.setTimeZone(TimeZone.getTimeZone("UTC"));
}
private static final String APACHE_LICENSE
= "/*" + NL
+ " * Copyright 2010 The Apache Software Foundation." + NL
private static final String APACHE_LICENSE
= "/*" + NL
+ " * Licensed to the Apache Software Foundation (ASF) under one or more" + NL
+ " * contributor license agreements. See the NOTICE file distributed with" + NL
+ " * this work for additional information regarding copyright ownership." + NL
+ " * The ASF licenses this file to You under the Apache License, Version 2.0" + NL
+ " * (the \"License\"); you may not use this file except in compliance with" + NL
+ " * the License. You may obtain a copy of the License at" + NL
+ " *" + NL
+ " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
+ " * you may not use this file except in compliance with the License." + NL
+ " * You may obtain a copy of the License at" + NL
+ " *" + NL
+ " * http://www.apache.org/licenses/LICENSE-2.0" + NL
+ " * http://www.apache.org/licenses/LICENSE-2.0" + NL
+ " *" + NL
+ " * Unless required by applicable law or agreed to in writing, software" + NL
+ " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
+ " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
+ " * See the License for the specific language governing permissions and" + NL
+ " * limitations under the License." + NL
+ " */" + NL + NL;
+ " */" + NL;
public static void main(String args[]) {
outputHeader();
outputMacro("ALetterSupp", "[:WordBreak=ALetter:]");
outputMacro("FormatSupp", "[:WordBreak=Format:]");
outputMacro("ExtendSupp", "[:WordBreak=Extend:]");
outputMacro("NumericSupp", "[:WordBreak=Numeric:]");
outputMacro("KatakanaSupp", "[:WordBreak=Katakana:]");
outputMacro("MidLetterSupp", "[:WordBreak=MidLetter:]");
outputMacro("MidNumSupp", "[:WordBreak=MidNum:]");
outputMacro("MidNumLetSupp", "[:WordBreak=MidNumLet:]");
outputMacro("ExtendNumLetSupp", "[:WordBreak=ExtendNumLet:]");
outputMacro("ExtendNumLetSupp", "[:WordBreak=ExtendNumLet:]");
outputMacro("ComplexContextSupp", "[:LineBreak=Complex_Context:]");
outputMacro("HanSupp", "[:Script=Han:]");
outputMacro("HiraganaSupp", "[:Script=Hiragana:]");
outputMacro("ALetterSupp", "[:WordBreak=ALetter:]");
outputMacro("FormatSupp", "[:WordBreak=Format:]");
outputMacro("NumericSupp", "[:WordBreak=Numeric:]");
outputMacro("ExtendSupp", "[:WordBreak=Extend:]");
outputMacro("KatakanaSupp", "[:WordBreak=Katakana:]");
outputMacro("MidLetterSupp", "[:WordBreak=MidLetter:]");
outputMacro("MidNumSupp", "[:WordBreak=MidNum:]");
outputMacro("MidNumLetSupp", "[:WordBreak=MidNumLet:]");
outputMacro("ExtendNumLetSupp", "[:WordBreak=ExtendNumLet:]");
outputMacro("ExtendNumLetSupp", "[:WordBreak=ExtendNumLet:]");
outputMacro("ComplexContextSupp", "[:LineBreak=Complex_Context:]");
outputMacro("HanSupp", "[:Script=Han:]");
outputMacro("HiraganaSupp", "[:Script=Hiragana:]");
outputMacro("SingleQuoteSupp", "[:WordBreak=Single_Quote:]");
outputMacro("DoubleQuoteSupp", "[:WordBreak=Double_Quote:]");
outputMacro("HebrewLetterSupp", "[:WordBreak=Hebrew_Letter:]");
outputMacro("RegionalIndicatorSupp", "[:WordBreak=Regional_Indicator:]");
}
static void outputHeader() {
View File
@ -62,7 +62,7 @@ import java.util.regex.Pattern;
public class GenerateUTR30DataFiles {
private static final String ICU_SVN_TAG_URL
= "http://source.icu-project.org/repos/icu/icu/tags";
private static final String ICU_RELEASE_TAG = "release-49-1-2";
private static final String ICU_RELEASE_TAG = "release-52-1";
private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
private static final String NFC_TXT = "nfc.txt";
private static final String NFKC_TXT = "nfkc.txt";
View File
@ -97,7 +97,8 @@ public class CreateIndexTask extends PerfTask {
}
public static IndexWriterConfig createWriterConfig(Config config, PerfRunData runData, OpenMode mode, IndexCommit commit) {
Version version = Version.valueOf(config.get("writer.version", Version.LUCENE_40.toString()));
// :Post-Release-Update-Version.LUCENE_XY:
Version version = Version.valueOf(config.get("writer.version", Version.LUCENE_50.toString()));
IndexWriterConfig iwConf = new IndexWriterConfig(version, runData.getAnalyzer());
iwConf.setOpenMode(mode);
IndexDeletionPolicy indexDeletionPolicy = getIndexDeletionPolicy(config);
View File
@ -37,7 +37,8 @@ public class CreateIndexTaskTest extends BenchmarkTestCase {
private PerfRunData createPerfRunData(String infoStreamValue) throws Exception {
Properties props = new Properties();
props.setProperty("writer.version", Version.LUCENE_40.toString());
// :Post-Release-Update-Version.LUCENE_XY:
props.setProperty("writer.version", Version.LUCENE_50.toString());
props.setProperty("print.props", "false"); // don't print anything
props.setProperty("directory", "RAMDirectory");
if (infoStreamValue != null) {
View File
@ -49,6 +49,9 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
private final int k;
private Query query;
private int minDocsFreq;
private int minTermFreq;
/**
* Create a {@link Classifier} using kNN algorithm
*
@ -58,6 +61,19 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
this.k = k;
}
/**
* Create a {@link Classifier} using kNN algorithm
*
* @param k the number of neighbors to analyze as an <code>int</code>
* @param minDocsFreq the minimum document frequency for MLT, set with {@link MoreLikeThis#setMinDocFreq(int)}
* @param minTermFreq the minimum term frequency for MLT, set with {@link MoreLikeThis#setMinTermFreq(int)}
*/
public KNearestNeighborClassifier(int k, int minDocsFreq, int minTermFreq) {
this.k = k;
this.minDocsFreq = minDocsFreq;
this.minTermFreq = minTermFreq;
}
/**
* {@inheritDoc}
*/
@ -93,11 +109,11 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
}
double max = 0;
BytesRef assignedClass = new BytesRef();
for (BytesRef cl : classCounts.keySet()) {
Integer count = classCounts.get(cl);
for (Map.Entry<BytesRef, Integer> entry : classCounts.entrySet()) {
Integer count = entry.getValue();
if (count > max) {
max = count;
assignedClass = cl.clone();
assignedClass = entry.getKey().clone();
}
}
double score = max / (double) k;
@ -117,13 +133,7 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
*/
@Override
public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query) throws IOException {
this.textFieldNames = new String[]{textFieldName};
this.classFieldName = classFieldName;
mlt = new MoreLikeThis(atomicReader);
mlt.setAnalyzer(analyzer);
mlt.setFieldNames(new String[]{textFieldName});
indexSearcher = new IndexSearcher(atomicReader);
this.query = query;
train(atomicReader, new String[]{textFieldName}, classFieldName, analyzer, query);
}
/**
@ -137,6 +147,12 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
mlt.setAnalyzer(analyzer);
mlt.setFieldNames(textFieldNames);
indexSearcher = new IndexSearcher(atomicReader);
if (minDocsFreq > 0) {
mlt.setMinDocFreq(minDocsFreq);
}
if (minTermFreq > 0) {
mlt.setMinTermFreq(minTermFreq);
}
this.query = query;
}
}
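A hypothetical caller of the new constructor (the reader, analyzer, and field names are placeholders), matching the pattern the updated test below exercises:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.classification.ClassificationResult;
import org.apache.lucene.classification.KNearestNeighborClassifier;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.util.BytesRef;

public class KnnUsageSketch {
  static ClassificationResult<BytesRef> classify(AtomicReader reader,
      Analyzer analyzer, String unlabeledText) throws IOException {
    // 3 nearest neighbors; MoreLikeThis ignores terms seen in fewer than
    // 2 documents or fewer than 1 time in the source text
    KNearestNeighborClassifier knn = new KNearestNeighborClassifier(3, 2, 1);
    knn.train(reader, "text", "cat", analyzer, null);
    return knn.assignClass(unlabeledText);
  }
}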
View File
@ -64,23 +64,17 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
* {@inheritDoc}
*/
@Override
public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query)
throws IOException {
this.atomicReader = atomicReader;
this.indexSearcher = new IndexSearcher(this.atomicReader);
this.textFieldNames = new String[]{textFieldName};
this.classFieldName = classFieldName;
this.analyzer = analyzer;
this.docsWithClassSize = countDocsWithClass();
this.query = query;
public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer) throws IOException {
train(atomicReader, textFieldName, classFieldName, analyzer, null);
}
/**
* {@inheritDoc}
*/
@Override
public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer) throws IOException {
train(atomicReader, textFieldName, classFieldName, analyzer, null);
public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query)
throws IOException {
train(atomicReader, new String[]{textFieldName}, classFieldName, analyzer, query);
}
/**
@ -137,7 +131,7 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
if (atomicReader == null) {
throw new IOException("You must first call Classifier#train");
}
double max = 0d;
double max = - Double.MAX_VALUE;
BytesRef foundClass = new BytesRef();
Terms terms = MultiFields.getTerms(atomicReader, classFieldName);
@ -145,20 +139,20 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
BytesRef next;
String[] tokenizedDoc = tokenizeDoc(inputDocument);
while ((next = termsEnum.next()) != null) {
// TODO : turn it to be in log scale
double clVal = calculatePrior(next) * calculateLikelihood(tokenizedDoc, next);
double clVal = calculateLogPrior(next) + calculateLogLikelihood(tokenizedDoc, next);
if (clVal > max) {
max = clVal;
foundClass = BytesRef.deepCopyOf(next);
}
}
return new ClassificationResult<BytesRef>(foundClass, max);
double score = 10 / Math.abs(max);
return new ClassificationResult<BytesRef>(foundClass, score);
}
private double calculateLikelihood(String[] tokenizedDoc, BytesRef c) throws IOException {
private double calculateLogLikelihood(String[] tokenizedDoc, BytesRef c) throws IOException {
// for each word
double result = 1d;
double result = 0d;
for (String word : tokenizedDoc) {
// search with text:word AND class:c
int hits = getWordFreqForClass(word, c);
@ -171,10 +165,10 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
// P(w|c) = num/den
double wordProbability = num / den;
result *= wordProbability;
result += Math.log(wordProbability);
}
// P(d|c) = P(w1|c)*...*P(wn|c)
// log(P(d|c)) = log(P(w1|c))+...+log(P(wn|c))
return result;
}
@ -205,8 +199,8 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
return totalHitCountCollector.getTotalHits();
}
private double calculatePrior(BytesRef currentClass) throws IOException {
return (double) docCount(currentClass) / docsWithClassSize;
private double calculateLogPrior(BytesRef currentClass) throws IOException {
return Math.log((double) docCount(currentClass)) - Math.log(docsWithClassSize);
}
private int docCount(BytesRef countedClass) throws IOException {
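The move to log space matters for numerical stability; a standalone sketch (illustrative constants, not from the patch) of the underflow the old per-word probability product ran into:

public class LogSpaceSketch {
  public static void main(String[] args) {
    // multiplying many small per-word probabilities underflows double
    // precision to 0.0, making every class score look identical; summing
    // logs stays finite and preserves the ranking
    double product = 1d;
    double logSum = 0d;
    for (int i = 0; i < 400; i++) {
      product *= 1e-3;
      logSum += Math.log(1e-3);
    }
    System.out.println(product); // 0.0 after underflow
    System.out.println(logSum);  // roughly -2763.1
  }
}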
View File
@ -69,6 +69,7 @@ public class DatasetSplitter {
Analyzer analyzer, String... fieldNames) throws IOException {
// create IWs for train / test / cv IDXs
// :Post-Release-Update-Version.LUCENE_XY:
IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(Version.LUCENE_50, analyzer));
IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(Version.LUCENE_50, analyzer));
IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(Version.LUCENE_50, analyzer));
View File
@ -39,14 +39,17 @@ import java.util.Random;
* Base class for testing {@link Classifier}s
*/
public abstract class ClassificationTestBase<T> extends LuceneTestCase {
public final static String POLITICS_INPUT = "Here are some interesting questions and answers about Mitt Romney.. If you don't know the answer to the question about Mitt Romney, then simply click on the answer below the question section.";
public final static String POLITICS_INPUT = "Here are some interesting questions and answers about Mitt Romney.. " +
"If you don't know the answer to the question about Mitt Romney, then simply click on the answer below the question section.";
public static final BytesRef POLITICS_RESULT = new BytesRef("politics");
public static final String TECHNOLOGY_INPUT = "Much is made of what the likes of Facebook, Google and Apple know about users. Truth is, Amazon may know more.";
public static final String TECHNOLOGY_INPUT = "Much is made of what the likes of Facebook, Google and Apple know about users." +
" Truth is, Amazon may know more.";
public static final BytesRef TECHNOLOGY_RESULT = new BytesRef("technology");
private RandomIndexWriter indexWriter;
private Directory dir;
private FieldType ft;
String textFieldName;
String categoryFieldName;
@ -61,6 +64,10 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
textFieldName = "text";
categoryFieldName = "cat";
booleanFieldName = "bool";
ft = new FieldType(TextField.TYPE_STORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
ft.setStoreTermVectorPositions(true);
}
@Override
@ -72,7 +79,7 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
}
protected void checkCorrectClassification(Classifier<T> classifier, String inputDoc, T expectedResult, Analyzer analyzer, String textFieldName, String classFieldName) throws Exception {
checkCorrectClassification(classifier, inputDoc, expectedResult, analyzer, textFieldName, classFieldName, null);
checkCorrectClassification(classifier, inputDoc, expectedResult, analyzer, textFieldName, classFieldName, null);
}
protected void checkCorrectClassification(Classifier<T> classifier, String inputDoc, T expectedResult, Analyzer analyzer, String textFieldName, String classFieldName, Query query) throws Exception {
@ -90,63 +97,35 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
atomicReader.close();
}
}
protected void checkOnlineClassification(Classifier<T> classifier, String inputDoc, T expectedResult, Analyzer analyzer, String textFieldName, String classFieldName) throws Exception {
checkOnlineClassification(classifier, inputDoc, expectedResult, analyzer, textFieldName, classFieldName, null);
}
protected void checkPerformance(Classifier<T> classifier, Analyzer analyzer, String classFieldName) throws Exception {
protected void checkOnlineClassification(Classifier<T> classifier, String inputDoc, T expectedResult, Analyzer analyzer, String textFieldName, String classFieldName, Query query) throws Exception {
AtomicReader atomicReader = null;
long trainStart = System.currentTimeMillis();
try {
populatePerformanceIndex(analyzer);
populateSampleIndex(analyzer);
atomicReader = SlowCompositeReaderWrapper.wrap(indexWriter.getReader());
classifier.train(atomicReader, textFieldName, classFieldName, analyzer);
long trainEnd = System.currentTimeMillis();
long trainTime = trainEnd - trainStart;
assertTrue("training took more than 2 mins : " + trainTime / 1000 + "s", trainTime < 120000);
classifier.train(atomicReader, textFieldName, classFieldName, analyzer, query);
ClassificationResult<T> classificationResult = classifier.assignClass(inputDoc);
assertNotNull(classificationResult.getAssignedClass());
assertEquals("got an assigned class of " + classificationResult.getAssignedClass(), expectedResult, classificationResult.getAssignedClass());
assertTrue("got a not positive score " + classificationResult.getScore(), classificationResult.getScore() > 0);
updateSampleIndex(analyzer);
ClassificationResult<T> secondClassificationResult = classifier.assignClass(inputDoc);
assertEquals(classificationResult.getAssignedClass(), secondClassificationResult.getAssignedClass());
assertEquals(Double.valueOf(classificationResult.getScore()), Double.valueOf(secondClassificationResult.getScore()));
} finally {
if (atomicReader != null)
atomicReader.close();
}
}
private void populatePerformanceIndex(Analyzer analyzer) throws IOException {
private void populateSampleIndex(Analyzer analyzer) throws IOException {
indexWriter.deleteAll();
indexWriter.commit();
FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
ft.setStoreTermVectorPositions(true);
int docs = 1000;
Random random = random();
for (int i = 0; i < docs; i++) {
boolean b = random.nextBoolean();
Document doc = new Document();
doc.add(new Field(textFieldName, createRandomString(random), ft));
doc.add(new Field(categoryFieldName, b ? "technology" : "politics", ft));
doc.add(new Field(booleanFieldName, String.valueOf(b), ft));
indexWriter.addDocument(doc, analyzer);
}
indexWriter.commit();
}
private String createRandomString(Random random) {
StringBuilder builder = new StringBuilder();
for (int i = 0; i < 20; i++) {
builder.append(_TestUtil.randomSimpleString(random, 5));
builder.append(" ");
}
return builder.toString();
}
private void populateSampleIndex(Analyzer analyzer) throws Exception {
indexWriter.deleteAll();
indexWriter.commit();
FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
ft.setStoreTermVectorPositions(true);
String text;
Document doc = new Document();
@ -218,4 +197,112 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
indexWriter.commit();
}
protected void checkPerformance(Classifier<T> classifier, Analyzer analyzer, String classFieldName) throws Exception {
AtomicReader atomicReader = null;
long trainStart = System.currentTimeMillis();
try {
populatePerformanceIndex(analyzer);
atomicReader = SlowCompositeReaderWrapper.wrap(indexWriter.getReader());
classifier.train(atomicReader, textFieldName, classFieldName, analyzer);
long trainEnd = System.currentTimeMillis();
long trainTime = trainEnd - trainStart;
assertTrue("training took more than 2 mins : " + trainTime / 1000 + "s", trainTime < 120000);
} finally {
if (atomicReader != null)
atomicReader.close();
}
}
private void populatePerformanceIndex(Analyzer analyzer) throws IOException {
indexWriter.deleteAll();
indexWriter.commit();
FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorOffsets(true);
ft.setStoreTermVectorPositions(true);
int docs = 1000;
Random random = random();
for (int i = 0; i < docs; i++) {
boolean b = random.nextBoolean();
Document doc = new Document();
doc.add(new Field(textFieldName, createRandomString(random), ft));
doc.add(new Field(categoryFieldName, b ? "technology" : "politics", ft));
doc.add(new Field(booleanFieldName, String.valueOf(b), ft));
indexWriter.addDocument(doc, analyzer);
}
indexWriter.commit();
}
private String createRandomString(Random random) {
StringBuilder builder = new StringBuilder();
for (int i = 0; i < 20; i++) {
builder.append(_TestUtil.randomSimpleString(random, 5));
builder.append(" ");
}
return builder.toString();
}
private void updateSampleIndex(Analyzer analyzer) throws Exception {
String text;
Document doc = new Document();
text = "Warren Bennis says John F. Kennedy grasped a key lesson about the presidency that few have followed.";
doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "politics", ft));
doc.add(new Field(booleanFieldName, "true", ft));
indexWriter.addDocument(doc, analyzer);
doc = new Document();
text = "Julian Zelizer says Bill Clinton is still trying to shape his party, years after the White House, while George W. Bush opts for a much more passive role.";
doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "politics", ft));
doc.add(new Field(booleanFieldName, "true", ft));
indexWriter.addDocument(doc, analyzer);
doc = new Document();
text = "Crossfire: Sen. Tim Scott passes on Sen. Lindsey Graham endorsement";
doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "politics", ft));
doc.add(new Field(booleanFieldName, "true", ft));
indexWriter.addDocument(doc, analyzer);
doc = new Document();
text = "Illinois becomes 16th state to allow same-sex marriage.";
doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "politics", ft));
doc.add(new Field(booleanFieldName, "true", ft));
indexWriter.addDocument(doc, analyzer);
doc = new Document();
text = "Apple is developing iPhones with curved-glass screens and enhanced sensors that detect different levels of pressure, according to a new report.";
doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "technology", ft));
doc.add(new Field(booleanFieldName, "false", ft));
indexWriter.addDocument(doc, analyzer);
doc = new Document();
text = "The Xbox One is Microsoft's first new gaming console in eight years. It's a quality piece of hardware but it's also noteworthy because Microsoft is using it to make a statement.";
doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "technology", ft));
doc.add(new Field(booleanFieldName, "false", ft));
indexWriter.addDocument(doc, analyzer);
doc = new Document();
text = "Google says it will replace a Google Maps image after a California father complained it shows the body of his teen-age son, who was shot to death in 2009.";
doc.add(new Field(textFieldName, text, ft));
doc.add(new Field(categoryFieldName, "technology", ft));
doc.add(new Field(booleanFieldName, "false", ft));
indexWriter.addDocument(doc, analyzer);
doc = new Document();
text = "second unlabeled doc";
doc.add(new Field(textFieldName, text, ft));
indexWriter.addDocument(doc, analyzer);
indexWriter.commit();
}
}
View File
@ -29,7 +29,10 @@ public class KNearestNeighborClassifierTest extends ClassificationTestBase<Bytes
@Test
public void testBasicUsage() throws Exception {
checkCorrectClassification(new KNearestNeighborClassifier(1), TECHNOLOGY_INPUT, TECHNOLOGY_RESULT, new MockAnalyzer(random()), textFieldName, categoryFieldName);
// usage with default MLT min docs / term freq
checkCorrectClassification(new KNearestNeighborClassifier(3), POLITICS_INPUT, POLITICS_RESULT, new MockAnalyzer(random()), textFieldName, categoryFieldName);
// usage without custom min docs / term freq for MLT
checkCorrectClassification(new KNearestNeighborClassifier(3, 2, 1), TECHNOLOGY_INPUT, TECHNOLOGY_RESULT, new MockAnalyzer(random()), textFieldName, categoryFieldName);
}
@Test
View File
@ -159,7 +159,6 @@
<property name="javac.source.backwards" value="1.7"/>
<property name="javac.target.backwards" value="1.7"/>
<property name="javac.args" value="-Xlint -Xlint:-deprecation -Xlint:-serial -Xlint:-options"/>
<property name="bootclasspath" value=""/>
<property name="javadoc.link" value="http://download.oracle.com/javase/7/docs/api/"/>
<property name="javadoc.link.junit" value="http://junit.sourceforge.net/javadoc/"/>
<property name="javadoc.packagelist.dir" location="${common.dir}/tools/javadoc"/>
@ -169,6 +168,35 @@
<property name="javadoc.dir" location="${common.dir}/build/docs"/>
<property name="javadoc.maxmemory" value="512m" />
<property name="javadoc.noindex" value="true"/>
<!-- detect bootclasspath from given bootjdk path (including crazy AppleJDK special case) -->
<first id="-boot-rt.jar">
<fileset dir="${bootjdk}" erroronmissingdir="false" followsymlinks="true">
<include name="jre/lib/rt.jar" /><!-- Oracle JDK -->
<include name="lib/rt.jar" /><!-- Oracle JRE -->
<include name="bundle/Classes/classes.jar" /><!-- Apple JDK -->
</fileset>
</first>
<property name="bootclasspath" value="${toString:-boot-rt.jar}" />
<fail message="Invalid 'bootjdk' parameter, because it contains no class library JAR: ${bootjdk}">
<condition>
<and>
<isset property="bootjdk" />
<equals arg1="${bootclasspath}" arg2=""/>
</and>
</condition>
</fail>
<fail message="Invalid 'bootclasspath' parameter, because it does not point to a valid class library JAR: ${bootclasspath}">
<condition>
<not>
<or>
<equals arg1="${bootclasspath}" arg2=""/>
<available classname="java.lang.StringBuilder" classpath="${bootclasspath}" ignoresystemclasses="true"/>
</or>
</not>
</condition>
</fail>
<!-- Javadoc classpath -->
<path id="javadoc.classpath">
<path refid="classpath"/>
@ -355,7 +383,7 @@
<target name="resolve" depends="ivy-availability-check,ivy-configure">
<!-- todo, make this a property or something.
only special cases need bundles -->
<ivy:retrieve type="jar,bundle,tests" log="download-only"
<ivy:retrieve type="jar,bundle,test,test-jar,tests" log="download-only"
conf="${ivy.default.configuration}" sync="${ivy.sync}"/>
</target>
@ -448,7 +476,7 @@
<available property="jflex.present" classname="jflex.anttask.JFlexTask">
<classpath refid="jflex.classpath"/>
</available>
<fail unless="jflex.present">
<fail unless="jflex.present">&#xA0;
##################################################################
JFlex not found.
JFlex Home: ${jflex.home}
@ -456,14 +484,14 @@
Please install the jFlex 1.5 version (currently not released)
from its SVN repository:
svn co -r 623 http://jflex.svn.sourceforge.net/svnroot/jflex/trunk jflex
svn co -r 722 https://svn.code.sf.net/p/jflex/code/trunk jflex
cd jflex
mvn install
Then, create a build.properties file either in your home
directory, or within the Lucene directory and set the jflex.home
property to the path where the JFlex trunk checkout is located
(in the above example its the directory called "jflex").
(in the above example it's the directory called "jflex").
##################################################################
</fail>
@ -623,6 +651,7 @@
value="The Apache Software Foundation"/>
<attribute name="X-Compile-Source-JDK" value="${javac.source}"/>
<attribute name="X-Compile-Target-JDK" value="${javac.target}"/>
<attribute name="Main-Class" value="${main.class}"/>
</manifest>
</sequential>
</macrodef>
@ -979,6 +1008,9 @@
<!-- disable AWT while running tests -->
<sysproperty key="java.awt.headless" value="true"/>
<!-- turn jenkins blood red for hashmap bugs, even on jdk7 -->
<sysproperty key="jdk.map.althashing.threshold" value="0"/>
<!-- Only pass these to the test JVMs if defined in ANT. -->
<syspropertyset>
<propertyref prefix="tests.maxfailures" />
@ -1331,7 +1363,7 @@ ${tests-output}/junit4-*.suites - per-JVM executed suites
]]></fail>
<echo>Code coverage with Atlassian Clover enabled.</echo>
<ivy:cachepath organisation="com.cenqua.clover" module="clover" revision="3.2.0-SNAPSHOT"
<ivy:cachepath organisation="com.cenqua.clover" module="clover" revision="3.2.0"
inline="true" conf="master" pathid="clover.classpath"/>
<taskdef resource="cloverlib.xml" classpathref="clover.classpath" />
<mkdir dir="${clover.db.dir}"/>
@ -2168,7 +2200,7 @@ ${ant.project.name}.test.dependencies=${test.classpath.list}
<!-- GROOVY scripting engine for ANT tasks -->
<target name="resolve-groovy" unless="groovy.loaded" depends="ivy-availability-check,ivy-configure">
<ivy:cachepath organisation="org.codehaus.groovy" module="groovy-all" revision="2.1.5"
<ivy:cachepath organisation="org.codehaus.groovy" module="groovy-all" revision="2.2.1"
inline="true" conf="default" type="jar" transitive="true" pathid="groovy.classpath"/>
<taskdef name="groovy"
classname="org.codehaus.groovy.ant.Groovy"
@ -2182,7 +2214,7 @@ ${ant.project.name}.test.dependencies=${test.classpath.list}
<property name="forbidden-sysout-excludes" value=""/>
<target name="-install-forbidden-apis" unless="forbidden-apis.loaded" depends="ivy-availability-check,ivy-configure">
<ivy:cachepath organisation="de.thetaphi" module="forbiddenapis" revision="1.3"
<ivy:cachepath organisation="de.thetaphi" module="forbiddenapis" revision="1.4"
inline="true" conf="default" transitive="true" pathid="forbidden-apis.classpath"/>
<taskdef name="forbidden-apis" classname="de.thetaphi.forbiddenapis.AntTask" classpathref="forbidden-apis.classpath"/>
<property name="forbidden-apis.loaded" value="true"/>
@ -2226,7 +2258,7 @@ ${ant.project.name}.test.dependencies=${test.classpath.list}
<!-- PEGDOWN macro: Before using depend on the target "resolve-pegdown,resolve-groovy" -->
<target name="resolve-pegdown" unless="pegdown.loaded" depends="ivy-availability-check,ivy-configure">
<ivy:cachepath organisation="org.pegdown" module="pegdown" revision="1.4.0"
<ivy:cachepath organisation="org.pegdown" module="pegdown" revision="1.4.1"
inline="true" conf="default" transitive="true" pathid="pegdown.classpath"/>
<property name="pegdown.loaded" value="true"/>
</target>
View File
@ -373,6 +373,10 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
return compressionMode;
}
int getChunkSize() {
return chunkSize;
}
ChunkIterator chunkIterator(int startDocID) throws IOException {
ensureOpen();
fieldsStream.seek(indexReader.getStartPointer(startDocID));
View File
@ -337,7 +337,9 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
final Bits liveDocs = reader.getLiveDocs();
if (matchingFieldsReader == null
|| matchingFieldsReader.getVersion() != VERSION_CURRENT) { // means reader version is not the same as the writer version
|| matchingFieldsReader.getVersion() != VERSION_CURRENT // means reader version is not the same as the writer version
|| matchingFieldsReader.getCompressionMode() != compressionMode
|| matchingFieldsReader.getChunkSize() != chunkSize) { // the way data is decompressed depends on the chunk size
// naive merge...
for (int i = nextLiveDoc(0, liveDocs, maxDoc); i < maxDoc; i = nextLiveDoc(i + 1, liveDocs, maxDoc)) {
StoredDocument doc = reader.document(i);
@ -362,8 +364,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
startOffsets[i] = startOffsets[i - 1] + it.lengths[i - 1];
}
if (compressionMode == matchingFieldsReader.getCompressionMode() // same compression mode
&& numBufferedDocs == 0 // starting a new chunk
if (numBufferedDocs == 0 // starting a new chunk
&& startOffsets[it.chunkDocs - 1] < chunkSize // chunk is small enough
&& startOffsets[it.chunkDocs - 1] + it.lengths[it.chunkDocs - 1] >= chunkSize // chunk is large enough
&& nextDeletedDoc(it.docBase, liveDocs, it.docBase + it.chunkDocs) == it.docBase + it.chunkDocs) { // no deletion in the chunk
View File
@ -168,8 +168,9 @@ public abstract class IndexReader implements Closeable {
* @see #tryIncRef
*/
public final void incRef() {
ensureOpen();
refCount.incrementAndGet();
if (!tryIncRef()) {
ensureOpen();
}
}
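For context, the old ensureOpen-then-increment sequence could race with a concurrent close(): the count could hit zero between the two calls and then be bumped back up on a dead reader. A hedged sketch (not the verbatim Lucene method) of the compare-and-set loop that tryIncRef-style reference counting relies on:

import java.util.concurrent.atomic.AtomicInteger;

public class RefCountSketch {
  private final AtomicInteger refCount = new AtomicInteger(1);

  public boolean tryIncRef() {
    int count;
    while ((count = refCount.get()) > 0) {
      // only ever move n -> n+1 while n > 0, atomically: a reader whose
      // count already reached 0 (i.e. was closed) can never be revived
      if (refCount.compareAndSet(count, count + 1)) {
        return true;
      }
    }
    return false; // already closed; incRef() then calls ensureOpen(), which throws
  }
}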
Some files were not shown because too many files have changed in this diff.