mirror of https://github.com/apache/lucene.git
LUCENE-5339: merge trunk
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5339@1552377 13f79535-47bb-0310-9956-ffa450edef68
commit 0699ac7d42
@@ -23,6 +23,8 @@
/bin
/bin.*
/pom.xml
/nbproject
/nb-build

# ./lucene
build.xml
@@ -36,10 +36,7 @@
          depends="check-svn-working-copy,validate,documentation-lint"/>

  <target name="test" description="Test both Lucene and Solr">
    <subant target="test" inheritall="false" failonerror="true">
      <fileset dir="lucene" includes="build.xml" />
      <fileset dir="solr" includes="build.xml" />
    </subant>
    <subant buildpath="." antfile="extra-targets.xml" target="-run-test" inheritall="false" failonerror="true" />
  </target>

  <target name="pitest" description="Run PITest on both Lucene and Solr">
@@ -194,6 +191,39 @@
    </delete>
  </target>

  <target name="netbeans" depends="resolve" description="Setup Netbeans configuration">
    <pathconvert property="netbeans.fileset.sourcefolders" pathsep="|" dirsep="/">
      <dirset dir="${basedir}/lucene" includes="**/src/java, **/src/examples, **/src/test, **/src/resources"
              excludes="tools/**, build/**, backwards/**" />
      <dirset dir="${basedir}/solr" includes="**/src/java, **/src/examples, **/src/test, **/src/resources"
              excludes="build/**" />
      <map from="${basedir}/" to=""/>
    </pathconvert>
    <!-- TODO: find a better way to exclude duplicate JAR files & fix the servlet-api mess! -->
    <pathconvert property="netbeans.path.libs" pathsep=":" dirsep="/">
      <fileset dir="${basedir}/lucene" includes="**/lib/*.jar"
               excludes="**/*servlet-api*.jar, analysis/uima/**, tools/**, build/**"/>
      <fileset dir="${basedir}/solr" includes="**/test-lib/*.jar,**/lib/*.jar"
               excludes="core/test-lib/*servlet-api*.jar, contrib/analysis-extras/**, test-framework/lib/junit*, test-framework/lib/ant*, test-framework/lib/randomizedtesting*, build/**, dist/**, package/**, example/solr-webapp/**" />
      <map from="${basedir}/" to=""/>
    </pathconvert>
    <mkdir dir="nbproject"/>
    <copy todir="nbproject" overwrite="true">
      <fileset dir="dev-tools/netbeans/nbproject"/>
    </copy>
    <xslt in="${ant.file}" out="nbproject/project.xml" style="dev-tools/netbeans/nb-project.xsl" force="true">
      <outputproperty name="indent" value="yes"/>
      <param name="netbeans.fileset.sourcefolders" expression="${netbeans.fileset.sourcefolders}"/>
      <param name="netbeans.path.libs" expression="${netbeans.path.libs}"/>
      <param name="netbeans.source-level" expression="1.7"/>
    </xslt>
  </target>

  <target name="clean-netbeans" description="Removes all Netbeans configuration files">
    <delete dir="nbproject" failonerror="true"/>
    <delete dir="nb-build" failonerror="true"/>
  </target>

  <target name="eclipse" depends="resolve" description="Setup Eclipse configuration">
    <basename file="${basedir}" property="eclipseprojectname"/>
    <copy file="dev-tools/eclipse/dot.project" tofile=".project" overwrite="false" encoding="UTF-8">
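A usage sketch (editorial, not part of this diff; the target names are the real ones above, the wrapper target is hypothetical): the NetBeans config is produced with "ant netbeans", which resolves dependencies, copies dev-tools/netbeans/nbproject, and renders nbproject/project.xml from this build file through nb-project.xsl; "ant clean-netbeans" undoes it. Chaining the two looks like:

  <!-- Hypothetical wrapper: rebuild the NetBeans configuration from a clean slate -->
  <target name="refresh-netbeans" depends="clean-netbeans,netbeans"
          description="Recreate nbproject/ from scratch"/>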
@@ -45,6 +45,9 @@
    <buildFile url="file://$PROJECT_DIR$/solr/contrib/dataimporthandler/build.xml" />
    <buildFile url="file://$PROJECT_DIR$/solr/contrib/extraction/build.xml" />
    <buildFile url="file://$PROJECT_DIR$/solr/contrib/langid/build.xml" />
    <buildFile url="file://$PROJECT_DIR$/solr/contrib/morphlines-cell/build.xml" />
    <buildFile url="file://$PROJECT_DIR$/solr/contrib/morphlines-core/build.xml" />
    <buildFile url="file://$PROJECT_DIR$/solr/contrib/map-reduce/build.xml" />
    <buildFile url="file://$PROJECT_DIR$/solr/contrib/uima/build.xml" />
    <buildFile url="file://$PROJECT_DIR$/solr/contrib/velocity/build.xml" />
    <buildFile url="file://$PROJECT_DIR$/solr/solrj/build.xml" />
@@ -0,0 +1,10 @@
<component name="libraryTable">
  <library name="Solr morphlines cell library">
    <CLASSES>
      <root url="file://$PROJECT_DIR$/solr/contrib/morphlines-cell/lib" />
    </CLASSES>
    <JAVADOC />
    <SOURCES />
    <jarDirectory url="file://$PROJECT_DIR$/solr/contrib/morphlines-cell/lib" recursive="false" />
  </library>
</component>
@@ -0,0 +1,10 @@
<component name="libraryTable">
  <library name="Solr morphlines core library">
    <CLASSES>
      <root url="file://$PROJECT_DIR$/solr/contrib/morphlines-core/lib" />
    </CLASSES>
    <JAVADOC />
    <SOURCES />
    <jarDirectory url="file://$PROJECT_DIR$/solr/contrib/morphlines-core/lib" recursive="false" />
  </library>
</component>
@@ -0,0 +1,10 @@
<component name="libraryTable">
  <library name="Solr morphlines core test library">
    <CLASSES>
      <root url="file://$PROJECT_DIR$/solr/contrib/morphlines-core/test-lib" />
    </CLASSES>
    <JAVADOC />
    <SOURCES />
    <jarDirectory url="file://$PROJECT_DIR$/solr/contrib/morphlines-core/test-lib" recursive="false" />
  </library>
</component>
@@ -49,6 +49,9 @@
    <module filepath="$PROJECT_DIR$/solr/contrib/dataimporthandler/dataimporthandler.iml" />
    <module filepath="$PROJECT_DIR$/solr/contrib/extraction/extraction.iml" />
    <module filepath="$PROJECT_DIR$/solr/contrib/langid/langid.iml" />
    <module filepath="$PROJECT_DIR$/solr/contrib/morphlines-cell/morphlines-cell.iml" />
    <module filepath="$PROJECT_DIR$/solr/contrib/morphlines-core/morphlines-core.iml" />
    <module filepath="$PROJECT_DIR$/solr/contrib/map-reduce/map-reduce.iml" />
    <module filepath="$PROJECT_DIR$/solr/contrib/uima/uima.iml" />
    <module filepath="$PROJECT_DIR$/solr/contrib/velocity/velocity.iml" />
    <module filepath="$PROJECT_DIR$/solr/solrj/src/java/solrj.iml" />
@@ -235,6 +235,27 @@
      <option name="VM_PARAMETERS" value="-ea -DtempDir=temp -Djetty.testMode=1 -Djetty.insecurerandom=1 -Dsolr.directoryFactory=org.apache.solr.core.MockDirectoryFactory" />
      <option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
    </configuration>
    <configuration default="false" name="Solr morphlines-cell contrib" type="JUnit" factoryName="JUnit">
      <module name="morphlines-cell" />
      <option name="TEST_OBJECT" value="package" />
      <option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$/idea-build/solr/contrib/morphlines-cell" />
      <option name="VM_PARAMETERS" value="-ea -DtempDir=temp -Djetty.testMode=1 -Djetty.insecurerandom=1 -Dsolr.directoryFactory=org.apache.solr.core.MockDirectoryFactory" />
      <option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
    </configuration>
    <configuration default="false" name="Solr morphlines-core contrib" type="JUnit" factoryName="JUnit">
      <module name="morphlines-core" />
      <option name="TEST_OBJECT" value="package" />
      <option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$/idea-build/solr/contrib/morphlines-core" />
      <option name="VM_PARAMETERS" value="-ea -DtempDir=temp -Djetty.testMode=1 -Djetty.insecurerandom=1 -Dsolr.directoryFactory=org.apache.solr.core.MockDirectoryFactory" />
      <option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
    </configuration>
    <configuration default="false" name="Solr mr (map-reduce) contrib" type="JUnit" factoryName="JUnit">
      <module name="map-reduce" />
      <option name="TEST_OBJECT" value="package" />
      <option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$/idea-build/solr/contrib/map-reduce" />
      <option name="VM_PARAMETERS" value="-ea -DtempDir=temp -Djetty.testMode=1 -Djetty.insecurerandom=1 -Dsolr.directoryFactory=org.apache.solr.core.MockDirectoryFactory" />
      <option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
    </configuration>
    <configuration default="false" name="Solr uima contrib" type="JUnit" factoryName="JUnit">
      <module name="uima" />
      <option name="TEST_OBJECT" value="package" />
@@ -249,7 +270,7 @@
      <option name="VM_PARAMETERS" value="-ea" />
      <option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
    </configuration>
    <list size="35">
    <list size="38">
      <item index="0" class="java.lang.String" itemvalue="JUnit.Lucene core" />
      <item index="1" class="java.lang.String" itemvalue="JUnit.Module analyzers-common" />
      <item index="2" class="java.lang.String" itemvalue="JUnit.Module analyzers-icu" />

@@ -281,10 +302,13 @@
      <item index="28" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler contrib" />
      <item index="29" class="java.lang.String" itemvalue="JUnit.Solr dataimporthandler-extras contrib" />
      <item index="30" class="java.lang.String" itemvalue="JUnit.Solr extraction contrib" />
      <item index="31" class="java.lang.String" itemvalue="JUnit.Solr langid contrib" />
      <item index="32" class="java.lang.String" itemvalue="JUnit.Solr uima contrib" />
      <item index="33" class="java.lang.String" itemvalue="JUnit.Solr velocity contrib" />
      <item index="34" class="java.lang.String" itemvalue="JUnit.Solrj" />
      <item index="31" class="java.lang.String" itemvalue="JUnit.Solr morphlines-cell contrib" />
      <item index="32" class="java.lang.String" itemvalue="JUnit.Solr morphlines-core contrib" />
      <item index="33" class="java.lang.String" itemvalue="JUnit.Solr mr (map-reduce) contrib" />
      <item index="34" class="java.lang.String" itemvalue="JUnit.Solr langid contrib" />
      <item index="35" class="java.lang.String" itemvalue="JUnit.Solr uima contrib" />
      <item index="36" class="java.lang.String" itemvalue="JUnit.Solr velocity contrib" />
      <item index="37" class="java.lang.String" itemvalue="JUnit.Solrj" />
    </list>
  </component>
</project>
@@ -33,5 +33,6 @@
    <orderEntry type="module" module-name="analysis-common" />
    <orderEntry type="module" module-name="lucene-core" />
    <orderEntry type="module" module-name="queryparser" />
    <orderEntry type="module" module-name="queries" />
  </component>
</module>
@@ -26,5 +26,7 @@
    <orderEntry type="module" module-name="lucene-core" />
    <orderEntry type="module" module-name="facet" />
    <orderEntry type="module" module-name="queryparser" />
    <orderEntry type="module" module-name="queries" />
    <orderEntry type="module" module-name="expressions" />
  </component>
</module>
@@ -15,6 +15,7 @@
    <orderEntry type="library" scope="TEST" name="JUnit" level="project" />
    <orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
    <orderEntry type="module" module-name="analysis-common" />
    <orderEntry type="module" module-name="queries" />
    <orderEntry type="module" module-name="lucene-core" />
  </component>
</module>
@@ -0,0 +1,40 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
  <component name="NewModuleRootManager" inherit-compiler-output="false">
    <output url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/map-reduce/classes/java" />
    <output-test url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/map-reduce/classes/test" />
    <exclude-output />
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
      <sourceFolder url="file://$MODULE_DIR$/src/test-files" isTestSource="true" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
    <orderEntry type="library" scope="TEST" name="JUnit" level="project" />
    <orderEntry type="library" name="Solr core library" level="project" />
    <orderEntry type="library" name="Solrj library" level="project" />
    <orderEntry type="library" name="Solr extraction library" level="project" />
    <orderEntry type="library" name="Solr morphlines core library" level="project" />
    <orderEntry type="library" name="Solr morphlines cell library" level="project" />
    <orderEntry type="library" scope="TEST" name="Solr morphlines core test library" level="project" />
    <orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
    <orderEntry type="module" scope="TEST" module-name="solr-test-framework" />
    <orderEntry type="module" module-name="solr-core" />
    <orderEntry type="module" module-name="solrj" />
    <orderEntry type="module" module-name="misc" />
    <orderEntry type="module" module-name="extraction" />
    <orderEntry type="module" module-name="lucene-core" />
    <orderEntry type="module" module-name="morphlines-core" />
    <orderEntry type="module-library">
      <library>
        <CLASSES>
          <root url="file://$MODULE_DIR$/lib" />
        </CLASSES>
        <JAVADOC />
        <SOURCES />
        <jarDirectory url="file://$MODULE_DIR$/lib" recursive="false" />
      </library>
    </orderEntry>
  </component>
</module>
@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
  <component name="NewModuleRootManager" inherit-compiler-output="false">
    <output url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/morphlines-cell/classes/java" />
    <output-test url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/morphlines-cell/classes/test" />
    <exclude-output />
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
      <sourceFolder url="file://$MODULE_DIR$/src/test-files" isTestSource="true" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
    <orderEntry type="library" scope="TEST" name="JUnit" level="project" />
    <orderEntry type="library" name="Solr core library" level="project" />
    <orderEntry type="library" name="Solrj library" level="project" />
    <orderEntry type="library" name="Solr extraction library" level="project" />
    <orderEntry type="library" name="Solr morphlines core library" level="project" />
    <orderEntry type="library" name="Solr morphlines cell library" level="project" />
    <orderEntry type="library" scope="TEST" name="Solr morphlines core test library" level="project" />
    <orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
    <orderEntry type="module" scope="TEST" module-name="solr-test-framework" />
    <orderEntry type="module" module-name="solr-core" />
    <orderEntry type="module" module-name="solrj" />
    <orderEntry type="module" module-name="extraction" />
    <orderEntry type="module" module-name="morphlines-core" />
  </component>
</module>
@@ -0,0 +1,27 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
  <component name="NewModuleRootManager" inherit-compiler-output="false">
    <output url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/morphlines-core/classes/java" />
    <output-test url="file://$MODULE_DIR$/../../../idea-build/solr/contrib/morphlines-core/classes/test" />
    <exclude-output />
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
      <sourceFolder url="file://$MODULE_DIR$/src/test-files" isTestSource="true" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
    <orderEntry type="library" scope="TEST" name="JUnit" level="project" />
    <orderEntry type="library" name="Solr core library" level="project" />
    <orderEntry type="library" name="Solrj library" level="project" />
    <orderEntry type="library" name="Solr extraction library" level="project" />
    <orderEntry type="library" name="Solr morphlines core library" level="project" />
    <orderEntry type="library" scope="TEST" name="Solr morphlines core test library" level="project" />
    <orderEntry type="module" scope="TEST" module-name="lucene-test-framework" />
    <orderEntry type="module" scope="TEST" module-name="solr-test-framework" />
    <orderEntry type="module" module-name="solr-core" />
    <orderEntry type="module" module-name="solrj" />
    <orderEntry type="module" module-name="lucene-core" />
    <orderEntry type="module" module-name="analysis-common" />
  </component>
</module>
@@ -159,7 +159,7 @@
      <plugin>
        <groupId>de.thetaphi</groupId>
        <artifactId>forbiddenapis</artifactId>
        <version>1.3</version>
        <version>1.4</version>
        <configuration>
          <!--
            This is the default setting, we don't support too new Java versions.
@@ -58,10 +58,10 @@
      <artifactId>solr-test-framework</artifactId>
      <scope>test</scope>
    </dependency>
    @solr-extraction.internal.dependencies@
    @solr-extraction.external.dependencies@
    @solr-extraction.internal.test.dependencies@
    @solr-extraction.external.test.dependencies@
    @solr-cell.internal.dependencies@
    @solr-cell.external.dependencies@
    @solr-cell.internal.test.dependencies@
    @solr-cell.external.test.dependencies@
  </dependencies>
  <build>
    <sourceDirectory>${module-path}/src/java</sourceDirectory>
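A note on the @solr-cell.…@ tokens above: these POMs under dev-tools/maven are templates, and the tokens are placeholders expanded when the concrete POMs are generated by the build (an inference from context; the exact mechanism is not shown in this diff). Token expansion of this shape can be done with Ant's filterset, sketched here with hypothetical file names and values:

  <!-- Illustrative only: expand @token@ placeholders in a POM template -->
  <copy file="pom.xml.template" tofile="pom.xml" overwrite="true">
    <filterset begintoken="@" endtoken="@">
      <filter token="version" value="5.0-SNAPSHOT"/>
    </filterset>
  </copy>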
@@ -0,0 +1,97 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <!--
    Licensed to the Apache Software Foundation (ASF) under one
    or more contributor license agreements. See the NOTICE file
    distributed with this work for additional information
    regarding copyright ownership. The ASF licenses this file
    to you under the Apache License, Version 2.0 (the
    "License"); you may not use this file except in compliance
    with the License. You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing,
    software distributed under the License is distributed on an
    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    KIND, either express or implied. See the License for the
    specific language governing permissions and limitations
    under the License.
  -->
  <modelVersion>4.0.0</modelVersion>
  <parent>
    <groupId>org.apache.solr</groupId>
    <artifactId>solr-parent</artifactId>
    <version>@version@</version>
    <relativePath>../../pom.xml</relativePath>
  </parent>
  <groupId>org.apache.solr</groupId>
  <artifactId>solr-map-reduce</artifactId>
  <packaging>jar</packaging>
  <name>Apache Solr map-reduce index construction</name>
  <description>Apache Solr - map-reduce index construction</description>
  <properties>
    <module-directory>solr/contrib/map-reduce</module-directory>
    <relative-top-level>../../../..</relative-top-level>
    <module-path>${relative-top-level}/${module-directory}</module-path>
  </properties>
  <scm>
    <connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
    <developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
    <url>${vc-browse-base-url}/${module-directory}</url>
  </scm>
  <dependencies>
    <dependency>
      <!-- lucene-test-framework dependency must be declared before lucene-core -->
      <!-- This dependency cannot be put into solr-parent, because local -->
      <!-- dependencies are always ordered before inherited dependencies. -->
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-test-framework</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-test-framework</artifactId>
      <scope>test</scope>
    </dependency>
    @solr-map-reduce.internal.dependencies@
    @solr-map-reduce.external.dependencies@
    @solr-map-reduce.internal.test.dependencies@
    @solr-map-reduce.external.test.dependencies@
  </dependencies>
  <build>
    <sourceDirectory>${module-path}/src/java</sourceDirectory>
    <testSourceDirectory>${module-path}/src/test</testSourceDirectory>
    <testResources>
      <testResource>
        <directory>${module-path}/src/test-files</directory>
      </testResource>
      <testResource>
        <directory>${top-level}/dev-tools/maven/solr</directory>
        <includes>
          <include>maven.testlogging.properties</include>
        </includes>
      </testResource>
    </testResources>
    <plugins>
      <plugin>
        <groupId>de.thetaphi</groupId>
        <artifactId>forbiddenapis</artifactId>
        <executions>
          <execution>
            <id>test-check-forbidden-servlet-api</id>
            <configuration>
              <signaturesFiles>
                <signaturesFile>${top-level}/lucene/tools/forbiddenApis/servlet-api.txt</signaturesFile>
              </signaturesFiles>
            </configuration>
            <goals>
              <goal>testCheck</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
@@ -0,0 +1,104 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <!--
    Licensed to the Apache Software Foundation (ASF) under one
    or more contributor license agreements. See the NOTICE file
    distributed with this work for additional information
    regarding copyright ownership. The ASF licenses this file
    to you under the Apache License, Version 2.0 (the
    "License"); you may not use this file except in compliance
    with the License. You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing,
    software distributed under the License is distributed on an
    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    KIND, either express or implied. See the License for the
    specific language governing permissions and limitations
    under the License.
  -->
  <modelVersion>4.0.0</modelVersion>
  <parent>
    <groupId>org.apache.solr</groupId>
    <artifactId>solr-parent</artifactId>
    <version>@version@</version>
    <relativePath>../../pom.xml</relativePath>
  </parent>
  <groupId>org.apache.solr</groupId>
  <artifactId>solr-morphlines-cell</artifactId>
  <packaging>jar</packaging>
  <name>Apache Solr Cell Morphlines</name>
  <description>Apache Solr - Cell Morphlines</description>
  <properties>
    <module-directory>solr/contrib/morphlines-cell</module-directory>
    <relative-top-level>../../../..</relative-top-level>
    <module-path>${relative-top-level}/${module-directory}</module-path>
  </properties>
  <scm>
    <connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
    <developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
    <url>${vc-browse-base-url}/${module-directory}</url>
  </scm>
  <dependencies>
    <dependency>
      <!-- lucene-test-framework dependency must be declared before lucene-core -->
      <!-- This dependency cannot be put into solr-parent, because local -->
      <!-- dependencies are always ordered before inherited dependencies. -->
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-test-framework</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-test-framework</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-morphlines-core</artifactId>
      <version>${project.version}</version>
      <type>test-jar</type>
      <scope>test</scope>
    </dependency>
    @solr-morphlines-cell.internal.dependencies@
    @solr-morphlines-cell.external.dependencies@
    @solr-morphlines-cell.internal.test.dependencies@
    @solr-morphlines-cell.external.test.dependencies@
  </dependencies>
  <build>
    <sourceDirectory>${module-path}/src/java</sourceDirectory>
    <testSourceDirectory>${module-path}/src/test</testSourceDirectory>
    <testResources>
      <testResource>
        <directory>${module-path}/src/test-files</directory>
      </testResource>
      <testResource>
        <directory>${top-level}/dev-tools/maven/solr</directory>
        <includes>
          <include>maven.testlogging.properties</include>
        </includes>
      </testResource>
    </testResources>
    <plugins>
      <plugin>
        <groupId>de.thetaphi</groupId>
        <artifactId>forbiddenapis</artifactId>
        <executions>
          <execution>
            <id>test-check-forbidden-servlet-api</id>
            <configuration>
              <signaturesFiles>
                <signaturesFile>${top-level}/lucene/tools/forbiddenApis/servlet-api.txt</signaturesFile>
              </signaturesFiles>
            </configuration>
            <goals>
              <goal>testCheck</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
@@ -0,0 +1,108 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <!--
    Licensed to the Apache Software Foundation (ASF) under one
    or more contributor license agreements. See the NOTICE file
    distributed with this work for additional information
    regarding copyright ownership. The ASF licenses this file
    to you under the Apache License, Version 2.0 (the
    "License"); you may not use this file except in compliance
    with the License. You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing,
    software distributed under the License is distributed on an
    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    KIND, either express or implied. See the License for the
    specific language governing permissions and limitations
    under the License.
  -->
  <modelVersion>4.0.0</modelVersion>
  <parent>
    <groupId>org.apache.solr</groupId>
    <artifactId>solr-parent</artifactId>
    <version>@version@</version>
    <relativePath>../../pom.xml</relativePath>
  </parent>
  <groupId>org.apache.solr</groupId>
  <artifactId>solr-morphlines-core</artifactId>
  <packaging>jar</packaging>
  <name>Apache Solr Morphlines Core</name>
  <description>Apache Solr - Morphlines Core</description>
  <properties>
    <module-directory>solr/contrib/morphlines-core</module-directory>
    <relative-top-level>../../../..</relative-top-level>
    <module-path>${relative-top-level}/${module-directory}</module-path>
  </properties>
  <scm>
    <connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
    <developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
    <url>${vc-browse-base-url}/${module-directory}</url>
  </scm>
  <dependencies>
    <dependency>
      <!-- lucene-test-framework dependency must be declared before lucene-core -->
      <!-- This dependency cannot be put into solr-parent, because local -->
      <!-- dependencies are always ordered before inherited dependencies. -->
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-test-framework</artifactId>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-test-framework</artifactId>
      <scope>test</scope>
    </dependency>
    @solr-morphlines-core.internal.dependencies@
    @solr-morphlines-core.external.dependencies@
    @solr-morphlines-core.internal.test.dependencies@
    @solr-morphlines-core.external.test.dependencies@
  </dependencies>
  <build>
    <sourceDirectory>${module-path}/src/java</sourceDirectory>
    <testSourceDirectory>${module-path}/src/test</testSourceDirectory>
    <testResources>
      <testResource>
        <directory>${module-path}/src/test-files</directory>
      </testResource>
      <testResource>
        <directory>${top-level}/dev-tools/maven/solr</directory>
        <includes>
          <include>maven.testlogging.properties</include>
        </includes>
      </testResource>
    </testResources>
    <plugins>
      <plugin>
        <groupId>de.thetaphi</groupId>
        <artifactId>forbiddenapis</artifactId>
        <executions>
          <execution>
            <id>test-check-forbidden-servlet-api</id>
            <configuration>
              <signaturesFiles>
                <signaturesFile>${top-level}/lucene/tools/forbiddenApis/servlet-api.txt</signaturesFile>
              </signaturesFiles>
            </configuration>
            <goals>
              <goal>testCheck</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <executions>
          <execution>
            <goals>
              <goal>test-jar</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
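Worth noting how the two new morphlines modules fit together: the maven-jar-plugin test-jar execution above publishes morphlines-core's test classes as a *-tests.jar, which is exactly what the solr-morphlines-cell POM earlier in this commit consumes:

  <dependency>
    <groupId>org.apache.solr</groupId>
    <artifactId>solr-morphlines-core</artifactId>
    <version>${project.version}</version>
    <type>test-jar</type>
    <scope>test</scope>
  </dependency>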
@@ -37,6 +37,9 @@
    <module>dataimporthandler-extras</module>
    <module>extraction</module>
    <module>langid</module>
    <module>morphlines-cell</module>
    <module>morphlines-core</module>
    <module>map-reduce</module>
    <module>uima</module>
    <module>velocity</module>
  </modules>
@@ -81,6 +81,11 @@
      <name>Public online Restlet repository</name>
      <url>http://maven.restlet.org</url>
    </repository>
    <repository>
      <id>releases.cloudera.com</id>
      <name>Cloudera Releases</name>
      <url>https://repository.cloudera.com/artifactory/libs-release</url>
    </repository>
  </repositories>
  <build>
    <pluginManagement>
@@ -0,0 +1,165 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Licensed to the Apache Software Foundation (ASF) under one or more
  contributor license agreements. See the NOTICE file distributed with
  this work for additional information regarding copyright ownership.
  The ASF licenses this file to You under the Apache License, Version 2.0
  (the "License"); you may not use this file except in compliance with
  the License. You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->
<xsl:stylesheet version="1.0"
                xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                xmlns:str="http://exslt.org/strings"
                xmlns:common="http://exslt.org/common"
                extension-element-prefixes="str common">
  <xsl:param name="netbeans.fileset.sourcefolders"/>
  <xsl:param name="netbeans.path.libs"/>
  <xsl:param name="netbeans.source-level"/>

  <xsl:variable name="netbeans.fileset.sourcefolders.sortedfrag">
    <xsl:for-each select="str:split($netbeans.fileset.sourcefolders,'|')">
      <!-- hack to sort **/src/java before **/src/test before **/src/resources : contains() returns "true" which sorts before "false" if descending: -->
      <xsl:sort select="string(contains(text(), '/src/java'))" order="descending" lang="en"/>
      <xsl:sort select="string(contains(text(), '/src/test'))" order="descending" lang="en"/>
      <xsl:sort select="string(contains(text(), '/src/resources'))" order="descending" lang="en"/>
      <!-- hack to sort the list, starts-with() returns "true" which sorts before "false" if descending: -->
      <xsl:sort select="string(starts-with(text(), 'lucene/core/'))" order="descending" lang="en"/>
      <xsl:sort select="string(starts-with(text(), 'lucene/test-framework/'))" order="descending" lang="en"/>
      <xsl:sort select="string(starts-with(text(), 'lucene/'))" order="descending" lang="en"/>
      <xsl:sort select="string(starts-with(text(), 'solr/core/'))" order="descending" lang="en"/>
      <xsl:sort select="string(starts-with(text(), 'solr/solrj/'))" order="descending" lang="en"/>
      <xsl:sort select="string(starts-with(text(), 'solr/test-framework/'))" order="descending" lang="en"/>
      <xsl:sort select="string(starts-with(text(), 'solr/'))" order="descending" lang="en"/>
      <!-- all others in one group above are sorted by path name: -->
      <xsl:sort select="text()" order="ascending" lang="en"/>
      <xsl:copy-of select="."/>
    </xsl:for-each>
  </xsl:variable>
  <xsl:variable name="netbeans.fileset.sourcefolders.sorted" select="common:node-set($netbeans.fileset.sourcefolders.sortedfrag)/*"/>

  <xsl:variable name="netbeans.full.classpath.frag">
    <classpath mode="compile" xmlns="http://www.netbeans.org/ns/freeform-project-java/3">
      <xsl:value-of select="$netbeans.path.libs"/>
      <xsl:for-each select="$netbeans.fileset.sourcefolders.sorted[contains(text(), '/src/java')]">
        <xsl:text>:</xsl:text>
        <xsl:value-of select="."/>
      </xsl:for-each>
    </classpath>
  </xsl:variable>

  <!--
    NOTE: This template matches the root element of any given input XML document!
    The XSL input file is ignored completely.
  -->
  <xsl:template match="/">
    <project xmlns="http://www.netbeans.org/ns/project/1">
      <type>org.netbeans.modules.ant.freeform</type>
      <configuration>
        <general-data xmlns="http://www.netbeans.org/ns/freeform-project/1">
          <name>lucene</name>
          <properties/>
          <folders>
            <xsl:for-each select="$netbeans.fileset.sourcefolders.sorted">
              <source-folder>
                <label>
                  <xsl:value-of select="."/>
                </label>
                <xsl:if test="contains(text(), '/src/java') or contains(text(), '/src/test')">
                  <type>java</type>
                </xsl:if>
                <location>
                  <xsl:value-of select="."/>
                </location>
              </source-folder>
            </xsl:for-each>
          </folders>
          <ide-actions>
            <action name="build">
              <target>compile</target>
            </action>
            <action name="clean">
              <target>clean</target>
            </action>
            <action name="javadoc">
              <target>documentation</target>
            </action>
            <action name="test">
              <target>test</target>
            </action>
            <action name="rebuild">
              <target>clean</target>
              <target>compile</target>
            </action>
          </ide-actions>
          <view>
            <items>
              <xsl:for-each select="$netbeans.fileset.sourcefolders.sorted">
                <source-folder>
                  <xsl:attribute name="style">
                    <xsl:choose>
                      <xsl:when test="contains(text(), '/src/java') or contains(text(), '/src/test')">packages</xsl:when>
                      <xsl:otherwise>tree</xsl:otherwise>
                    </xsl:choose>
                  </xsl:attribute>
                  <label>
                    <xsl:value-of select="."/>
                  </label>
                  <location>
                    <xsl:value-of select="."/>
                  </location>
                </source-folder>
              </xsl:for-each>
              <source-file>
                <label>Project Build Script</label>
                <location>build.xml</location>
              </source-file>
            </items>
            <context-menu>
              <ide-action name="build"/>
              <ide-action name="rebuild"/>
              <ide-action name="clean"/>
              <ide-action name="javadoc"/>
              <ide-action name="test"/>
            </context-menu>
          </view>
          <subprojects/>
        </general-data>
        <java-data xmlns="http://www.netbeans.org/ns/freeform-project-java/3">
          <compilation-unit>
            <xsl:for-each select="$netbeans.fileset.sourcefolders.sorted[contains(text(), '/src/java')]">
              <package-root>
                <xsl:value-of select="."/>
              </package-root>
            </xsl:for-each>
            <xsl:copy-of select="$netbeans.full.classpath.frag"/>
            <built-to>nb-build/classes</built-to>
            <source-level>
              <xsl:value-of select="$netbeans.source-level"/>
            </source-level>
          </compilation-unit>
          <compilation-unit>
            <xsl:for-each select="$netbeans.fileset.sourcefolders.sorted[contains(text(), '/src/test')]">
              <package-root>
                <xsl:value-of select="."/>
              </package-root>
            </xsl:for-each>
            <unit-tests/>
            <xsl:copy-of select="$netbeans.full.classpath.frag"/>
            <built-to>nb-build/test-classes</built-to>
            <source-level>
              <xsl:value-of select="$netbeans.source-level"/>
            </source-level>
          </compilation-unit>
        </java-data>
      </configuration>
    </project>
  </xsl:template>
</xsl:stylesheet>
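The descending sorts in the stylesheet rely on XSLT 1.0 comparing the literal strings "true" and "false": "true" sorts after "false" ascending, so order="descending" floats matching entries to the front, and stacking several such boolean keys yields a priority order. A standalone sketch of the same trick (input values are made up):

  <!-- Illustrative only: paths containing /src/java come first, ties fall back to name order -->
  <xsl:for-each select="str:split('solr/core/src/test|lucene/core/src/java','|')">
    <xsl:sort select="string(contains(text(), '/src/java'))" order="descending" lang="en"/>
    <xsl:sort select="text()" order="ascending" lang="en"/>
    <xsl:value-of select="."/>
  </xsl:for-each>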
@@ -0,0 +1,9 @@
auxiliary.org-netbeans-modules-editor-indent.CodeStyle.project.expand-tabs=true
auxiliary.org-netbeans-modules-editor-indent.CodeStyle.project.indent-shift-width=2
auxiliary.org-netbeans-modules-editor-indent.CodeStyle.project.spaces-per-tab=2
auxiliary.org-netbeans-modules-editor-indent.CodeStyle.project.tab-size=2
auxiliary.org-netbeans-modules-editor-indent.CodeStyle.project.text-limit-width=80
auxiliary.org-netbeans-modules-editor-indent.CodeStyle.project.text-line-wrap=none
auxiliary.org-netbeans-modules-editor-indent.CodeStyle.usedProfile=project
auxiliary.org-netbeans-modules-editor-indent.text.x-java.CodeStyle.project.continuationIndentSize=4
auxiliary.org-netbeans-modules-editor-indent.text.x-java.CodeStyle.project.spaceAfterTypeCast=false
@@ -25,6 +25,27 @@

  <import file="lucene/common-build.xml"/>

  <target name="-run-test">
    <mkdir dir="lucene/build" />
    <tempfile property="tests.totals.tmpfile"
              destdir="lucene/build"
              prefix=".test-totals-"
              suffix=".tmp"
              deleteonexit="true"
              createfile="true" />

    <subant target="test" inheritall="false" failonerror="true">
      <fileset dir="lucene" includes="build.xml" />
      <fileset dir="solr" includes="build.xml" />
      <propertyset>
        <propertyref name="tests.totals.tmpfile" />
      </propertyset>
    </subant>

    <property name="tests.totals.toplevel" value="true" />
    <antcall target="-check-totals" />
  </target>

  <!--
    Run after Junit tests.
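The shape of the new -run-test target: it creates a shared temp file, passes its path to the Lucene and Solr sub-builds through the <propertyset>, and afterwards calls -check-totals with tests.totals.toplevel set, so the per-build test counts can be verified in aggregate. A sketch of how a sub-build could append to that file (tests.totals.tmpfile is the real property; the echoed key and value are illustrative):

  <!-- Illustrative only: a sub-build recording its test count into the shared totals file -->
  <echo file="${tests.totals.tmpfile}" append="true">tests.completed=250${line.separator}</echo>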
@@ -71,10 +92,12 @@
    <svn-checker failonmodifications="true"/>
  </target>

  <property name="svnkit.version" value="1.7.8"/>

  <macrodef xmlns:ivy="antlib:org.apache.ivy.ant" name="svn-checker">
    <attribute name="failonmodifications" default="true"/> <!-- false if file modifications are allowed -->
    <sequential>
      <ivy:cachepath organisation="org.tmatesoft.svnkit" module="svnkit" revision="1.7.8"
      <ivy:cachepath organisation="org.tmatesoft.svnkit" module="svnkit" revision="${svnkit.version}"
                     inline="true" conf="default" transitive="true" pathid="svnkit.classpath"/>
      <local name="svn.checkprops.failed"/>
      <local name="svn.unversioned.failed"/>
@@ -68,6 +68,14 @@ New Features
* LUCENE-5336: Add SimpleQueryParser: parser for human-entered queries.
  (Jack Conradson via Robert Muir)

* LUCENE-5329: suggest: DocumentDictionary and
  DocumentExpressionDictionary are now lenient for dirty documents
  (missing the term, weight or payload). (Areek Zillur via
  Mike McCandless)

* SOLR-1871: The RangeMapFloatFunction accepts an arbitrary ValueSource
  as target and default values. (Chris Harris, shalin)

* LUCENE-5371: Speed up Lucene range faceting from O(N) per hit to
  O(log(N)) per hit using segment trees; this only really starts to
  matter in practice if the number of ranges is over 10 or so. (Mike
@@ -83,6 +91,30 @@ Build
* LUCENE-5322: Clean up / simplify Maven-related Ant targets.
  (Steve Rowe)

* LUCENE-5347: Upgrade forbidden-apis checker to version 1.4.
  (Uwe Schindler)

* LUCENE-4381: Upgrade analysis/icu to 52.1. (Robert Muir)

* LUCENE-5357: Upgrade StandardTokenizer and UAX29URLEmailTokenizer to
  Unicode 6.3; update UAX29URLEmailTokenizer's recognized top level
  domains in URLs and Emails from the IANA Root Zone Database.
  (Steve Rowe)

* LUCENE-5360: Add support for developing in Netbeans IDE.
  (Michal Hlavac, Uwe Schindler, Steve Rowe)

Bug fixes

* LUCENE-5285: Improved highlighting of multi-valued fields with
  FastVectorHighlighter. (Nik Everett via Adrien Grand)

Changes in Runtime Behavior

* LUCENE-5362: IndexReader and SegmentCoreReaders now throw
  AlreadyClosedException if the refCount is incremented but
  is less than 1. (Simon Willnauer)

======================= Lucene 4.6.0 =======================

New Features
@@ -176,39 +208,9 @@ New Features

Bug Fixes

* LUCENE-4998: Fixed a few places to pass IOContext.READONCE instead
  of IOContext.READ (Shikhar Bhushan via Mike McCandless)

* LUCENE-5242: DirectoryTaxonomyWriter.replaceTaxonomy did not fully reset
  its state, which could result in exceptions being thrown, as well as
  incorrect ordinals returned from getParent. (Shai Erera)

* LUCENE-5254: Fixed bounded memory leak, where objects like live
  docs bitset were not freed from a starting reader after reopening
  to a new reader and closing the original one. (Shai Erera, Mike
  McCandless)

* LUCENE-5262: Fixed file handle leaks when multiple attempts to open an
  NRT reader hit exceptions. (Shai Erera)

* LUCENE-5263: Transient IOExceptions, e.g. due to disk full or file
  descriptor exhaustion, hit at unlucky times inside IndexWriter could
  lead to silently losing deletions. (Shai Erera, Mike McCandless)

* LUCENE-5264: CommonTermsQuery ignored minMustMatch if only high-frequent
  terms were present in the query and the high-frequent operator was set
  to SHOULD. (Simon Willnauer)

* LUCENE-5269: Fix bug in NGramTokenFilter where it would sometimes count
  unicode characters incorrectly. (Mike McCandless, Robert Muir)

* LUCENE-5272: OpenBitSet.ensureCapacity did not modify numBits, causing
  false assertion errors in fastSet. (Shai Erera)

* LUCENE-5289: IndexWriter.hasUncommittedChanges was returning false
  when there were buffered delete-by-Term. (Shalin Shekhar Mangar,
  Mike McCandless)

* LUCENE-5303: OrdinalsCache did not use coreCacheKey, resulting in
  over caching across multiple threads. (Mike McCandless, Shai Erera)
@@ -221,7 +223,11 @@ Bug Fixes
  deleted at a later point in time. This could cause short-term disk
  pollution or OOM if in-memory directories are used. (Simon Willnauer)

API Changes:
* LUCENE-5342: Fixed bulk-merge issue in CompressingStoredFieldsFormat which
  created corrupted segments when mixing chunk sizes.
  Lucene41StoredFieldsFormat is not impacted. (Adrien Grand, Robert Muir)

API Changes

* LUCENE-5222: Add SortField.needsScores(). Previously it was not possible
  for a custom Sort that makes use of the relevance score to work correctly
@@ -314,6 +320,40 @@ Tests
  is either a "word" character or not), but now it gives a general longest-match
  behavior. (Nik Everett via Robert Muir)

======================= Lucene 4.5.1 =======================

Bug Fixes

* LUCENE-4998: Fixed a few places to pass IOContext.READONCE instead
  of IOContext.READ (Shikhar Bhushan via Mike McCandless)

* LUCENE-5242: DirectoryTaxonomyWriter.replaceTaxonomy did not fully reset
  its state, which could result in exceptions being thrown, as well as
  incorrect ordinals returned from getParent. (Shai Erera)

* LUCENE-5254: Fixed bounded memory leak, where objects like live
  docs bitset were not freed from a starting reader after reopening
  to a new reader and closing the original one. (Shai Erera, Mike
  McCandless)

* LUCENE-5262: Fixed file handle leaks when multiple attempts to open an
  NRT reader hit exceptions. (Shai Erera)

* LUCENE-5263: Transient IOExceptions, e.g. due to disk full or file
  descriptor exhaustion, hit at unlucky times inside IndexWriter could
  lead to silently losing deletions. (Shai Erera, Mike McCandless)

* LUCENE-5264: CommonTermsQuery ignored minMustMatch if only high-frequent
  terms were present in the query and the high-frequent operator was set
  to SHOULD. (Simon Willnauer)

* LUCENE-5269: Fix bug in NGramTokenFilter where it would sometimes count
  unicode characters incorrectly. (Mike McCandless, Robert Muir)

* LUCENE-5289: IndexWriter.hasUncommittedChanges was returning false
  when there were buffered delete-by-Term. (Shalin Shekhar Mangar,
  Mike McCandless)

======================= Lucene 4.5.0 =======================

New features
@@ -45,17 +45,13 @@
    <taskdef classname="jflex.anttask.JFlexTask" name="jflex">
      <classpath refid="jflex.classpath"/>
    </taskdef>
    <!-- this logic below looks duplicated with run-jflex, but its not, the regexp is different! -->
    <jflex file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex"
           outdir="src/java/org/apache/lucene/analysis/charfilter"
           nobak="on"/>
    <!-- Remove the inappropriate JFlex-generated constructors -->
           nobak="on" inputstreamctor="false"/>
    <!-- Remove the inappropriate JFlex-generated constructor -->
    <replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
                   match="/\*\*\s*\*\s*Creates a new scanner.*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
                   replace="" flags="sg"/>
    <replaceregexp file="src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java"
                   match="\/\*\s*The following code was generated by JFlex.*"
                   replace="\/\* The following code was generated by JFlex. \*\/" flags=""/>
                   match="/\*\*\s*\*\s*Creates a new scanner\s*\*\s*\*\s*@param\s*in\s*the java.io.Reader to read input from\.\s*\*/\s*public HTMLStripCharFilter\(java\.io\.Reader in\)\s*\{\s*this.zzReader = in;\s*\}"
                   replace="" flags="s"/>
  </target>

  <target name="generate-jflex-html-char-entities">

@@ -96,15 +92,7 @@
    <attribute name="dir"/>
    <attribute name="name"/>
    <sequential>
      <jflex file="@{dir}/@{name}.jflex"
             outdir="@{dir}"
             nobak="on" />
      <replaceregexp file="@{dir}/@{name}.java"
                     match="/\*\*\s*\*\s*Creates a new scanner\..*this\(new java\.io\.InputStreamReader\(in\)\);\s*\}"
                     replace="" flags="sg"/>
      <replaceregexp file="@{dir}/@{name}.java"
                     match="\/\*\s*The following code was generated by JFlex.*"
                     replace="\/\* The following code was generated by JFlex. \*\/" flags=""/>
      <jflex file="@{dir}/@{name}.jflex" outdir="@{dir}" nobak="on" inputstreamctor="false"/>
    </sequential>
  </macrodef>
@@ -73,7 +73,7 @@ CharacterEntities = ( "AElig" | "Aacute" | "Acirc" | "Agrave" | "Alpha"
    upperCaseVariantsAccepted.put("amp", "AMP");
  }
  private static final CharArrayMap<Character> entityValues
      = new CharArrayMap<Character>(Version.LUCENE_40, 253, false);
      = new CharArrayMap<Character>(Version.LUCENE_CURRENT, 253, false);
  static {
    String[] entities = {
      "AElig", "\u00C6", "Aacute", "\u00C1", "Acirc", "\u00C2",
@@ -14,7 +14,7 @@
 * limitations under the License.
 */

// Generated using ICU4J 49.1.0.0
// Generated using ICU4J 52.1.0.0
// by org.apache.lucene.analysis.icu.GenerateHTMLStripCharFilterSupplementaryMacros
@@ -1,4 +1,4 @@
/* The following code was generated by JFlex. */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT */

package org.apache.lucene.analysis.charfilter;
@@ -152,77 +152,77 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
"\21\1\1\41\32\1\5\0\113\1\3\0\3\1\17\0\15\1\1\0"+
"\4\1\3\2\13\0\22\1\3\2\13\0\22\1\2\2\14\0\15\1"+
"\1\0\3\1\1\0\2\2\14\0\64\1\40\2\3\0\1\1\4\0"+
"\1\1\1\2\2\0\12\274\41\0\3\2\1\41\1\0\12\274\6\0"+
"\130\1\10\0\51\1\1\2\1\1\5\0\106\1\12\0\35\1\3\0"+
"\14\2\4\0\14\2\12\0\12\274\36\1\2\0\5\1\13\0\54\1"+
"\4\0\21\2\7\1\2\2\6\0\12\274\1\2\45\0\27\1\5\2"+
"\4\0\65\1\12\2\1\0\35\2\2\0\1\2\12\274\6\0\12\274"+
"\15\0\1\1\130\0\5\2\57\1\21\2\7\1\4\0\12\274\21\0"+
"\11\2\14\0\3\2\36\1\15\2\2\1\12\274\54\1\16\2\14\0"+
"\44\1\24\2\10\0\12\274\3\0\3\1\12\274\44\1\122\0\3\2"+
"\1\0\25\2\4\1\1\2\4\1\3\2\2\1\11\0\300\1\47\2"+
"\25\0\4\2\u0116\1\2\0\6\1\2\0\46\1\2\0\6\1\2\0"+
"\10\1\1\0\1\1\1\0\1\1\1\0\1\1\1\0\37\1\2\0"+
"\65\1\1\0\7\1\1\0\1\1\3\0\3\1\1\0\7\1\3\0"+
"\4\1\2\0\6\1\4\0\15\1\5\0\3\1\1\0\7\1\3\0"+
"\13\41\35\0\2\41\5\0\1\41\17\0\2\2\23\0\1\2\12\0"+
"\1\41\21\0\1\1\15\0\1\1\20\0\15\1\63\0\15\2\4\0"+
"\1\2\3\0\14\2\21\0\1\1\4\0\1\1\2\0\12\1\1\0"+
"\1\1\2\0\6\1\6\0\1\1\1\0\1\1\1\0\1\1\1\0"+
"\20\1\2\0\4\1\5\0\5\1\4\0\1\1\21\0\51\1\u0a77\0"+
"\57\1\1\0\57\1\1\0\205\1\6\0\4\1\3\2\2\1\14\0"+
"\46\1\1\0\1\1\5\0\1\1\2\0\70\1\7\0\1\1\17\0"+
"\1\2\27\1\11\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1"+
"\1\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\40\2"+
"\u0200\0\1\41\4\0\3\1\31\0\11\1\6\2\1\0\5\1\2\0"+
"\5\1\4\0\126\1\2\0\2\2\5\1\1\0\132\1\1\0\4\1"+
"\5\0\51\1\3\0\136\1\21\0\33\1\65\0\20\1\u0200\0\u19b6\1"+
"\112\0\u51cd\1\63\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1"+
"\12\274\2\1\24\0\57\1\1\2\4\0\12\2\1\0\31\1\7\0"+
"\1\2\120\1\2\2\45\0\11\1\2\0\147\1\2\0\4\1\1\0"+
"\4\1\14\0\13\1\115\0\12\1\1\2\3\1\1\2\4\1\1\2"+
"\27\1\5\2\30\0\64\1\14\0\2\2\62\1\21\2\13\0\12\274"+
"\6\0\22\2\6\1\3\0\1\1\4\0\12\274\34\1\10\2\2\0"+
"\27\1\15\2\14\0\35\1\3\0\4\2\57\1\16\2\16\0\1\1"+
"\12\274\46\0\51\1\16\2\11\0\3\1\1\2\10\1\2\2\2\0"+
"\12\274\6\0\27\1\3\0\1\1\1\2\4\0\60\1\1\2\1\1"+
"\3\2\2\1\2\2\5\1\2\2\1\1\1\2\1\1\30\0\3\1"+
"\2\0\13\1\5\2\2\0\3\1\2\2\12\0\6\1\2\0\6\1"+
"\2\0\6\1\11\0\7\1\1\0\7\1\221\0\43\1\10\2\1\0"+
"\2\2\2\0\12\274\6\0\u2ba4\1\14\0\27\1\4\0\61\1\4\0"+
"\1\170\1\223\1\103\1\165\1\136\1\214\2\0\1\160\1\153\2\0"+
"\1\120\1\210\14\0\1\105\1\127\20\0\1\122\7\0\1\256\1\112"+
"\5\0\1\143\4\0\51\120\1\110\3\120\1\124\1\220\17\0\1\133"+
"\u02c1\0\1\252\277\0\2\123\1\212\3\222\2\211\1\222\1\211\2\222"+
"\1\221\21\222\11\213\1\157\7\213\7\204\1\156\1\204\1\246\2\207"+
"\1\166\1\246\1\207\1\166\10\246\2\167\5\203\2\155\5\203\1\107"+
"\10\202\5\154\3\224\12\251\20\224\3\225\32\227\1\226\2\200\2\234"+
"\1\235\2\234\2\235\2\234\1\235\3\200\1\177\2\200\12\250\1\247"+
"\1\176\1\171\7\176\1\171\13\176\31\200\7\176\12\250\1\176\5\134"+
"\3\245\3\142\1\140\4\142\2\140\10\142\1\140\7\141\1\137\2\141"+
"\7\142\16\245\1\135\4\245\1\106\4\244\1\106\5\255\1\254\1\255"+
"\3\254\7\255\1\254\23\255\5\264\3\255\6\264\2\255\6\253\5\263"+
"\3\262\2\142\7\257\36\142\4\257\5\142\5\245\6\244\2\245\1\244"+
"\4\141\13\253\12\244\26\253\15\134\1\243\2\134\1\152\3\237\1\134"+
"\2\237\5\151\4\237\4\152\1\151\3\152\1\151\5\152\2\147\1\116"+
"\2\147\1\116\1\147\2\116\1\147\1\116\12\147\1\116\4\146\1\115"+
"\1\236\1\240\1\150\3\164\1\240\2\164\1\260\2\261\2\164\1\150"+
"\1\164\1\150\1\164\1\150\1\164\3\150\1\164\2\150\1\164\1\150"+
"\2\164\1\150\1\164\1\150\1\164\1\150\1\164\1\150\1\164\1\150"+
"\1\162\2\145\1\162\1\145\2\162\4\145\1\162\7\145\1\162\4\145"+
"\1\162\4\145\1\164\1\150\1\164\12\216\1\217\21\216\1\217\3\215"+
"\1\217\3\216\1\217\1\216\2\144\2\216\1\217\15\241\4\201\4\206"+
"\1\242\1\161\10\242\7\206\6\164\4\113\1\121\37\113\1\121\4\113"+
"\25\174\1\131\11\174\21\130\5\174\1\104\12\117\5\174\6\205\4\162"+
"\1\163\1\130\5\231\12\232\17\231\1\125\3\114\14\230\1\126\11\173"+
"\1\172\5\173\4\233\13\175\2\132\11\173\1\172\31\173\1\172\4\126"+
"\4\173\2\172\2\265\1\111\5\265\52\111\u1900\0\u016e\1\2\0\152\1"+
"\46\0\7\1\14\0\5\1\5\0\1\1\1\2\12\1\1\0\15\1"+
"\1\0\5\1\1\0\1\1\1\0\2\1\1\0\2\1\1\0\154\1"+
"\41\0\u016b\1\22\0\100\1\2\0\66\1\50\0\14\1\4\0\20\2"+
"\20\0\7\2\14\0\2\2\30\0\3\2\40\0\5\1\1\0\207\1"+
"\23\0\12\274\7\0\32\1\4\0\1\2\1\0\32\1\13\0\131\1"+
"\3\0\6\1\2\0\6\1\2\0\6\1\2\0\3\1\43\0";
"\1\1\1\2\2\0\12\274\41\0\3\2\2\0\12\274\6\0\130\1"+
"\10\0\51\1\1\2\1\1\5\0\106\1\12\0\35\1\3\0\14\2"+
"\4\0\14\2\12\0\12\274\36\1\2\0\5\1\13\0\54\1\4\0"+
"\21\2\7\1\2\2\6\0\12\274\1\2\45\0\27\1\5\2\4\0"+
"\65\1\12\2\1\0\35\2\2\0\1\2\12\274\6\0\12\274\15\0"+
"\1\1\130\0\5\2\57\1\21\2\7\1\4\0\12\274\21\0\11\2"+
"\14\0\3\2\36\1\15\2\2\1\12\274\54\1\16\2\14\0\44\1"+
"\24\2\10\0\12\274\3\0\3\1\12\274\44\1\122\0\3\2\1\0"+
"\25\2\4\1\1\2\4\1\3\2\2\1\11\0\300\1\47\2\25\0"+
"\4\2\u0116\1\2\0\6\1\2\0\46\1\2\0\6\1\2\0\10\1"+
"\1\0\1\1\1\0\1\1\1\0\1\1\1\0\37\1\2\0\65\1"+
"\1\0\7\1\1\0\1\1\3\0\3\1\1\0\7\1\3\0\4\1"+
"\2\0\6\1\4\0\15\1\5\0\3\1\1\0\7\1\3\0\13\41"+
"\35\0\2\41\5\0\1\41\17\0\2\2\23\0\1\2\12\0\1\41"+
"\21\0\1\1\15\0\1\1\20\0\15\1\63\0\15\2\4\0\1\2"+
"\3\0\14\2\21\0\1\1\4\0\1\1\2\0\12\1\1\0\1\1"+
"\2\0\6\1\6\0\1\1\1\0\1\1\1\0\1\1\1\0\20\1"+
"\2\0\4\1\5\0\5\1\4\0\1\1\21\0\51\1\u0a77\0\57\1"+
"\1\0\57\1\1\0\205\1\6\0\4\1\3\2\2\1\14\0\46\1"+
"\1\0\1\1\5\0\1\1\2\0\70\1\7\0\1\1\17\0\1\2"+
"\27\1\11\0\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0"+
"\7\1\1\0\7\1\1\0\7\1\1\0\7\1\1\0\40\2\u0200\0"+
"\1\41\4\0\3\1\31\0\11\1\6\2\1\0\5\1\2\0\5\1"+
"\4\0\126\1\2\0\2\2\5\1\1\0\132\1\1\0\4\1\5\0"+
"\51\1\3\0\136\1\21\0\33\1\65\0\20\1\u0200\0\u19b6\1\112\0"+
"\u51cd\1\63\0\u048d\1\103\0\56\1\2\0\u010d\1\3\0\20\1\12\274"+
"\2\1\24\0\57\1\1\2\4\0\12\2\1\0\31\1\7\0\1\2"+
"\120\1\2\2\45\0\11\1\2\0\147\1\2\0\4\1\1\0\4\1"+
"\14\0\13\1\115\0\12\1\1\2\3\1\1\2\4\1\1\2\27\1"+
"\5\2\30\0\64\1\14\0\2\2\62\1\21\2\13\0\12\274\6\0"+
"\22\2\6\1\3\0\1\1\4\0\12\274\34\1\10\2\2\0\27\1"+
"\15\2\14\0\35\1\3\0\4\2\57\1\16\2\16\0\1\1\12\274"+
"\46\0\51\1\16\2\11\0\3\1\1\2\10\1\2\2\2\0\12\274"+
"\6\0\27\1\3\0\1\1\1\2\4\0\60\1\1\2\1\1\3\2"+
"\2\1\2\2\5\1\2\2\1\1\1\2\1\1\30\0\3\1\2\0"+
"\13\1\5\2\2\0\3\1\2\2\12\0\6\1\2\0\6\1\2\0"+
"\6\1\11\0\7\1\1\0\7\1\221\0\43\1\10\2\1\0\2\2"+
"\2\0\12\274\6\0\u2ba4\1\14\0\27\1\4\0\61\1\4\0\1\170"+
"\1\223\1\103\1\165\1\136\1\214\2\0\1\160\1\153\2\0\1\120"+
"\1\210\14\0\1\105\1\127\20\0\1\122\7\0\1\256\1\112\5\0"+
"\1\143\4\0\51\120\1\110\3\120\1\124\1\220\17\0\1\133\u02c1\0"+
"\1\252\277\0\2\123\1\212\3\222\2\211\1\222\1\211\2\222\1\221"+
"\21\222\11\213\1\157\7\213\7\204\1\156\1\204\1\246\2\207\1\166"+
"\1\246\1\207\1\166\10\246\2\167\5\203\2\155\5\203\1\107\10\202"+
"\5\154\3\224\12\251\20\224\3\225\32\227\1\226\2\200\2\234\1\235"+
"\2\234\2\235\2\234\1\235\3\200\1\177\2\200\12\250\1\247\1\176"+
"\1\171\7\176\1\171\13\176\31\200\7\176\12\250\1\176\5\134\3\245"+
"\3\142\1\140\4\142\2\140\10\142\1\140\7\141\1\137\2\141\7\142"+
"\16\245\1\135\4\245\1\106\4\244\1\106\5\255\1\254\1\255\3\254"+
"\7\255\1\254\23\255\5\264\3\255\6\264\2\255\6\253\5\263\3\262"+
"\2\142\7\257\36\142\4\257\5\142\5\245\6\244\2\245\1\244\4\141"+
"\13\253\12\244\26\253\15\134\1\243\2\134\1\152\3\237\1\134\2\237"+
"\5\151\4\237\4\152\1\151\3\152\1\151\5\152\2\147\1\116\2\147"+
"\1\116\1\147\2\116\1\147\1\116\12\147\1\116\4\146\1\115\1\236"+
"\1\240\1\150\3\164\1\240\2\164\1\260\2\261\2\164\1\150\1\164"+
"\1\150\1\164\1\150\1\164\3\150\1\164\2\150\1\164\1\150\2\164"+
"\1\150\1\164\1\150\1\164\1\150\1\164\1\150\1\164\1\150\1\162"+
"\2\145\1\162\1\145\2\162\4\145\1\162\7\145\1\162\4\145\1\162"+
"\4\145\1\164\1\150\1\164\12\216\1\217\21\216\1\217\3\215\1\217"+
"\3\216\1\217\1\216\2\144\2\216\1\217\15\241\4\201\4\206\1\242"+
"\1\161\10\242\7\206\6\164\4\113\1\121\37\113\1\121\4\113\25\174"+
"\1\131\11\174\21\130\5\174\1\104\12\117\5\174\6\205\4\162\1\163"+
"\1\130\5\231\12\232\17\231\1\125\3\114\14\230\1\126\11\173\1\172"+
"\5\173\4\233\13\175\2\132\11\173\1\172\31\173\1\172\4\126\4\173"+
"\2\172\2\265\1\111\5\265\52\111\u1900\0\u016e\1\2\0\152\1\46\0"+
"\7\1\14\0\5\1\5\0\1\1\1\2\12\1\1\0\15\1\1\0"+
"\5\1\1\0\1\1\1\0\2\1\1\0\2\1\1\0\154\1\41\0"+
"\u016b\1\22\0\100\1\2\0\66\1\50\0\14\1\4\0\20\2\20\0"+
"\7\2\14\0\2\2\30\0\3\2\40\0\5\1\1\0\207\1\23\0"+
"\12\274\7\0\32\1\4\0\1\2\1\0\32\1\13\0\131\1\3\0"+
"\6\1\2\0\6\1\2\0\6\1\2\0\3\1\43\0";

/**
 * Translates characters to character classes
@@ -30673,7 +30673,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
     upperCaseVariantsAccepted.put("amp", "AMP");
   }
   private static final CharArrayMap<Character> entityValues
-      = new CharArrayMap<Character>(Version.LUCENE_40, 253, false);
+      = new CharArrayMap<Character>(Version.LUCENE_CURRENT, 253, false);
   static {
     String[] entities = {
       "AElig", "\u00C6", "Aacute", "\u00C1", "Acirc", "\u00C2",

@@ -30812,7 +30812,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
       escapeSTYLE = true;
     } else {
       if (null == this.escapedTags) {
-        this.escapedTags = new CharArraySet(Version.LUCENE_40, 16, true);
+        this.escapedTags = new CharArraySet(Version.LUCENE_CURRENT, 16, true);
       }
       this.escapedTags.add(tag);
     }
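The two hunks above replace the stale Version.LUCENE_40 pin in generated code with Version.LUCENE_CURRENT, which tracks the running release. A minimal sketch of the resulting construction pattern, assuming the Lucene 4.x CharArraySet signature used in the hunks (the class name and sample entries below are hypothetical):

```java
import java.util.Arrays;

import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

class EscapedTagsSketch {
  // Same (matchVersion, startSize, ignoreCase) constructor as in the hunk;
  // ignoreCase = true, so "SCRIPT" and "script" hit the same entry.
  static final CharArraySet ESCAPED_TAGS =
      new CharArraySet(Version.LUCENE_CURRENT, 16, true);

  static {
    ESCAPED_TAGS.addAll(Arrays.asList("script", "style")); // hypothetical entries
  }
}
```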
@@ -30895,6 +30895,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
 
 
 
+
   /**
    * Unpacks the compressed character translation table.
    *

@@ -30905,7 +30906,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
     char [] map = new char[0x10000];
     int i = 0;  /* index in packed string  */
     int j = 0;  /* index in unpacked array */
-    while (i < 2778) {
+    while (i < 2776) {
       int count = packed.charAt(i++);
       char value = packed.charAt(i++);
       do map[j++] = value; while (--count > 0);
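Only the loop bound changes (2778 → 2776): the regenerated packed table is two chars shorter. For reference, a self-contained sketch of the run-length decoding the generated zzUnpackCMap performs — the packed string alternates (count, value) char pairs, each expanding to count copies of value; the class and method names below are illustrative, not from the Lucene sources:

```java
public final class PackedUnpackSketch {

  static char[] unpack(String packed, int tableSize) {
    char[] map = new char[tableSize];
    int i = 0; // index in packed string
    int j = 0; // index in unpacked array
    while (i < packed.length()) {
      int count = packed.charAt(i++);  // run length
      char value = packed.charAt(i++); // run value
      do map[j++] = value; while (--count > 0);
    }
    return map;
  }

  public static void main(String[] args) {
    // "\u0003A\u0002B" decodes to AAABB
    System.out.println(new String(unpack("\u0003A\u0002B", 5)));
  }
}
```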
@@ -34,7 +34,7 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
  */
 %%
 
-%unicode 6.1
+%unicode 6.3
 %apiprivate
 %type int
 %final

@@ -197,7 +197,7 @@ InlineElment = ( [aAbBiIqQsSuU] |
       escapeSTYLE = true;
     } else {
       if (null == this.escapedTags) {
-        this.escapedTags = new CharArraySet(Version.LUCENE_40, 16, true);
+        this.escapedTags = new CharArraySet(Version.LUCENE_CURRENT, 16, true);
       }
       this.escapedTags.add(tag);
     }

@@ -61,7 +61,7 @@ def main():
   print '    upperCaseVariantsAccepted.put("amp", "AMP");'
   print '  }'
   print '  private static final CharArrayMap<Character> entityValues'
-  print '      = new CharArrayMap<Character>(Version.LUCENE_40, %i, false);' % len(keys)
+  print '      = new CharArrayMap<Character>(Version.LUCENE_CURRENT, %i, false);' % len(keys)
   print '  static {'
   print '    String[] entities = {'
   output_line = ' '
@@ -196,7 +196,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc4 = new CharArraySet(Version.LUCENE_50,
+  private static final CharArraySet exc4 = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("θ", "δ", "ελ", "γαλ", "ν", "π", "ιδ", "παρ"),
       false);
 

@@ -222,7 +222,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc6 = new CharArraySet(Version.LUCENE_50,
+  private static final CharArraySet exc6 = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("αλ", "αδ", "ενδ", "αμαν", "αμμοχαλ", "ηθ", "ανηθ",
           "αντιδ", "φυσ", "βρωμ", "γερ", "εξωδ", "καλπ", "καλλιν", "καταδ",
           "μουλ", "μπαν", "μπαγιατ", "μπολ", "μποσ", "νιτ", "ξικ", "συνομηλ",

@@ -247,7 +247,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc7 = new CharArraySet(Version.LUCENE_50,
+  private static final CharArraySet exc7 = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("αναπ", "αποθ", "αποκ", "αποστ", "βουβ", "ξεθ", "ουλ",
           "πεθ", "πικρ", "ποτ", "σιχ", "χ"),
       false);

@@ -274,11 +274,11 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc8a = new CharArraySet(Version.LUCENE_50,
+  private static final CharArraySet exc8a = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("τρ", "τσ"),
       false);
 
-  private static final CharArraySet exc8b = new CharArraySet(Version.LUCENE_50,
+  private static final CharArraySet exc8b = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("βετερ", "βουλκ", "βραχμ", "γ", "δραδουμ", "θ", "καλπουζ",
           "καστελ", "κορμορ", "λαοπλ", "μωαμεθ", "μ", "μουσουλμ", "ν", "ουλ",
           "π", "πελεκ", "πλ", "πολισ", "πορτολ", "σαρακατσ", "σουλτ",

@@ -337,7 +337,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc9 = new CharArraySet(Version.LUCENE_50,
+  private static final CharArraySet exc9 = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("αβαρ", "βεν", "εναρ", "αβρ", "αδ", "αθ", "αν", "απλ",
           "βαρον", "ντρ", "σκ", "κοπ", "μπορ", "νιφ", "παγ", "παρακαλ", "σερπ",
           "σκελ", "συρφ", "τοκ", "υ", "δ", "εμ", "θαρρ", "θ"),

@@ -425,11 +425,11 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc12a = new CharArraySet(Version.LUCENE_50,
+  private static final CharArraySet exc12a = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("π", "απ", "συμπ", "ασυμπ", "ακαταπ", "αμεταμφ"),
       false);
 
-  private static final CharArraySet exc12b = new CharArraySet(Version.LUCENE_50,
+  private static final CharArraySet exc12b = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("αλ", "αρ", "εκτελ", "ζ", "μ", "ξ", "παρακαλ", "αρ", "προ", "νισ"),
       false);
 

@@ -449,7 +449,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc13 = new CharArraySet(Version.LUCENE_50,
+  private static final CharArraySet exc13 = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("διαθ", "θ", "παρακαταθ", "προσθ", "συνθ"),
       false);
 

@@ -483,7 +483,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc14 = new CharArraySet(Version.LUCENE_50,
+  private static final CharArraySet exc14 = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("φαρμακ", "χαδ", "αγκ", "αναρρ", "βρομ", "εκλιπ", "λαμπιδ",
           "λεχ", "μ", "πατ", "ρ", "λ", "μεδ", "μεσαζ", "υποτειν", "αμ", "αιθ",
           "ανηκ", "δεσποζ", "ενδιαφερ", "δε", "δευτερευ", "καθαρευ", "πλε",

@@ -521,7 +521,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc15a = new CharArraySet(Version.LUCENE_50,
+  private static final CharArraySet exc15a = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("αβαστ", "πολυφ", "αδηφ", "παμφ", "ρ", "ασπ", "αφ", "αμαλ",
           "αμαλλι", "ανυστ", "απερ", "ασπαρ", "αχαρ", "δερβεν", "δροσοπ",
           "ξεφ", "νεοπ", "νομοτ", "ολοπ", "ομοτ", "προστ", "προσωποπ", "συμπ",

@@ -530,7 +530,7 @@ public class GreekStemmer {
       "ουλαμ", "ουρ", "π", "τρ", "μ"),
       false);
 
-  private static final CharArraySet exc15b = new CharArraySet(Version.LUCENE_50,
+  private static final CharArraySet exc15b = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("ψοφ", "ναυλοχ"),
       false);
 

@@ -567,7 +567,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc16 = new CharArraySet(Version.LUCENE_50,
+  private static final CharArraySet exc16 = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("ν", "χερσον", "δωδεκαν", "ερημον", "μεγαλον", "επταν"),
       false);
 

@@ -587,7 +587,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc17 = new CharArraySet(Version.LUCENE_50,
+  private static final CharArraySet exc17 = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("ασβ", "σβ", "αχρ", "χρ", "απλ", "αειμν", "δυσχρ", "ευχρ", "κοινοχρ", "παλιμψ"),
       false);
 

@@ -601,7 +601,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc18 = new CharArraySet(Version.LUCENE_50,
+  private static final CharArraySet exc18 = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("ν", "ρ", "σπι", "στραβομουτσ", "κακομουτσ", "εξων"),
       false);
 

@@ -625,7 +625,7 @@ public class GreekStemmer {
     return len;
   }
 
-  private static final CharArraySet exc19 = new CharArraySet(Version.LUCENE_50,
+  private static final CharArraySet exc19 = new CharArraySet(Version.LUCENE_CURRENT,
       Arrays.asList("παρασουσ", "φ", "χ", "ωριοπλ", "αζ", "αλλοσουσ", "ασουσ"),
       false);
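Every GreekStemmer hunk above makes the same one-token substitution in the exception-set constructors. A small sketch of how such a set is built and probed, using the (matchVersion, collection, ignoreCase) constructor from the hunks; the contains(char[], int, int) lookup is the usual way a stemmer tests a term buffer without allocating a String (the wrapper class below is hypothetical):

```java
import java.util.Arrays;

import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

class ExceptionSetSketch {
  // mirrors exc8a from the hunks above; ignoreCase = false
  private static final CharArraySet EXC = new CharArraySet(
      Version.LUCENE_CURRENT, Arrays.asList("τρ", "τσ"), false);

  /** true when the first len chars of the term buffer form an exception */
  static boolean isException(char[] termBuffer, int len) {
    return EXC.contains(termBuffer, 0, len);
  }
}
```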
@@ -280,10 +280,7 @@ public class KStemmer {
     DictEntry defaultEntry;
     DictEntry entry;
 
-    CharArrayMap<DictEntry> d = new CharArrayMap<DictEntry>(
-        Version.LUCENE_50, 1000, false);
-
-    d = new CharArrayMap<DictEntry>(Version.LUCENE_50, 1000, false);
+    CharArrayMap<DictEntry> d = new CharArrayMap<DictEntry>(Version.LUCENE_CURRENT, 1000, false);
     for (int i = 0; i < exceptionWords.length; i++) {
       if (!d.containsKey(exceptionWords[i])) {
         entry = new DictEntry(exceptionWords[i], true);
@@ -34,7 +34,7 @@ public class HunspellStemmer {
   private final int recursionCap;
   private final HunspellDictionary dictionary;
   private final StringBuilder segment = new StringBuilder();
-  private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_40);
+  private CharacterUtils charUtils = CharacterUtils.getInstance(Version.LUCENE_CURRENT);
 
   /**
    * Constructs a new HunspellStemmer which will use the provided HunspellDictionary to create its stems. Uses the

@@ -324,7 +324,8 @@ public class HunspellStemmer {
     InputStream affixInputStream = new FileInputStream(args[offset++]);
     InputStream dicInputStream = new FileInputStream(args[offset++]);
 
-    HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_40, ignoreCase);
+    // :Post-Release-Update-Version.LUCENE_XY:
+    HunspellDictionary dictionary = new HunspellDictionary(affixInputStream, dicInputStream, Version.LUCENE_50, ignoreCase);
 
     affixInputStream.close();
     dicInputStream.close();
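Note the :Post-Release-Update-Version.LUCENE_XY: marker added above: this command-line entry point pins a concrete release constant (LUCENE_50) so release tooling can bump it, while library code migrates to LUCENE_CURRENT. A hedged usage sketch built only from the constructor shown in the hunk; try-with-resources stands in for the explicit close() calls:

```java
import java.io.FileInputStream;
import java.io.InputStream;

import org.apache.lucene.analysis.hunspell.HunspellDictionary;
import org.apache.lucene.util.Version;

class HunspellLoadSketch {
  static HunspellDictionary load(String affixPath, String dicPath) throws Exception {
    try (InputStream affix = new FileInputStream(affixPath);
         InputStream dic = new FileInputStream(dicPath)) {
      // same four-argument constructor as in the hunk above
      return new HunspellDictionary(affix, dic, Version.LUCENE_50, /* ignoreCase = */ true);
    }
  }
}
```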
@@ -35,7 +35,7 @@ public final class RemoveDuplicatesTokenFilter extends TokenFilter {
   private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class);
 
   // use a fixed version, as we don't care about case sensitivity.
-  private final CharArraySet previous = new CharArraySet(Version.LUCENE_50, 8, false);
+  private final CharArraySet previous = new CharArraySet(Version.LUCENE_CURRENT, 8, false);
 
   /**
    * Creates a new RemoveDuplicatesTokenFilter

@@ -134,7 +134,7 @@ public abstract class RSLPStemmerBase {
         if (!exceptions[i].endsWith(suffix))
           throw new RuntimeException("useless exception '" + exceptions[i] + "' does not end with '" + suffix + "'");
       }
-      this.exceptions = new CharArraySet(Version.LUCENE_50,
+      this.exceptions = new CharArraySet(Version.LUCENE_CURRENT,
           Arrays.asList(exceptions), false);
     }
 
@@ -1,9 +1,10 @@
 /*
- * Copyright 2001-2005 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *

@@ -13,10 +14,9 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 // Generated from IANA Root Zone Database <http://www.internic.net/zones/root.zone>
-// file version from Saturday, July 14, 2012 4:34:14 AM UTC
-// generated on Sunday, July 15, 2012 12:59:44 AM UTC
+// file version from Friday, December 6, 2013 4:34:10 AM UTC
+// generated on Friday, December 6, 2013 3:21:59 PM UTC
 // by org.apache.lucene.analysis.standard.GenerateJflexTLDMacros
 
 ASCIITLD = "." (

@@ -49,6 +49,7 @@ ASCIITLD = "." (
   | [bB][gG]
   | [bB][hH]
   | [bB][iI]
+  | [bB][iI][kK][eE]
   | [bB][iI][zZ]
   | [bB][jJ]
   | [bB][mM]

@@ -62,6 +63,7 @@ ASCIITLD = "." (
   | [bB][yY]
   | [bB][zZ]
   | [cC][aA]
+  | [cC][aA][mM][eE][rR][aA]
   | [cC][aA][tT]
   | [cC][cC]
   | [cC][dD]

@@ -71,10 +73,13 @@ ASCIITLD = "." (
   | [cC][iI]
   | [cC][kK]
   | [cC][lL]
+  | [cC][lL][oO][tT][hH][iI][nN][gG]
   | [cC][mM]
   | [cC][nN]
   | [cC][oO]
   | [cC][oO][mM]
+  | [cC][oO][nN][sS][tT][rR][uU][cC][tT][iI][oO][nN]
+  | [cC][oO][nN][tT][rR][aA][cC][tT][oO][rR][sS]
   | [cC][oO][oO][pP]
   | [cC][rR]
   | [cC][uU]

@@ -84,6 +89,8 @@ ASCIITLD = "." (
   | [cC][yY]
   | [cC][zZ]
   | [dD][eE]
+  | [dD][iI][aA][mM][oO][nN][dD][sS]
+  | [dD][iI][rR][eE][cC][tT][oO][rR][yY]
   | [dD][jJ]
   | [dD][kK]
   | [dD][mM]

@@ -93,8 +100,11 @@ ASCIITLD = "." (
   | [eE][dD][uU]
   | [eE][eE]
   | [eE][gG]
+  | [eE][nN][tT][eE][rR][pP][rR][iI][sS][eE][sS]
+  | [eE][qQ][uU][iI][pP][mM][eE][nN][tT]
   | [eE][rR]
   | [eE][sS]
+  | [eE][sS][tT][aA][tT][eE]
   | [eE][tT]
   | [eE][uU]
   | [fF][iI]

@@ -104,6 +114,7 @@ ASCIITLD = "." (
   | [fF][oO]
   | [fF][rR]
   | [gG][aA]
+  | [gG][aA][lL][lL][eE][rR][yY]
   | [gG][bB]
   | [gG][dD]
   | [gG][eE]

@@ -118,14 +129,17 @@ ASCIITLD = "." (
   | [gG][pP]
   | [gG][qQ]
   | [gG][rR]
+  | [gG][rR][aA][pP][hH][iI][cC][sS]
   | [gG][sS]
   | [gG][tT]
   | [gG][uU]
+  | [gG][uU][rR][uU]
   | [gG][wW]
   | [gG][yY]
   | [hH][kK]
   | [hH][mM]
   | [hH][nN]
+  | [hH][oO][lL][dD][iI][nN][gG][sS]
   | [hH][rR]
   | [hH][tT]
   | [hH][uU]

@@ -150,6 +164,7 @@ ASCIITLD = "." (
   | [kK][gG]
   | [kK][hH]
   | [kK][iI]
+  | [kK][iI][tT][cC][hH][eE][nN]
   | [kK][mM]
   | [kK][nN]
   | [kK][pP]

@@ -158,9 +173,11 @@ ASCIITLD = "." (
   | [kK][yY]
   | [kK][zZ]
   | [lL][aA]
+  | [lL][aA][nN][dD]
   | [lL][bB]
   | [lL][cC]
   | [lL][iI]
+  | [lL][iI][gG][hH][tT][iI][nN][gG]
   | [lL][kK]
   | [lL][rR]
   | [lL][sS]

@@ -172,6 +189,7 @@ ASCIITLD = "." (
   | [mM][cC]
   | [mM][dD]
   | [mM][eE]
+  | [mM][eE][nN][uU]
   | [mM][gG]
   | [mM][hH]
   | [mM][iI][lL]

@@ -214,10 +232,13 @@ ASCIITLD = "." (
   | [pP][fF]
   | [pP][gG]
   | [pP][hH]
+  | [pP][hH][oO][tT][oO][gG][rR][aA][pP][hH][yY]
   | [pP][kK]
   | [pP][lL]
+  | [pP][lL][uU][mM][bB][iI][nN][gG]
   | [pP][mM]
   | [pP][nN]
+  | [pP][oO][sS][tT]
   | [pP][rR]
   | [pP][rR][oO]
   | [pP][sS]

@@ -235,9 +256,11 @@ ASCIITLD = "." (
   | [sS][cC]
   | [sS][dD]
   | [sS][eE]
+  | [sS][eE][xX][yY]
   | [sS][gG]
   | [sS][hH]
   | [sS][iI]
+  | [sS][iI][nN][gG][lL][eE][sS]
   | [sS][jJ]
   | [sS][kK]
   | [sS][lL]

@@ -251,18 +274,22 @@ ASCIITLD = "." (
   | [sS][xX]
   | [sS][yY]
   | [sS][zZ]
+  | [tT][aA][tT][tT][oO][oO]
   | [tT][cC]
   | [tT][dD]
+  | [tT][eE][cC][hH][nN][oO][lL][oO][gG][yY]
   | [tT][eE][lL]
   | [tT][fF]
   | [tT][gG]
   | [tT][hH]
+  | [tT][iI][pP][sS]
   | [tT][jJ]
   | [tT][kK]
   | [tT][lL]
   | [tT][mM]
   | [tT][nN]
   | [tT][oO]
+  | [tT][oO][dD][aA][yY]
   | [tT][pP]
   | [tT][rR]
   | [tT][rR][aA][vV][eE][lL]

@@ -273,61 +300,62 @@ ASCIITLD = "." (
   | [uU][aA]
   | [uU][gG]
   | [uU][kK]
+  | [uU][nN][oO]
   | [uU][sS]
   | [uU][yY]
   | [uU][zZ]
   | [vV][aA]
   | [vV][cC]
   | [vV][eE]
+  | [vV][eE][nN][tT][uU][rR][eE][sS]
   | [vV][gG]
   | [vV][iI]
   | [vV][nN]
+  | [vV][oO][yY][aA][gG][eE]
   | [vV][uU]
   | [wW][fF]
   | [wW][sS]
   | [xX][nN]--0[zZ][wW][mM]56[dD]
   | [xX][nN]--11[bB]5[bB][sS]3[aA]9[aA][jJ]6[gG]
   | [xX][nN]--3[eE]0[bB]707[eE]
   | [xX][nN]--45[bB][rR][jJ]9[cC]
   | [xX][nN]--80[aA][kK][hH][bB][yY][kK][nN][jJ]4[fF]
   | [xX][nN]--80[aA][oO]21[aA]
   | [xX][nN]--80[aA][sS][eE][hH][dD][bB]
   | [xX][nN]--80[aA][sS][wW][gG]
   | [xX][nN]--90[aA]3[aA][cC]
   | [xX][nN]--9[tT]4[bB]11[yY][iI]5[aA]
   | [xX][nN]--[cC][lL][cC][hH][cC]0[eE][aA]0[bB]2[gG]2[aA]9[gG][cC][dD]
   | [xX][nN]--[dD][eE][bB][aA]0[aA][dD]
   | [xX][nN]--[fF][iI][qQ][sS]8[sS]
   | [xX][nN]--[fF][iI][qQ][zZ]9[sS]
   | [xX][nN]--[fF][pP][cC][rR][jJ]9[cC]3[dD]
   | [xX][nN]--[fF][zZ][cC]2[cC]9[eE]2[cC]
   | [xX][nN]--[gG]6[wW]251[dD]
   | [xX][nN]--[gG][eE][cC][rR][jJ]9[cC]
   | [xX][nN]--[hH]2[bB][rR][jJ]9[cC]
   | [xX][nN]--[hH][gG][bB][kK]6[aA][jJ]7[fF]53[bB][bB][aA]
   | [xX][nN]--[hH][lL][cC][jJ]6[aA][yY][aA]9[eE][sS][cC]7[aA]
   | [xX][nN]--[jJ]1[aA][mM][hH]
   | [xX][nN]--[jJ]6[wW]193[gG]
   | [xX][nN]--[jJ][xX][aA][lL][pP][dD][lL][pP]
   | [xX][nN]--[kK][gG][bB][eE][cC][hH][tT][vV]
   | [xX][nN]--[kK][pP][rR][wW]13[dD]
   | [xX][nN]--[kK][pP][rR][yY]57[dD]
   | [xX][nN]--[lL]1[aA][cC][cC]
   | [xX][nN]--[lL][gG][bB][bB][aA][tT]1[aA][dD]8[jJ]
   | [xX][nN]--[mM][gG][bB]9[aA][wW][bB][fF]
   | [xX][nN]--[mM][gG][bB][aA]3[aA]4[fF]16[aA]
   | [xX][nN]--[mM][gG][bB][aA][aA][mM]7[aA]8[hH]
   | [xX][nN]--[mM][gG][bB][aA][yY][hH]7[gG][pP][aA]
   | [xX][nN]--[mM][gG][bB][bB][hH]1[aA]71[eE]
   | [xX][nN]--[mM][gG][bB][cC]0[aA]9[aA][zZ][cC][gG]
   | [xX][nN]--[mM][gG][bB][eE][rR][pP]4[aA]5[dD]4[aA][rR]
   | [xX][nN]--[mM][gG][bB][xX]4[cC][dD]0[aA][bB]
   | [xX][nN]--[nN][gG][bB][cC]5[aA][zZ][dD]
   | [xX][nN]--[oO]3[cC][wW]4[hH]
   | [xX][nN]--[oO][gG][bB][pP][fF]8[fF][lL]
   | [xX][nN]--[pP]1[aA][iI]
   | [xX][nN]--[pP][gG][bB][sS]0[dD][hH]
   | [xX][nN]--[qQ]9[jJ][yY][bB]4[cC]
   | [xX][nN]--[sS]9[bB][rR][jJ]9[cC]
   | [xX][nN]--[uU][nN][uU][pP]4[yY]
   | [xX][nN]--[wW][gG][bB][hH]1[cC]
   | [xX][nN]--[wW][gG][bB][lL]6[aA]
   | [xX][nN]--[xX][kK][cC]2[aA][lL]3[hH][yY][eE]2[aA]
   | [xX][nN]--[xX][kK][cC]2[dD][lL]3[aA]5[eE][eE]0[hH]
   | [xX][nN]--[yY][fF][rR][oO]4[iI]67[oO]
   | [xX][nN]--[yY][gG][bB][iI]2[aA][mM][mM][xX]
   | [xX][nN]--[zZ][cC][kK][zZ][aA][hH]
   | [xX][xX][xX]
   | [yY][eE]
   | [yY][tT]
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex. */
+/* The following code was generated by JFlex 1.5.0-SNAPSHOT */
 
 package org.apache.lucene.analysis.standard;
 

@@ -58,64 +58,63 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
    * Translates characters to character classes
    */
   private static final String ZZ_CMAP_PACKED =
    [~120 lines of machine-generated packed ZZ_CMAP strings omitted: the whole
     table was regenerated by JFlex 1.5.0-SNAPSHOT, replacing the old packed
     runs with a slightly shorter equivalent]

  /**
   * Translates characters to character classes
@@ -128,13 +127,12 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
   private static final int [] ZZ_ACTION = zzUnpackAction();
 
   private static final String ZZ_ACTION_PACKED_0 =
-    "\1\0\1\1\3\2\1\3\1\1\13\0\1\2\3\4"+
-    "\2\0\1\5\1\0\1\5\3\4\6\5\1\6\1\4"+
-    "\2\7\1\10\1\0\1\10\3\0\2\10\1\11\1\12"+
-    "\1\4";
+    "\1\0\1\1\3\2\1\3\13\0\1\2\3\4\2\0"+
+    "\1\5\1\0\1\5\3\4\6\5\1\6\1\4\2\7"+
+    "\1\10\1\0\1\10\3\0\2\10\1\11\1\12\1\4";
 
   private static int [] zzUnpackAction() {
-    int [] result = new int[51];
+    int [] result = new int[50];
     int offset = 0;
     offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
     return result;
@@ -159,16 +157,16 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
   private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
 
   private static final String ZZ_ROWMAP_PACKED_0 =
-    "\0\0\0\16\0\34\0\52\0\70\0\16\0\106\0\124"+
-    "\0\142\0\160\0\176\0\214\0\232\0\250\0\266\0\304"+
-    "\0\322\0\340\0\356\0\374\0\u010a\0\u0118\0\u0126\0\u0134"+
-    "\0\u0142\0\u0150\0\u015e\0\u016c\0\u017a\0\u0188\0\u0196\0\u01a4"+
-    "\0\u01b2\0\u01c0\0\u01ce\0\u01dc\0\u01ea\0\u01f8\0\322\0\u0206"+
-    "\0\u0214\0\u0222\0\u0230\0\u023e\0\u024c\0\u025a\0\124\0\214"+
-    "\0\u0268\0\u0276\0\u0284";
+    "\0\0\0\14\0\30\0\44\0\60\0\14\0\74\0\110"+
+    "\0\124\0\140\0\154\0\170\0\204\0\220\0\234\0\250"+
+    "\0\264\0\300\0\314\0\330\0\344\0\360\0\374\0\u0108"+
+    "\0\u0114\0\u0120\0\u012c\0\u0138\0\u0144\0\u0150\0\u015c\0\u0168"+
+    "\0\u0174\0\u0180\0\u018c\0\u0198\0\u01a4\0\250\0\u01b0\0\u01bc"+
+    "\0\u01c8\0\u01d4\0\u01e0\0\u01ec\0\u01f8\0\74\0\154\0\u0204"+
+    "\0\u0210\0\u021c";
 
   private static int [] zzUnpackRowMap() {
-    int [] result = new int[51];
+    int [] result = new int[50];
     int offset = 0;
     offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
     return result;
@@ -191,49 +189,49 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
   private static final int [] ZZ_TRANS = zzUnpackTrans();
 
   private static final String ZZ_TRANS_PACKED_0 =
    [~80 lines of machine-generated packed transition-table strings omitted:
     regenerated along with the rest of the scanner]
 
   private static int [] zzUnpackTrans() {
-    int [] result = new int[658];
+    int [] result = new int[552];
     int offset = 0;
     offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
     return result;
@@ -271,11 +269,11 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
   private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
 
   private static final String ZZ_ATTRIBUTE_PACKED_0 =
-    "\1\0\1\11\3\1\1\11\1\1\13\0\4\1\2\0"+
-    "\1\1\1\0\17\1\1\0\1\1\3\0\5\1";
+    "\1\0\1\11\3\1\1\11\13\0\4\1\2\0\1\1"+
+    "\1\0\17\1\1\0\1\1\3\0\5\1";
 
   private static int [] zzUnpackAttribute() {
-    int [] result = new int[51];
+    int [] result = new int[50];
     int offset = 0;
     offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
     return result;
@@ -372,7 +370,6 @@ public final void getText(CharTermAttribute t) {
 
   /**
    * Creates a new scanner
-   * There is also a java.io.InputStream version of this constructor.
    *
    * @param in  the java.io.Reader to read input from.
    */

@@ -381,7 +378,6 @@ public final void getText(CharTermAttribute t) {
   }
 
 
-
   /**
    * Unpacks the compressed character translation table.
    *

@@ -392,7 +388,7 @@ public final void getText(CharTermAttribute t) {
     char [] map = new char[0x10000];
     int i = 0;  /* index in packed string  */
     int j = 0;  /* index in unpacked array */
-    while (i < 1154) {
+    while (i < 1138) {
       int count = packed.charAt(i++);
       char value = packed.charAt(i++);
       do map[j++] = value; while (--count > 0);
@@ -116,8 +116,6 @@ LETTER = !(![:letter:]|{CJ})
 // Chinese and Japanese (but NOT Korean, which is included in [:letter:])
 CJ = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
 
-WHITESPACE = \r\n | [ \r\n\t\f]
-
 %%
 
 {ALPHANUM}   { return ALPHANUM; }

@@ -131,4 +129,4 @@ WHITESPACE = \r\n | [ \r\n\t\f]
 {ACRONYM_DEP} { return ACRONYM_DEP; }
 
 /** Ignore the rest */
-. | {WHITESPACE}  { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
+[^]  { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
@@ -18,4 +18,4 @@
 
 WARNING: if you change StandardTokenizerImpl*.jflex or UAX29URLEmailTokenizer
 and need to regenerate the tokenizer, only use the trunk version
-of JFlex 1.5 (with a minimum SVN revision 607) at the moment!
+of JFlex 1.5 (with a minimum SVN revision 722) at the moment!
@@ -1,9 +1,10 @@
 /*
- * Copyright 2010 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *

@@ -13,8 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
-// Generated using ICU4J 49.1.0.0
+// Generated using ICU4J 52.1.0.0
 // by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
 
-

@@ -39,6 +39,12 @@ FormatSupp = (
   | ([\ud834][\uDD73-\uDD7A])
   | ([\udb40][\uDC01\uDC20-\uDC7F])
 )
+NumericSupp = (
+  ([\ud805][\uDEC0-\uDEC9])
+  | ([\ud804][\uDC66-\uDC6F\uDCF0-\uDCF9\uDD36-\uDD3F\uDDD0-\uDDD9])
+  | ([\ud835][\uDFCE-\uDFFF])
+  | ([\ud801][\uDCA0-\uDCA9])
+)
 ExtendSupp = (
   ([\ud81b][\uDF51-\uDF7E\uDF8F-\uDF92])
   | ([\ud805][\uDEAB-\uDEB7])

@@ -48,12 +54,6 @@ ExtendSupp = (
   | ([\udb40][\uDD00-\uDDEF])
   | ([\ud802][\uDE01-\uDE03\uDE05\uDE06\uDE0C-\uDE0F\uDE38-\uDE3A\uDE3F])
 )
-NumericSupp = (
-  ([\ud805][\uDEC0-\uDEC9])
-  | ([\ud804][\uDC66-\uDC6F\uDCF0-\uDCF9\uDD36-\uDD3F\uDDD0-\uDDD9])
-  | ([\ud835][\uDFCE-\uDFFF])
-  | ([\ud801][\uDCA0-\uDCA9])
-)
 KatakanaSupp = (
   ([\ud82c][\uDC00])
 )

@@ -129,3 +129,15 @@ HiraganaSupp = (
   ([\ud83c][\uDE00])
   | ([\ud82c][\uDC01])
 )
+SingleQuoteSupp = (
+  []
+)
+DoubleQuoteSupp = (
+  []
+)
+HebrewLetterSupp = (
+  []
+)
+RegionalIndicatorSupp = (
+  ([\ud83c][\uDDE6-\uDDFF])
+)
[file diff suppressed because it is too large to display]
@@ -32,11 +32,13 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  *     Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
  *   <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
  *   <li><HIRAGANA>: A single hiragana character</li>
+ *   <li><KATAKANA>: A sequence of katakana characters</li>
+ *   <li><HANGUL>: A sequence of Hangul characters</li>
  * </ul>
  */
 %%
 
-%unicode 6.1
+%unicode 6.3
 %integer
 %final
 %public

@@ -47,33 +49,40 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 %buffer 4096
 
 %include SUPPLEMENTARY.jflex-macro
-ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
-Format = ([\p{WB:Format}] | {FormatSupp})
-Numeric = ([\p{WB:Numeric}] | {NumericSupp})
-Extend = ([\p{WB:Extend}] | {ExtendSupp})
-Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
-MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
-MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
-MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
-ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
-ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
-Han = ([\p{Script:Han}] | {HanSupp})
-Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
+ALetter = (\p{WB:ALetter} | {ALetterSupp})
+Format = (\p{WB:Format} | {FormatSupp})
+Numeric = ([\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] | {NumericSupp})
+Extend = (\p{WB:Extend} | {ExtendSupp})
+Katakana = (\p{WB:Katakana} | {KatakanaSupp})
+MidLetter = (\p{WB:MidLetter} | {MidLetterSupp})
+MidNum = (\p{WB:MidNum} | {MidNumSupp})
+MidNumLet = (\p{WB:MidNumLet} | {MidNumLetSupp})
+ExtendNumLet = (\p{WB:ExtendNumLet} | {ExtendNumLetSupp})
+ComplexContext = (\p{LB:Complex_Context} | {ComplexContextSupp})
+Han = (\p{Script:Han} | {HanSupp})
+Hiragana = (\p{Script:Hiragana} | {HiraganaSupp})
+SingleQuote = (\p{WB:Single_Quote} | {SingleQuoteSupp})
+DoubleQuote = (\p{WB:Double_Quote} | {DoubleQuoteSupp})
+HebrewLetter = (\p{WB:Hebrew_Letter} | {HebrewLetterSupp})
+RegionalIndicator = (\p{WB:Regional_Indicator} | {RegionalIndicatorSupp})
+HebrewOrALetter = ({HebrewLetter} | {ALetter})
 
-// Script=Hangul & Aletter
-HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
 // UAX#29 WB4. X (Extend | Format)* --> X
 //
-ALetterEx = {ALetter} ({Format} | {Extend})*
-// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
-NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
+HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] ({Format} | {Extend})*
+HebrewOrALetterEx = {HebrewOrALetter} ({Format} | {Extend})*
+NumericEx = {Numeric} ({Format} | {Extend})*
 KatakanaEx = {Katakana} ({Format} | {Extend})*
-MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
-MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
+MidLetterEx = ({MidLetter} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
+MidNumericEx = ({MidNum} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
 ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
 
 HanEx = {Han} ({Format} | {Extend})*
 HiraganaEx = {Hiragana} ({Format} | {Extend})*
+SingleQuoteEx = {SingleQuote} ({Format} | {Extend})*
+DoubleQuoteEx = {DoubleQuote} ({Format} | {Extend})*
+HebrewLetterEx = {HebrewLetter} ({Format} | {Extend})*
+RegionalIndicatorEx = {RegionalIndicator} ({Format} | {Extend})*
 
 
 %{
   /** Alphanumeric sequences */

@@ -121,15 +130,12 @@ HiraganaEx = {Hiragana} ({Format} | {Extend})*
 <<EOF>> { return StandardTokenizerInterface.YYEOF; }
 
 // UAX#29 WB8.   Numeric × Numeric
-//        WB11.  Numeric (MidNum | MidNumLet) × Numeric
-//        WB12.  Numeric × (MidNum | MidNumLet) Numeric
-//        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-//        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
+//        WB11.  Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
+//        WB12.  Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
+//        WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+//        WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
 //
-{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
-                              | {MidNumericEx} {NumericEx}
-                              | {NumericEx})*
-{ExtendNumLetEx}*
+{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
   { return NUMERIC_TYPE; }
 
 // subset of the below for typing purposes only!

@@ -139,21 +145,31 @@ HiraganaEx = {Hiragana} ({Format} | {Extend})*
 {KatakanaEx}+
   { return KATAKANA_TYPE; }
 
-// UAX#29 WB5.   ALetter × ALetter
-//        WB6.   ALetter × (MidLetter | MidNumLet) ALetter
-//        WB7.   ALetter (MidLetter | MidNumLet) × ALetter
-//        WB9.   ALetter × Numeric
-//        WB10.  Numeric × ALetter
+// UAX#29 WB5.   (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
+//        WB6.   (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
+//        WB7.   (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
+//        WB7a.  Hebrew_Letter × Single_Quote
+//        WB7b.  Hebrew_Letter × Double_Quote Hebrew_Letter
+//        WB7c.  Hebrew_Letter Double_Quote × Hebrew_Letter
+//        WB9.   (ALetter | Hebrew_Letter) × Numeric
+//        WB10.  Numeric × (ALetter | Hebrew_Letter)
 //        WB13.  Katakana × Katakana
-//        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
-//        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
+//        WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+//        WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
 //
-{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
-                  | ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
-                    | {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
-({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
-                   | ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
-                     | {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
+{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
+                  | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
+                    | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
+                    | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
+                    )+
+                  )
+({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
+                   | ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
+                     | {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
+                     | {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
+                     )+
+                   )
+)*
 {ExtendNumLetEx}*
   { return WORD_TYPE; }

@@ -166,7 +182,7 @@ HiraganaEx = {Hiragana} ({Format} | {Extend})*
 // annex.  That means that satisfactory treatment of languages like Chinese
 // or Thai requires special handling.
 //
-// In Unicode 6.1, only one character has the \p{Line_Break = Contingent_Break}
+// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
 // property: U+FFFC ( ￼ ) OBJECT REPLACEMENT CHARACTER.
 //
 // In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}

@@ -188,6 +204,8 @@ HiraganaEx = {Hiragana} ({Format} | {Extend})*
 // UAX#29 WB3.   CR × LF
 //        WB3a.  (Newline | CR | LF) ÷
 //        WB3b.  ÷ (Newline | CR | LF)
+//        WB13c. Regional_Indicator × Regional_Indicator
 //        WB14.  Any ÷ Any
 //
-[^] { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
+{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
+  { /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
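A hedged consumption sketch for the regenerated scanner, using the public StandardTokenizer that wraps it: TypeAttribute surfaces the token types the rules above return (e.g. <ALPHANUM>, <NUM>, <HANGUL>). The constructor and attribute APIs are the stock Lucene 4.x ones; the sample text is arbitrary:

```java
import java.io.StringReader;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

public class TokenTypeSketch {
  public static void main(String[] args) throws Exception {
    StandardTokenizer ts =
        new StandardTokenizer(Version.LUCENE_CURRENT, new StringReader("3.14 tokens"));
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute type = ts.addAttribute(TypeAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term + " -> " + type.type()); // e.g. "3.14 -> <NUM>"
    }
    ts.end();
    ts.close();
  }
}
```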
[file diff suppressed because it is too large to display]
@ -35,11 +35,13 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
* Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
|
||||
* <li><IDEOGRAPHIC>: A single CJKV ideographic character</li>
|
||||
* <li><HIRAGANA>: A single hiragana character</li>
|
||||
* <li><KATAKANA>: A sequence of katakana characters</li>
|
||||
* <li><HANGUL>: A sequence of Hangul characters</li>
|
||||
* </ul>
|
||||
*/
|
||||
%%
|
||||
|
||||
%unicode 6.1
|
||||
%unicode 6.3
|
||||
%integer
|
||||
%final
|
||||
%public
|
||||
|
@ -50,33 +52,39 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
%buffer 4096
|
||||
|
||||
%include SUPPLEMENTARY.jflex-macro
|
||||
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
|
||||
Format = ([\p{WB:Format}] | {FormatSupp})
|
||||
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
|
||||
Extend = ([\p{WB:Extend}] | {ExtendSupp})
|
||||
Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
|
||||
MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
|
||||
MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
|
||||
MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
|
||||
ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
|
||||
ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
|
||||
Han = ([\p{Script:Han}] | {HanSupp})
|
||||
Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
|
||||
ALetter = (\p{WB:ALetter} | {ALetterSupp})
|
||||
Format = (\p{WB:Format} | {FormatSupp})
|
||||
Numeric = ([\p{WB:Numeric}[\p{Blk:HalfAndFullForms}&&\p{Nd}]] | {NumericSupp})
|
||||
Extend = (\p{WB:Extend} | {ExtendSupp})
|
||||
Katakana = (\p{WB:Katakana} | {KatakanaSupp})
|
||||
MidLetter = (\p{WB:MidLetter} | {MidLetterSupp})
|
||||
MidNum = (\p{WB:MidNum} | {MidNumSupp})
|
||||
MidNumLet = (\p{WB:MidNumLet} | {MidNumLetSupp})
|
||||
ExtendNumLet = (\p{WB:ExtendNumLet} | {ExtendNumLetSupp})
|
||||
ComplexContext = (\p{LB:Complex_Context} | {ComplexContextSupp})
|
||||
Han = (\p{Script:Han} | {HanSupp})
|
||||
Hiragana = (\p{Script:Hiragana} | {HiraganaSupp})
|
||||
SingleQuote = (\p{WB:Single_Quote} | {SingleQuoteSupp})
|
||||
DoubleQuote = (\p{WB:Double_Quote} | {DoubleQuoteSupp})
|
||||
HebrewLetter = (\p{WB:Hebrew_Letter} | {HebrewLetterSupp})
|
||||
RegionalIndicator = (\p{WB:Regional_Indicator} | {RegionalIndicatorSupp})
|
||||
HebrewOrALetter = ({HebrewLetter} | {ALetter})
|
||||
|
||||
// Script=Hangul & Aletter
|
||||
HangulEx = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
|
||||
// UAX#29 WB4. X (Extend | Format)* --> X
|
||||
//
|
||||
ALetterEx = {ALetter} ({Format} | {Extend})*
|
||||
// TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
|
||||
NumericEx = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
|
||||
HangulEx = [\p{Script:Hangul}&&[\p{WB:ALetter}\p{WB:Hebrew_Letter}]] ({Format} | {Extend})*
|
||||
HebrewOrALetterEx = {HebrewOrALetter} ({Format} | {Extend})*
|
||||
NumericEx = {Numeric} ({Format} | {Extend})*
|
||||
KatakanaEx = {Katakana} ({Format} | {Extend})*
|
||||
MidLetterEx = ({MidLetter} | {MidNumLet}) ({Format} | {Extend})*
|
||||
MidNumericEx = ({MidNum} | {MidNumLet}) ({Format} | {Extend})*
|
||||
MidLetterEx = ({MidLetter} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
|
||||
MidNumericEx = ({MidNum} | {MidNumLet} | {SingleQuote}) ({Format} | {Extend})*
|
||||
ExtendNumLetEx = {ExtendNumLet} ({Format} | {Extend})*
|
||||
|
||||
HanEx = {Han} ({Format} | {Extend})*
|
||||
HiraganaEx = {Hiragana} ({Format} | {Extend})*
|
||||
SingleQuoteEx = {SingleQuote} ({Format} | {Extend})*
|
||||
DoubleQuoteEx = {DoubleQuote} ({Format} | {Extend})*
|
||||
HebrewLetterEx = {HebrewLetter} ({Format} | {Extend})*
|
||||
RegionalIndicatorEx = {RegionalIndicator} ({Format} | {Extend})*
|
||||
|
||||
// URL and E-mail syntax specifications:
|
||||
//
|
||||
|
@ -213,15 +221,12 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
|
|||
{EMAIL} { return EMAIL_TYPE; }
|
||||
|
||||
// UAX#29 WB8. Numeric × Numeric
|
||||
// WB11. Numeric (MidNum | MidNumLet) × Numeric
|
||||
// WB12. Numeric × (MidNum | MidNumLet) Numeric
|
||||
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||||
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
|
||||
// WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
|
||||
// WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
|
||||
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
|
||||
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
|
||||
//
|
||||
{ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
|
||||
| {MidNumericEx} {NumericEx}
|
||||
| {NumericEx})*
|
||||
{ExtendNumLetEx}*
|
||||
{ExtendNumLetEx}* {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} ) {NumericEx} )* {ExtendNumLetEx}*
|
||||
{ return NUMERIC_TYPE; }
|
||||
|
||||
// subset of the below for typing purposes only!
|
||||
|
@@ -231,21 +236,31 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
{KatakanaEx}+
{ return KATAKANA_TYPE; }

// UAX#29 WB5. ALetter × ALetter
// WB6. ALetter × (MidLetter | MidNumLet) ALetter
// WB7. ALetter (MidLetter | MidNumLet) × ALetter
// WB9. ALetter × Numeric
// WB10. Numeric × ALetter
// UAX#29 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter)
// WB6. (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
// WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote) × (ALetter | Hebrew_Letter)
// WB7a. Hebrew_Letter × Single_Quote
// WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter
// WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter
// WB9. (ALetter | Hebrew_Letter) × Numeric
// WB10. Numeric × (ALetter | Hebrew_Letter)
// WB13. Katakana × Katakana
// WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
// WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
// WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric | Katakana)
//
{ExtendNumLetEx}* ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ )
({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
| ( {NumericEx} ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
| {ALetterEx} ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx} {ALetterEx} | {ALetterEx})* )+ ) )*
{ExtendNumLetEx}* ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
)+
)
({ExtendNumLetEx}+ ( {KatakanaEx} ( {ExtendNumLetEx}* {KatakanaEx} )*
| ( {HebrewLetterEx} ( {SingleQuoteEx} | {DoubleQuoteEx} {HebrewLetterEx} )
| {NumericEx} ( ( {ExtendNumLetEx}* | {MidNumericEx} )* {NumericEx} )*
| {HebrewOrALetterEx} ( ( {ExtendNumLetEx}* | {MidLetterEx} )* {HebrewOrALetterEx} )*
)+
)
)*
{ExtendNumLetEx}*
{ return WORD_TYPE; }
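Note: the new WB7b/WB7c alternatives above ({HebrewLetterEx} {DoubleQuoteEx} {HebrewLetterEx}) mean a U+0022 double quote sandwiched between two Hebrew letters no longer splits the word, so Hebrew acronyms written with an embedded '"' survive tokenization intact. A minimal sketch of the effect (illustrative only, not part of this patch; version constant and test string are assumptions):

    import java.io.StringReader;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class HebrewWordBreakDemo {
      public static void main(String[] args) throws Exception {
        // Hebrew acronym with an embedded double quote between Hebrew letters.
        StandardTokenizer ts = new StandardTokenizer(Version.LUCENE_CURRENT,
            new StringReader("מנכ\"ל"));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          System.out.println(term.toString()); // expected: the whole acronym as one token
        }
        ts.end();
        ts.close();
      }
    }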
@@ -258,7 +273,7 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
// annex. That means that satisfactory treatment of languages like Chinese
// or Thai requires special handling.
//
// In Unicode 6.1, only one character has the \p{Line_Break = Contingent_Break}
// In Unicode 6.3, only one character has the \p{Line_Break = Contingent_Break}
// property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER.
//
// In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
@@ -280,6 +295,8 @@ EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
// UAX#29 WB3. CR × LF
// WB3a. (Newline | CR | LF) ÷
// WB3b. ÷ (Newline | CR | LF)
// WB13c. Regional_Indicator × Regional_Indicator
// WB14. Any ÷ Any
//
[^] { /* Break so we don't hit fall-through warning: */ break;/* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }
{RegionalIndicatorEx} {RegionalIndicatorEx}+ | [^]
{ /* Break so we don't hit fall-through warning: */ break; /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }

@@ -133,8 +133,8 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
analyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_50, reader) : factory.create(reader);
TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_50, tokenizer) : tokenizer;
Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader) : factory.create(reader);
TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer) : tokenizer;
return new TokenStreamComponents(tokenizer, stream);
}
};
@@ -201,7 +201,7 @@ public class SynonymFilterFactory extends TokenFilterFactory implements Resource
private Analyzer loadAnalyzer(ResourceLoader loader, String cname) throws IOException {
Class<? extends Analyzer> clazz = loader.findClass(cname, Analyzer.class);
try {
Analyzer analyzer = clazz.getConstructor(Version.class).newInstance(Version.LUCENE_50);
Analyzer analyzer = clazz.getConstructor(Version.class).newInstance(Version.LUCENE_CURRENT);
if (analyzer instanceof ResourceLoaderAware) {
((ResourceLoaderAware) analyzer).inform(loader);
}

@@ -1,4 +1,4 @@
/* The following code was generated by JFlex. */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT */

package org.apache.lucene.analysis.wikipedia;

@@ -84,21 +84,20 @@ class WikipediaTokenizerImpl {
private static final int [] ZZ_ACTION = zzUnpackAction();

private static final String ZZ_ACTION_PACKED_0 =
"\12\0\4\1\4\2\1\3\1\1\1\4\1\1\2\5"+
"\1\6\2\5\1\7\1\5\2\10\1\11\1\12\1\11"+
"\1\13\1\14\1\10\1\15\1\16\1\15\1\17\1\20"+
"\1\10\1\21\1\10\4\22\1\23\1\22\1\24\1\25"+
"\1\26\3\0\1\27\14\0\1\30\1\31\1\32\1\33"+
"\1\11\1\0\1\34\1\35\1\36\1\0\1\37\1\0"+
"\1\40\3\0\1\41\1\42\2\43\1\42\2\44\2\0"+
"\1\43\1\0\14\43\1\42\3\0\1\11\1\45\3\0"+
"\1\46\1\47\5\0\1\50\4\0\1\50\2\0\2\50"+
"\2\0\1\11\5\0\1\31\1\42\1\43\1\51\3\0"+
"\1\11\2\0\1\52\30\0\1\53\2\0\1\54\1\55"+
"\1\56";
"\12\0\4\1\4\2\1\3\1\4\1\1\2\5\1\6"+
"\1\5\1\7\1\5\2\10\1\11\1\5\1\12\1\11"+
"\1\13\1\14\1\15\1\16\1\15\1\17\1\20\1\10"+
"\1\21\1\10\4\22\1\23\1\24\1\25\1\26\3\0"+
"\1\27\14\0\1\30\1\31\1\32\1\33\1\11\1\0"+
"\1\34\1\35\1\36\1\0\1\37\1\0\1\40\3\0"+
"\1\41\1\42\2\43\1\42\2\44\2\0\1\43\1\0"+
"\14\43\1\42\3\0\1\11\1\45\3\0\1\46\1\47"+
"\5\0\1\50\4\0\1\50\2\0\2\50\2\0\1\11"+
"\5\0\1\31\1\42\1\43\1\51\3\0\1\11\2\0"+
"\1\52\30\0\1\53\2\0\1\54\1\55\1\56";

private static int [] zzUnpackAction() {
int [] result = new int[184];
int [] result = new int[181];
int offset = 0;
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
return result;
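Note, for readers unfamiliar with JFlex's table packing: the ZZ_*_PACKED_0 strings in this generated scanner are run-length encoded as (count, value) character pairs, which the zzUnpack* methods expand at class-initialization time; for instance the leading "\12\0" above encodes ten (octal 12) zeros. A simplified sketch of the decoder, assuming the usual shape of JFlex-generated unpack code (the generated method differs only cosmetically):

    // Expands a JFlex packed string: each pair of chars is (run length, value).
    static int[] unpack(String packed, int length) {
      int[] result = new int[length];
      int i = 0; // index into the packed string
      int j = 0; // index into the expanded table
      while (i < packed.length()) {
        int count = packed.charAt(i++);
        int value = packed.charAt(i++);
        do { result[j++] = value; } while (--count > 0);
      }
      return result;
    }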
@@ -125,30 +124,30 @@ class WikipediaTokenizerImpl {
private static final String ZZ_ROWMAP_PACKED_0 =
"\0\0\0\54\0\130\0\204\0\260\0\334\0\u0108\0\u0134"+
"\0\u0160\0\u018c\0\u01b8\0\u01e4\0\u0210\0\u023c\0\u0268\0\u0294"+
"\0\u02c0\0\u02ec\0\u01b8\0\u0318\0\u0344\0\u0370\0\u01b8\0\u039c"+
"\0\u03c8\0\u03f4\0\u0420\0\u044c\0\u0478\0\u01b8\0\u039c\0\u04a4"+
"\0\u01b8\0\u04d0\0\u04fc\0\u0528\0\u0554\0\u0580\0\u05ac\0\u05d8"+
"\0\u0604\0\u0630\0\u065c\0\u0688\0\u06b4\0\u01b8\0\u06e0\0\u039c"+
"\0\u070c\0\u0738\0\u0764\0\u0790\0\u01b8\0\u01b8\0\u07bc\0\u07e8"+
"\0\u0814\0\u01b8\0\u0840\0\u086c\0\u0898\0\u08c4\0\u08f0\0\u091c"+
"\0\u0948\0\u0974\0\u09a0\0\u09cc\0\u09f8\0\u0a24\0\u0a50\0\u0a7c"+
"\0\u01b8\0\u01b8\0\u0aa8\0\u0ad4\0\u0b00\0\u0b00\0\u01b8\0\u0b2c"+
"\0\u02c0\0\u02ec\0\u01b8\0\u0318\0\u0344\0\u01b8\0\u0370\0\u039c"+
"\0\u03c8\0\u03f4\0\u0420\0\u01b8\0\u0370\0\u044c\0\u0478\0\u01b8"+
"\0\u04a4\0\u04d0\0\u04fc\0\u0528\0\u0554\0\u0580\0\u05ac\0\u05d8"+
"\0\u0604\0\u0630\0\u065c\0\u01b8\0\u0688\0\u0370\0\u06b4\0\u06e0"+
"\0\u070c\0\u01b8\0\u01b8\0\u0738\0\u0764\0\u0790\0\u01b8\0\u07bc"+
"\0\u07e8\0\u0814\0\u0840\0\u086c\0\u0898\0\u08c4\0\u08f0\0\u091c"+
"\0\u0948\0\u0974\0\u09a0\0\u09cc\0\u09f8\0\u01b8\0\u01b8\0\u0a24"+
"\0\u0a50\0\u0a7c\0\u0a7c\0\u01b8\0\u0aa8\0\u0ad4\0\u0b00\0\u0b2c"+
"\0\u0b58\0\u0b84\0\u0bb0\0\u0bdc\0\u0c08\0\u0c34\0\u0c60\0\u0c8c"+
"\0\u0cb8\0\u0ce4\0\u0d10\0\u0898\0\u0d3c\0\u0d68\0\u0d94\0\u0dc0"+
"\0\u0814\0\u0cb8\0\u0ce4\0\u0d10\0\u0d3c\0\u0d68\0\u0d94\0\u0dc0"+
"\0\u0dec\0\u0e18\0\u0e44\0\u0e70\0\u0e9c\0\u0ec8\0\u0ef4\0\u0f20"+
"\0\u0f4c\0\u0f78\0\u0fa4\0\u0fd0\0\u0ffc\0\u1028\0\u1054\0\u1080"+
"\0\u10ac\0\u10d8\0\u01b8\0\u1104\0\u1130\0\u115c\0\u1188\0\u01b8"+
"\0\u0f4c\0\u0f78\0\u0fa4\0\u0fd0\0\u0ffc\0\u1028\0\u1054\0\u01b8"+
"\0\u1080\0\u10ac\0\u10d8\0\u1104\0\u01b8\0\u1130\0\u115c\0\u1188"+
"\0\u11b4\0\u11e0\0\u120c\0\u1238\0\u1264\0\u1290\0\u12bc\0\u12e8"+
"\0\u1314\0\u1340\0\u136c\0\u1398\0\u13c4\0\u086c\0\u09f8\0\u13f0"+
"\0\u141c\0\u1448\0\u1474\0\u14a0\0\u14cc\0\u14f8\0\u1524\0\u01b8"+
"\0\u1550\0\u157c\0\u15a8\0\u15d4\0\u1600\0\u162c\0\u1658\0\u1684"+
"\0\u16b0\0\u01b8\0\u16dc\0\u1708\0\u1734\0\u1760\0\u178c\0\u17b8"+
"\0\u1314\0\u1340\0\u07e8\0\u0974\0\u136c\0\u1398\0\u13c4\0\u13f0"+
"\0\u141c\0\u1448\0\u1474\0\u14a0\0\u01b8\0\u14cc\0\u14f8\0\u1524"+
"\0\u1550\0\u157c\0\u15a8\0\u15d4\0\u1600\0\u162c\0\u01b8\0\u1658"+
"\0\u1684\0\u16b0\0\u16dc\0\u1708\0\u1734\0\u1760\0\u178c\0\u17b8"+
"\0\u17e4\0\u1810\0\u183c\0\u1868\0\u1894\0\u18c0\0\u18ec\0\u1918"+
"\0\u1944\0\u1970\0\u199c\0\u19c8\0\u19f4\0\u1a20\0\u1a4c\0\u1a78"+
"\0\u1aa4\0\u1ad0\0\u1afc\0\u1b28\0\u1b54\0\u01b8\0\u01b8\0\u01b8";
"\0\u1aa4\0\u1ad0\0\u01b8\0\u01b8\0\u01b8";

private static int [] zzUnpackRowMap() {
int [] result = new int[184];
int [] result = new int[181];
int offset = 0;
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
return result;
@@ -172,152 +171,149 @@ class WikipediaTokenizerImpl {

private static final String ZZ_TRANS_PACKED_0 =
"\1\13\1\14\5\13\1\15\1\13\1\16\3\13\1\17"+
"\1\20\1\21\1\22\1\23\1\24\2\13\1\25\2\13"+
"\15\17\1\26\2\13\3\17\1\13\7\27\1\30\5\27"+
"\4\31\1\27\1\32\3\27\1\33\1\27\15\31\3\27"+
"\3\31\10\27\1\30\5\27\4\34\1\27\1\32\3\27"+
"\1\35\1\27\15\34\3\27\3\34\1\27\7\36\1\37"+
"\5\36\4\40\1\36\1\32\2\27\1\36\1\41\1\36"+
"\15\40\3\36\1\42\2\40\2\36\1\43\5\36\1\37"+
"\5\36\4\44\1\36\1\45\2\36\1\46\2\36\15\44"+
"\3\36\3\44\10\36\1\37\5\36\4\47\1\36\1\45"+
"\2\36\1\46\2\36\15\47\3\36\3\47\10\36\1\37"+
"\5\36\4\47\1\36\1\45\2\36\1\50\2\36\15\47"+
"\3\36\3\47\10\36\1\37\1\36\1\51\3\36\4\52"+
"\1\36\1\45\5\36\15\52\3\36\3\52\10\36\1\53"+
"\5\36\4\54\1\36\1\45\5\36\15\54\1\36\1\55"+
"\1\36\3\54\1\36\1\56\1\57\5\56\1\60\1\56"+
"\1\61\3\56\4\62\1\56\1\63\2\56\1\64\2\56"+
"\15\62\2\56\1\65\3\62\1\56\55\0\1\66\62\0"+
"\1\67\4\0\4\70\7\0\6\70\1\71\6\70\3\0"+
"\3\70\12\0\1\72\43\0\1\73\1\74\1\75\1\76"+
"\2\77\1\0\1\100\3\0\1\100\1\17\1\20\1\21"+
"\1\22\7\0\15\17\3\0\3\17\3\0\1\101\1\0"+
"\1\102\2\103\1\0\1\104\3\0\1\104\3\20\1\22"+
"\7\0\15\20\3\0\3\20\2\0\1\73\1\105\1\75"+
"\1\76\2\103\1\0\1\104\3\0\1\104\1\21\1\20"+
"\1\21\1\22\7\0\15\21\3\0\3\21\3\0\1\106"+
"\1\0\1\102\2\77\1\0\1\100\3\0\1\100\4\22"+
"\7\0\15\22\3\0\3\22\24\0\1\13\55\0\1\107"+
"\73\0\1\110\16\0\1\67\4\0\4\70\7\0\15\70"+
"\3\0\3\70\16\0\4\31\7\0\15\31\3\0\3\31"+
"\24\0\1\27\56\0\1\111\42\0\4\34\7\0\15\34"+
"\3\0\3\34\27\0\1\112\42\0\4\40\7\0\15\40"+
"\3\0\3\40\16\0\4\40\7\0\2\40\1\113\12\40"+
"\3\0\3\40\2\0\1\114\67\0\4\44\7\0\15\44"+
"\3\0\3\44\24\0\1\36\55\0\1\115\43\0\4\47"+
"\7\0\15\47\3\0\3\47\26\0\1\116\37\0\1\117"+
"\57\0\4\52\7\0\15\52\3\0\3\52\11\0\1\120"+
"\4\0\4\70\7\0\15\70\3\0\3\70\16\0\4\54"+
"\7\0\15\54\3\0\3\54\47\0\1\117\6\0\1\121"+
"\63\0\1\122\57\0\4\62\7\0\15\62\3\0\3\62"+
"\24\0\1\56\55\0\1\123\43\0\4\70\7\0\15\70"+
"\3\0\3\70\14\0\1\36\1\0\4\124\1\0\3\125"+
"\3\0\15\124\3\0\3\124\14\0\1\36\1\0\4\124"+
"\1\0\3\125\3\0\3\124\1\126\11\124\3\0\3\124"+
"\16\0\1\127\1\0\1\127\10\0\15\127\3\0\3\127"+
"\16\0\1\130\1\131\1\132\1\133\7\0\15\130\3\0"+
"\3\130\16\0\1\134\1\0\1\134\10\0\15\134\3\0"+
"\3\134\16\0\1\135\1\136\1\135\1\136\7\0\15\135"+
"\3\0\3\135\16\0\1\137\2\140\1\141\7\0\15\137"+
"\3\0\3\137\16\0\1\100\2\142\10\0\15\100\3\0"+
"\3\100\16\0\1\143\2\144\1\145\7\0\15\143\3\0"+
"\3\143\16\0\4\136\7\0\15\136\3\0\3\136\16\0"+
"\1\146\2\147\1\150\7\0\15\146\3\0\3\146\16\0"+
"\1\151\2\152\1\153\7\0\15\151\3\0\3\151\16\0"+
"\1\154\1\144\1\155\1\145\7\0\15\154\3\0\3\154"+
"\16\0\1\156\2\131\1\133\7\0\15\156\3\0\3\156"+
"\30\0\1\157\1\160\64\0\1\161\27\0\4\40\7\0"+
"\2\40\1\162\12\40\3\0\3\40\2\0\1\163\101\0"+
"\1\164\1\165\40\0\4\70\7\0\6\70\1\166\6\70"+
"\3\0\3\70\2\0\1\167\63\0\1\170\71\0\1\171"+
"\1\172\34\0\1\173\1\0\1\36\1\0\4\124\1\0"+
"\3\125\3\0\15\124\3\0\3\124\16\0\4\174\1\0"+
"\3\125\3\0\15\174\3\0\3\174\12\0\1\173\1\0"+
"\1\36\1\0\4\124\1\0\3\125\3\0\10\124\1\175"+
"\4\124\3\0\3\124\2\0\1\73\13\0\1\127\1\0"+
"\1\127\10\0\15\127\3\0\3\127\3\0\1\176\1\0"+
"\1\102\2\177\6\0\1\130\1\131\1\132\1\133\7\0"+
"\15\130\3\0\3\130\3\0\1\200\1\0\1\102\2\201"+
"\1\0\1\202\3\0\1\202\3\131\1\133\7\0\15\131"+
"\3\0\3\131\3\0\1\203\1\0\1\102\2\201\1\0"+
"\1\202\3\0\1\202\1\132\1\131\1\132\1\133\7\0"+
"\15\132\3\0\3\132\3\0\1\204\1\0\1\102\2\177"+
"\6\0\4\133\7\0\15\133\3\0\3\133\3\0\1\205"+
"\2\0\1\205\7\0\1\135\1\136\1\135\1\136\7\0"+
"\15\135\3\0\3\135\3\0\1\205\2\0\1\205\7\0"+
"\4\136\7\0\15\136\3\0\3\136\3\0\1\177\1\0"+
"\1\102\2\177\6\0\1\137\2\140\1\141\7\0\15\137"+
"\3\0\3\137\3\0\1\201\1\0\1\102\2\201\1\0"+
"\1\202\3\0\1\202\3\140\1\141\7\0\15\140\3\0"+
"\3\140\3\0\1\177\1\0\1\102\2\177\6\0\4\141"+
"\7\0\15\141\3\0\3\141\3\0\1\202\2\0\2\202"+
"\1\0\1\202\3\0\1\202\3\142\10\0\15\142\3\0"+
"\3\142\3\0\1\106\1\0\1\102\2\77\1\0\1\100"+
"\3\0\1\100\1\143\2\144\1\145\7\0\15\143\3\0"+
"\3\143\3\0\1\101\1\0\1\102\2\103\1\0\1\104"+
"\3\0\1\104\3\144\1\145\7\0\15\144\3\0\3\144"+
"\3\0\1\106\1\0\1\102\2\77\1\0\1\100\3\0"+
"\1\100\4\145\7\0\15\145\3\0\3\145\3\0\1\77"+
"\1\0\1\102\2\77\1\0\1\100\3\0\1\100\1\146"+
"\2\147\1\150\7\0\15\146\3\0\3\146\3\0\1\103"+
"\1\0\1\102\2\103\1\0\1\104\3\0\1\104\3\147"+
"\1\150\7\0\15\147\3\0\3\147\3\0\1\77\1\0"+
"\1\102\2\77\1\0\1\100\3\0\1\100\4\150\7\0"+
"\15\150\3\0\3\150\3\0\1\100\2\0\2\100\1\0"+
"\1\100\3\0\1\100\1\151\2\152\1\153\7\0\15\151"+
"\3\0\3\151\3\0\1\104\2\0\2\104\1\0\1\104"+
"\3\0\1\104\3\152\1\153\7\0\15\152\3\0\3\152"+
"\3\0\1\100\2\0\2\100\1\0\1\100\3\0\1\100"+
"\4\153\7\0\15\153\3\0\3\153\3\0\1\206\1\0"+
"\1\102\2\77\1\0\1\100\3\0\1\100\1\154\1\144"+
"\1\155\1\145\7\0\15\154\3\0\3\154\3\0\1\207"+
"\1\0\1\102\2\103\1\0\1\104\3\0\1\104\1\155"+
"\1\144\1\155\1\145\7\0\15\155\3\0\3\155\3\0"+
"\1\204\1\0\1\102\2\177\6\0\1\156\2\131\1\133"+
"\7\0\15\156\3\0\3\156\31\0\1\160\54\0\1\210"+
"\64\0\1\211\26\0\4\40\7\0\15\40\3\0\1\40"+
"\1\212\1\40\31\0\1\165\54\0\1\213\35\0\1\36"+
"\1\0\4\124\1\0\3\125\3\0\3\124\1\214\11\124"+
"\3\0\3\124\2\0\1\215\102\0\1\172\54\0\1\216"+
"\34\0\1\217\52\0\1\173\3\0\4\174\7\0\15\174"+
"\3\0\3\174\12\0\1\173\1\0\1\220\1\0\4\124"+
"\1\0\3\125\3\0\15\124\3\0\3\124\16\0\1\221"+
"\1\133\1\221\1\133\7\0\15\221\3\0\3\221\16\0"+
"\4\141\7\0\15\141\3\0\3\141\16\0\4\145\7\0"+
"\15\145\3\0\3\145\16\0\4\150\7\0\15\150\3\0"+
"\3\150\16\0\4\153\7\0\15\153\3\0\3\153\16\0"+
"\1\222\1\145\1\222\1\145\7\0\15\222\3\0\3\222"+
"\16\0\4\133\7\0\15\133\3\0\3\133\16\0\4\223"+
"\7\0\15\223\3\0\3\223\33\0\1\224\61\0\1\225"+
"\30\0\4\40\6\0\1\226\15\40\3\0\2\40\1\227"+
"\33\0\1\230\32\0\1\173\1\0\1\36\1\0\4\124"+
"\1\0\3\125\3\0\10\124\1\231\4\124\3\0\3\124"+
"\2\0\1\232\104\0\1\233\36\0\4\234\7\0\15\234"+
"\3\0\3\234\3\0\1\176\1\0\1\102\2\177\6\0"+
"\1\221\1\133\1\221\1\133\7\0\15\221\3\0\3\221"+
"\3\0\1\206\1\0\1\102\2\77\1\0\1\100\3\0"+
"\1\100\1\222\1\145\1\222\1\145\7\0\15\222\3\0"+
"\3\222\3\0\1\205\2\0\1\205\7\0\4\223\7\0"+
"\15\223\3\0\3\223\34\0\1\235\55\0\1\236\26\0"+
"\1\237\60\0\4\40\6\0\1\226\15\40\3\0\3\40"+
"\34\0\1\240\31\0\1\173\1\0\1\117\1\0\4\124"+
"\1\0\3\125\3\0\15\124\3\0\3\124\34\0\1\241"+
"\32\0\1\242\2\0\4\234\7\0\15\234\3\0\3\234"+
"\35\0\1\243\62\0\1\244\20\0\1\245\77\0\1\246"+
"\53\0\1\247\32\0\1\36\1\0\4\174\1\0\3\125"+
"\3\0\15\174\3\0\3\174\36\0\1\250\53\0\1\251"+
"\33\0\4\252\7\0\15\252\3\0\3\252\36\0\1\253"+
"\53\0\1\254\54\0\1\255\61\0\1\256\11\0\1\257"+
"\12\0\4\252\7\0\15\252\3\0\3\252\37\0\1\260"+
"\53\0\1\261\54\0\1\262\22\0\1\13\62\0\4\263"+
"\7\0\15\263\3\0\3\263\40\0\1\264\53\0\1\265"+
"\43\0\1\266\26\0\2\263\1\0\2\263\1\0\2\263"+
"\2\0\5\263\7\0\15\263\3\0\4\263\27\0\1\267"+
"\53\0\1\270\24\0";
"\1\20\1\21\1\22\1\23\3\13\1\24\2\13\15\17"+
"\1\25\2\13\3\17\1\13\7\26\1\27\5\26\4\30"+
"\5\26\1\31\1\26\15\30\3\26\3\30\10\26\1\27"+
"\5\26\4\32\5\26\1\33\1\26\15\32\3\26\3\32"+
"\1\26\7\34\1\35\5\34\4\36\1\34\1\37\2\26"+
"\1\34\1\40\1\34\15\36\3\34\1\41\2\36\2\34"+
"\1\42\5\34\1\35\5\34\4\43\4\34\1\44\2\34"+
"\15\43\3\34\3\43\10\34\1\35\5\34\4\45\4\34"+
"\1\44\2\34\15\45\3\34\3\45\10\34\1\35\5\34"+
"\4\45\4\34\1\46\2\34\15\45\3\34\3\45\10\34"+
"\1\35\1\34\1\47\3\34\4\50\7\34\15\50\3\34"+
"\3\50\10\34\1\51\5\34\4\52\7\34\15\52\1\34"+
"\1\53\1\34\3\52\1\34\1\54\1\55\5\54\1\56"+
"\1\54\1\57\3\54\4\60\4\54\1\61\2\54\15\60"+
"\2\54\1\62\3\60\1\54\55\0\1\63\62\0\1\64"+
"\4\0\4\65\7\0\6\65\1\66\6\65\3\0\3\65"+
"\12\0\1\67\43\0\1\70\1\71\1\72\1\73\2\74"+
"\1\0\1\75\3\0\1\75\1\17\1\20\1\21\1\22"+
"\7\0\15\17\3\0\3\17\3\0\1\76\1\0\1\77"+
"\2\100\1\0\1\101\3\0\1\101\3\20\1\22\7\0"+
"\15\20\3\0\3\20\2\0\1\70\1\102\1\72\1\73"+
"\2\100\1\0\1\101\3\0\1\101\1\21\1\20\1\21"+
"\1\22\7\0\15\21\3\0\3\21\3\0\1\103\1\0"+
"\1\77\2\74\1\0\1\75\3\0\1\75\4\22\7\0"+
"\15\22\3\0\3\22\26\0\1\104\73\0\1\105\16\0"+
"\1\64\4\0\4\65\7\0\15\65\3\0\3\65\16\0"+
"\4\30\7\0\15\30\3\0\3\30\27\0\1\106\42\0"+
"\4\32\7\0\15\32\3\0\3\32\27\0\1\107\42\0"+
"\4\36\7\0\15\36\3\0\3\36\24\0\1\26\45\0"+
"\4\36\7\0\2\36\1\110\12\36\3\0\3\36\2\0"+
"\1\111\67\0\4\43\7\0\15\43\3\0\3\43\26\0"+
"\1\112\43\0\4\45\7\0\15\45\3\0\3\45\26\0"+
"\1\113\37\0\1\114\57\0\4\50\7\0\15\50\3\0"+
"\3\50\11\0\1\115\4\0\4\65\7\0\15\65\3\0"+
"\3\65\16\0\4\52\7\0\15\52\3\0\3\52\47\0"+
"\1\114\6\0\1\116\63\0\1\117\57\0\4\60\7\0"+
"\15\60\3\0\3\60\26\0\1\120\43\0\4\65\7\0"+
"\15\65\3\0\3\65\14\0\1\34\1\0\4\121\1\0"+
"\3\122\3\0\15\121\3\0\3\121\14\0\1\34\1\0"+
"\4\121\1\0\3\122\3\0\3\121\1\123\11\121\3\0"+
"\3\121\16\0\1\124\1\0\1\124\10\0\15\124\3\0"+
"\3\124\16\0\1\125\1\126\1\127\1\130\7\0\15\125"+
"\3\0\3\125\16\0\1\131\1\0\1\131\10\0\15\131"+
"\3\0\3\131\16\0\1\132\1\133\1\132\1\133\7\0"+
"\15\132\3\0\3\132\16\0\1\134\2\135\1\136\7\0"+
"\15\134\3\0\3\134\16\0\1\75\2\137\10\0\15\75"+
"\3\0\3\75\16\0\1\140\2\141\1\142\7\0\15\140"+
"\3\0\3\140\16\0\4\133\7\0\15\133\3\0\3\133"+
"\16\0\1\143\2\144\1\145\7\0\15\143\3\0\3\143"+
"\16\0\1\146\2\147\1\150\7\0\15\146\3\0\3\146"+
"\16\0\1\151\1\141\1\152\1\142\7\0\15\151\3\0"+
"\3\151\16\0\1\153\2\126\1\130\7\0\15\153\3\0"+
"\3\153\30\0\1\154\1\155\64\0\1\156\27\0\4\36"+
"\7\0\2\36\1\157\12\36\3\0\3\36\2\0\1\160"+
"\101\0\1\161\1\162\40\0\4\65\7\0\6\65\1\163"+
"\6\65\3\0\3\65\2\0\1\164\63\0\1\165\71\0"+
"\1\166\1\167\34\0\1\170\1\0\1\34\1\0\4\121"+
"\1\0\3\122\3\0\15\121\3\0\3\121\16\0\4\171"+
"\1\0\3\122\3\0\15\171\3\0\3\171\12\0\1\170"+
"\1\0\1\34\1\0\4\121\1\0\3\122\3\0\10\121"+
"\1\172\4\121\3\0\3\121\2\0\1\70\13\0\1\124"+
"\1\0\1\124\10\0\15\124\3\0\3\124\3\0\1\173"+
"\1\0\1\77\2\174\6\0\1\125\1\126\1\127\1\130"+
"\7\0\15\125\3\0\3\125\3\0\1\175\1\0\1\77"+
"\2\176\1\0\1\177\3\0\1\177\3\126\1\130\7\0"+
"\15\126\3\0\3\126\3\0\1\200\1\0\1\77\2\176"+
"\1\0\1\177\3\0\1\177\1\127\1\126\1\127\1\130"+
"\7\0\15\127\3\0\3\127\3\0\1\201\1\0\1\77"+
"\2\174\6\0\4\130\7\0\15\130\3\0\3\130\3\0"+
"\1\202\2\0\1\202\7\0\1\132\1\133\1\132\1\133"+
"\7\0\15\132\3\0\3\132\3\0\1\202\2\0\1\202"+
"\7\0\4\133\7\0\15\133\3\0\3\133\3\0\1\174"+
"\1\0\1\77\2\174\6\0\1\134\2\135\1\136\7\0"+
"\15\134\3\0\3\134\3\0\1\176\1\0\1\77\2\176"+
"\1\0\1\177\3\0\1\177\3\135\1\136\7\0\15\135"+
"\3\0\3\135\3\0\1\174\1\0\1\77\2\174\6\0"+
"\4\136\7\0\15\136\3\0\3\136\3\0\1\177\2\0"+
"\2\177\1\0\1\177\3\0\1\177\3\137\10\0\15\137"+
"\3\0\3\137\3\0\1\103\1\0\1\77\2\74\1\0"+
"\1\75\3\0\1\75\1\140\2\141\1\142\7\0\15\140"+
"\3\0\3\140\3\0\1\76\1\0\1\77\2\100\1\0"+
"\1\101\3\0\1\101\3\141\1\142\7\0\15\141\3\0"+
"\3\141\3\0\1\103\1\0\1\77\2\74\1\0\1\75"+
"\3\0\1\75\4\142\7\0\15\142\3\0\3\142\3\0"+
"\1\74\1\0\1\77\2\74\1\0\1\75\3\0\1\75"+
"\1\143\2\144\1\145\7\0\15\143\3\0\3\143\3\0"+
"\1\100\1\0\1\77\2\100\1\0\1\101\3\0\1\101"+
"\3\144\1\145\7\0\15\144\3\0\3\144\3\0\1\74"+
"\1\0\1\77\2\74\1\0\1\75\3\0\1\75\4\145"+
"\7\0\15\145\3\0\3\145\3\0\1\75\2\0\2\75"+
"\1\0\1\75\3\0\1\75\1\146\2\147\1\150\7\0"+
"\15\146\3\0\3\146\3\0\1\101\2\0\2\101\1\0"+
"\1\101\3\0\1\101\3\147\1\150\7\0\15\147\3\0"+
"\3\147\3\0\1\75\2\0\2\75\1\0\1\75\3\0"+
"\1\75\4\150\7\0\15\150\3\0\3\150\3\0\1\203"+
"\1\0\1\77\2\74\1\0\1\75\3\0\1\75\1\151"+
"\1\141\1\152\1\142\7\0\15\151\3\0\3\151\3\0"+
"\1\204\1\0\1\77\2\100\1\0\1\101\3\0\1\101"+
"\1\152\1\141\1\152\1\142\7\0\15\152\3\0\3\152"+
"\3\0\1\201\1\0\1\77\2\174\6\0\1\153\2\126"+
"\1\130\7\0\15\153\3\0\3\153\31\0\1\155\54\0"+
"\1\205\64\0\1\206\26\0\4\36\7\0\15\36\3\0"+
"\1\36\1\207\1\36\31\0\1\162\54\0\1\210\35\0"+
"\1\34\1\0\4\121\1\0\3\122\3\0\3\121\1\211"+
"\11\121\3\0\3\121\2\0\1\212\102\0\1\167\54\0"+
"\1\213\34\0\1\214\52\0\1\170\3\0\4\171\7\0"+
"\15\171\3\0\3\171\12\0\1\170\1\0\1\215\1\0"+
"\4\121\1\0\3\122\3\0\15\121\3\0\3\121\16\0"+
"\1\216\1\130\1\216\1\130\7\0\15\216\3\0\3\216"+
"\16\0\4\136\7\0\15\136\3\0\3\136\16\0\4\142"+
"\7\0\15\142\3\0\3\142\16\0\4\145\7\0\15\145"+
"\3\0\3\145\16\0\4\150\7\0\15\150\3\0\3\150"+
"\16\0\1\217\1\142\1\217\1\142\7\0\15\217\3\0"+
"\3\217\16\0\4\130\7\0\15\130\3\0\3\130\16\0"+
"\4\220\7\0\15\220\3\0\3\220\33\0\1\221\61\0"+
"\1\222\30\0\4\36\6\0\1\223\15\36\3\0\2\36"+
"\1\224\33\0\1\225\32\0\1\170\1\0\1\34\1\0"+
"\4\121\1\0\3\122\3\0\10\121\1\226\4\121\3\0"+
"\3\121\2\0\1\227\104\0\1\230\36\0\4\231\7\0"+
"\15\231\3\0\3\231\3\0\1\173\1\0\1\77\2\174"+
"\6\0\1\216\1\130\1\216\1\130\7\0\15\216\3\0"+
"\3\216\3\0\1\203\1\0\1\77\2\74\1\0\1\75"+
"\3\0\1\75\1\217\1\142\1\217\1\142\7\0\15\217"+
"\3\0\3\217\3\0\1\202\2\0\1\202\7\0\4\220"+
"\7\0\15\220\3\0\3\220\34\0\1\232\55\0\1\233"+
"\26\0\1\234\60\0\4\36\6\0\1\223\15\36\3\0"+
"\3\36\34\0\1\235\31\0\1\170\1\0\1\114\1\0"+
"\4\121\1\0\3\122\3\0\15\121\3\0\3\121\34\0"+
"\1\236\32\0\1\237\2\0\4\231\7\0\15\231\3\0"+
"\3\231\35\0\1\240\62\0\1\241\20\0\1\242\77\0"+
"\1\243\53\0\1\244\32\0\1\34\1\0\4\171\1\0"+
"\3\122\3\0\15\171\3\0\3\171\36\0\1\245\53\0"+
"\1\246\33\0\4\247\7\0\15\247\3\0\3\247\36\0"+
"\1\250\53\0\1\251\54\0\1\252\61\0\1\253\11\0"+
"\1\254\12\0\4\247\7\0\15\247\3\0\3\247\37\0"+
"\1\255\53\0\1\256\54\0\1\257\22\0\1\13\62\0"+
"\4\260\7\0\15\260\3\0\3\260\40\0\1\261\53\0"+
"\1\262\43\0\1\263\26\0\2\260\1\0\2\260\1\0"+
"\2\260\2\0\5\260\7\0\15\260\3\0\4\260\27\0"+
"\1\264\53\0\1\265\24\0";

private static int [] zzUnpackTrans() {
int [] result = new int[7040];
int [] result = new int[6908];
int offset = 0;
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
return result;
@@ -355,8 +351,8 @@ class WikipediaTokenizerImpl {
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();

private static final String ZZ_ATTRIBUTE_PACKED_0 =
"\12\0\1\11\7\1\1\11\3\1\1\11\6\1\1\11"+
"\2\1\1\11\14\1\1\11\6\1\2\11\3\0\1\11"+
"\12\0\1\11\7\1\1\11\2\1\1\11\5\1\1\11"+
"\3\1\1\11\13\1\1\11\5\1\2\11\3\0\1\11"+
"\14\0\2\1\2\11\1\1\1\0\2\1\1\11\1\0"+
"\1\1\1\0\1\1\3\0\7\1\2\0\1\1\1\0"+
"\15\1\3\0\1\1\1\11\3\0\1\1\1\11\5\0"+

@@ -365,7 +361,7 @@ class WikipediaTokenizerImpl {
"\2\0\3\11";

private static int [] zzUnpackAttribute() {
int [] result = new int[184];
int [] result = new int[181];
int offset = 0;
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
return result;
@@ -508,7 +504,6 @@ final void reset() {

/**
 * Creates a new scanner
 * There is also a java.io.InputStream version of this constructor.
 *
 * @param in the java.io.Reader to read input from.
 */
@@ -517,7 +512,6 @@ final void reset() {
}

/**
 * Unpacks the compressed character translation table.
 *

@@ -212,7 +212,7 @@ DOUBLE_EQUALS = "="{2}
{DOUBLE_BRACE} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;}
{CITATION} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} |{INFOBOX} {numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
[^] |{INFOBOX} {numWikiTokensSeen = 0; positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
}
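Note: the pattern change in this and the following hunks is the same throughout this grammar: the catch-all `. | {WHITESPACE}` rules become `[^]`. In JFlex, `.` does not match line terminators, while `[^]` (the negation of the empty class) matches any character at all, so the likely motivation is that the rewritten rules cannot leave any input unmatched. A rough java.util.regex analogue of the distinction (illustrative only, not part of this patch):

    import java.util.regex.Pattern;

    public class DotVsAnyDemo {
      public static void main(String[] args) {
        // '.' skips line terminators, so a lone newline is not matched...
        System.out.println(Pattern.compile(".").matcher("\n").matches());     // false
        // ...while DOTALL-style "any character" matches it, like JFlex's [^].
        System.out.println(Pattern.compile("(?s).").matcher("\n").matches()); // true
      }
    }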
<INTERNAL_LINK_STATE>{

@@ -221,7 +221,7 @@ DOUBLE_EQUALS = "="{2}
{ALPHANUM} {yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;}
{DOUBLE_BRACKET_CLOSE} {numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
[^] { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
}

<EXTERNAL_LINK_STATE>{

@@ -236,7 +236,7 @@ DOUBLE_EQUALS = "="{2}
{ALPHANUM} {yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;}
{DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
[^] { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
}
//italics
<TWO_SINGLE_QUOTES_STATE>{

@@ -249,7 +249,7 @@ DOUBLE_EQUALS = "="{2}
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}

//ignore
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
}
//bold
<THREE_SINGLE_QUOTES_STATE>{

@@ -260,7 +260,7 @@ DOUBLE_EQUALS = "="{2}
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}

//ignore
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }

}
//bold italics

@@ -272,7 +272,7 @@ DOUBLE_EQUALS = "="{2}
{EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}

//ignore
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
}

<DOUBLE_EQUALS_STATE>{

@@ -280,7 +280,7 @@ DOUBLE_EQUALS = "="{2}
{ALPHANUM} {currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;}
{DOUBLE_EQUALS} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
}

<DOUBLE_BRACE_STATE>{

@@ -288,7 +288,7 @@ DOUBLE_EQUALS = "="{2}
{DOUBLE_BRACE_CLOSE} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
{CITATION_CLOSE} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
//ignore
. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
}

<STRING> {

@@ -305,7 +305,7 @@ DOUBLE_EQUALS = "="{2}

{PIPE} {yybegin(STRING); return currentTokType;/*pipe*/}

.|{WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */ }
[^] { /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */ }
}

@@ -327,7 +327,7 @@ DOUBLE_EQUALS = "="{2}
//end wikipedia

/** Ignore the rest */
. | {WHITESPACE}|{TAGS} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
[^] | {TAGS} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }

//INTERNAL_LINK = "["{2}({ALPHANUM}+{WHITESPACE}*)+"]"{2}

@@ -202,7 +202,7 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
}

public void testUnicodeWordBreaks() throws Exception {
WordBreakTestUnicode_6_1_0 wordBreakTest = new WordBreakTestUnicode_6_1_0();
WordBreakTestUnicode_6_3_0 wordBreakTest = new WordBreakTestUnicode_6_3_0();
wordBreakTest.test(a);
}

@@ -231,6 +231,8 @@ public class TestStandardAnalyzer extends BaseTokenStreamTestCase {
checkOneTerm(a, "아゙", "아゙"); // hangul
}

/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
checkRandomData(random(), new StandardAnalyzer(TEST_VERSION_CURRENT), 1000*RANDOM_MULTIPLIER);

@@ -60,7 +60,7 @@ public class TestStopAnalyzer extends BaseTokenStreamTestCase {

public void testStopList() throws IOException {
CharArraySet stopWordsSet = new CharArraySet(TEST_VERSION_CURRENT, asSet("good", "test", "analyzer"), false);
StopAnalyzer newStop = new StopAnalyzer(Version.LUCENE_40, stopWordsSet);
StopAnalyzer newStop = new StopAnalyzer(TEST_VERSION_CURRENT, stopWordsSet);
try (TokenStream stream = newStop.tokenStream("test", "This is a good test of the english stop analyzer")) {
assertNotNull(stream);
CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);

@@ -94,7 +94,7 @@ public class TestStopFilter extends BaseTokenStreamTestCase {
// LUCENE-3849: make sure after .end() we see the "ending" posInc
public void testEndStopword() throws Exception {
CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "of");
StopFilter stpf = new StopFilter(Version.LUCENE_40, new MockTokenizer(new StringReader("test of"), MockTokenizer.WHITESPACE, false), stopSet);
StopFilter stpf = new StopFilter(TEST_VERSION_CURRENT, new MockTokenizer(new StringReader("test of"), MockTokenizer.WHITESPACE, false), stopSet);
assertTokenStreamContents(stpf, new String[] { "test" },
new int[] {0},
new int[] {4},

@@ -424,7 +424,7 @@ public class TestUAX29URLEmailTokenizer extends BaseTokenStreamTestCase {
}

public void testUnicodeWordBreaks() throws Exception {
WordBreakTestUnicode_6_1_0 wordBreakTest = new WordBreakTestUnicode_6_1_0();
WordBreakTestUnicode_6_3_0 wordBreakTest = new WordBreakTestUnicode_6_3_0();
wordBreakTest.test(a);
}

File diff suppressed because it is too large
@@ -78,13 +78,13 @@ LTLNFsgB@[191.56.104.113]
iT0LOq.jtPW=G06~cETxl2ge@Ah0.4hn72v.tQ.LU
VGLn@z3E2.3an2.MM
TWmfsxn@[112.192.017.029]
2tP07A@2twe6u0d6uw6o.sed7n.109mx.XN--KGBECHTV
2tP07A@2twe6u0d6uw6o.sed7n.109mx.XN--KPRW13D
CjaPC63@['\RDrwk]
Ayydpdoa@tdgypppmen.wf
"gfKP9"@jo3-r0.mz
aTMgDW4@t5gax.XN--0ZWM56D
aTMgDW4@t5gax.XN--3E0B707E
mcDrMO3FQ@nwc21.y5qd45lesryrp.IL
NZqj@v50egeveepk.z290kk.Bc3.xn--jxalpdlp
NZqj@v50egeveepk.z290kk.Bc3.xn--kprw13d
XtAhFnq@[218.214.251.103]
x0S8uos@[109.82.126.233]
ALB4KFavj16pODdd@i206d6s.MM

@@ -78,6 +78,7 @@ import org.junit.Ignore;
* \\p{Script = Hiragana}
* \\p{LineBreak = Complex_Context} (From $line_break_url)
* \\p{WordBreak = ALetter} (From $word_break_url)
* \\p{WordBreak = Hebrew_Letter}
* \\p{WordBreak = Katakana}
* \\p{WordBreak = Numeric} (Excludes full-width Arabic digits)
* [\\uFF10-\\uFF19] (Full-width Arabic digits)

@@ -97,7 +98,7 @@ parse_Unicode_data_file($line_break_url, $codepoints, {'sa' => 1});
parse_Unicode_data_file($scripts_url, $codepoints,
{'han' => 1, 'hiragana' => 1});
parse_Unicode_data_file($word_break_url, $codepoints,
{'aletter' => 1, 'katakana' => 1, 'numeric' => 1});
{'aletter' => 1, 'hebrew_letter' => 1, 'katakana' => 1, 'numeric' => 1});
my @tests = split /\r?\n/, get_URL_content($word_break_test_url);

my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
@@ -109,25 +110,33 @@ print STDERR "Writing '$output_path'...";
print OUT $header;

for my $line (@tests) {
next if ($line =~ /^\s*\#/);
# ÷ 0001 × 0300 ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING GRAVE ACCENT (Extend_FE) ÷ [0.3]
next if ($line =~ /^\s*(?:|\#.*)$/); # Skip blank or comment-only lines
# Example line: ÷ 0001 × 0300 ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING GRAVE ACCENT (Extend_FE) ÷ [0.3]
my ($sequence) = $line =~ /^(.*?)\s*\#/;
$line =~ s/\t/  /g; # Convert tabs to two spaces (no tabs allowed in Lucene source)
print OUT "  // $line\n";
$sequence =~ s/\s*÷\s*$//; # Trim trailing break character
my $test_string = $sequence;
$test_string =~ s/\s*÷\s*/\\u/g;
$test_string =~ s/\s*×\s*/\\u/g;
$test_string =~ s/\\u([0-9A-F]{5,})/join('', map { "\\u$_" } above_BMP_char_to_surrogates($1))/ge;
$test_string =~ s/\\u000A/\\n/g;
$test_string =~ s/\\u000D/\\r/g;
$test_string =~ s/\\u0022/\\\"/g;
$sequence =~ s/^\s*÷\s*//; # Trim leading break character
my @tokens = ();
for my $candidate (split /\s*÷\s*/, $sequence) {
my @chars = ();
my $has_wanted_char = 0;
while ($candidate =~ /([0-9A-F]+)/gi) {
push @chars, $1;
my $hexchar = $1;
if (4 == length($hexchar)) {
push @chars, $hexchar;
} else {
push @chars, above_BMP_char_to_surrogates($hexchar);
}
unless ($has_wanted_char) {
$has_wanted_char = 1 if (defined($codepoints->[hex($1)]));
$has_wanted_char = 1 if (defined($codepoints->[hex($hexchar)]));
}
}
if ($has_wanted_char) {
@@ -144,6 +153,21 @@ close OUT;
print STDERR "done.\n";

# sub above_BMP_char_to_surrogates
#
# Converts hex references to chars above the BMP (i.e., greater than 0xFFFF)
# to the corresponding UTF-16 surrogate pair
#
# Assumption: input string is a sequence of more than four hex digits
#
sub above_BMP_char_to_surrogates {
my $ch = hex(shift);
my $high_surrogate = 0xD800 + (($ch - 0x10000) >> 10);
my $low_surrogate = 0xDC00 + ($ch & 0x3FF);
return map { sprintf("%04X", $_) } ($high_surrogate, $low_surrogate);
}

# sub parse_Unicode_data_file
#
# Downloads the specified Unicode data file, parses it, and

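Note: as a cross-check on above_BMP_char_to_surrogates, take U+1D11E (MUSICAL SYMBOL G CLEF): high = 0xD800 + ((0x1D11E - 0x10000) >> 10) = 0xD834 and low = 0xDC00 + (0x1D11E & 0x3FF) = 0xDD1E, so the sub returns the pair ("D834", "DD1E"). The JDK applies the identical formulas, which can be used to verify the Perl (illustrative sketch, not part of this patch):

    public class SurrogateDemo {
      public static void main(String[] args) {
        // Character.toChars computes the same high/low surrogate pair.
        char[] pair = Character.toChars(0x1D11E);
        System.out.printf("%04X %04X%n", (int) pair[0], (int) pair[1]); // D834 DD1E
      }
    }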
@@ -121,14 +121,14 @@ Bzzzzzzzz! Bzzzzzzzzzzzzzzz! Tell them "0\!P?".shQVdSerA@2qmqj8ul.hm the leg
of LTLNFsgB@[191.56.104.113] all, until it has read it is
iT0LOq.jtPW=G06~cETxl2ge@Ah0.4hn72v.tQ.LU there. <VGLn@z3E2.3an2.MM> Once
TWmfsxn@[112.192.017.029] Spiros under the place
2tP07A@2twe6u0d6uw6o.sed7n.109mx.XN--KGBECHTV as were not a house of the
2tP07A@2twe6u0d6uw6o.sed7n.109mx.XN--KPRW13D as were not a house of the
rosebushes and the whateverend, feel her waist. She changes everything. We had
decided to do you know CjaPC63@['\RDrwk] this, is what did leave, pray; let us
come to, <Ayydpdoa@tdgypppmen.wf> what history as died. Strange, Spiros with
delight: That night "gfKP9"@jo3-r0.mz and gold case
<aTMgDW4@t5gax.XN--0ZWM56D> is spring: the aeon arising, wherein he returned,
<aTMgDW4@t5gax.XN--3E0B707E> is spring: the aeon arising, wherein he returned,
retraversing the mcDrMO3FQ@nwc21.y5qd45lesryrp.IL gates, first
<NZqj@v50egeveepk.z290kk.Bc3.xn--jxalpdlp> to reach session. Initiating first
<NZqj@v50egeveepk.z290kk.Bc3.xn--kprw13d> to reach session. Initiating first
part of the main hall toward his own spurs. Hes an <XtAhFnq@[218.214.251.103]>
Irifix And older ones who wins? ADAM: x0S8uos@[109.82.126.233] The violin and
reality. The hidden set up to come. ROSE WAKINS: No answer. The

@@ -24,7 +24,7 @@ and Joe recited this iron bars with their account, poor elth, and she had been
almost drove me towards evening. At
HTTP://173.202.175.16/Md7tF6lj7r/oioJ9TpL8/x%03PjXgMMBC7C3%BDWzoVMzH the
sergeant and then on the raw
<Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m> afternoon towards
<Https://yu7v33rbt.vC6U3.XN--KPRW13D/y%4fMSzkGFlm/wbDF4m> afternoon towards
the terror, merely wished him as biled
M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb -- a conciliatory air on in
<ftp://evzed8zvv.l2xkky.Dq85qcl1.eu:1184/07eY0/3X1OB7gPUk/J8la5OPUY3/y1oTItIs1HFPPp/5Q02N0cPyDH87hSy/jheYGF8s%F3P/%86PmYhi/ViKHoxsHqM8J>

@@ -47,7 +47,7 @@ to live. You didn't know nothing could attend more.' He had been a coming! Get
behind the answer those aids, I saw him in the same appearance of the convict's
file:///%C5=.%8by/uuFXEaW8.%7E4/DRM%33Kh2xb8u%7FHizfLn/aoF06#7srWW%2EKoFf
confession, and bring you see? '
HTTP://yA2O3F.XN--0ZWM56D/qPDTt/MwMXGQq2S7JT/TJ2iCND said my limbs. Joe in an
HTTP://yA2O3F.XN--3E0B707E/qPDTt/MwMXGQq2S7JT/TJ2iCND said my limbs. Joe in an
accusatory manner as well known that Joe Gargery marry her cup. `I wonder and
there was publicly made it was,
<file:///Gdx5CDZYW%6cnzMJ/7HJ/J%63BSZDXtS/yfWXqq6#> as lookers on; me, I

@@ -63,7 +63,7 @@ again
FTP://Hi144dz6hctql2n3uom.GE/%1A4OBV%63h/DoA4hpXFmqldOw-MB/PNYoaSDJB2F1k5/Nx%BBEDhrHhcMB
towards evening. At last, and kneaded, and a dead man taking any. There was
publicly made out there?' said I,
ftp://w0yaysrl.XN--9T4B11YI5A/y4FFU%c4F0B/Dh9%D1dGK3bN/EqxueQEsX2p5/xgf4Jxr%D9q/2ubmieRM
ftp://w0yaysrl.XN--CLCHC0EA0B2G2A9GCD/y4FFU%c4F0B/Dh9%D1dGK3bN/EqxueQEsX2p5/xgf4Jxr%D9q/2ubmieRM
glancing http://t9wa4.rjcahbc06qmyk9jkhu3f.ZA/vIwW3sc3Pg/Bwmeo6KAjkRY at the
N54l6e.vu/1m2%8bMFjv/oBdy%36.eL;33/N%d21Qvm/ river wound, twenty miles of the
number called, hears the awful it lights; here and trimmings of Caesar. This

@@ -155,7 +155,7 @@ ftp://E1cdf-p.XN--MGBERP4A5D4AR:60510/qMaw4kSSgYM/7jgIuL/gSVW6O91/2bhnsj/kl7R5sg
at me, and that her walking z3ymb.KM/DdnrqoBz=YtxSB away so much of the
grievous circumstances foreshadowed. After receiving the way, that I thought,
if she should go to?' `Good again!' cried the
FTP://7kgip3z.XN--HGBK6AJ7F53BBA:15983/OYEQzIA0 society of a savoury pork pie,
FTP://7kgip3z.XN--KPRY57D:15983/OYEQzIA0 society of a savoury pork pie,
and nezt6awdc.lSZDSU14B1OH.4n6nkmjyyj.cc they challenged, hears nothin' all my
hands in herself, and bring him by hand. `This,' ftp://085.062.055.011/bopfVV/
said he wore ftp://Mbbn8n.6ge03fiivyc7of.PS/mvb/X8VNt/5WrMZpw/flC6Rs a dog of

@@ -191,7 +191,7 @@ and tingling, and that I had won of the shoulder. `Excuse me, and we departed
from Richard the furthest end of
http://ch43n.51rkj.rze.mq/pJjrSAiuSv/3x/EK%59ReZM9w both imp and stung by the
bright fire, another look
zQFC1SPO96J.Jy20d8.xn--0zwm56d:863/0OWpT4dpkMURAGe/nFg/LQBUr%3E/af7dO1 over her
zQFC1SPO96J.Jy20d8.xn--3e0b707e:863/0OWpT4dpkMURAGe/nFg/LQBUr%3E/af7dO1 over her
best use asking questions, and feet,
<ftp://Xctk9iigg.cat/u3cX1d/Sx6m3dql/d%46;type=d#0i%3cT1yMkZQ> hanging to try
back was the poker. `It was not warmly. `Seems

@@ -204,7 +204,7 @@ kitchen wall,
Ftp://2gifamku.jqv10es.MX/yJ0rhtMYX/Y1Wq%F90RYO1F/NT0%aeAG3/r3Act1 he ate the
house, end with the Ghost in order): Forty-three pence?' To five hundred
Gargerys.' `I say, Pip; stay
7WO6F.XN--11B5BS3A9AJ6G/1L%f9G0NEu/L2lD/mQGNS9UhgCEb out with
7WO6F.XN--45BRJ9C/1L%f9G0NEu/L2lD/mQGNS9UhgCEb out with
ftp://mIMU.t4d24n4lyx39.zURN708MCNGK-TJ42GLLBQRJHVENGPO.bw:59930/KmBYQKHfcjNRe/rK3fUjg%0Ad/.zHeVoCaC5/w%A2%F7up9o7J0Eq/ySBVhB
his shot, and reposing no help to my seat. It was in the kitchen wall, because
I calculated the sounds by giving me by the name for a rush of Joe's forge

@@ -299,7 +299,7 @@ She drew the kitchen, carrying file:///Y?GG/BBqMPBJ/nsxX3qP/8P24WdqBxH so low
wooden hut
ftp://7vl2w.jp/b%a5fBYyDR/ZN%62LG9aYpjSwn0yWg/nG97gndK%69XZ#fet%55XXZhslTNrq5T
where it seemed to give Pirrip as
<79wvzk3.24dyfkxg0f4z-hsqgqqzj2p9n59el0a.XN--DEBA0AD/:8epfLrewivg%488s/2ORX8M3/B0KpeeB/2rbuCnnBF/4P6%1cU6fTGNj/o%3aZMIHdO>
<79wvzk3.24dyfkxg0f4z-hsqgqqzj2p9n59el0a.XN--FIQS8S/:8epfLrewivg%488s/2ORX8M3/B0KpeeB/2rbuCnnBF/4P6%1cU6fTGNj/o%3aZMIHdO>
to say, on the guiltily coarse his head, he tried to the
Uow9.sF.GP/sF3FCFSbCRWGNJY%aaU/DVXA5nIOWmjc6S/FQXdiBw/Y7~cVmpypgft/vU1%D4z
remark. `There's one sprinkled all I was possible she beggared me. All these

@@ -311,7 +311,7 @@ Http://Ed095eimjy.rlb5698d.kp/_l5uoOO/aA494s?3nSxdIpE=y%79qu+2un1hGR&J%76=8&L%be
he shook her veil so thick nor my milk and would impart all had returned, with
soap-suds, I had FILE:///#F9Bgl just like thin snow. `Enough of his right side
of thenceforth sitting
jyia054.l814D9SNHRRA5RJCCW.kvxga.XN--0ZWM56D/sBbx24%f2Tw2/Sd0Lul0Vg1bbIqW~/lveEw
jyia054.l814D9SNHRRA5RJCCW.kvxga.XN--3E0B707E/sBbx24%f2Tw2/Sd0Lul0Vg1bbIqW~/lveEw
in File:///KKfIe63z/BETB.T%C6sG/RcYgnOycg my soul. I sat down on it, I have
been a spoon that the pie, blacksmith?' asked Estella of it made a mouth wide
open, and so

@@ -324,7 +324,7 @@ FTP://7qf.hlj.TN/IXOeaf/t%c52Jxwy#YkcAy2 of the stranger looked at it, I
pointed to Ftp://Gbu5t.HT/xad4fgjaN#GLpU3XQd6%7F(cHIz himself. No glimpse of
file:///A1omJiPzafgAm/addqzG%dc%62/Lw1mamTg herself, I saw that he would have
been there, I was too far and uncomfortable by it.
http://89qw34ksf0qf6iq264of-1nya4ds7qvpixw8c951aw8wcm3.qxk7usa.N8j1frzfgnkbi9y2.XN--9T4B11YI5A/Unwn3/%97gnj0/GQgJC~OFxsdE8ubC7/IWy450/8%7CQVgdI8/soi0BviZt/Zjs%10i5Xh?qi8t9=rBbPok,Si&*Xl=Q+fT&Hx4%D70=84+8W%18+sV2BU6xCDP%47M&Usbms=
http://89qw34ksf0qf6iq264of-1nya4ds7qvpixw8c951aw8wcm3.qxk7usa.N8j1frzfgnkbi9y2.XN--CLCHC0EA0B2G2A9GCD/Unwn3/%97gnj0/GQgJC~OFxsdE8ubC7/IWy450/8%7CQVgdI8/soi0BviZt/Zjs%10i5Xh?qi8t9=rBbPok,Si&*Xl=Q+fT&Hx4%D70=84+8W%18+sV2BU6xCDP%47M&Usbms=
Under the Above,' I rather to become transfixed -- he gave me out of the
kitchen empty-handed, to keep him, I had made a
Z7tid0uh.eZMOI-M1.umlsyksuzovqdw6wozbd.BW/m%e684OhC/ErAhpGiG subject, if he had

@@ -468,7 +468,7 @@ hard twist upon his -- `Well, boy,' Uncle Pumblechook: a look at the sermon he
had heard it had hesitated as little window, violently plunging and she had
committed, and had all about the present calling, which the fingers of tea on
Saturdays than this country, gentlemen, but I could see those,
https://nWC9-RIA00RPVL4SSWRICWWX3NH5SMQIA7IPMCK174T30VQBL-M6.XN--0ZWM56D/CwE%e2rWaYZmE?X_coOVl=kqGQ&Pli=MjKg-+wO6Eh+lbbcN&x3M=3kQh99m92mRdf&iiO2wXgQ=qyWVG9G
https://nWC9-RIA00RPVL4SSWRICWWX3NH5SMQIA7IPMCK174T30VQBL-M6.XN--3E0B707E/CwE%e2rWaYZmE?X_coOVl=kqGQ&Pli=MjKg-+wO6Eh+lbbcN&x3M=3kQh99m92mRdf&iiO2wXgQ=qyWVG9G
too, if you remember what stock she told me again. `But I know what
file:///enqvF%EFLOBsZhl8h2z wittles is?' `Yes, ma'am.' `Estella, take me again
and ftp://133.4.130.192/p%b1LgcONfo%bc&kmH/Ibh6Lq%DCJhnswT%1A refractory

@@ -493,7 +493,7 @@ right-side
ftp://zxmv98m49669kfvf24o12w3u93wbovfp-1smo6y90e27n133okplcjqrmv-a.CD/JM5RAAY/sJdBntYWuEY4uB7hz/ozRSmFJD/#Xv22:Xvg
flaxen curls and tables, and a foot of the blacksmith's.' `Halloa!' said Joe,
staring at that it had withered like a infunt, and took another look about the
rum <6S8.Crwllo5e3.jmtz.XN--G6W251D/6InlQn/hnhu2f%ac8tX/apq%0D6o/> out at once.
rum <6S8.Crwllo5e3.jmtz.XN--GECRJ9C/6InlQn/hnhu2f%ac8tX/apq%0D6o/> out at once.
Three Jolly Bargemen to think she seemed to tell you were. When we saw the file
coming at my slice. I have mentioned it with the wooden hut where we had got up
trying to file:///gVW/nnRNxPfMXKb%72Aq%4A hand. If ever grateful for. If a

@@ -662,7 +662,7 @@ open,' he
https://227.086.128.010:64985/MDKuFInA86qto5/_cK=4S%49Ic/SPp76/TlV%0Arlwfx/
wiped the liquor. He was the bad; and some one
Ftp://171.160.94.43/ALTgS46I4VM/55PbbK/5N%faTSE another
Ftp://3zd7z.etw.XN--JXALPDLP/4UztCuTbW2z/LL%2cDI/dTYSi9 turned to put straws
Ftp://3zd7z.etw.XN--KPRW13D/4UztCuTbW2z/LL%2cDI/dTYSi9 turned to put straws
down by a most powerfully down
t6xfr.wxjz5p2t5.zl8m4.MN/2cbpjk/gsdm/5Mvc-j3rc/16Wb65&c7x to me, and all that
know the window,

@@ -993,7 +993,7 @@ upon a door, which was gobbling mincemeat, meatbone, bread, some lace for it
that Joe's blue file:///EYS2nDf%9671qsm34OZeB%e5lUA/rYBDn0DKs0/ eyes, had an
hour longer than at me, and dismal, and gloves, and that's further than I
mpuwl0.BA/MkvAvc?j%11K4=9gE%613&qOOEP0t=g7EXs looked on. `Now, boy!
g6tylc0.daeczh.4q.XN--9T4B11YI5A/1SbCR9cX1%3D/YfP8CpLKn5KzTL8/Kj11z%B7OuqJU;qM4P
g6tylc0.daeczh.4q.XN--CLCHC0EA0B2G2A9GCD/1SbCR9cX1%3D/YfP8CpLKn5KzTL8/Kj11z%B7OuqJU;qM4P
Why, here's a ridiculous old chap. And looked up by hand. `Why don't like
`sulks.' Therefore, I was in such game?' Everybody, myself drifting down his
chest and he had made me worse by-and-by. I was a

@@ -1035,7 +1035,7 @@ in every word out again. `You are prison-ships, and they fought
<HTTPS://bF2RA.kw/1TA9pTTBg/nM/VSRo%85Kt?%62mxNfo=HDowgwkM3&9oPOLH2=yKOxIe+YNtt>
for us heavy. `I Bolted, myself, 5.Piba4ac.JE/55M1H/AZXdj and thread, and we
after him, or to inspire confidence. This was brought you spoke all the act, he
couldn't m-k6-ej7x.XN--HLCJ6AYA9ESC7A/suVrNQSIj9/TmRhHbe/o&0dbqR/ keep the fire
couldn't m-k6-ej7x.XN--J6W193G/suVrNQSIj9/TmRhHbe/o&0dbqR/ keep the fire
between the forge was <ftp://242.228.138.8/o%CC_QjILS%17aYH/%caw8CcVZyPRZ/>
busy in it. Until
hGE9YH3D6.SD/m%1EpDJrzO/Tf2Xxqq8L/YJT7BTEY%661PvcMgOr/29ZbuJuWl6q/ she jammed

@@ -1329,7 +1329,7 @@ sort Http://w9ys35.wb55p6l.hxl.rs/Y97%58Lp8JjLZw/5L --
FILE://155.24.106.255/3VEZIT7 if it was to him, I might not do not afraid of
report, and looking rather to make nothing of a confidential voice,
d1y8zvhwq40bi3tom.hPCZ.gJ-286X.TG/ayWKrgAvF6tn/L4SgquZT6C/1DmNe/CI69rJ/%f6QrzZGkSQ
as lda5l5wc.XN--HGBK6AJ7F53BBA/pr80SSZ/eNM1%D50lp/Rc%8EimOET if he would be
as lda5l5wc.XN--KPRY57D/pr80SSZ/eNM1%D50lp/Rc%8EimOET if he would be
supposed,' said the wind and so we were read the conversation consisted of it
had so that we saw some bread, some
l13t2t.sk/O%2BmRkw/@0AgGL@NX/wgt&aggDcp#0IYe'C brandy out: no black velvet

@@ -10,7 +10,7 @@ http://Rcbu6/Oxc%C0IkGSZ8rO9IUpd/BEvkvw3nWNXZ/P%17tp3gjATN/0ZRzs
file:///2CdsP/U2GCLT
Http://Pzw978uzb.ai/yB;mt/o8hVKG/%231Y/Xb1%bb6v1fhjfdkfkBvxed?8mq~=OvF&STpJJk=ws0ZO&0DRA=
HTTP://173.202.175.16/Md7tF6lj7r/oioJ9TpL8/x%03PjXgMMBC7C3%BDWzoVMzH
Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m
Https://yu7v33rbt.vC6U3.XN--KPRW13D/y%4fMSzkGFlm/wbDF4m
M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb
ftp://evzed8zvv.l2xkky.Dq85qcl1.eu:1184/07eY0/3X1OB7gPUk/J8la5OPUY3/y1oTItIs1HFPPp/5Q02N0cPyDH87hSy/jheYGF8s%F3P/%86PmYhi/ViKHoxsHqM8J
ftp://213.7.210.47/%e5pFkj6e6Jczc/ypJGG/z%663jYR/37IxLQBPr/Ciq50EUIdueyj

@@ -23,13 +23,13 @@ Ftp://Xmswrxn8d-1s.pe.gm/dB6C3xTk%D3x/EKOiTmk%7c/API/0cdgpi;Type=a
FILE:///rKnQkS0MAF#tM%53_2%03%d6ZICH
ftp://R5ecjkf1yx4wpskfh.tv0y3m90ak.0R605.se:51297/zpWcRRcG/1woSqw7ZUko/
file:///%C5=.%8by/uuFXEaW8.%7E4/DRM%33Kh2xb8u%7FHizfLn/aoF06#7srWW%2EKoFf
HTTP://yA2O3F.XN--0ZWM56D/qPDTt/MwMXGQq2S7JT/TJ2iCND
HTTP://yA2O3F.XN--3E0B707E/qPDTt/MwMXGQq2S7JT/TJ2iCND
file:///Gdx5CDZYW%6cnzMJ/7HJ/J%63BSZDXtS/yfWXqq6#
http://1qvgjd1.TP/7oq5gWW/Gwqf8fxBXR4/?Br,q=ayMz0&1IO%370N7=;Sl1czc2L+5bRISfD+w&ygP3FhV%E1w36=2Rx
ftp://5SCC6BUYP.Knf1cvlc22z9.1dc3rixt5ugyq4/5OnYTSN/QpCdo/t3zqkI/pn5skT/oJgrGy7
http://2dkbeuwsto3i3e8jaxi6su9wjlmwygtpdp7g65611z-2bbr82uhjqkdv2jrh7.KZ/FiSvI/aaB&dPQ%42kLdM
FTP://Hi144dz6hctql2n3uom.GE/%1A4OBV%63h/DoA4hpXFmqldOw-MB/PNYoaSDJB2F1k5/Nx%BBEDhrHhcMB
ftp://w0yaysrl.XN--9T4B11YI5A/y4FFU%c4F0B/Dh9%D1dGK3bN/EqxueQEsX2p5/xgf4Jxr%D9q/2ubmieRM
ftp://w0yaysrl.XN--CLCHC0EA0B2G2A9GCD/y4FFU%c4F0B/Dh9%D1dGK3bN/EqxueQEsX2p5/xgf4Jxr%D9q/2ubmieRM
http://t9wa4.rjcahbc06qmyk9jkhu3f.ZA/vIwW3sc3Pg/Bwmeo6KAjkRY
N54l6e.vu/1m2%8bMFjv/oBdy%36.eL;33/N%d21Qvm/
http://ah-2d4.ASIA/qmp

@@ -75,7 +75,7 @@ http://4u3o/BKdhwRyzG
file:///LdsHfPABFz1vRD1OB6Yl/RS6&1Gmz/mfYul/
ftp://E1cdf-p.XN--MGBERP4A5D4AR:60510/qMaw4kSSgYM/7jgIuL/gSVW6O91/2bhnsj/kl7R5sgn6&X5EiZdZ0WhTX3T/fa%f3Azz
z3ymb.KM/DdnrqoBz=YtxSB
FTP://7kgip3z.XN--HGBK6AJ7F53BBA:15983/OYEQzIA0
FTP://7kgip3z.XN--KPRY57D:15983/OYEQzIA0
nezt6awdc.lSZDSU14B1OH.4n6nkmjyyj.cc
ftp://085.062.055.011/bopfVV/
ftp://Mbbn8n.6ge03fiivyc7of.PS/mvb/X8VNt/5WrMZpw/flC6Rs

@@ -93,12 +93,12 @@ https://[3790:ad57:0B63::e5f7:f6ac:164C]/Obax;zcD/Y%48%9a/Z2xcdar
bl60k0jqkc9.oow84o1.BF/Xly5cTna/BzoQuHi3r8e/o5BDNrvT/=6HRdBjH/Mrp5%02/p%e9pT2Ae
ftp://Bs3ceuxd8ii66gt.X8wwdpt.BB:27095/3BfkvfzcmTS/FTffh&S/gIWvJ5Kd/AlOQ%3EnO
http://ch43n.51rkj.rze.mq/pJjrSAiuSv/3x/EK%59ReZM9w
zQFC1SPO96J.Jy20d8.xn--0zwm56d:863/0OWpT4dpkMURAGe/nFg/LQBUr%3E/af7dO1
zQFC1SPO96J.Jy20d8.xn--3e0b707e:863/0OWpT4dpkMURAGe/nFg/LQBUr%3E/af7dO1
ftp://Xctk9iigg.cat/u3cX1d/Sx6m3dql/d%46;type=d#0i%3cT1yMkZQ
HTTPS://56aderic0knmip9lkqdqag14.uk:45885/lELiK:/vF%4C5Enwqy/P5NGJ2b/dD6sg1yMV
ftp://vlt.3g45k63viz2.tcnm3.UA:60664/AJ9iqYk%c1/uKbohn2/K%D1kequ4z8rxFpJ
Ftp://2gifamku.jqv10es.MX/yJ0rhtMYX/Y1Wq%F90RYO1F/NT0%aeAG3/r3Act1
7WO6F.XN--11B5BS3A9AJ6G/1L%f9G0NEu/L2lD/mQGNS9UhgCEb
7WO6F.XN--45BRJ9C/1L%f9G0NEu/L2lD/mQGNS9UhgCEb
ftp://mIMU.t4d24n4lyx39.zURN708MCNGK-TJ42GLLBQRJHVENGPO.bw:59930/KmBYQKHfcjNRe/rK3fUjg%0Ad/.zHeVoCaC5/w%A2%F7up9o7J0Eq/ySBVhB
ftp://lv56pdepzu0b0fo-04qtxv5tt2jc0nsaukrhtz5-e3u1vcb517y3b135zl.e0r1hson.dk/3TVoqjp6%1FCFSkt/006VZfho/gxrWxgDawM3Uk
Ftp://7n977.Niyt.2fgkzfhj.q7-DJ.Ow7a.it/5zfRi3PO8/1zfKT9%421tP/?SazEijJq%710COQKWeLE/TdUc%b2u/2AxBw9%4BUN6Zp4Z/KfUZd1MTdPv/L4m1tI3/WJvcK1

@@ -147,20 +147,20 @@ ftp://Lq.es/%B1ZPdTZgB2mNFW/qre92rM
file:///IZ47ESCtX%aatQab1/V553gjR?Me/#9%68qPw
file:///Y?GG/BBqMPBJ/nsxX3qP/8P24WdqBxH
ftp://7vl2w.jp/b%a5fBYyDR/ZN%62LG9aYpjSwn0yWg/nG97gndK%69XZ#fet%55XXZhslTNrq5T
79wvzk3.24dyfkxg0f4z-hsqgqqzj2p9n59el0a.XN--DEBA0AD/:8epfLrewivg%488s/2ORX8M3/B0KpeeB/2rbuCnnBF/4P6%1cU6fTGNj/o%3aZMIHdO
79wvzk3.24dyfkxg0f4z-hsqgqqzj2p9n59el0a.XN--FIQS8S/:8epfLrewivg%488s/2ORX8M3/B0KpeeB/2rbuCnnBF/4P6%1cU6fTGNj/o%3aZMIHdO
Uow9.sF.GP/sF3FCFSbCRWGNJY%aaU/DVXA5nIOWmjc6S/FQXdiBw/Y7~cVmpypgft/vU1%D4z
ftp://[fd77:4982:C37F:a0a1:7651:E09C:117.093.145.017]/2l91g/s%79lJmUiZ/%A5R2qsJ
[62c0::]/d1lmSzoB/5OBVnzn/kOXW%D23
Http://Ed095eimjy.rlb5698d.kp/_l5uoOO/aA494s?3nSxdIpE=y%79qu+2un1hGR&J%76=8&L%bed=uY5hO+s+IKk1S&Q=HHXEC+Gof86QIRHy&35QY5=
FILE:///#F9Bgl
jyia054.l814D9SNHRRA5RJCCW.kvxga.XN--0ZWM56D/sBbx24%f2Tw2/Sd0Lul0Vg1bbIqW~/lveEw
jyia054.l814D9SNHRRA5RJCCW.kvxga.XN--3E0B707E/sBbx24%f2Tw2/Sd0Lul0Vg1bbIqW~/lveEw
File:///KKfIe63z/BETB.T%C6sG/RcYgnOycg
ftp://892f7.oel50j.32.9qj1p-g7lgw.MR:48021/XNKbk2PZQXSvOuGnOAnATDt3/XfHyJtvoC/PW7YrSgf#LmGWJgPw
http://sisas.ua/4CU60ZLK4VgY8AR89
FTP://7qf.hlj.TN/IXOeaf/t%c52Jxwy#YkcAy2
Ftp://Gbu5t.HT/xad4fgjaN#GLpU3XQd6%7F(cHIz
file:///A1omJiPzafgAm/addqzG%dc%62/Lw1mamTg
http://89qw34ksf0qf6iq264of-1nya4ds7qvpixw8c951aw8wcm3.qxk7usa.N8j1frzfgnkbi9y2.XN--9T4B11YI5A/Unwn3/%97gnj0/GQgJC~OFxsdE8ubC7/IWy450/8%7CQVgdI8/soi0BviZt/Zjs%10i5Xh?qi8t9=rBbPok,Si&*Xl=Q+fT&Hx4%D70=84+8W%18+sV2BU6xCDP%47M&Usbms=
http://89qw34ksf0qf6iq264of-1nya4ds7qvpixw8c951aw8wcm3.qxk7usa.N8j1frzfgnkbi9y2.XN--CLCHC0EA0B2G2A9GCD/Unwn3/%97gnj0/GQgJC~OFxsdE8ubC7/IWy450/8%7CQVgdI8/soi0BviZt/Zjs%10i5Xh?qi8t9=rBbPok,Si&*Xl=Q+fT&Hx4%D70=84+8W%18+sV2BU6xCDP%47M&Usbms=
Z7tid0uh.eZMOI-M1.umlsyksuzovqdw6wozbd.BW/m%e684OhC/ErAhpGiG
ftp://tw7d-6yu.im:2055/%66qbqzss/OmPGW;type=d
FTP://zst.tn/QcUpaA/VKvJ2/JN6AKew/iXYIiHm7mfPFmD%21E5/yTQpoiqdbaaS1/LnzOX#VqsobH

@@ -228,7 +228,7 @@ file:///UIIGOxv6jvF2%c0/%A8J3%677Gmq8im1zklKhqx/HMhCSY2QcyxvL/
http://Qhk9z.zm/cOGBen/mBsDycEI5V7L1s%84WUj7863/p%5f~okuRD51b0M?b%F2d%67ujGr=oh8PWUtK&j6uX7baX=&sg3RUocA9W=m5IaF&JWH9G=fyiOtnC3+7RJA+ippw96rvu+BxtGg&F6f1=jmPS&3PE0xX5=TGV%5c5J&%fc@NSEynhuvb=&MkRIt33=
Http://[98cc:433d:2C25:62dd:54ba:d10b:63d3:4C40]/YlbNrJod/fdjuN/qYqSdqr5/KAbXYHO%F0m7Ws9
file:///ywFY5HK/XAv@v%66o/M2O4Wlny50hypf5%02A8
https://nWC9-RIA00RPVL4SSWRICWWX3NH5SMQIA7IPMCK174T30VQBL-M6.XN--0ZWM56D/CwE%e2rWaYZmE?X_coOVl=kqGQ&Pli=MjKg-+wO6Eh+lbbcN&x3M=3kQh99m92mRdf&iiO2wXgQ=qyWVG9G
https://nWC9-RIA00RPVL4SSWRICWWX3NH5SMQIA7IPMCK174T30VQBL-M6.XN--3E0B707E/CwE%e2rWaYZmE?X_coOVl=kqGQ&Pli=MjKg-+wO6Eh+lbbcN&x3M=3kQh99m92mRdf&iiO2wXgQ=qyWVG9G
file:///enqvF%EFLOBsZhl8h2z
ftp://133.4.130.192/p%b1LgcONfo%bc&kmH/Ibh6Lq%DCJhnswT%1A
ftp://1xf.ipl4f0y6c4.VA/LHuq~/p2nPbE/0YGGNJB%DEje2psef_B/aKOuMl1Q9

@@ -240,7 +240,7 @@ http://nEN5ZN.EG/%0efsf4v30L
file:///19%9947/ksd3Sq7W78%27/2K_Ylzcu2q
r8sht9qzsc1e2wp.ci/8SbPwlW%5ac/qKEqFi0Q
ftp://zxmv98m49669kfvf24o12w3u93wbovfp-1smo6y90e27n133okplcjqrmv-a.CD/JM5RAAY/sJdBntYWuEY4uB7hz/ozRSmFJD/#Xv22:Xvg
6S8.Crwllo5e3.jmtz.XN--G6W251D/6InlQn/hnhu2f%ac8tX/apq%0D6o/
6S8.Crwllo5e3.jmtz.XN--GECRJ9C/6InlQn/hnhu2f%ac8tX/apq%0D6o/
file:///gVW/nnRNxPfMXKb%72Aq%4A
file:///Fzza388TQ
file:///

@@ -314,7 +314,7 @@ file:///3%aexrb7UdZ5GpR4ZIfoxwL/vQV%4a2zQxki/QRji6gHpMGgBaM/d%71A2CTpZv-kF0tD/Ig
f5ms.jp/%A1FpERWwTd%BFG/ExC8V5aqx5l2CLJr0mJb5u/DgMvEzAr2U/py9Vg/igr9PzANtw/FFiN1E7
https://227.086.128.010:64985/MDKuFInA86qto5/_cK=4S%49Ic/SPp76/TlV%0Arlwfx/
Ftp://171.160.94.43/ALTgS46I4VM/55PbbK/5N%faTSE
Ftp://3zd7z.etw.XN--JXALPDLP/4UztCuTbW2z/LL%2cDI/dTYSi9
Ftp://3zd7z.etw.XN--KPRW13D/4UztCuTbW2z/LL%2cDI/dTYSi9
t6xfr.wxjz5p2t5.zl8m4.MN/2cbpjk/gsdm/5Mvc-j3rc/16Wb65&c7x
ftp://D02-auxxaeqnv9ve-jlmo3.l10vqu.12jl.2mvjwrsqm.BA/r71QLLNu6oGJjG/HbxrX1Grq8/QR%2agZv4hR
file:///XoCg%EDVf/A3ibJYjU

@@ -476,7 +476,7 @@ ftp://53.151.134.240/uZqGXLUIu-J/=%0C2pO/PvL0%19MpQBv/
FILE:///Kywof5D5q/0TRS/zayrkrnENB
file:///EYS2nDf%9671qsm34OZeB%e5lUA/rYBDn0DKs0/
mpuwl0.BA/MkvAvc?j%11K4=9gE%613&qOOEP0t=g7EXs
g6tylc0.daeczh.4q.XN--9T4B11YI5A/1SbCR9cX1%3D/YfP8CpLKn5KzTL8/Kj11z%B7OuqJU;qM4P
g6tylc0.daeczh.4q.XN--CLCHC0EA0B2G2A9GCD/1SbCR9cX1%3D/YfP8CpLKn5KzTL8/Kj11z%B7OuqJU;qM4P
file:///TJa%86AczeCmM5QMhi/Wox~Ajl/WxUF%5eSA:y%0fD%E21/x%cca%d3Qgx/8iWJ5-h%26/fCK%01nQNrK8#ygTTB
file:///~%303cUUVYTEaQU5%5DXbogiPKb/favR2rETEh/9TXM%15u/nYCOZpZgL
file:///mJM%a1/jv5%53QDqE/bFMu0CBp

@@ -496,7 +496,7 @@ http://gpu16lz.LS/9e%daJrwQfHEpFvsZ3jx/c4STIJ/CmvEGAUx9f/
file://ij9anjtok86ro.uN-BGDQ855IB.sDXAQR.5kr8kz.3J3M8XRM.18r3s0g-6.4rjsmwue0lwao0og17d-5-1.F1h3qgkul29yw2t4p4se5clomncxhmoy.g6c9tbz7.pa/5LMtmbl/1tfIF/pBOV7Hc
HTTPS://bF2RA.kw/1TA9pTTBg/nM/VSRo%85Kt?%62mxNfo=HDowgwkM3&9oPOLH2=yKOxIe+YNtt
5.Piba4ac.JE/55M1H/AZXdj
m-k6-ej7x.XN--HLCJ6AYA9ESC7A/suVrNQSIj9/TmRhHbe/o&0dbqR/
m-k6-ej7x.XN--J6W193G/suVrNQSIj9/TmRhHbe/o&0dbqR/
ftp://242.228.138.8/o%CC_QjILS%17aYH/%caw8CcVZyPRZ/
hGE9YH3D6.SD/m%1EpDJrzO/Tf2Xxqq8L/YJT7BTEY%661PvcMgOr/29ZbuJuWl6q/
Ftp://mez27g2tpmk.MC/%B8AHk%95etDns%46/gXbsCn%6C-/s8_Jmy/DhmfT~Di6KD

@@ -633,7 +633,7 @@ http://047.014.184.200/Z_QdOwjzfBue4Nt/aEn/xuEQD/cXlnoxHIK%7d8h/1%eegEk7E0/8Ejku
Http://w9ys35.wb55p6l.hxl.rs/Y97%58Lp8JjLZw/5L
FILE://155.24.106.255/3VEZIT7
d1y8zvhwq40bi3tom.hPCZ.gJ-286X.TG/ayWKrgAvF6tn/L4SgquZT6C/1DmNe/CI69rJ/%f6QrzZGkSQ
lda5l5wc.XN--HGBK6AJ7F53BBA/pr80SSZ/eNM1%D50lp/Rc%8EimOET
lda5l5wc.XN--KPRY57D/pr80SSZ/eNM1%D50lp/Rc%8EimOET
l13t2t.sk/O%2BmRkw/@0AgGL@NX/wgt&aggDcp#0IYe'C
FILE://a6ys9a4.xj.BY/%99BGXp/F=yJtxc71/gvXuHuB9k
212.072.006.032/6kV8ce%2e/%e7lzm-HB%4artP/zg6tWMW7RIG?U7=HAXw$D3sM%7DyDJ&Gt=

@@ -75,7 +75,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
        + " samba Halta gamba "
        + "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R\n"
        + "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb\n"
        + "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m"
        + "Https://yu7v33rbt.vC6U3.XN--KPRW13D/y%4fMSzkGFlm/wbDF4m"
        + " inter Locutio "
        + "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/\n"
        + "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7"
@@ -91,7 +91,7 @@ public class TestUAX29URLEmailTokenizerFactory extends BaseTokenStreamFactoryTes
        "samba", "Halta", "gamba",
        "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R",
        "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb",
        "Https://yu7v33rbt.vC6U3.XN--JXALPDLP/y%4fMSzkGFlm/wbDF4m",
        "Https://yu7v33rbt.vC6U3.XN--KPRW13D/y%4fMSzkGFlm/wbDF4m",
        "inter", "Locutio",
        "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/",
        "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7",

@@ -60,11 +60,12 @@ public class GenerateJflexTLDMacros {

  private static final String APACHE_LICENSE
    = "/*" + NL
    + " * Copyright 2001-2005 The Apache Software Foundation." + NL
    + " *" + NL
    + " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
    + " * you may not use this file except in compliance with the License." + NL
    + " * You may obtain a copy of the License at" + NL
    + " * Licensed to the Apache Software Foundation (ASF) under one or more" + NL
    + " * contributor license agreements. See the NOTICE file distributed with" + NL
    + " * this work for additional information regarding copyright ownership." + NL
    + " * The ASF licenses this file to You under the Apache License, Version 2.0" + NL
    + " * (the \"License\"); you may not use this file except in compliance with" + NL
    + " * the License. You may obtain a copy of the License at" + NL
    + " *" + NL
    + " * http://www.apache.org/licenses/LICENSE-2.0" + NL
    + " *" + NL
@@ -73,7 +74,7 @@ public class GenerateJflexTLDMacros {
    + " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
    + " * See the License for the specific language governing permissions and" + NL
    + " * limitations under the License." + NL
    + " */" + NL + NL;
    + " */" + NL;

  private static final Pattern TLD_PATTERN_1
    = Pattern.compile("([-A-Za-z0-9]+)\\.\\s+NS\\s+.*");

@@ -14,11 +14,29 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Default RBBI rules, based on UAX#29.
# This file is from ICU (with some small modifications, to avoid CJK dictionary break)
#
# Copyright (C) 2002-2013, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word.txt
#
# ICU Word Break Rules
# See Unicode Standard Annex #29.
# These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
#
# Note: Updates to word.txt will usually need to be merged into
# word_POSIX.txt also.

##############################################################################
#
# Character class definitions from TR 29
#
##############################################################################

!!chain;


#
# Character Class Definitions.
#
@@ -27,39 +45,56 @@ $CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
$Newline = [\p{Word_Break = Newline}];
$Extend = [\p{Word_Break = Extend}];
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
$Format = [\p{Word_Break = Format}];
$Katakana = [\p{Word_Break = Katakana}];
$Hebrew_Letter = [\p{Word_Break = Hebrew_Letter}];
$ALetter = [\p{Word_Break = ALetter}];
$Single_Quote = [\p{Word_Break = Single_Quote}];
$Double_Quote = [\p{Word_Break = Double_Quote}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidLetter = [\p{Word_Break = MidLetter}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}[[:Decomposition_Type=Wide:]&[:General_Category=Decimal_Number:]]];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];

$Han = [:Han:];
$Hiragana = [:Hiragana:];


# Dictionary character set, for triggering language-based break engines. Currently
# limited to LineBreak=Complex_Context. Note that this set only works in Unicode
# 5.0 or later as the definition of Complex_Context was corrected to include all
# characters requiring dictionary break.

$dictionary = [:LineBreak = Complex_Context:];
$Control = [\p{Grapheme_Cluster_Break = Control}];
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]]; # Note: default ALetter does not
# include the dictionary characters.
$HangulSyllable = [\uac00-\ud7a3];
$ComplexContext = [:LineBreak = Complex_Context:];
$KanaKanji = [$Han $Hiragana $Katakana];
$dictionaryCJK = [$Han $Hiragana $HangulSyllable];
$dictionary = [$ComplexContext];

# leave CJK scripts out of ALetterPlus
$ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];


#
# Rules 4 Ignore Format and Extend characters,
# except when they appear at the beginning of a region of text.
#
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
$KatakanaEx = $Katakana ($Extend | $Format)*;
$Hebrew_LetterEx = $Hebrew_Letter ($Extend | $Format)*;
$ALetterEx = $ALetterPlus ($Extend | $Format)*;
$Single_QuoteEx = $Single_Quote ($Extend | $Format)*;
$Double_QuoteEx = $Double_Quote ($Extend | $Format)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidLetterEx = $MidLetter ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*;

$Hiragana = [\p{script=Hiragana}];
$Ideographic = [\p{Ideographic}];
$HiraganaEx = $Hiragana ($Extend | $Format)*;
$IdeographicEx = $Ideographic ($Extend | $Format)*;
@@ -77,23 +112,31 @@ $CR $LF;
# of a region of Text. The rule here comes into play when the start of text
# begins with a group of Format chars, or with a "word" consisting of a single
# char that is not in any of the listed word break categories followed by
# format char(s).
# format char(s), or is not a CJK dictionary character.
[^$CR $LF $Newline]? ($Extend | $Format)+;

$NumericEx {100};
$ALetterEx {200};
$HangulSyllable {200};
$Hebrew_LetterEx{200};
$KatakanaEx {300}; # note: these status values override those from rule 5
$HiraganaEx {300}; # by virtual of being numerically larger.
$HiraganaEx {300}; # by virtue of being numerically larger.
$IdeographicEx {400}; #

#
# rule 5
# Do not break between most letters.
#
$ALetterEx $ALetterEx {200};
($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200};

# rule 6 and 7
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};

# rule 7a
$Hebrew_LetterEx $Single_QuoteEx {200};

# rule 7b and 7c
$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};

# rule 8

@@ -101,27 +144,35 @@ $NumericEx $NumericEx {100};

# rule 9

$ALetterEx $NumericEx {200};
($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};

# rule 10

$NumericEx $ALetterEx {200};
$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};

# rule 11 and 12

$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};

# rule 13

$KatakanaEx $KatakanaEx {300};

# rule 13a/b

$ALetterEx $ExtendNumLetEx {200}; # (13a)
$Hebrew_LetterEx $ExtendNumLetEx {200}; # (13a)
$NumericEx $ExtendNumLetEx {100}; # (13a)
$KatakanaEx $ExtendNumLetEx {300}; # (13a)
$ExtendNumLetEx $ExtendNumLetEx {200}; # (13a)

$ExtendNumLetEx $ALetterEx {200}; # (13b)
$ExtendNumLetEx $Hebrew_Letter {200}; # (13b)
$ExtendNumLetEx $NumericEx {100}; # (13b)
$ExtendNumLetEx $KatakanaEx {300}; # (13b)

# rule 13c

$Regional_IndicatorEx $Regional_IndicatorEx;

# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};

@@ -1,61 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# This is an example of rule tailoring for Hebrew.
# In this example the single-quote is added to the Extend category
# The double-quote is added to the MidLetter category.
#
!!chain;
$CR = [\p{Word_Break = CR}];
$LF = [\p{Word_Break = LF}];
$Newline = [\p{Word_Break = Newline}];
$Extend = [\p{Word_Break = Extend}\u0027];
$Format = [\p{Word_Break = Format}];
$ALetter = [\p{Word_Break = ALetter}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidLetter = [\p{Word_Break = MidLetter}\u0022];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$dictionary = [:LineBreak = Complex_Context:];
$Control = [\p{Grapheme_Cluster_Break = Control}];
$ALetterPlus = [$ALetter [$dictionary-$Extend-$Control]];

$ALetterEx = $ALetterPlus ($Extend | $Format)*;
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidLetterEx = $MidLetter ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;

!!forward;

$CR $LF;
[^$CR $LF $Newline]? ($Extend | $Format)+;
$NumericEx {100};
$ALetterEx {200};
$ALetterEx $ALetterEx {200};
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
$NumericEx $NumericEx {100};
$ALetterEx $NumericEx {200};
$NumericEx $ALetterEx {200};
$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
$ALetterEx $ExtendNumLetEx {200};
$NumericEx $ExtendNumLetEx {100};
$ExtendNumLetEx $ExtendNumLetEx {200};
$ExtendNumLetEx $ALetterEx {200};
$ExtendNumLetEx $NumericEx {100};

@@ -1,192 +0,0 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Parses Lao text, with syllable as token.
#
# The definition of Lao syllable is based from:
#
# Syllabification of Lao Script for Line Breaking
# Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
# Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP
# http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
# http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
#
# NOTE:
# There are some ambiguities in Lao syllabification without additional processing, as mentioned in the paper.
# For this reason, this RBBI grammar really only works with LaoBreakIterator, as it does this additional work.
#
# Syllable structure, where X is the nuclear consonant:
#
#            +----+
#            | X5 |
#            +----+
#            | X4 |
# +----+----+----+----+----+----+----+-----+
# | X0 | X1 | X  | X6 | X7 | X8 | X9 | X10 |
# +----+----+----+----+----+----+----+-----+
#            | X2 |
#            +----+
#            | X3 |
#            +----+
#
# X0 represents a vowel which occurs before the nuclear consonant.
# It can always define the beginning of syllable.
$X0 = [\u0EC0-\u0EC4];
# X1 is a combination consonant which comes before the nuclear consonant,
# but only if nuclear consonant is one of {ງ ຍ ລ ວ ຼ ມ ນ ຣ}
$X1 = [\u0EAB];
# X represents the nuclear consonant.
$X = [\u0E81-\u0EAE\u0EDC\u0EDD];
# X2 is a combination consonant which comes after the nuclear consonant,
# which is placed under or next to the nuclear consonant.
$X2 = [\u0EBC\u0EA3\u0EA7\u0EA5];
# X3 represents a vowel which occurs under the nuclear consonant.
$X3 = [\u0EB8\u0EB9];
# X4 represents a vowel which occurs above the nuclear consonant.
$X4 = [\u0EB4-\u0EB7\u0ECD\u0EBB\u0EB1];
# X5 represents a tone mark which occurs above the nuclear consonant or upper vowel.
$X5 = [\u0EC8-\u0ECB];
# X6 represents a consonant vowel, which occurs after the nuclear consonant.
# It functions when the syllable doesn’t have any vowels. And it always exists with X8.
$X6 = [\u0EA7\u0EAD\u0EBD];
# X7 represents a final vowel.
# However X7_1 always represents the end of syllable and it never exists with tone mark.
$X7 = [\u0EB0\u0EB2\u0EB3];
# X8 represents an alternate consonant.
$X8 = [\u0E81\u0E87\u0E8D\u0E94\u0E99\u0EA1\u0E9A\u0EA7];
# X9 represents alternate consonants to pronounce foreign terms, it always exist with X10_3.
$X9 = [\u0E88\u0EAA\u0E8A\u0E9E\u0E9F\u0EA5];
# X10 represents a sign mark.
# It always occurs at the end of a syllable, but mostly people keep it separate from syllable.
$X10 = [\u0EAF\u0EC6\u0ECC];

# Section 1
$X0_1 = [\u0EC0];
$X4_1_2 = [\u0EB4\u0EB5];
$X4_3_4 = [\u0EB6\u0EB7];
$X4_6 = [\u0EBB];
$X4_7 = [\u0EB1];
$X6_2 = [\u0EAD];
$X6_3 = [\u0EBD];
$X7_1 = [\u0EB0];
$X7_2 = [\u0EB2];
$X10_1 = [\u0EAF];
$X10_2 = [\u0EC6];
$X10_3 = [\u0ECC];

$Rule1_1 = $X0_1 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_2 = $X0_1 ($X1)? $X ($X2)? $X4_1_2 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_3 = $X0_1 ($X1)? $X ($X2)? $X4_3_4 ($X5)? $X6_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_4 = $X0_1 ($X1)? $X ($X2)? ($X7_2)? $X7_1;
$Rule1_5 = $X0_1 ($X1)? $X ($X2)? $X4_6 ($X5)? $X7_2 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_6 = $X0_1 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule1_7 = $X0_1 ($X1)? $X ($X2)? ($X4_7)? ($X5)? $X6_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;

$Rule1 = ($Rule1_1 | $Rule1_2 | $Rule1_3 | $Rule1_4 | $Rule1_5 | $Rule1_6 | $Rule1_7);

# Section 2
$X0_2 = [\u0EC1];

$Rule2_1 = $X0_2 ($X1)? $X ($X2)? ($X5)? ($X6)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule2_2 = $X0_2 ($X1)? $X ($X2)? $X7_1;
$Rule2_3 = $X0_2 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;

$Rule2 = ($Rule2_1 | $Rule2_2 | $Rule2_3);

# Section 3
$X0_3 = [\u0EC2];
$X8_3 = [\u0E8D];
$X8_8 = [\u0EA7];

$Rule3_1 = $X0_3 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule3_2 = $X0_3 ($X1)? $X ($X2)? $X7_1;
$Rule3_3 = $X0_3 ($X1)? $X ($X2)? $X4_7 ($X5)? ($X8_3 | $X8_8);

$Rule3 = ($Rule3_1 | $Rule3_2 | $Rule3_3);

# Section 4
$X0_4 = [\u0EC4];
$X6_1 = [\u0EA7];

$Rule4 = $X0_4 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;

# Section 5
$X0_5 = [\u0EC3];

$Rule5 = $X0_5 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;

# Section 6
$Rule6 = ($X1)? $X ($X2)? $X3 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;

# Section 7
$X4_1_4 = [\u0EB4-\u0EB7];

$Rule7 = ($X1)? $X ($X2)? $X4_1_4 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;

# Section 8
$X4_5 = [\u0ECD];

$Rule8 = ($X1)? $X ($X2)? $X4_5 ($X5)? ($X7_2)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;

# Section 9

$Rule9_1 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
$Rule9_2 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X6_1 $X7_1;

$Rule9 = ($Rule9_1 | $Rule9_2);

# Section 10
$Rule10 = ($X1)? $X ($X2)? $X4_7 ($X5)? ($X6_1)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;

# Section 11
$Rule11 = ($X1)? $X ($X2)? ($X5)? $X6 $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;

# Section 12
$Rule12 = ($X1)? $X ($X2)? ($X5)? $X7_1;

# Section 13
$Rule13 = ($X1)? $X ($X2)? ($X5)? $X7_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;

# Section 14
$X7_3 = [\u0EB3];

$Rule14 = ($X1)? $X ($X2)? ($X5)? $X7_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;

$LaoSyllableEx = ($Rule1 | $Rule2 | $Rule3 | $Rule4 | $Rule5 | $Rule6 | $Rule7 | $Rule8 | $Rule9 | $Rule10 | $Rule11 | $Rule12 | $Rule13 | $Rule14);

$WordJoin = [:Line_Break=Word_Joiner:];

$LaoJoinedSyllableEx = $LaoSyllableEx ($WordJoin $LaoSyllableEx)*;

#
# default numerical definitions
#
$Extend = [\p{Word_Break = Extend}];
$Format = [\p{Word_Break = Format}];
$MidNumLet = [\p{Word_Break = MidNumLet}];
$MidNum = [\p{Word_Break = MidNum}];
$Numeric = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
$MidNumLetEx = $MidNumLet ($Extend | $Format)*;
$MidNumEx = $MidNum ($Extend | $Format)*;
$NumericEx = $Numeric ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;

!!forward;

$LaoJoinedSyllableEx {200};
# default numeric rules
$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};

@@ -78,7 +78,6 @@ FF0D>002D
## Space Folding
# Rule: [[:Zs:] - [:Changes_When_NFKC_Casefolded=Yes:] - [\u0020]] > 0020
1680>0020
180E>0020

## Spacing Accents folding (done by kd)

@@ -1,4 +1,4 @@
# Copyright (C) 1999-2012, International Business Machines
# Copyright (C) 1999-2013, International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: nfc.txt
@@ -7,7 +7,7 @@
#
# Complete data for Unicode NFC normalization.

* Unicode 6.1.0
* Unicode 6.3.0

# Canonical_Combining_Class (ccc) values
0300..0314:230

@@ -1,4 +1,4 @@
# Copyright (C) 1999-2012, International Business Machines
# Copyright (C) 1999-2013, International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: nfkc.txt
@@ -11,7 +11,7 @@
# to NFKC one-way mappings.
# Use this file as the second gennorm2 input file after nfc.txt.

* Unicode 6.1.0
* Unicode 6.3.0

00A0>0020
00A8>0020 0308

@@ -1,5 +1,5 @@
# Unicode Character Database
# Copyright (c) 1991-2012 Unicode, Inc.
# Copyright (c) 1991-2013 Unicode, Inc.
# For terms of use, see http://www.unicode.org/terms_of_use.html
# For documentation, see http://www.unicode.org/reports/tr44/
#
@@ -12,7 +12,7 @@
# and reformatted into syntax for the gennorm2 Normalizer2 data generator tool.
# Use this file as the third gennorm2 input file after nfc.txt and nfkc.txt.

* Unicode 6.1.0
* Unicode 6.3.0

0041>0061
0042>0062
@@ -537,6 +537,7 @@
0555>0585
0556>0586
0587>0565 0582
061C>
0675>0627 0674
0676>0648 0674
0677>06C7 0674
@@ -627,7 +628,7 @@
10FC>10DC
115F..1160>
17B4..17B5>
180B..180D>
180B..180E>
1D2C>0061
1D2D>00E6
1D2E>0062

@@ -21,7 +21,6 @@ import java.text.CharacterIterator;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.DictionaryBasedBreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16;

@@ -60,15 +59,12 @@ abstract class BreakIteratorWrapper {
  }

  /**
   * If its a DictionaryBasedBreakIterator, it doesn't return rulestatus, so
   * treat it like a generic BreakIterator If its any other
   * RuleBasedBreakIterator, the rule status can be used for token type. If its
   * If its a RuleBasedBreakIterator, the rule status can be used for token type. If its
   * any other BreakIterator, the rulestatus method is not available, so treat
   * it like a generic BreakIterator.
   */
  static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
    if (breakIterator instanceof RuleBasedBreakIterator
        && !(breakIterator instanceof DictionaryBasedBreakIterator))
    if (breakIterator instanceof RuleBasedBreakIterator)
      return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
    else
      return new BIWrapper(breakIterator);

@@ -41,12 +41,13 @@ final class CompositeBreakIterator {
  private final BreakIteratorWrapper wordBreakers[] = new BreakIteratorWrapper[UScript.CODE_LIMIT];

  private BreakIteratorWrapper rbbi;
  private final ScriptIterator scriptIterator = new ScriptIterator();
  private final ScriptIterator scriptIterator;

  private char text[];

  CompositeBreakIterator(ICUTokenizerConfig config) {
    this.config = config;
    this.scriptIterator = new ScriptIterator(config.combineCJ());
  }

  /**

@@ -35,12 +35,9 @@ import com.ibm.icu.util.ULocale;
 * ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}),
 * but with the following tailorings:
 * <ul>
 *   <li>Thai text is broken into words with a
 *       {@link com.ibm.icu.text.DictionaryBasedBreakIterator}
 *   <li>Lao, Myanmar, and Khmer text is broken into syllables
 *   <li>Thai, Lao, and CJK text is broken into words with a dictionary.
 *   <li>Myanmar, and Khmer text is broken into syllables
 *       based on custom BreakIterator rules.
 *   <li>Hebrew text has custom tailorings to handle special cases
 *       involving punctuation.
 * </ul>
 * @lucene.experimental
 */
@@ -62,34 +59,44 @@ public class DefaultICUTokenizerConfig extends ICUTokenizerConfig {
   * the default breakiterators in use. these can be expensive to
   * instantiate, cheap to clone.
   */
  private static final BreakIterator rootBreakIterator =
  // we keep the cjk breaking separate, thats because it cannot be customized (because dictionary
  // is only triggered when kind = WORD, but kind = LINE by default and we have no non-evil way to change it)
  private static final BreakIterator cjkBreakIterator = BreakIterator.getWordInstance(ULocale.ROOT);
  // the same as ROOT, except no dictionary segmentation for cjk
  private static final BreakIterator defaultBreakIterator =
      readBreakIterator("Default.brk");
  private static final BreakIterator thaiBreakIterator =
      BreakIterator.getWordInstance(new ULocale("th_TH"));
  private static final BreakIterator hebrewBreakIterator =
      readBreakIterator("Hebrew.brk");
  private static final BreakIterator khmerBreakIterator =
      readBreakIterator("Khmer.brk");
  private static final BreakIterator laoBreakIterator =
      new LaoBreakIterator(readBreakIterator("Lao.brk"));
  private static final BreakIterator myanmarBreakIterator =
      readBreakIterator("Myanmar.brk");

  // TODO: deprecate this boolean? you only care if you are doing super-expert stuff...
  private final boolean cjkAsWords;

  /**
   * Creates a new config. This object is lightweight, but the first
   * time the class is referenced, breakiterators will be initialized.
   * @param cjkAsWords true if cjk text should undergo dictionary-based segmentation,
   *        otherwise text will be segmented according to UAX#29 defaults.
   *        If this is true, all Han+Hiragana+Katakana words will be tagged as
   *        IDEOGRAPHIC.
   */
  public DefaultICUTokenizerConfig() {}
  public DefaultICUTokenizerConfig(boolean cjkAsWords) {
    this.cjkAsWords = cjkAsWords;
  }

  @Override
  public boolean combineCJ() {
    return cjkAsWords;
  }

  @Override
  public BreakIterator getBreakIterator(int script) {
    switch(script) {
      case UScript.THAI: return (BreakIterator)thaiBreakIterator.clone();
      case UScript.HEBREW: return (BreakIterator)hebrewBreakIterator.clone();
      case UScript.KHMER: return (BreakIterator)khmerBreakIterator.clone();
      case UScript.LAO: return (BreakIterator)laoBreakIterator.clone();
      case UScript.MYANMAR: return (BreakIterator)myanmarBreakIterator.clone();
      default: return (BreakIterator)rootBreakIterator.clone();
      case UScript.JAPANESE: return (BreakIterator)cjkBreakIterator.clone();
      default: return (BreakIterator)defaultBreakIterator.clone();
    }
  }

@@ -68,7 +68,7 @@ public final class ICUTokenizer extends Tokenizer {
   * @see DefaultICUTokenizerConfig
   */
  public ICUTokenizer(Reader input) {
    this(input, new DefaultICUTokenizerConfig());
    this(input, new DefaultICUTokenizerConfig(true));
  }

  /**

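The two constructors above differ only in the config they pass along; a minimal usage sketch (the sample text and the expected word splits are taken from the CJK tests added later in this patch, everything else is illustrative):

  import java.io.StringReader;
  import org.apache.lucene.analysis.Tokenizer;
  import org.apache.lucene.analysis.icu.segmentation.DefaultICUTokenizerConfig;
  import org.apache.lucene.analysis.icu.segmentation.ICUTokenizer;

  // New default: dictionary-based CJK segmentation,
  // e.g. "我购买了道具和服装。" -> 我 | 购买 | 了 | 道具 | 和 | 服装
  Tokenizer withCjkWords = new ICUTokenizer(new StringReader("我购买了道具和服装。"));

  // Pre-patch behavior: plain UAX#29 breaking, where each Han character is its own token
  Tokenizer uax29Only = new ICUTokenizer(new StringReader("我购买了道具和服装。"),
      new DefaultICUTokenizerConfig(false));
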
@@ -36,4 +36,6 @@ public abstract class ICUTokenizerConfig {
  /** Return a token type value for a given script and BreakIterator
   *  rule status. */
  public abstract String getType(int script, int ruleStatus);
  /** true if Han, Hiragana, and Katakana scripts should all be returned as Japanese */
  public abstract boolean combineCJ();
}

@@ -70,7 +70,7 @@ import com.ibm.icu.text.RuleBasedBreakIterator;
 * <pre class="prettyprint" >
 * <fieldType name="text_icu_custom" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
 *     <tokenizer class="solr.ICUTokenizerFactory"
 *     <tokenizer class="solr.ICUTokenizerFactory" cjkAsWords="true"
 *                rulefiles="Latn:my.Latin.rules.rbbi,Cyrl:my.Cyrillic.rules.rbbi"/>
 *   </analyzer>
 * </fieldType></pre>
@@ -79,6 +79,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
  static final String RULEFILES = "rulefiles";
  private final Map<Integer,String> tailored;
  private ICUTokenizerConfig config;
  private final boolean cjkAsWords;

  /** Creates a new ICUTokenizerFactory */
  public ICUTokenizerFactory(Map<String,String> args) {
@@ -94,6 +95,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
        tailored.put(UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode), resourcePath);
      }
    }
    cjkAsWords = getBoolean(args, "cjkAsWords", true);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
@@ -103,7 +105,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
  public void inform(ResourceLoader loader) throws IOException {
    assert tailored != null : "init must be called first!";
    if (tailored.isEmpty()) {
      config = new DefaultICUTokenizerConfig();
      config = new DefaultICUTokenizerConfig(cjkAsWords);
    } else {
      final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT];
      for (Map.Entry<Integer,String> entry : tailored.entrySet()) {
@@ -111,7 +113,7 @@ public class ICUTokenizerFactory extends TokenizerFactory implements ResourceLoa
        String resourcePath = entry.getValue();
        breakers[code] = parseRules(resourcePath, loader);
      }
      config = new DefaultICUTokenizerConfig() {
      config = new DefaultICUTokenizerConfig(cjkAsWords) {

        @Override
        public BreakIterator getBreakIterator(int script) {

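Putting the factory pieces together, a rough sketch of how the new cjkAsWords argument flows through (the HashMap setup and the ClasspathResourceLoader are illustrative assumptions, not part of the patch):

  import java.util.HashMap;
  import java.util.Map;
  import org.apache.lucene.analysis.util.ClasspathResourceLoader;

  Map<String,String> args = new HashMap<String,String>();
  args.put("cjkAsWords", "false");  // if omitted, defaults to true via getBoolean(args, "cjkAsWords", true)
  ICUTokenizerFactory factory = new ICUTokenizerFactory(args);
  // inform(...) then builds new DefaultICUTokenizerConfig(cjkAsWords), as shown above
  factory.inform(new ClasspathResourceLoader(ICUTokenizerFactory.class));
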
@@ -1,230 +0,0 @@
package org.apache.lucene.analysis.icu.segmentation;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.text.CharacterIterator;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UnicodeSet;

/**
 * Syllable iterator for Lao text.
 * <p>
 * This breaks Lao text into syllables according to:
 * <i>Syllabification of Lao Script for Line Breaking</i>
 * Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
 * Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP.
 * <ul>
 *  <li>http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
 *  <li>http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
 * </ul>
 * <p>
 * Most work is accomplished with RBBI rules, however some additional special logic is needed
 * that cannot be coded in a grammar, and this is implemented here.
 * <p>
 * For example, what appears to be a final consonant might instead be part of the next syllable.
 * Rules match in a greedy fashion, leaving an illegal sequence that matches no rules.
 * <p>
 * Take for instance the text ກວ່າດອກ
 * The first rule greedily matches ກວ່າດ, but then ອກ is encountered, which is illegal.
 * What LaoBreakIterator does, according to the paper:
 * <ol>
 *  <li>backtrack and remove the ດ from the last syllable, placing it on the current syllable.
 *  <li>verify the modified previous syllable (ກວ່າ ) is still legal.
 *  <li>verify the modified current syllable (ດອກ) is now legal.
 *  <li>If 2 or 3 fails, then restore the ດ to the last syllable and skip the current character.
 * </ol>
 * <p>
 * Finally, LaoBreakIterator also takes care of the second concern mentioned in the paper.
 * This is the issue of combining marks being in the wrong order (typos).
 * @lucene.experimental
 */
public class LaoBreakIterator extends BreakIterator {
  RuleBasedBreakIterator rules;
  CharArrayIterator text;

  CharArrayIterator working = new CharArrayIterator();
  int workingOffset = 0;

  CharArrayIterator verifyText = new CharArrayIterator();
  RuleBasedBreakIterator verify;

  private static final UnicodeSet laoSet;
  static {
    laoSet = new UnicodeSet("[:Lao:]");
    laoSet.compact();
    laoSet.freeze();
  }

  /**
   * Creates a new iterator, performing the backtracking verification
   * across the provided <code>rules</code>.
   */
  public LaoBreakIterator(RuleBasedBreakIterator rules) {
    this.rules = (RuleBasedBreakIterator) rules.clone();
    this.verify = (RuleBasedBreakIterator) rules.clone();
  }

  @Override
  public int current() {
    int current = rules.current();
    return current == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + current;
  }

  @Override
  public int first() {
    working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
    rules.setText(working);
    workingOffset = 0;
    int first = rules.first();
    return first == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + first;
  }

  @Override
  public int following(int offset) {
    throw new UnsupportedOperationException();
  }

  @Override
  public CharacterIterator getText() {
    return text;
  }

  @Override
  public int last() {
    throw new UnsupportedOperationException();
  }

  @Override
  public int next() {
    int current = current();
    int next = rules.next();
    if (next == BreakIterator.DONE)
      return next;
    else
      next += workingOffset;

    char c = working.current();
    int following = rules.next(); // lookahead
    if (following != BreakIterator.DONE) {
      following += workingOffset;
      if (rules.getRuleStatus() == 0 && laoSet.contains(c) && verifyPushBack(current, next)) {
        workingOffset = next - 1;
        working.setText(text.getText(), text.getStart() + workingOffset, text.getLength() - workingOffset);
        return next - 1;
      }
      rules.previous(); // undo the lookahead
    }

    return next;
  }

  @Override
  public int next(int n) {
    if (n < 0)
      throw new UnsupportedOperationException("Backwards traversal is unsupported");

    int result = current();
    while (n > 0) {
      result = next();
      --n;
    }
    return result;
  }

  @Override
  public int previous() {
    throw new UnsupportedOperationException("Backwards traversal is unsupported");
  }

  @Override
  public void setText(CharacterIterator text) {
    if (!(text instanceof CharArrayIterator))
      throw new UnsupportedOperationException("unsupported CharacterIterator");
    this.text = (CharArrayIterator) text;
    ccReorder(this.text.getText(), this.text.getStart(), this.text.getLength());
    working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
    rules.setText(working);
    workingOffset = 0;
  }

  @Override
  public void setText(String newText) {
    CharArrayIterator ci = new CharArrayIterator();
    ci.setText(newText.toCharArray(), 0, newText.length());
    setText(ci);
  }

  private boolean verifyPushBack(int current, int next) {
    int shortenedSyllable = next - current - 1;

    verifyText.setText(text.getText(), text.getStart() + current, shortenedSyllable);
    verify.setText(verifyText);
    if (verify.next() != shortenedSyllable || verify.getRuleStatus() == 0)
      return false;

    verifyText.setText(text.getText(), text.getStart() + next - 1, text.getLength() - next + 1);
    verify.setText(verifyText);

    return (verify.next() != BreakIterator.DONE && verify.getRuleStatus() != 0);
  }

  // TODO: only bubblesort around runs of combining marks, instead of the entire text.
  private void ccReorder(char[] text, int start, int length) {
    boolean reordered;
    do {
      int prevCC = 0;
      reordered = false;
      for (int i = start; i < start + length; i++) {
        final char c = text[i];
        final int cc = UCharacter.getCombiningClass(c);
        if (cc > 0 && cc < prevCC) {
          // swap
          text[i] = text[i - 1];
          text[i - 1] = c;
          reordered = true;
        } else {
          prevCC = cc;
        }
      }

    } while (reordered == true);
  }

  /**
   * Clone method. Creates another LaoBreakIterator with the same behavior
   * and current state as this one.
   * @return The clone.
   */
  @Override
  public LaoBreakIterator clone() {
    LaoBreakIterator other = (LaoBreakIterator) super.clone();
    other.rules = (RuleBasedBreakIterator) rules.clone();
    other.verify = (RuleBasedBreakIterator) verify.clone();
    if (text != null)
      other.text = text.clone();
    if (working != null)
      other.working = working.clone();
    if (verifyText != null)
      other.verifyText = verifyText.clone();
    return other;
  }
}

@@ -60,6 +60,15 @@ final class ScriptIterator {
  private int scriptLimit;
  private int scriptCode;

  private final boolean combineCJ;

  /**
   * @param combineCJ if true: Han,Hiragana,Katakana will all return as {@link UScript#JAPANESE}
   */
  ScriptIterator(boolean combineCJ) {
    this.combineCJ = combineCJ;
  }

  /**
   * Get the start of this script run
   *
@@ -162,10 +171,24 @@ final class ScriptIterator {
  }

  /** fast version of UScript.getScript(). Basic Latin is an array lookup */
  private static int getScript(int codepoint) {
    if (0 <= codepoint && codepoint < basicLatin.length)
  private int getScript(int codepoint) {
    if (0 <= codepoint && codepoint < basicLatin.length) {
      return basicLatin[codepoint];
    else
      return UScript.getScript(codepoint);
    } else {
      int script = UScript.getScript(codepoint);
      if (combineCJ) {
        if (script == UScript.HAN || script == UScript.HIRAGANA || script == UScript.KATAKANA) {
          return UScript.JAPANESE;
        } else if (codepoint >= 0xFF10 && codepoint <= 0xFF19) {
          // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise
          // they are treated as punctuation. we currently have no cleaner way to fix this!
          return UScript.LATIN;
        } else {
          return script;
        }
      } else {
        return script;
      }
    }
  }
}

@@ -84,6 +84,10 @@ public class ScriptAttributeImpl extends AttributeImpl implements ScriptAttribut

  @Override
  public void reflectWith(AttributeReflector reflector) {
    reflector.reflect(ScriptAttribute.class, "script", getName());
    // when wordbreaking CJK, we use the 15924 code Japanese (Han+Hiragana+Katakana) to
    // mark runs of Chinese/Japanese. our use is correct (as for chinese Han is a subset),
    // but this is just to help prevent confusion.
    String name = code == UScript.JAPANESE ? "Chinese/Japanese" : getName();
    reflector.reflect(ScriptAttribute.class, "script", name);
  }
}

@@ -14,6 +14,7 @@
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- :Post-Release-Update-Version.LUCENE_XY: - several mentions in this file -->
<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
@@ -114,9 +115,9 @@ algorithm.
<h3>Farsi Range Queries</h3>
<pre class="prettyprint">
  Collator collator = Collator.getInstance(new ULocale("ar"));
  ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
  ICUCollationKeyAnalyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_50, collator);
  RAMDirectory ramDir = new RAMDirectory();
  IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
  IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_50, analyzer));
  Document doc = new Document();
  doc.add(new Field("content", "\u0633\u0627\u0628",
                    Field.Store.YES, Field.Index.ANALYZED));
@@ -124,7 +125,7 @@ algorithm.
  writer.close();
  IndexSearcher is = new IndexSearcher(ramDir, true);

  QueryParser aqp = new QueryParser(Version.LUCENE_40, "content", analyzer);
  QueryParser aqp = new QueryParser(Version.LUCENE_50, "content", analyzer);
  aqp.setAnalyzeRangeTerms(true);

  // Unicode order would include U+0633 in [ U+062F - U+0698 ], but Farsi
@@ -140,9 +141,9 @@ algorithm.
<h3>Danish Sorting</h3>
<pre class="prettyprint">
  Analyzer analyzer
    = new ICUCollationKeyAnalyzer(Version.LUCENE_40, Collator.getInstance(new ULocale("da", "dk")));
    = new ICUCollationKeyAnalyzer(Version.LUCENE_50, Collator.getInstance(new ULocale("da", "dk")));
  RAMDirectory indexStore = new RAMDirectory();
  IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(Version.LUCENE_40, analyzer));
  IndexWriter writer = new IndexWriter(indexStore, new IndexWriterConfig(Version.LUCENE_50, analyzer));
  String[] tracer = new String[] { "A", "B", "C", "D", "E" };
  String[] data = new String[] { "HAT", "HUT", "H\u00C5T", "H\u00D8T", "HOT" };
  String[] sortedTracerOrder = new String[] { "A", "E", "B", "D", "C" };
@@ -168,15 +169,15 @@ algorithm.
<pre class="prettyprint">
  Collator collator = Collator.getInstance(new ULocale("tr", "TR"));
  collator.setStrength(Collator.PRIMARY);
  Analyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_40, collator);
  Analyzer analyzer = new ICUCollationKeyAnalyzer(Version.LUCENE_50, collator);
  RAMDirectory ramDir = new RAMDirectory();
  IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_40, analyzer));
  IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_50, analyzer));
  Document doc = new Document();
  doc.add(new Field("contents", "DIGY", Field.Store.NO, Field.Index.ANALYZED));
  writer.addDocument(doc);
  writer.close();
  IndexSearcher is = new IndexSearcher(ramDir, true);
  QueryParser parser = new QueryParser(Version.LUCENE_40, "contents", analyzer);
  QueryParser parser = new QueryParser(Version.LUCENE_50, "contents", analyzer);
  Query query = parser.parse("d\u0131gy"); // U+0131: dotless i
  ScoreDoc[] result = is.search(query, null, 1000).scoreDocs;
  assertEquals("The index Term should be included.", 1, result.length);
@@ -353,7 +354,7 @@ and
<h1><a name="backcompat">Backwards Compatibility</a></h1>
<p>
This module exists to provide up-to-date Unicode functionality that supports
the most recent version of Unicode (currently 6.1). However, some users who wish
the most recent version of Unicode (currently 6.3). However, some users who wish
for stronger backwards compatibility can restrict
{@link org.apache.lucene.analysis.icu.ICUNormalizer2Filter} to operate on only
a specific Unicode Version by using a {@link com.ibm.icu.text.FilteredNormalizer2}.

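A hedged sketch of the FilteredNormalizer2 restriction that paragraph describes (the nfkc_cf form, the [:age=6.0:] set, and the tokenizer variable are illustrative assumptions, not prescribed by the docs above):

  import com.ibm.icu.text.FilteredNormalizer2;
  import com.ibm.icu.text.Normalizer2;
  import com.ibm.icu.text.UnicodeSet;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;

  Normalizer2 base = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
  // only normalize characters that already existed in Unicode 6.0
  UnicodeSet unicode60 = new UnicodeSet("[:age=6.0:]").freeze();
  Normalizer2 pinned = new FilteredNormalizer2(base, unicode60);
  TokenStream stream = new ICUNormalizer2Filter(tokenizer, pinned);
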
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -42,7 +42,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
    sb.append(whitespace);
    sb.append("testing 1234");
    String input = sb.toString();
    ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
    ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input), new DefaultICUTokenizerConfig(false));
    assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
  }

@@ -52,7 +52,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
      sb.append('a');
    }
    String input = sb.toString();
    ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input));
    ICUTokenizer tokenizer = new ICUTokenizer(new StringReader(input), new DefaultICUTokenizerConfig(false));
    char token[] = new char[4096];
    Arrays.fill(token, 'a');
    String expectedToken = new String(token);
@@ -69,7 +69,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
    @Override
    protected TokenStreamComponents createComponents(String fieldName,
        Reader reader) {
      Tokenizer tokenizer = new ICUTokenizer(reader);
      Tokenizer tokenizer = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
      TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
      return new TokenStreamComponents(tokenizer, filter);
    }
@@ -118,6 +118,7 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {

  public void testLao() throws Exception {
    assertAnalyzesTo(a, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
    assertAnalyzesTo(a, "ພາສາລາວ", new String[] { "ພາສາ", "ລາວ"}, new String[] { "<ALPHANUM>", "<ALPHANUM>" });
  }

  public void testThai() throws Exception {
@@ -138,6 +139,13 @@ public class TestICUTokenizer extends BaseTokenStreamTestCase {
        new String[] { "我", "是", "中", "国", "人", "1234", "tests"});
  }

  public void testHebrew() throws Exception {
    assertAnalyzesTo(a, "דנקנר תקף את הדו\"ח",
        new String[] { "דנקנר", "תקף", "את", "הדו\"ח" });
    assertAnalyzesTo(a, "חברת בת של מודי'ס",
        new String[] { "חברת", "בת", "של", "מודי'ס" });
  }

  public void testEmpty() throws Exception {
    assertAnalyzesTo(a, "", new String[] {});
    assertAnalyzesTo(a, ".", new String[] {});

@@ -0,0 +1,91 @@
package org.apache.lucene.analysis.icu.segmentation;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.util.Random;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;

/**
 * test ICUTokenizer with dictionary-based CJ segmentation
 */
public class TestICUTokenizerCJK extends BaseTokenStreamTestCase {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      return new TokenStreamComponents(new ICUTokenizer(reader));
    }
  };

  /**
   * test stolen from smartcn
   */
  public void testSimpleChinese() throws Exception {
    assertAnalyzesTo(a, "我购买了道具和服装。",
        new String[] { "我", "购买", "了", "道具", "和", "服装" }
    );
  }

  public void testChineseNumerics() throws Exception {
    assertAnalyzesTo(a, "9483", new String[] { "9483" });
    assertAnalyzesTo(a, "院內分機9483。",
        new String[] { "院", "內", "分機", "9483" });
    assertAnalyzesTo(a, "院內分機9483。",
        new String[] { "院", "內", "分機", "9483" });
  }

  /**
   * test stolen from kuromoji
   */
  public void testSimpleJapanese() throws Exception {
    assertAnalyzesTo(a, "それはまだ実験段階にあります",
        new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます" }
    );
  }

  public void testJapaneseTypes() throws Exception {
    assertAnalyzesTo(a, "仮名遣い カタカナ",
        new String[] { "仮名遣い", "カタカナ" },
        new String[] { "<IDEOGRAPHIC>", "<IDEOGRAPHIC>" });
  }

  public void testKorean() throws Exception {
    // Korean words
    assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"});
  }

  /** make sure that we still tag korean as HANGUL (for further decomposition/ngram/whatever) */
  public void testKoreanTypes() throws Exception {
    assertAnalyzesTo(a, "훈민정음",
        new String[] { "훈민정음" },
        new String[] { "<HANGUL>" });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random(), a, 10000*RANDOM_MULTIPLIER);
  }

  /** blast some random large strings through the analyzer */
  public void testRandomHugeStrings() throws Exception {
    Random random = random();
    checkRandomData(random, a, 100*RANDOM_MULTIPLIER, 8192);
  }
}

@@ -1,90 +0,0 @@
package org.apache.lucene.analysis.icu.segmentation;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.InputStream;

import org.apache.lucene.util.LuceneTestCase;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16;

/**
 * Tests LaoBreakIterator and its RBBI rules
 */
public class TestLaoBreakIterator extends LuceneTestCase {
  private BreakIterator wordIterator;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    InputStream is = getClass().getResourceAsStream("Lao.brk");
    wordIterator = new LaoBreakIterator(RuleBasedBreakIterator.getInstanceFromCompiledRules(is));
    is.close();
  }

  private void assertBreaksTo(BreakIterator iterator, String sourceText, String tokens[]) {
    char text[] = sourceText.toCharArray();
    CharArrayIterator ci = new CharArrayIterator();
    ci.setText(text, 0, text.length);
    iterator.setText(ci);

    for (int i = 0; i < tokens.length; i++) {
      int start, end;
      do {
        start = iterator.current();
        end = iterator.next();
      } while (end != BreakIterator.DONE && !isWord(text, start, end));
      assertTrue(start != BreakIterator.DONE);
      assertTrue(end != BreakIterator.DONE);
      assertEquals(tokens[i], new String(text, start, end - start));
    }

    assertTrue(iterator.next() == BreakIterator.DONE);
  }

  protected boolean isWord(char text[], int start, int end) {
    int codepoint;
    for (int i = start; i < end; i += UTF16.getCharCount(codepoint)) {
      codepoint = UTF16.charAt(text, 0, end, start);

      if (UCharacter.isLetterOrDigit(codepoint))
        return true;
    }

    return false;
  }

  public void testBasicUsage() throws Exception {
    assertBreaksTo(wordIterator, "ກວ່າດອກ", new String[] { "ກວ່າ", "ດອກ" });
    assertBreaksTo(wordIterator, "ຜູ້ເຂົ້າ", new String[] { "ຜູ້", "ເຂົ້າ" });
    assertBreaksTo(wordIterator, "", new String[] {});
    assertBreaksTo(wordIterator, "ສະບາຍດີ", new String[] { "ສະ", "ບາຍ", "ດີ" });
  }

  public void testNumerics() throws Exception {
    assertBreaksTo(wordIterator, "໐໑໒໓", new String[] { "໐໑໒໓" });
    assertBreaksTo(wordIterator, "໐໑໒໓.໕໖", new String[] { "໐໑໒໓.໕໖" });
  }

  public void testTextAndNumerics() throws Exception {
    assertBreaksTo(wordIterator, "ກວ່າດອກ໐໑໒໓", new String[] { "ກວ່າ", "ດອກ", "໐໑໒໓" });
  }
}

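The file removed above tested a custom break iterator for Lao, a script written without spaces between words: assertBreaksTo walks [start, end) boundary pairs and keeps only the ranges that contain a letter or digit. The same iterate-and-filter pattern can be sketched with the JDK's own java.text.BreakIterator (an illustrative stand-in for the ICU classes, using English text so it runs without the compiled Lao.brk rules):

import java.text.BreakIterator;
import java.util.Locale;

public class BreakDemo {
  public static void main(String[] args) {
    String text = "Lucene breaks text into words.";
    BreakIterator it = BreakIterator.getWordInstance(Locale.ENGLISH);
    it.setText(text);

    // walk [start, end) boundary pairs, keeping only ranges with a letter or digit,
    // mirroring the isWord() filter in the deleted test
    int start = it.first();
    for (int end = it.next(); end != BreakIterator.DONE; start = end, end = it.next()) {
      String token = text.substring(start, end);
      if (token.codePoints().anyMatch(Character::isLetterOrDigit)) {
        System.out.println(token); // Lucene / breaks / text / into / words
      }
    }
  }
}
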
@@ -41,7 +41,7 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
  private Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer source = new ICUTokenizer(reader);
      Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
      TokenStream result = new CJKBigramFilter(source);
      return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
    }

@@ -56,7 +56,7 @@ public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {
  private Analyzer analyzer2 = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer source = new ICUTokenizer(reader);
      Tokenizer source = new ICUTokenizer(reader, new DefaultICUTokenizerConfig(false));
      // we put this before the CJKBigramFilter, because the normalization might combine
      // some halfwidth katakana forms, which will affect the bigramming.
      TokenStream result = new ICUNormalizer2Filter(source);

@@ -39,11 +39,12 @@ public class GenerateJFlexSupplementaryMacros {

  private static final String APACHE_LICENSE
    = "/*" + NL
    + " * Copyright 2010 The Apache Software Foundation." + NL
    + " *" + NL
    + " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
    + " * you may not use this file except in compliance with the License." + NL
    + " * You may obtain a copy of the License at" + NL
    + " * Licensed to the Apache Software Foundation (ASF) under one or more" + NL
    + " * contributor license agreements. See the NOTICE file distributed with" + NL
    + " * this work for additional information regarding copyright ownership." + NL
    + " * The ASF licenses this file to You under the Apache License, Version 2.0" + NL
    + " * (the \"License\"); you may not use this file except in compliance with" + NL
    + " * the License. You may obtain a copy of the License at" + NL
    + " *" + NL
    + " * http://www.apache.org/licenses/LICENSE-2.0" + NL
    + " *" + NL

@@ -52,15 +53,15 @@ public class GenerateJFlexSupplementaryMacros {
    + " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
    + " * See the License for the specific language governing permissions and" + NL
    + " * limitations under the License." + NL
    + " */" + NL + NL;
    + " */" + NL;


  public static void main(String args[]) {
    outputHeader();
    outputMacro("ALetterSupp", "[:WordBreak=ALetter:]");
    outputMacro("FormatSupp", "[:WordBreak=Format:]");
    outputMacro("ExtendSupp", "[:WordBreak=Extend:]");
    outputMacro("NumericSupp", "[:WordBreak=Numeric:]");
    outputMacro("ExtendSupp", "[:WordBreak=Extend:]");
    outputMacro("KatakanaSupp", "[:WordBreak=Katakana:]");
    outputMacro("MidLetterSupp", "[:WordBreak=MidLetter:]");
    outputMacro("MidNumSupp", "[:WordBreak=MidNum:]");

@@ -70,6 +71,10 @@ public class GenerateJFlexSupplementaryMacros {
    outputMacro("ComplexContextSupp", "[:LineBreak=Complex_Context:]");
    outputMacro("HanSupp", "[:Script=Han:]");
    outputMacro("HiraganaSupp", "[:Script=Hiragana:]");
    outputMacro("SingleQuoteSupp", "[:WordBreak=Single_Quote:]");
    outputMacro("DoubleQuoteSupp", "[:WordBreak=Double_Quote:]");
    outputMacro("HebrewLetterSupp", "[:WordBreak=Hebrew_Letter:]");
    outputMacro("RegionalIndicatorSupp", "[:WordBreak=Regional_Indicator:]");
  }

  static void outputHeader() {

@@ -62,7 +62,7 @@ import java.util.regex.Pattern;
public class GenerateUTR30DataFiles {
  private static final String ICU_SVN_TAG_URL
      = "http://source.icu-project.org/repos/icu/icu/tags";
  private static final String ICU_RELEASE_TAG = "release-49-1-2";
  private static final String ICU_RELEASE_TAG = "release-52-1";
  private static final String ICU_DATA_NORM2_PATH = "source/data/unidata/norm2";
  private static final String NFC_TXT = "nfc.txt";
  private static final String NFKC_TXT = "nfkc.txt";

@@ -97,7 +97,8 @@ public class CreateIndexTask extends PerfTask {
  }

  public static IndexWriterConfig createWriterConfig(Config config, PerfRunData runData, OpenMode mode, IndexCommit commit) {
    Version version = Version.valueOf(config.get("writer.version", Version.LUCENE_40.toString()));
    // :Post-Release-Update-Version.LUCENE_XY:
    Version version = Version.valueOf(config.get("writer.version", Version.LUCENE_50.toString()));
    IndexWriterConfig iwConf = new IndexWriterConfig(version, runData.getAnalyzer());
    iwConf.setOpenMode(mode);
    IndexDeletionPolicy indexDeletionPolicy = getIndexDeletionPolicy(config);

@@ -37,7 +37,8 @@ public class CreateIndexTaskTest extends BenchmarkTestCase {

  private PerfRunData createPerfRunData(String infoStreamValue) throws Exception {
    Properties props = new Properties();
    props.setProperty("writer.version", Version.LUCENE_40.toString());
    // :Post-Release-Update-Version.LUCENE_XY:
    props.setProperty("writer.version", Version.LUCENE_50.toString());
    props.setProperty("print.props", "false"); // don't print anything
    props.setProperty("directory", "RAMDirectory");
    if (infoStreamValue != null) {

@@ -49,6 +49,9 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
  private final int k;
  private Query query;

  private int minDocsFreq;
  private int minTermFreq;

  /**
   * Create a {@link Classifier} using kNN algorithm
   *

@@ -58,6 +61,19 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
    this.k = k;
  }

  /**
   * Create a {@link Classifier} using kNN algorithm
   *
   * @param k the number of neighbors to analyze as an <code>int</code>
   * @param minDocsFreq the minimum number of docs frequency for MLT to be set with {@link MoreLikeThis#setMinDocFreq(int)}
   * @param minTermFreq the minimum number of term frequency for MLT to be set with {@link MoreLikeThis#setMinTermFreq(int)}
   */
  public KNearestNeighborClassifier(int k, int minDocsFreq, int minTermFreq) {
    this.k = k;
    this.minDocsFreq = minDocsFreq;
    this.minTermFreq = minTermFreq;
  }

  /**
   * {@inheritDoc}
   */

@@ -93,11 +109,11 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
    }
    double max = 0;
    BytesRef assignedClass = new BytesRef();
    for (BytesRef cl : classCounts.keySet()) {
      Integer count = classCounts.get(cl);
    for (Map.Entry<BytesRef, Integer> entry : classCounts.entrySet()) {
      Integer count = entry.getValue();
      if (count > max) {
        max = count;
        assignedClass = cl.clone();
        assignedClass = entry.getKey().clone();
      }
    }
    double score = max / (double) k;

@@ -117,13 +133,7 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
   */
  @Override
  public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query) throws IOException {
    this.textFieldNames = new String[]{textFieldName};
    this.classFieldName = classFieldName;
    mlt = new MoreLikeThis(atomicReader);
    mlt.setAnalyzer(analyzer);
    mlt.setFieldNames(new String[]{textFieldName});
    indexSearcher = new IndexSearcher(atomicReader);
    this.query = query;
    train(atomicReader, new String[]{textFieldName}, classFieldName, analyzer, query);
  }

  /**

@@ -137,6 +147,12 @@ public class KNearestNeighborClassifier implements Classifier<BytesRef> {
    mlt.setAnalyzer(analyzer);
    mlt.setFieldNames(textFieldNames);
    indexSearcher = new IndexSearcher(atomicReader);
    if (minDocsFreq > 0) {
      mlt.setMinDocFreq(minDocsFreq);
    }
    if (minTermFreq > 0) {
      mlt.setMinTermFreq(minTermFreq);
    }
    this.query = query;
  }
}

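In the @@ -93,11 +109,11 @@ hunk above, iterating classCounts.keySet() and calling get(cl) per key is replaced by a single pass over entrySet(), which hands back key and value together and saves one hash lookup per class. A minimal self-contained comparison of the two shapes (the map contents here are made up):

import java.util.HashMap;
import java.util.Map;

public class EntrySetDemo {
  public static void main(String[] args) {
    Map<String, Integer> classCounts = new HashMap<>();
    classCounts.put("politics", 4);
    classCounts.put("technology", 7);

    // old shape: every iteration pays for a second hash lookup inside get()
    for (String cl : classCounts.keySet()) {
      Integer count = classCounts.get(cl);
      System.out.println(cl + " -> " + count);
    }

    // new shape: key and value come out of the same entry, one lookup total
    for (Map.Entry<String, Integer> entry : classCounts.entrySet()) {
      System.out.println(entry.getKey() + " -> " + entry.getValue());
    }
  }
}
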
@@ -64,23 +64,17 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
   * {@inheritDoc}
   */
  @Override
  public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query)
      throws IOException {
    this.atomicReader = atomicReader;
    this.indexSearcher = new IndexSearcher(this.atomicReader);
    this.textFieldNames = new String[]{textFieldName};
    this.classFieldName = classFieldName;
    this.analyzer = analyzer;
    this.docsWithClassSize = countDocsWithClass();
    this.query = query;
  public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer) throws IOException {
    train(atomicReader, textFieldName, classFieldName, analyzer, null);
  }

  /**
   * {@inheritDoc}
   */
  @Override
  public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer) throws IOException {
    train(atomicReader, textFieldName, classFieldName, analyzer, null);
  public void train(AtomicReader atomicReader, String textFieldName, String classFieldName, Analyzer analyzer, Query query)
      throws IOException {
    train(atomicReader, new String[]{textFieldName}, classFieldName, analyzer, query);
  }

  /**

@@ -137,7 +131,7 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
    if (atomicReader == null) {
      throw new IOException("You must first call Classifier#train");
    }
    double max = 0d;
    double max = - Double.MAX_VALUE;
    BytesRef foundClass = new BytesRef();

    Terms terms = MultiFields.getTerms(atomicReader, classFieldName);

@@ -145,20 +139,20 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
    BytesRef next;
    String[] tokenizedDoc = tokenizeDoc(inputDocument);
    while ((next = termsEnum.next()) != null) {
      // TODO : turn it to be in log scale
      double clVal = calculatePrior(next) * calculateLikelihood(tokenizedDoc, next);
      double clVal = calculateLogPrior(next) + calculateLogLikelihood(tokenizedDoc, next);
      if (clVal > max) {
        max = clVal;
        foundClass = BytesRef.deepCopyOf(next);
      }
    }
    return new ClassificationResult<BytesRef>(foundClass, max);
    double score = 10 / Math.abs(max);
    return new ClassificationResult<BytesRef>(foundClass, score);
  }


  private double calculateLikelihood(String[] tokenizedDoc, BytesRef c) throws IOException {
  private double calculateLogLikelihood(String[] tokenizedDoc, BytesRef c) throws IOException {
    // for each word
    double result = 1d;
    double result = 0d;
    for (String word : tokenizedDoc) {
      // search with text:word AND class:c
      int hits = getWordFreqForClass(word, c);

@@ -171,10 +165,10 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {

      // P(w|c) = num/den
      double wordProbability = num / den;
      result *= wordProbability;
      result += Math.log(wordProbability);
    }

    // P(d|c) = P(w1|c)*...*P(wn|c)
    // log(P(d|c)) = log(P(w1|c))+...+log(P(wn|c))
    return result;
  }

@@ -205,8 +199,8 @@ public class SimpleNaiveBayesClassifier implements Classifier<BytesRef> {
    return totalHitCountCollector.getTotalHits();
  }

  private double calculatePrior(BytesRef currentClass) throws IOException {
    return (double) docCount(currentClass) / docsWithClassSize;
  private double calculateLogPrior(BytesRef currentClass) throws IOException {
    return Math.log((double) docCount(currentClass)) - Math.log(docsWithClassSize);
  }

  private int docCount(BytesRef countedClass) throws IOException {

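The classifier hunks above move naive Bayes scoring into log space: products of per-word probabilities become sums of their logarithms, and max starts at -Double.MAX_VALUE because log scores are negative. The motivation is numeric, not statistical: a product of many values below 1 underflows a double to 0.0 and makes all classes tie, while the log sum stays finite and preserves the ordering. A standalone sketch of the effect, not part of the patch (the probability and document length are made-up values):

public class LogSpaceDemo {
  public static void main(String[] args) {
    double p = 1e-5;   // a made-up per-word likelihood
    int words = 100;   // a made-up document length in tokens

    double product = 1d;
    double logSum = 0d;
    for (int i = 0; i < words; i++) {
      product *= p;          // underflows to 0.0 after roughly 65 factors
      logSum += Math.log(p); // stays finite: 100 * ln(1e-5)
    }
    System.out.println(product); // 0.0
    System.out.println(logSum);  // about -1151.3
  }
}
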
@@ -69,6 +69,7 @@ public class DatasetSplitter {
      Analyzer analyzer, String... fieldNames) throws IOException {

    // create IWs for train / test / cv IDXs
    // :Post-Release-Update-Version.LUCENE_XY:
    IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(Version.LUCENE_50, analyzer));
    IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(Version.LUCENE_50, analyzer));
    IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(Version.LUCENE_50, analyzer));

@@ -39,14 +39,17 @@ import java.util.Random;
 * Base class for testing {@link Classifier}s
 */
public abstract class ClassificationTestBase<T> extends LuceneTestCase {
  public final static String POLITICS_INPUT = "Here are some interesting questions and answers about Mitt Romney.. If you don't know the answer to the question about Mitt Romney, then simply click on the answer below the question section.";
  public final static String POLITICS_INPUT = "Here are some interesting questions and answers about Mitt Romney.. " +
      "If you don't know the answer to the question about Mitt Romney, then simply click on the answer below the question section.";
  public static final BytesRef POLITICS_RESULT = new BytesRef("politics");

  public static final String TECHNOLOGY_INPUT = "Much is made of what the likes of Facebook, Google and Apple know about users. Truth is, Amazon may know more.";
  public static final String TECHNOLOGY_INPUT = "Much is made of what the likes of Facebook, Google and Apple know about users." +
      " Truth is, Amazon may know more.";
  public static final BytesRef TECHNOLOGY_RESULT = new BytesRef("technology");

  private RandomIndexWriter indexWriter;
  private Directory dir;
  private FieldType ft;

  String textFieldName;
  String categoryFieldName;

@@ -61,6 +64,10 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
    textFieldName = "text";
    categoryFieldName = "cat";
    booleanFieldName = "bool";
    ft = new FieldType(TextField.TYPE_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectorPositions(true);
  }

  @Override

@@ -90,63 +97,35 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {
      atomicReader.close();
    }
  }
  protected void checkOnlineClassification(Classifier<T> classifier, String inputDoc, T expectedResult, Analyzer analyzer, String textFieldName, String classFieldName) throws Exception {
    checkOnlineClassification(classifier, inputDoc, expectedResult, analyzer, textFieldName, classFieldName, null);
  }

  protected void checkPerformance(Classifier<T> classifier, Analyzer analyzer, String classFieldName) throws Exception {
  protected void checkOnlineClassification(Classifier<T> classifier, String inputDoc, T expectedResult, Analyzer analyzer, String textFieldName, String classFieldName, Query query) throws Exception {
    AtomicReader atomicReader = null;
    long trainStart = System.currentTimeMillis();
    try {
      populatePerformanceIndex(analyzer);
      populateSampleIndex(analyzer);
      atomicReader = SlowCompositeReaderWrapper.wrap(indexWriter.getReader());
      classifier.train(atomicReader, textFieldName, classFieldName, analyzer);
      long trainEnd = System.currentTimeMillis();
      long trainTime = trainEnd - trainStart;
      assertTrue("training took more than 2 mins : " + trainTime / 1000 + "s", trainTime < 120000);
      classifier.train(atomicReader, textFieldName, classFieldName, analyzer, query);
      ClassificationResult<T> classificationResult = classifier.assignClass(inputDoc);
      assertNotNull(classificationResult.getAssignedClass());
      assertEquals("got an assigned class of " + classificationResult.getAssignedClass(), expectedResult, classificationResult.getAssignedClass());
      assertTrue("got a not positive score " + classificationResult.getScore(), classificationResult.getScore() > 0);
      updateSampleIndex(analyzer);
      ClassificationResult<T> secondClassificationResult = classifier.assignClass(inputDoc);
      assertEquals(classificationResult.getAssignedClass(), secondClassificationResult.getAssignedClass());
      assertEquals(Double.valueOf(classificationResult.getScore()), Double.valueOf(secondClassificationResult.getScore()));

    } finally {
      if (atomicReader != null)
        atomicReader.close();
    }
  }

  private void populatePerformanceIndex(Analyzer analyzer) throws IOException {
  private void populateSampleIndex(Analyzer analyzer) throws IOException {
    indexWriter.deleteAll();
    indexWriter.commit();

    FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectorPositions(true);
    int docs = 1000;
    Random random = random();
    for (int i = 0; i < docs; i++) {
      boolean b = random.nextBoolean();
      Document doc = new Document();
      doc.add(new Field(textFieldName, createRandomString(random), ft));
      doc.add(new Field(categoryFieldName, b ? "technology" : "politics", ft));
      doc.add(new Field(booleanFieldName, String.valueOf(b), ft));
      indexWriter.addDocument(doc, analyzer);
    }
    indexWriter.commit();
  }

  private String createRandomString(Random random) {
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < 20; i++) {
      builder.append(_TestUtil.randomSimpleString(random, 5));
      builder.append(" ");
    }
    return builder.toString();
  }

  private void populateSampleIndex(Analyzer analyzer) throws Exception {

    indexWriter.deleteAll();
    indexWriter.commit();

    FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectorPositions(true);

    String text;

    Document doc = new Document();

@@ -218,4 +197,112 @@ public abstract class ClassificationTestBase<T> extends LuceneTestCase {

    indexWriter.commit();
  }

  protected void checkPerformance(Classifier<T> classifier, Analyzer analyzer, String classFieldName) throws Exception {
    AtomicReader atomicReader = null;
    long trainStart = System.currentTimeMillis();
    try {
      populatePerformanceIndex(analyzer);
      atomicReader = SlowCompositeReaderWrapper.wrap(indexWriter.getReader());
      classifier.train(atomicReader, textFieldName, classFieldName, analyzer);
      long trainEnd = System.currentTimeMillis();
      long trainTime = trainEnd - trainStart;
      assertTrue("training took more than 2 mins : " + trainTime / 1000 + "s", trainTime < 120000);
    } finally {
      if (atomicReader != null)
        atomicReader.close();
    }
  }

  private void populatePerformanceIndex(Analyzer analyzer) throws IOException {
    indexWriter.deleteAll();
    indexWriter.commit();

    FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectorPositions(true);
    int docs = 1000;
    Random random = random();
    for (int i = 0; i < docs; i++) {
      boolean b = random.nextBoolean();
      Document doc = new Document();
      doc.add(new Field(textFieldName, createRandomString(random), ft));
      doc.add(new Field(categoryFieldName, b ? "technology" : "politics", ft));
      doc.add(new Field(booleanFieldName, String.valueOf(b), ft));
      indexWriter.addDocument(doc, analyzer);
    }
    indexWriter.commit();
  }

  private String createRandomString(Random random) {
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < 20; i++) {
      builder.append(_TestUtil.randomSimpleString(random, 5));
      builder.append(" ");
    }
    return builder.toString();
  }

  private void updateSampleIndex(Analyzer analyzer) throws Exception {

    String text;

    Document doc = new Document();
    text = "Warren Bennis says John F. Kennedy grasped a key lesson about the presidency that few have followed.";
    doc.add(new Field(textFieldName, text, ft));
    doc.add(new Field(categoryFieldName, "politics", ft));
    doc.add(new Field(booleanFieldName, "true", ft));

    indexWriter.addDocument(doc, analyzer);

    doc = new Document();
    text = "Julian Zelizer says Bill Clinton is still trying to shape his party, years after the White House, while George W. Bush opts for a much more passive role.";
    doc.add(new Field(textFieldName, text, ft));
    doc.add(new Field(categoryFieldName, "politics", ft));
    doc.add(new Field(booleanFieldName, "true", ft));
    indexWriter.addDocument(doc, analyzer);

    doc = new Document();
    text = "Crossfire: Sen. Tim Scott passes on Sen. Lindsey Graham endorsement";
    doc.add(new Field(textFieldName, text, ft));
    doc.add(new Field(categoryFieldName, "politics", ft));
    doc.add(new Field(booleanFieldName, "true", ft));
    indexWriter.addDocument(doc, analyzer);

    doc = new Document();
    text = "Illinois becomes 16th state to allow same-sex marriage.";
    doc.add(new Field(textFieldName, text, ft));
    doc.add(new Field(categoryFieldName, "politics", ft));
    doc.add(new Field(booleanFieldName, "true", ft));
    indexWriter.addDocument(doc, analyzer);

    doc = new Document();
    text = "Apple is developing iPhones with curved-glass screens and enhanced sensors that detect different levels of pressure, according to a new report.";
    doc.add(new Field(textFieldName, text, ft));
    doc.add(new Field(categoryFieldName, "technology", ft));
    doc.add(new Field(booleanFieldName, "false", ft));
    indexWriter.addDocument(doc, analyzer);

    doc = new Document();
    text = "The Xbox One is Microsoft's first new gaming console in eight years. It's a quality piece of hardware but it's also noteworthy because Microsoft is using it to make a statement.";
    doc.add(new Field(textFieldName, text, ft));
    doc.add(new Field(categoryFieldName, "technology", ft));
    doc.add(new Field(booleanFieldName, "false", ft));
    indexWriter.addDocument(doc, analyzer);

    doc = new Document();
    text = "Google says it will replace a Google Maps image after a California father complained it shows the body of his teen-age son, who was shot to death in 2009.";
    doc.add(new Field(textFieldName, text, ft));
    doc.add(new Field(categoryFieldName, "technology", ft));
    doc.add(new Field(booleanFieldName, "false", ft));
    indexWriter.addDocument(doc, analyzer);

    doc = new Document();
    text = "second unlabeled doc";
    doc.add(new Field(textFieldName, text, ft));
    indexWriter.addDocument(doc, analyzer);

    indexWriter.commit();
  }
}

@@ -29,7 +29,10 @@ public class KNearestNeighborClassifierTest extends ClassificationTestBase<BytesRef> {

  @Test
  public void testBasicUsage() throws Exception {
    checkCorrectClassification(new KNearestNeighborClassifier(1), TECHNOLOGY_INPUT, TECHNOLOGY_RESULT, new MockAnalyzer(random()), textFieldName, categoryFieldName);
    // usage with default MLT min docs / term freq
    checkCorrectClassification(new KNearestNeighborClassifier(3), POLITICS_INPUT, POLITICS_RESULT, new MockAnalyzer(random()), textFieldName, categoryFieldName);
    // usage with custom min docs / term freq for MLT
    checkCorrectClassification(new KNearestNeighborClassifier(3, 2, 1), TECHNOLOGY_INPUT, TECHNOLOGY_RESULT, new MockAnalyzer(random()), textFieldName, categoryFieldName);
  }

  @Test

@@ -159,7 +159,6 @@
  <property name="javac.source.backwards" value="1.7"/>
  <property name="javac.target.backwards" value="1.7"/>
  <property name="javac.args" value="-Xlint -Xlint:-deprecation -Xlint:-serial -Xlint:-options"/>
  <property name="bootclasspath" value=""/>
  <property name="javadoc.link" value="http://download.oracle.com/javase/7/docs/api/"/>
  <property name="javadoc.link.junit" value="http://junit.sourceforge.net/javadoc/"/>
  <property name="javadoc.packagelist.dir" location="${common.dir}/tools/javadoc"/>

@@ -169,6 +168,35 @@
  <property name="javadoc.dir" location="${common.dir}/build/docs"/>
  <property name="javadoc.maxmemory" value="512m" />
  <property name="javadoc.noindex" value="true"/>

  <!-- detect bootclasspath from given bootjdk path (including crazy AppleJDK special case) -->
  <first id="-boot-rt.jar">
    <fileset dir="${bootjdk}" erroronmissingdir="false" followsymlinks="true">
      <include name="jre/lib/rt.jar" /><!-- Oracle JDK -->
      <include name="lib/rt.jar" /><!-- Oracle JRE -->
      <include name="bundle/Classes/classes.jar" /><!-- Apple JDK -->
    </fileset>
  </first>
  <property name="bootclasspath" value="${toString:-boot-rt.jar}" />
  <fail message="Invalid 'bootjdk' parameter, because it contains no class library JAR: ${bootjdk}">
    <condition>
      <and>
        <isset property="bootjdk" />
        <equals arg1="${bootclasspath}" arg2=""/>
      </and>
    </condition>
  </fail>
  <fail message="Invalid 'bootclasspath' parameter, because it does not point to a valid class library JAR: ${bootclasspath}">
    <condition>
      <not>
        <or>
          <equals arg1="${bootclasspath}" arg2=""/>
          <available classname="java.lang.StringBuilder" classpath="${bootclasspath}" ignoresystemclasses="true"/>
        </or>
      </not>
    </condition>
  </fail>

  <!-- Javadoc classpath -->
  <path id="javadoc.classpath">
    <path refid="classpath"/>

@@ -355,7 +383,7 @@
  <target name="resolve" depends="ivy-availability-check,ivy-configure">
    <!-- todo, make this a property or something.
         only special cases need bundles -->
    <ivy:retrieve type="jar,bundle,tests" log="download-only"
    <ivy:retrieve type="jar,bundle,test,test-jar,tests" log="download-only"
                  conf="${ivy.default.configuration}" sync="${ivy.sync}"/>
  </target>

@@ -448,7 +476,7 @@
  <available property="jflex.present" classname="jflex.anttask.JFlexTask">
    <classpath refid="jflex.classpath"/>
  </available>
  <fail unless="jflex.present">
  <fail unless="jflex.present"> 
  ##################################################################
  JFlex not found.
  JFlex Home: ${jflex.home}

@@ -456,14 +484,14 @@
  Please install the jFlex 1.5 version (currently not released)
  from its SVN repository:

  svn co -r 623 http://jflex.svn.sourceforge.net/svnroot/jflex/trunk jflex
  svn co -r 722 https://svn.code.sf.net/p/jflex/code/trunk jflex
  cd jflex
  mvn install

  Then, create a build.properties file either in your home
  directory, or within the Lucene directory and set the jflex.home
  property to the path where the JFlex trunk checkout is located
  (in the above example its the directory called "jflex").
  (in the above example it's the directory called "jflex").

  ##################################################################
  </fail>

@@ -623,6 +651,7 @@
                 value="The Apache Software Foundation"/>
        <attribute name="X-Compile-Source-JDK" value="${javac.source}"/>
        <attribute name="X-Compile-Target-JDK" value="${javac.target}"/>
        <attribute name="Main-Class" value="${main.class}"/>
      </manifest>
    </sequential>
  </macrodef>

@@ -979,6 +1008,9 @@
      <!-- disable AWT while running tests -->
      <sysproperty key="java.awt.headless" value="true"/>

      <!-- turn jenkins blood red for hashmap bugs, even on jdk7 -->
      <sysproperty key="jdk.map.althashing.threshold" value="0"/>

      <!-- Only pass these to the test JVMs if defined in ANT. -->
      <syspropertyset>
        <propertyref prefix="tests.maxfailures" />

@@ -1331,7 +1363,7 @@ ${tests-output}/junit4-*.suites - per-JVM executed suites

]]></fail>
    <echo>Code coverage with Atlassian Clover enabled.</echo>
    <ivy:cachepath organisation="com.cenqua.clover" module="clover" revision="3.2.0-SNAPSHOT"
    <ivy:cachepath organisation="com.cenqua.clover" module="clover" revision="3.2.0"
      inline="true" conf="master" pathid="clover.classpath"/>
    <taskdef resource="cloverlib.xml" classpathref="clover.classpath" />
    <mkdir dir="${clover.db.dir}"/>

@@ -2168,7 +2200,7 @@ ${ant.project.name}.test.dependencies=${test.classpath.list}

  <!-- GROOVY scripting engine for ANT tasks -->
  <target name="resolve-groovy" unless="groovy.loaded" depends="ivy-availability-check,ivy-configure">
    <ivy:cachepath organisation="org.codehaus.groovy" module="groovy-all" revision="2.1.5"
    <ivy:cachepath organisation="org.codehaus.groovy" module="groovy-all" revision="2.2.1"
      inline="true" conf="default" type="jar" transitive="true" pathid="groovy.classpath"/>
    <taskdef name="groovy"
      classname="org.codehaus.groovy.ant.Groovy"

@@ -2182,7 +2214,7 @@ ${ant.project.name}.test.dependencies=${test.classpath.list}
  <property name="forbidden-sysout-excludes" value=""/>

  <target name="-install-forbidden-apis" unless="forbidden-apis.loaded" depends="ivy-availability-check,ivy-configure">
    <ivy:cachepath organisation="de.thetaphi" module="forbiddenapis" revision="1.3"
    <ivy:cachepath organisation="de.thetaphi" module="forbiddenapis" revision="1.4"
      inline="true" conf="default" transitive="true" pathid="forbidden-apis.classpath"/>
    <taskdef name="forbidden-apis" classname="de.thetaphi.forbiddenapis.AntTask" classpathref="forbidden-apis.classpath"/>
    <property name="forbidden-apis.loaded" value="true"/>

@@ -2226,7 +2258,7 @@ ${ant.project.name}.test.dependencies=${test.classpath.list}
  <!-- PEGDOWN macro: Before using depend on the target "resolve-pegdown,resolve-groovy" -->

  <target name="resolve-pegdown" unless="pegdown.loaded" depends="ivy-availability-check,ivy-configure">
    <ivy:cachepath organisation="org.pegdown" module="pegdown" revision="1.4.0"
    <ivy:cachepath organisation="org.pegdown" module="pegdown" revision="1.4.1"
      inline="true" conf="default" transitive="true" pathid="pegdown.classpath"/>
    <property name="pegdown.loaded" value="true"/>
  </target>

@@ -373,6 +373,10 @@ public final class CompressingStoredFieldsReader extends StoredFieldsReader {
    return compressionMode;
  }

  int getChunkSize() {
    return chunkSize;
  }

  ChunkIterator chunkIterator(int startDocID) throws IOException {
    ensureOpen();
    fieldsStream.seek(indexReader.getStartPointer(startDocID));

@@ -337,7 +337,9 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
      final Bits liveDocs = reader.getLiveDocs();

      if (matchingFieldsReader == null
          || matchingFieldsReader.getVersion() != VERSION_CURRENT) { // means reader version is not the same as the writer version
          || matchingFieldsReader.getVersion() != VERSION_CURRENT // means reader version is not the same as the writer version
          || matchingFieldsReader.getCompressionMode() != compressionMode
          || matchingFieldsReader.getChunkSize() != chunkSize) { // the way data is decompressed depends on the chunk size
        // naive merge...
        for (int i = nextLiveDoc(0, liveDocs, maxDoc); i < maxDoc; i = nextLiveDoc(i + 1, liveDocs, maxDoc)) {
          StoredDocument doc = reader.document(i);

@@ -362,8 +364,7 @@ public final class CompressingStoredFieldsWriter extends StoredFieldsWriter {
            startOffsets[i] = startOffsets[i - 1] + it.lengths[i - 1];
          }

          if (compressionMode == matchingFieldsReader.getCompressionMode() // same compression mode
              && numBufferedDocs == 0 // starting a new chunk
          if (numBufferedDocs == 0 // starting a new chunk
              && startOffsets[it.chunkDocs - 1] < chunkSize // chunk is small enough
              && startOffsets[it.chunkDocs - 1] + it.lengths[it.chunkDocs - 1] >= chunkSize // chunk is large enough
              && nextDeletedDoc(it.docBase, liveDocs, it.docBase + it.chunkDocs) == it.docBase + it.chunkDocs) { // no deletion in the chunk

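The two hunks above tighten the fast merge path: the getCompressionMode() and getChunkSize() checks move up into the top-level guard, so already-compressed chunks are only copied verbatim when the incoming segment matches the writer's format version, compression mode, and chunk size, and the chunk contains no deleted documents; anything else falls back to the naive document-by-document merge. Restated as a standalone predicate (a sketch with made-up parameter names, not the Lucene API):

// illustrative restatement of the bulk-copy precondition above;
// the parameter names are invented for clarity
final class BulkCopyGuard {
  static boolean canBulkCopy(int readerVersion, int writerVersion,
                             String readerCompression, String writerCompression,
                             int readerChunkSize, int writerChunkSize,
                             boolean chunkHasDeletions) {
    return readerVersion == writerVersion              // same on-disk layout
        && readerCompression.equals(writerCompression) // same compression mode
        && readerChunkSize == writerChunkSize          // decompression depends on chunk size
        && !chunkHasDeletions;                         // raw bytes would carry deleted docs along
  }
}
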
@@ -168,8 +168,9 @@ public abstract class IndexReader implements Closeable {
   * @see #tryIncRef
   */
  public final void incRef() {
    if (!tryIncRef()) {
      ensureOpen();
      refCount.incrementAndGet();
    }
  }

  /**

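The incRef() hunk above routes the increment through tryIncRef(), which only bumps a strictly positive reference count, so a reader whose count has already dropped to zero fails in ensureOpen() instead of being resurrected by a racing increment. Callers pair every incRef() with a decRef(); a small sketch of that discipline (the helper method and the work it does are illustrative, not part of the patch):

import java.io.IOException;
import org.apache.lucene.index.IndexReader;

final class ReaderRefDemo {
  // pin the reader for the duration of some read-side work;
  // maxDoc() here is just a placeholder for real searches
  static int countDocs(IndexReader reader) throws IOException {
    reader.incRef();     // throws AlreadyClosedException if the reader is already gone
    try {
      return reader.maxDoc();
    } finally {
      reader.decRef();   // closes the reader once the count reaches zero
    }
  }
}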