mirror of https://github.com/apache/lucene.git
LUCENE-3892: merge trunk
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1372366 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
commit
789981c9fd
|
@ -0,0 +1,49 @@
|
|||
<?xml version="1.0"?>
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<project name="clover" basedir=".">
|
||||
<import file="lucene/common-build.xml"/>
|
||||
|
||||
<!--
|
||||
Run after Junit tests.
|
||||
|
||||
This target is in a separate file, as it needs to include common-build.xml,
|
||||
but must run from top-level!
|
||||
-->
|
||||
<target name="generate-clover-reports" depends="clover">
|
||||
<fail unless="run.clover">Clover not enabled!</fail>
|
||||
<mkdir dir="${clover.report.dir}"/>
|
||||
<fileset dir="." id="clover.test.result.files">
|
||||
<include name="*/build/**/test/TEST-*.xml"/>
|
||||
<exclude name="lucene/build/backwards/**"/>
|
||||
</fileset>
|
||||
<clover-report>
|
||||
<current outfile="${clover.report.dir}" title="${final.name}" numThreads="0">
|
||||
<format type="html" filter="assert"/>
|
||||
<testresults refid="clover.test.result.files"/>
|
||||
</current>
|
||||
<current outfile="${clover.report.dir}/clover.xml" title="${final.name}">
|
||||
<format type="xml" filter="assert"/>
|
||||
<testresults refid="clover.test.result.files"/>
|
||||
</current>
|
||||
</clover-report>
|
||||
<echo>You can find the merged Lucene/Solr Clover report in '${clover.report.dir}'.</echo>
|
||||
</target>
|
||||
|
||||
</project>
|
134
build.xml
134
build.xml
|
@ -51,11 +51,28 @@
|
|||
</sequential>
|
||||
</target>
|
||||
|
||||
<target name="validate" description="Validate dependencies, licenses, etc.">
|
||||
<sequential><subant target="validate" inheritall="false" failonerror="true">
|
||||
<fileset dir="lucene" includes="build.xml" />
|
||||
<fileset dir="solr" includes="build.xml" />
|
||||
</subant></sequential>
|
||||
<target name="validate" description="Validate dependencies, licenses, etc." depends="-validate-source-patterns">
|
||||
<subant target="validate" inheritall="false" failonerror="true">
|
||||
<fileset dir="lucene" includes="build.xml" />
|
||||
<fileset dir="solr" includes="build.xml" />
|
||||
</subant>
|
||||
</target>
|
||||
|
||||
<target name="-validate-source-patterns" unless="disable.source-patterns">
|
||||
<!-- check that there are no nocommits or @author javadoc tags: -->
|
||||
<property name="validate.currDir" location="."/>
|
||||
<pathconvert pathsep="${line.separator}" dirsep="/" property="validate.patternsFound" setonempty="false">
|
||||
<fileset dir="${validate.currDir}">
|
||||
<include name="**/*.java"/>
|
||||
<exclude name="**/backwards/**"/>
|
||||
<or>
|
||||
<containsregexp expression="@author\b" casesensitive="yes"/>
|
||||
<containsregexp expression="\bno(n|)commit\b" casesensitive="no"/>
|
||||
</or>
|
||||
</fileset>
|
||||
<map from="${validate.currDir}${file.separator}" to="* "/>
|
||||
</pathconvert>
|
||||
<fail if="validate.patternsFound">The following files contain @author tags or nocommits:${line.separator}${validate.patternsFound}</fail>
|
||||
</target>
|
||||
|
||||
<target name="rat-sources" description="Runs rat across all sources and tests">
|
||||
|
@ -184,4 +201,111 @@
|
|||
</subant>
|
||||
</sequential>
|
||||
</target>
|
||||
|
||||
<!-- define here, as common-build is not included! -->
|
||||
<property name="python32.exe" value="python3.2" />
|
||||
<property name="fakeRelease" value="lucene/build/fakeRelease"/>
|
||||
<property name="fakeReleaseTmp" value="lucene/build/fakeReleaseTmp"/>
|
||||
<property name="fakeReleaseVersion" value="5.0"/> <!-- *not* -SNAPSHOT, the real version -->
|
||||
|
||||
<target name="nightly-smoke" description="Builds an unsigned release and smoke tests it." depends="clean">
|
||||
<sequential>
|
||||
<fail unless="JAVA6_HOME">JAVA6_HOME property is not defined.</fail>
|
||||
<fail unless="JAVA7_HOME">JAVA7_HOME property is not defined.</fail>
|
||||
<subant target="prepare-release-no-sign" inheritall="false" failonerror="true">
|
||||
<fileset dir="lucene" includes="build.xml" />
|
||||
<fileset dir="solr" includes="build.xml" />
|
||||
<property name="version" value="${fakeReleaseVersion}" />
|
||||
</subant>
|
||||
<delete dir="${fakeRelease}"/>
|
||||
<delete dir="${fakeReleaseTmp}"/>
|
||||
<mkdir dir="${fakeRelease}"/>
|
||||
<copy todir="${fakeRelease}/lucene">
|
||||
<fileset dir="lucene/dist"/>
|
||||
</copy>
|
||||
<copy todir="${fakeRelease}/lucene/changes">
|
||||
<fileset dir="lucene/build/docs/changes"/>
|
||||
</copy>
|
||||
<get src="http://people.apache.org/keys/group/lucene.asc"
|
||||
dest="${fakeRelease}/lucene/KEYS"/>
|
||||
<copy todir="${fakeRelease}/solr">
|
||||
<fileset dir="solr/package"/>
|
||||
</copy>
|
||||
<copy file="${fakeRelease}/lucene/KEYS" todir="${fakeRelease}/solr"/>
|
||||
<makeurl file="${fakeRelease}" validate="false" property="fakeRelease.uri"/>
|
||||
<exec executable="${python32.exe}" failonerror="true">
|
||||
<arg value="-u"/>
|
||||
<arg value="dev-tools/scripts/smokeTestRelease.py"/>
|
||||
<arg value="${fakeRelease.uri}"/>
|
||||
<arg value="${fakeReleaseVersion}"/>
|
||||
<arg value="${fakeReleaseTmp}"/>
|
||||
<arg value="false"/>
|
||||
<env key="JAVA6_HOME" value="${JAVA6_HOME}"/>
|
||||
<env key="JAVA7_HOME" value="${JAVA7_HOME}"/>
|
||||
</exec>
|
||||
<delete dir="${fakeRelease}"/>
|
||||
<delete dir="${fakeReleaseTmp}"/>
|
||||
</sequential>
|
||||
</target>
|
||||
|
||||
<!-- Calls only generate-clover-reports on Lucene, as Solr's is just a clone with other target; the database itsself is fixed -->
|
||||
<target name="generate-clover-reports">
|
||||
<subant target="generate-clover-reports" inheritall="false" failonerror="true">
|
||||
<fileset dir="." includes="build-clover.xml" />
|
||||
</subant>
|
||||
</target>
|
||||
|
||||
<!-- Jenkins tasks -->
|
||||
<target name="jenkins-hourly" depends="clean,test,validate,-jenkins-javadocs-lint,-svn-status"/>
|
||||
|
||||
<target name="jenkins-clover">
|
||||
<antcall target="-jenkins-clover">
|
||||
<param name="run.clover" value="true"/>
|
||||
<!-- must be 1, as clover does not like parallel test runs: -->
|
||||
<param name="tests.jvms" value="1"/>
|
||||
<!-- Also override some other props to be fast, ignoring what's set on command line: -->
|
||||
<param name="tests.multiplier" value="1"/>
|
||||
<param name="tests.slow" value="false"/>
|
||||
<param name="tests.nightly" value="false"/>
|
||||
<param name="tests.weekly" value="false"/>
|
||||
<param name="tests.multiplier" value="1"/>
|
||||
</antcall>
|
||||
</target>
|
||||
<target name="-jenkins-clover" depends="clean,test,generate-clover-reports"/>
|
||||
|
||||
<!-- we need this extra condition, as we want to match only on "true", not solely if property is set: -->
|
||||
<property name="disable.javadocs-lint" value="false" />
|
||||
<condition property="-disable.javadocs-lint">
|
||||
<equals arg1="${disable.javadocs-lint}" arg2="true"/>
|
||||
</condition>
|
||||
<target name="-jenkins-javadocs-lint" unless="-disable.javadocs-lint">
|
||||
<antcall target="javadocs-lint"/>
|
||||
</target>
|
||||
|
||||
<!-- define here, as common-build is not included! -->
|
||||
<property name="svn.exe" value="svn" />
|
||||
|
||||
<target name="-svn-status">
|
||||
<exec executable="${svn.exe}" dir="." failonerror="true">
|
||||
<arg value="status"/>
|
||||
<redirector outputproperty="svn.status.output">
|
||||
<outputfilterchain>
|
||||
<linecontainsregexp>
|
||||
<regexp pattern="^\?" />
|
||||
</linecontainsregexp>
|
||||
<tokenfilter>
|
||||
<replaceregex pattern="^........" replace="* " />
|
||||
<replacestring from="${file.separator}" to="/" />
|
||||
</tokenfilter>
|
||||
</outputfilterchain>
|
||||
</redirector>
|
||||
</exec>
|
||||
<fail message="Source checkout is dirty after running tests!!! Offending files:${line.separator}${svn.status.output}">
|
||||
<condition>
|
||||
<not>
|
||||
<equals arg1="${svn.status.output}" arg2=""/>
|
||||
</not>
|
||||
</condition>
|
||||
</fail>
|
||||
</target>
|
||||
</project>
|
||||
|
|
|
@ -174,6 +174,6 @@
|
|||
<classpathentry kind="lib" path="solr/contrib/velocity/lib/commons-beanutils-1.7.0.jar"/>
|
||||
<classpathentry kind="lib" path="solr/contrib/velocity/lib/commons-collections-3.2.1.jar"/>
|
||||
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
|
||||
<classpathentry kind="lib" path="lucene/test-framework/lib/randomizedtesting-runner-1.6.0.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/test-framework/lib/randomizedtesting-runner-2.0.0.rc5.jar"/>
|
||||
<classpathentry kind="output" path="bin/other"/>
|
||||
</classpath>
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
<library name="JUnit">
|
||||
<CLASSES>
|
||||
<root url="jar://$PROJECT_DIR$/lucene/test-framework/lib/junit-4.10.jar!/" />
|
||||
<root url="jar://$PROJECT_DIR$/lucene/test-framework/lib/randomizedtesting-runner-1.6.0.jar!/" />
|
||||
<root url="jar://$PROJECT_DIR$/lucene/test-framework/lib/randomizedtesting-runner-2.0.0.rc5.jar!/" />
|
||||
</CLASSES>
|
||||
<JAVADOC />
|
||||
<SOURCES />
|
||||
|
|
|
@ -36,27 +36,25 @@ A. How to use nightly Jenkins-built Lucene/Solr Maven artifacts
|
|||
|
||||
B. How to generate Lucene/Solr Maven artifacts
|
||||
|
||||
Prerequisites: JDK 1.6+ and Ant 1.7.X
|
||||
Prerequisites: JDK 1.6+ and Ant 1.8.2+
|
||||
|
||||
Run 'ant generate-maven-artifacts' to create an internal Maven
|
||||
repository, including POMs, binary .jars, source .jars, and javadoc
|
||||
.jars.
|
||||
|
||||
You can run the above command in four possible places: the top-level
|
||||
directory; under lucene/; under solr/; or under modules/. From the
|
||||
top-level directory, from lucene/, or from modules/, the internal
|
||||
repository will be located at dist/maven/. From solr/, the internal
|
||||
repository will be located at package/maven/.
|
||||
You can run the above command in three possible places: the top-level
|
||||
directory; under lucene/; or under solr/. From the top-level directory
|
||||
or from lucene/, the internal repository will be located at dist/maven/.
|
||||
From solr/, the internal repository will be located at package/maven/.
|
||||
|
||||
|
||||
C. How to deploy Maven artifacts to a repository
|
||||
|
||||
Prerequisites: JDK 1.6+ and Ant 1.7.X
|
||||
Prerequisites: JDK 1.6+ and Ant 1.8.2+
|
||||
|
||||
You can deploy targets for all of Lucene/Solr, only Lucene, only Solr,
|
||||
or only modules/, as in B. above. To deploy to a Maven repository, the
|
||||
command is the same as in B. above, with the addition of two system
|
||||
properties:
|
||||
You can deploy targets for all of Lucene/Solr, only Lucene, or only Solr,
|
||||
as in B. above. To deploy to a Maven repository, the command is the same
|
||||
as in B. above, with the addition of two system properties:
|
||||
|
||||
ant -Dm2.repository.id=my-repo-id \
|
||||
-Dm2.repository.url=http://example.org/my/repo \
|
||||
|
@ -101,7 +99,7 @@ D. How to use Maven to build Lucene/Solr
|
|||
the default, you can supply an alternate version on the command line
|
||||
with the above command, e.g.:
|
||||
|
||||
ant -Dversion=5.0-my-special-version get-maven-poms
|
||||
ant -Dversion=my-special-version get-maven-poms
|
||||
|
||||
Note: if you change the version in the POMs, there is one test method
|
||||
that will fail under maven-surefire-plugin:
|
||||
|
|
|
@ -37,15 +37,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
@ -77,33 +71,5 @@
|
|||
</excludes>
|
||||
</testResource>
|
||||
</testResources>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>appassembler-maven-plugin</artifactId>
|
||||
<configuration>
|
||||
<extraJvmArguments>-Xmx128M</extraJvmArguments>
|
||||
<repositoryLayout>flat</repositoryLayout>
|
||||
<platforms>
|
||||
<platform>windows</platform>
|
||||
<platform>unix</platform>
|
||||
</platforms>
|
||||
<programs>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.analysis.charfilter.HtmlStripCharFilter</mainClass>
|
||||
<name>HtmlStripCharFilter</name>
|
||||
</program>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.analysis.en.PorterStemmer</mainClass>
|
||||
<name>EnglishPorterStemmer</name>
|
||||
</program>
|
||||
<program>
|
||||
<mainClass>org.tartarus.snowball.TestApp</mainClass>
|
||||
<name>SnowballTestApp</name>
|
||||
</program>
|
||||
</programs>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
||||
|
|
|
@ -40,15 +40,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
|
|
@ -39,15 +39,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
|
|
@ -39,15 +39,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
@ -75,6 +69,11 @@
|
|||
<build>
|
||||
<sourceDirectory>${module-path}/src/java</sourceDirectory>
|
||||
<testSourceDirectory>${module-path}/src/test</testSourceDirectory>
|
||||
<resources>
|
||||
<resource>
|
||||
<directory>${module-path}/src/resources</directory>
|
||||
</resource>
|
||||
</resources>
|
||||
<testResources>
|
||||
<testResource>
|
||||
<directory>${project.build.testSourceDirectory}</directory>
|
||||
|
|
|
@ -39,15 +39,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
@ -75,6 +69,11 @@
|
|||
<build>
|
||||
<sourceDirectory>${module-path}/src/java</sourceDirectory>
|
||||
<testSourceDirectory>${module-path}/src/test</testSourceDirectory>
|
||||
<resources>
|
||||
<resource>
|
||||
<directory>${module-path}/src/resources</directory>
|
||||
</resource>
|
||||
</resources>
|
||||
<testResources>
|
||||
<testResource>
|
||||
<directory>${project.build.testSourceDirectory}</directory>
|
||||
|
|
|
@ -37,15 +37,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
|
|
@ -37,15 +37,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
|
|
@ -41,15 +41,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
|
|
@ -37,15 +37,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
@ -120,41 +114,5 @@
|
|||
</includes>
|
||||
</testResource>
|
||||
</testResources>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>appassembler-maven-plugin</artifactId>
|
||||
<configuration>
|
||||
<extraJvmArguments>-Xmx128M</extraJvmArguments>
|
||||
<repositoryLayout>flat</repositoryLayout>
|
||||
<platforms>
|
||||
<platform>windows</platform>
|
||||
<platform>unix</platform>
|
||||
</platforms>
|
||||
<programs>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.benchmark.byTask.Benchmark</mainClass>
|
||||
<name>Benchmark</name>
|
||||
</program>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.benchmark.quality.trec.QueryDriver</mainClass>
|
||||
<name>QueryDriver</name>
|
||||
</program>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.benchmark.quality.utils.QualityQueriesFinder</mainClass>
|
||||
<name>QualityQueriesFinder</name>
|
||||
</program>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.benchmark.utils.ExtractReuters</mainClass>
|
||||
<name>ExtractReuters</name>
|
||||
</program>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.benchmark.utils.ExtractWikipedia</mainClass>
|
||||
<name>ExtractWikipedia</name>
|
||||
</program>
|
||||
</programs>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
||||
|
|
|
@ -37,15 +37,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
@ -104,40 +98,6 @@
|
|||
</systemPropertyVariables>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>appassembler-maven-plugin</artifactId>
|
||||
<configuration>
|
||||
<extraJvmArguments>-Xmx128M</extraJvmArguments>
|
||||
<repositoryLayout>flat</repositoryLayout>
|
||||
<platforms>
|
||||
<platform>windows</platform>
|
||||
<platform>unix</platform>
|
||||
</platforms>
|
||||
<programs>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.index.CheckIndex</mainClass>
|
||||
<name>CheckIndex</name>
|
||||
</program>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.index.IndexReader</mainClass>
|
||||
<name>IndexReader</name>
|
||||
</program>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.store.LockStressTest</mainClass>
|
||||
<name>LockStressTest</name>
|
||||
</program>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.store.LockVerifyServer</mainClass>
|
||||
<name>LockVerifyServer</name>
|
||||
</program>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.util.English</mainClass>
|
||||
<name>English</name>
|
||||
</program>
|
||||
</programs>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>build-helper-maven-plugin</artifactId>
|
||||
|
|
|
@ -37,15 +37,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
@ -87,30 +81,5 @@
|
|||
</excludes>
|
||||
</testResource>
|
||||
</testResources>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>appassembler-maven-plugin</artifactId>
|
||||
<configuration>
|
||||
<extraJvmArguments>-Xmx128M</extraJvmArguments>
|
||||
<repositoryLayout>flat</repositoryLayout>
|
||||
<assembleDirectory>${build-directory}</assembleDirectory>
|
||||
<platforms>
|
||||
<platform>windows</platform>
|
||||
<platform>unix</platform>
|
||||
</platforms>
|
||||
<programs>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.demo.IndexFiles</mainClass>
|
||||
<name>IndexFiles</name>
|
||||
</program>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.demo.SearchFiles</mainClass>
|
||||
<name>SearchFiles</name>
|
||||
</program>
|
||||
</programs>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
||||
|
|
|
@ -39,15 +39,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
|
|
@ -37,15 +37,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
|
|
@ -39,15 +39,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
|
|
@ -37,15 +37,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
|
|
@ -39,15 +39,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
|
|
@ -37,15 +37,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
@ -72,49 +66,5 @@
|
|||
</excludes>
|
||||
</testResource>
|
||||
</testResources>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>appassembler-maven-plugin</artifactId>
|
||||
<configuration>
|
||||
<extraJvmArguments>-Xmx128M</extraJvmArguments>
|
||||
<repositoryLayout>flat</repositoryLayout>
|
||||
<platforms>
|
||||
<platform>windows</platform>
|
||||
<platform>unix</platform>
|
||||
</platforms>
|
||||
<programs>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.index.FieldNormModifier</mainClass>
|
||||
<name>FieldNormModifier</name>
|
||||
</program>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.index.IndexSplitter</mainClass>
|
||||
<name>IndexSplitter</name>
|
||||
</program>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.index.MultiPassIndexSplitter</mainClass>
|
||||
<name>MultiPassIndexSplitter</name>
|
||||
</program>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.misc.GetTermInfo</mainClass>
|
||||
<name>GetTermInfo</name>
|
||||
</program>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.misc.HighFreqTerms</mainClass>
|
||||
<name>HighFreqTerms</name>
|
||||
</program>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.misc.IndexMergeTool</mainClass>
|
||||
<name>IndexMergeTool</name>
|
||||
</program>
|
||||
<program>
|
||||
<mainClass>org.apache.lucene.misc.LengthNormModifier</mainClass>
|
||||
<name>LengthNormModifier</name>
|
||||
</program>
|
||||
</programs>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
||||
|
|
|
@ -35,15 +35,9 @@
|
|||
<module-directory>lucene</module-directory>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<modules>
|
||||
<module>core</module>
|
||||
|
|
|
@ -37,15 +37,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
|
|
@ -37,15 +37,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
|
|
@ -37,15 +37,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
|
|
@ -37,15 +37,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
|
|
@ -37,15 +37,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
|
|
@ -31,15 +31,18 @@
|
|||
<version>@version@</version>
|
||||
<packaging>pom</packaging>
|
||||
<name>Grandparent POM for Apache Lucene Core and Apache Solr</name>
|
||||
<description>Parent POM for Apache Lucene Core and Apache Solr</description>
|
||||
<url>http://lucene.apache.org/java</url>
|
||||
<description>Grandparent POM for Apache Lucene Core and Apache Solr</description>
|
||||
<url>http://lucene.apache.org</url>
|
||||
<modules>
|
||||
<module>lucene</module>
|
||||
<module>solr</module>
|
||||
</modules>
|
||||
<properties>
|
||||
<top-level>..</top-level>
|
||||
<base.specification.version>4.0.0</base.specification.version>
|
||||
<vc-anonymous-base-url>http://svn.apache.org/repos/asf/lucene/dev/trunk</vc-anonymous-base-url>
|
||||
<vc-dev-base-url>https://svn.apache.org/repos/asf/lucene/dev/trunk</vc-dev-base-url>
|
||||
<vc-browse-base-url>http://svn.apache.org/viewvc/lucene/dev/trunk</vc-browse-base-url>
|
||||
<base.specification.version>5.0.0</base.specification.version>
|
||||
<maven.build.timestamp.format>yyyy-MM-dd HH:mm:ss</maven.build.timestamp.format>
|
||||
<java.compat.version>1.6</java.compat.version>
|
||||
<jetty.version>8.1.2.v20120308</jetty.version>
|
||||
|
@ -69,11 +72,11 @@
|
|||
</properties>
|
||||
<issueManagement>
|
||||
<system>JIRA</system>
|
||||
<url>http://issues.apache.org/jira/browse/LUCENE</url>
|
||||
<url>https://issues.apache.org/jira/browse/LUCENE</url>
|
||||
</issueManagement>
|
||||
<ciManagement>
|
||||
<system>Hudson</system>
|
||||
<url>http://lucene.zones.apache.org:8080/hudson/job/Lucene-Nightly/</url>
|
||||
<system>Jenkins</system>
|
||||
<url>https://builds.apache.org/computer/lucene/</url>
|
||||
</ciManagement>
|
||||
<mailingLists>
|
||||
<mailingList>
|
||||
|
@ -109,15 +112,9 @@
|
|||
</mailingLists>
|
||||
<inceptionYear>2000</inceptionYear>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}</developerConnection>
|
||||
<url>${vc-browse-base-url}</url>
|
||||
</scm>
|
||||
<licenses>
|
||||
<license>
|
||||
|
@ -388,7 +385,7 @@
|
|||
<dependency>
|
||||
<groupId>com.carrotsearch.randomizedtesting</groupId>
|
||||
<artifactId>randomizedtesting-runner</artifactId>
|
||||
<version>1.6.0</version>
|
||||
<version>2.0.0.rc5</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</dependencyManagement>
|
||||
|
@ -549,11 +546,6 @@
|
|||
</archive>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>appassembler-maven-plugin</artifactId>
|
||||
<version>1.2.1</version>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>build-helper-maven-plugin</artifactId>
|
||||
|
|
|
@ -35,18 +35,11 @@
|
|||
<module-directory>solr/contrib/analysis-extras</module-directory>
|
||||
<top-level>../../../..</top-level>
|
||||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
<surefire-top-level>${top-level}/../..</surefire-top-level>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
@ -101,17 +94,12 @@
|
|||
<testResource>
|
||||
<directory>${top-level}/solr/core/src/test-files</directory>
|
||||
</testResource>
|
||||
<testResource>
|
||||
<directory>${top-level}/dev-tools/maven/solr</directory>
|
||||
<includes>
|
||||
<include>maven.testlogging.properties</include>
|
||||
</includes>
|
||||
</testResource>
|
||||
</testResources>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<configuration>
|
||||
<systemPropertyVariables>
|
||||
<java.util.logging.config.file>${surefire-top-level}/solr/testlogging.properties</java.util.logging.config.file>
|
||||
</systemPropertyVariables>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
||||
|
|
|
@ -35,18 +35,11 @@
|
|||
<module-directory>solr/contrib/clustering</module-directory>
|
||||
<top-level>../../../..</top-level>
|
||||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
<surefire-top-level>${top-level}/../..</surefire-top-level>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
@ -106,17 +99,12 @@
|
|||
<testResource>
|
||||
<directory>${top-level}/solr/core/src/test-files</directory>
|
||||
</testResource>
|
||||
<testResource>
|
||||
<directory>${top-level}/dev-tools/maven/solr</directory>
|
||||
<includes>
|
||||
<include>maven.testlogging.properties</include>
|
||||
</includes>
|
||||
</testResource>
|
||||
</testResources>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<configuration>
|
||||
<systemPropertyVariables>
|
||||
<java.util.logging.config.file>${surefire-top-level}/solr/testlogging.properties</java.util.logging.config.file>
|
||||
</systemPropertyVariables>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
||||
|
|
|
@ -35,18 +35,11 @@
|
|||
<module-directory>solr/contrib/dataimporthandler-extras</module-directory>
|
||||
<top-level>../../../..</top-level>
|
||||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
<surefire-top-level>${top-level}/../..</surefire-top-level>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
@ -104,17 +97,12 @@
|
|||
<testResource>
|
||||
<directory>${top-level}/solr/core/src/test-files</directory>
|
||||
</testResource>
|
||||
<testResource>
|
||||
<directory>${top-level}/dev-tools/maven/solr</directory>
|
||||
<includes>
|
||||
<include>maven.testlogging.properties</include>
|
||||
</includes>
|
||||
</testResource>
|
||||
</testResources>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<configuration>
|
||||
<systemPropertyVariables>
|
||||
<java.util.logging.config.file>${surefire-top-level}/solr/testlogging.properties</java.util.logging.config.file>
|
||||
</systemPropertyVariables>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
||||
|
|
|
@ -35,18 +35,11 @@
|
|||
<module-directory>solr/contrib/dataimporthandler</module-directory>
|
||||
<top-level>../../../..</top-level>
|
||||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
<surefire-top-level>${top-level}/../..</surefire-top-level>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
@ -90,6 +83,12 @@
|
|||
<testResource>
|
||||
<directory>${top-level}/solr/core/src/test-files</directory>
|
||||
</testResource>
|
||||
<testResource>
|
||||
<directory>${top-level}/dev-tools/maven/solr</directory>
|
||||
<includes>
|
||||
<include>maven.testlogging.properties</include>
|
||||
</includes>
|
||||
</testResource>
|
||||
</testResources>
|
||||
<plugins>
|
||||
<plugin>
|
||||
|
@ -103,15 +102,6 @@
|
|||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<configuration>
|
||||
<systemPropertyVariables>
|
||||
<java.util.logging.config.file>${surefire-top-level}/solr/testlogging.properties</java.util.logging.config.file>
|
||||
</systemPropertyVariables>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
||||
|
|
|
@ -38,18 +38,11 @@
|
|||
<module-directory>solr/contrib/extraction</module-directory>
|
||||
<top-level>../../../..</top-level>
|
||||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
<surefire-top-level>${top-level}/../..</surefire-top-level>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
@ -102,17 +95,12 @@
|
|||
<testResource>
|
||||
<directory>${top-level}/solr/core/src/test-files</directory>
|
||||
</testResource>
|
||||
<testResource>
|
||||
<directory>${top-level}/dev-tools/maven/solr</directory>
|
||||
<includes>
|
||||
<include>maven.testlogging.properties</include>
|
||||
</includes>
|
||||
</testResource>
|
||||
</testResources>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<configuration>
|
||||
<systemPropertyVariables>
|
||||
<java.util.logging.config.file>${surefire-top-level}/solr/testlogging.properties</java.util.logging.config.file>
|
||||
</systemPropertyVariables>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
||||
|
|
|
@ -39,18 +39,11 @@
|
|||
<module-directory>solr/contrib/langid</module-directory>
|
||||
<top-level>../../../..</top-level>
|
||||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
<surefire-top-level>${top-level}/../..</surefire-top-level>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
@ -107,17 +100,12 @@
|
|||
<testResource>
|
||||
<directory>${top-level}/solr/core/src/test-files</directory>
|
||||
</testResource>
|
||||
<testResource>
|
||||
<directory>${top-level}/dev-tools/maven/solr</directory>
|
||||
<includes>
|
||||
<include>maven.testlogging.properties</include>
|
||||
</includes>
|
||||
</testResource>
|
||||
</testResources>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<configuration>
|
||||
<systemPropertyVariables>
|
||||
<java.util.logging.config.file>${surefire-top-level}/solr/testlogging.properties</java.util.logging.config.file>
|
||||
</systemPropertyVariables>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
||||
|
|
|
@ -35,18 +35,11 @@
|
|||
<module-directory>solr/contrib/uima</module-directory>
|
||||
<top-level>../../../..</top-level>
|
||||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
<surefire-top-level>${top-level}/../..</surefire-top-level>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
@ -121,17 +114,12 @@
|
|||
<testResource>
|
||||
<directory>${module-path}/src/test-files</directory>
|
||||
</testResource>
|
||||
<testResource>
|
||||
<directory>${top-level}/dev-tools/maven/solr</directory>
|
||||
<includes>
|
||||
<include>maven.testlogging.properties</include>
|
||||
</includes>
|
||||
</testResource>
|
||||
</testResources>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<configuration>
|
||||
<systemPropertyVariables>
|
||||
<java.util.logging.config.file>${surefire-top-level}/solr/testlogging.properties</java.util.logging.config.file>
|
||||
</systemPropertyVariables>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
||||
|
|
|
@ -35,18 +35,11 @@
|
|||
<module-directory>solr/contrib/velocity</module-directory>
|
||||
<top-level>../../../..</top-level>
|
||||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
<surefire-top-level>${top-level}/../..</surefire-top-level>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
@ -142,17 +135,12 @@
|
|||
<testResource>
|
||||
<directory>${top-level}/solr/core/src/test-files</directory>
|
||||
</testResource>
|
||||
<testResource>
|
||||
<directory>${top-level}/dev-tools/maven/solr</directory>
|
||||
<includes>
|
||||
<include>maven.testlogging.properties</include>
|
||||
</includes>
|
||||
</testResource>
|
||||
</testResources>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<configuration>
|
||||
<systemPropertyVariables>
|
||||
<java.util.logging.config.file>${surefire-top-level}/solr/testlogging.properties</java.util.logging.config.file>
|
||||
</systemPropertyVariables>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
||||
|
|
|
@ -35,18 +35,11 @@
|
|||
<module-directory>solr/core</module-directory>
|
||||
<top-level>../../..</top-level>
|
||||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
<surefire-top-level>${top-level}/../..</surefire-top-level>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
@ -243,48 +236,14 @@
|
|||
<testResource>
|
||||
<directory>${top-level}/solr/solrj/src/test-files</directory>
|
||||
</testResource>
|
||||
<testResource>
|
||||
<directory>${top-level}/dev-tools/maven/solr</directory>
|
||||
<includes>
|
||||
<include>maven.testlogging.properties</include>
|
||||
</includes>
|
||||
</testResource>
|
||||
</testResources>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<configuration>
|
||||
<systemPropertyVariables>
|
||||
<java.util.logging.config.file>${surefire-top-level}/solr/testlogging.properties</java.util.logging.config.file>
|
||||
</systemPropertyVariables>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>appassembler-maven-plugin</artifactId>
|
||||
<configuration>
|
||||
<extraJvmArguments>-Xmx128M</extraJvmArguments>
|
||||
<repositoryLayout>flat</repositoryLayout>
|
||||
<platforms>
|
||||
<platform>windows</platform>
|
||||
<platform>unix</platform>
|
||||
</platforms>
|
||||
<programs>
|
||||
<program>
|
||||
<mainClass>org.apache.solr.client.solrj.embedded.JettySolrRunner</mainClass>
|
||||
<name>JettySolrRunner</name>
|
||||
</program>
|
||||
<program>
|
||||
<mainClass>org.apache.solr.util.BitSetPerf</mainClass>
|
||||
<name>BitSetPerf</name>
|
||||
<extraJvmArguments>-Xms128m -Xbatch</extraJvmArguments>
|
||||
</program>
|
||||
<program>
|
||||
<mainClass>org.apache.solr.util.SimplePostTool</mainClass>
|
||||
<name>SimplePostTool</name>
|
||||
</program>
|
||||
<program>
|
||||
<mainClass>org.apache.solr.util.SuggestMissingFactories</mainClass>
|
||||
<name>SuggestMissingFactories</name>
|
||||
</program>
|
||||
</programs>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.codehaus.mojo</groupId>
|
||||
<artifactId>build-helper-maven-plugin</artifactId>
|
||||
|
|
|
@@ -0,0 +1,2 @@
handlers=java.util.logging.ConsoleHandler
.level=SEVERE
|
|
@ -43,26 +43,14 @@
|
|||
<module-directory>solr</module-directory>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<issueManagement>
|
||||
<system>JIRA</system>
|
||||
<url>http://issues.apache.org/jira/browse/SOLR</url>
|
||||
<url>https://issues.apache.org/jira/browse/SOLR</url>
|
||||
</issueManagement>
|
||||
<ciManagement>
|
||||
<system>Hudson</system>
|
||||
<url>
|
||||
http://lucene.zones.apache.org:8080/hudson/job/Solr-Nightly/
|
||||
</url>
|
||||
</ciManagement>
|
||||
<mailingLists>
|
||||
<mailingList>
|
||||
<name>Solr User List</name>
|
||||
|
@ -111,6 +99,15 @@
|
|||
<doctitle>${project.name} ${project.version} API (${now.version})</doctitle>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-surefire-plugin</artifactId>
|
||||
<configuration>
|
||||
<systemPropertyVariables>
|
||||
<java.util.logging.config.file>../test-classes/maven.testlogging.properties</java.util.logging.config.file>
|
||||
</systemPropertyVariables>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</pluginManagement>
|
||||
</build>
|
||||
|
|
|
@ -37,15 +37,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
|
|
@ -37,15 +37,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<!-- These dependencies are compile scope because this is a test framework. -->
|
||||
|
@ -60,20 +54,29 @@
|
|||
<artifactId>solr-core</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>javax.servlet</groupId>
|
||||
<artifactId>servlet-api</artifactId>
|
||||
<!-- SOLR-3263: Provided scope is required to avoid jar signing conflicts -->
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
</dependency>
|
||||
<!-- If your tests don't use BaseDistributedSearchTestCase or SolrJettyTestBase,
|
||||
you can exclude the three Jetty dependencies below. -->
|
||||
<dependency>
|
||||
<groupId>org.eclipse.jetty</groupId>
|
||||
<artifactId>jetty-server</artifactId>
|
||||
<scope>runtime</scope>
|
||||
<artifactId>jetty-servlet</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.eclipse.jetty</groupId>
|
||||
<artifactId>jetty-util</artifactId>
|
||||
</dependency>
|
||||
<!-- If your tests don't use BaseDistributedSearchTestCase or SolrJettyTestBase,
|
||||
you can exclude the two Jetty dependencies below. -->
|
||||
<dependency>
|
||||
<groupId>org.eclipse.jetty</groupId>
|
||||
<artifactId>jetty-server</artifactId>
|
||||
<scope>runtime</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
|
|
|
@ -37,15 +37,9 @@
|
|||
<module-path>${top-level}/${module-directory}</module-path>
|
||||
</properties>
|
||||
<scm>
|
||||
<connection>
|
||||
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</connection>
|
||||
<developerConnection>
|
||||
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
|
||||
</developerConnection>
|
||||
<url>
|
||||
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
|
||||
</url>
|
||||
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
|
||||
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
|
||||
<url>${vc-browse-base-url}/${module-directory}</url>
|
||||
</scm>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
|
|
|
@@ -58,7 +58,7 @@ def javaExe(version):

def verifyJavaVersion(version):
s = os.popen('%s; java -version 2>&1' % javaExe(version)).read()
if s.find('java version "%s.' % version) == -1:
if s.find(' version "%s.' % version) == -1:
raise RuntimeError('got wrong version for java %s:\n%s' % (version, s))

# http://s.apache.org/lusolr32rc2

@@ -363,6 +363,10 @@ def verifyDigests(artifact, urlString, tmpDir):
raise RuntimeError('SHA1 digest mismatch for %s: expected %s but got %s' % (artifact, sha1Expected, sha1Actual))

def getDirEntries(urlString):
if urlString.startswith('file:/') and not urlString.startswith('file://'):
# stupid bogus ant URI
urlString = "file:///" + urlString[6:]

if urlString.startswith('file://'):
path = urlString[7:]
if path.endswith('/'):

@@ -1026,7 +1030,7 @@ def crawl(downloadedFiles, urlString, targetDir, exclusions=set()):

def main():

if len(sys.argv) != 4:
if len(sys.argv) < 4:
print()
print('Usage python -u %s BaseURL version tmpDir' % sys.argv[0])
print()

@@ -1035,8 +1039,11 @@ def main():
baseURL = sys.argv[1]
version = sys.argv[2]
tmpDir = os.path.abspath(sys.argv[3])
isSigned = True
if len(sys.argv) == 5:
isSigned = (sys.argv[4] == "True")

smokeTest(baseURL, version, tmpDir, True)
smokeTest(baseURL, version, tmpDir, isSigned)

def smokeTest(baseURL, version, tmpDir, isSigned):

@@ -1090,4 +1097,5 @@ if __name__ == '__main__':
except:
import traceback
traceback.print_exc()

sys.exit(1)
sys.exit(0)
|
|
|
@@ -6,6 +6,56 @@ http://s.apache.org/luceneversions

======================= Lucene 5.0.0 =======================

======================= Lucene 4.0.0 =======================

New Features

* LUCENE-1888: Added the option to store payloads in the term
  vectors (IndexableFieldType.storeTermVectorPayloads()). Note
  that you must store term vector positions to store payloads.
  (Robert Muir)

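A minimal sketch of enabling this option on a field, assuming the 4.0 FieldType and TextField APIs; the class, method, and field name below are illustrative only, and positions must be enabled before payloads can be stored, as the entry notes:

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;

class TermVectorPayloadsExample {
  // Hypothetical helper, not part of this commit: builds a document whose
  // "body" field stores term vectors with positions and payloads.
  static Document makeDoc(String text) {
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);  // required before payloads can be stored
    ft.setStoreTermVectorPayloads(true);   // the option added by LUCENE-1888
    ft.freeze();

    Document doc = new Document();
    doc.add(new Field("body", text, ft));
    return doc;
  }
}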
API Changes

* LUCENE-4299: Added Terms.hasPositions() and Terms.hasOffsets().
  Previously you had no real way to know that a term vector field
  had positions or offsets, since this can be configured on a
  per-field-per-document basis. (Robert Muir)

* Removed DocsAndPositionsEnum.hasPayload() and simplified the
  contract of getPayload(). It returns null if there is no payload,
  otherwise returns the current payload. You can now call it multiple
  times per position if you want. (Robert Muir)

* Removed FieldsEnum. Fields API instead implements Iterable<String>
  and exposes Iterator, so you can iterate over field names with
  for (String field : fields) instead. (Robert Muir)

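A small sketch of how the new accessors might be consumed when reading per-document term vectors, assuming the 4.0 IndexReader.getTermVector API; the field name "body" and the surrounding class are placeholders:

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;

class TermVectorInspectionExample {
  // Hypothetical check, not part of this commit: only walk positions/offsets
  // when the per-document term vector actually recorded them.
  static void inspect(IndexReader reader, int docID) throws IOException {
    Terms vector = reader.getTermVector(docID, "body");
    if (vector != null && vector.hasPositions()) {
      // safe to pull a positions enum for this document's "body" vector
    }
    if (vector != null && vector.hasOffsets()) {
      // offsets were recorded for this document's "body" vector
    }
  }
}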
Bug Fixes

* LUCENE-4297: BooleanScorer2 would multiply the coord() factor
  twice for conjunctions: for most users this is no problem, but
  if you had a customized Similarity that returned something other
  than 1 when overlap == maxOverlap (always the case for conjunctions),
  then the score would be incorrect. (Pascal Chollet, Robert Muir)

* LUCENE-4298: MultiFields.getTermDocsEnum(IndexReader, Bits, String, BytesRef)
  did not work at all, it would infinitely recurse.
  (Alberto Paro via Robert Muir)

* LUCENE-4300: BooleanQuery's rewrite was not always safe: if you
  had a custom Similarity where coord(1,1) != 1F, then the rewritten
  query would be scored differently. (Robert Muir)

* Don't allow negatives in the positions file. If you have an index
  from 2.4.0 or earlier with such negative positions, and you already
  upgraded to 3.x, then to Lucene 4.0-ALPHA or -BETA, you should run
  CheckIndex. If it fails, then you need to upgrade again to 4.0 (Robert Muir)

Build

* LUCENE-3985: Upgrade to randomizedtesting 2.0.0. Added support for
  thread leak detection. Added support for suite timeouts. (Dawid Weiss)

======================= Lucene 4.0.0-BETA =======================


@@ -47,6 +97,11 @@ New features
  int docID), to attempt deletion by docID as long as the provided
  reader is an NRT reader, and the segment has not yet been merged
  away (Mike McCandless).

* LUCENE-4286: Added option to CJKBigramFilter to always also output
  unigrams. This can be used for a unigram+bigram approach, or at
  index-time only for better support of short queries.
  (Tom Burton-West, Robert Muir)

API Changes


@@ -115,6 +170,10 @@ Optimizations
  making them substantially more lightweight. Behavior is unchanged.
  (Robert Muir)

* LUCENE-4291: Reduced internal buffer size for Jflex-based tokenizers
  such as StandardTokenizer from 32kb to 8kb.
  (Raintung Li, Steven Rowe, Robert Muir)

Bug Fixes

* LUCENE-4109: BooleanQueries are not parsed correctly with the

@@ -164,6 +223,9 @@ Bug Fixes
* LUCENE-4282: Automaton FuzzyQuery didnt always deliver all results.
  (Johannes Christen, Uwe Schindler, Robert Muir)

* LUCENE-4289: Fix minor idf inconsistencies/inefficiencies in highlighter.
  (Robert Muir)

Changes in Runtime Behavior

* LUCENE-4109: Enable position increments in the flexible queryparser by default.
|
|
|
@@ -9,7 +9,7 @@ enumeration APIs. Here are the major changes:
  by the BytesRef class (which provides an offset + length "slice"
  into an existing byte[]).

* Fields are separately enumerated (FieldsEnum) from the terms
* Fields are separately enumerated (Fields.iterator()) from the terms
  within each field (TermEnum). So instead of this:

    TermEnum termsEnum = ...;

@@ -20,10 +20,8 @@ enumeration APIs. Here are the major changes:

  Do this:

    FieldsEnum fieldsEnum = ...;
    String field;
    while((field = fieldsEnum.next()) != null) {
      TermsEnum termsEnum = fieldsEnum.terms();
    for(String field : fields) {
      TermsEnum termsEnum = fields.terms(field);
      BytesRef text;
      while((text = termsEnum.next()) != null) {
        System.out.println("field=" + field + "; text=" + text.utf8ToString());

@@ -316,11 +314,12 @@ an AtomicReader. Note: using "atomicity emulators" can cause serious
slowdowns due to the need to merge terms, postings, DocValues, and
FieldCache, use them with care!

## LUCENE-2413: Analyzer package changes
## LUCENE-2413,LUCENE-3396: Analyzer package changes

Lucene's core and contrib analyzers, along with Solr's analyzers,
were consolidated into lucene/analysis. During the refactoring some
package names have changed:
package names have changed, and ReusableAnalyzerBase was renamed to
Analyzer:

  - o.a.l.analysis.KeywordAnalyzer -> o.a.l.analysis.core.KeywordAnalyzer
  - o.a.l.analysis.KeywordTokenizer -> o.a.l.analysis.core.KeywordTokenizer

@@ -345,7 +344,7 @@ package names have changed:
  - o.a.l.analysis.NormalizeCharMap -> o.a.l.analysis.charfilter.NormalizeCharMap
  - o.a.l.analysis.CharArraySet -> o.a.l.analysis.util.CharArraySet
  - o.a.l.analysis.CharArrayMap -> o.a.l.analysis.util.CharArrayMap
  - o.a.l.analysis.ReusableAnalyzerBase -> o.a.l.analysis.util.ReusableAnalyzerBase
  - o.a.l.analysis.ReusableAnalyzerBase -> o.a.l.analysis.Analyzer
  - o.a.l.analysis.StopwordAnalyzerBase -> o.a.l.analysis.util.StopwordAnalyzerBase
  - o.a.l.analysis.WordListLoader -> o.a.l.analysis.util.WordListLoader
  - o.a.l.analysis.CharTokenizer -> o.a.l.analysis.util.CharTokenizer
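The MIGRATE snippet above is abbreviated; expanded into a compilable sketch under the assumption that in the 4.0 API Fields.terms(field) returns a Terms from which a TermsEnum is pulled (reader construction omitted, class and method names illustrative):

import java.io.IOException;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

class FieldIterationExample {
  // Hypothetical walk over all fields and terms of a reader, mirroring the
  // MIGRATE.txt snippet above; MultiFields.getFields may return null for an
  // empty reader.
  static void dumpTerms(IndexReader reader) throws IOException {
    Fields fields = MultiFields.getFields(reader);
    if (fields == null) {
      return;
    }
    for (String field : fields) {
      Terms terms = fields.terms(field);
      TermsEnum termsEnum = terms.iterator(null);
      BytesRef text;
      while ((text = termsEnum.next()) != null) {
        System.out.println("field=" + field + "; text=" + text.utf8ToString());
      }
    }
  }
}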
|
|
|
@ -1,4 +1,4 @@
|
|||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/26/12 6:22 PM */
|
||||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
|
||||
|
||||
package org.apache.lucene.analysis.charfilter;
|
||||
|
||||
|
@ -40,8 +40,8 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
|
|||
/**
|
||||
* This class is a scanner generated by
|
||||
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
|
||||
* on 7/26/12 6:22 PM from the specification file
|
||||
* <tt>C:/svn/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
|
||||
* on 8/6/12 11:57 AM from the specification file
|
||||
* <tt>/home/rmuir/workspace/lucene-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
|
||||
*/
|
||||
public final class HTMLStripCharFilter extends BaseCharFilter {
|
||||
|
||||
|
@ -31255,6 +31255,93 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
{ yybegin(STYLE);
|
||||
}
|
||||
case 55: break;
|
||||
case 27:
|
||||
{ // add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return BLOCK_LEVEL_START_TAG_REPLACEMENT;
|
||||
}
|
||||
case 56: break;
|
||||
case 30:
|
||||
{ int length = yylength();
|
||||
inputSegment.write(zzBuffer, zzStartRead, length);
|
||||
entitySegment.clear();
|
||||
char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
|
||||
entitySegment.append(ch);
|
||||
outputSegment = entitySegment;
|
||||
yybegin(CHARACTER_REFERENCE_TAIL);
|
||||
}
|
||||
case 57: break;
|
||||
case 48:
|
||||
{ inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
// add (previously matched input length) -- current match and substitution handled below
|
||||
cumulativeDiff += yychar - inputStart;
|
||||
// position the offset correction at (already output length) -- substitution handled below
|
||||
int offsetCorrectionPos = outputCharCount;
|
||||
int returnValue;
|
||||
if (escapeSTYLE) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
returnValue = outputSegment.nextChar();
|
||||
} else {
|
||||
// add (this match length) - (substitution length)
|
||||
cumulativeDiff += yylength() - 1;
|
||||
// add (substitution length)
|
||||
++offsetCorrectionPos;
|
||||
returnValue = STYLE_REPLACEMENT;
|
||||
}
|
||||
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
|
||||
return returnValue;
|
||||
}
|
||||
case 58: break;
|
||||
case 8:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
if (null != escapedTags
|
||||
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
|
||||
yybegin(START_TAG_TAIL_INCLUDE);
|
||||
} else {
|
||||
yybegin(START_TAG_TAIL_SUBSTITUTE);
|
||||
}
|
||||
}
|
||||
case 59: break;
|
||||
case 2:
|
||||
{ inputStart = yychar;
|
||||
inputSegment.clear();
|
||||
inputSegment.append('<');
|
||||
yybegin(LEFT_ANGLE_BRACKET);
|
||||
}
|
||||
case 60: break;
|
||||
case 44:
|
||||
{ restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
|
||||
}
|
||||
case 61: break;
|
||||
case 21:
|
||||
{ previousRestoreState = restoreState;
|
||||
restoreState = SERVER_SIDE_INCLUDE;
|
||||
yybegin(SINGLE_QUOTED_STRING);
|
||||
}
|
||||
case 62: break;
|
||||
case 11:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
yybegin(LEFT_ANGLE_BRACKET_SPACE);
|
||||
}
|
||||
case 63: break;
|
||||
case 35:
|
||||
{ yybegin(SCRIPT);
|
||||
}
|
||||
case 64: break;
|
||||
case 42:
|
||||
{ restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE);
|
||||
}
|
||||
case 65: break;
|
||||
case 10:
|
||||
{ inputSegment.append('!'); yybegin(BANG);
|
||||
}
|
||||
case 66: break;
|
||||
case 51:
|
||||
{ // Handle paired UTF-16 surrogates.
|
||||
String surrogatePair = yytext();
|
||||
|
@ -31288,13 +31375,331 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
inputSegment.append('#');
|
||||
yybegin(NUMERIC_CHARACTER);
|
||||
}
|
||||
case 56: break;
|
||||
case 21:
|
||||
case 67: break;
|
||||
case 4:
|
||||
{ yypushback(1);
|
||||
outputSegment = inputSegment;
|
||||
outputSegment.restart();
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
case 68: break;
|
||||
case 43:
|
||||
{ restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
|
||||
}
|
||||
case 69: break;
|
||||
case 52:
|
||||
{ // Handle paired UTF-16 surrogates.
|
||||
String surrogatePair = yytext();
|
||||
char highSurrogate = '\u0000';
|
||||
try { // High surrogates are in decimal range [55296, 56319]
|
||||
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing high surrogate '"
|
||||
+ surrogatePair.substring(1, 6) + "'";
|
||||
}
|
||||
if (Character.isHighSurrogate(highSurrogate)) {
|
||||
outputSegment = entitySegment;
|
||||
outputSegment.clear();
|
||||
try {
|
||||
outputSegment.unsafeWrite
|
||||
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing low surrogate '"
|
||||
+ surrogatePair.substring(10, 14) + "'";
|
||||
}
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return highSurrogate;
|
||||
}
|
||||
yypushback(surrogatePair.length() - 1); // Consume only '#'
|
||||
inputSegment.append('#');
|
||||
yybegin(NUMERIC_CHARACTER);
|
||||
}
|
||||
case 70: break;
|
||||
case 28:
|
||||
{ restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
|
||||
}
|
||||
case 71: break;
|
||||
case 50:
|
||||
{ // Handle paired UTF-16 surrogates.
|
||||
outputSegment = entitySegment;
|
||||
outputSegment.clear();
|
||||
String surrogatePair = yytext();
|
||||
char highSurrogate = '\u0000';
|
||||
try {
|
||||
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing high surrogate '"
|
||||
+ surrogatePair.substring(2, 6) + "'";
|
||||
}
|
||||
try {
|
||||
outputSegment.unsafeWrite
|
||||
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing low surrogate '"
|
||||
+ surrogatePair.substring(10, 14) + "'";
|
||||
}
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return highSurrogate;
|
||||
}
|
||||
case 72: break;
|
||||
case 16:
|
||||
{ restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
|
||||
}
|
||||
case 73: break;
|
||||
case 22:
|
||||
{ previousRestoreState = restoreState;
|
||||
restoreState = SERVER_SIDE_INCLUDE;
|
||||
yybegin(SINGLE_QUOTED_STRING);
|
||||
yybegin(DOUBLE_QUOTED_STRING);
|
||||
}
|
||||
case 57: break;
|
||||
case 74: break;
|
||||
case 26:
|
||||
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||
cumulativeDiff += inputSegment.length() + yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
outputSegment = inputSegment;
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
case 75: break;
|
||||
case 20:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
}
|
||||
case 76: break;
|
||||
case 47:
|
||||
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||
cumulativeDiff += inputSegment.length() + yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(CDATA);
|
||||
}
|
||||
case 77: break;
|
||||
case 33:
|
||||
{ yybegin(YYINITIAL);
|
||||
if (escapeBR) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
return outputSegment.nextChar();
|
||||
} else {
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||
inputSegment.reset();
|
||||
return BR_START_TAG_REPLACEMENT;
|
||||
}
|
||||
}
|
||||
case 78: break;
|
||||
case 23:
|
||||
{ yybegin(restoreState); restoreState = previousRestoreState;
|
||||
}
|
||||
case 79: break;
|
||||
case 32:
|
||||
{ yybegin(COMMENT);
|
||||
}
|
||||
case 80: break;
|
||||
case 24:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
case 81: break;
|
||||
case 3:
|
||||
{ inputStart = yychar;
|
||||
inputSegment.clear();
|
||||
inputSegment.append('&');
|
||||
yybegin(AMPERSAND);
|
||||
}
|
||||
case 82: break;
|
||||
case 46:
|
||||
{ yybegin(SCRIPT);
|
||||
if (escapeSCRIPT) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
inputStart += 1 + yylength();
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
}
|
||||
case 83: break;
|
||||
case 14:
|
||||
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||
cumulativeDiff += inputSegment.length() + yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
case 84: break;
|
||||
case 6:
|
||||
{ int matchLength = yylength();
|
||||
inputSegment.write(zzBuffer, zzStartRead, matchLength);
|
||||
if (matchLength <= 7) { // 0x10FFFF = 1114111: max 7 decimal chars
|
||||
String decimalCharRef = yytext();
|
||||
int codePoint = 0;
|
||||
try {
|
||||
codePoint = Integer.parseInt(decimalCharRef);
|
||||
} catch(Exception e) {
|
||||
assert false: "Exception parsing code point '" + decimalCharRef + "'";
|
||||
}
|
||||
if (codePoint <= 0x10FFFF) {
|
||||
outputSegment = entitySegment;
|
||||
outputSegment.clear();
|
||||
if (codePoint >= Character.MIN_SURROGATE
|
||||
&& codePoint <= Character.MAX_SURROGATE) {
|
||||
outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
|
||||
} else {
|
||||
outputSegment.setLength
|
||||
(Character.toChars(codePoint, outputSegment.getArray(), 0));
|
||||
}
|
||||
yybegin(CHARACTER_REFERENCE_TAIL);
|
||||
} else {
|
||||
outputSegment = inputSegment;
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
} else {
|
||||
outputSegment = inputSegment;
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
}
|
||||
case 85: break;
|
||||
case 34:
|
||||
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
|
||||
cumulativeDiff += yychar - inputStart + yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
case 86: break;
|
||||
case 5:
|
||||
{ inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
|
||||
}
|
||||
case 87: break;
|
||||
case 13:
|
||||
{ inputSegment.append(zzBuffer[zzStartRead]);
|
||||
}
|
||||
case 88: break;
|
||||
case 18:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
if (null != escapedTags
|
||||
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
|
||||
yybegin(END_TAG_TAIL_INCLUDE);
|
||||
} else {
|
||||
yybegin(END_TAG_TAIL_SUBSTITUTE);
|
||||
}
|
||||
}
|
||||
case 89: break;
|
||||
case 40:
|
||||
{ yybegin(SCRIPT_COMMENT);
|
||||
}
|
||||
case 90: break;
|
||||
case 37:
|
||||
{ // add (this match length) [ - (substitution length) = 0 ]
|
||||
cumulativeDiff += yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
case 91: break;
|
||||
case 12:
|
||||
{ inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
|
||||
}
|
||||
case 92: break;
|
||||
case 9:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
if (null != escapedTags
|
||||
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
|
||||
yybegin(START_TAG_TAIL_INCLUDE);
|
||||
} else {
|
||||
yybegin(START_TAG_TAIL_EXCLUDE);
|
||||
}
|
||||
}
|
||||
case 93: break;
|
||||
case 49:
|
||||
{ inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
// add (previously matched input length) -- current match and substitution handled below
|
||||
cumulativeDiff += yychar - inputStart;
|
||||
// position at (already output length) -- substitution handled below
|
||||
int offsetCorrectionPos = outputCharCount;
|
||||
int returnValue;
|
||||
if (escapeSCRIPT) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
returnValue = outputSegment.nextChar();
|
||||
} else {
|
||||
// add (this match length) - (substitution length)
|
||||
cumulativeDiff += yylength() - 1;
|
||||
// add (substitution length)
|
||||
++offsetCorrectionPos;
|
||||
returnValue = SCRIPT_REPLACEMENT;
|
||||
}
|
||||
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
|
||||
return returnValue;
|
||||
}
|
||||
case 94: break;
|
||||
case 29:
|
||||
{ restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
|
||||
}
|
||||
case 95: break;
|
||||
case 17:
|
||||
{ restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
|
||||
}
|
||||
case 96: break;
|
||||
case 45:
|
||||
{ yybegin(STYLE);
|
||||
if (escapeSTYLE) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
inputStart += 1 + yylength();
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
}
|
||||
case 97: break;
|
||||
case 7:
|
||||
{ // add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
case 98: break;
|
||||
case 19:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
if (null != escapedTags
|
||||
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
|
||||
yybegin(END_TAG_TAIL_INCLUDE);
|
||||
} else {
|
||||
yybegin(END_TAG_TAIL_EXCLUDE);
|
||||
}
|
||||
}
|
||||
case 99: break;
|
||||
case 25:
|
||||
{ // add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return BLOCK_LEVEL_END_TAG_REPLACEMENT;
|
||||
}
|
||||
case 100: break;
|
||||
case 31:
|
||||
{ int matchLength = yylength();
|
||||
inputSegment.write(zzBuffer, zzStartRead, matchLength);
|
||||
|
@ -31329,66 +31734,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
return outputSegment.nextChar();
|
||||
}
|
||||
}
|
||||
case 58: break;
|
||||
case 19:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
if (null != escapedTags
|
||||
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
|
||||
yybegin(END_TAG_TAIL_INCLUDE);
|
||||
} else {
|
||||
yybegin(END_TAG_TAIL_EXCLUDE);
|
||||
}
|
||||
}
|
||||
case 59: break;
|
||||
case 2:
|
||||
{ inputStart = yychar;
|
||||
inputSegment.clear();
|
||||
inputSegment.append('<');
|
||||
yybegin(LEFT_ANGLE_BRACKET);
|
||||
}
|
||||
case 60: break;
|
||||
case 27:
|
||||
{ // add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return BLOCK_LEVEL_START_TAG_REPLACEMENT;
|
||||
}
|
||||
case 61: break;
|
||||
case 44:
|
||||
{ restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
|
||||
}
|
||||
case 62: break;
|
||||
case 35:
|
||||
{ yybegin(SCRIPT);
|
||||
}
|
||||
case 63: break;
|
||||
case 42:
|
||||
{ restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE);
|
||||
}
|
||||
case 64: break;
|
||||
case 10:
|
||||
{ inputSegment.append('!'); yybegin(BANG);
|
||||
}
|
||||
case 65: break;
|
||||
case 33:
|
||||
{ yybegin(YYINITIAL);
|
||||
if (escapeBR) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
return outputSegment.nextChar();
|
||||
} else {
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||
inputSegment.reset();
|
||||
return BR_START_TAG_REPLACEMENT;
|
||||
}
|
||||
}
|
||||
case 66: break;
|
||||
case 101: break;
|
||||
case 53:
|
||||
{ // Handle paired UTF-16 surrogates.
|
||||
String surrogatePair = yytext();
|
||||
|
@ -31424,288 +31770,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
inputSegment.append('#');
|
||||
yybegin(NUMERIC_CHARACTER);
|
||||
}
|
||||
case 67: break;
|
||||
case 43:
|
||||
{ restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
|
||||
}
|
||||
case 68: break;
|
||||
case 30:
|
||||
{ int length = yylength();
|
||||
inputSegment.write(zzBuffer, zzStartRead, length);
|
||||
entitySegment.clear();
|
||||
char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
|
||||
entitySegment.append(ch);
|
||||
outputSegment = entitySegment;
|
||||
yybegin(CHARACTER_REFERENCE_TAIL);
|
||||
}
|
||||
case 69: break;
|
||||
case 28:
|
||||
{ restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
|
||||
}
|
||||
case 70: break;
|
||||
case 3:
|
||||
{ inputStart = yychar;
|
||||
inputSegment.clear();
|
||||
inputSegment.append('&');
|
||||
yybegin(AMPERSAND);
|
||||
}
|
||||
case 71: break;
|
||||
case 16:
|
||||
{ restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
|
||||
}
|
||||
case 72: break;
|
||||
case 52:
|
||||
{ // Handle paired UTF-16 surrogates.
|
||||
String surrogatePair = yytext();
|
||||
char highSurrogate = '\u0000';
|
||||
try { // High surrogates are in decimal range [55296, 56319]
|
||||
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing high surrogate '"
|
||||
+ surrogatePair.substring(1, 6) + "'";
|
||||
}
|
||||
if (Character.isHighSurrogate(highSurrogate)) {
|
||||
outputSegment = entitySegment;
|
||||
outputSegment.clear();
|
||||
try {
|
||||
outputSegment.unsafeWrite
|
||||
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing low surrogate '"
|
||||
+ surrogatePair.substring(10, 14) + "'";
|
||||
}
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return highSurrogate;
|
||||
}
|
||||
yypushback(surrogatePair.length() - 1); // Consume only '#'
|
||||
inputSegment.append('#');
|
||||
yybegin(NUMERIC_CHARACTER);
|
||||
}
|
||||
case 73: break;
|
||||
case 6:
|
||||
{ int matchLength = yylength();
|
||||
inputSegment.write(zzBuffer, zzStartRead, matchLength);
|
||||
if (matchLength <= 7) { // 0x10FFFF = 1114111: max 7 decimal chars
|
||||
String decimalCharRef = yytext();
|
||||
int codePoint = 0;
|
||||
try {
|
||||
codePoint = Integer.parseInt(decimalCharRef);
|
||||
} catch(Exception e) {
|
||||
assert false: "Exception parsing code point '" + decimalCharRef + "'";
|
||||
}
|
||||
if (codePoint <= 0x10FFFF) {
|
||||
outputSegment = entitySegment;
|
||||
outputSegment.clear();
|
||||
if (codePoint >= Character.MIN_SURROGATE
|
||||
&& codePoint <= Character.MAX_SURROGATE) {
|
||||
outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
|
||||
} else {
|
||||
outputSegment.setLength
|
||||
(Character.toChars(codePoint, outputSegment.getArray(), 0));
|
||||
}
|
||||
yybegin(CHARACTER_REFERENCE_TAIL);
|
||||
} else {
|
||||
outputSegment = inputSegment;
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
} else {
|
||||
outputSegment = inputSegment;
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
}
|
||||
case 74: break;
|
||||
case 37:
|
||||
{ // add (this match length) [ - (substitution length) = 0 ]
|
||||
cumulativeDiff += yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
case 75: break;
|
||||
case 8:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
if (null != escapedTags
|
||||
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
|
||||
yybegin(START_TAG_TAIL_INCLUDE);
|
||||
} else {
|
||||
yybegin(START_TAG_TAIL_SUBSTITUTE);
|
||||
}
|
||||
}
|
||||
case 76: break;
|
||||
case 46:
|
||||
{ yybegin(SCRIPT);
|
||||
if (escapeSCRIPT) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
inputStart += 1 + yylength();
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
}
|
||||
case 77: break;
|
||||
case 11:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
yybegin(LEFT_ANGLE_BRACKET_SPACE);
|
||||
}
|
||||
case 78: break;
|
||||
case 20:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
}
|
||||
case 79: break;
|
||||
case 34:
|
||||
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
|
||||
cumulativeDiff += yychar - inputStart + yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
case 80: break;
|
||||
case 23:
|
||||
{ yybegin(restoreState); restoreState = previousRestoreState;
|
||||
}
|
||||
case 81: break;
|
||||
case 32:
|
||||
{ yybegin(COMMENT);
|
||||
}
|
||||
case 82: break;
|
||||
case 14:
|
||||
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||
cumulativeDiff += inputSegment.length() + yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
case 83: break;
|
||||
case 18:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
if (null != escapedTags
|
||||
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
|
||||
yybegin(END_TAG_TAIL_INCLUDE);
|
||||
} else {
|
||||
yybegin(END_TAG_TAIL_SUBSTITUTE);
|
||||
}
|
||||
}
|
||||
case 84: break;
|
||||
case 25:
|
||||
{ // add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 1;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return BLOCK_LEVEL_END_TAG_REPLACEMENT;
|
||||
}
|
||||
case 85: break;
|
||||
case 7:
|
||||
{ // add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
case 86: break;
|
||||
case 48:
|
||||
{ inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
// add (previously matched input length) -- current match and substitution handled below
|
||||
cumulativeDiff += yychar - inputStart;
|
||||
// position the offset correction at (already output length) -- substitution handled below
|
||||
int offsetCorrectionPos = outputCharCount;
|
||||
int returnValue;
|
||||
if (escapeSTYLE) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
returnValue = outputSegment.nextChar();
|
||||
} else {
|
||||
// add (this match length) - (substitution length)
|
||||
cumulativeDiff += yylength() - 1;
|
||||
// add (substitution length)
|
||||
++offsetCorrectionPos;
|
||||
returnValue = STYLE_REPLACEMENT;
|
||||
}
|
||||
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
|
||||
return returnValue;
|
||||
}
|
||||
case 87: break;
|
||||
case 5:
|
||||
{ inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
|
||||
}
|
||||
case 88: break;
|
||||
case 26:
|
||||
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||
cumulativeDiff += inputSegment.length() + yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
outputSegment = inputSegment;
|
||||
yybegin(YYINITIAL);
|
||||
}
|
||||
case 89: break;
|
||||
case 13:
|
||||
{ inputSegment.append(zzBuffer[zzStartRead]);
|
||||
}
|
||||
case 90: break;
|
||||
case 50:
|
||||
{ // Handle paired UTF-16 surrogates.
|
||||
outputSegment = entitySegment;
|
||||
outputSegment.clear();
|
||||
String surrogatePair = yytext();
|
||||
char highSurrogate = '\u0000';
|
||||
try {
|
||||
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing high surrogate '"
|
||||
+ surrogatePair.substring(2, 6) + "'";
|
||||
}
|
||||
try {
|
||||
outputSegment.unsafeWrite
|
||||
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
|
||||
} catch(Exception e) { // should never happen
|
||||
assert false: "Exception parsing low surrogate '"
|
||||
+ surrogatePair.substring(10, 14) + "'";
|
||||
}
|
||||
// add (previously matched input length) + (this match length) - (substitution length)
|
||||
cumulativeDiff += inputSegment.length() + yylength() - 2;
|
||||
// position the correction at (already output length) + (substitution length)
|
||||
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
return highSurrogate;
|
||||
}
|
||||
case 91: break;
|
||||
case 40:
|
||||
{ yybegin(SCRIPT_COMMENT);
|
||||
}
|
||||
case 92: break;
|
||||
case 45:
|
||||
{ yybegin(STYLE);
|
||||
if (escapeSTYLE) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
inputStart += 1 + yylength();
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
}
|
||||
case 93: break;
|
||||
case 22:
|
||||
{ previousRestoreState = restoreState;
|
||||
restoreState = SERVER_SIDE_INCLUDE;
|
||||
yybegin(DOUBLE_QUOTED_STRING);
|
||||
}
|
||||
case 94: break;
|
||||
case 12:
|
||||
{ inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
|
||||
}
|
||||
case 95: break;
|
||||
case 102: break;
|
||||
case 36:
|
||||
{ yybegin(YYINITIAL);
|
||||
if (escapeBR) {
|
||||
|
@ -31721,83 +31786,18 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
|
|||
return BR_END_TAG_REPLACEMENT;
|
||||
}
|
||||
}
|
||||
case 96: break;
|
||||
case 24:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
case 97: break;
|
||||
case 47:
|
||||
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
|
||||
cumulativeDiff += inputSegment.length() + yylength();
|
||||
// position the correction at (already output length) [ + (substitution length) = 0 ]
|
||||
addOffCorrectMap(outputCharCount, cumulativeDiff);
|
||||
inputSegment.clear();
|
||||
yybegin(CDATA);
|
||||
}
|
||||
case 98: break;
|
||||
case 29:
|
||||
{ restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
|
||||
}
|
||||
case 99: break;
|
||||
case 17:
|
||||
{ restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
|
||||
}
|
||||
case 100: break;
|
||||
case 9:
|
||||
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
if (null != escapedTags
|
||||
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
|
||||
yybegin(START_TAG_TAIL_INCLUDE);
|
||||
} else {
|
||||
yybegin(START_TAG_TAIL_EXCLUDE);
|
||||
}
|
||||
}
|
||||
case 101: break;
|
||||
case 49:
|
||||
{ inputSegment.clear();
|
||||
yybegin(YYINITIAL);
|
||||
// add (previously matched input length) -- current match and substitution handled below
|
||||
cumulativeDiff += yychar - inputStart;
|
||||
// position at (already output length) -- substitution handled below
|
||||
int offsetCorrectionPos = outputCharCount;
|
||||
int returnValue;
|
||||
if (escapeSCRIPT) {
|
||||
inputSegment.write(zzBuffer, zzStartRead, yylength());
|
||||
outputSegment = inputSegment;
|
||||
returnValue = outputSegment.nextChar();
|
||||
} else {
|
||||
// add (this match length) - (substitution length)
|
||||
cumulativeDiff += yylength() - 1;
|
||||
// add (substitution length)
|
||||
++offsetCorrectionPos;
|
||||
returnValue = SCRIPT_REPLACEMENT;
|
||||
}
|
||||
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
|
||||
return returnValue;
|
||||
}
|
||||
case 102: break;
|
||||
case 103: break;
|
||||
case 38:
|
||||
{ yybegin(restoreState);
|
||||
}
|
||||
case 103: break;
|
||||
case 104: break;
|
||||
case 41:
|
||||
{ yybegin(STYLE_COMMENT);
|
||||
}
|
||||
case 104: break;
|
||||
case 105: break;
|
||||
case 1:
|
||||
{ return zzBuffer[zzStartRead];
|
||||
}
|
||||
case 105: break;
|
||||
case 4:
|
||||
{ yypushback(1);
|
||||
outputSegment = inputSegment;
|
||||
outputSegment.restart();
|
||||
yybegin(YYINITIAL);
|
||||
return outputSegment.nextChar();
|
||||
}
|
||||
case 106: break;
|
||||
default:
|
||||
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
|
||||
|
|
|
@@ -141,9 +141,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
[vV][aA][rR] )


%include src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
%include HTMLCharacterEntities.jflex

%include src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
%include HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro

%{
  private static final int INITIAL_INPUT_SEGMENT_SIZE = 1024;
|
|
|
@ -24,6 +24,8 @@ import org.apache.lucene.analysis.TokenStream;
|
|||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
||||
|
@@ -35,6 +37,12 @@ import org.apache.lucene.util.ArrayUtil;
 * {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which
 * of the CJK scripts are turned into bigrams.
 * <p>
 * By default, when a CJK character has no adjacent characters to form
 * a bigram, it is output in unigram form. If you want to always output
 * both unigrams and bigrams, set the <code>outputUnigrams</code>
 * flag in {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)}.
 * This can be used for a combined unigram+bigram approach.
 * <p>
 * In all cases, all non-CJK input is passed thru unmodified.
 */
public final class CJKBigramFilter extends TokenFilter {
|
@ -67,10 +75,16 @@ public final class CJKBigramFilter extends TokenFilter {
|
|||
private final Object doHiragana;
|
||||
private final Object doKatakana;
|
||||
private final Object doHangul;
|
||||
|
||||
// true if we should output unigram tokens always
|
||||
private final boolean outputUnigrams;
|
||||
private boolean ngramState; // false = output unigram, true = output bigram
|
||||
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
||||
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
|
||||
|
||||
// buffers containing codepoint and offsets in parallel
|
||||
int buffer[] = new int[8];
|
||||
|
@ -88,23 +102,36 @@ public final class CJKBigramFilter extends TokenFilter {
|
|||
|
||||
/**
|
||||
* Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int)
|
||||
* CJKBigramFilter(HAN | HIRAGANA | KATAKANA | HANGUL)}
|
||||
* CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)}
|
||||
*/
|
||||
public CJKBigramFilter(TokenStream in) {
|
||||
this(in, HAN | HIRAGANA | KATAKANA | HANGUL);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new CJKBigramFilter, specifying which writing systems should be bigrammed.
|
||||
* @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
|
||||
* {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
|
||||
* Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)
|
||||
* CJKBigramFilter(in, flags, false)}
|
||||
*/
|
||||
public CJKBigramFilter(TokenStream in, int flags) {
|
||||
this(in, flags, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a new CJKBigramFilter, specifying which writing systems should be bigrammed,
|
||||
* and whether or not unigrams should also be output.
|
||||
* @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
|
||||
* {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
|
||||
* @param outputUnigrams true if unigrams for the selected writing systems should also be output.
|
||||
* when this is false, this is only done when there are no adjacent characters to form
|
||||
* a bigram.
|
||||
*/
|
||||
public CJKBigramFilter(TokenStream in, int flags, boolean outputUnigrams) {
|
||||
super(in);
|
||||
doHan = (flags & HAN) == 0 ? NO : HAN_TYPE;
|
||||
doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
|
||||
doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
|
||||
doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE;
|
||||
this.outputUnigrams = outputUnigrams;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@@ -120,7 +147,24 @@ public final class CJKBigramFilter extends TokenFilter {
        // case 1: we have multiple remaining codepoints buffered,
        // so we can emit a bigram here.

        flushBigram();
        if (outputUnigrams) {

          // when also outputting unigrams, we output the unigram first,
          // then rewind back to revisit the bigram.
          // so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
          // the logic in hasBufferedUnigram ensures we output the C,
          // even though it did actually have adjacent CJK characters.

          if (ngramState) {
            flushBigram();
          } else {
            flushUnigram();
            index--;
          }
          ngramState = !ngramState;
        } else {
          flushBigram();
        }
        return true;
      } else if (doNext()) {
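A sketch of what the rewind logic above produces for three adjacent CJK characters when outputUnigrams is enabled. The analyzer chain here is an assumption for illustration (StandardTokenizer feeding CJKBigramFilter directly), not the chain shipped by this commit, and "ABC" stands for three adjacent CJK codepoints:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.Version;

class UnigramBigramExample {
  // Hypothetical driver: for an input of three adjacent CJK codepoints "ABC"
  // the stream should be A, AB, B, BC, C, with each bigram emitted at
  // position increment 0 and position length 2, i.e. as a synonym spanning
  // two unigrams (see the flushBigram change in this commit).
  static void dump(String text) throws IOException {
    TokenStream ts = new CJKBigramFilter(
        new StandardTokenizer(Version.LUCENE_40, new StringReader(text)),
        CJKBigramFilter.HAN | CJKBigramFilter.HIRAGANA
            | CJKBigramFilter.KATAKANA | CJKBigramFilter.HANGUL,
        true);  // outputUnigrams
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
    PositionLengthAttribute posLen = ts.addAttribute(PositionLengthAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term + " posInc=" + posInc.getPositionIncrement()
          + " posLen=" + posLen.getPositionLength());
    }
    ts.end();
    ts.close();
  }
}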
|
@ -260,6 +304,11 @@ public final class CJKBigramFilter extends TokenFilter {
|
|||
termAtt.setLength(len2);
|
||||
offsetAtt.setOffset(startOffset[index], endOffset[index+1]);
|
||||
typeAtt.setType(DOUBLE_TYPE);
|
||||
// when outputting unigrams, all bigrams are synonyms that span two unigrams
|
||||
if (outputUnigrams) {
|
||||
posIncAtt.setPositionIncrement(0);
|
||||
posLengthAtt.setPositionLength(2);
|
||||
}
|
||||
index++;
|
||||
}
|
||||
|
||||
|
@@ -292,7 +341,13 @@ public final class CJKBigramFilter extends TokenFilter {
   * inputs.
   */
  private boolean hasBufferedUnigram() {
    return bufferLen == 1 && index == 0;
    if (outputUnigrams) {
      // when outputting unigrams always
      return bufferLen - index == 1;
    } else {
      // otherwise its only when we have a lone CJK character
      return bufferLen == 1 && index == 0;
    }
  }

  @Override
|
@ -303,5 +358,6 @@ public final class CJKBigramFilter extends TokenFilter {
|
|||
lastEndOffset = 0;
|
||||
loneState = null;
|
||||
exhausted = false;
|
||||
ngramState = false;
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -33,12 +33,13 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
 *     <filter class="solr.LowerCaseFilterFactory"/>
 *     <filter class="solr.CJKBigramFilterFactory"
 *       han="true" hiragana="true"
 *       katakana="true" hangul="true" />
 *       katakana="true" hangul="true" outputUnigrams="false" />
 *   </analyzer>
 * </fieldType></pre>
 */
public class CJKBigramFilterFactory extends TokenFilterFactory {
  int flags;
  boolean outputUnigrams;

  @Override
  public void init(Map<String,String> args) {

@@ -56,10 +57,11 @@ public class CJKBigramFilterFactory extends TokenFilterFactory {
    if (getBoolean("hangul", true)) {
      flags |= CJKBigramFilter.HANGUL;
    }
    outputUnigrams = getBoolean("outputUnigrams", false);
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new CJKBigramFilter(input, flags);
    return new CJKBigramFilter(input, flags, outputUnigrams);
  }
}
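A small sketch of driving the factory with the new attribute, assuming the 4.0-era init(Map)/create(TokenStream) factory contract shown in the hunk above; the tokenizer choice and class name are illustrative, and this mirrors the Solr XML example in the javadoc:

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKBigramFilterFactory;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;

class CJKBigramFactoryExample {
  // Hypothetical programmatic use: outputUnigrams=true requests the combined
  // unigram+bigram output that LUCENE-4286 adds.
  static TokenStream build(String text) {
    Map<String, String> args = new HashMap<String, String>();
    args.put("han", "true");
    args.put("hiragana", "true");
    args.put("katakana", "true");
    args.put("hangul", "true");
    args.put("outputUnigrams", "true");

    CJKBigramFilterFactory factory = new CJKBigramFilterFactory();
    factory.init(args);
    return factory.create(new StandardTokenizer(Version.LUCENE_40, new StringReader(text)));
  }
}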
|
|
|
@ -1,4 +1,4 @@
|
|||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/15/12 1:57 AM */
|
||||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
|
||||
|
||||
package org.apache.lucene.analysis.standard;
|
||||
|
||||
|
@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
/**
|
||||
* This class is a scanner generated by
|
||||
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
|
||||
* on 7/15/12 1:57 AM from the specification file
|
||||
* <tt>C:/cygwin/home/s/svn/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
|
||||
* on 8/6/12 11:57 AM from the specification file
|
||||
* <tt>/home/rmuir/workspace/lucene-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
|
||||
*/
|
||||
class ClassicTokenizerImpl implements StandardTokenizerInterface {
|
||||
|
||||
|
@ -42,7 +42,7 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
|
|||
public static final int YYEOF = -1;
|
||||
|
||||
/** initial size of the lookahead buffer */
|
||||
private static final int ZZ_BUFFERSIZE = 16384;
|
||||
private static final int ZZ_BUFFERSIZE = 4096;
|
||||
|
||||
/** lexical states */
|
||||
public static final int YYINITIAL = 0;
|
||||
|
|
|
@ -36,6 +36,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
%function getNextToken
|
||||
%pack
|
||||
%char
|
||||
%buffer 4096
|
||||
|
||||
%{
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
// Generated using ICU4J 49.1.0.0 on Thursday, July 26, 2012 10:22:01 PM UTC
|
||||
// Generated using ICU4J 49.1.0.0 on Monday, August 6, 2012 3:57:23 PM UTC
|
||||
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/26/12 6:22 PM */
|
||||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
|
||||
|
||||
package org.apache.lucene.analysis.standard;
|
||||
|
||||
|
@ -43,7 +43,7 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface {
|
|||
public static final int YYEOF = -1;
|
||||
|
||||
/** initial size of the lookahead buffer */
|
||||
private static final int ZZ_BUFFERSIZE = 16384;
|
||||
private static final int ZZ_BUFFERSIZE = 4096;
|
||||
|
||||
/** lexical states */
|
||||
public static final int YYINITIAL = 0;
|
||||
|
|
|
@ -44,8 +44,9 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
%implements StandardTokenizerInterface
|
||||
%function getNextToken
|
||||
%char
|
||||
%buffer 4096
|
||||
|
||||
%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
|
||||
%include SUPPLEMENTARY.jflex-macro
|
||||
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
|
||||
Format = ([\p{WB:Format}] | {FormatSupp})
|
||||
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/26/12 6:22 PM */
|
||||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
|
||||
|
||||
package org.apache.lucene.analysis.standard;
|
||||
|
||||
|
@ -46,7 +46,7 @@ public final class UAX29URLEmailTokenizerImpl implements StandardTokenizerInterf
|
|||
public static final int YYEOF = -1;
|
||||
|
||||
/** initial size of the lookahead buffer */
|
||||
private static final int ZZ_BUFFERSIZE = 16384;
|
||||
private static final int ZZ_BUFFERSIZE = 4096;
|
||||
|
||||
/** lexical states */
|
||||
public static final int YYINITIAL = 0;
|
||||
|
|
|
@ -47,8 +47,9 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
%implements StandardTokenizerInterface
|
||||
%function getNextToken
|
||||
%char
|
||||
%buffer 4096
|
||||
|
||||
%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
|
||||
%include SUPPLEMENTARY.jflex-macro
|
||||
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
|
||||
Format = ([\p{WB:Format}] | {FormatSupp})
|
||||
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
|
||||
|
@ -88,7 +89,7 @@ HiraganaEx = {Hiragana} ({Format} | {Extend})*
|
|||
// RFC-5321: Simple Mail Transfer Protocol
|
||||
// RFC-5322: Internet Message Format
|
||||
|
||||
%include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
|
||||
%include ASCIITLD.jflex-macro
|
||||
|
||||
DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
|
||||
DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/15/12 1:57 AM */
|
||||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
|
||||
|
||||
package org.apache.lucene.analysis.wikipedia;
|
||||
|
||||
|
@ -25,8 +25,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
/**
|
||||
* This class is a scanner generated by
|
||||
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
|
||||
* on 7/15/12 1:57 AM from the specification file
|
||||
* <tt>C:/cygwin/home/s/svn/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
|
||||
* on 8/6/12 11:57 AM from the specification file
|
||||
* <tt>/home/rmuir/workspace/lucene-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
|
||||
*/
|
||||
class WikipediaTokenizerImpl {
|
||||
|
||||
|
@ -34,7 +34,7 @@ class WikipediaTokenizerImpl {
|
|||
public static final int YYEOF = -1;
|
||||
|
||||
/** initial size of the lookahead buffer */
|
||||
private static final int ZZ_BUFFERSIZE = 16384;
|
||||
private static final int ZZ_BUFFERSIZE = 4096;
|
||||
|
||||
/** lexical states */
|
||||
public static final int THREE_SINGLE_QUOTES_STATE = 10;
|
||||
|
|
|
@ -27,6 +27,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
%function getNextToken
|
||||
%pack
|
||||
%char
|
||||
%buffer 4096
|
||||
|
||||
%{
|
||||
|
||||
|
|
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis.cjk;
|
|||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
|
@ -33,6 +34,15 @@ public class TestCJKBigramFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
};
|
||||
|
||||
Analyzer unibiAnalyzer = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
|
||||
return new TokenStreamComponents(t,
|
||||
new CJKBigramFilter(t, 0xff, true));
|
||||
}
|
||||
};
|
||||
|
||||
public void testHuge() throws Exception {
|
||||
assertAnalyzesTo(analyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
|
||||
+ "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
|
||||
|
@ -62,6 +72,96 @@ public class TestCJKBigramFilter extends BaseTokenStreamTestCase {
|
|||
}
|
||||
};
|
||||
assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
|
||||
new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" });
|
||||
new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" },
|
||||
new int[] { 0, 1, 2, 3, 5, 6, 8, 9, 10, 11 },
|
||||
new int[] { 1, 2, 3, 5, 6, 8, 9, 10, 11, 12 },
|
||||
new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<DOUBLE>", "<HIRAGANA>", "<DOUBLE>",
|
||||
"<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
|
||||
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
|
||||
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
|
||||
}
|
||||
|
||||
public void testAllScripts() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
|
||||
return new TokenStreamComponents(t,
|
||||
new CJKBigramFilter(t, 0xff, false));
|
||||
}
|
||||
};
|
||||
assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
|
||||
new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" });
|
||||
}
|
||||
|
||||
public void testUnigramsAndBigramsAllScripts() throws Exception {
|
||||
assertAnalyzesTo(unibiAnalyzer, "多くの学生が試験に落ちた。",
|
||||
new String[] {
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生",
|
||||
"生が", "が", "が試", "試", "試験", "験", "験に", "に",
|
||||
"に落", "落", "落ち", "ち", "ちた", "た"
|
||||
},
|
||||
new int[] { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
|
||||
6, 7, 7, 8, 8, 9, 9, 10, 10, 11 },
|
||||
new int[] { 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
|
||||
8, 8, 9, 9, 10, 10, 11, 11, 12, 12 },
|
||||
new String[] { "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>",
|
||||
"<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>",
|
||||
"<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>" },
|
||||
new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
|
||||
0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
|
||||
new int[] { 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1,
|
||||
2, 1, 2, 1, 2, 1, 2, 1, 2, 1 }
|
||||
);
|
||||
}
|
||||
|
||||
public void testUnigramsAndBigramsHanOnly() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
|
||||
return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true));
|
||||
}
|
||||
};
|
||||
assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
|
||||
new String[] { "多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" },
|
||||
new int[] { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11 },
|
||||
new int[] { 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12 },
|
||||
new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>",
|
||||
"<SINGLE>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>",
|
||||
"<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
|
||||
new int[] { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 },
|
||||
new int[] { 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1 });
|
||||
}
|
||||
|
||||
public void testUnigramsAndBigramsHuge() throws Exception {
|
||||
assertAnalyzesTo(unibiAnalyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
|
||||
+ "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
|
||||
+ "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた",
|
||||
new String[] {
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た", "た多",
|
||||
"多", "多く", "く", "くの", "の", "の学", "学", "学生", "生", "生が", "が", "が試", "試", "試験", "験", "験に", "に", "に落", "落", "落ち", "ち", "ちた", "た"
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomUnibiStrings() throws Exception {
|
||||
checkRandomData(random(), unibiAnalyzer, 1000*RANDOM_MULTIPLIER);
|
||||
}
|
||||
|
||||
/** blast some random strings through the analyzer */
|
||||
public void testRandomUnibiHugeStrings() throws Exception {
|
||||
Random random = random();
|
||||
checkRandomData(random, unibiAnalyzer, 100*RANDOM_MULTIPLIER, 8192);
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -52,4 +52,16 @@ public class TestCJKBigramFilterFactory extends BaseTokenStreamTestCase {
     assertTokenStreamContents(stream,
         new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" });
   }
+
+  public void testHanOnlyUnigrams() throws Exception {
+    Reader reader = new StringReader("多くの学生が試験に落ちた。");
+    CJKBigramFilterFactory factory = new CJKBigramFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("hiragana", "false");
+    args.put("outputUnigrams", "true");
+    factory.init(args);
+    TokenStream stream = factory.create(new StandardTokenizer(TEST_VERSION_CURRENT, reader));
+    assertTokenStreamContents(stream,
+        new String[] { "多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" });
+  }
 }

@ -100,8 +100,7 @@ public class TestAllAnalyzersHaveFactories extends LuceneTestCase {
|
|||
private static final ResourceLoader loader = new StringMockResourceLoader("");
|
||||
|
||||
public void test() throws Exception {
|
||||
List<Class<?>> analysisClasses = new ArrayList<Class<?>>();
|
||||
TestRandomChains.getClassesForPackage("org.apache.lucene.analysis", analysisClasses);
|
||||
List<Class<?>> analysisClasses = TestRandomChains.getClassesForPackage("org.apache.lucene.analysis");
|
||||
|
||||
for (final Class<?> c : analysisClasses) {
|
||||
final int modifiers = c.getModifiers();
|
||||
|
|
|
@ -25,6 +25,7 @@ import java.io.StringReader;
|
|||
import java.lang.reflect.Constructor;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.lang.reflect.Modifier;
|
||||
import java.net.URI;
|
||||
import java.net.URL;
|
||||
import java.nio.CharBuffer;
|
||||
import java.util.ArrayList;
|
||||
|
@ -165,8 +166,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
|
|||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
List<Class<?>> analysisClasses = new ArrayList<Class<?>>();
|
||||
getClassesForPackage("org.apache.lucene.analysis", analysisClasses);
|
||||
List<Class<?>> analysisClasses = getClassesForPackage("org.apache.lucene.analysis");
|
||||
tokenizers = new ArrayList<Constructor<? extends Tokenizer>>();
|
||||
tokenfilters = new ArrayList<Constructor<? extends TokenFilter>>();
|
||||
charfilters = new ArrayList<Constructor<? extends CharFilter>>();
|
||||
|
@@ -235,19 +235,30 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
   private static <T> Constructor<T> castConstructor(Class<T> instanceClazz, Constructor<?> ctor) {
     return (Constructor<T>) ctor;
   }
-  static void getClassesForPackage(String pckgname, List<Class<?>> classes) throws Exception {
+
+  public static List<Class<?>> getClassesForPackage(String pckgname) throws Exception {
+    final List<Class<?>> classes = new ArrayList<Class<?>>();
+    collectClassesForPackage(pckgname, classes);
+    assertFalse("No classes found in package '"+pckgname+"'; maybe your test classes are packaged as JAR file?", classes.isEmpty());
+    return classes;
+  }
+
+  private static void collectClassesForPackage(String pckgname, List<Class<?>> classes) throws Exception {
     final ClassLoader cld = TestRandomChains.class.getClassLoader();
     final String path = pckgname.replace('.', '/');
     final Enumeration<URL> resources = cld.getResources(path);
     while (resources.hasMoreElements()) {
-      final File directory = new File(resources.nextElement().toURI());
+      final URI uri = resources.nextElement().toURI();
+      if (!"file".equalsIgnoreCase(uri.getScheme()))
+        continue;
+      final File directory = new File(uri);
       if (directory.exists()) {
         String[] files = directory.list();
         for (String file : files) {
           if (new File(directory, file).isDirectory()) {
             // recurse
             String subPackage = pckgname + "." + file;
-            getClassesForPackage(subPackage, classes);
+            collectClassesForPackage(subPackage, classes);
           }
           if (file.endsWith(".class")) {
             String clazzName = file.substring(0, file.length() - 6);

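The point of the new URI check is that classpath resources are not always plain directories; when the package comes from a JAR, the resource URI has a "jar" scheme and `new File(uri)` would fail. A small, hedged sketch of the same guard in isolation (the package name and class name are only examples, not part of this commit):

import java.io.File;
import java.net.URI;
import java.net.URL;
import java.util.Enumeration;

public class ClasspathScanSketch {
  public static void main(String[] args) throws Exception {
    String path = "org.apache.lucene.analysis".replace('.', '/');
    Enumeration<URL> resources = ClasspathScanSketch.class.getClassLoader().getResources(path);
    while (resources.hasMoreElements()) {
      URI uri = resources.nextElement().toURI();
      if (!"file".equalsIgnoreCase(uri.getScheme())) {
        // e.g. a "jar:" URI; new File(uri) would throw IllegalArgumentException here
        continue;
      }
      File directory = new File(uri);
      System.out.println("scanning " + directory);
    }
  }
}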
@ -43,7 +43,6 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
|||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.index.FieldsEnum;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||
|
@@ -156,7 +155,12 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {

     CountingSearchTestTask.numSearches = 0;
     execBenchmark(algLines);
-    assertTrue(CountingSearchTestTask.numSearches > 0);
+
+    // NOTE: cannot assert this, because on a super-slow
+    // system, it could be after waiting 0.5 seconds that
+    // the search threads hadn't yet succeeded in starting
+    // up and then they start up and do no searching:
+    //assertTrue(CountingSearchTestTask.numSearches > 0);
   }

   public void testHighlighting() throws Exception {

@ -201,6 +205,7 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
|
|||
// 1. alg definition (required in every "logic" test)
|
||||
String algLines[] = {
|
||||
"doc.stored=true",//doc storage is required in order to have text to highlight
|
||||
"doc.term.vector=true",
|
||||
"doc.term.vector.offsets=true",
|
||||
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
|
||||
"docs.file=" + getReuters20LinesFile(),
|
||||
|
@ -487,13 +492,13 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
|
|||
|
||||
int totalTokenCount2 = 0;
|
||||
|
||||
FieldsEnum fields = MultiFields.getFields(reader).iterator();
|
||||
String fieldName = null;
|
||||
while((fieldName = fields.next()) != null) {
|
||||
Fields fields = MultiFields.getFields(reader);
|
||||
|
||||
for (String fieldName : fields) {
|
||||
if (fieldName.equals(DocMaker.ID_FIELD) || fieldName.equals(DocMaker.DATE_MSEC_FIELD) || fieldName.equals(DocMaker.TIME_SEC_FIELD)) {
|
||||
continue;
|
||||
}
|
||||
Terms terms = fields.terms();
|
||||
Terms terms = fields.terms(fieldName);
|
||||
if (terms == null) {
|
||||
continue;
|
||||
}
|
||||
|
|
|
@ -139,29 +139,6 @@
|
|||
|
||||
<target name="compile-core" depends="compile-lucene-core"/>
|
||||
|
||||
<!--
|
||||
Run after Junit tests.
|
||||
-->
|
||||
<target name="generate-clover-reports" depends="clover">
|
||||
<fail unless="run.clover">Clover not enabled!</fail>
|
||||
<mkdir dir="${clover.report.dir}"/>
|
||||
<fileset dir="build" id="clover.test.result.files">
|
||||
<include name="**/test/TEST-*.xml"/>
|
||||
<!-- do not include BW tests -->
|
||||
<exclude name="backwards/**"/>
|
||||
</fileset>
|
||||
<clover-report>
|
||||
<current outfile="${clover.report.dir}" title="${final.name}" numThreads="0">
|
||||
<format type="html" filter="assert"/>
|
||||
<testresults refid="clover.test.result.files"/>
|
||||
</current>
|
||||
<current outfile="${clover.report.dir}/clover.xml" title="${final.name}">
|
||||
<format type="xml" filter="assert"/>
|
||||
<testresults refid="clover.test.result.files"/>
|
||||
</current>
|
||||
</clover-report>
|
||||
</target>
|
||||
|
||||
<!-- Validation (license/notice/api checks). -->
|
||||
<target name="validate" depends="check-licenses,rat-sources,check-forbidden-apis" description="Validate stuff." />
|
||||
|
||||
|
@ -176,6 +153,7 @@
|
|||
<apiFileSet dir="${custom-tasks.dir}/forbiddenApis">
|
||||
<include name="jdk.txt" />
|
||||
<include name="jdk-deprecated.txt" />
|
||||
<include name="executors.txt" />
|
||||
</apiFileSet>
|
||||
<fileset dir="${basedir}/build" includes="**/*.class" />
|
||||
</forbidden-apis>
|
||||
|
|
|
@ -88,7 +88,7 @@
|
|||
<property name="tests.timezone" value="random" />
|
||||
<property name="tests.directory" value="random" />
|
||||
<property name="tests.linedocsfile" value="europarl.lines.txt.gz" />
|
||||
<property name="tests.loggingfile" value="/dev/null"/>
|
||||
<property name="tests.loggingfile" value="${common.dir}/tools/junit4/logging.properties"/>
|
||||
<property name="tests.nightly" value="false" />
|
||||
<property name="tests.weekly" value="false" />
|
||||
<property name="tests.slow" value="true" />
|
||||
|
@ -700,15 +700,22 @@
|
|||
<condition property="tests.method" value="${testmethod}*">
|
||||
<isset property="testmethod" />
|
||||
</condition>
|
||||
|
||||
<condition property="tests.showSuccess" value="true">
|
||||
<or>
|
||||
<isset property="tests.class" />
|
||||
<isset property="tests.method" />
|
||||
</or>
|
||||
</condition>
|
||||
<!-- default -->
|
||||
<property name="tests.showSuccess" value="false"/>
|
||||
|
||||
|
||||
<condition property="tests.showOutput" value="always">
|
||||
<or>
|
||||
<isset property="tests.class" />
|
||||
<isset property="tests.method" />
|
||||
</or>
|
||||
</condition>
|
||||
<property name="tests.showOutput" value="onerror"/>
|
||||
|
||||
<!-- Test macro using junit4. -->
|
||||
<macrodef name="test-macro" description="Executes junit tests.">
|
||||
|
@ -854,6 +861,7 @@
|
|||
<syspropertyset>
|
||||
<propertyref prefix="tests.maxfailures" />
|
||||
<propertyref prefix="tests.failfast" />
|
||||
<propertyref prefix="tests.badapples" />
|
||||
</syspropertyset>
|
||||
|
||||
<!-- Pass randomized settings to the forked JVM. -->
|
||||
|
@ -875,8 +883,7 @@
|
|||
<junit4:report-text
|
||||
showThrowable="true"
|
||||
showStackTraces="true"
|
||||
showOutputStream="true"
|
||||
showErrorStream="true"
|
||||
showOutput="${tests.showOutput}"
|
||||
|
||||
showStatusOk="${tests.showSuccess}"
|
||||
showStatusError="${tests.showError}"
|
||||
|
@ -896,8 +903,7 @@
|
|||
file="@{junit.output.dir}/tests-report.txt"
|
||||
showThrowable="true"
|
||||
showStackTraces="true"
|
||||
showOutputStream="true"
|
||||
showErrorStream="true"
|
||||
showOutput="always"
|
||||
|
||||
showStatusOk="true"
|
||||
showStatusError="true"
|
||||
|
@ -913,8 +919,7 @@
|
|||
file="@{junit.output.dir}/tests-failures.txt"
|
||||
showThrowable="true"
|
||||
showStackTraces="true"
|
||||
showOutputStream="true"
|
||||
showErrorStream="true"
|
||||
showOutput="onerror"
|
||||
|
||||
showStatusOk="false"
|
||||
showStatusError="true"
|
||||
|
@ -929,8 +934,13 @@
|
|||
the slowest tests or for reuse in balancing). -->
|
||||
<junit4:report-execution-times file="@{junit.output.dir}/tests-timehints.txt" historyLength="5" />
|
||||
|
||||
<junit4:report-ant-xml dir="@{junit.output.dir}" />
|
||||
<junit4:report-json file="@{junit.output.dir}/tests-report-${ant.project.name}/index.html" />
|
||||
<!-- ANT-compatible XMLs for jenkins records etc. -->
|
||||
<junit4:report-ant-xml dir="@{junit.output.dir}" outputStreams="no" />
|
||||
|
||||
<!--
|
||||
Enable if you wish to have a nice HTML5 report.
|
||||
<junit4:report-json file="@{junit.output.dir}/tests-report-${ant.project.name}/index.html" outputStreams="no" />
|
||||
-->
|
||||
</listeners>
|
||||
|
||||
<!-- Input test classes. -->
|
||||
|
|
|
@@ -480,7 +480,7 @@ public class MyAnalyzer extends Analyzer {
       System.out.println(termAtt.toString());
     }

-    stream.end()
+    stream.end();
   } finally {
     stream.close();
   }

@@ -509,7 +509,7 @@ easily by adding a LengthFilter to the chain. Only the
   {@literal @Override}
   protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
     final Tokenizer source = new WhitespaceTokenizer(matchVersion, reader);
-    TokenStream result = new LengthFilter(source, 3, Integer.MAX_VALUE);
+    TokenStream result = new LengthFilter(true, source, 3, Integer.MAX_VALUE);
     return new TokenStreamComponents(source, result);
   }
 </pre>

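The `stream.end()` fix above is small but the surrounding documentation describes a full consume workflow, so a self-contained, hedged sketch of that loop may help. It assumes a Lucene 4.x Analyzer such as the MyAnalyzer shown in this documentation; the helper class, field name, and text are illustrative:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ConsumeTokenStream {
  public static void print(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("myfield", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    try {
      stream.reset();                 // prepare the stream before consuming
      while (stream.incrementToken()) {
        System.out.println(termAtt.toString());
      }
      stream.end();                   // end-of-stream work; note the semicolon fixed above
    } finally {
      stream.close();                 // always release resources
    }
  }
}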
@ -27,7 +27,6 @@ import org.apache.lucene.index.DocsEnum;
|
|||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.FieldsEnum;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
|
@ -40,6 +39,7 @@ import org.apache.lucene.util.ArrayUtil;
|
|||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.DoubleBarrelLRUCache;
|
||||
import org.apache.lucene.util.UnmodifiableIterator;
|
||||
|
||||
/** Handles a terms dict, but decouples all details of
|
||||
* doc/freqs/positions reading to an instance of {@link
|
||||
|
@ -184,8 +184,8 @@ public class BlockTermsReader extends FieldsProducer {
|
|||
}
|
||||
|
||||
@Override
|
||||
public FieldsEnum iterator() {
|
||||
return new TermFieldsEnum();
|
||||
public Iterator<String> iterator() {
|
||||
return new UnmodifiableIterator<String>(fields.keySet().iterator());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -199,32 +199,6 @@ public class BlockTermsReader extends FieldsProducer {
|
|||
return fields.size();
|
||||
}
|
||||
|
||||
// Iterates through all fields
|
||||
private class TermFieldsEnum extends FieldsEnum {
|
||||
final Iterator<FieldReader> it;
|
||||
FieldReader current;
|
||||
|
||||
TermFieldsEnum() {
|
||||
it = fields.values().iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
if (it.hasNext()) {
|
||||
current = it.next();
|
||||
return current.fieldInfo.name;
|
||||
} else {
|
||||
current = null;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Terms terms() throws IOException {
|
||||
return current;
|
||||
}
|
||||
}
|
||||
|
||||
private class FieldReader extends Terms {
|
||||
final long numTerms;
|
||||
final FieldInfo fieldInfo;
|
||||
|
@ -253,6 +227,21 @@ public class BlockTermsReader extends FieldsProducer {
|
|||
return new SegmentTermsEnum();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasOffsets() {
|
||||
return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPositions() {
|
||||
return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayloads() {
|
||||
return fieldInfo.hasPayloads();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long size() {
|
||||
return numTerms;
|
||||
|
|
|
@ -31,7 +31,6 @@ import org.apache.lucene.index.DocsEnum;
|
|||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.FieldsEnum;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.TermState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
|
@ -46,6 +45,7 @@ import org.apache.lucene.util.BytesRef;
|
|||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.RamUsageEstimator;
|
||||
import org.apache.lucene.util.StringHelper;
|
||||
import org.apache.lucene.util.UnmodifiableIterator;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
import org.apache.lucene.util.automaton.RunAutomaton;
|
||||
import org.apache.lucene.util.automaton.Transition;
|
||||
|
@ -199,8 +199,8 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
}
|
||||
|
||||
@Override
|
||||
public FieldsEnum iterator() {
|
||||
return new TermFieldsEnum();
|
||||
public Iterator<String> iterator() {
|
||||
return new UnmodifiableIterator<String>(fields.keySet().iterator());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -214,32 +214,6 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
return fields.size();
|
||||
}
|
||||
|
||||
// Iterates through all fields
|
||||
private class TermFieldsEnum extends FieldsEnum {
|
||||
final Iterator<FieldReader> it;
|
||||
FieldReader current;
|
||||
|
||||
TermFieldsEnum() {
|
||||
it = fields.values().iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
if (it.hasNext()) {
|
||||
current = it.next();
|
||||
return current.fieldInfo.name;
|
||||
} else {
|
||||
current = null;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Terms terms() throws IOException {
|
||||
return current;
|
||||
}
|
||||
}
|
||||
|
||||
// for debugging
|
||||
String brToString(BytesRef b) {
|
||||
if (b == null) {
|
||||
|
@ -456,6 +430,21 @@ public class BlockTreeTermsReader extends FieldsProducer {
|
|||
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasOffsets() {
|
||||
return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPositions() {
|
||||
return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayloads() {
|
||||
return fieldInfo.hasPayloads();
|
||||
}
|
||||
|
||||
@Override
|
||||
public TermsEnum iterator(TermsEnum reuse) throws IOException {
|
||||
return new SegmentTermsEnum();
|
||||
|
|
|
@ -22,7 +22,6 @@ import java.io.IOException;
|
|||
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.index.FieldsEnum;
|
||||
import org.apache.lucene.index.MergeState;
|
||||
import org.apache.lucene.index.SegmentWriteState; // javadocs
|
||||
import org.apache.lucene.index.Terms;
|
||||
|
@@ -53,13 +52,10 @@ public abstract class FieldsConsumer implements Closeable {
   public abstract void close() throws IOException;

   public void merge(MergeState mergeState, Fields fields) throws IOException {
-    FieldsEnum fieldsEnum = fields.iterator();
-    assert fieldsEnum != null;
-    String field;
-    while((field = fieldsEnum.next()) != null) {
+    for (String field : fields) {
       mergeState.fieldInfo = mergeState.fieldInfos.fieldInfo(field);
       assert mergeState.fieldInfo != null : "FieldInfo for field is null: "+ field;
-      Terms terms = fieldsEnum.terms();
+      Terms terms = fields.terms(field);
       if (terms != null) {
         final TermsConsumer termsConsumer = addField(mergeState.fieldInfo);
         termsConsumer.merge(mergeState, terms.iterator(null));

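This merge loop shows the API change that runs through the whole commit: `FieldsEnum` is removed and `Fields` is iterated directly as field names, with per-field `Terms` looked up by name. A hedged sketch of walking an arbitrary `Fields` instance down to its terms under the new API; the class and method names are illustrative only:

import java.io.IOException;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class FieldsWalker {
  public static void dump(Fields fields) throws IOException {
    for (String field : fields) {          // Fields now iterates field names directly
      Terms terms = fields.terms(field);   // per-field Terms are looked up by name
      if (terms == null) {
        continue;
      }
      TermsEnum termsEnum = terms.iterator(null);
      BytesRef term;
      while ((term = termsEnum.next()) != null) {
        System.out.println(field + ": " + term.utf8ToString() + " docFreq=" + termsEnum.docFreq());
      }
    }
  }
}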
@ -124,15 +124,17 @@ public final class MappingMultiDocsAndPositionsEnum extends DocsAndPositionsEnum
|
|||
@Override
|
||||
public BytesRef getPayload() throws IOException {
|
||||
BytesRef payload = current.getPayload();
|
||||
if (mergeState.currentPayloadProcessor[upto] != null) {
|
||||
if (mergeState.currentPayloadProcessor[upto] != null && payload != null) {
|
||||
// to not violate the D&P api, we must give the processor a private copy
|
||||
// TODO: reuse a BytesRef if there is a PPP
|
||||
payload = BytesRef.deepCopyOf(payload);
|
||||
mergeState.currentPayloadProcessor[upto].processPayload(payload);
|
||||
if (payload.length == 0) {
|
||||
// don't let PayloadProcessors corrumpt the index
|
||||
return null;
|
||||
}
|
||||
}
|
||||
return payload;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayload() {
|
||||
return current.hasPayload();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -112,12 +112,7 @@ public abstract class PostingsConsumer {
|
|||
totTF += freq;
|
||||
for(int i=0;i<freq;i++) {
|
||||
final int position = postingsEnum.nextPosition();
|
||||
final BytesRef payload;
|
||||
if (postingsEnum.hasPayload()) {
|
||||
payload = postingsEnum.getPayload();
|
||||
} else {
|
||||
payload = null;
|
||||
}
|
||||
final BytesRef payload = postingsEnum.getPayload();
|
||||
this.addPosition(position, payload, -1, -1);
|
||||
}
|
||||
this.finishDoc();
|
||||
|
@ -137,12 +132,7 @@ public abstract class PostingsConsumer {
|
|||
totTF += freq;
|
||||
for(int i=0;i<freq;i++) {
|
||||
final int position = postingsEnum.nextPosition();
|
||||
final BytesRef payload;
|
||||
if (postingsEnum.hasPayload()) {
|
||||
payload = postingsEnum.getPayload();
|
||||
} else {
|
||||
payload = null;
|
||||
}
|
||||
final BytesRef payload = postingsEnum.getPayload();
|
||||
this.addPosition(position, payload, postingsEnum.startOffset(), postingsEnum.endOffset());
|
||||
}
|
||||
this.finishDoc();
|
||||
|
|
|
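The PostingsConsumer change above illustrates the other recurring convention in this commit: `hasPayload()` disappears and a `null` return from `getPayload()` now simply means "no payload at this position". A minimal, hedged consumer-side sketch of that convention, assuming an already positioned `DocsAndPositionsEnum`; the class name is illustrative:

import java.io.IOException;

import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.util.BytesRef;

public class PayloadConsumerSketch {
  // Reads all positions of the current document, handling payloads the new way.
  public static void readPositions(DocsAndPositionsEnum postings) throws IOException {
    int freq = postings.freq();
    for (int i = 0; i < freq; i++) {
      int position = postings.nextPosition();
      BytesRef payload = postings.getPayload();  // null means "no payload here"
      if (payload != null) {
        System.out.println("pos=" + position + " payload bytes=" + payload.length);
      } else {
        System.out.println("pos=" + position + " (no payload)");
      }
    }
  }
}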
@ -26,8 +26,9 @@ import org.apache.lucene.index.DocsAndPositionsEnum;
|
|||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.index.FieldsEnum;
|
||||
import org.apache.lucene.index.MergeState;
|
||||
import org.apache.lucene.index.PayloadProcessorProvider.PayloadProcessor;
|
||||
import org.apache.lucene.index.PayloadProcessorProvider.ReaderPayloadProcessor;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
|
@ -41,14 +42,14 @@ import org.apache.lucene.util.BytesRef;
|
|||
* <ol>
|
||||
* <li>For every document, {@link #startDocument(int)} is called,
|
||||
* informing the Codec how many fields will be written.
|
||||
* <li>{@link #startField(FieldInfo, int, boolean, boolean)} is called for
|
||||
* <li>{@link #startField(FieldInfo, int, boolean, boolean, boolean)} is called for
|
||||
* each field in the document, informing the codec how many terms
|
||||
* will be written for that field, and whether or not positions
|
||||
* or offsets are enabled.
|
||||
* will be written for that field, and whether or not positions,
|
||||
* offsets, or payloads are enabled.
|
||||
* <li>Within each field, {@link #startTerm(BytesRef, int)} is called
|
||||
* for each term.
|
||||
* <li>If offsets and/or positions are enabled, then
|
||||
* {@link #addPosition(int, int, int)} will be called for each term
|
||||
* {@link #addPosition(int, int, int, BytesRef)} will be called for each term
|
||||
* occurrence.
|
||||
* <li>After all documents have been written, {@link #finish(FieldInfos, int)}
|
||||
* is called for verification/sanity-checks.
|
||||
|
@ -60,7 +61,7 @@ import org.apache.lucene.util.BytesRef;
|
|||
public abstract class TermVectorsWriter implements Closeable {
|
||||
|
||||
/** Called before writing the term vectors of the document.
|
||||
* {@link #startField(FieldInfo, int, boolean, boolean)} will
|
||||
* {@link #startField(FieldInfo, int, boolean, boolean, boolean)} will
|
||||
* be called <code>numVectorFields</code> times. Note that if term
|
||||
* vectors are enabled, this is called even if the document
|
||||
* has no vector fields, in this case <code>numVectorFields</code>
|
||||
|
@ -69,17 +70,17 @@ public abstract class TermVectorsWriter implements Closeable {
|
|||
|
||||
/** Called before writing the terms of the field.
|
||||
* {@link #startTerm(BytesRef, int)} will be called <code>numTerms</code> times. */
|
||||
public abstract void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets) throws IOException;
|
||||
public abstract void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets, boolean payloads) throws IOException;
|
||||
|
||||
/** Adds a term and its term frequency <code>freq</code>.
|
||||
* If this field has positions and/or offsets enabled, then
|
||||
* {@link #addPosition(int, int, int)} will be called
|
||||
* {@link #addPosition(int, int, int, BytesRef)} will be called
|
||||
* <code>freq</code> times respectively.
|
||||
*/
|
||||
public abstract void startTerm(BytesRef term, int freq) throws IOException;
|
||||
|
||||
/** Adds a term position and offsets */
|
||||
public abstract void addPosition(int position, int startOffset, int endOffset) throws IOException;
|
||||
public abstract void addPosition(int position, int startOffset, int endOffset, BytesRef payload) throws IOException;
|
||||
|
||||
/** Aborts writing entirely, implementation should remove
|
||||
* any partially-written files, etc. */
|
||||
|
@ -99,7 +100,7 @@ public abstract class TermVectorsWriter implements Closeable {
|
|||
* This is an expert API that allows the codec to consume
|
||||
* positions and offsets directly from the indexer.
|
||||
* <p>
|
||||
* The default implementation calls {@link #addPosition(int, int, int)},
|
||||
* The default implementation calls {@link #addPosition(int, int, int, BytesRef)},
|
||||
* but subclasses can override this if they want to efficiently write
|
||||
* all the positions, then all the offsets, for example.
|
||||
* <p>
|
||||
|
@ -111,15 +112,36 @@ public abstract class TermVectorsWriter implements Closeable {
|
|||
public void addProx(int numProx, DataInput positions, DataInput offsets) throws IOException {
|
||||
int position = 0;
|
||||
int lastOffset = 0;
|
||||
BytesRef payload = null;
|
||||
|
||||
for (int i = 0; i < numProx; i++) {
|
||||
final int startOffset;
|
||||
final int endOffset;
|
||||
final BytesRef thisPayload;
|
||||
|
||||
if (positions == null) {
|
||||
position = -1;
|
||||
thisPayload = null;
|
||||
} else {
|
||||
position += positions.readVInt();
|
||||
int code = positions.readVInt();
|
||||
position += code >>> 1;
|
||||
if ((code & 1) != 0) {
|
||||
// This position has a payload
|
||||
final int payloadLength = positions.readVInt();
|
||||
|
||||
if (payload == null) {
|
||||
payload = new BytesRef();
|
||||
payload.bytes = new byte[payloadLength];
|
||||
} else if (payload.bytes.length < payloadLength) {
|
||||
payload.grow(payloadLength);
|
||||
}
|
||||
|
||||
positions.readBytes(payload.bytes, 0, payloadLength);
|
||||
payload.length = payloadLength;
|
||||
thisPayload = payload;
|
||||
} else {
|
||||
thisPayload = null;
|
||||
}
|
||||
}
|
||||
|
||||
if (offsets == null) {
|
||||
|
@ -129,24 +151,31 @@ public abstract class TermVectorsWriter implements Closeable {
|
|||
endOffset = startOffset + offsets.readVInt();
|
||||
lastOffset = endOffset;
|
||||
}
|
||||
addPosition(position, startOffset, endOffset);
|
||||
addPosition(position, startOffset, endOffset, thisPayload);
|
||||
}
|
||||
}
|
||||
|
||||
/** Merges in the term vectors from the readers in
|
||||
* <code>mergeState</code>. The default implementation skips
|
||||
* over deleted documents, and uses {@link #startDocument(int)},
|
||||
* {@link #startField(FieldInfo, int, boolean, boolean)},
|
||||
* {@link #startTerm(BytesRef, int)}, {@link #addPosition(int, int, int)},
|
||||
* {@link #startField(FieldInfo, int, boolean, boolean, boolean)},
|
||||
* {@link #startTerm(BytesRef, int)}, {@link #addPosition(int, int, int, BytesRef)},
|
||||
* and {@link #finish(FieldInfos, int)},
|
||||
* returning the number of documents that were written.
|
||||
* Implementations can override this method for more sophisticated
|
||||
* merging (bulk-byte copying, etc). */
|
||||
public int merge(MergeState mergeState) throws IOException {
|
||||
int docCount = 0;
|
||||
for (AtomicReader reader : mergeState.readers) {
|
||||
for (int i = 0; i < mergeState.readers.size(); i++) {
|
||||
final AtomicReader reader = mergeState.readers.get(i);
|
||||
final int maxDoc = reader.maxDoc();
|
||||
final Bits liveDocs = reader.getLiveDocs();
|
||||
// set PayloadProcessor
|
||||
if (mergeState.payloadProcessorProvider != null) {
|
||||
mergeState.currentReaderPayloadProcessor = mergeState.readerPayloadProcessor[i];
|
||||
} else {
|
||||
mergeState.currentReaderPayloadProcessor = null;
|
||||
}
|
||||
for (int docID = 0; docID < maxDoc; docID++) {
|
||||
if (liveDocs != null && !liveDocs.get(docID)) {
|
||||
// skip deleted docs
|
||||
|
@ -155,7 +184,7 @@ public abstract class TermVectorsWriter implements Closeable {
|
|||
// NOTE: it's very important to first assign to vectors then pass it to
|
||||
// termVectorsWriter.addAllDocVectors; see LUCENE-1282
|
||||
Fields vectors = reader.getTermVectors(docID);
|
||||
addAllDocVectors(vectors, mergeState.fieldInfos);
|
||||
addAllDocVectors(vectors, mergeState);
|
||||
docCount++;
|
||||
mergeState.checkAbort.work(300);
|
||||
}
|
||||
|
@ -169,7 +198,7 @@ public abstract class TermVectorsWriter implements Closeable {
|
|||
* implementation requires that the vectors implement
|
||||
* both Fields.size and
|
||||
* Terms.size. */
|
||||
protected final void addAllDocVectors(Fields vectors, FieldInfos fieldInfos) throws IOException {
|
||||
protected final void addAllDocVectors(Fields vectors, MergeState mergeState) throws IOException {
|
||||
if (vectors == null) {
|
||||
startDocument(0);
|
||||
return;
|
||||
|
@ -181,54 +210,55 @@ public abstract class TermVectorsWriter implements Closeable {
|
|||
}
|
||||
startDocument(numFields);
|
||||
|
||||
final FieldsEnum fieldsEnum = vectors.iterator();
|
||||
String fieldName;
|
||||
String lastFieldName = null;
|
||||
|
||||
TermsEnum termsEnum = null;
|
||||
DocsAndPositionsEnum docsAndPositionsEnum = null;
|
||||
|
||||
final ReaderPayloadProcessor readerPayloadProcessor = mergeState.currentReaderPayloadProcessor;
|
||||
PayloadProcessor payloadProcessor = null;
|
||||
|
||||
while((fieldName = fieldsEnum.next()) != null) {
|
||||
final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldName);
|
||||
for(String fieldName : vectors) {
|
||||
final FieldInfo fieldInfo = mergeState.fieldInfos.fieldInfo(fieldName);
|
||||
|
||||
assert lastFieldName == null || fieldName.compareTo(lastFieldName) > 0: "lastFieldName=" + lastFieldName + " fieldName=" + fieldName;
|
||||
lastFieldName = fieldName;
|
||||
|
||||
final Terms terms = fieldsEnum.terms();
|
||||
final Terms terms = vectors.terms(fieldName);
|
||||
if (terms == null) {
|
||||
// FieldsEnum shouldn't lie...
|
||||
continue;
|
||||
}
|
||||
|
||||
final boolean hasPositions = terms.hasPositions();
|
||||
final boolean hasOffsets = terms.hasOffsets();
|
||||
final boolean hasPayloads = terms.hasPayloads();
|
||||
assert !hasPayloads || hasPositions;
|
||||
|
||||
final int numTerms = (int) terms.size();
|
||||
if (numTerms == -1) {
|
||||
throw new IllegalStateException("terms.size() must be implemented (it returned -1)");
|
||||
}
|
||||
final TermsEnum termsEnum = terms.iterator(null);
|
||||
|
||||
DocsAndPositionsEnum docsAndPositionsEnum = null;
|
||||
|
||||
boolean startedField = false;
|
||||
|
||||
// NOTE: this is tricky, because TermVectors allow
|
||||
// indexing offsets but NOT positions. So we must
|
||||
// lazily init the field by checking whether first
|
||||
// position we see is -1 or not.
|
||||
|
||||
startField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads);
|
||||
termsEnum = terms.iterator(termsEnum);
|
||||
|
||||
int termCount = 0;
|
||||
while(termsEnum.next() != null) {
|
||||
termCount++;
|
||||
|
||||
final int freq = (int) termsEnum.totalTermFreq();
|
||||
|
||||
if (startedField) {
|
||||
startTerm(termsEnum.term(), freq);
|
||||
|
||||
startTerm(termsEnum.term(), freq);
|
||||
|
||||
if (hasPayloads && readerPayloadProcessor != null) {
|
||||
payloadProcessor = readerPayloadProcessor.getProcessor(fieldName, termsEnum.term());
|
||||
}
|
||||
|
||||
// TODO: we need a "query" API where we can ask (via
|
||||
// flex API) what this term was indexed with...
|
||||
// Both positions & offsets:
|
||||
docsAndPositionsEnum = termsEnum.docsAndPositions(null, null);
|
||||
boolean hasOffsets = false;
|
||||
boolean hasPositions = false;
|
||||
|
||||
if (docsAndPositionsEnum != null) {
|
||||
if (hasPositions || hasOffsets) {
|
||||
docsAndPositionsEnum = termsEnum.docsAndPositions(null, docsAndPositionsEnum);
|
||||
assert docsAndPositionsEnum != null;
|
||||
|
||||
final int docID = docsAndPositionsEnum.nextDoc();
|
||||
assert docID != DocIdSetIterator.NO_MORE_DOCS;
|
||||
assert docsAndPositionsEnum.freq() == freq;
|
||||
|
@ -237,27 +267,21 @@ public abstract class TermVectorsWriter implements Closeable {
|
|||
final int pos = docsAndPositionsEnum.nextPosition();
|
||||
final int startOffset = docsAndPositionsEnum.startOffset();
|
||||
final int endOffset = docsAndPositionsEnum.endOffset();
|
||||
if (!startedField) {
|
||||
assert numTerms > 0;
|
||||
hasPositions = pos != -1;
|
||||
hasOffsets = startOffset != -1;
|
||||
startField(fieldInfo, numTerms, hasPositions, hasOffsets);
|
||||
startTerm(termsEnum.term(), freq);
|
||||
startedField = true;
|
||||
}
|
||||
if (hasOffsets) {
|
||||
assert startOffset != -1;
|
||||
assert endOffset != -1;
|
||||
|
||||
BytesRef payload = docsAndPositionsEnum.getPayload();
|
||||
|
||||
if (payloadProcessor != null && payload != null) {
|
||||
// to not violate the D&P api, we must give the processor a private copy
|
||||
payload = BytesRef.deepCopyOf(payload);
|
||||
payloadProcessor.processPayload(payload);
|
||||
if (payload.length == 0) {
|
||||
// don't let PayloadProcessors corrumpt the index
|
||||
payload = null;
|
||||
}
|
||||
}
|
||||
|
||||
assert !hasPositions || pos >= 0;
|
||||
addPosition(pos, startOffset, endOffset);
|
||||
}
|
||||
} else {
|
||||
if (!startedField) {
|
||||
assert numTerms > 0;
|
||||
startField(fieldInfo, numTerms, hasPositions, hasOffsets);
|
||||
startTerm(termsEnum.term(), freq);
|
||||
startedField = true;
|
||||
addPosition(pos, startOffset, endOffset, payload);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -954,11 +954,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
|||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayload() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef getPayload() {
|
||||
return null;
|
||||
|
@ -1226,10 +1221,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
|||
if (DEBUG) {
|
||||
System.out.println(" FPR.nextDoc");
|
||||
}
|
||||
if (indexHasPayloads) {
|
||||
payloadByteUpto += payloadLength;
|
||||
payloadLength = 0;
|
||||
}
|
||||
while (true) {
|
||||
if (DEBUG) {
|
||||
System.out.println(" docUpto=" + docUpto + " (of df=" + docFreq + ") docBufferUpto=" + docBufferUpto);
|
||||
|
@ -1255,7 +1246,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
|||
System.out.println(" return doc=" + doc + " freq=" + freq + " posPendingCount=" + posPendingCount);
|
||||
}
|
||||
position = 0;
|
||||
payloadLength = 0;
|
||||
lastStartOffset = 0;
|
||||
return doc;
|
||||
}
|
||||
|
@ -1355,12 +1345,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
|||
if (DEBUG) {
|
||||
System.out.println(" return doc=" + accum);
|
||||
}
|
||||
if (indexHasPayloads) {
|
||||
payloadByteUpto += payloadLength;
|
||||
payloadLength = 0;
|
||||
}
|
||||
position = 0;
|
||||
payloadLength = 0;
|
||||
lastStartOffset = 0;
|
||||
return doc = accum;
|
||||
} else {
|
||||
|
@ -1433,7 +1418,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
position = 0;
|
||||
payloadLength = 0;
|
||||
lastStartOffset = 0;
|
||||
}
|
||||
|
||||
|
@ -1461,16 +1445,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
|||
posBufferUpto = BLOCK_SIZE;
|
||||
}
|
||||
|
||||
if (indexHasPayloads) {
|
||||
if (DEBUG) {
|
||||
if (payloadLength != 0) {
|
||||
System.out.println(" skip unread payload length=" + payloadLength);
|
||||
}
|
||||
}
|
||||
payloadByteUpto += payloadLength;
|
||||
payloadLength = 0;
|
||||
}
|
||||
|
||||
if (posPendingCount > freq) {
|
||||
skipPositions();
|
||||
posPendingCount = freq;
|
||||
|
@ -1484,6 +1458,10 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
|||
|
||||
if (indexHasPayloads) {
|
||||
payloadLength = payloadLengthBuffer[posBufferUpto];
|
||||
payload.bytes = payloadBytes;
|
||||
payload.offset = payloadByteUpto;
|
||||
payload.length = payloadLength;
|
||||
payloadByteUpto += payloadLength;
|
||||
}
|
||||
|
||||
if (indexHasOffsets) {
|
||||
|
@ -1510,22 +1488,16 @@ public final class BlockPostingsReader extends PostingsReaderBase {
|
|||
return endOffset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayload() {
|
||||
return payloadLength != 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef getPayload() {
|
||||
if (DEBUG) {
|
||||
System.out.println(" FPR.getPayload payloadLength=" + payloadLength + " payloadByteUpto=" + payloadByteUpto);
|
||||
}
|
||||
payload.bytes = payloadBytes;
|
||||
payload.offset = payloadByteUpto;
|
||||
payload.length = payloadLength;
|
||||
payloadByteUpto += payloadLength;
|
||||
payloadLength = 0;
|
||||
return payload;
|
||||
if (payloadLength == 0) {
|
||||
return null;
|
||||
} else {
|
||||
return payload;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.io.IOException;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
|
@ -35,7 +36,6 @@ import org.apache.lucene.codecs.TermsConsumer;
|
|||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldsEnum;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
|
@ -44,7 +44,6 @@ import org.apache.lucene.index.TermsEnum;
|
|||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.IndexInput;
|
||||
import org.apache.lucene.store.IndexOutput;
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.FuzzySet;
|
||||
|
@ -187,9 +186,8 @@ public class BloomFilteringPostingsFormat extends PostingsFormat {
|
|||
|
||||
}
|
||||
|
||||
public FieldsEnum iterator() throws IOException {
|
||||
return new BloomFilteredFieldsEnum(delegateFieldsProducer.iterator(),
|
||||
bloomsByFieldName);
|
||||
public Iterator<String> iterator() {
|
||||
return delegateFieldsProducer.iterator();
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
|
@ -217,44 +215,6 @@ public class BloomFilteringPostingsFormat extends PostingsFormat {
|
|||
return delegateFieldsProducer.getUniqueTermCount();
|
||||
}
|
||||
|
||||
// Not all fields in a segment may be subject to a bloom filter. This class
|
||||
// wraps Terms objects appropriately if a filtering request is present
|
||||
class BloomFilteredFieldsEnum extends FieldsEnum {
|
||||
private FieldsEnum delegateFieldsEnum;
|
||||
private HashMap<String,FuzzySet> bloomsByFieldName;
|
||||
private String currentFieldName;
|
||||
|
||||
public BloomFilteredFieldsEnum(FieldsEnum iterator,
|
||||
HashMap<String,FuzzySet> bloomsByFieldName) {
|
||||
this.delegateFieldsEnum = iterator;
|
||||
this.bloomsByFieldName = bloomsByFieldName;
|
||||
}
|
||||
|
||||
public AttributeSource attributes() {
|
||||
return delegateFieldsEnum.attributes();
|
||||
}
|
||||
|
||||
public String next() throws IOException {
|
||||
currentFieldName = delegateFieldsEnum.next();
|
||||
return currentFieldName;
|
||||
}
|
||||
|
||||
public Terms terms() throws IOException {
|
||||
FuzzySet filter = bloomsByFieldName.get(currentFieldName);
|
||||
if (filter == null) {
|
||||
return delegateFieldsEnum.terms();
|
||||
} else {
|
||||
Terms result = delegateFieldsEnum.terms();
|
||||
if (result == null) {
|
||||
return null;
|
||||
}
|
||||
// wrap the terms object with a bloom filter
|
||||
return new BloomFilteredTerms(result, filter);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class BloomFilteredTerms extends Terms {
|
||||
private Terms delegateTerms;
|
||||
private FuzzySet filter;
|
||||
|
@ -314,6 +274,21 @@ public class BloomFilteringPostingsFormat extends PostingsFormat {
|
|||
public int getDocCount() throws IOException {
|
||||
return delegateTerms.getDocCount();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasOffsets() {
|
||||
return delegateTerms.hasOffsets();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPositions() {
|
||||
return delegateTerms.hasPositions();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayloads() {
|
||||
return delegateTerms.hasPayloads();
|
||||
}
|
||||
}
|
||||
|
||||
class BloomFilteredTermsEnum extends TermsEnum {
|
||||
|
|
|
@ -873,12 +873,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
* payload was indexed. */
|
||||
@Override
|
||||
public BytesRef getPayload() throws IOException {
|
||||
throw new IOException("No payloads exist for this field!");
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayload() {
|
||||
return false;
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1152,28 +1147,26 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
|
|||
@Override
|
||||
public BytesRef getPayload() throws IOException {
|
||||
if (storePayloads) {
|
||||
if (payloadLength <= 0) {
|
||||
return null;
|
||||
}
|
||||
assert lazyProxPointer == -1;
|
||||
assert posPendingCount < freq;
|
||||
if (!payloadPending) {
|
||||
throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
|
||||
}
|
||||
if (payloadLength > payload.bytes.length) {
|
||||
payload.grow(payloadLength);
|
||||
}
|
||||
|
||||
if (payloadPending) {
|
||||
if (payloadLength > payload.bytes.length) {
|
||||
payload.grow(payloadLength);
|
||||
}
|
||||
|
||||
proxIn.readBytes(payload.bytes, 0, payloadLength);
|
||||
payload.length = payloadLength;
|
||||
payloadPending = false;
|
||||
proxIn.readBytes(payload.bytes, 0, payloadLength);
|
||||
payload.length = payloadLength;
|
||||
payloadPending = false;
|
||||
}
|
||||
|
||||
return payload;
|
||||
} else {
|
||||
throw new IOException("No payloads exist for this field!");
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayload() {
|
||||
return payloadPending && payloadLength > 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -67,33 +67,46 @@ import org.apache.lucene.store.IOContext;
  * <li><a name="tvf" id="tvf"></a>
  * <p>The Field or .tvf file.</p>
  * <p>This file contains, for each field that has a term vector stored, a list of
- * the terms, their frequencies and, optionally, position and offset
+ * the terms, their frequencies and, optionally, position, offset, and payload
  * information.</p>
- * <p>Field (.tvf) --> Header,<NumTerms, Position/Offset, TermFreqs>
+ * <p>Field (.tvf) --> Header,<NumTerms, Flags, TermFreqs>
  * <sup>NumFields</sup></p>
  * <ul>
  *   <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
  *   <li>NumTerms --> {@link DataOutput#writeVInt VInt}</li>
- *   <li>Position/Offset --> {@link DataOutput#writeByte Byte}</li>
- *   <li>TermFreqs --> <TermText, TermFreq, Positions?, Offsets?>
+ *   <li>Flags --> {@link DataOutput#writeByte Byte}</li>
+ *   <li>TermFreqs --> <TermText, TermFreq, Positions?, PayloadData?, Offsets?>
  *       <sup>NumTerms</sup></li>
  *   <li>TermText --> <PrefixLength, Suffix></li>
  *   <li>PrefixLength --> {@link DataOutput#writeVInt VInt}</li>
  *   <li>Suffix --> {@link DataOutput#writeString String}</li>
  *   <li>TermFreq --> {@link DataOutput#writeVInt VInt}</li>
- *   <li>Positions --> <{@link DataOutput#writeVInt VInt}><sup>TermFreq</sup></li>
+ *   <li>Positions --> <PositionDelta PayloadLength?><sup>TermFreq</sup></li>
+ *   <li>PositionDelta --> {@link DataOutput#writeVInt VInt}</li>
+ *   <li>PayloadLength --> {@link DataOutput#writeVInt VInt}</li>
+ *   <li>PayloadData --> {@link DataOutput#writeByte Byte}<sup>NumPayloadBytes</sup></li>
  *   <li>Offsets --> <{@link DataOutput#writeVInt VInt}, {@link DataOutput#writeVInt VInt}><sup>TermFreq</sup></li>
  * </ul>
  * <p>Notes:</p>
  * <ul>
- * <li>Position/Offset byte stores whether this term vector has position or offset
+ * <li>Flags byte stores whether this term vector has position, offset, payload.
  *     information stored.</li>
  * <li>Term byte prefixes are shared. The PrefixLength is the number of initial
  *     bytes from the previous term which must be pre-pended to a term's suffix
  *     in order to form the term's bytes. Thus, if the previous term's text was "bone"
  *     and the term is "boy", the PrefixLength is two and the suffix is "y".</li>
- * <li>Positions are stored as delta encoded VInts. This means we only store the
- *     difference of the current position from the last position</li>
+ * <li>PositionDelta is, if payloads are disabled for the term's field, the
+ *     difference between the position of the current occurrence in the document and
+ *     the previous occurrence (or zero, if this is the first occurrence in this
+ *     document). If payloads are enabled for the term's field, then PositionDelta/2
+ *     is the difference between the current and the previous position. If payloads
+ *     are enabled and PositionDelta is odd, then PayloadLength is stored, indicating
+ *     the length of the payload at the current term position.</li>
+ * <li>PayloadData is metadata associated with a term position. If
+ *     PayloadLength is stored at the current position, then it indicates the length
+ *     of this payload. If PayloadLength is not stored, then this payload has the same
+ *     length as the payload at the previous position. PayloadData encodes the
+ *     concatenated bytes for all of a terms occurrences.</li>
  * <li>Offsets are stored as delta encoded VInts. The first VInt is the
  *     startOffset, the second is the endOffset.</li>
  * </ul>

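Because PositionDelta now multiplexes a "payload present" bit into its low bit, a small decoding sketch may help. It follows the documented encoding above and the same shift-and-test idiom `TermVectorsWriter.addProx` uses earlier in this commit; the class name and the assumption that the caller hands in a positioned `DataInput` plus the term frequency are illustrative, and the payload bytes themselves live in the separate, concatenated PayloadData component:

import java.io.IOException;

import org.apache.lucene.store.DataInput;

public class TvfPositionsSketch {
  // Decodes the Positions component for one term of a payload-enabled field:
  // <PositionDelta PayloadLength?> repeated termFreq times.
  public static void decode(DataInput in, int termFreq) throws IOException {
    int position = 0;
    int payloadLength = 0;                  // reused when PayloadLength is omitted
    for (int i = 0; i < termFreq; i++) {
      int code = in.readVInt();
      position += code >>> 1;               // PositionDelta/2 is the position gap
      if ((code & 1) != 0) {
        payloadLength = in.readVInt();      // odd delta: this occurrence stores a new length
      }
      System.out.println("position=" + position + " payloadLength=" + payloadLength);
    }
  }
}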
@@ -21,7 +21,9 @@ import java.io.IOException;
 import java.util.Arrays;
 import java.util.Comparator;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.Map;
+import java.util.NoSuchElementException;
 
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.TermVectorsReader;
@@ -30,7 +32,6 @@ import org.apache.lucene.index.DocsEnum;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.Fields;
-import org.apache.lucene.index.FieldsEnum;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentInfo;
 import org.apache.lucene.index.Terms;
@@ -55,6 +56,8 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
 
   static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;
 
+  static final byte STORE_PAYLOAD_WITH_TERMVECTOR = 0x4;
+
   /** Extension of vectors fields file */
   static final String VECTORS_FIELDS_EXTENSION = "tvf";
 
@@ -68,8 +71,10 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
   static final String CODEC_NAME_DOCS = "Lucene40TermVectorsDocs";
   static final String CODEC_NAME_INDEX = "Lucene40TermVectorsIndex";
 
-  static final int VERSION_START = 0;
-  static final int VERSION_CURRENT = VERSION_START;
+  static final int VERSION_NO_PAYLOADS = 0;
+  static final int VERSION_PAYLOADS = 1;
+  static final int VERSION_START = VERSION_NO_PAYLOADS;
+  static final int VERSION_CURRENT = VERSION_PAYLOADS;
 
   static final long HEADER_LENGTH_FIELDS = CodecUtil.headerLength(CODEC_NAME_FIELDS);
   static final long HEADER_LENGTH_DOCS = CodecUtil.headerLength(CODEC_NAME_DOCS);
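The new STORE_PAYLOAD_WITH_TERMVECTOR bit joins the existing position and offset bits in the single per-field flags byte. As a minimal illustration of how that byte is built and read back (packFlags and payloadsStored are made-up helper names, not Lucene methods):

  // Writer side: OR the per-field options into one flags byte.
  static byte packFlags(boolean positions, boolean offsets, boolean payloads) {
    byte bits = 0;
    if (positions) bits |= STORE_POSITIONS_WITH_TERMVECTOR;
    if (offsets)   bits |= STORE_OFFSET_WITH_TERMVECTOR;    // 0x2
    if (payloads)  bits |= STORE_PAYLOAD_WITH_TERMVECTOR;   // 0x4
    return bits;
  }

  // Reader side: mask the stored byte back into booleans.
  static boolean payloadsStored(byte bits) {
    return (bits & STORE_PAYLOAD_WITH_TERMVECTOR) != 0;
  }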
@ -245,9 +250,8 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
|||
}
|
||||
|
||||
@Override
|
||||
public FieldsEnum iterator() throws IOException {
|
||||
|
||||
return new FieldsEnum() {
|
||||
public Iterator<String> iterator() {
|
||||
return new Iterator<String>() {
|
||||
private int fieldUpto;
|
||||
|
||||
@Override
|
||||
|
@ -255,13 +259,18 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
|||
if (fieldNumbers != null && fieldUpto < fieldNumbers.length) {
|
||||
return fieldInfos.fieldInfo(fieldNumbers[fieldUpto++]).name;
|
||||
} else {
|
||||
return null;
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Terms terms() throws IOException {
|
||||
return TVFields.this.terms(fieldInfos.fieldInfo(fieldNumbers[fieldUpto-1]).name);
|
||||
public boolean hasNext() {
|
||||
return fieldNumbers != null && fieldUpto < fieldNumbers.length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
@ -296,10 +305,17 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
|||
private class TVTerms extends Terms {
|
||||
private final int numTerms;
|
||||
private final long tvfFPStart;
|
||||
private final boolean storePositions;
|
||||
private final boolean storeOffsets;
|
||||
private final boolean storePayloads;
|
||||
|
||||
public TVTerms(long tvfFP) throws IOException {
|
||||
tvf.seek(tvfFP);
|
||||
numTerms = tvf.readVInt();
|
||||
final byte bits = tvf.readByte();
|
||||
storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
|
||||
storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
|
||||
storePayloads = (bits & STORE_PAYLOAD_WITH_TERMVECTOR) != 0;
|
||||
tvfFPStart = tvf.getFilePointer();
|
||||
}
|
||||
|
||||
|
@ -314,7 +330,7 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
|||
} else {
|
||||
termsEnum = new TVTermsEnum();
|
||||
}
|
||||
termsEnum.reset(numTerms, tvfFPStart);
|
||||
termsEnum.reset(numTerms, tvfFPStart, storePositions, storeOffsets, storePayloads);
|
||||
return termsEnum;
|
||||
}
|
||||
|
||||
|
@ -345,6 +361,21 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
|||
// this...? I guess codec could buffer and re-sort...
|
||||
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasOffsets() {
|
||||
return storeOffsets;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPositions() {
|
||||
return storePositions;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayloads() {
|
||||
return storePayloads;
|
||||
}
|
||||
}
|
||||
|
||||
private class TVTermsEnum extends TermsEnum {
|
||||
|
@ -357,11 +388,17 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
|||
private BytesRef term = new BytesRef();
|
||||
private boolean storePositions;
|
||||
private boolean storeOffsets;
|
||||
private boolean storePayloads;
|
||||
private long tvfFP;
|
||||
|
||||
private int[] positions;
|
||||
private int[] startOffsets;
|
||||
private int[] endOffsets;
|
||||
|
||||
// one shared byte[] for any term's payloads
|
||||
private int[] payloadOffsets;
|
||||
private int lastPayloadLength;
|
||||
private byte[] payloadData;
|
||||
|
||||
// NOTE: tvf is pre-positioned by caller
|
||||
public TVTermsEnum() {
|
||||
|
@ -373,17 +410,20 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
|||
return tvf == origTVF;
|
||||
}
|
||||
|
||||
public void reset(int numTerms, long tvfFPStart) throws IOException {
|
||||
public void reset(int numTerms, long tvfFPStart, boolean storePositions, boolean storeOffsets, boolean storePayloads) throws IOException {
|
||||
this.numTerms = numTerms;
|
||||
this.storePositions = storePositions;
|
||||
this.storeOffsets = storeOffsets;
|
||||
this.storePayloads = storePayloads;
|
||||
nextTerm = 0;
|
||||
tvf.seek(tvfFPStart);
|
||||
final byte bits = tvf.readByte();
|
||||
storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
|
||||
storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
|
||||
tvfFP = 1+tvfFPStart;
|
||||
positions = null;
|
||||
startOffsets = null;
|
||||
endOffsets = null;
|
||||
payloadOffsets = null;
|
||||
payloadData = null;
|
||||
lastPayloadLength = -1;
|
||||
}
|
||||
|
||||
// NOTE: slow! (linear scan)
|
||||
|
@ -430,7 +470,26 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
|||
tvf.readBytes(term.bytes, start, deltaLen);
|
||||
freq = tvf.readVInt();
|
||||
|
||||
if (storePositions) {
|
||||
if (storePayloads) {
|
||||
positions = new int[freq];
|
||||
payloadOffsets = new int[freq];
|
||||
int totalPayloadLength = 0;
|
||||
int pos = 0;
|
||||
for(int posUpto=0;posUpto<freq;posUpto++) {
|
||||
int code = tvf.readVInt();
|
||||
pos += code >>> 1;
|
||||
positions[posUpto] = pos;
|
||||
if ((code & 1) != 0) {
|
||||
// length change
|
||||
lastPayloadLength = tvf.readVInt();
|
||||
}
|
||||
payloadOffsets[posUpto] = totalPayloadLength;
|
||||
totalPayloadLength += lastPayloadLength;
|
||||
assert totalPayloadLength >= 0;
|
||||
}
|
||||
payloadData = new byte[totalPayloadLength];
|
||||
tvf.readBytes(payloadData, 0, payloadData.length);
|
||||
} else if (storePositions /* no payloads */) {
|
||||
// TODO: we could maybe reuse last array, if we can
|
||||
// somehow be careful about consumer never using two
|
||||
// D&PEnums at once...
|
||||
|
@ -502,14 +561,12 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
|||
} else {
|
||||
docsAndPositionsEnum = new TVDocsAndPositionsEnum();
|
||||
}
|
||||
docsAndPositionsEnum.reset(liveDocs, positions, startOffsets, endOffsets);
|
||||
docsAndPositionsEnum.reset(liveDocs, positions, startOffsets, endOffsets, payloadOffsets, payloadData);
|
||||
return docsAndPositionsEnum;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Comparator<BytesRef> getComparator() {
|
||||
// TODO: really indexer hardwires
|
||||
// this...? I guess codec could buffer and re-sort...
|
||||
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
}
|
||||
}
|
||||
|
@ -567,6 +624,9 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
|||
private int[] positions;
|
||||
private int[] startOffsets;
|
||||
private int[] endOffsets;
|
||||
private int[] payloadOffsets;
|
||||
private BytesRef payload = new BytesRef();
|
||||
private byte[] payloadBytes;
|
||||
|
||||
@Override
|
||||
public int freq() throws IOException {
|
||||
|
@ -602,11 +662,13 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
|||
}
|
||||
}
|
||||
|
||||
public void reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets) {
|
||||
public void reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets, int[] payloadLengths, byte[] payloadBytes) {
|
||||
this.liveDocs = liveDocs;
|
||||
this.positions = positions;
|
||||
this.startOffsets = startOffsets;
|
||||
this.endOffsets = endOffsets;
|
||||
this.payloadOffsets = payloadLengths;
|
||||
this.payloadBytes = payloadBytes;
|
||||
this.doc = -1;
|
||||
didNext = false;
|
||||
nextPos = 0;
|
||||
|
@ -614,12 +676,19 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
|
|||
|
||||
@Override
|
||||
public BytesRef getPayload() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayload() {
|
||||
return false;
|
||||
if (payloadOffsets == null) {
|
||||
return null;
|
||||
} else {
|
||||
int off = payloadOffsets[nextPos-1];
|
||||
int end = nextPos == payloadOffsets.length ? payloadBytes.length : payloadOffsets[nextPos];
|
||||
if (end - off == 0) {
|
||||
return null;
|
||||
}
|
||||
payload.bytes = payloadBytes;
|
||||
payload.offset = off;
|
||||
payload.length = end - off;
|
||||
return payload;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -106,12 +106,14 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
|
|||
private String lastFieldName;
|
||||
|
||||
@Override
|
||||
public void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets) throws IOException {
|
||||
public void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets, boolean payloads) throws IOException {
|
||||
assert lastFieldName == null || info.name.compareTo(lastFieldName) > 0: "fieldName=" + info.name + " lastFieldName=" + lastFieldName;
|
||||
lastFieldName = info.name;
|
||||
this.positions = positions;
|
||||
this.offsets = offsets;
|
||||
this.payloads = payloads;
|
||||
lastTerm.length = 0;
|
||||
lastPayloadLength = -1; // force first payload to write its length
|
||||
fps[fieldCount++] = tvf.getFilePointer();
|
||||
tvd.writeVInt(info.number);
|
||||
tvf.writeVInt(numTerms);
|
||||
|
@ -120,6 +122,8 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
|
|||
bits |= Lucene40TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
|
||||
if (offsets)
|
||||
bits |= Lucene40TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
|
||||
if (payloads)
|
||||
bits |= Lucene40TermVectorsReader.STORE_PAYLOAD_WITH_TERMVECTOR;
|
||||
tvf.writeByte(bits);
|
||||
|
||||
assert fieldCount <= numVectorFields;
|
||||
|
@ -138,10 +142,12 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
|
|||
// we also don't buffer during bulk merges.
|
||||
private int offsetStartBuffer[] = new int[10];
|
||||
private int offsetEndBuffer[] = new int[10];
|
||||
private int offsetIndex = 0;
|
||||
private int offsetFreq = 0;
|
||||
private BytesRef payloadData = new BytesRef(10);
|
||||
private int bufferedIndex = 0;
|
||||
private int bufferedFreq = 0;
|
||||
private boolean positions = false;
|
||||
private boolean offsets = false;
|
||||
private boolean payloads = false;
|
||||
|
||||
@Override
|
||||
public void startTerm(BytesRef term, int freq) throws IOException {
|
||||
|
@ -158,20 +164,40 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
|
|||
// we might need to buffer if its a non-bulk merge
|
||||
offsetStartBuffer = ArrayUtil.grow(offsetStartBuffer, freq);
|
||||
offsetEndBuffer = ArrayUtil.grow(offsetEndBuffer, freq);
|
||||
offsetIndex = 0;
|
||||
offsetFreq = freq;
|
||||
}
|
||||
bufferedIndex = 0;
|
||||
bufferedFreq = freq;
|
||||
payloadData.length = 0;
|
||||
}
|
||||
|
||||
int lastPosition = 0;
|
||||
int lastOffset = 0;
|
||||
int lastPayloadLength = -1; // force first payload to write its length
|
||||
|
||||
BytesRef scratch = new BytesRef(); // used only by this optimized flush below
|
||||
|
||||
@Override
|
||||
public void addProx(int numProx, DataInput positions, DataInput offsets) throws IOException {
|
||||
// TODO: technically we could just copy bytes and not re-encode if we knew the length...
|
||||
if (positions != null) {
|
||||
if (payloads) {
|
||||
// TODO, maybe overkill and just call super.addProx() in this case?
|
||||
// we do avoid buffering the offsets in RAM though.
|
||||
for (int i = 0; i < numProx; i++) {
|
||||
tvf.writeVInt(positions.readVInt());
|
||||
int code = positions.readVInt();
|
||||
if ((code & 1) == 1) {
|
||||
int length = positions.readVInt();
|
||||
scratch.grow(length);
|
||||
scratch.length = length;
|
||||
positions.readBytes(scratch.bytes, scratch.offset, scratch.length);
|
||||
writePosition(code >>> 1, scratch);
|
||||
} else {
|
||||
writePosition(code >>> 1, null);
|
||||
}
|
||||
}
|
||||
tvf.writeBytes(payloadData.bytes, payloadData.offset, payloadData.length);
|
||||
} else if (positions != null) {
|
||||
// pure positions, no payloads
|
||||
for (int i = 0; i < numProx; i++) {
|
||||
tvf.writeVInt(positions.readVInt() >>> 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -184,28 +210,36 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void addPosition(int position, int startOffset, int endOffset) throws IOException {
|
||||
if (positions && offsets) {
|
||||
public void addPosition(int position, int startOffset, int endOffset, BytesRef payload) throws IOException {
|
||||
if (positions && (offsets || payloads)) {
|
||||
// write position delta
|
||||
tvf.writeVInt(position - lastPosition);
|
||||
writePosition(position - lastPosition, payload);
|
||||
lastPosition = position;
|
||||
|
||||
// buffer offsets
|
||||
offsetStartBuffer[offsetIndex] = startOffset;
|
||||
offsetEndBuffer[offsetIndex] = endOffset;
|
||||
offsetIndex++;
|
||||
if (offsets) {
|
||||
offsetStartBuffer[bufferedIndex] = startOffset;
|
||||
offsetEndBuffer[bufferedIndex] = endOffset;
|
||||
}
|
||||
|
||||
bufferedIndex++;
|
||||
|
||||
// dump buffer if we are done
|
||||
if (offsetIndex == offsetFreq) {
|
||||
for (int i = 0; i < offsetIndex; i++) {
|
||||
tvf.writeVInt(offsetStartBuffer[i] - lastOffset);
|
||||
tvf.writeVInt(offsetEndBuffer[i] - offsetStartBuffer[i]);
|
||||
lastOffset = offsetEndBuffer[i];
|
||||
if (bufferedIndex == bufferedFreq) {
|
||||
if (payloads) {
|
||||
tvf.writeBytes(payloadData.bytes, payloadData.offset, payloadData.length);
|
||||
}
|
||||
for (int i = 0; i < bufferedIndex; i++) {
|
||||
if (offsets) {
|
||||
tvf.writeVInt(offsetStartBuffer[i] - lastOffset);
|
||||
tvf.writeVInt(offsetEndBuffer[i] - offsetStartBuffer[i]);
|
||||
lastOffset = offsetEndBuffer[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (positions) {
|
||||
// write position delta
|
||||
tvf.writeVInt(position - lastPosition);
|
||||
writePosition(position - lastPosition, payload);
|
||||
lastPosition = position;
|
||||
} else if (offsets) {
|
||||
// write offset deltas
|
||||
|
@@ -214,6 +248,30 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
       lastOffset = endOffset;
     }
   }
+
+  private void writePosition(int delta, BytesRef payload) throws IOException {
+    if (payloads) {
+      int payloadLength = payload == null ? 0 : payload.length;
+
+      if (payloadLength != lastPayloadLength) {
+        lastPayloadLength = payloadLength;
+        tvf.writeVInt((delta<<1)|1);
+        tvf.writeVInt(payloadLength);
+      } else {
+        tvf.writeVInt(delta << 1);
+      }
+      if (payloadLength > 0) {
+        if (payloadLength + payloadData.length < 0) {
+          // we overflowed the payload buffer, just throw UOE
+          // having > Integer.MAX_VALUE bytes of payload for a single term in a single doc is nuts.
+          throw new UnsupportedOperationException("A term cannot have more than Integer.MAX_VALUE bytes of payload data in a single document");
+        }
+        payloadData.append(payload);
+      }
+    } else {
+      tvf.writeVInt(delta);
+    }
+  }
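As a quick worked example of this encoding (illustrative numbers, not the output of any real tool): suppose a term occurs at positions 4, 7 and 9 with payload lengths 5, 5 and 0. The position deltas are 4, 3 and 2, and the VInts emitted are:

  writePosition(4, payloadOf5) -> writeVInt((4<<1)|1) = 9, writeVInt(5)  // length changed (-1 -> 5)
  writePosition(3, payloadOf5) -> writeVInt(3<<1)     = 6                // same length, nothing extra
  writePosition(2, null)       -> writeVInt((2<<1)|1) = 5, writeVInt(0)  // length changed (5 -> 0)

After the term's last position is added, the ten buffered payload bytes are flushed in one writeBytes call, matching the PayloadData layout described in the format javadoc.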
|
||||
|
||||
@Override
|
||||
public void abort() {
|
||||
|
@ -255,7 +313,14 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
|
|||
|
||||
int idx = 0;
|
||||
int numDocs = 0;
|
||||
for (final AtomicReader reader : mergeState.readers) {
|
||||
for (int i = 0; i < mergeState.readers.size(); i++) {
|
||||
final AtomicReader reader = mergeState.readers.get(i);
|
||||
// set PayloadProcessor
|
||||
if (mergeState.payloadProcessorProvider != null) {
|
||||
mergeState.currentReaderPayloadProcessor = mergeState.readerPayloadProcessor[i];
|
||||
} else {
|
||||
mergeState.currentReaderPayloadProcessor = null;
|
||||
}
|
||||
final SegmentReader matchingSegmentReader = mergeState.matchingSegmentReaders[idx++];
|
||||
Lucene40TermVectorsReader matchingVectorsReader = null;
|
||||
if (matchingSegmentReader != null) {
|
||||
|
@ -288,8 +353,8 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
|
|||
final int maxDoc = reader.maxDoc();
|
||||
final Bits liveDocs = reader.getLiveDocs();
|
||||
int totalNumDocs = 0;
|
||||
if (matchingVectorsReader != null) {
|
||||
// We can bulk-copy because the fieldInfos are "congruent"
|
||||
if (matchingVectorsReader != null && mergeState.currentReaderPayloadProcessor == null) {
|
||||
// We can bulk-copy because the fieldInfos are "congruent" and there is no payload processor
|
||||
for (int docNum = 0; docNum < maxDoc;) {
|
||||
if (!liveDocs.get(docNum)) {
|
||||
// skip deleted docs
|
||||
|
@ -324,7 +389,7 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
|
|||
// NOTE: it's very important to first assign to vectors then pass it to
|
||||
// termVectorsWriter.addAllDocVectors; see LUCENE-1282
|
||||
Fields vectors = reader.getTermVectors(docNum);
|
||||
addAllDocVectors(vectors, mergeState.fieldInfos);
|
||||
addAllDocVectors(vectors, mergeState);
|
||||
totalNumDocs++;
|
||||
mergeState.checkAbort.work(300);
|
||||
}
|
||||
|
@ -339,8 +404,8 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
|
|||
int rawDocLengths2[])
|
||||
throws IOException {
|
||||
final int maxDoc = reader.maxDoc();
|
||||
if (matchingVectorsReader != null) {
|
||||
// We can bulk-copy because the fieldInfos are "congruent"
|
||||
if (matchingVectorsReader != null && mergeState.currentReaderPayloadProcessor == null) {
|
||||
// We can bulk-copy because the fieldInfos are "congruent" and there is no payload processor
|
||||
int docCount = 0;
|
||||
while (docCount < maxDoc) {
|
||||
int len = Math.min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
|
||||
|
@ -354,7 +419,7 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
|
|||
// NOTE: it's very important to first assign to vectors then pass it to
|
||||
// termVectorsWriter.addAllDocVectors; see LUCENE-1282
|
||||
Fields vectors = reader.getTermVectors(docNum);
|
||||
addAllDocVectors(vectors, mergeState.fieldInfos);
|
||||
addAllDocVectors(vectors, mergeState);
|
||||
mergeState.checkAbort.work(300);
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -366,7 +366,7 @@ the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage
 factors need no longer be a single byte, they can be any DocValues
 {@link org.apache.lucene.index.DocValues.Type type}. Terms need not be unicode
 strings, they can be any byte sequence. Term offsets can optionally be indexed
-into the postings lists.</li>
+into the postings lists. Payloads can be stored in the term vectors.</li>
 </ul>
 <a name="Limitations" id="Limitations"></a>
 <h2>Limitations</h2>
|
||||
|
|
|
@ -32,7 +32,6 @@ import org.apache.lucene.index.DocsEnum;
|
|||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.index.FieldsEnum;
|
||||
import org.apache.lucene.index.OrdTermState;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
|
@ -44,6 +43,7 @@ import org.apache.lucene.store.RAMOutputStream;
|
|||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.UnmodifiableIterator;
|
||||
import org.apache.lucene.util.automaton.CompiledAutomaton;
|
||||
import org.apache.lucene.util.automaton.RunAutomaton;
|
||||
import org.apache.lucene.util.automaton.Transition;
|
||||
|
@ -124,36 +124,14 @@ public class DirectPostingsFormat extends PostingsFormat {
|
|||
private final Map<String,DirectField> fields = new TreeMap<String,DirectField>();
|
||||
|
||||
public DirectFields(SegmentReadState state, Fields fields, int minSkipCount, int lowFreqCutoff) throws IOException {
|
||||
FieldsEnum fieldsEnum = fields.iterator();
|
||||
String field;
|
||||
while ((field = fieldsEnum.next()) != null) {
|
||||
this.fields.put(field, new DirectField(state, field, fieldsEnum.terms(), minSkipCount, lowFreqCutoff));
|
||||
for (String field : fields) {
|
||||
this.fields.put(field, new DirectField(state, field, fields.terms(field), minSkipCount, lowFreqCutoff));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public FieldsEnum iterator() {
|
||||
|
||||
final Iterator<Map.Entry<String,DirectField>> iter = fields.entrySet().iterator();
|
||||
|
||||
return new FieldsEnum() {
|
||||
Map.Entry<String,DirectField> current;
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
if (iter.hasNext()) {
|
||||
current = iter.next();
|
||||
return current.getKey();
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Terms terms() {
|
||||
return current.getValue();
|
||||
}
|
||||
};
|
||||
public Iterator<String> iterator() {
|
||||
return new UnmodifiableIterator<String>(fields.keySet().iterator());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -348,9 +326,8 @@ public class DirectPostingsFormat extends PostingsFormat {
|
|||
scratch.add(docsAndPositionsEnum.endOffset());
|
||||
}
|
||||
if (hasPayloads) {
|
||||
final BytesRef payload;
|
||||
if (docsAndPositionsEnum.hasPayload()) {
|
||||
payload = docsAndPositionsEnum.getPayload();
|
||||
final BytesRef payload = docsAndPositionsEnum.getPayload();
|
||||
if (payload != null) {
|
||||
scratch.add(payload.length);
|
||||
ros.writeBytes(payload.bytes, payload.offset, payload.length);
|
||||
} else {
|
||||
|
@ -421,9 +398,8 @@ public class DirectPostingsFormat extends PostingsFormat {
|
|||
for(int pos=0;pos<freq;pos++) {
|
||||
positions[upto][posUpto] = docsAndPositionsEnum.nextPosition();
|
||||
if (hasPayloads) {
|
||||
if (docsAndPositionsEnum.hasPayload()) {
|
||||
BytesRef payload = docsAndPositionsEnum.getPayload();
|
||||
assert payload != null;
|
||||
BytesRef payload = docsAndPositionsEnum.getPayload();
|
||||
if (payload != null) {
|
||||
byte[] payloadBytes = new byte[payload.length];
|
||||
System.arraycopy(payload.bytes, payload.offset, payloadBytes, 0, payload.length);
|
||||
payloads[upto][pos] = payloadBytes;
|
||||
|
@ -635,6 +611,21 @@ public class DirectPostingsFormat extends PostingsFormat {
|
|||
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasOffsets() {
|
||||
return hasOffsets;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPositions() {
|
||||
return hasPos;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayloads() {
|
||||
return hasPayloads;
|
||||
}
|
||||
|
||||
private final class DirectTermsEnum extends TermsEnum {
|
||||
|
||||
private final BytesRef scratch = new BytesRef();
|
||||
|
@ -1791,18 +1782,12 @@ public class DirectPostingsFormat extends PostingsFormat {
|
|||
return docID;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayload() {
|
||||
return payloadLength > 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef getPayload() {
|
||||
if (payloadLength > 0) {
|
||||
payload.bytes = payloadBytes;
|
||||
payload.offset = lastPayloadOffset;
|
||||
payload.length = payloadLength;
|
||||
payloadLength = 0;
|
||||
return payload;
|
||||
} else {
|
||||
return null;
|
||||
|
@ -1995,7 +1980,6 @@ public class DirectPostingsFormat extends PostingsFormat {
|
|||
private int upto;
|
||||
private int docID = -1;
|
||||
private int posUpto;
|
||||
private boolean gotPayload;
|
||||
private int[] curPositions;
|
||||
|
||||
public HighFreqDocsAndPositionsEnum(Bits liveDocs, boolean hasOffsets) {
|
||||
|
@ -2065,7 +2049,6 @@ public class DirectPostingsFormat extends PostingsFormat {
|
|||
@Override
|
||||
public int nextPosition() {
|
||||
posUpto += posJump;
|
||||
gotPayload = false;
|
||||
return curPositions[posUpto];
|
||||
}
|
||||
|
||||
|
@ -2199,21 +2182,22 @@ public class DirectPostingsFormat extends PostingsFormat {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayload() {
|
||||
return !gotPayload && payloads != null && payloads[upto][posUpto/(hasOffsets ? 3 : 1)] != null;
|
||||
}
|
||||
|
||||
private final BytesRef payload = new BytesRef();
|
||||
|
||||
@Override
|
||||
public BytesRef getPayload() {
|
||||
final byte[] payloadBytes = payloads[upto][posUpto/(hasOffsets ? 3:1)];
|
||||
payload.bytes = payloadBytes;
|
||||
payload.length = payloadBytes.length;
|
||||
payload.offset = 0;
|
||||
gotPayload = true;
|
||||
return payload;
|
||||
if (payloads == null) {
|
||||
return null;
|
||||
} else {
|
||||
final byte[] payloadBytes = payloads[upto][posUpto/(hasOffsets ? 3:1)];
|
||||
if (payloadBytes == null) {
|
||||
return null;
|
||||
}
|
||||
payload.bytes = payloadBytes;
|
||||
payload.length = payloadBytes.length;
|
||||
payload.offset = 0;
|
||||
return payload;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -34,7 +34,6 @@ import org.apache.lucene.index.DocsEnum;
|
|||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.FieldsEnum;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
|
@ -49,6 +48,7 @@ import org.apache.lucene.util.ArrayUtil;
|
|||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
import org.apache.lucene.util.UnmodifiableIterator;
|
||||
import org.apache.lucene.util.fst.Builder;
|
||||
import org.apache.lucene.util.fst.ByteSequenceOutputs;
|
||||
import org.apache.lucene.util.fst.BytesRefFSTEnum;
|
||||
|
@ -446,7 +446,6 @@ public class MemoryPostingsFormat extends PostingsFormat {
|
|||
private int numDocs;
|
||||
private int posPending;
|
||||
private int payloadLength;
|
||||
private boolean payloadRetrieved;
|
||||
final boolean storeOffsets;
|
||||
int offsetLength;
|
||||
int startOffset;
|
||||
|
@ -484,7 +483,6 @@ public class MemoryPostingsFormat extends PostingsFormat {
|
|||
payloadLength = 0;
|
||||
this.numDocs = numDocs;
|
||||
posPending = 0;
|
||||
payloadRetrieved = false;
|
||||
startOffset = storeOffsets ? 0 : -1; // always return -1 if no offsets are stored
|
||||
offsetLength = 0;
|
||||
return this;
|
||||
|
@ -577,10 +575,6 @@ public class MemoryPostingsFormat extends PostingsFormat {
|
|||
payload.offset = in.getPosition();
|
||||
in.skipBytes(payloadLength);
|
||||
payload.length = payloadLength;
|
||||
// Necessary, in case caller changed the
|
||||
// payload.bytes from prior call:
|
||||
payload.bytes = buffer;
|
||||
payloadRetrieved = false;
|
||||
}
|
||||
|
||||
//System.out.println(" pos=" + pos + " payload=" + payload + " fp=" + in.getPosition());
|
||||
|
@ -599,13 +593,7 @@ public class MemoryPostingsFormat extends PostingsFormat {
|
|||
|
||||
@Override
|
||||
public BytesRef getPayload() {
|
||||
payloadRetrieved = true;
|
||||
return payload;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayload() {
|
||||
return !payloadRetrieved && payload.length > 0;
|
||||
return payload.length > 0 ? payload : null;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -834,6 +822,21 @@ public class MemoryPostingsFormat extends PostingsFormat {
|
|||
public Comparator<BytesRef> getComparator() {
|
||||
return BytesRef.getUTF8SortedAsUnicodeComparator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasOffsets() {
|
||||
return field.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPositions() {
|
||||
return field.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayloads() {
|
||||
return field.hasPayloads();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -859,24 +862,8 @@ public class MemoryPostingsFormat extends PostingsFormat {
|
|||
|
||||
return new FieldsProducer() {
|
||||
@Override
|
||||
public FieldsEnum iterator() {
|
||||
final Iterator<TermsReader> iter = fields.values().iterator();
|
||||
|
||||
return new FieldsEnum() {
|
||||
|
||||
private TermsReader current;
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
current = iter.next();
|
||||
return current.field.name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Terms terms() {
|
||||
return current;
|
||||
}
|
||||
};
|
||||
public Iterator<String> iterator() {
|
||||
return new UnmodifiableIterator<String>(fields.keySet().iterator());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -30,11 +30,11 @@ import org.apache.lucene.codecs.FieldsProducer;
|
|||
import org.apache.lucene.codecs.PostingsFormat;
|
||||
import org.apache.lucene.codecs.TermsConsumer;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldsEnum;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.SegmentWriteState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.UnmodifiableIterator;
|
||||
|
||||
/**
|
||||
* Enables per field format support.
|
||||
|
@ -197,34 +197,9 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
|
|||
}
|
||||
}
|
||||
|
||||
private final class FieldsIterator extends FieldsEnum {
|
||||
private final Iterator<String> it;
|
||||
private String current;
|
||||
|
||||
public FieldsIterator() {
|
||||
it = fields.keySet().iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
if (it.hasNext()) {
|
||||
current = it.next();
|
||||
} else {
|
||||
current = null;
|
||||
}
|
||||
|
||||
return current;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Terms terms() throws IOException {
|
||||
return fields.get(current).terms(current);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public FieldsEnum iterator() throws IOException {
|
||||
return new FieldsIterator();
|
||||
public Iterator<String> iterator() {
|
||||
return new UnmodifiableIterator<String>(fields.keySet().iterator());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -532,19 +532,13 @@ public class PulsingPostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayload() {
|
||||
return storePayloads && !payloadRetrieved && payloadLength > 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public BytesRef getPayload() throws IOException {
|
||||
//System.out.println("PR getPayload payloadLength=" + payloadLength + " this=" + this);
|
||||
if (payloadRetrieved) {
|
||||
throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
|
||||
}
|
||||
payloadRetrieved = true;
|
||||
if (payloadLength > 0) {
|
||||
return payload;
|
||||
} else if (storePayloads && payloadLength > 0) {
|
||||
payloadRetrieved = true;
|
||||
if (payload == null) {
|
||||
payload = new BytesRef(payloadLength);
|
||||
} else {
|
||||
|
|
|
@ -714,7 +714,11 @@ public class SepPostingsReader extends PostingsReaderBase {
|
|||
@Override
|
||||
public BytesRef getPayload() throws IOException {
|
||||
if (!payloadPending) {
|
||||
throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
|
||||
return null;
|
||||
}
|
||||
|
||||
if (pendingPayloadBytes == 0) {
|
||||
return payload;
|
||||
}
|
||||
|
||||
assert pendingPayloadBytes >= payloadLength;
|
||||
|
@ -731,15 +735,9 @@ public class SepPostingsReader extends PostingsReaderBase {
|
|||
}
|
||||
|
||||
payloadIn.readBytes(payload.bytes, 0, payloadLength);
|
||||
payloadPending = false;
|
||||
payload.length = payloadLength;
|
||||
pendingPayloadBytes = 0;
|
||||
return payload;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayload() {
|
||||
return payloadPending && payloadLength > 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,14 +20,17 @@ package org.apache.lucene.codecs.simpletext;
|
|||
import java.io.IOException;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.apache.lucene.codecs.FieldsProducer;
|
||||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.FieldInfo;
|
||||
import org.apache.lucene.index.FieldInfo.IndexOptions;
|
||||
import org.apache.lucene.index.FieldInfos;
|
||||
import org.apache.lucene.index.FieldsEnum;
|
||||
import org.apache.lucene.index.SegmentReadState;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
|
@ -40,6 +43,7 @@ import org.apache.lucene.util.IntsRef;
|
|||
import org.apache.lucene.util.OpenBitSet;
|
||||
import org.apache.lucene.util.StringHelper;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.UnmodifiableIterator;
|
||||
import org.apache.lucene.util.fst.Builder;
|
||||
import org.apache.lucene.util.fst.BytesRefFSTEnum;
|
||||
import org.apache.lucene.util.fst.FST;
|
||||
|
@ -48,7 +52,7 @@ import org.apache.lucene.util.fst.PositiveIntOutputs;
|
|||
import org.apache.lucene.util.fst.Util;
|
||||
|
||||
class SimpleTextFieldsReader extends FieldsProducer {
|
||||
|
||||
private final TreeMap<String,Long> fields;
|
||||
private final IndexInput in;
|
||||
private final FieldInfos fieldInfos;
|
||||
|
||||
|
@ -66,35 +70,22 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
in = state.dir.openInput(SimpleTextPostingsFormat.getPostingsFileName(state.segmentInfo.name, state.segmentSuffix), state.context);
|
||||
|
||||
fieldInfos = state.fieldInfos;
|
||||
fields = readFields((IndexInput)in.clone());
|
||||
}
|
||||
|
||||
private class SimpleTextFieldsEnum extends FieldsEnum {
|
||||
private final IndexInput in;
|
||||
private final BytesRef scratch = new BytesRef(10);
|
||||
private String current;
|
||||
|
||||
public SimpleTextFieldsEnum() {
|
||||
this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() throws IOException {
|
||||
while(true) {
|
||||
SimpleTextUtil.readLine(in, scratch);
|
||||
if (scratch.equals(END)) {
|
||||
current = null;
|
||||
return null;
|
||||
}
|
||||
if (StringHelper.startsWith(scratch, FIELD)) {
|
||||
return current = new String(scratch.bytes, scratch.offset + FIELD.length, scratch.length - FIELD.length, "UTF-8");
|
||||
}
|
||||
|
||||
private TreeMap<String,Long> readFields(IndexInput in) throws IOException {
|
||||
BytesRef scratch = new BytesRef(10);
|
||||
TreeMap<String,Long> fields = new TreeMap<String,Long>();
|
||||
|
||||
while (true) {
|
||||
SimpleTextUtil.readLine(in, scratch);
|
||||
if (scratch.equals(END)) {
|
||||
return fields;
|
||||
} else if (StringHelper.startsWith(scratch, FIELD)) {
|
||||
String fieldName = new String(scratch.bytes, scratch.offset + FIELD.length, scratch.length - FIELD.length, "UTF-8");
|
||||
fields.put(fieldName, in.getFilePointer());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Terms terms() throws IOException {
|
||||
return SimpleTextFieldsReader.this.terms(current);
|
||||
}
|
||||
}
|
||||
|
||||
private class SimpleTextTermsEnum extends TermsEnum {
|
||||
|
@ -471,18 +462,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
|
||||
@Override
|
||||
public BytesRef getPayload() {
|
||||
// Some tests rely on only being able to retrieve the
|
||||
// payload once
|
||||
try {
|
||||
return payload;
|
||||
} finally {
|
||||
payload = null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayload() {
|
||||
return payload != null;
|
||||
return payload;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -498,7 +478,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
|
||||
private class SimpleTextTerms extends Terms {
|
||||
private final long termsStart;
|
||||
private final IndexOptions indexOptions;
|
||||
private final FieldInfo fieldInfo;
|
||||
private long sumTotalTermFreq;
|
||||
private long sumDocFreq;
|
||||
private int docCount;
|
||||
|
@ -509,7 +489,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
|
||||
public SimpleTextTerms(String field, long termsStart) throws IOException {
|
||||
this.termsStart = termsStart;
|
||||
indexOptions = fieldInfos.fieldInfo(field).getIndexOptions();
|
||||
fieldInfo = fieldInfos.fieldInfo(field);
|
||||
loadTerms();
|
||||
}
|
||||
|
||||
|
@ -579,7 +559,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
@Override
|
||||
public TermsEnum iterator(TermsEnum reuse) throws IOException {
|
||||
if (fst != null) {
|
||||
return new SimpleTextTermsEnum(fst, indexOptions);
|
||||
return new SimpleTextTermsEnum(fst, fieldInfo.getIndexOptions());
|
||||
} else {
|
||||
return TermsEnum.EMPTY;
|
||||
}
|
||||
|
@ -597,7 +577,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
|
||||
@Override
|
||||
public long getSumTotalTermFreq() {
|
||||
return indexOptions == IndexOptions.DOCS_ONLY ? -1 : sumTotalTermFreq;
|
||||
return fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : sumTotalTermFreq;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -609,11 +589,26 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
public int getDocCount() throws IOException {
|
||||
return docCount;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasOffsets() {
|
||||
return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPositions() {
|
||||
return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayloads() {
|
||||
return fieldInfo.hasPayloads();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public FieldsEnum iterator() throws IOException {
|
||||
return new SimpleTextFieldsEnum();
|
||||
public Iterator<String> iterator() {
|
||||
return new UnmodifiableIterator<String>(fields.keySet().iterator());
|
||||
}
|
||||
|
||||
private final Map<String,Terms> termsCache = new HashMap<String,Terms>();
|
||||
|
@ -622,15 +617,13 @@ class SimpleTextFieldsReader extends FieldsProducer {
|
|||
synchronized public Terms terms(String field) throws IOException {
|
||||
Terms terms = termsCache.get(field);
|
||||
if (terms == null) {
|
||||
SimpleTextFieldsEnum fe = (SimpleTextFieldsEnum) iterator();
|
||||
String fieldUpto;
|
||||
while((fieldUpto = fe.next()) != null) {
|
||||
if (fieldUpto.equals(field)) {
|
||||
terms = new SimpleTextTerms(field, fe.in.getFilePointer());
|
||||
break;
|
||||
}
|
||||
Long fp = fields.get(field);
|
||||
if (fp == null) {
|
||||
return null;
|
||||
} else {
|
||||
terms = new SimpleTextTerms(field, fp);
|
||||
termsCache.put(field, terms);
|
||||
}
|
||||
termsCache.put(field, terms);
|
||||
}
|
||||
return terms;
|
||||
}
|
||||
|
|
|
@ -29,7 +29,6 @@ import org.apache.lucene.codecs.TermVectorsReader;
|
|||
import org.apache.lucene.index.DocsAndPositionsEnum;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.index.FieldsEnum;
|
||||
import org.apache.lucene.index.IndexFileNames;
|
||||
import org.apache.lucene.index.SegmentInfo;
|
||||
import org.apache.lucene.index.Terms;
|
||||
|
@ -45,6 +44,7 @@ import org.apache.lucene.util.CharsRef;
|
|||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.lucene.util.StringHelper;
|
||||
import org.apache.lucene.util.UnicodeUtil;
|
||||
import org.apache.lucene.util.UnmodifiableIterator;
|
||||
|
||||
import static org.apache.lucene.codecs.simpletext.SimpleTextTermVectorsWriter.*;
|
||||
|
||||
|
@ -126,11 +126,15 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
|
|||
assert StringHelper.startsWith(scratch, FIELDOFFSETS);
|
||||
boolean offsets = Boolean.parseBoolean(readString(FIELDOFFSETS.length, scratch));
|
||||
|
||||
readLine();
|
||||
assert StringHelper.startsWith(scratch, FIELDPAYLOADS);
|
||||
boolean payloads = Boolean.parseBoolean(readString(FIELDPAYLOADS.length, scratch));
|
||||
|
||||
readLine();
|
||||
assert StringHelper.startsWith(scratch, FIELDTERMCOUNT);
|
||||
int termCount = parseIntAt(FIELDTERMCOUNT.length);
|
||||
|
||||
SimpleTVTerms terms = new SimpleTVTerms();
|
||||
SimpleTVTerms terms = new SimpleTVTerms(offsets, positions, payloads);
|
||||
fields.put(fieldName, terms);
|
||||
|
||||
for (int j = 0; j < termCount; j++) {
|
||||
|
@ -152,6 +156,9 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
|
|||
if (positions || offsets) {
|
||||
if (positions) {
|
||||
postings.positions = new int[postings.freq];
|
||||
if (payloads) {
|
||||
postings.payloads = new BytesRef[postings.freq];
|
||||
}
|
||||
}
|
||||
|
||||
if (offsets) {
|
||||
|
@ -164,6 +171,17 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
|
|||
readLine();
|
||||
assert StringHelper.startsWith(scratch, POSITION);
|
||||
postings.positions[k] = parseIntAt(POSITION.length);
|
||||
if (payloads) {
|
||||
readLine();
|
||||
assert StringHelper.startsWith(scratch, PAYLOAD);
|
||||
if (scratch.length - PAYLOAD.length == 0) {
|
||||
postings.payloads[k] = null;
|
||||
} else {
|
||||
byte payloadBytes[] = new byte[scratch.length - PAYLOAD.length];
|
||||
System.arraycopy(scratch.bytes, scratch.offset+PAYLOAD.length, payloadBytes, 0, payloadBytes.length);
|
||||
postings.payloads[k] = new BytesRef(payloadBytes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (offsets) {
|
||||
|
@ -222,26 +240,8 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
|
|||
}
|
||||
|
||||
@Override
|
||||
public FieldsEnum iterator() throws IOException {
|
||||
return new FieldsEnum() {
|
||||
private Iterator<Map.Entry<String,SimpleTVTerms>> iterator = fields.entrySet().iterator();
|
||||
private Map.Entry<String,SimpleTVTerms> current = null;
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
if (!iterator.hasNext()) {
|
||||
return null;
|
||||
} else {
|
||||
current = iterator.next();
|
||||
return current.getKey();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Terms terms() {
|
||||
return current.getValue();
|
||||
}
|
||||
};
|
||||
public Iterator<String> iterator() {
|
||||
return new UnmodifiableIterator<String>(fields.keySet().iterator());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -257,8 +257,14 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
|
|||
|
||||
private static class SimpleTVTerms extends Terms {
|
||||
final SortedMap<BytesRef,SimpleTVPostings> terms;
|
||||
final boolean hasOffsets;
|
||||
final boolean hasPositions;
|
||||
final boolean hasPayloads;
|
||||
|
||||
SimpleTVTerms() {
|
||||
SimpleTVTerms(boolean hasOffsets, boolean hasPositions, boolean hasPayloads) {
|
||||
this.hasOffsets = hasOffsets;
|
||||
this.hasPositions = hasPositions;
|
||||
this.hasPayloads = hasPayloads;
|
||||
terms = new TreeMap<BytesRef,SimpleTVPostings>();
|
||||
}
|
||||
|
||||
|
@ -292,6 +298,21 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
|
|||
public int getDocCount() throws IOException {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasOffsets() {
|
||||
return hasOffsets;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPositions() {
|
||||
return hasPositions;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayloads() {
|
||||
return hasPayloads;
|
||||
}
|
||||
}
|
||||
|
||||
private static class SimpleTVPostings {
|
||||
|
@ -299,6 +320,7 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
|
|||
private int positions[];
|
||||
private int startOffsets[];
|
||||
private int endOffsets[];
|
||||
private BytesRef payloads[];
|
||||
}
|
||||
|
||||
private static class SimpleTVTermsEnum extends TermsEnum {
|
||||
|
@ -372,7 +394,7 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
|
|||
}
|
||||
// TODO: reuse
|
||||
SimpleTVDocsAndPositionsEnum e = new SimpleTVDocsAndPositionsEnum();
|
||||
e.reset(liveDocs, postings.positions, postings.startOffsets, postings.endOffsets);
|
||||
e.reset(liveDocs, postings.positions, postings.startOffsets, postings.endOffsets, postings.payloads);
|
||||
return e;
|
||||
}
|
||||
|
||||
|
@ -433,6 +455,7 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
|
|||
private int nextPos;
|
||||
private Bits liveDocs;
|
||||
private int[] positions;
|
||||
private BytesRef[] payloads;
|
||||
private int[] startOffsets;
|
||||
private int[] endOffsets;
|
||||
|
||||
|
@ -470,11 +493,12 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
|
|||
}
|
||||
}
|
||||
|
||||
public void reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets) {
|
||||
public void reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets, BytesRef payloads[]) {
|
||||
this.liveDocs = liveDocs;
|
||||
this.positions = positions;
|
||||
this.startOffsets = startOffsets;
|
||||
this.endOffsets = endOffsets;
|
||||
this.payloads = payloads;
|
||||
this.doc = -1;
|
||||
didNext = false;
|
||||
nextPos = 0;
|
||||
|
@ -482,12 +506,7 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
|
|||
|
||||
@Override
|
||||
public BytesRef getPayload() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasPayload() {
|
||||
return false;
|
||||
return payloads == null ? null : payloads[nextPos-1];
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -45,10 +45,12 @@ public class SimpleTextTermVectorsWriter extends TermVectorsWriter {
|
|||
static final BytesRef FIELDNAME = new BytesRef(" name ");
|
||||
static final BytesRef FIELDPOSITIONS = new BytesRef(" positions ");
|
||||
static final BytesRef FIELDOFFSETS = new BytesRef(" offsets ");
|
||||
static final BytesRef FIELDPAYLOADS = new BytesRef(" payloads ");
|
||||
static final BytesRef FIELDTERMCOUNT = new BytesRef(" numterms ");
|
||||
static final BytesRef TERMTEXT = new BytesRef(" term ");
|
||||
static final BytesRef TERMFREQ = new BytesRef(" freq ");
|
||||
static final BytesRef POSITION = new BytesRef(" position ");
|
||||
static final BytesRef PAYLOAD = new BytesRef(" payload ");
|
||||
static final BytesRef STARTOFFSET = new BytesRef(" startoffset ");
|
||||
static final BytesRef ENDOFFSET = new BytesRef(" endoffset ");
|
||||
|
||||
|
@ -61,6 +63,7 @@ public class SimpleTextTermVectorsWriter extends TermVectorsWriter {
|
|||
private final BytesRef scratch = new BytesRef();
|
||||
private boolean offsets;
|
||||
private boolean positions;
|
||||
private boolean payloads;
|
||||
|
||||
public SimpleTextTermVectorsWriter(Directory directory, String segment, IOContext context) throws IOException {
|
||||
this.directory = directory;
|
||||
|
@ -89,7 +92,7 @@ public class SimpleTextTermVectorsWriter extends TermVectorsWriter {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets) throws IOException {
|
||||
public void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets, boolean payloads) throws IOException {
|
||||
write(FIELD);
|
||||
write(Integer.toString(info.number));
|
||||
newLine();
|
||||
|
@ -106,12 +109,17 @@ public class SimpleTextTermVectorsWriter extends TermVectorsWriter {
|
|||
write(Boolean.toString(offsets));
|
||||
newLine();
|
||||
|
||||
write(FIELDPAYLOADS);
|
||||
write(Boolean.toString(payloads));
|
||||
newLine();
|
||||
|
||||
write(FIELDTERMCOUNT);
|
||||
write(Integer.toString(numTerms));
|
||||
newLine();
|
||||
|
||||
this.positions = positions;
|
||||
this.offsets = offsets;
|
||||
this.payloads = payloads;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -126,13 +134,22 @@ public class SimpleTextTermVectorsWriter extends TermVectorsWriter {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void addPosition(int position, int startOffset, int endOffset) throws IOException {
|
||||
public void addPosition(int position, int startOffset, int endOffset, BytesRef payload) throws IOException {
|
||||
assert positions || offsets;
|
||||
|
||||
if (positions) {
|
||||
write(POSITION);
|
||||
write(Integer.toString(position));
|
||||
newLine();
|
||||
|
||||
if (payloads) {
|
||||
write(PAYLOAD);
|
||||
if (payload != null) {
|
||||
assert payload.length > 0;
|
||||
write(payload);
|
||||
}
|
||||
newLine();
|
||||
}
|
||||
}
|
||||
|
||||
if (offsets) {
|
||||
|
|
|
@@ -39,6 +39,7 @@ public class FieldType implements IndexableFieldType {
   private boolean storeTermVectors;
   private boolean storeTermVectorOffsets;
   private boolean storeTermVectorPositions;
+  private boolean storeTermVectorPayloads;
   private boolean omitNorms;
   private IndexOptions indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
   private DocValues.Type docValueType;
@@ -53,6 +54,7 @@ public class FieldType implements IndexableFieldType {
     this.storeTermVectors = ref.storeTermVectors();
     this.storeTermVectorOffsets = ref.storeTermVectorOffsets();
     this.storeTermVectorPositions = ref.storeTermVectorPositions();
+    this.storeTermVectorPayloads = ref.storeTermVectorPayloads();
     this.omitNorms = ref.omitNorms();
     this.indexOptions = ref.indexOptions();
     this.docValueType = ref.docValueType();
@@ -132,6 +134,15 @@ public class FieldType implements IndexableFieldType {
     this.storeTermVectorPositions = value;
   }
 
+  public boolean storeTermVectorPayloads() {
+    return this.storeTermVectorPayloads;
+  }
+
+  public void setStoreTermVectorPayloads(boolean value) {
+    checkIfFrozen();
+    this.storeTermVectorPayloads = value;
+  }
+
   public boolean omitNorms() {
     return this.omitNorms;
   }
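On the indexing side the new option is exposed through FieldType alongside the other term-vector flags. A minimal sketch of enabling term-vector payloads for a field (assumes the 4.0 field API used elsewhere in this branch; the document and field names are illustrative, and payloads still have to be produced by the analysis chain, e.g. a token filter that sets PayloadAttribute):

  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorPositions(true);
  ft.setStoreTermVectorPayloads(true);   // the setter added in this change
  ft.freeze();
  doc.add(new Field("body", "some text", ft));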
@ -198,24 +209,19 @@ public class FieldType implements IndexableFieldType {
|
|||
result.append(",");
|
||||
result.append("indexed");
|
||||
if (tokenized()) {
|
||||
if (result.length() > 0)
|
||||
result.append(",");
|
||||
result.append("tokenized");
|
||||
result.append(",tokenized");
|
||||
}
|
||||
if (storeTermVectors()) {
|
||||
if (result.length() > 0)
|
||||
result.append(",");
|
||||
result.append("termVector");
|
||||
result.append(",termVector");
|
||||
}
|
||||
if (storeTermVectorOffsets()) {
|
||||
if (result.length() > 0)
|
||||
result.append(",");
|
||||
result.append("termVectorOffsets");
|
||||
result.append(",termVectorOffsets");
|
||||
}
|
||||
if (storeTermVectorPositions()) {
|
||||
if (result.length() > 0)
|
||||
result.append(",");
|
||||
result.append("termVectorPosition");
|
||||
result.append(",termVectorPosition");
|
||||
if (storeTermVectorPayloads()) {
|
||||
result.append(",termVectorPayloads");
|
||||
}
|
||||
}
|
||||
if (omitNorms()) {
|
||||
result.append(",omitNorms");
|
||||
|
@ -232,7 +238,9 @@ public class FieldType implements IndexableFieldType {
|
|||
}
|
||||
}
|
||||
if (docValueType != null) {
|
||||
result.append(",docValueType=");
|
||||
if (result.length() > 0)
|
||||
result.append(",");
|
||||
result.append("docValueType=");
|
||||
result.append(docValueType);
|
||||
}
|
||||
|
||||
|
|
|
@ -685,12 +685,7 @@ public class CheckIndex {
|
|||
DocsAndPositionsEnum postings = null;
|
||||
|
||||
String lastField = null;
|
||||
final FieldsEnum fieldsEnum = fields.iterator();
|
||||
while(true) {
|
||||
final String field = fieldsEnum.next();
|
||||
if (field == null) {
|
||||
break;
|
||||
}
|
||||
for (String field : fields) {
// MultiFieldsEnum relies upon this order...
if (lastField != null && field.compareTo(lastField) <= 0) {
throw new RuntimeException("fields out of order: lastField=" + lastField + " field=" + field);

@@ -713,11 +708,16 @@ public class CheckIndex {
// assert fields.terms(field) != null;
computedFieldCount++;

final Terms terms = fieldsEnum.terms();
final Terms terms = fields.terms(field);
if (terms == null) {
continue;
}

final boolean hasPositions = terms.hasPositions();
final boolean hasOffsets = terms.hasOffsets();
// term vectors cannot omit TF
final boolean hasFreqs = isVectors || fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;

final TermsEnum termsEnum = terms.iterator(null);

boolean hasOrd = true;

@@ -777,17 +777,10 @@ public class CheckIndex {
status.termCount++;

final DocsEnum docs2;
final boolean hasPositions;
// if we are checking vectors, we have freqs implicitly
final boolean hasFreqs = isVectors || fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
// if we are checking vectors, offsets are a free-for-all anyway
final boolean hasOffsets = isVectors || fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
if (postings != null) {
docs2 = postings;
hasPositions = true;
} else {
docs2 = docs;
hasPositions = false;
}

int lastDoc = -1;

@@ -824,22 +817,17 @@ public class CheckIndex {
if (hasPositions) {
for(int j=0;j<freq;j++) {
final int pos = postings.nextPosition();
// NOTE: pos=-1 is allowed because of ancient bug
// (LUCENE-1542) whereby IndexWriter could
// write pos=-1 when first token's posInc is 0
// (separately: analyzers should not give
// posInc=0 to first token); also, term
// vectors are allowed to return pos=-1 if
// they indexed offset but not positions:
if (pos < -1) {

if (pos < 0) {
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds");
}
if (pos < lastPos) {
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos);
}
lastPos = pos;
if (postings.hasPayload()) {
postings.getPayload();
BytesRef payload = postings.getPayload();
if (payload != null && payload.length < 1) {
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " payload length is out of bounds " + payload.length);
}
if (hasOffsets) {
int startOffset = postings.startOffset();

@@ -924,14 +912,8 @@ public class CheckIndex {
int lastOffset = 0;
for(int posUpto=0;posUpto<freq;posUpto++) {
final int pos = postings.nextPosition();
// NOTE: pos=-1 is allowed because of ancient bug
// (LUCENE-1542) whereby IndexWriter could
// write pos=-1 when first token's posInc is 0
// (separately: analyzers should not give
// posInc=0 to first token); also, term
// vectors are allowed to return pos=-1 if
// they indexed offset but not positions:
if (pos < -1) {

if (pos < 0) {
throw new RuntimeException("position " + pos + " is out of bounds");
}
if (pos < lastPosition) {

@@ -1000,11 +982,7 @@ public class CheckIndex {
// only happen if it's a ghost field (field with
// no terms, eg there used to be terms but all
// docs got deleted and then merged away):
// make sure TermsEnum is empty:
final Terms fieldTerms2 = fieldsEnum.terms();
if (fieldTerms2 != null && fieldTerms2.iterator(null).next() != null) {
throw new RuntimeException("Fields.terms(field=" + field + ") returned null yet the field appears to have terms");
}

} else {
if (fieldTerms instanceof BlockTreeTermsReader.FieldReader) {
final BlockTreeTermsReader.Stats stats = ((BlockTreeTermsReader.FieldReader) fieldTerms).computeStats();

@@ -1415,9 +1393,7 @@ public class CheckIndex {
status.docCount++;
}

FieldsEnum fieldsEnum = tfv.iterator();
String field = null;
while((field = fieldsEnum.next()) != null) {
for(String field : tfv) {
if (doStats) {
status.totVectors++;
}

@@ -1432,6 +1408,8 @@ public class CheckIndex {
Terms terms = tfv.terms(field);
termsEnum = terms.iterator(termsEnum);
final boolean postingsHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
final boolean postingsHasPayload = fieldInfo.hasPayloads();
final boolean vectorsHasPayload = terms.hasPayloads();

Terms postingsTerms = postingsFields.terms(field);
if (postingsTerms == null) {

@@ -1439,19 +1417,18 @@ public class CheckIndex {
}
postingsTermsEnum = postingsTerms.iterator(postingsTermsEnum);

final boolean hasProx = terms.hasOffsets() || terms.hasPositions();
BytesRef term = null;
while ((term = termsEnum.next()) != null) {

final boolean hasProx;

// Try positions:
postings = termsEnum.docsAndPositions(null, postings);
if (postings == null) {
hasProx = false;
// Try docIDs & freqs:
docs = termsEnum.docs(null, docs);
if (hasProx) {
postings = termsEnum.docsAndPositions(null, postings);
assert postings != null;
docs = null;
} else {
hasProx = true;
docs = termsEnum.docs(null, docs);
assert docs != null;
postings = null;
}

final DocsEnum docs2;

@@ -1504,7 +1481,7 @@ public class CheckIndex {
int pos = postings.nextPosition();
if (postingsPostings != null) {
int postingsPos = postingsPostings.nextPosition();
if (pos != -1 && postingsPos != -1 && pos != postingsPos) {
if (terms.hasPositions() && pos != postingsPos) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": pos=" + pos + " differs from postings pos=" + postingsPos);
}
}

@@ -1535,6 +1512,34 @@ public class CheckIndex {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": endOffset=" + endOffset + " differs from postings endOffset=" + postingsEndOffset);
}
}

BytesRef payload = postings.getPayload();

if (payload != null) {
assert vectorsHasPayload;
}

if (postingsHasPayload && vectorsHasPayload) {
assert postingsPostings != null;

if (payload == null) {
// we have payloads, but not at this position.
// postings has payloads too, it should not have one at this position
if (postingsPostings.getPayload() != null) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has no payload but postings does: " + postingsPostings.getPayload());
}
} else {
// we have payloads, and one at this position
// postings should also have one at this position, with the same bytes.
if (postingsPostings.getPayload() == null) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has payload=" + payload + " but postings does not.");
}
BytesRef postingsPayload = postingsPostings.getPayload();
if (!payload.equals(postingsPayload)) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has payload=" + payload + " but differs from postings payload=" + postingsPayload);
}
}
}
}
}
}
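The hunk above adds a cross-check in CheckIndex that payloads stored with term vectors agree with the payloads recorded in the inverted index. Below is a minimal, hedged sketch of that consistency rule in isolation; checkPayloadAgreement, vectorPositions, and indexPositions are illustrative names, not identifiers from the patch, and both enums are assumed to be positioned on the same term, document, and position.

    import java.io.IOException;

    import org.apache.lucene.index.DocsAndPositionsEnum;
    import org.apache.lucene.util.BytesRef;

    class PayloadConsistencySketch {
      // Sketch only: both enums must already point at the same term/doc,
      // and nextPosition() must have just been called on each.
      static void checkPayloadAgreement(DocsAndPositionsEnum vectorPositions,
                                        DocsAndPositionsEnum indexPositions) throws IOException {
        BytesRef vectorPayload = vectorPositions.getPayload();
        BytesRef indexPayload = indexPositions.getPayload();
        if (vectorPayload == null && indexPayload != null) {
          throw new RuntimeException("term vector has no payload but postings does: " + indexPayload);
        }
        if (vectorPayload != null && indexPayload == null) {
          throw new RuntimeException("term vector has payload " + vectorPayload + " but postings does not");
        }
        if (vectorPayload != null && !vectorPayload.equals(indexPayload)) {
          throw new RuntimeException("payload mismatch: vector=" + vectorPayload + " postings=" + indexPayload);
        }
      }
    }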
@@ -24,7 +24,7 @@ import java.util.List;
import java.util.Map;

import org.apache.lucene.search.Query;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.MergedIterator;
import org.apache.lucene.index.BufferedDeletesStream.QueryAndLimit;

class CoalescedDeletes {

@@ -48,13 +48,14 @@ class CoalescedDeletes {

public Iterable<Term> termsIterable() {
return new Iterable<Term>() {
@SuppressWarnings("unchecked")
@Override
public Iterator<Term> iterator() {
ArrayList<Iterator<Term>> subs = new ArrayList<Iterator<Term>>(iterables.size());
for (Iterable<Term> iterable : iterables) {
subs.add(iterable.iterator());
Iterator<Term> subs[] = new Iterator[iterables.size()];
for (int i = 0; i < iterables.size(); i++) {
subs[i] = iterables.get(i).iterator();
}
return mergedIterator(subs);
return new MergedIterator<Term>(subs);
}
};
}

@@ -86,106 +87,4 @@ class CoalescedDeletes {
}
};
}

/** provides a merged view across multiple iterators */
static Iterator<Term> mergedIterator(final List<Iterator<Term>> iterators) {
return new Iterator<Term>() {
Term current;
TermMergeQueue queue = new TermMergeQueue(iterators.size());
SubIterator[] top = new SubIterator[iterators.size()];
int numTop;

{
int index = 0;
for (Iterator<Term> iterator : iterators) {
if (iterator.hasNext()) {
SubIterator sub = new SubIterator();
sub.current = iterator.next();
sub.iterator = iterator;
sub.index = index++;
queue.add(sub);
}
}
}

public boolean hasNext() {
if (queue.size() > 0) {
return true;
}

for (int i = 0; i < numTop; i++) {
if (top[i].iterator.hasNext()) {
return true;
}
}
return false;
}

public Term next() {
// restore queue
pushTop();

// gather equal top fields
if (queue.size() > 0) {
pullTop();
} else {
current = null;
}
return current;
}

public void remove() {
throw new UnsupportedOperationException();
}

private void pullTop() {
// extract all subs from the queue that have the same top term
assert numTop == 0;
while (true) {
top[numTop++] = queue.pop();
if (queue.size() == 0
|| !(queue.top()).current.equals(top[0].current)) {
break;
}
}
current = top[0].current;
}

private void pushTop() {
// call next() on each top, and put back into queue
for (int i = 0; i < numTop; i++) {
if (top[i].iterator.hasNext()) {
top[i].current = top[i].iterator.next();
queue.add(top[i]);
} else {
// no more terms
top[i].current = null;
}
}
numTop = 0;
}
};
}

private static class SubIterator {
Iterator<Term> iterator;
Term current;
int index;
}

private static class TermMergeQueue extends PriorityQueue<SubIterator> {
TermMergeQueue(int size) {
super(size);
}

@Override
protected boolean lessThan(SubIterator a, SubIterator b) {
final int cmp = a.current.compareTo(b.current);
if (cmp != 0) {
return cmp < 0;
} else {
return a.index < b.index;
}
}
}
}
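The CoalescedDeletes change above deletes the class's private merged-iterator (a PriorityQueue over SubIterator entries, breaking ties by sub-iterator index) and instead builds a org.apache.lucene.util.MergedIterator over the per-packet term iterators, exactly as the new termsIterable() body shows. A rough usage sketch of that utility follows, assuming two already-sorted term lists; the list contents and class name are made up for illustration.

    import java.util.Arrays;
    import java.util.Iterator;
    import java.util.List;

    import org.apache.lucene.index.Term;
    import org.apache.lucene.util.MergedIterator;

    public class MergedTermsDemo {
      public static void main(String[] args) {
        // Two sorted sources of deleted terms, as CoalescedDeletes would hold them.
        List<Term> a = Arrays.asList(new Term("id", "1"), new Term("id", "5"));
        List<Term> b = Arrays.asList(new Term("id", "3"), new Term("id", "5"));

        @SuppressWarnings("unchecked")
        Iterator<Term>[] subs = new Iterator[] { a.iterator(), b.iterator() };

        // MergedIterator walks all sub-iterators in a single sorted pass.
        for (Iterator<Term> it = new MergedIterator<Term>(subs); it.hasNext(); ) {
          System.out.println(it.next());
        }
      }
    }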
@@ -105,7 +105,7 @@ public abstract class DocValues implements Closeable {
* <p>
* {@link Source} instances obtained from this method are closed / released
* from the cache once this {@link DocValues} instance is closed by the
* {@link IndexReader}, {@link Fields} or {@link FieldsEnum} the
* {@link IndexReader}, {@link Fields} or the
* {@link DocValues} was created from.
*/
public Source getSource() throws IOException {
@@ -48,11 +48,8 @@ public abstract class DocsAndPositionsEnum extends DocsEnum {
public abstract int endOffset() throws IOException;

/** Returns the payload at this position, or null if no
* payload was indexed. Only call this once per
* position. You should not modify anything (neither
* members of the returned BytesRef nor bytes in the
* byte[]). */
* payload was indexed. You should not modify anything
* (neither members of the returned BytesRef nor bytes
* in the byte[]). */
public abstract BytesRef getPayload() throws IOException;

public abstract boolean hasPayload();
}
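The javadoc change above drops the "only call this once per position" restriction on DocsAndPositionsEnum.getPayload(). For orientation, here is a hedged sketch of the typical consumption loop around that method; dumpPayloads is an illustrative helper, and the enum is assumed to come from a TermsEnum already positioned on a term.

    import java.io.IOException;

    import org.apache.lucene.index.DocsAndPositionsEnum;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.search.DocIdSetIterator;
    import org.apache.lucene.util.BytesRef;

    class PayloadDump {
      // Sketch: walk one term's positions and print any payloads.
      static void dumpPayloads(TermsEnum termsEnum) throws IOException {
        DocsAndPositionsEnum postings = termsEnum.docsAndPositions(null, null);
        if (postings == null) {
          return; // field was indexed without positions
        }
        int doc;
        while ((doc = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          for (int i = 0; i < postings.freq(); i++) {
            int pos = postings.nextPosition();
            BytesRef payload = postings.getPayload(); // null if none at this position
            System.out.println("doc=" + doc + " pos=" + pos + " payload=" + payload);
          }
        }
      }
    }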
@@ -18,15 +18,16 @@ package org.apache.lucene.index;
*/

import java.io.IOException;
import java.util.Iterator;

/** Flex API for access to fields and terms
* @lucene.experimental */

public abstract class Fields {
public abstract class Fields implements Iterable<String> {

/** Returns an iterator that will step through all fields
* names. This will not return null. */
public abstract FieldsEnum iterator() throws IOException;
public abstract Iterator<String> iterator();

/** Get the {@link Terms} for this field. This will return
* null if the field does not exist. */

@@ -45,12 +46,7 @@ public abstract class Fields {
// TODO: deprecate?
public long getUniqueTermCount() throws IOException {
long numTerms = 0;
FieldsEnum it = iterator();
while(true) {
String field = it.next();
if (field == null) {
break;
}
for (String field : this) {
Terms terms = terms(field);
if (terms != null) {
final long termCount = terms.size();
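With Fields now implementing Iterable<String>, callers replace the old FieldsEnum next()/terms() loop with a for-each over the field names plus Fields.terms(field), the same pattern the rewritten getUniqueTermCount() uses above. A hedged sketch of that new-style loop follows; printTermCounts and the stats it prints are illustrative, and the reader is assumed to be an AtomicReader.

    import java.io.IOException;

    import org.apache.lucene.index.AtomicReader;
    import org.apache.lucene.index.Fields;
    import org.apache.lucene.index.Terms;

    class FieldStatsDemo {
      // Sketch: count terms per indexed field via the Iterable<String> view of Fields.
      static void printTermCounts(AtomicReader reader) throws IOException {
        Fields fields = reader.fields();
        if (fields == null) {
          return; // no indexed fields in this reader
        }
        for (String field : fields) {           // was: FieldsEnum.next()
          Terms terms = fields.terms(field);    // was: FieldsEnum.terms()
          if (terms != null) {
            System.out.println(field + ": " + terms.size() + " terms");
          }
        }
      }
    }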
@@ -1,79 +0,0 @@
package org.apache.lucene.index;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import java.io.IOException;

import org.apache.lucene.util.AttributeSource;

/** Enumerates indexed fields. You must first call {@link
* #next} before calling {@link #terms}.
*
* @lucene.experimental */

public abstract class FieldsEnum {

// TODO: maybe allow retrieving FieldInfo for current
// field, as optional method?

private AttributeSource atts = null;

/**
* Returns the related attributes.
*/
public AttributeSource attributes() {
if (atts == null) {
atts = new AttributeSource();
}
return atts;
}

/** Increments the enumeration to the next field. Returns
* null when there are no more fields.*/
public abstract String next() throws IOException;

// TODO: would be nice to require/fix all impls so they
// never return null here... we have to fix the writers to
// never write 0-terms fields... or maybe allow a non-null
// Terms instance in just this case

/** Get {@link Terms} for the current field. After {@link #next} returns
* null this method should not be called. This method may
* return null in some cases, which means the provided
* field does not have any terms. */
public abstract Terms terms() throws IOException;

// TODO: should we allow pulling Terms as well? not just
// the iterator?

public final static FieldsEnum[] EMPTY_ARRAY = new FieldsEnum[0];

/** Provides zero fields */
public final static FieldsEnum EMPTY = new FieldsEnum() {

@Override
public String next() {
return null;
}

@Override
public Terms terms() {
throw new IllegalStateException("this method should never be called");
}
};
}
@@ -24,6 +24,7 @@ import org.apache.lucene.util.automaton.CompiledAutomaton;

import java.io.IOException;
import java.util.Comparator;
import java.util.Iterator;

/** A <code>FilterAtomicReader</code> contains another AtomicReader, which it
* uses as its basic source of data, possibly transforming the data along the

@@ -46,7 +47,7 @@ public class FilterAtomicReader extends AtomicReader {
}

@Override
public FieldsEnum iterator() throws IOException {
public Iterator<String> iterator() {
return in.iterator();
}

@@ -109,28 +110,20 @@ public class FilterAtomicReader extends AtomicReader {
public TermsEnum intersect(CompiledAutomaton automaton, BytesRef bytes) throws java.io.IOException {
return in.intersect(automaton, bytes);
}
}

/** Base class for filtering {@link TermsEnum} implementations. */
public static class FilterFieldsEnum extends FieldsEnum {
protected final FieldsEnum in;
public FilterFieldsEnum(FieldsEnum in) {
this.in = in;
@Override
public boolean hasOffsets() {
return in.hasOffsets();
}

@Override
public String next() throws IOException {
return in.next();
}

@Override
public Terms terms() throws IOException {
return in.terms();
public boolean hasPositions() {
return in.hasPositions();
}

@Override
public AttributeSource attributes() {
return in.attributes();
public boolean hasPayloads() {
return in.hasPayloads();
}
}

@@ -292,11 +285,6 @@ public class FilterAtomicReader extends AtomicReader {
public BytesRef getPayload() throws IOException {
return in.getPayload();
}

@Override
public boolean hasPayload() {
return in.hasPayload();
}

@Override
public AttributeSource attributes() {
@@ -173,7 +173,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
postings.lastDocCodes[termID] = docState.docID;
} else {
postings.lastDocCodes[termID] = docState.docID << 1;
postings.docFreqs[termID] = 1;
postings.termFreqs[termID] = 1;
if (hasProx) {
writeProx(termID, fieldState.position);
if (hasOffsets) {

@@ -194,10 +194,10 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem

FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;

assert !hasFreq || postings.docFreqs[termID] > 0;
assert !hasFreq || postings.termFreqs[termID] > 0;

if (!hasFreq) {
assert postings.docFreqs == null;
assert postings.termFreqs == null;
if (docState.docID != postings.lastDocIDs[termID]) {
assert docState.docID > postings.lastDocIDs[termID];
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);

@@ -212,13 +212,13 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem

// Now that we know doc freq for previous doc,
// write it & lastDocCode
if (1 == postings.docFreqs[termID]) {
if (1 == postings.termFreqs[termID]) {
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]|1);
} else {
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
termsHashPerField.writeVInt(0, postings.docFreqs[termID]);
termsHashPerField.writeVInt(0, postings.termFreqs[termID]);
}
postings.docFreqs[termID] = 1;
postings.termFreqs[termID] = 1;
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
postings.lastDocIDs[termID] = docState.docID;

@@ -233,7 +233,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
}
fieldState.uniqueTermCount++;
} else {
fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.docFreqs[termID]);
fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.termFreqs[termID]);
if (hasProx) {
writeProx(termID, fieldState.position-postings.lastPositions[termID]);
}

@@ -252,7 +252,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
public FreqProxPostingsArray(int size, boolean writeFreqs, boolean writeProx, boolean writeOffsets) {
super(size);
if (writeFreqs) {
docFreqs = new int[size];
termFreqs = new int[size];
}
lastDocIDs = new int[size];
lastDocCodes = new int[size];

@@ -267,7 +267,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
//System.out.println("PA init freqs=" + writeFreqs + " pos=" + writeProx + " offs=" + writeOffsets);
}

int docFreqs[]; // # times this term occurs in the current doc
int termFreqs[]; // # times this term occurs in the current doc
int lastDocIDs[]; // Last docID where this term occurred
int lastDocCodes[]; // Code for prior doc
int lastPositions[]; // Last position where this term occurred

@@ -275,7 +275,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem

@Override
ParallelPostingsArray newInstance(int size) {
return new FreqProxPostingsArray(size, docFreqs != null, lastPositions != null, lastOffsets != null);
return new FreqProxPostingsArray(size, termFreqs != null, lastPositions != null, lastOffsets != null);
}

@Override

@@ -295,9 +295,9 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
assert to.lastOffsets != null;
System.arraycopy(lastOffsets, 0, to.lastOffsets, 0, numToCopy);
}
if (docFreqs != null) {
assert to.docFreqs != null;
System.arraycopy(docFreqs, 0, to.docFreqs, 0, numToCopy);
if (termFreqs != null) {
assert to.termFreqs != null;
System.arraycopy(termFreqs, 0, to.termFreqs, 0, numToCopy);
}
}

@@ -310,7 +310,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
if (lastOffsets != null) {
bytes += RamUsageEstimator.NUM_BYTES_INT;
}
if (docFreqs != null) {
if (termFreqs != null) {
bytes += RamUsageEstimator.NUM_BYTES_INT;
}

@@ -416,21 +416,21 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
// Now termStates has numToMerge FieldMergeStates
// which all share the same term. Now we must
// interleave the docID streams.
int numDocs = 0;
int docFreq = 0;
long totTF = 0;
int docID = 0;

while(true) {
//System.out.println("  cycle");
final int termDocFreq;
final int termFreq;
if (freq.eof()) {
if (postings.lastDocCodes[termID] != -1) {
// Return last doc
docID = postings.lastDocIDs[termID];
if (readTermFreq) {
termDocFreq = postings.docFreqs[termID];
termFreq = postings.termFreqs[termID];
} else {
termDocFreq = -1;
termFreq = -1;
}
postings.lastDocCodes[termID] = -1;
} else {

@@ -441,20 +441,20 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
final int code = freq.readVInt();
if (!readTermFreq) {
docID += code;
termDocFreq = -1;
termFreq = -1;
} else {
docID += code >>> 1;
if ((code & 1) != 0) {
termDocFreq = 1;
termFreq = 1;
} else {
termDocFreq = freq.readVInt();
termFreq = freq.readVInt();
}
}

assert docID != postings.lastDocIDs[termID];
}

numDocs++;
docFreq++;
assert docID < state.segmentInfo.getDocCount(): "doc=" + docID + " maxDoc=" + state.segmentInfo.getDocCount();

// NOTE: we could check here if the docID was

@@ -469,7 +469,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
// 2nd sweep does the real flush, but I suspect
// that'd add too much time to flush.
visitedDocs.set(docID);
postingsConsumer.startDoc(docID, writeTermFreq ? termDocFreq : -1);
postingsConsumer.startDoc(docID, writeTermFreq ? termFreq : -1);
if (docID < delDocLimit) {
// Mark it deleted. TODO: we could also skip
// writing its postings; this would be

@@ -485,7 +485,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
}
}

totTF += termDocFreq;
totTF += termFreq;

// Carefully copy over the prox + payload info,
// changing the format to match Lucene's segment

@@ -495,7 +495,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
// we did record positions (& maybe payload) and/or offsets
int position = 0;
int offset = 0;
for(int j=0;j<termDocFreq;j++) {
for(int j=0;j<termFreq;j++) {
final BytesRef thisPayload;

if (readPositions) {

@@ -542,9 +542,9 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
}
postingsConsumer.finishDoc();
}
termsConsumer.finishTerm(text, new TermStats(numDocs, writeTermFreq ? totTF : -1));
termsConsumer.finishTerm(text, new TermStats(docFreq, writeTermFreq ? totTF : -1));
sumTotalTermFreq += totTF;
sumDocFreq += numDocs;
sumDocFreq += docFreq;
}

termsConsumer.finish(writeTermFreq ? sumTotalTermFreq : -1, sumDocFreq, visitedDocs.cardinality());
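The flush loop above decodes the in-memory postings stream: each document is a delta-coded docID whose low bit says whether the term frequency is 1 (no extra value) or follows as its own vInt; the renames from docFreqs/termDocFreq/numDocs to termFreqs/termFreq/docFreq only clarify that the per-term counter is a within-document frequency while docFreq counts documents. The following small, self-contained sketch illustrates that encode/decode convention with plain ints standing in for Lucene's vInt stream; DocFreqCodeDemo and its helper are illustrative, not the actual writer classes.

    import java.util.ArrayList;
    import java.util.List;

    public class DocFreqCodeDemo {
      // Encode one (docDelta, termFreq) pair using the convention from the diff:
      // low bit set => freq == 1, otherwise the freq follows as a separate value.
      static void encode(List<Integer> out, int docDelta, int termFreq) {
        if (termFreq == 1) {
          out.add((docDelta << 1) | 1);
        } else {
          out.add(docDelta << 1);
          out.add(termFreq);
        }
      }

      public static void main(String[] args) {
        List<Integer> stream = new ArrayList<Integer>();
        encode(stream, 3, 1);   // doc 3, freq 1
        encode(stream, 2, 4);   // doc 5, freq 4

        int docID = 0;
        for (int i = 0; i < stream.size(); ) {
          int code = stream.get(i++);
          docID += code >>> 1;                                  // delta-decode the docID
          int termFreq = (code & 1) != 0 ? 1 : stream.get(i++); // low bit: freq == 1
          System.out.println("doc=" + docID + " freq=" + termFreq);
        }
      }
    }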
@@ -2312,9 +2312,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit {
}
SegmentInfos sis = new SegmentInfos(); // read infos from dir
sis.read(dir);
final Set<String> dsFilesCopied = new HashSet<String>();
final Map<String, String> dsNames = new HashMap<String, String>();
final Set<String> copiedFiles = new HashSet<String>();

for (SegmentInfoPerCommit info : sis) {
assert !infos.contains(info): "dup info dir=" + info.info.dir + " name=" + info.info.name;

@@ -2327,7 +2325,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit {

IOContext context = new IOContext(new MergeInfo(info.info.getDocCount(), info.info.sizeInBytes(), true, -1));

infos.add(copySegmentAsIs(info, newSegName, dsNames, dsFilesCopied, context, copiedFiles));
infos.add(copySegmentAsIs(info, newSegName, context));
}
}

@@ -2463,25 +2461,9 @@ public class IndexWriter implements Closeable, TwoPhaseCommit {
}

/** Copies the segment files as-is into the IndexWriter's directory. */
// TODO: this can be substantially simplified now that 3.x support/shared docstores is removed!
private SegmentInfoPerCommit copySegmentAsIs(SegmentInfoPerCommit info, String segName,
Map<String, String> dsNames, Set<String> dsFilesCopied, IOContext context,
Set<String> copiedFiles)
private SegmentInfoPerCommit copySegmentAsIs(SegmentInfoPerCommit info, String segName, IOContext context)
throws IOException {
// Determine if the doc store of this segment needs to be copied. It's
// only relevant for segments that share doc store with others,
// because the DS might have been copied already, in which case we
// just want to update the DS name of this SegmentInfo.
final String dsName = info.info.name;
assert dsName != null;
final String newDsName;
if (dsNames.containsKey(dsName)) {
newDsName = dsNames.get(dsName);
} else {
dsNames.put(dsName, segName);
newDsName = segName;
}

// note: we don't really need this fis (its copied), but we load it up
// so we don't pass a null value to the si writer
FieldInfos fis = getFieldInfos(info.info);

@@ -2496,7 +2478,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit {
}

//System.out.println("copy seg=" + info.info.name + " version=" + info.info.getVersion());
// Same SI as before but we change directory, name and docStoreSegment:
// Same SI as before but we change directory and name
SegmentInfo newInfo = new SegmentInfo(directory, info.info.getVersion(), segName, info.info.getDocCount(),
info.info.getUseCompoundFile(),
info.info.getCodec(), info.info.getDiagnostics(), attributes);

@@ -2513,16 +2495,10 @@ public class IndexWriter implements Closeable, TwoPhaseCommit {
}
newInfo.setFiles(segFiles);

// We must rewrite the SI file because it references
// segment name (its own name, if its 3.x, and doc
// store segment name):
// We must rewrite the SI file because it references segment name in its list of files, etc
TrackingDirectoryWrapper trackingDir = new TrackingDirectoryWrapper(directory);
try {
newInfo.getCodec().segmentInfoFormat().getSegmentInfoWriter().write(trackingDir, newInfo, fis, context);
} catch (UnsupportedOperationException uoe) {
// OK: 3x codec cannot write a new SI file;
// SegmentInfos will write this on commit
}

newInfo.getCodec().segmentInfoFormat().getSegmentInfoWriter().write(trackingDir, newInfo, fis, context);

final Collection<String> siFiles = trackingDir.getCreatedFiles();

@@ -2537,8 +2513,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit {
}

assert !directory.fileExists(newFileName): "file \"" + newFileName + "\" already exists; siFiles=" + siFiles;
assert !copiedFiles.contains(file): "file \"" + file + "\" is being copied more than once";
copiedFiles.add(file);

info.info.dir.copy(directory, file, newFileName, context);
}
@@ -42,6 +42,9 @@ public interface IndexableFieldType {

/** True if term vector positions should be indexed */
public boolean storeTermVectorPositions();

/** True if term vector payloads should be indexed */
public boolean storeTermVectorPayloads();

/** True if norms should not be indexed */
public boolean omitNorms();
@@ -199,6 +199,7 @@ public class MergeState {
// and we could make a codec(wrapper) to do all of this privately so IW is uninvolved
public PayloadProcessorProvider payloadProcessorProvider;
public ReaderPayloadProcessor[] readerPayloadProcessor;
public ReaderPayloadProcessor currentReaderPayloadProcessor;
public PayloadProcessor[] currentPayloadProcessor;

// TODO: get rid of this? it tells you which segments are 'aligned' (e.g. for bulk merging)