LUCENE-3892: merge trunk

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/pforcodec_3892@1372366 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless, 2012-08-13 11:16:57 +00:00
commit 789981c9fd
369 changed files with 5511 additions and 3981 deletions

build-clover.xml (new file, 49 lines)

@ -0,0 +1,49 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="clover" basedir=".">
<import file="lucene/common-build.xml"/>
<!--
Run after JUnit tests.
This target is in a separate file, as it needs to include common-build.xml,
but must run from top-level!
-->
<target name="generate-clover-reports" depends="clover">
<fail unless="run.clover">Clover not enabled!</fail>
<mkdir dir="${clover.report.dir}"/>
<fileset dir="." id="clover.test.result.files">
<include name="*/build/**/test/TEST-*.xml"/>
<exclude name="lucene/build/backwards/**"/>
</fileset>
<clover-report>
<current outfile="${clover.report.dir}" title="${final.name}" numThreads="0">
<format type="html" filter="assert"/>
<testresults refid="clover.test.result.files"/>
</current>
<current outfile="${clover.report.dir}/clover.xml" title="${final.name}">
<format type="xml" filter="assert"/>
<testresults refid="clover.test.result.files"/>
</current>
</clover-report>
<echo>You can find the merged Lucene/Solr Clover report in '${clover.report.dir}'.</echo>
</target>
</project>

build.xml (134 lines changed)

@ -51,11 +51,28 @@
</sequential>
</target>
<target name="validate" description="Validate dependencies, licenses, etc.">
<sequential><subant target="validate" inheritall="false" failonerror="true">
<fileset dir="lucene" includes="build.xml" />
<fileset dir="solr" includes="build.xml" />
</subant></sequential>
<target name="validate" description="Validate dependencies, licenses, etc." depends="-validate-source-patterns">
<subant target="validate" inheritall="false" failonerror="true">
<fileset dir="lucene" includes="build.xml" />
<fileset dir="solr" includes="build.xml" />
</subant>
</target>
<target name="-validate-source-patterns" unless="disable.source-patterns">
<!-- check that there are no nocommits or @author javadoc tags: -->
<property name="validate.currDir" location="."/>
<pathconvert pathsep="${line.separator}" dirsep="/" property="validate.patternsFound" setonempty="false">
<fileset dir="${validate.currDir}">
<include name="**/*.java"/>
<exclude name="**/backwards/**"/>
<or>
<containsregexp expression="@author\b" casesensitive="yes"/>
<containsregexp expression="\bno(n|)commit\b" casesensitive="no"/>
</or>
</fileset>
<map from="${validate.currDir}${file.separator}" to="* "/>
</pathconvert>
<fail if="validate.patternsFound">The following files contain @author tags or nocommits:${line.separator}${validate.patternsFound}</fail>
</target>
<target name="rat-sources" description="Runs rat across all sources and tests">
@ -184,4 +201,111 @@
</subant>
</sequential>
</target>
<!-- define here, as common-build is not included! -->
<property name="python32.exe" value="python3.2" />
<property name="fakeRelease" value="lucene/build/fakeRelease"/>
<property name="fakeReleaseTmp" value="lucene/build/fakeReleaseTmp"/>
<property name="fakeReleaseVersion" value="5.0"/> <!-- *not* -SNAPSHOT, the real version -->
<target name="nightly-smoke" description="Builds an unsigned release and smoke tests it." depends="clean">
<sequential>
<fail unless="JAVA6_HOME">JAVA6_HOME property is not defined.</fail>
<fail unless="JAVA7_HOME">JAVA7_HOME property is not defined.</fail>
<subant target="prepare-release-no-sign" inheritall="false" failonerror="true">
<fileset dir="lucene" includes="build.xml" />
<fileset dir="solr" includes="build.xml" />
<property name="version" value="${fakeReleaseVersion}" />
</subant>
<delete dir="${fakeRelease}"/>
<delete dir="${fakeReleaseTmp}"/>
<mkdir dir="${fakeRelease}"/>
<copy todir="${fakeRelease}/lucene">
<fileset dir="lucene/dist"/>
</copy>
<copy todir="${fakeRelease}/lucene/changes">
<fileset dir="lucene/build/docs/changes"/>
</copy>
<get src="http://people.apache.org/keys/group/lucene.asc"
dest="${fakeRelease}/lucene/KEYS"/>
<copy todir="${fakeRelease}/solr">
<fileset dir="solr/package"/>
</copy>
<copy file="${fakeRelease}/lucene/KEYS" todir="${fakeRelease}/solr"/>
<makeurl file="${fakeRelease}" validate="false" property="fakeRelease.uri"/>
<exec executable="${python32.exe}" failonerror="true">
<arg value="-u"/>
<arg value="dev-tools/scripts/smokeTestRelease.py"/>
<arg value="${fakeRelease.uri}"/>
<arg value="${fakeReleaseVersion}"/>
<arg value="${fakeReleaseTmp}"/>
<arg value="false"/>
<env key="JAVA6_HOME" value="${JAVA6_HOME}"/>
<env key="JAVA7_HOME" value="${JAVA7_HOME}"/>
</exec>
<delete dir="${fakeRelease}"/>
<delete dir="${fakeReleaseTmp}"/>
</sequential>
</target>
<!-- Calls only generate-clover-reports on Lucene, as Solr's is just a clone with another target; the database itself is fixed -->
<target name="generate-clover-reports">
<subant target="generate-clover-reports" inheritall="false" failonerror="true">
<fileset dir="." includes="build-clover.xml" />
</subant>
</target>
<!-- Jenkins tasks -->
<target name="jenkins-hourly" depends="clean,test,validate,-jenkins-javadocs-lint,-svn-status"/>
<target name="jenkins-clover">
<antcall target="-jenkins-clover">
<param name="run.clover" value="true"/>
<!-- must be 1, as clover does not like parallel test runs: -->
<param name="tests.jvms" value="1"/>
<!-- Also override some other props to be fast, ignoring what's set on command line: -->
<param name="tests.multiplier" value="1"/>
<param name="tests.slow" value="false"/>
<param name="tests.nightly" value="false"/>
<param name="tests.weekly" value="false"/>
<param name="tests.multiplier" value="1"/>
</antcall>
</target>
<target name="-jenkins-clover" depends="clean,test,generate-clover-reports"/>
<!-- we need this extra condition, as we want to match only on "true", not solely if property is set: -->
<property name="disable.javadocs-lint" value="false" />
<condition property="-disable.javadocs-lint">
<equals arg1="${disable.javadocs-lint}" arg2="true"/>
</condition>
<target name="-jenkins-javadocs-lint" unless="-disable.javadocs-lint">
<antcall target="javadocs-lint"/>
</target>
<!-- define here, as common-build is not included! -->
<property name="svn.exe" value="svn" />
<target name="-svn-status">
<exec executable="${svn.exe}" dir="." failonerror="true">
<arg value="status"/>
<redirector outputproperty="svn.status.output">
<outputfilterchain>
<linecontainsregexp>
<regexp pattern="^\?" />
</linecontainsregexp>
<tokenfilter>
<replaceregex pattern="^........" replace="* " />
<replacestring from="${file.separator}" to="/" />
</tokenfilter>
</outputfilterchain>
</redirector>
</exec>
<fail message="Source checkout is dirty after running tests!!! Offending files:${line.separator}${svn.status.output}">
<condition>
<not>
<equals arg1="${svn.status.output}" arg2=""/>
</not>
</condition>
</fail>
</target>
</project>

@ -174,6 +174,6 @@
<classpathentry kind="lib" path="solr/contrib/velocity/lib/commons-beanutils-1.7.0.jar"/>
<classpathentry kind="lib" path="solr/contrib/velocity/lib/commons-collections-3.2.1.jar"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="lib" path="lucene/test-framework/lib/randomizedtesting-runner-1.6.0.jar"/>
<classpathentry kind="lib" path="lucene/test-framework/lib/randomizedtesting-runner-2.0.0.rc5.jar"/>
<classpathentry kind="output" path="bin/other"/>
</classpath>

@ -2,7 +2,7 @@
<library name="JUnit">
<CLASSES>
<root url="jar://$PROJECT_DIR$/lucene/test-framework/lib/junit-4.10.jar!/" />
<root url="jar://$PROJECT_DIR$/lucene/test-framework/lib/randomizedtesting-runner-1.6.0.jar!/" />
<root url="jar://$PROJECT_DIR$/lucene/test-framework/lib/randomizedtesting-runner-2.0.0.rc5.jar!/" />
</CLASSES>
<JAVADOC />
<SOURCES />

@ -36,27 +36,25 @@ A. How to use nightly Jenkins-built Lucene/Solr Maven artifacts
B. How to generate Lucene/Solr Maven artifacts
Prerequisites: JDK 1.6+ and Ant 1.7.X
Prerequisites: JDK 1.6+ and Ant 1.8.2+
Run 'ant generate-maven-artifacts' to create an internal Maven
repository, including POMs, binary .jars, source .jars, and javadoc
.jars.
You can run the above command in four possible places: the top-level
directory; under lucene/; under solr/; or under modules/. From the
top-level directory, from lucene/, or from modules/, the internal
repository will be located at dist/maven/. From solr/, the internal
repository will be located at package/maven/.
You can run the above command in three possible places: the top-level
directory; under lucene/; or under solr/. From the top-level directory
or from lucene/, the internal repository will be located at dist/maven/.
From solr/, the internal repository will be located at package/maven/.
C. How to deploy Maven artifacts to a repository
Prerequisites: JDK 1.6+ and Ant 1.7.X
Prerequisites: JDK 1.6+ and Ant 1.8.2+
You can deploy targets for all of Lucene/Solr, only Lucene, only Solr,
or only modules/, as in B. above. To deploy to a Maven repository, the
command is the same as in B. above, with the addition of two system
properties:
You can deploy targets for all of Lucene/Solr, only Lucene, or only Solr,
as in B. above. To deploy to a Maven repository, the command is the same
as in B. above, with the addition of two system properties:
ant -Dm2.repository.id=my-repo-id \
-Dm2.repository.url=http://example.org/my/repo \
@ -101,7 +99,7 @@ D. How to use Maven to build Lucene/Solr
the default, you can supply an alternate version on the command line
with the above command, e.g.:
ant -Dversion=5.0-my-special-version get-maven-poms
ant -Dversion=my-special-version get-maven-poms
Note: if you change the version in the POMs, there is one test method
that will fail under maven-surefire-plugin:

@ -37,15 +37,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>
@ -77,33 +71,5 @@
</excludes>
</testResource>
</testResources>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>appassembler-maven-plugin</artifactId>
<configuration>
<extraJvmArguments>-Xmx128M</extraJvmArguments>
<repositoryLayout>flat</repositoryLayout>
<platforms>
<platform>windows</platform>
<platform>unix</platform>
</platforms>
<programs>
<program>
<mainClass>org.apache.lucene.analysis.charfilter.HtmlStripCharFilter</mainClass>
<name>HtmlStripCharFilter</name>
</program>
<program>
<mainClass>org.apache.lucene.analysis.en.PorterStemmer</mainClass>
<name>EnglishPorterStemmer</name>
</program>
<program>
<mainClass>org.tartarus.snowball.TestApp</mainClass>
<name>SnowballTestApp</name>
</program>
</programs>
</configuration>
</plugin>
</plugins>
</build>
</project>

@ -40,15 +40,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>

@ -39,15 +39,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>

@ -39,15 +39,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>
@ -75,6 +69,11 @@
<build>
<sourceDirectory>${module-path}/src/java</sourceDirectory>
<testSourceDirectory>${module-path}/src/test</testSourceDirectory>
<resources>
<resource>
<directory>${module-path}/src/resources</directory>
</resource>
</resources>
<testResources>
<testResource>
<directory>${project.build.testSourceDirectory}</directory>

@ -39,15 +39,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>
@ -75,6 +69,11 @@
<build>
<sourceDirectory>${module-path}/src/java</sourceDirectory>
<testSourceDirectory>${module-path}/src/test</testSourceDirectory>
<resources>
<resource>
<directory>${module-path}/src/resources</directory>
</resource>
</resources>
<testResources>
<testResource>
<directory>${project.build.testSourceDirectory}</directory>

@ -37,15 +37,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>

@ -37,15 +37,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>

@ -41,15 +41,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>

@ -37,15 +37,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>
@ -120,41 +114,5 @@
</includes>
</testResource>
</testResources>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>appassembler-maven-plugin</artifactId>
<configuration>
<extraJvmArguments>-Xmx128M</extraJvmArguments>
<repositoryLayout>flat</repositoryLayout>
<platforms>
<platform>windows</platform>
<platform>unix</platform>
</platforms>
<programs>
<program>
<mainClass>org.apache.lucene.benchmark.byTask.Benchmark</mainClass>
<name>Benchmark</name>
</program>
<program>
<mainClass>org.apache.lucene.benchmark.quality.trec.QueryDriver</mainClass>
<name>QueryDriver</name>
</program>
<program>
<mainClass>org.apache.lucene.benchmark.quality.utils.QualityQueriesFinder</mainClass>
<name>QualityQueriesFinder</name>
</program>
<program>
<mainClass>org.apache.lucene.benchmark.utils.ExtractReuters</mainClass>
<name>ExtractReuters</name>
</program>
<program>
<mainClass>org.apache.lucene.benchmark.utils.ExtractWikipedia</mainClass>
<name>ExtractWikipedia</name>
</program>
</programs>
</configuration>
</plugin>
</plugins>
</build>
</project>

@ -37,15 +37,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>
@ -104,40 +98,6 @@
</systemPropertyVariables>
</configuration>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>appassembler-maven-plugin</artifactId>
<configuration>
<extraJvmArguments>-Xmx128M</extraJvmArguments>
<repositoryLayout>flat</repositoryLayout>
<platforms>
<platform>windows</platform>
<platform>unix</platform>
</platforms>
<programs>
<program>
<mainClass>org.apache.lucene.index.CheckIndex</mainClass>
<name>CheckIndex</name>
</program>
<program>
<mainClass>org.apache.lucene.index.IndexReader</mainClass>
<name>IndexReader</name>
</program>
<program>
<mainClass>org.apache.lucene.store.LockStressTest</mainClass>
<name>LockStressTest</name>
</program>
<program>
<mainClass>org.apache.lucene.store.LockVerifyServer</mainClass>
<name>LockVerifyServer</name>
</program>
<program>
<mainClass>org.apache.lucene.util.English</mainClass>
<name>English</name>
</program>
</programs>
</configuration>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>

@ -37,15 +37,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>
@ -87,30 +81,5 @@
</excludes>
</testResource>
</testResources>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>appassembler-maven-plugin</artifactId>
<configuration>
<extraJvmArguments>-Xmx128M</extraJvmArguments>
<repositoryLayout>flat</repositoryLayout>
<assembleDirectory>${build-directory}</assembleDirectory>
<platforms>
<platform>windows</platform>
<platform>unix</platform>
</platforms>
<programs>
<program>
<mainClass>org.apache.lucene.demo.IndexFiles</mainClass>
<name>IndexFiles</name>
</program>
<program>
<mainClass>org.apache.lucene.demo.SearchFiles</mainClass>
<name>SearchFiles</name>
</program>
</programs>
</configuration>
</plugin>
</plugins>
</build>
</project>

@ -39,15 +39,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>

@ -37,15 +37,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>

@ -39,15 +39,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>

@ -37,15 +37,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>

@ -39,15 +39,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>

@ -37,15 +37,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>
@ -72,49 +66,5 @@
</excludes>
</testResource>
</testResources>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>appassembler-maven-plugin</artifactId>
<configuration>
<extraJvmArguments>-Xmx128M</extraJvmArguments>
<repositoryLayout>flat</repositoryLayout>
<platforms>
<platform>windows</platform>
<platform>unix</platform>
</platforms>
<programs>
<program>
<mainClass>org.apache.lucene.index.FieldNormModifier</mainClass>
<name>FieldNormModifier</name>
</program>
<program>
<mainClass>org.apache.lucene.index.IndexSplitter</mainClass>
<name>IndexSplitter</name>
</program>
<program>
<mainClass>org.apache.lucene.index.MultiPassIndexSplitter</mainClass>
<name>MultiPassIndexSplitter</name>
</program>
<program>
<mainClass>org.apache.lucene.misc.GetTermInfo</mainClass>
<name>GetTermInfo</name>
</program>
<program>
<mainClass>org.apache.lucene.misc.HighFreqTerms</mainClass>
<name>HighFreqTerms</name>
</program>
<program>
<mainClass>org.apache.lucene.misc.IndexMergeTool</mainClass>
<name>IndexMergeTool</name>
</program>
<program>
<mainClass>org.apache.lucene.misc.LengthNormModifier</mainClass>
<name>LengthNormModifier</name>
</program>
</programs>
</configuration>
</plugin>
</plugins>
</build>
</project>

@ -35,15 +35,9 @@
<module-directory>lucene</module-directory>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<modules>
<module>core</module>

@ -37,15 +37,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>

@ -37,15 +37,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>

@ -37,15 +37,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>

@ -37,15 +37,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>

@ -37,15 +37,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>

@ -31,15 +31,18 @@
<version>@version@</version>
<packaging>pom</packaging>
<name>Grandparent POM for Apache Lucene Core and Apache Solr</name>
<description>Parent POM for Apache Lucene Core and Apache Solr</description>
<url>http://lucene.apache.org/java</url>
<description>Grandparent POM for Apache Lucene Core and Apache Solr</description>
<url>http://lucene.apache.org</url>
<modules>
<module>lucene</module>
<module>solr</module>
</modules>
<properties>
<top-level>..</top-level>
<base.specification.version>4.0.0</base.specification.version>
<vc-anonymous-base-url>http://svn.apache.org/repos/asf/lucene/dev/trunk</vc-anonymous-base-url>
<vc-dev-base-url>https://svn.apache.org/repos/asf/lucene/dev/trunk</vc-dev-base-url>
<vc-browse-base-url>http://svn.apache.org/viewvc/lucene/dev/trunk</vc-browse-base-url>
<base.specification.version>5.0.0</base.specification.version>
<maven.build.timestamp.format>yyyy-MM-dd HH:mm:ss</maven.build.timestamp.format>
<java.compat.version>1.6</java.compat.version>
<jetty.version>8.1.2.v20120308</jetty.version>
@ -69,11 +72,11 @@
</properties>
<issueManagement>
<system>JIRA</system>
<url>http://issues.apache.org/jira/browse/LUCENE</url>
<url>https://issues.apache.org/jira/browse/LUCENE</url>
</issueManagement>
<ciManagement>
<system>Hudson</system>
<url>http://lucene.zones.apache.org:8080/hudson/job/Lucene-Nightly/</url>
<system>Jenkins</system>
<url>https://builds.apache.org/computer/lucene/</url>
</ciManagement>
<mailingLists>
<mailingList>
@ -109,15 +112,9 @@
</mailingLists>
<inceptionYear>2000</inceptionYear>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk
</url>
<connection>scm:svn:${vc-anonymous-base-url}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}</developerConnection>
<url>${vc-browse-base-url}</url>
</scm>
<licenses>
<license>
@ -388,7 +385,7 @@
<dependency>
<groupId>com.carrotsearch.randomizedtesting</groupId>
<artifactId>randomizedtesting-runner</artifactId>
<version>1.6.0</version>
<version>2.0.0.rc5</version>
</dependency>
</dependencies>
</dependencyManagement>
@ -549,11 +546,6 @@
</archive>
</configuration>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>appassembler-maven-plugin</artifactId>
<version>1.2.1</version>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>

@ -35,18 +35,11 @@
<module-directory>solr/contrib/analysis-extras</module-directory>
<top-level>../../../..</top-level>
<module-path>${top-level}/${module-directory}</module-path>
<surefire-top-level>${top-level}/../..</surefire-top-level>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>
@ -101,17 +94,12 @@
<testResource>
<directory>${top-level}/solr/core/src/test-files</directory>
</testResource>
<testResource>
<directory>${top-level}/dev-tools/maven/solr</directory>
<includes>
<include>maven.testlogging.properties</include>
</includes>
</testResource>
</testResources>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<systemPropertyVariables>
<java.util.logging.config.file>${surefire-top-level}/solr/testlogging.properties</java.util.logging.config.file>
</systemPropertyVariables>
</configuration>
</plugin>
</plugins>
</build>
</project>

@ -35,18 +35,11 @@
<module-directory>solr/contrib/clustering</module-directory>
<top-level>../../../..</top-level>
<module-path>${top-level}/${module-directory}</module-path>
<surefire-top-level>${top-level}/../..</surefire-top-level>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>
@ -106,17 +99,12 @@
<testResource>
<directory>${top-level}/solr/core/src/test-files</directory>
</testResource>
<testResource>
<directory>${top-level}/dev-tools/maven/solr</directory>
<includes>
<include>maven.testlogging.properties</include>
</includes>
</testResource>
</testResources>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<systemPropertyVariables>
<java.util.logging.config.file>${surefire-top-level}/solr/testlogging.properties</java.util.logging.config.file>
</systemPropertyVariables>
</configuration>
</plugin>
</plugins>
</build>
</project>

@ -35,18 +35,11 @@
<module-directory>solr/contrib/dataimporthandler-extras</module-directory>
<top-level>../../../..</top-level>
<module-path>${top-level}/${module-directory}</module-path>
<surefire-top-level>${top-level}/../..</surefire-top-level>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>
@ -104,17 +97,12 @@
<testResource>
<directory>${top-level}/solr/core/src/test-files</directory>
</testResource>
<testResource>
<directory>${top-level}/dev-tools/maven/solr</directory>
<includes>
<include>maven.testlogging.properties</include>
</includes>
</testResource>
</testResources>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<systemPropertyVariables>
<java.util.logging.config.file>${surefire-top-level}/solr/testlogging.properties</java.util.logging.config.file>
</systemPropertyVariables>
</configuration>
</plugin>
</plugins>
</build>
</project>

@ -35,18 +35,11 @@
<module-directory>solr/contrib/dataimporthandler</module-directory>
<top-level>../../../..</top-level>
<module-path>${top-level}/${module-directory}</module-path>
<surefire-top-level>${top-level}/../..</surefire-top-level>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>
@ -90,6 +83,12 @@
<testResource>
<directory>${top-level}/solr/core/src/test-files</directory>
</testResource>
<testResource>
<directory>${top-level}/dev-tools/maven/solr</directory>
<includes>
<include>maven.testlogging.properties</include>
</includes>
</testResource>
</testResources>
<plugins>
<plugin>
@ -103,15 +102,6 @@
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<systemPropertyVariables>
<java.util.logging.config.file>${surefire-top-level}/solr/testlogging.properties</java.util.logging.config.file>
</systemPropertyVariables>
</configuration>
</plugin>
</plugins>
</build>
</project>

@ -38,18 +38,11 @@
<module-directory>solr/contrib/extraction</module-directory>
<top-level>../../../..</top-level>
<module-path>${top-level}/${module-directory}</module-path>
<surefire-top-level>${top-level}/../..</surefire-top-level>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>
@ -102,17 +95,12 @@
<testResource>
<directory>${top-level}/solr/core/src/test-files</directory>
</testResource>
<testResource>
<directory>${top-level}/dev-tools/maven/solr</directory>
<includes>
<include>maven.testlogging.properties</include>
</includes>
</testResource>
</testResources>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<systemPropertyVariables>
<java.util.logging.config.file>${surefire-top-level}/solr/testlogging.properties</java.util.logging.config.file>
</systemPropertyVariables>
</configuration>
</plugin>
</plugins>
</build>
</project>

@ -39,18 +39,11 @@
<module-directory>solr/contrib/langid</module-directory>
<top-level>../../../..</top-level>
<module-path>${top-level}/${module-directory}</module-path>
<surefire-top-level>${top-level}/../..</surefire-top-level>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>
@ -107,17 +100,12 @@
<testResource>
<directory>${top-level}/solr/core/src/test-files</directory>
</testResource>
<testResource>
<directory>${top-level}/dev-tools/maven/solr</directory>
<includes>
<include>maven.testlogging.properties</include>
</includes>
</testResource>
</testResources>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<systemPropertyVariables>
<java.util.logging.config.file>${surefire-top-level}/solr/testlogging.properties</java.util.logging.config.file>
</systemPropertyVariables>
</configuration>
</plugin>
</plugins>
</build>
</project>

@ -35,18 +35,11 @@
<module-directory>solr/contrib/uima</module-directory>
<top-level>../../../..</top-level>
<module-path>${top-level}/${module-directory}</module-path>
<surefire-top-level>${top-level}/../..</surefire-top-level>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>
@ -121,17 +114,12 @@
<testResource>
<directory>${module-path}/src/test-files</directory>
</testResource>
<testResource>
<directory>${top-level}/dev-tools/maven/solr</directory>
<includes>
<include>maven.testlogging.properties</include>
</includes>
</testResource>
</testResources>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<systemPropertyVariables>
<java.util.logging.config.file>${surefire-top-level}/solr/testlogging.properties</java.util.logging.config.file>
</systemPropertyVariables>
</configuration>
</plugin>
</plugins>
</build>
</project>

@ -35,18 +35,11 @@
<module-directory>solr/contrib/velocity</module-directory>
<top-level>../../../..</top-level>
<module-path>${top-level}/${module-directory}</module-path>
<surefire-top-level>${top-level}/../..</surefire-top-level>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>
@ -142,17 +135,12 @@
<testResource>
<directory>${top-level}/solr/core/src/test-files</directory>
</testResource>
<testResource>
<directory>${top-level}/dev-tools/maven/solr</directory>
<includes>
<include>maven.testlogging.properties</include>
</includes>
</testResource>
</testResources>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<systemPropertyVariables>
<java.util.logging.config.file>${surefire-top-level}/solr/testlogging.properties</java.util.logging.config.file>
</systemPropertyVariables>
</configuration>
</plugin>
</plugins>
</build>
</project>

@ -35,18 +35,11 @@
<module-directory>solr/core</module-directory>
<top-level>../../..</top-level>
<module-path>${top-level}/${module-directory}</module-path>
<surefire-top-level>${top-level}/../..</surefire-top-level>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>
@ -243,48 +236,14 @@
<testResource>
<directory>${top-level}/solr/solrj/src/test-files</directory>
</testResource>
<testResource>
<directory>${top-level}/dev-tools/maven/solr</directory>
<includes>
<include>maven.testlogging.properties</include>
</includes>
</testResource>
</testResources>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<systemPropertyVariables>
<java.util.logging.config.file>${surefire-top-level}/solr/testlogging.properties</java.util.logging.config.file>
</systemPropertyVariables>
</configuration>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>appassembler-maven-plugin</artifactId>
<configuration>
<extraJvmArguments>-Xmx128M</extraJvmArguments>
<repositoryLayout>flat</repositoryLayout>
<platforms>
<platform>windows</platform>
<platform>unix</platform>
</platforms>
<programs>
<program>
<mainClass>org.apache.solr.client.solrj.embedded.JettySolrRunner</mainClass>
<name>JettySolrRunner</name>
</program>
<program>
<mainClass>org.apache.solr.util.BitSetPerf</mainClass>
<name>BitSetPerf</name>
<extraJvmArguments>-Xms128m -Xbatch</extraJvmArguments>
</program>
<program>
<mainClass>org.apache.solr.util.SimplePostTool</mainClass>
<name>SimplePostTool</name>
</program>
<program>
<mainClass>org.apache.solr.util.SuggestMissingFactories</mainClass>
<name>SuggestMissingFactories</name>
</program>
</programs>
</configuration>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>

@ -0,0 +1,2 @@
handlers=java.util.logging.ConsoleHandler
.level=SEVERE

@ -43,26 +43,14 @@
<module-directory>solr</module-directory>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<issueManagement>
<system>JIRA</system>
<url>http://issues.apache.org/jira/browse/SOLR</url>
<url>https://issues.apache.org/jira/browse/SOLR</url>
</issueManagement>
<ciManagement>
<system>Hudson</system>
<url>
http://lucene.zones.apache.org:8080/hudson/job/Solr-Nightly/
</url>
</ciManagement>
<mailingLists>
<mailingList>
<name>Solr User List</name>
@ -111,6 +99,15 @@
<doctitle>${project.name} ${project.version} API (${now.version})</doctitle>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<systemPropertyVariables>
<java.util.logging.config.file>../test-classes/maven.testlogging.properties</java.util.logging.config.file>
</systemPropertyVariables>
</configuration>
</plugin>
</plugins>
</pluginManagement>
</build>

@ -37,15 +37,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>

@ -37,15 +37,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<!-- These dependencies are compile scope because this is a test framework. -->
@ -60,20 +54,29 @@
<artifactId>solr-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>javax.servlet</groupId>
<artifactId>servlet-api</artifactId>
<!-- SOLR-3263: Provided scope is required to avoid jar signing conflicts -->
<scope>provided</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
<!-- If your tests don't use BaseDistributedSearchTestCase or SolrJettyTestBase,
you can exclude the three Jetty dependencies below. -->
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-server</artifactId>
<scope>runtime</scope>
<artifactId>jetty-servlet</artifactId>
</dependency>
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-util</artifactId>
</dependency>
<!-- If your tests don't use BaseDistributedSearchTestCase or SolrJettyTestBase,
you can exclude the two Jetty dependencies below. -->
<dependency>
<groupId>org.eclipse.jetty</groupId>
<artifactId>jetty-server</artifactId>
<scope>runtime</scope>
</dependency>
<dependency>

@ -37,15 +37,9 @@
<module-path>${top-level}/${module-directory}</module-path>
</properties>
<scm>
<connection>
scm:svn:http://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</connection>
<developerConnection>
scm:svn:https://svn.apache.org/repos/asf/lucene/dev/trunk/${module-directory}
</developerConnection>
<url>
http://svn.apache.org/viewvc/lucene/dev/trunk/${module-directory}
</url>
<connection>scm:svn:${vc-anonymous-base-url}/${module-directory}</connection>
<developerConnection>scm:svn:${vc-dev-base-url}/${module-directory}</developerConnection>
<url>${vc-browse-base-url}/${module-directory}</url>
</scm>
<dependencies>
<dependency>

@ -58,7 +58,7 @@ def javaExe(version):
def verifyJavaVersion(version):
s = os.popen('%s; java -version 2>&1' % javaExe(version)).read()
if s.find('java version "%s.' % version) == -1:
if s.find(' version "%s.' % version) == -1:
raise RuntimeError('got wrong version for java %s:\n%s' % (version, s))
# http://s.apache.org/lusolr32rc2
@ -363,6 +363,10 @@ def verifyDigests(artifact, urlString, tmpDir):
raise RuntimeError('SHA1 digest mismatch for %s: expected %s but got %s' % (artifact, sha1Expected, sha1Actual))
def getDirEntries(urlString):
if urlString.startswith('file:/') and not urlString.startswith('file://'):
# stupid bogus ant URI
urlString = "file:///" + urlString[6:]
if urlString.startswith('file://'):
path = urlString[7:]
if path.endswith('/'):
@ -1026,7 +1030,7 @@ def crawl(downloadedFiles, urlString, targetDir, exclusions=set()):
def main():
if len(sys.argv) != 4:
if len(sys.argv) < 4:
print()
print('Usage: python -u %s BaseURL version tmpDir [isSigned]' % sys.argv[0])
print()
@ -1035,8 +1039,11 @@ def main():
baseURL = sys.argv[1]
version = sys.argv[2]
tmpDir = os.path.abspath(sys.argv[3])
isSigned = True
if len(sys.argv) == 5:
isSigned = (sys.argv[4] == "True")
smokeTest(baseURL, version, tmpDir, True)
smokeTest(baseURL, version, tmpDir, isSigned)
def smokeTest(baseURL, version, tmpDir, isSigned):
@ -1090,4 +1097,5 @@ if __name__ == '__main__':
except:
import traceback
traceback.print_exc()
sys.exit(1)
sys.exit(0)

@ -6,6 +6,56 @@ http://s.apache.org/luceneversions
======================= Lucene 5.0.0 =======================
======================= Lucene 4.0.0 =======================
New Features
* LUCENE-1888: Added the option to store payloads in the term
vectors (IndexableFieldType.storeTermVectorPayloads()). Note
that you must store term vector positions to store payloads.
(Robert Muir)
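  For illustration, a minimal sketch of enabling this option with the 4.0
  field API (the "body" field, document, and text are assumed here, not part
  of the change itself):

    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);   // positions are required for payloads
    ft.setStoreTermVectorPayloads(true);
    doc.add(new Field("body", "some text", ft));  // doc is an assumed Document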
API Changes
* LUCENE-4299: Added Terms.hasPositions() and Terms.hasOffsets().
Previously you had no real way to know that a term vector field
had positions or offsets, since this can be configured on a
per-field-per-document basis. (Robert Muir)
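  A minimal sketch of checking the new flags (assumes an open IndexReader
  named reader; MultiFields provides a merged view over its segments):

    Terms terms = MultiFields.getTerms(reader, "body");
    if (terms != null) {
      boolean hasPositions = terms.hasPositions();  // positions indexed for this field?
      boolean hasOffsets = terms.hasOffsets();      // offsets indexed for this field?
    }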
* Removed DocsAndPositionsEnum.hasPayload() and simplified the
contract of getPayload(). It returns null if there is no payload,
otherwise returns the current payload. You can now call it multiple
times per position if you want. (Robert Muir)
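  Under the new contract, consuming payloads might look like this sketch
  (termsEnum is assumed to be positioned on a term whose field has positions):

    DocsAndPositionsEnum postings = termsEnum.docsAndPositions(null, null);
    while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      for (int i = 0; i < postings.freq(); i++) {
        postings.nextPosition();
        BytesRef payload = postings.getPayload();  // null when this position has none
      }
    }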
* Removed FieldsEnum. Fields API instead implements Iterable<String>
and exposes Iterator, so you can iterate over field names with
for (String field : fields) instead. (Robert Muir)
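  A short sketch of the new iteration style (reader is an assumed open
  IndexReader):

    Fields fields = MultiFields.getFields(reader);
    if (fields != null) {
      for (String field : fields) {
        Terms terms = fields.terms(field);   // per-field Terms, usable as before
      }
    }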
Bug Fixes
* LUCENE-4297: BooleanScorer2 would multiply the coord() factor
twice for conjunctions: for most users this is no problem, but
if you had a customized Similarity that returned something other
than 1 when overlap == maxOverlap (always the case for conjunctions),
then the score would be incorrect. (Pascal Chollet, Robert Muir)
* LUCENE-4298: MultiFields.getTermDocsEnum(IndexReader, Bits, String, BytesRef)
did not work at all, it would infinitely recurse.
(Alberto Paro via Robert Muir)
* LUCENE-4300: BooleanQuery's rewrite was not always safe: if you
had a custom Similarity where coord(1,1) != 1F, then the rewritten
query would be scored differently. (Robert Muir)
* Don't allow negatives in the positions file. If you have an index
from 2.4.0 or earlier with such negative positions, and you already
upgraded to 3.x, then to Lucene 4.0-ALPHA or -BETA, you should run
CheckIndex. If it fails, then you need to upgrade again to 4.0 (Robert Muir)
Build
* LUCENE-3985: Upgrade to randomizedtesting 2.0.0. Added support for
thread leak detection. Added support for suite timeouts. (Dawid Weiss)
======================= Lucene 4.0.0-BETA =======================
@ -47,6 +97,11 @@ New features
int docID), to attempt deletion by docID as long as the provided
reader is an NRT reader, and the segment has not yet been merged
away (Mike McCandless).
* LUCENE-4286: Added option to CJKBigramFilter to always also output
unigrams. This can be used for a unigram+bigram approach, or at
index-time only for better support of short queries.
(Tom Burton-West, Robert Muir)
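  A minimal sketch of enabling this option via the new three-argument
  constructor (the analyzer class below is illustrative and assumes
  Version.LUCENE_40):

    import java.io.Reader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.cjk.CJKBigramFilter;
    import org.apache.lucene.analysis.standard.StandardTokenizer;
    import org.apache.lucene.util.Version;

    public class UnigramBigramAnalyzer extends Analyzer {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer source = new StandardTokenizer(Version.LUCENE_40, reader);
        // The final 'true' requests unigrams in addition to bigrams.
        TokenStream result = new CJKBigramFilter(source,
            CJKBigramFilter.HAN | CJKBigramFilter.HIRAGANA
                | CJKBigramFilter.KATAKANA | CJKBigramFilter.HANGUL, true);
        return new TokenStreamComponents(source, result);
      }
    }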
API Changes
@ -115,6 +170,10 @@ Optimizations
making them substantially more lightweight. Behavior is unchanged.
(Robert Muir)
* LUCENE-4291: Reduced internal buffer size for Jflex-based tokenizers
such as StandardTokenizer from 32kb to 8kb.
(Raintung Li, Steven Rowe, Robert Muir)
Bug Fixes
* LUCENE-4109: BooleanQueries are not parsed correctly with the
@ -164,6 +223,9 @@ Bug Fixes
* LUCENE-4282: Automaton FuzzyQuery didn't always deliver all results.
(Johannes Christen, Uwe Schindler, Robert Muir)
* LUCENE-4289: Fix minor idf inconsistencies/inefficiencies in highlighter.
(Robert Muir)
Changes in Runtime Behavior
* LUCENE-4109: Enable position increments in the flexible queryparser by default.

View File

@ -9,7 +9,7 @@ enumeration APIs. Here are the major changes:
by the BytesRef class (which provides an offset + length "slice"
into an existing byte[]).
* Fields are separately enumerated (FieldsEnum) from the terms
* Fields are separately enumerated (Fields.iterator()) from the terms
within each field (TermEnum). So instead of this:
TermEnum termsEnum = ...;
@ -20,10 +20,8 @@ enumeration APIs. Here are the major changes:
Do this:
FieldsEnum fieldsEnum = ...;
String field;
while((field = fieldsEnum.next()) != null) {
TermsEnum termsEnum = fieldsEnum.terms();
for(String field : fields) {
TermsEnum termsEnum = fields.terms(field);
BytesRef text;
while((text = termsEnum.next()) != null) {
System.out.println("field=" + field + "; text=" + text.utf8ToString());
@ -316,11 +314,12 @@ an AtomicReader. Note: using "atomicity emulators" can cause serious
slowdowns due to the need to merge terms, postings, DocValues, and
FieldCache, use them with care!
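For reference, a minimal sketch of one such emulator,
SlowCompositeReaderWrapper (the helper class below is illustrative):

    import java.io.IOException;
    import org.apache.lucene.index.AtomicReader;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.SlowCompositeReaderWrapper;
    import org.apache.lucene.store.Directory;

    public class AtomicityEmulatorExample {
      // Flattens a composite reader into a single AtomicReader view;
      // handy when porting old code, but potentially very slow.
      public static AtomicReader openAtomic(Directory dir) throws IOException {
        DirectoryReader composite = DirectoryReader.open(dir);
        return SlowCompositeReaderWrapper.wrap(composite);
      }
    }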
## LUCENE-2413: Analyzer package changes
## LUCENE-2413,LUCENE-3396: Analyzer package changes
Lucene's core and contrib analyzers, along with Solr's analyzers,
were consolidated into lucene/analysis. During the refactoring some
package names have changed:
package names have changed, and ReusableAnalyzerBase was renamed to
Analyzer:
- o.a.l.analysis.KeywordAnalyzer -> o.a.l.analysis.core.KeywordAnalyzer
- o.a.l.analysis.KeywordTokenizer -> o.a.l.analysis.core.KeywordTokenizer
@ -345,7 +344,7 @@ package names have changed:
- o.a.l.analysis.NormalizeCharMap -> o.a.l.analysis.charfilter.NormalizeCharMap
- o.a.l.analysis.CharArraySet -> o.a.l.analysis.util.CharArraySet
- o.a.l.analysis.CharArrayMap -> o.a.l.analysis.util.CharArrayMap
- o.a.l.analysis.ReusableAnalyzerBase -> o.a.l.analysis.util.ReusableAnalyzerBase
- o.a.l.analysis.ReusableAnalyzerBase -> o.a.l.analysis.Analyzer
- o.a.l.analysis.StopwordAnalyzerBase -> o.a.l.analysis.util.StopwordAnalyzerBase
- o.a.l.analysis.WordListLoader -> o.a.l.analysis.util.WordListLoader
- o.a.l.analysis.CharTokenizer -> o.a.l.analysis.util.CharTokenizer
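As a concrete illustration of these renames, a 3.x import block could be
updated as follows (the wrapper class below is illustrative and assumes
Version.LUCENE_40):

    // 3.x:
    //   import org.apache.lucene.analysis.KeywordAnalyzer;
    //   import org.apache.lucene.analysis.CharArraySet;
    //   import org.apache.lucene.analysis.ReusableAnalyzerBase;
    // 4.0:
    import org.apache.lucene.analysis.Analyzer;             // was ReusableAnalyzerBase
    import org.apache.lucene.analysis.core.KeywordAnalyzer;
    import org.apache.lucene.analysis.util.CharArraySet;
    import org.apache.lucene.util.Version;

    public class RenamedImports {
      Analyzer analyzer = new KeywordAnalyzer();
      CharArraySet stopSet = new CharArraySet(Version.LUCENE_40, 16, true);
    }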

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/26/12 6:22 PM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
package org.apache.lucene.analysis.charfilter;
@ -40,8 +40,8 @@ import org.apache.lucene.analysis.util.OpenStringBuilder;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
* on 7/26/12 6:22 PM from the specification file
* <tt>C:/svn/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
* on 8/6/12 11:57 AM from the specification file
* <tt>/home/rmuir/workspace/lucene-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex</tt>
*/
public final class HTMLStripCharFilter extends BaseCharFilter {
@ -31255,6 +31255,93 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
{ yybegin(STYLE);
}
case 55: break;
case 27:
{ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return BLOCK_LEVEL_START_TAG_REPLACEMENT;
}
case 56: break;
case 30:
{ int length = yylength();
inputSegment.write(zzBuffer, zzStartRead, length);
entitySegment.clear();
char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
entitySegment.append(ch);
outputSegment = entitySegment;
yybegin(CHARACTER_REFERENCE_TAIL);
}
case 57: break;
case 48:
{ inputSegment.clear();
yybegin(YYINITIAL);
// add (previously matched input length) -- current match and substitution handled below
cumulativeDiff += yychar - inputStart;
// position the offset correction at (already output length) -- substitution handled below
int offsetCorrectionPos = outputCharCount;
int returnValue;
if (escapeSTYLE) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
// add (this match length) - (substitution length)
cumulativeDiff += yylength() - 1;
// add (substitution length)
++offsetCorrectionPos;
returnValue = STYLE_REPLACEMENT;
}
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
return returnValue;
}
case 58: break;
case 8:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(START_TAG_TAIL_INCLUDE);
} else {
yybegin(START_TAG_TAIL_SUBSTITUTE);
}
}
case 59: break;
case 2:
{ inputStart = yychar;
inputSegment.clear();
inputSegment.append('<');
yybegin(LEFT_ANGLE_BRACKET);
}
case 60: break;
case 44:
{ restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
}
case 61: break;
case 21:
{ previousRestoreState = restoreState;
restoreState = SERVER_SIDE_INCLUDE;
yybegin(SINGLE_QUOTED_STRING);
}
case 62: break;
case 11:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
yybegin(LEFT_ANGLE_BRACKET_SPACE);
}
case 63: break;
case 35:
{ yybegin(SCRIPT);
}
case 64: break;
case 42:
{ restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE);
}
case 65: break;
case 10:
{ inputSegment.append('!'); yybegin(BANG);
}
case 66: break;
case 51:
{ // Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
@ -31288,13 +31375,331 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
case 56: break;
case 21:
case 67: break;
case 4:
{ yypushback(1);
outputSegment = inputSegment;
outputSegment.restart();
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
case 68: break;
case 43:
{ restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
}
case 69: break;
case 52:
{ // Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try { // High surrogates are in decimal range [55296, 56319]
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(1, 6) + "'";
}
if (Character.isHighSurrogate(highSurrogate)) {
outputSegment = entitySegment;
outputSegment.clear();
try {
outputSegment.unsafeWrite
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
case 70: break;
case 28:
{ restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
}
case 71: break;
case 50:
{ // Handle paired UTF-16 surrogates.
outputSegment = entitySegment;
outputSegment.clear();
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try {
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(2, 6) + "'";
}
try {
outputSegment.unsafeWrite
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
case 72: break;
case 16:
{ restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
}
case 73: break;
case 22:
{ previousRestoreState = restoreState;
restoreState = SERVER_SIDE_INCLUDE;
yybegin(SINGLE_QUOTED_STRING);
yybegin(DOUBLE_QUOTED_STRING);
}
case 57: break;
case 74: break;
case 26:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
outputSegment = inputSegment;
yybegin(YYINITIAL);
}
case 75: break;
case 20:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
}
case 76: break;
case 47:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(CDATA);
}
case 77: break;
case 33:
{ yybegin(YYINITIAL);
if (escapeBR) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.reset();
return BR_START_TAG_REPLACEMENT;
}
}
case 78: break;
case 23:
{ yybegin(restoreState); restoreState = previousRestoreState;
}
case 79: break;
case 32:
{ yybegin(COMMENT);
}
case 80: break;
case 24:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
case 81: break;
case 3:
{ inputStart = yychar;
inputSegment.clear();
inputSegment.append('&');
yybegin(AMPERSAND);
}
case 82: break;
case 46:
{ yybegin(SCRIPT);
if (escapeSCRIPT) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
inputStart += 1 + yylength();
return outputSegment.nextChar();
}
}
case 83: break;
case 14:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
case 84: break;
case 6:
{ int matchLength = yylength();
inputSegment.write(zzBuffer, zzStartRead, matchLength);
if (matchLength <= 7) { // 0x10FFFF = 1114111: max 7 decimal chars
String decimalCharRef = yytext();
int codePoint = 0;
try {
codePoint = Integer.parseInt(decimalCharRef);
} catch(Exception e) {
assert false: "Exception parsing code point '" + decimalCharRef + "'";
}
if (codePoint <= 0x10FFFF) {
outputSegment = entitySegment;
outputSegment.clear();
if (codePoint >= Character.MIN_SURROGATE
&& codePoint <= Character.MAX_SURROGATE) {
outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
} else {
outputSegment.setLength
(Character.toChars(codePoint, outputSegment.getArray(), 0));
}
yybegin(CHARACTER_REFERENCE_TAIL);
} else {
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
} else {
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
case 85: break;
case 34:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
cumulativeDiff += yychar - inputStart + yylength();
// position the correction at (already output length) [ + (substitution length) = 0]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
case 86: break;
case 5:
{ inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
}
case 87: break;
case 13:
{ inputSegment.append(zzBuffer[zzStartRead]);
}
case 88: break;
case 18:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(END_TAG_TAIL_INCLUDE);
} else {
yybegin(END_TAG_TAIL_SUBSTITUTE);
}
}
case 89: break;
case 40:
{ yybegin(SCRIPT_COMMENT);
}
case 90: break;
case 37:
{ // add (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
yybegin(YYINITIAL);
}
case 91: break;
case 12:
{ inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
}
case 92: break;
case 9:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(START_TAG_TAIL_INCLUDE);
} else {
yybegin(START_TAG_TAIL_EXCLUDE);
}
}
case 93: break;
case 49:
{ inputSegment.clear();
yybegin(YYINITIAL);
// add (previously matched input length) -- current match and substitution handled below
cumulativeDiff += yychar - inputStart;
// position at (already output length) -- substitution handled below
int offsetCorrectionPos = outputCharCount;
int returnValue;
if (escapeSCRIPT) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
// add (this match length) - (substitution length)
cumulativeDiff += yylength() - 1;
// add (substitution length)
++offsetCorrectionPos;
returnValue = SCRIPT_REPLACEMENT;
}
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
return returnValue;
}
case 94: break;
case 29:
{ restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
}
case 95: break;
case 17:
{ restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
}
case 96: break;
case 45:
{ yybegin(STYLE);
if (escapeSTYLE) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
inputStart += 1 + yylength();
return outputSegment.nextChar();
}
}
case 97: break;
case 7:
{ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
case 98: break;
case 19:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(END_TAG_TAIL_INCLUDE);
} else {
yybegin(END_TAG_TAIL_EXCLUDE);
}
}
case 99: break;
case 25:
{ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return BLOCK_LEVEL_END_TAG_REPLACEMENT;
}
case 100: break;
case 31:
{ int matchLength = yylength();
inputSegment.write(zzBuffer, zzStartRead, matchLength);
@ -31329,66 +31734,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
return outputSegment.nextChar();
}
}
case 58: break;
case 19:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(END_TAG_TAIL_INCLUDE);
} else {
yybegin(END_TAG_TAIL_EXCLUDE);
}
}
case 59: break;
case 2:
{ inputStart = yychar;
inputSegment.clear();
inputSegment.append('<');
yybegin(LEFT_ANGLE_BRACKET);
}
case 60: break;
case 27:
{ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return BLOCK_LEVEL_START_TAG_REPLACEMENT;
}
case 61: break;
case 44:
{ restoreState = STYLE_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
}
case 62: break;
case 35:
{ yybegin(SCRIPT);
}
case 63: break;
case 42:
{ restoreState = COMMENT; yybegin(SERVER_SIDE_INCLUDE);
}
case 64: break;
case 10:
{ inputSegment.append('!'); yybegin(BANG);
}
case 65: break;
case 33:
{ yybegin(YYINITIAL);
if (escapeBR) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
return outputSegment.nextChar();
} else {
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.reset();
return BR_START_TAG_REPLACEMENT;
}
}
case 66: break;
case 101: break;
case 53:
{ // Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
@ -31424,288 +31770,7 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
case 67: break;
case 43:
{ restoreState = SCRIPT_COMMENT; yybegin(SERVER_SIDE_INCLUDE);
}
case 68: break;
case 30:
{ int length = yylength();
inputSegment.write(zzBuffer, zzStartRead, length);
entitySegment.clear();
char ch = entityValues.get(zzBuffer, zzStartRead, length).charValue();
entitySegment.append(ch);
outputSegment = entitySegment;
yybegin(CHARACTER_REFERENCE_TAIL);
}
case 69: break;
case 28:
{ restoreState = STYLE_COMMENT; yybegin(SINGLE_QUOTED_STRING);
}
case 70: break;
case 3:
{ inputStart = yychar;
inputSegment.clear();
inputSegment.append('&');
yybegin(AMPERSAND);
}
case 71: break;
case 16:
{ restoreState = SCRIPT_COMMENT; yybegin(SINGLE_QUOTED_STRING);
}
case 72: break;
case 52:
{ // Handle paired UTF-16 surrogates.
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try { // High surrogates are in decimal range [55296, 56319]
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(1, 6));
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(1, 6) + "'";
}
if (Character.isHighSurrogate(highSurrogate)) {
outputSegment = entitySegment;
outputSegment.clear();
try {
outputSegment.unsafeWrite
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
yypushback(surrogatePair.length() - 1); // Consume only '#'
inputSegment.append('#');
yybegin(NUMERIC_CHARACTER);
}
case 73: break;
case 6:
{ int matchLength = yylength();
inputSegment.write(zzBuffer, zzStartRead, matchLength);
if (matchLength <= 7) { // 0x10FFFF = 1114111: max 7 decimal chars
String decimalCharRef = yytext();
int codePoint = 0;
try {
codePoint = Integer.parseInt(decimalCharRef);
} catch(Exception e) {
assert false: "Exception parsing code point '" + decimalCharRef + "'";
}
if (codePoint <= 0x10FFFF) {
outputSegment = entitySegment;
outputSegment.clear();
if (codePoint >= Character.MIN_SURROGATE
&& codePoint <= Character.MAX_SURROGATE) {
outputSegment.unsafeWrite(REPLACEMENT_CHARACTER);
} else {
outputSegment.setLength
(Character.toChars(codePoint, outputSegment.getArray(), 0));
}
yybegin(CHARACTER_REFERENCE_TAIL);
} else {
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
} else {
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
}
case 74: break;
case 37:
{ // add (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
yybegin(YYINITIAL);
}
case 75: break;
case 8:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(START_TAG_TAIL_INCLUDE);
} else {
yybegin(START_TAG_TAIL_SUBSTITUTE);
}
}
case 76: break;
case 46:
{ yybegin(SCRIPT);
if (escapeSCRIPT) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
inputStart += 1 + yylength();
return outputSegment.nextChar();
}
}
case 77: break;
case 11:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
yybegin(LEFT_ANGLE_BRACKET_SPACE);
}
case 78: break;
case 20:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
}
case 79: break;
case 34:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0]
cumulativeDiff += yychar - inputStart + yylength();
// position the correction at (already output length) [ + (substitution length) = 0]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
case 80: break;
case 23:
{ yybegin(restoreState); restoreState = previousRestoreState;
}
case 81: break;
case 32:
{ yybegin(COMMENT);
}
case 82: break;
case 14:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
}
case 83: break;
case 18:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(END_TAG_TAIL_INCLUDE);
} else {
yybegin(END_TAG_TAIL_SUBSTITUTE);
}
}
case 84: break;
case 25:
{ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 1;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 1, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return BLOCK_LEVEL_END_TAG_REPLACEMENT;
}
case 85: break;
case 7:
{ // add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - outputSegment.length();
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
case 86: break;
case 48:
{ inputSegment.clear();
yybegin(YYINITIAL);
// add (previously matched input length) -- current match and substitution handled below
cumulativeDiff += yychar - inputStart;
// position the offset correction at (already output length) -- substitution handled below
int offsetCorrectionPos = outputCharCount;
int returnValue;
if (escapeSTYLE) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
// add (this match length) - (substitution length)
cumulativeDiff += yylength() - 1;
// add (substitution length)
++offsetCorrectionPos;
returnValue = STYLE_REPLACEMENT;
}
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
return returnValue;
}
case 87: break;
case 5:
{ inputSegment.append('#'); yybegin(NUMERIC_CHARACTER);
}
case 88: break;
case 26:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
outputSegment = inputSegment;
yybegin(YYINITIAL);
}
case 89: break;
case 13:
{ inputSegment.append(zzBuffer[zzStartRead]);
}
case 90: break;
case 50:
{ // Handle paired UTF-16 surrogates.
outputSegment = entitySegment;
outputSegment.clear();
String surrogatePair = yytext();
char highSurrogate = '\u0000';
try {
highSurrogate = (char)Integer.parseInt(surrogatePair.substring(2, 6), 16);
} catch(Exception e) { // should never happen
assert false: "Exception parsing high surrogate '"
+ surrogatePair.substring(2, 6) + "'";
}
try {
outputSegment.unsafeWrite
((char)Integer.parseInt(surrogatePair.substring(10, 14), 16));
} catch(Exception e) { // should never happen
assert false: "Exception parsing low surrogate '"
+ surrogatePair.substring(10, 14) + "'";
}
// add (previously matched input length) + (this match length) - (substitution length)
cumulativeDiff += inputSegment.length() + yylength() - 2;
// position the correction at (already output length) + (substitution length)
addOffCorrectMap(outputCharCount + 2, cumulativeDiff);
inputSegment.clear();
yybegin(YYINITIAL);
return highSurrogate;
}
case 91: break;
case 40:
{ yybegin(SCRIPT_COMMENT);
}
case 92: break;
case 45:
{ yybegin(STYLE);
if (escapeSTYLE) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
inputStart += 1 + yylength();
return outputSegment.nextChar();
}
}
case 93: break;
case 22:
{ previousRestoreState = restoreState;
restoreState = SERVER_SIDE_INCLUDE;
yybegin(DOUBLE_QUOTED_STRING);
}
case 94: break;
case 12:
{ inputSegment.append('/'); yybegin(LEFT_ANGLE_BRACKET_SLASH);
}
case 95: break;
case 102: break;
case 36:
{ yybegin(YYINITIAL);
if (escapeBR) {
@ -31721,83 +31786,18 @@ public final class HTMLStripCharFilter extends BaseCharFilter {
return BR_END_TAG_REPLACEMENT;
}
}
case 96: break;
case 24:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
case 97: break;
case 47:
{ // add (previously matched input length) + (this match length) [ - (substitution length) = 0 ]
cumulativeDiff += inputSegment.length() + yylength();
// position the correction at (already output length) [ + (substitution length) = 0 ]
addOffCorrectMap(outputCharCount, cumulativeDiff);
inputSegment.clear();
yybegin(CDATA);
}
case 98: break;
case 29:
{ restoreState = STYLE_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
}
case 99: break;
case 17:
{ restoreState = SCRIPT_COMMENT; yybegin(DOUBLE_QUOTED_STRING);
}
case 100: break;
case 9:
{ inputSegment.write(zzBuffer, zzStartRead, yylength());
if (null != escapedTags
&& escapedTags.contains(zzBuffer, zzStartRead, yylength())) {
yybegin(START_TAG_TAIL_INCLUDE);
} else {
yybegin(START_TAG_TAIL_EXCLUDE);
}
}
case 101: break;
case 49:
{ inputSegment.clear();
yybegin(YYINITIAL);
// add (previously matched input length) -- current match and substitution handled below
cumulativeDiff += yychar - inputStart;
// position at (already output length) -- substitution handled below
int offsetCorrectionPos = outputCharCount;
int returnValue;
if (escapeSCRIPT) {
inputSegment.write(zzBuffer, zzStartRead, yylength());
outputSegment = inputSegment;
returnValue = outputSegment.nextChar();
} else {
// add (this match length) - (substitution length)
cumulativeDiff += yylength() - 1;
// add (substitution length)
++offsetCorrectionPos;
returnValue = SCRIPT_REPLACEMENT;
}
addOffCorrectMap(offsetCorrectionPos, cumulativeDiff);
return returnValue;
}
case 102: break;
case 103: break;
case 38:
{ yybegin(restoreState);
}
case 103: break;
case 104: break;
case 41:
{ yybegin(STYLE_COMMENT);
}
case 104: break;
case 105: break;
case 1:
{ return zzBuffer[zzStartRead];
}
case 105: break;
case 4:
{ yypushback(1);
outputSegment = inputSegment;
outputSegment.restart();
yybegin(YYINITIAL);
return outputSegment.nextChar();
}
case 106: break;
default:
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {

View File

@ -141,9 +141,9 @@ InlineElment = ( [aAbBiIqQsSuU] |
[vV][aA][rR] )
%include src/java/org/apache/lucene/analysis/charfilter/HTMLCharacterEntities.jflex
%include HTMLCharacterEntities.jflex
%include src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
%include HTMLStripCharFilter.SUPPLEMENTARY.jflex-macro
%{
private static final int INITIAL_INPUT_SEGMENT_SIZE = 1024;

View File

@ -24,6 +24,8 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.ArrayUtil;
@ -35,6 +37,12 @@ import org.apache.lucene.util.ArrayUtil;
* {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which
* of the CJK scripts are turned into bigrams.
* <p>
* By default, when a CJK character has no adjacent characters to form
* a bigram, it is output in unigram form. If you want to always output
* both unigrams and bigrams, set the <code>outputUnigrams</code>
* flag in {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)}.
* This can be used for a combined unigram+bigram approach.
* <p>
* In all cases, all non-CJK input is passed through unmodified.
*/
public final class CJKBigramFilter extends TokenFilter {
@ -67,10 +75,16 @@ public final class CJKBigramFilter extends TokenFilter {
private final Object doHiragana;
private final Object doKatakana;
private final Object doHangul;
// true if we should output unigram tokens always
private final boolean outputUnigrams;
private boolean ngramState; // false = output unigram, true = output bigram
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);
// buffers containing codepoint and offsets in parallel
int buffer[] = new int[8];
@ -88,23 +102,36 @@ public final class CJKBigramFilter extends TokenFilter {
/**
* Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int)
* CJKBigramFilter(HAN | HIRAGANA | KATAKANA | HANGUL)}
* CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)}
*/
public CJKBigramFilter(TokenStream in) {
this(in, HAN | HIRAGANA | KATAKANA | HANGUL);
}
/**
* Create a new CJKBigramFilter, specifying which writing systems should be bigrammed.
* @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
* {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
* Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)
* CJKBigramFilter(in, flags, false)}
*/
public CJKBigramFilter(TokenStream in, int flags) {
this(in, flags, false);
}
/**
* Create a new CJKBigramFilter, specifying which writing systems should be bigrammed,
* and whether or not unigrams should also be output.
* @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
* {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
* @param outputUnigrams true if unigrams for the selected writing systems should also be output.
* When this is false, unigrams are only output when there are no adjacent
* characters to form a bigram.
*/
public CJKBigramFilter(TokenStream in, int flags, boolean outputUnigrams) {
super(in);
doHan = (flags & HAN) == 0 ? NO : HAN_TYPE;
doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE;
this.outputUnigrams = outputUnigrams;
}
/*
@ -120,7 +147,24 @@ public final class CJKBigramFilter extends TokenFilter {
// case 1: we have multiple remaining codepoints buffered,
// so we can emit a bigram here.
flushBigram();
if (outputUnigrams) {
// when also outputting unigrams, we output the unigram first,
// then rewind back to revisit the bigram.
// so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
// the logic in hasBufferedUnigram ensures we output the C,
// even though it did actually have adjacent CJK characters.
if (ngramState) {
flushBigram();
} else {
flushUnigram();
index--;
}
ngramState = !ngramState;
} else {
flushBigram();
}
return true;
} else if (doNext()) {
@ -260,6 +304,11 @@ public final class CJKBigramFilter extends TokenFilter {
termAtt.setLength(len2);
offsetAtt.setOffset(startOffset[index], endOffset[index+1]);
typeAtt.setType(DOUBLE_TYPE);
// when outputting unigrams, all bigrams are synonyms that span two unigrams
if (outputUnigrams) {
posIncAtt.setPositionIncrement(0);
posLengthAtt.setPositionLength(2);
}
index++;
}
@ -292,7 +341,13 @@ public final class CJKBigramFilter extends TokenFilter {
* inputs.
*/
private boolean hasBufferedUnigram() {
return bufferLen == 1 && index == 0;
if (outputUnigrams) {
// when outputting unigrams always
return bufferLen - index == 1;
} else {
// otherwise it's only when we have a lone CJK character
return bufferLen == 1 && index == 0;
}
}
@Override
@ -303,5 +358,6 @@ public final class CJKBigramFilter extends TokenFilter {
lastEndOffset = 0;
loneState = null;
exhausted = false;
ngramState = false;
}
}

View File

@ -33,12 +33,13 @@ import org.apache.lucene.analysis.util.TokenFilterFactory;
* &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
* &lt;filter class="solr.CJKBigramFilterFactory"
* han="true" hiragana="true"
* katakana="true" hangul="true" /&gt;
* katakana="true" hangul="true" outputUnigrams="false" /&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*/
public class CJKBigramFilterFactory extends TokenFilterFactory {
int flags;
boolean outputUnigrams;
@Override
public void init(Map<String,String> args) {
@ -56,10 +57,11 @@ public class CJKBigramFilterFactory extends TokenFilterFactory {
if (getBoolean("hangul", true)) {
flags |= CJKBigramFilter.HANGUL;
}
outputUnigrams = getBoolean("outputUnigrams", false);
}
@Override
public TokenStream create(TokenStream input) {
return new CJKBigramFilter(input, flags);
return new CJKBigramFilter(input, flags, outputUnigrams);
}
}

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/15/12 1:57 AM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
package org.apache.lucene.analysis.standard;
@ -33,8 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
* on 7/15/12 1:57 AM from the specification file
* <tt>C:/cygwin/home/s/svn/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
* on 8/6/12 11:57 AM from the specification file
* <tt>/home/rmuir/workspace/lucene-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex</tt>
*/
class ClassicTokenizerImpl implements StandardTokenizerInterface {
@ -42,7 +42,7 @@ class ClassicTokenizerImpl implements StandardTokenizerInterface {
public static final int YYEOF = -1;
/** initial size of the lookahead buffer */
private static final int ZZ_BUFFERSIZE = 16384;
private static final int ZZ_BUFFERSIZE = 4096;
/** lexical states */
public static final int YYINITIAL = 0;

View File

@ -36,6 +36,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%function getNextToken
%pack
%char
%buffer 4096
%{

View File

@ -14,7 +14,7 @@
* limitations under the License.
*/
// Generated using ICU4J 49.1.0.0 on Thursday, July 26, 2012 10:22:01 PM UTC
// Generated using ICU4J 49.1.0.0 on Monday, August 6, 2012 3:57:23 PM UTC
// by org.apache.lucene.analysis.icu.GenerateJFlexSupplementaryMacros

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/26/12 6:22 PM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
package org.apache.lucene.analysis.standard;
@ -43,7 +43,7 @@ public final class StandardTokenizerImpl implements StandardTokenizerInterface {
public static final int YYEOF = -1;
/** initial size of the lookahead buffer */
private static final int ZZ_BUFFERSIZE = 16384;
private static final int ZZ_BUFFERSIZE = 4096;
/** lexical states */
public static final int YYINITIAL = 0;

View File

@ -44,8 +44,9 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%implements StandardTokenizerInterface
%function getNextToken
%char
%buffer 4096
%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
%include SUPPLEMENTARY.jflex-macro
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
Format = ([\p{WB:Format}] | {FormatSupp})
Numeric = ([\p{WB:Numeric}] | {NumericSupp})

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/26/12 6:22 PM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
package org.apache.lucene.analysis.standard;
@ -46,7 +46,7 @@ public final class UAX29URLEmailTokenizerImpl implements StandardTokenizerInterf
public static final int YYEOF = -1;
/** initial size of the lookahead buffer */
private static final int ZZ_BUFFERSIZE = 16384;
private static final int ZZ_BUFFERSIZE = 4096;
/** lexical states */
public static final int YYINITIAL = 0;

View File

@ -47,8 +47,9 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%implements StandardTokenizerInterface
%function getNextToken
%char
%buffer 4096
%include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
%include SUPPLEMENTARY.jflex-macro
ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
Format = ([\p{WB:Format}] | {FormatSupp})
Numeric = ([\p{WB:Numeric}] | {NumericSupp})
@ -88,7 +89,7 @@ HiraganaEx = {Hiragana} ({Format} | {Extend})*
// RFC-5321: Simple Mail Transfer Protocol
// RFC-5322: Internet Message Format
%include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
%include ASCIITLD.jflex-macro
DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}

View File

@ -1,4 +1,4 @@
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 7/15/12 1:57 AM */
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 8/6/12 11:57 AM */
package org.apache.lucene.analysis.wikipedia;
@ -25,8 +25,8 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* This class is a scanner generated by
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
* on 7/15/12 1:57 AM from the specification file
* <tt>C:/cygwin/home/s/svn/lucene/dev/trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
* on 8/6/12 11:57 AM from the specification file
* <tt>/home/rmuir/workspace/lucene-trunk/lucene/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
*/
class WikipediaTokenizerImpl {
@ -34,7 +34,7 @@ class WikipediaTokenizerImpl {
public static final int YYEOF = -1;
/** initial size of the lookahead buffer */
private static final int ZZ_BUFFERSIZE = 16384;
private static final int ZZ_BUFFERSIZE = 4096;
/** lexical states */
public static final int THREE_SINGLE_QUOTES_STATE = 10;

View File

@ -27,6 +27,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%function getNextToken
%pack
%char
%buffer 4096
%{

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.cjk;
*/
import java.io.Reader;
import java.util.Random;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
@ -33,6 +34,15 @@ public class TestCJKBigramFilter extends BaseTokenStreamTestCase {
}
};
Analyzer unibiAnalyzer = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
return new TokenStreamComponents(t,
new CJKBigramFilter(t, 0xff, true));
}
};
public void testHuge() throws Exception {
assertAnalyzesTo(analyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+ "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
@ -62,6 +72,96 @@ public class TestCJKBigramFilter extends BaseTokenStreamTestCase {
}
};
assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
new String[] { "", "", "", "学生", "", "試験", "", "", "", "" });
new String[] { "", "", "", "学生", "", "試験", "", "", "", "" },
new int[] { 0, 1, 2, 3, 5, 6, 8, 9, 10, 11 },
new int[] { 1, 2, 3, 5, 6, 8, 9, 10, 11, 12 },
new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<DOUBLE>", "<HIRAGANA>", "<DOUBLE>",
"<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
public void testAllScripts() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
return new TokenStreamComponents(t,
new CJKBigramFilter(t, 0xff, false));
}
};
assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" });
}
public void testUnigramsAndBigramsAllScripts() throws Exception {
assertAnalyzesTo(unibiAnalyzer, "多くの学生が試験に落ちた。",
new String[] {
"", "多く", "", "くの", "", "の学", "", "学生", "",
"生が", "", "が試", "", "試験", "", "験に", "",
"に落", "", "落ち", "", "ちた", ""
},
new int[] { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
6, 7, 7, 8, 8, 9, 9, 10, 10, 11 },
new int[] { 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
8, 8, 9, 9, 10, 10, 11, 11, 12, 12 },
new String[] { "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>",
"<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>",
"<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<DOUBLE>", "<SINGLE>" },
new int[] { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
new int[] { 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1,
2, 1, 2, 1, 2, 1, 2, 1, 2, 1 }
);
}
public void testUnigramsAndBigramsHanOnly() throws Exception {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer t = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true));
}
};
assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
new String[] { "", "", "", "", "学生", "", "", "", "試験", "", "", "", "", "" },
new int[] { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11 },
new int[] { 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12 },
new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>",
"<SINGLE>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>",
"<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>" },
new int[] { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 },
new int[] { 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1 });
}
public void testUnigramsAndBigramsHuge() throws Exception {
assertAnalyzesTo(unibiAnalyzer, "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+ "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた"
+ "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた" + "多くの学生が試験に落ちた",
new String[] {
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", "", "た多",
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", "", "た多",
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", "", "た多",
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", "", "た多",
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", "", "た多",
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", "", "た多",
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", "", "た多",
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", "", "た多",
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", "", "た多",
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", "", "た多",
"", "多く", "", "くの", "", "の学", "", "学生", "", "生が", "", "が試", "", "試験", "", "験に", "", "に落", "", "落ち", "", "ちた", ""
}
);
}
/** blast some random strings through the analyzer */
public void testRandomUnibiStrings() throws Exception {
checkRandomData(random(), unibiAnalyzer, 1000*RANDOM_MULTIPLIER);
}
/** blast some random strings through the analyzer */
public void testRandomUnibiHugeStrings() throws Exception {
Random random = random();
checkRandomData(random, unibiAnalyzer, 100*RANDOM_MULTIPLIER, 8192);
}
}

View File

@ -52,4 +52,16 @@ public class TestCJKBigramFilterFactory extends BaseTokenStreamTestCase {
assertTokenStreamContents(stream,
new String[] { "", "", "", "学生", "", "試験", "", "", "", "" });
}
public void testHanOnlyUnigrams() throws Exception {
Reader reader = new StringReader("多くの学生が試験に落ちた。");
CJKBigramFilterFactory factory = new CJKBigramFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("hiragana", "false");
args.put("outputUnigrams", "true");
factory.init(args);
TokenStream stream = factory.create(new StandardTokenizer(TEST_VERSION_CURRENT, reader));
assertTokenStreamContents(stream,
new String[] { "", "", "", "", "学生", "", "", "", "試験", "", "", "", "", "" });
}
}

View File

@ -100,8 +100,7 @@ public class TestAllAnalyzersHaveFactories extends LuceneTestCase {
private static final ResourceLoader loader = new StringMockResourceLoader("");
public void test() throws Exception {
List<Class<?>> analysisClasses = new ArrayList<Class<?>>();
TestRandomChains.getClassesForPackage("org.apache.lucene.analysis", analysisClasses);
List<Class<?>> analysisClasses = TestRandomChains.getClassesForPackage("org.apache.lucene.analysis");
for (final Class<?> c : analysisClasses) {
final int modifiers = c.getModifiers();

View File

@ -25,6 +25,7 @@ import java.io.StringReader;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Modifier;
import java.net.URI;
import java.net.URL;
import java.nio.CharBuffer;
import java.util.ArrayList;
@ -165,8 +166,7 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
@BeforeClass
public static void beforeClass() throws Exception {
List<Class<?>> analysisClasses = new ArrayList<Class<?>>();
getClassesForPackage("org.apache.lucene.analysis", analysisClasses);
List<Class<?>> analysisClasses = getClassesForPackage("org.apache.lucene.analysis");
tokenizers = new ArrayList<Constructor<? extends Tokenizer>>();
tokenfilters = new ArrayList<Constructor<? extends TokenFilter>>();
charfilters = new ArrayList<Constructor<? extends CharFilter>>();
@ -235,19 +235,30 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
private static <T> Constructor<T> castConstructor(Class<T> instanceClazz, Constructor<?> ctor) {
return (Constructor<T>) ctor;
}
static void getClassesForPackage(String pckgname, List<Class<?>> classes) throws Exception {
public static List<Class<?>> getClassesForPackage(String pckgname) throws Exception {
final List<Class<?>> classes = new ArrayList<Class<?>>();
collectClassesForPackage(pckgname, classes);
assertFalse("No classes found in package '"+pckgname+"'; maybe your test classes are packaged as JAR file?", classes.isEmpty());
return classes;
}
private static void collectClassesForPackage(String pckgname, List<Class<?>> classes) throws Exception {
final ClassLoader cld = TestRandomChains.class.getClassLoader();
final String path = pckgname.replace('.', '/');
final Enumeration<URL> resources = cld.getResources(path);
while (resources.hasMoreElements()) {
final File directory = new File(resources.nextElement().toURI());
final URI uri = resources.nextElement().toURI();
if (!"file".equalsIgnoreCase(uri.getScheme()))
continue;
final File directory = new File(uri);
if (directory.exists()) {
String[] files = directory.list();
for (String file : files) {
if (new File(directory, file).isDirectory()) {
// recurse
String subPackage = pckgname + "." + file;
getClassesForPackage(subPackage, classes);
collectClassesForPackage(subPackage, classes);
}
if (file.endsWith(".class")) {
String clazzName = file.substring(0, file.length() - 6);

View File

@ -43,7 +43,6 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
@ -156,7 +155,12 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
CountingSearchTestTask.numSearches = 0;
execBenchmark(algLines);
assertTrue(CountingSearchTestTask.numSearches > 0);
// NOTE: cannot assert this, because on a super-slow
// system, it could be after waiting 0.5 seconds that
// the search threads hadn't yet succeeded in starting
// up and then they start up and do no searching:
//assertTrue(CountingSearchTestTask.numSearches > 0);
}
public void testHighlighting() throws Exception {
@ -201,6 +205,7 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
// 1. alg definition (required in every "logic" test)
String algLines[] = {
"doc.stored=true",//doc storage is required in order to have text to highlight
"doc.term.vector=true",
"doc.term.vector.offsets=true",
"content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
"docs.file=" + getReuters20LinesFile(),
@ -487,13 +492,13 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
int totalTokenCount2 = 0;
FieldsEnum fields = MultiFields.getFields(reader).iterator();
String fieldName = null;
while((fieldName = fields.next()) != null) {
Fields fields = MultiFields.getFields(reader);
for (String fieldName : fields) {
if (fieldName.equals(DocMaker.ID_FIELD) || fieldName.equals(DocMaker.DATE_MSEC_FIELD) || fieldName.equals(DocMaker.TIME_SEC_FIELD)) {
continue;
}
Terms terms = fields.terms();
Terms terms = fields.terms(fieldName);
if (terms == null) {
continue;
}

View File

@ -139,29 +139,6 @@
<target name="compile-core" depends="compile-lucene-core"/>
<!--
Run after Junit tests.
-->
<target name="generate-clover-reports" depends="clover">
<fail unless="run.clover">Clover not enabled!</fail>
<mkdir dir="${clover.report.dir}"/>
<fileset dir="build" id="clover.test.result.files">
<include name="**/test/TEST-*.xml"/>
<!-- do not include BW tests -->
<exclude name="backwards/**"/>
</fileset>
<clover-report>
<current outfile="${clover.report.dir}" title="${final.name}" numThreads="0">
<format type="html" filter="assert"/>
<testresults refid="clover.test.result.files"/>
</current>
<current outfile="${clover.report.dir}/clover.xml" title="${final.name}">
<format type="xml" filter="assert"/>
<testresults refid="clover.test.result.files"/>
</current>
</clover-report>
</target>
<!-- Validation (license/notice/api checks). -->
<target name="validate" depends="check-licenses,rat-sources,check-forbidden-apis" description="Validate stuff." />
@ -176,6 +153,7 @@
<apiFileSet dir="${custom-tasks.dir}/forbiddenApis">
<include name="jdk.txt" />
<include name="jdk-deprecated.txt" />
<include name="executors.txt" />
</apiFileSet>
<fileset dir="${basedir}/build" includes="**/*.class" />
</forbidden-apis>

View File

@ -88,7 +88,7 @@
<property name="tests.timezone" value="random" />
<property name="tests.directory" value="random" />
<property name="tests.linedocsfile" value="europarl.lines.txt.gz" />
<property name="tests.loggingfile" value="/dev/null"/>
<property name="tests.loggingfile" value="${common.dir}/tools/junit4/logging.properties"/>
<property name="tests.nightly" value="false" />
<property name="tests.weekly" value="false" />
<property name="tests.slow" value="true" />
@ -700,15 +700,22 @@
<condition property="tests.method" value="${testmethod}*">
<isset property="testmethod" />
</condition>
<condition property="tests.showSuccess" value="true">
<or>
<isset property="tests.class" />
<isset property="tests.method" />
</or>
</condition>
<!-- default -->
<property name="tests.showSuccess" value="false"/>
<condition property="tests.showOutput" value="always">
<or>
<isset property="tests.class" />
<isset property="tests.method" />
</or>
</condition>
<property name="tests.showOutput" value="onerror"/>
<!-- Test macro using junit4. -->
<macrodef name="test-macro" description="Executes junit tests.">
@ -854,6 +861,7 @@
<syspropertyset>
<propertyref prefix="tests.maxfailures" />
<propertyref prefix="tests.failfast" />
<propertyref prefix="tests.badapples" />
</syspropertyset>
<!-- Pass randomized settings to the forked JVM. -->
@ -875,8 +883,7 @@
<junit4:report-text
showThrowable="true"
showStackTraces="true"
showOutputStream="true"
showErrorStream="true"
showOutput="${tests.showOutput}"
showStatusOk="${tests.showSuccess}"
showStatusError="${tests.showError}"
@ -896,8 +903,7 @@
file="@{junit.output.dir}/tests-report.txt"
showThrowable="true"
showStackTraces="true"
showOutputStream="true"
showErrorStream="true"
showOutput="always"
showStatusOk="true"
showStatusError="true"
@ -913,8 +919,7 @@
file="@{junit.output.dir}/tests-failures.txt"
showThrowable="true"
showStackTraces="true"
showOutputStream="true"
showErrorStream="true"
showOutput="onerror"
showStatusOk="false"
showStatusError="true"
@ -929,8 +934,13 @@
the slowest tests or for reuse in balancing). -->
<junit4:report-execution-times file="@{junit.output.dir}/tests-timehints.txt" historyLength="5" />
<junit4:report-ant-xml dir="@{junit.output.dir}" />
<junit4:report-json file="@{junit.output.dir}/tests-report-${ant.project.name}/index.html" />
<!-- ANT-compatible XMLs for jenkins records etc. -->
<junit4:report-ant-xml dir="@{junit.output.dir}" outputStreams="no" />
<!--
Enable if you wish to have a nice HTML5 report.
<junit4:report-json file="@{junit.output.dir}/tests-report-${ant.project.name}/index.html" outputStreams="no" />
-->
</listeners>
<!-- Input test classes. -->

View File

@ -480,7 +480,7 @@ public class MyAnalyzer extends Analyzer {
System.out.println(termAtt.toString());
}
stream.end()
stream.end();
} finally {
stream.close();
}
@ -509,7 +509,7 @@ easily by adding a LengthFilter to the chain. Only the
{@literal @Override}
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
final Tokenizer source = new WhitespaceTokenizer(matchVersion, reader);
TokenStream result = new LengthFilter(source, 3, Integer.MAX_VALUE);
TokenStream result = new LengthFilter(true, source, 3, Integer.MAX_VALUE);
return new TokenStreamComponents(source, result);
}
</pre>

View File

@ -27,7 +27,6 @@ import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
@ -40,6 +39,7 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.DoubleBarrelLRUCache;
import org.apache.lucene.util.UnmodifiableIterator;
/** Handles a terms dict, but decouples all details of
* doc/freqs/positions reading to an instance of {@link
@ -184,8 +184,8 @@ public class BlockTermsReader extends FieldsProducer {
}
@Override
public FieldsEnum iterator() {
return new TermFieldsEnum();
public Iterator<String> iterator() {
return new UnmodifiableIterator<String>(fields.keySet().iterator());
}
@Override
@ -199,32 +199,6 @@ public class BlockTermsReader extends FieldsProducer {
return fields.size();
}
// Iterates through all fields
private class TermFieldsEnum extends FieldsEnum {
final Iterator<FieldReader> it;
FieldReader current;
TermFieldsEnum() {
it = fields.values().iterator();
}
@Override
public String next() {
if (it.hasNext()) {
current = it.next();
return current.fieldInfo.name;
} else {
current = null;
return null;
}
}
@Override
public Terms terms() throws IOException {
return current;
}
}
private class FieldReader extends Terms {
final long numTerms;
final FieldInfo fieldInfo;
@ -253,6 +227,21 @@ public class BlockTermsReader extends FieldsProducer {
return new SegmentTermsEnum();
}
@Override
public boolean hasOffsets() {
return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
}
@Override
public boolean hasPositions() {
return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
}
@Override
public boolean hasPayloads() {
return fieldInfo.hasPayloads();
}
@Override
public long size() {
return numTerms;
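The hasOffsets/hasPositions overrides added above lean on IndexOptions being an ordered enum, from least to most inclusive, so a single compareTo answers "was at least this much indexed?". A minimal sketch of that idiom, assuming only the Lucene 4.0 IndexOptions constants (the IndexOptionsCheck class is illustrative, not part of the patch):

import org.apache.lucene.index.FieldInfo.IndexOptions;

class IndexOptionsCheck {
  // IndexOptions is ordered: DOCS_ONLY < DOCS_AND_FREQS
  // < DOCS_AND_FREQS_AND_POSITIONS < DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS,
  // so compareTo() >= 0 means "at least this much was indexed".
  static boolean atLeast(IndexOptions actual, IndexOptions required) {
    return actual.compareTo(required) >= 0;
  }

  public static void main(String[] args) {
    IndexOptions opts = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
    System.out.println(atLeast(opts, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS));             // true: positions present
    System.out.println(atLeast(opts, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)); // false: no offsets
  }
}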
View File
@ -31,7 +31,6 @@ import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
@ -46,6 +45,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.UnmodifiableIterator;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RunAutomaton;
import org.apache.lucene.util.automaton.Transition;
@ -199,8 +199,8 @@ public class BlockTreeTermsReader extends FieldsProducer {
}
@Override
public FieldsEnum iterator() {
return new TermFieldsEnum();
public Iterator<String> iterator() {
return new UnmodifiableIterator<String>(fields.keySet().iterator());
}
@Override
@ -214,32 +214,6 @@ public class BlockTreeTermsReader extends FieldsProducer {
return fields.size();
}
// Iterates through all fields
private class TermFieldsEnum extends FieldsEnum {
final Iterator<FieldReader> it;
FieldReader current;
TermFieldsEnum() {
it = fields.values().iterator();
}
@Override
public String next() {
if (it.hasNext()) {
current = it.next();
return current.fieldInfo.name;
} else {
current = null;
return null;
}
}
@Override
public Terms terms() throws IOException {
return current;
}
}
// for debugging
String brToString(BytesRef b) {
if (b == null) {
@ -456,6 +430,21 @@ public class BlockTreeTermsReader extends FieldsProducer {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public boolean hasOffsets() {
return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
}
@Override
public boolean hasPositions() {
return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
}
@Override
public boolean hasPayloads() {
return fieldInfo.hasPayloads();
}
@Override
public TermsEnum iterator(TermsEnum reuse) throws IOException {
return new SegmentTermsEnum();
View File
@ -22,7 +22,6 @@ import java.io.IOException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.SegmentWriteState; // javadocs
import org.apache.lucene.index.Terms;
@ -53,13 +52,10 @@ public abstract class FieldsConsumer implements Closeable {
public abstract void close() throws IOException;
public void merge(MergeState mergeState, Fields fields) throws IOException {
FieldsEnum fieldsEnum = fields.iterator();
assert fieldsEnum != null;
String field;
while((field = fieldsEnum.next()) != null) {
for (String field : fields) {
mergeState.fieldInfo = mergeState.fieldInfos.fieldInfo(field);
assert mergeState.fieldInfo != null : "FieldInfo for field is null: "+ field;
Terms terms = fieldsEnum.terms();
Terms terms = fields.terms(field);
if (terms != null) {
final TermsConsumer termsConsumer = addField(mergeState.fieldInfo);
termsConsumer.merge(mergeState, terms.iterator(null));
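This hunk swaps the removed FieldsEnum cursor for plain Iterable&lt;String&gt; iteration: callers now walk the field names and fetch each Terms by name. A minimal consumer sketch under that API (FieldsWalker and countTerms are illustrative, not part of the patch):

import java.io.IOException;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;

class FieldsWalker {
  // Visits every field and its terms; replaces the old FieldsEnum.next()/terms() loop.
  static long countTerms(Fields fields) throws IOException {
    long total = 0;
    for (String field : fields) {          // Fields now implements Iterable<String>
      Terms terms = fields.terms(field);   // may be null if the field has no terms
      if (terms == null) {
        continue;
      }
      TermsEnum termsEnum = terms.iterator(null);
      while (termsEnum.next() != null) {
        total++;
      }
    }
    return total;
  }
}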
View File
@ -124,15 +124,17 @@ public final class MappingMultiDocsAndPositionsEnum extends DocsAndPositionsEnum
@Override
public BytesRef getPayload() throws IOException {
BytesRef payload = current.getPayload();
if (mergeState.currentPayloadProcessor[upto] != null) {
if (mergeState.currentPayloadProcessor[upto] != null && payload != null) {
// to not violate the D&P api, we must give the processor a private copy
// TODO: reuse a BytesRef if there is a PPP
payload = BytesRef.deepCopyOf(payload);
mergeState.currentPayloadProcessor[upto].processPayload(payload);
if (payload.length == 0) {
// don't let PayloadProcessors corrupt the index
return null;
}
}
return payload;
}
@Override
public boolean hasPayload() {
return current.hasPayload();
}
}
View File
@ -112,12 +112,7 @@ public abstract class PostingsConsumer {
totTF += freq;
for(int i=0;i<freq;i++) {
final int position = postingsEnum.nextPosition();
final BytesRef payload;
if (postingsEnum.hasPayload()) {
payload = postingsEnum.getPayload();
} else {
payload = null;
}
final BytesRef payload = postingsEnum.getPayload();
this.addPosition(position, payload, -1, -1);
}
this.finishDoc();
@ -137,12 +132,7 @@ public abstract class PostingsConsumer {
totTF += freq;
for(int i=0;i<freq;i++) {
final int position = postingsEnum.nextPosition();
final BytesRef payload;
if (postingsEnum.hasPayload()) {
payload = postingsEnum.getPayload();
} else {
payload = null;
}
final BytesRef payload = postingsEnum.getPayload();
this.addPosition(position, payload, postingsEnum.startOffset(), postingsEnum.endOffset());
}
this.finishDoc();
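Both hunks above drop the hasPayload() pre-check: getPayload() now returns null when the current position carries no payload. A small sketch of the resulting consumer pattern (PayloadScan is a hypothetical helper, not part of the patch):

import java.io.IOException;

import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.util.BytesRef;

class PayloadScan {
  // Counts the positions of the current document that carry a payload,
  // using the new null-means-absent contract.
  static int countPayloads(DocsAndPositionsEnum postings) throws IOException {
    int withPayload = 0;
    final int freq = postings.freq();
    for (int i = 0; i < freq; i++) {
      postings.nextPosition();
      final BytesRef payload = postings.getPayload(); // null: no payload at this position
      if (payload != null) {
        withPayload++;     // copy via BytesRef.deepCopyOf if it must outlive nextPosition()
      }
    }
    return withPayload;
  }
}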
View File
@ -26,8 +26,9 @@ import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.PayloadProcessorProvider.PayloadProcessor;
import org.apache.lucene.index.PayloadProcessorProvider.ReaderPayloadProcessor;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
@ -41,14 +42,14 @@ import org.apache.lucene.util.BytesRef;
* <ol>
* <li>For every document, {@link #startDocument(int)} is called,
* informing the Codec how many fields will be written.
* <li>{@link #startField(FieldInfo, int, boolean, boolean)} is called for
* <li>{@link #startField(FieldInfo, int, boolean, boolean, boolean)} is called for
* each field in the document, informing the codec how many terms
* will be written for that field, and whether or not positions
* or offsets are enabled.
* will be written for that field, and whether or not positions,
* offsets, or payloads are enabled.
* <li>Within each field, {@link #startTerm(BytesRef, int)} is called
* for each term.
* <li>If offsets and/or positions are enabled, then
* {@link #addPosition(int, int, int)} will be called for each term
* {@link #addPosition(int, int, int, BytesRef)} will be called for each term
* occurrence.
* <li>After all documents have been written, {@link #finish(FieldInfos, int)}
* is called for verification/sanity-checks (see the sketch after this list).
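A minimal sketch of that call sequence for one document with a single vector field, using the signatures as updated in this commit; the writer, the FieldInfo, and the literal terms/offsets/payload are assumed for illustration only:

import java.io.IOException;

import org.apache.lucene.codecs.TermVectorsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.util.BytesRef;

class VectorWriteSketch {
  // One doc, one field, two single-occurrence terms (terms must arrive in sorted
  // order). finish(FieldInfos, numDocs) would follow once all docs are written.
  static void writeOneDoc(TermVectorsWriter writer, FieldInfo fieldInfo) throws IOException {
    writer.startDocument(1);                           // this doc has one vector field
    writer.startField(fieldInfo, 2, true, true, true); // 2 terms; positions, offsets, payloads
    writer.startTerm(new BytesRef("bone"), 1);         // freq=1 -> one addPosition call
    writer.addPosition(0, 0, 4, null);                 // no payload at this position
    writer.startTerm(new BytesRef("boy"), 1);
    writer.addPosition(1, 5, 8, new BytesRef("pay"));  // payload attached to this occurrence
  }
}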
@ -60,7 +61,7 @@ import org.apache.lucene.util.BytesRef;
public abstract class TermVectorsWriter implements Closeable {
/** Called before writing the term vectors of the document.
* {@link #startField(FieldInfo, int, boolean, boolean)} will
* {@link #startField(FieldInfo, int, boolean, boolean, boolean)} will
* be called <code>numVectorFields</code> times. Note that if term
* vectors are enabled, this is called even if the document
* has no vector fields; in this case <code>numVectorFields</code>
@ -69,17 +70,17 @@ public abstract class TermVectorsWriter implements Closeable {
/** Called before writing the terms of the field.
* {@link #startTerm(BytesRef, int)} will be called <code>numTerms</code> times. */
public abstract void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets) throws IOException;
public abstract void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets, boolean payloads) throws IOException;
/** Adds a term and its term frequency <code>freq</code>.
* If this field has positions and/or offsets enabled, then
* {@link #addPosition(int, int, int)} will be called
* {@link #addPosition(int, int, int, BytesRef)} will be called
* <code>freq</code> times respectively.
*/
public abstract void startTerm(BytesRef term, int freq) throws IOException;
/** Adds a term position and offsets */
public abstract void addPosition(int position, int startOffset, int endOffset) throws IOException;
public abstract void addPosition(int position, int startOffset, int endOffset, BytesRef payload) throws IOException;
/** Aborts writing entirely, implementation should remove
* any partially-written files, etc. */
@ -99,7 +100,7 @@ public abstract class TermVectorsWriter implements Closeable {
* This is an expert API that allows the codec to consume
* positions and offsets directly from the indexer.
* <p>
* The default implementation calls {@link #addPosition(int, int, int)},
* The default implementation calls {@link #addPosition(int, int, int, BytesRef)},
* but subclasses can override this if they want to efficiently write
* all the positions, then all the offsets, for example.
* <p>
@ -111,15 +112,36 @@ public abstract class TermVectorsWriter implements Closeable {
public void addProx(int numProx, DataInput positions, DataInput offsets) throws IOException {
int position = 0;
int lastOffset = 0;
BytesRef payload = null;
for (int i = 0; i < numProx; i++) {
final int startOffset;
final int endOffset;
final BytesRef thisPayload;
if (positions == null) {
position = -1;
thisPayload = null;
} else {
position += positions.readVInt();
int code = positions.readVInt();
position += code >>> 1;
if ((code & 1) != 0) {
// This position has a payload
final int payloadLength = positions.readVInt();
if (payload == null) {
payload = new BytesRef();
payload.bytes = new byte[payloadLength];
} else if (payload.bytes.length < payloadLength) {
payload.grow(payloadLength);
}
positions.readBytes(payload.bytes, 0, payloadLength);
payload.length = payloadLength;
thisPayload = payload;
} else {
thisPayload = null;
}
}
if (offsets == null) {
@ -129,24 +151,31 @@ public abstract class TermVectorsWriter implements Closeable {
endOffset = startOffset + offsets.readVInt();
lastOffset = endOffset;
}
addPosition(position, startOffset, endOffset);
addPosition(position, startOffset, endOffset, thisPayload);
}
}
/** Merges in the term vectors from the readers in
* <code>mergeState</code>. The default implementation skips
* over deleted documents, and uses {@link #startDocument(int)},
* {@link #startField(FieldInfo, int, boolean, boolean)},
* {@link #startTerm(BytesRef, int)}, {@link #addPosition(int, int, int)},
* {@link #startField(FieldInfo, int, boolean, boolean, boolean)},
* {@link #startTerm(BytesRef, int)}, {@link #addPosition(int, int, int, BytesRef)},
* and {@link #finish(FieldInfos, int)},
* returning the number of documents that were written.
* Implementations can override this method for more sophisticated
* merging (bulk-byte copying, etc). */
public int merge(MergeState mergeState) throws IOException {
int docCount = 0;
for (AtomicReader reader : mergeState.readers) {
for (int i = 0; i < mergeState.readers.size(); i++) {
final AtomicReader reader = mergeState.readers.get(i);
final int maxDoc = reader.maxDoc();
final Bits liveDocs = reader.getLiveDocs();
// set PayloadProcessor
if (mergeState.payloadProcessorProvider != null) {
mergeState.currentReaderPayloadProcessor = mergeState.readerPayloadProcessor[i];
} else {
mergeState.currentReaderPayloadProcessor = null;
}
for (int docID = 0; docID < maxDoc; docID++) {
if (liveDocs != null && !liveDocs.get(docID)) {
// skip deleted docs
@ -155,7 +184,7 @@ public abstract class TermVectorsWriter implements Closeable {
// NOTE: it's very important to first assign to vectors then pass it to
// termVectorsWriter.addAllDocVectors; see LUCENE-1282
Fields vectors = reader.getTermVectors(docID);
addAllDocVectors(vectors, mergeState.fieldInfos);
addAllDocVectors(vectors, mergeState);
docCount++;
mergeState.checkAbort.work(300);
}
@ -169,7 +198,7 @@ public abstract class TermVectorsWriter implements Closeable {
* implementation requires that the vectors implement
* both Fields.size and
* Terms.size. */
protected final void addAllDocVectors(Fields vectors, FieldInfos fieldInfos) throws IOException {
protected final void addAllDocVectors(Fields vectors, MergeState mergeState) throws IOException {
if (vectors == null) {
startDocument(0);
return;
@ -181,54 +210,55 @@ public abstract class TermVectorsWriter implements Closeable {
}
startDocument(numFields);
final FieldsEnum fieldsEnum = vectors.iterator();
String fieldName;
String lastFieldName = null;
TermsEnum termsEnum = null;
DocsAndPositionsEnum docsAndPositionsEnum = null;
final ReaderPayloadProcessor readerPayloadProcessor = mergeState.currentReaderPayloadProcessor;
PayloadProcessor payloadProcessor = null;
while((fieldName = fieldsEnum.next()) != null) {
final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldName);
for(String fieldName : vectors) {
final FieldInfo fieldInfo = mergeState.fieldInfos.fieldInfo(fieldName);
assert lastFieldName == null || fieldName.compareTo(lastFieldName) > 0: "lastFieldName=" + lastFieldName + " fieldName=" + fieldName;
lastFieldName = fieldName;
final Terms terms = fieldsEnum.terms();
final Terms terms = vectors.terms(fieldName);
if (terms == null) {
// the Fields API shouldn't lie...
continue;
}
final boolean hasPositions = terms.hasPositions();
final boolean hasOffsets = terms.hasOffsets();
final boolean hasPayloads = terms.hasPayloads();
assert !hasPayloads || hasPositions;
final int numTerms = (int) terms.size();
if (numTerms == -1) {
throw new IllegalStateException("terms.size() must be implemented (it returned -1)");
}
final TermsEnum termsEnum = terms.iterator(null);
DocsAndPositionsEnum docsAndPositionsEnum = null;
boolean startedField = false;
// NOTE: this is tricky, because TermVectors allow
// indexing offsets but NOT positions. So we must
// lazily init the field by checking whether first
// position we see is -1 or not.
startField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads);
termsEnum = terms.iterator(termsEnum);
int termCount = 0;
while(termsEnum.next() != null) {
termCount++;
final int freq = (int) termsEnum.totalTermFreq();
if (startedField) {
startTerm(termsEnum.term(), freq);
startTerm(termsEnum.term(), freq);
if (hasPayloads && readerPayloadProcessor != null) {
payloadProcessor = readerPayloadProcessor.getProcessor(fieldName, termsEnum.term());
}
// TODO: we need a "query" API where we can ask (via
// flex API) what this term was indexed with...
// Both positions & offsets:
docsAndPositionsEnum = termsEnum.docsAndPositions(null, null);
boolean hasOffsets = false;
boolean hasPositions = false;
if (docsAndPositionsEnum != null) {
if (hasPositions || hasOffsets) {
docsAndPositionsEnum = termsEnum.docsAndPositions(null, docsAndPositionsEnum);
assert docsAndPositionsEnum != null;
final int docID = docsAndPositionsEnum.nextDoc();
assert docID != DocIdSetIterator.NO_MORE_DOCS;
assert docsAndPositionsEnum.freq() == freq;
@ -237,27 +267,21 @@ public abstract class TermVectorsWriter implements Closeable {
final int pos = docsAndPositionsEnum.nextPosition();
final int startOffset = docsAndPositionsEnum.startOffset();
final int endOffset = docsAndPositionsEnum.endOffset();
if (!startedField) {
assert numTerms > 0;
hasPositions = pos != -1;
hasOffsets = startOffset != -1;
startField(fieldInfo, numTerms, hasPositions, hasOffsets);
startTerm(termsEnum.term(), freq);
startedField = true;
}
if (hasOffsets) {
assert startOffset != -1;
assert endOffset != -1;
BytesRef payload = docsAndPositionsEnum.getPayload();
if (payloadProcessor != null && payload != null) {
// to not violate the D&P api, we must give the processor a private copy
payload = BytesRef.deepCopyOf(payload);
payloadProcessor.processPayload(payload);
if (payload.length == 0) {
// don't let PayloadProcessors corrupt the index
payload = null;
}
}
assert !hasPositions || pos >= 0;
addPosition(pos, startOffset, endOffset);
}
} else {
if (!startedField) {
assert numTerms > 0;
startField(fieldInfo, numTerms, hasPositions, hasOffsets);
startTerm(termsEnum.term(), freq);
startedField = true;
addPosition(pos, startOffset, endOffset, payload);
}
}
}
View File
@ -954,11 +954,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
return -1;
}
@Override
public boolean hasPayload() {
return false;
}
@Override
public BytesRef getPayload() {
return null;
@ -1226,10 +1221,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
if (DEBUG) {
System.out.println(" FPR.nextDoc");
}
if (indexHasPayloads) {
payloadByteUpto += payloadLength;
payloadLength = 0;
}
while (true) {
if (DEBUG) {
System.out.println(" docUpto=" + docUpto + " (of df=" + docFreq + ") docBufferUpto=" + docBufferUpto);
@ -1255,7 +1246,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
System.out.println(" return doc=" + doc + " freq=" + freq + " posPendingCount=" + posPendingCount);
}
position = 0;
payloadLength = 0;
lastStartOffset = 0;
return doc;
}
@ -1355,12 +1345,7 @@ public final class BlockPostingsReader extends PostingsReaderBase {
if (DEBUG) {
System.out.println(" return doc=" + accum);
}
if (indexHasPayloads) {
payloadByteUpto += payloadLength;
payloadLength = 0;
}
position = 0;
payloadLength = 0;
lastStartOffset = 0;
return doc = accum;
} else {
@ -1433,7 +1418,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
}
position = 0;
payloadLength = 0;
lastStartOffset = 0;
}
@ -1461,16 +1445,6 @@ public final class BlockPostingsReader extends PostingsReaderBase {
posBufferUpto = BLOCK_SIZE;
}
if (indexHasPayloads) {
if (DEBUG) {
if (payloadLength != 0) {
System.out.println(" skip unread payload length=" + payloadLength);
}
}
payloadByteUpto += payloadLength;
payloadLength = 0;
}
if (posPendingCount > freq) {
skipPositions();
posPendingCount = freq;
@ -1484,6 +1458,10 @@ public final class BlockPostingsReader extends PostingsReaderBase {
if (indexHasPayloads) {
payloadLength = payloadLengthBuffer[posBufferUpto];
payload.bytes = payloadBytes;
payload.offset = payloadByteUpto;
payload.length = payloadLength;
payloadByteUpto += payloadLength;
}
if (indexHasOffsets) {
@ -1510,22 +1488,16 @@ public final class BlockPostingsReader extends PostingsReaderBase {
return endOffset;
}
@Override
public boolean hasPayload() {
return payloadLength != 0;
}
@Override
public BytesRef getPayload() {
if (DEBUG) {
System.out.println(" FPR.getPayload payloadLength=" + payloadLength + " payloadByteUpto=" + payloadByteUpto);
}
payload.bytes = payloadBytes;
payload.offset = payloadByteUpto;
payload.length = payloadLength;
payloadByteUpto += payloadLength;
payloadLength = 0;
return payload;
if (payloadLength == 0) {
return null;
} else {
return payload;
}
}
}
}
View File
@ -21,6 +21,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
@ -35,7 +36,6 @@ import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
@ -44,7 +44,6 @@ import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FuzzySet;
@ -187,9 +186,8 @@ public class BloomFilteringPostingsFormat extends PostingsFormat {
}
public FieldsEnum iterator() throws IOException {
return new BloomFilteredFieldsEnum(delegateFieldsProducer.iterator(),
bloomsByFieldName);
public Iterator<String> iterator() {
return delegateFieldsProducer.iterator();
}
public void close() throws IOException {
@ -217,44 +215,6 @@ public class BloomFilteringPostingsFormat extends PostingsFormat {
return delegateFieldsProducer.getUniqueTermCount();
}
// Not all fields in a segment may be subject to a bloom filter. This class
// wraps Terms objects appropriately if a filtering request is present
class BloomFilteredFieldsEnum extends FieldsEnum {
private FieldsEnum delegateFieldsEnum;
private HashMap<String,FuzzySet> bloomsByFieldName;
private String currentFieldName;
public BloomFilteredFieldsEnum(FieldsEnum iterator,
HashMap<String,FuzzySet> bloomsByFieldName) {
this.delegateFieldsEnum = iterator;
this.bloomsByFieldName = bloomsByFieldName;
}
public AttributeSource attributes() {
return delegateFieldsEnum.attributes();
}
public String next() throws IOException {
currentFieldName = delegateFieldsEnum.next();
return currentFieldName;
}
public Terms terms() throws IOException {
FuzzySet filter = bloomsByFieldName.get(currentFieldName);
if (filter == null) {
return delegateFieldsEnum.terms();
} else {
Terms result = delegateFieldsEnum.terms();
if (result == null) {
return null;
}
// wrap the terms object with a bloom filter
return new BloomFilteredTerms(result, filter);
}
}
}
class BloomFilteredTerms extends Terms {
private Terms delegateTerms;
private FuzzySet filter;
@ -314,6 +274,21 @@ public class BloomFilteringPostingsFormat extends PostingsFormat {
public int getDocCount() throws IOException {
return delegateTerms.getDocCount();
}
@Override
public boolean hasOffsets() {
return delegateTerms.hasOffsets();
}
@Override
public boolean hasPositions() {
return delegateTerms.hasPositions();
}
@Override
public boolean hasPayloads() {
return delegateTerms.hasPayloads();
}
}
class BloomFilteredTermsEnum extends TermsEnum {
View File
@ -873,12 +873,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
* payload was indexed. */
@Override
public BytesRef getPayload() throws IOException {
throw new IOException("No payloads exist for this field!");
}
@Override
public boolean hasPayload() {
return false;
return null;
}
}
@ -1152,28 +1147,26 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
@Override
public BytesRef getPayload() throws IOException {
if (storePayloads) {
if (payloadLength <= 0) {
return null;
}
assert lazyProxPointer == -1;
assert posPendingCount < freq;
if (!payloadPending) {
throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
}
if (payloadLength > payload.bytes.length) {
payload.grow(payloadLength);
}
if (payloadPending) {
if (payloadLength > payload.bytes.length) {
payload.grow(payloadLength);
}
proxIn.readBytes(payload.bytes, 0, payloadLength);
payload.length = payloadLength;
payloadPending = false;
proxIn.readBytes(payload.bytes, 0, payloadLength);
payload.length = payloadLength;
payloadPending = false;
}
return payload;
} else {
throw new IOException("No payloads exist for this field!");
return null;
}
}
@Override
public boolean hasPayload() {
return payloadPending && payloadLength > 0;
}
}
}
View File
@ -67,33 +67,46 @@ import org.apache.lucene.store.IOContext;
* <li><a name="tvf" id="tvf"></a>
* <p>The Field or .tvf file.</p>
* <p>This file contains, for each field that has a term vector stored, a list of
* the terms, their frequencies and, optionally, position and offset
* the terms, their frequencies and, optionally, position, offset, and payload
* information.</p>
* <p>Field (.tvf) --&gt; Header,&lt;NumTerms, Position/Offset, TermFreqs&gt;
* <p>Field (.tvf) --&gt; Header,&lt;NumTerms, Flags, TermFreqs&gt;
* <sup>NumFields</sup></p>
* <ul>
* <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
* <li>NumTerms --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>Position/Offset --&gt; {@link DataOutput#writeByte Byte}</li>
* <li>TermFreqs --&gt; &lt;TermText, TermFreq, Positions?, Offsets?&gt;
* <li>Flags --&gt; {@link DataOutput#writeByte Byte}</li>
* <li>TermFreqs --&gt; &lt;TermText, TermFreq, Positions?, PayloadData?, Offsets?&gt;
* <sup>NumTerms</sup></li>
* <li>TermText --&gt; &lt;PrefixLength, Suffix&gt;</li>
* <li>PrefixLength --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>Suffix --&gt; {@link DataOutput#writeString String}</li>
* <li>TermFreq --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>Positions --&gt; &lt;{@link DataOutput#writeVInt VInt}&gt;<sup>TermFreq</sup></li>
* <li>Positions --&gt; &lt;PositionDelta PayloadLength?&gt;<sup>TermFreq</sup></li>
* <li>PositionDelta --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>PayloadLength --&gt; {@link DataOutput#writeVInt VInt}</li>
* <li>PayloadData --&gt; {@link DataOutput#writeByte Byte}<sup>NumPayloadBytes</sup></li>
* <li>Offsets --&gt; &lt;{@link DataOutput#writeVInt VInt}, {@link DataOutput#writeVInt VInt}&gt;<sup>TermFreq</sup></li>
* </ul>
* <p>Notes:</p>
* <ul>
* <li>Position/Offset byte stores whether this term vector has position or offset
* <li>Flags byte stores whether this term vector has position, offset, or payload
* information stored.</li>
* <li>Term byte prefixes are shared. The PrefixLength is the number of initial
* bytes from the previous term which must be pre-pended to a term's suffix
* in order to form the term's bytes. Thus, if the previous term's text was "bone"
* and the term is "boy", the PrefixLength is two and the suffix is "y".</li>
* <li>Positions are stored as delta encoded VInts. This means we only store the
* difference of the current position from the last position</li>
* <li>PositionDelta is, if payloads are disabled for the term's field, the
* difference between the position of the current occurrence in the document and
* the previous occurrence (or zero, if this is the first occurrence in this
* document). If payloads are enabled for the term's field, then PositionDelta/2
* is the difference between the current and the previous position. If payloads
* are enabled and PositionDelta is odd, then PayloadLength is stored, indicating
* the length of the payload at the current term position.</li>
* <li>PayloadData is metadata associated with a term position. If
* PayloadLength is stored at the current position, then it indicates the length
* of this payload. If PayloadLength is not stored, then this payload has the same
* length as the payload at the previous position. PayloadData encodes the
* concatenated bytes for all of a term's occurrences (see the sketch after this list).</li>
* <li>Offsets are stored as delta encoded VInts. The first VInt is the
* startOffset, the second is the endOffset.</li>
* </ul>
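A sketch of the PositionDelta/PayloadLength scheme described above, mirroring the writer side: each delta is left-shifted, the low bit flags a length change, and the payload bytes for the whole term are concatenated afterwards. Only DataOutput's VInt/byte APIs are assumed; TvfPositionSketch and its arguments are illustrative:

import java.io.IOException;

import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;

class TvfPositionSketch {
  // Encodes one term's positions when payloads are enabled: PositionDelta is
  // (delta << 1) | lengthChanged; a PayloadLength VInt follows only when the low
  // bit is set; PayloadData is the concatenation of all payload bytes for the term.
  static void writePositions(DataOutput tvf, int[] positions, BytesRef[] payloads) throws IOException {
    int lastPosition = 0;
    int lastPayloadLength = -1;             // forces the first occurrence to write its length
    for (int i = 0; i < positions.length; i++) {
      final int delta = positions[i] - lastPosition;
      lastPosition = positions[i];
      final int payloadLength = payloads[i] == null ? 0 : payloads[i].length;
      if (payloadLength != lastPayloadLength) {
        tvf.writeVInt((delta << 1) | 1);    // odd code: PayloadLength follows
        tvf.writeVInt(payloadLength);
        lastPayloadLength = payloadLength;
      } else {
        tvf.writeVInt(delta << 1);          // even code: previous length still applies
      }
    }
    for (BytesRef p : payloads) {           // PayloadData: concatenated payload bytes
      if (p != null) {
        tvf.writeBytes(p.bytes, p.offset, p.length);
      }
    }
  }
}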
View File
@ -21,7 +21,9 @@ import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermVectorsReader;
@ -30,7 +32,6 @@ import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Terms;
@ -55,6 +56,8 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;
static final byte STORE_PAYLOAD_WITH_TERMVECTOR = 0x4;
/** Extension of vectors fields file */
static final String VECTORS_FIELDS_EXTENSION = "tvf";
@ -68,8 +71,10 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
static final String CODEC_NAME_DOCS = "Lucene40TermVectorsDocs";
static final String CODEC_NAME_INDEX = "Lucene40TermVectorsIndex";
static final int VERSION_START = 0;
static final int VERSION_CURRENT = VERSION_START;
static final int VERSION_NO_PAYLOADS = 0;
static final int VERSION_PAYLOADS = 1;
static final int VERSION_START = VERSION_NO_PAYLOADS;
static final int VERSION_CURRENT = VERSION_PAYLOADS;
static final long HEADER_LENGTH_FIELDS = CodecUtil.headerLength(CODEC_NAME_FIELDS);
static final long HEADER_LENGTH_DOCS = CodecUtil.headerLength(CODEC_NAME_DOCS);
@ -245,9 +250,8 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
}
@Override
public FieldsEnum iterator() throws IOException {
return new FieldsEnum() {
public Iterator<String> iterator() {
return new Iterator<String>() {
private int fieldUpto;
@Override
@ -255,13 +259,18 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
if (fieldNumbers != null && fieldUpto < fieldNumbers.length) {
return fieldInfos.fieldInfo(fieldNumbers[fieldUpto++]).name;
} else {
return null;
throw new NoSuchElementException();
}
}
@Override
public Terms terms() throws IOException {
return TVFields.this.terms(fieldInfos.fieldInfo(fieldNumbers[fieldUpto-1]).name);
public boolean hasNext() {
return fieldNumbers != null && fieldUpto < fieldNumbers.length;
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
@ -296,10 +305,17 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
private class TVTerms extends Terms {
private final int numTerms;
private final long tvfFPStart;
private final boolean storePositions;
private final boolean storeOffsets;
private final boolean storePayloads;
public TVTerms(long tvfFP) throws IOException {
tvf.seek(tvfFP);
numTerms = tvf.readVInt();
final byte bits = tvf.readByte();
storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
storePayloads = (bits & STORE_PAYLOAD_WITH_TERMVECTOR) != 0;
tvfFPStart = tvf.getFilePointer();
}
@ -314,7 +330,7 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
} else {
termsEnum = new TVTermsEnum();
}
termsEnum.reset(numTerms, tvfFPStart);
termsEnum.reset(numTerms, tvfFPStart, storePositions, storeOffsets, storePayloads);
return termsEnum;
}
@ -345,6 +361,21 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
// this...? I guess codec could buffer and re-sort...
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public boolean hasOffsets() {
return storeOffsets;
}
@Override
public boolean hasPositions() {
return storePositions;
}
@Override
public boolean hasPayloads() {
return storePayloads;
}
}
private class TVTermsEnum extends TermsEnum {
@ -357,11 +388,17 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
private BytesRef term = new BytesRef();
private boolean storePositions;
private boolean storeOffsets;
private boolean storePayloads;
private long tvfFP;
private int[] positions;
private int[] startOffsets;
private int[] endOffsets;
// one shared byte[] for any term's payloads
private int[] payloadOffsets;
private int lastPayloadLength;
private byte[] payloadData;
// NOTE: tvf is pre-positioned by caller
public TVTermsEnum() {
@ -373,17 +410,20 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
return tvf == origTVF;
}
public void reset(int numTerms, long tvfFPStart) throws IOException {
public void reset(int numTerms, long tvfFPStart, boolean storePositions, boolean storeOffsets, boolean storePayloads) throws IOException {
this.numTerms = numTerms;
this.storePositions = storePositions;
this.storeOffsets = storeOffsets;
this.storePayloads = storePayloads;
nextTerm = 0;
tvf.seek(tvfFPStart);
final byte bits = tvf.readByte();
storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
tvfFP = 1+tvfFPStart;
positions = null;
startOffsets = null;
endOffsets = null;
payloadOffsets = null;
payloadData = null;
lastPayloadLength = -1;
}
// NOTE: slow! (linear scan)
@ -430,7 +470,26 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
tvf.readBytes(term.bytes, start, deltaLen);
freq = tvf.readVInt();
if (storePositions) {
if (storePayloads) {
positions = new int[freq];
payloadOffsets = new int[freq];
int totalPayloadLength = 0;
int pos = 0;
for(int posUpto=0;posUpto<freq;posUpto++) {
int code = tvf.readVInt();
pos += code >>> 1;
positions[posUpto] = pos;
if ((code & 1) != 0) {
// length change
lastPayloadLength = tvf.readVInt();
}
payloadOffsets[posUpto] = totalPayloadLength;
totalPayloadLength += lastPayloadLength;
assert totalPayloadLength >= 0;
}
payloadData = new byte[totalPayloadLength];
tvf.readBytes(payloadData, 0, payloadData.length);
} else if (storePositions /* no payloads */) {
// TODO: we could maybe reuse last array, if we can
// somehow be careful about consumer never using two
// D&PEnums at once...
@ -502,14 +561,12 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
} else {
docsAndPositionsEnum = new TVDocsAndPositionsEnum();
}
docsAndPositionsEnum.reset(liveDocs, positions, startOffsets, endOffsets);
docsAndPositionsEnum.reset(liveDocs, positions, startOffsets, endOffsets, payloadOffsets, payloadData);
return docsAndPositionsEnum;
}
@Override
public Comparator<BytesRef> getComparator() {
// TODO: really indexer hardwires
// this...? I guess codec could buffer and re-sort...
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
}
@ -567,6 +624,9 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
private int[] positions;
private int[] startOffsets;
private int[] endOffsets;
private int[] payloadOffsets;
private BytesRef payload = new BytesRef();
private byte[] payloadBytes;
@Override
public int freq() throws IOException {
@ -602,11 +662,13 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
}
}
public void reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets) {
public void reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets, int[] payloadLengths, byte[] payloadBytes) {
this.liveDocs = liveDocs;
this.positions = positions;
this.startOffsets = startOffsets;
this.endOffsets = endOffsets;
this.payloadOffsets = payloadLengths;
this.payloadBytes = payloadBytes;
this.doc = -1;
didNext = false;
nextPos = 0;
@ -614,12 +676,19 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
@Override
public BytesRef getPayload() {
return null;
}
@Override
public boolean hasPayload() {
return false;
if (payloadOffsets == null) {
return null;
} else {
int off = payloadOffsets[nextPos-1];
int end = nextPos == payloadOffsets.length ? payloadBytes.length : payloadOffsets[nextPos];
if (end - off == 0) {
return null;
}
payload.bytes = payloadBytes;
payload.offset = off;
payload.length = end - off;
return payload;
}
}
@Override
View File
@ -106,12 +106,14 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
private String lastFieldName;
@Override
public void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets) throws IOException {
public void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets, boolean payloads) throws IOException {
assert lastFieldName == null || info.name.compareTo(lastFieldName) > 0: "fieldName=" + info.name + " lastFieldName=" + lastFieldName;
lastFieldName = info.name;
this.positions = positions;
this.offsets = offsets;
this.payloads = payloads;
lastTerm.length = 0;
lastPayloadLength = -1; // force first payload to write its length
fps[fieldCount++] = tvf.getFilePointer();
tvd.writeVInt(info.number);
tvf.writeVInt(numTerms);
@ -120,6 +122,8 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
bits |= Lucene40TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
if (offsets)
bits |= Lucene40TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
if (payloads)
bits |= Lucene40TermVectorsReader.STORE_PAYLOAD_WITH_TERMVECTOR;
tvf.writeByte(bits);
assert fieldCount <= numVectorFields;
@ -138,10 +142,12 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
// we also don't buffer during bulk merges.
private int offsetStartBuffer[] = new int[10];
private int offsetEndBuffer[] = new int[10];
private int offsetIndex = 0;
private int offsetFreq = 0;
private BytesRef payloadData = new BytesRef(10);
private int bufferedIndex = 0;
private int bufferedFreq = 0;
private boolean positions = false;
private boolean offsets = false;
private boolean payloads = false;
@Override
public void startTerm(BytesRef term, int freq) throws IOException {
@ -158,20 +164,40 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
// we might need to buffer if its a non-bulk merge
offsetStartBuffer = ArrayUtil.grow(offsetStartBuffer, freq);
offsetEndBuffer = ArrayUtil.grow(offsetEndBuffer, freq);
offsetIndex = 0;
offsetFreq = freq;
}
bufferedIndex = 0;
bufferedFreq = freq;
payloadData.length = 0;
}
int lastPosition = 0;
int lastOffset = 0;
int lastPayloadLength = -1; // force first payload to write its length
BytesRef scratch = new BytesRef(); // used only by this optimized flush below
@Override
public void addProx(int numProx, DataInput positions, DataInput offsets) throws IOException {
// TODO: technically we could just copy bytes and not re-encode if we knew the length...
if (positions != null) {
if (payloads) {
// TODO, maybe overkill and just call super.addProx() in this case?
// we do avoid buffering the offsets in RAM though.
for (int i = 0; i < numProx; i++) {
tvf.writeVInt(positions.readVInt());
int code = positions.readVInt();
if ((code & 1) == 1) {
int length = positions.readVInt();
scratch.grow(length);
scratch.length = length;
positions.readBytes(scratch.bytes, scratch.offset, scratch.length);
writePosition(code >>> 1, scratch);
} else {
writePosition(code >>> 1, null);
}
}
tvf.writeBytes(payloadData.bytes, payloadData.offset, payloadData.length);
} else if (positions != null) {
// pure positions, no payloads
for (int i = 0; i < numProx; i++) {
tvf.writeVInt(positions.readVInt() >>> 1);
}
}
@ -184,28 +210,36 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
}
@Override
public void addPosition(int position, int startOffset, int endOffset) throws IOException {
if (positions && offsets) {
public void addPosition(int position, int startOffset, int endOffset, BytesRef payload) throws IOException {
if (positions && (offsets || payloads)) {
// write position delta
tvf.writeVInt(position - lastPosition);
writePosition(position - lastPosition, payload);
lastPosition = position;
// buffer offsets
offsetStartBuffer[offsetIndex] = startOffset;
offsetEndBuffer[offsetIndex] = endOffset;
offsetIndex++;
if (offsets) {
offsetStartBuffer[bufferedIndex] = startOffset;
offsetEndBuffer[bufferedIndex] = endOffset;
}
bufferedIndex++;
// dump buffer if we are done
if (offsetIndex == offsetFreq) {
for (int i = 0; i < offsetIndex; i++) {
tvf.writeVInt(offsetStartBuffer[i] - lastOffset);
tvf.writeVInt(offsetEndBuffer[i] - offsetStartBuffer[i]);
lastOffset = offsetEndBuffer[i];
if (bufferedIndex == bufferedFreq) {
if (payloads) {
tvf.writeBytes(payloadData.bytes, payloadData.offset, payloadData.length);
}
for (int i = 0; i < bufferedIndex; i++) {
if (offsets) {
tvf.writeVInt(offsetStartBuffer[i] - lastOffset);
tvf.writeVInt(offsetEndBuffer[i] - offsetStartBuffer[i]);
lastOffset = offsetEndBuffer[i];
}
}
}
} else if (positions) {
// write position delta
tvf.writeVInt(position - lastPosition);
writePosition(position - lastPosition, payload);
lastPosition = position;
} else if (offsets) {
// write offset deltas
@ -214,6 +248,30 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
lastOffset = endOffset;
}
}
private void writePosition(int delta, BytesRef payload) throws IOException {
if (payloads) {
int payloadLength = payload == null ? 0 : payload.length;
if (payloadLength != lastPayloadLength) {
lastPayloadLength = payloadLength;
tvf.writeVInt((delta<<1)|1);
tvf.writeVInt(payloadLength);
} else {
tvf.writeVInt(delta << 1);
}
if (payloadLength > 0) {
if (payloadLength + payloadData.length < 0) {
// we overflowed the payload buffer, just throw UOE
// having > Integer.MAX_VALUE bytes of payload for a single term in a single doc is nuts.
throw new UnsupportedOperationException("A term cannot have more than Integer.MAX_VALUE bytes of payload data in a single document");
}
payloadData.append(payload);
}
} else {
tvf.writeVInt(delta);
}
}
@Override
public void abort() {
@ -255,7 +313,14 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
int idx = 0;
int numDocs = 0;
for (final AtomicReader reader : mergeState.readers) {
for (int i = 0; i < mergeState.readers.size(); i++) {
final AtomicReader reader = mergeState.readers.get(i);
// set PayloadProcessor
if (mergeState.payloadProcessorProvider != null) {
mergeState.currentReaderPayloadProcessor = mergeState.readerPayloadProcessor[i];
} else {
mergeState.currentReaderPayloadProcessor = null;
}
final SegmentReader matchingSegmentReader = mergeState.matchingSegmentReaders[idx++];
Lucene40TermVectorsReader matchingVectorsReader = null;
if (matchingSegmentReader != null) {
@ -288,8 +353,8 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
final int maxDoc = reader.maxDoc();
final Bits liveDocs = reader.getLiveDocs();
int totalNumDocs = 0;
if (matchingVectorsReader != null) {
// We can bulk-copy because the fieldInfos are "congruent"
if (matchingVectorsReader != null && mergeState.currentReaderPayloadProcessor == null) {
// We can bulk-copy because the fieldInfos are "congruent" and there is no payload processor
for (int docNum = 0; docNum < maxDoc;) {
if (!liveDocs.get(docNum)) {
// skip deleted docs
@ -324,7 +389,7 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
// NOTE: it's very important to first assign to vectors then pass it to
// termVectorsWriter.addAllDocVectors; see LUCENE-1282
Fields vectors = reader.getTermVectors(docNum);
addAllDocVectors(vectors, mergeState.fieldInfos);
addAllDocVectors(vectors, mergeState);
totalNumDocs++;
mergeState.checkAbort.work(300);
}
@ -339,8 +404,8 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
int rawDocLengths2[])
throws IOException {
final int maxDoc = reader.maxDoc();
if (matchingVectorsReader != null) {
// We can bulk-copy because the fieldInfos are "congruent"
if (matchingVectorsReader != null && mergeState.currentReaderPayloadProcessor == null) {
// We can bulk-copy because the fieldInfos are "congruent" and there is no payload processor
int docCount = 0;
while (docCount < maxDoc) {
int len = Math.min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
@ -354,7 +419,7 @@ public final class Lucene40TermVectorsWriter extends TermVectorsWriter {
// NOTE: it's very important to first assign to vectors then pass it to
// termVectorsWriter.addAllDocVectors; see LUCENE-1282
Fields vectors = reader.getTermVectors(docNum);
addAllDocVectors(vectors, mergeState.fieldInfos);
addAllDocVectors(vectors, mergeState);
mergeState.checkAbort.work(300);
}
}
View File
@ -366,7 +366,7 @@ the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage
factors need no longer be a single byte, they can be any DocValues
{@link org.apache.lucene.index.DocValues.Type type}. Terms need not be unicode
strings, they can be any byte sequence. Term offsets can optionally be indexed
into the postings lists.</li>
into the postings lists. Payloads can be stored in the term vectors.</li>
</ul>
<a name="Limitations" id="Limitations"></a>
<h2>Limitations</h2>
View File
@ -32,7 +32,6 @@ import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.OrdTermState;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
@ -44,6 +43,7 @@ import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnmodifiableIterator;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.RunAutomaton;
import org.apache.lucene.util.automaton.Transition;
@ -124,36 +124,14 @@ public class DirectPostingsFormat extends PostingsFormat {
private final Map<String,DirectField> fields = new TreeMap<String,DirectField>();
public DirectFields(SegmentReadState state, Fields fields, int minSkipCount, int lowFreqCutoff) throws IOException {
FieldsEnum fieldsEnum = fields.iterator();
String field;
while ((field = fieldsEnum.next()) != null) {
this.fields.put(field, new DirectField(state, field, fieldsEnum.terms(), minSkipCount, lowFreqCutoff));
for (String field : fields) {
this.fields.put(field, new DirectField(state, field, fields.terms(field), minSkipCount, lowFreqCutoff));
}
}
@Override
public FieldsEnum iterator() {
final Iterator<Map.Entry<String,DirectField>> iter = fields.entrySet().iterator();
return new FieldsEnum() {
Map.Entry<String,DirectField> current;
@Override
public String next() {
if (iter.hasNext()) {
current = iter.next();
return current.getKey();
} else {
return null;
}
}
@Override
public Terms terms() {
return current.getValue();
}
};
public Iterator<String> iterator() {
return new UnmodifiableIterator<String>(fields.keySet().iterator());
}
@Override
@ -348,9 +326,8 @@ public class DirectPostingsFormat extends PostingsFormat {
scratch.add(docsAndPositionsEnum.endOffset());
}
if (hasPayloads) {
final BytesRef payload;
if (docsAndPositionsEnum.hasPayload()) {
payload = docsAndPositionsEnum.getPayload();
final BytesRef payload = docsAndPositionsEnum.getPayload();
if (payload != null) {
scratch.add(payload.length);
ros.writeBytes(payload.bytes, payload.offset, payload.length);
} else {
@ -421,9 +398,8 @@ public class DirectPostingsFormat extends PostingsFormat {
for(int pos=0;pos<freq;pos++) {
positions[upto][posUpto] = docsAndPositionsEnum.nextPosition();
if (hasPayloads) {
if (docsAndPositionsEnum.hasPayload()) {
BytesRef payload = docsAndPositionsEnum.getPayload();
assert payload != null;
BytesRef payload = docsAndPositionsEnum.getPayload();
if (payload != null) {
byte[] payloadBytes = new byte[payload.length];
System.arraycopy(payload.bytes, payload.offset, payloadBytes, 0, payload.length);
payloads[upto][pos] = payloadBytes;
@ -635,6 +611,21 @@ public class DirectPostingsFormat extends PostingsFormat {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public boolean hasOffsets() {
return hasOffsets;
}
@Override
public boolean hasPositions() {
return hasPos;
}
@Override
public boolean hasPayloads() {
return hasPayloads;
}
private final class DirectTermsEnum extends TermsEnum {
private final BytesRef scratch = new BytesRef();
@ -1791,18 +1782,12 @@ public class DirectPostingsFormat extends PostingsFormat {
return docID;
}
@Override
public boolean hasPayload() {
return payloadLength > 0;
}
@Override
public BytesRef getPayload() {
if (payloadLength > 0) {
payload.bytes = payloadBytes;
payload.offset = lastPayloadOffset;
payload.length = payloadLength;
payloadLength = 0;
return payload;
} else {
return null;
@ -1995,7 +1980,6 @@ public class DirectPostingsFormat extends PostingsFormat {
private int upto;
private int docID = -1;
private int posUpto;
private boolean gotPayload;
private int[] curPositions;
public HighFreqDocsAndPositionsEnum(Bits liveDocs, boolean hasOffsets) {
@ -2065,7 +2049,6 @@ public class DirectPostingsFormat extends PostingsFormat {
@Override
public int nextPosition() {
posUpto += posJump;
gotPayload = false;
return curPositions[posUpto];
}
@ -2199,21 +2182,22 @@ public class DirectPostingsFormat extends PostingsFormat {
}
}
@Override
public boolean hasPayload() {
return !gotPayload && payloads != null && payloads[upto][posUpto/(hasOffsets ? 3 : 1)] != null;
}
private final BytesRef payload = new BytesRef();
@Override
public BytesRef getPayload() {
final byte[] payloadBytes = payloads[upto][posUpto/(hasOffsets ? 3:1)];
payload.bytes = payloadBytes;
payload.length = payloadBytes.length;
payload.offset = 0;
gotPayload = true;
return payload;
if (payloads == null) {
return null;
} else {
final byte[] payloadBytes = payloads[upto][posUpto/(hasOffsets ? 3:1)];
if (payloadBytes == null) {
return null;
}
payload.bytes = payloadBytes;
payload.length = payloadBytes.length;
payload.offset = 0;
return payload;
}
}
}
}
View File
@ -34,7 +34,6 @@ import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
@ -49,6 +48,7 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnmodifiableIterator;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
@ -446,7 +446,6 @@ public class MemoryPostingsFormat extends PostingsFormat {
private int numDocs;
private int posPending;
private int payloadLength;
private boolean payloadRetrieved;
final boolean storeOffsets;
int offsetLength;
int startOffset;
@ -484,7 +483,6 @@ public class MemoryPostingsFormat extends PostingsFormat {
payloadLength = 0;
this.numDocs = numDocs;
posPending = 0;
payloadRetrieved = false;
startOffset = storeOffsets ? 0 : -1; // always return -1 if no offsets are stored
offsetLength = 0;
return this;
@ -577,10 +575,6 @@ public class MemoryPostingsFormat extends PostingsFormat {
payload.offset = in.getPosition();
in.skipBytes(payloadLength);
payload.length = payloadLength;
// Necessary, in case caller changed the
// payload.bytes from prior call:
payload.bytes = buffer;
payloadRetrieved = false;
}
//System.out.println(" pos=" + pos + " payload=" + payload + " fp=" + in.getPosition());
@ -599,13 +593,7 @@ public class MemoryPostingsFormat extends PostingsFormat {
@Override
public BytesRef getPayload() {
payloadRetrieved = true;
return payload;
}
@Override
public boolean hasPayload() {
return !payloadRetrieved && payload.length > 0;
return payload.length > 0 ? payload : null;
}
@Override
@ -834,6 +822,21 @@ public class MemoryPostingsFormat extends PostingsFormat {
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public boolean hasOffsets() {
return field.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
}
@Override
public boolean hasPositions() {
return field.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
}
@Override
public boolean hasPayloads() {
return field.hasPayloads();
}
}
@Override
@ -859,24 +862,8 @@ public class MemoryPostingsFormat extends PostingsFormat {
return new FieldsProducer() {
@Override
public FieldsEnum iterator() {
final Iterator<TermsReader> iter = fields.values().iterator();
return new FieldsEnum() {
private TermsReader current;
@Override
public String next() {
current = iter.next();
return current.field.name;
}
@Override
public Terms terms() {
return current;
}
};
public Iterator<String> iterator() {
return new UnmodifiableIterator<String>(fields.keySet().iterator());
}
@Override
View File
@ -30,11 +30,11 @@ import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.UnmodifiableIterator;
/**
* Enables per field format support.
@ -197,34 +197,9 @@ public abstract class PerFieldPostingsFormat extends PostingsFormat {
}
}
private final class FieldsIterator extends FieldsEnum {
private final Iterator<String> it;
private String current;
public FieldsIterator() {
it = fields.keySet().iterator();
}
@Override
public String next() {
if (it.hasNext()) {
current = it.next();
} else {
current = null;
}
return current;
}
@Override
public Terms terms() throws IOException {
return fields.get(current).terms(current);
}
}
@Override
public FieldsEnum iterator() throws IOException {
return new FieldsIterator();
public Iterator<String> iterator() {
return new UnmodifiableIterator<String>(fields.keySet().iterator());
}
@Override
View File
@ -532,19 +532,13 @@ public class PulsingPostingsReader extends PostingsReaderBase {
}
}
@Override
public boolean hasPayload() {
return storePayloads && !payloadRetrieved && payloadLength > 0;
}
@Override
public BytesRef getPayload() throws IOException {
//System.out.println("PR getPayload payloadLength=" + payloadLength + " this=" + this);
if (payloadRetrieved) {
throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
}
payloadRetrieved = true;
if (payloadLength > 0) {
return payload;
} else if (storePayloads && payloadLength > 0) {
payloadRetrieved = true;
if (payload == null) {
payload = new BytesRef(payloadLength);
} else {
View File
@ -714,7 +714,11 @@ public class SepPostingsReader extends PostingsReaderBase {
@Override
public BytesRef getPayload() throws IOException {
if (!payloadPending) {
throw new IOException("Either no payload exists at this term position or an attempt was made to load it more than once.");
return null;
}
if (pendingPayloadBytes == 0) {
return payload;
}
assert pendingPayloadBytes >= payloadLength;
@ -731,15 +735,9 @@ public class SepPostingsReader extends PostingsReaderBase {
}
payloadIn.readBytes(payload.bytes, 0, payloadLength);
payloadPending = false;
payload.length = payloadLength;
pendingPayloadBytes = 0;
return payload;
}
@Override
public boolean hasPayload() {
return payloadPending && payloadLength > 0;
}
}
}
View File
@ -20,14 +20,17 @@ package org.apache.lucene.codecs.simpletext;
import java.io.IOException;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
@ -40,6 +43,7 @@ import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.UnmodifiableIterator;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
@ -48,7 +52,7 @@ import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
class SimpleTextFieldsReader extends FieldsProducer {
private final TreeMap<String,Long> fields;
private final IndexInput in;
private final FieldInfos fieldInfos;
@ -66,35 +70,22 @@ class SimpleTextFieldsReader extends FieldsProducer {
in = state.dir.openInput(SimpleTextPostingsFormat.getPostingsFileName(state.segmentInfo.name, state.segmentSuffix), state.context);
fieldInfos = state.fieldInfos;
fields = readFields((IndexInput)in.clone());
}
private class SimpleTextFieldsEnum extends FieldsEnum {
private final IndexInput in;
private final BytesRef scratch = new BytesRef(10);
private String current;
public SimpleTextFieldsEnum() {
this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
}
@Override
public String next() throws IOException {
while(true) {
SimpleTextUtil.readLine(in, scratch);
if (scratch.equals(END)) {
current = null;
return null;
}
if (StringHelper.startsWith(scratch, FIELD)) {
return current = new String(scratch.bytes, scratch.offset + FIELD.length, scratch.length - FIELD.length, "UTF-8");
}
private TreeMap<String,Long> readFields(IndexInput in) throws IOException {
BytesRef scratch = new BytesRef(10);
TreeMap<String,Long> fields = new TreeMap<String,Long>();
while (true) {
SimpleTextUtil.readLine(in, scratch);
if (scratch.equals(END)) {
return fields;
} else if (StringHelper.startsWith(scratch, FIELD)) {
String fieldName = new String(scratch.bytes, scratch.offset + FIELD.length, scratch.length - FIELD.length, "UTF-8");
fields.put(fieldName, in.getFilePointer());
}
}
@Override
public Terms terms() throws IOException {
return SimpleTextFieldsReader.this.terms(current);
}
}
private class SimpleTextTermsEnum extends TermsEnum {
@ -471,18 +462,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
@Override
public BytesRef getPayload() {
// Some tests rely on only being able to retrieve the
// payload once
try {
return payload;
} finally {
payload = null;
}
}
@Override
public boolean hasPayload() {
return payload != null;
return payload;
}
}
@ -498,7 +478,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
private class SimpleTextTerms extends Terms {
private final long termsStart;
private final IndexOptions indexOptions;
private final FieldInfo fieldInfo;
private long sumTotalTermFreq;
private long sumDocFreq;
private int docCount;
@ -509,7 +489,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
public SimpleTextTerms(String field, long termsStart) throws IOException {
this.termsStart = termsStart;
indexOptions = fieldInfos.fieldInfo(field).getIndexOptions();
fieldInfo = fieldInfos.fieldInfo(field);
loadTerms();
}
@ -579,7 +559,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
@Override
public TermsEnum iterator(TermsEnum reuse) throws IOException {
if (fst != null) {
return new SimpleTextTermsEnum(fst, indexOptions);
return new SimpleTextTermsEnum(fst, fieldInfo.getIndexOptions());
} else {
return TermsEnum.EMPTY;
}
@ -597,7 +577,7 @@ class SimpleTextFieldsReader extends FieldsProducer {
@Override
public long getSumTotalTermFreq() {
return indexOptions == IndexOptions.DOCS_ONLY ? -1 : sumTotalTermFreq;
return fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : sumTotalTermFreq;
}
@Override
@ -609,11 +589,26 @@ class SimpleTextFieldsReader extends FieldsProducer {
public int getDocCount() throws IOException {
return docCount;
}
@Override
public boolean hasOffsets() {
return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
}
@Override
public boolean hasPositions() {
return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
}
@Override
public boolean hasPayloads() {
return fieldInfo.hasPayloads();
}
}
@Override
public FieldsEnum iterator() throws IOException {
return new SimpleTextFieldsEnum();
public Iterator<String> iterator() {
return new UnmodifiableIterator<String>(fields.keySet().iterator());
}
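Both iterator() overrides in this commit return an org.apache.lucene.util.UnmodifiableIterator, whose body is not shown in this diff. A minimal sketch of its assumed shape, inferred from the name and the call sites: a pass-through wrapper that forbids remove(), so consumers cannot mutate the codec's internal field map.
import java.util.Iterator;
/** Assumed shape of org.apache.lucene.util.UnmodifiableIterator (its body is
 *  not part of this diff): delegate hasNext()/next(), forbid remove(). */
public final class UnmodifiableIterator<T> implements Iterator<T> {
  private final Iterator<T> in;
  public UnmodifiableIterator(Iterator<T> in) {
    this.in = in;
  }
  @Override
  public boolean hasNext() {
    return in.hasNext();
  }
  @Override
  public T next() {
    return in.next();
  }
  @Override
  public void remove() {
    // the whole point: keySet().iterator() would otherwise allow removal
    throw new UnsupportedOperationException();
  }
}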
private final Map<String,Terms> termsCache = new HashMap<String,Terms>();
@ -622,15 +617,13 @@ class SimpleTextFieldsReader extends FieldsProducer {
synchronized public Terms terms(String field) throws IOException {
Terms terms = termsCache.get(field);
if (terms == null) {
SimpleTextFieldsEnum fe = (SimpleTextFieldsEnum) iterator();
String fieldUpto;
while((fieldUpto = fe.next()) != null) {
if (fieldUpto.equals(field)) {
terms = new SimpleTextTerms(field, fe.in.getFilePointer());
break;
}
Long fp = fields.get(field);
if (fp == null) {
return null;
} else {
terms = new SimpleTextTerms(field, fp);
termsCache.put(field, terms);
}
termsCache.put(field, terms);
}
return terms;
}


@ -29,7 +29,6 @@ import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Terms;
@ -45,6 +44,7 @@ import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.UnmodifiableIterator;
import static org.apache.lucene.codecs.simpletext.SimpleTextTermVectorsWriter.*;
@ -126,11 +126,15 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
assert StringHelper.startsWith(scratch, FIELDOFFSETS);
boolean offsets = Boolean.parseBoolean(readString(FIELDOFFSETS.length, scratch));
readLine();
assert StringHelper.startsWith(scratch, FIELDPAYLOADS);
boolean payloads = Boolean.parseBoolean(readString(FIELDPAYLOADS.length, scratch));
readLine();
assert StringHelper.startsWith(scratch, FIELDTERMCOUNT);
int termCount = parseIntAt(FIELDTERMCOUNT.length);
SimpleTVTerms terms = new SimpleTVTerms();
SimpleTVTerms terms = new SimpleTVTerms(offsets, positions, payloads);
fields.put(fieldName, terms);
for (int j = 0; j < termCount; j++) {
@ -152,6 +156,9 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
if (positions || offsets) {
if (positions) {
postings.positions = new int[postings.freq];
if (payloads) {
postings.payloads = new BytesRef[postings.freq];
}
}
if (offsets) {
@ -164,6 +171,17 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
readLine();
assert StringHelper.startsWith(scratch, POSITION);
postings.positions[k] = parseIntAt(POSITION.length);
if (payloads) {
readLine();
assert StringHelper.startsWith(scratch, PAYLOAD);
if (scratch.length - PAYLOAD.length == 0) {
postings.payloads[k] = null;
} else {
byte payloadBytes[] = new byte[scratch.length - PAYLOAD.length];
System.arraycopy(scratch.bytes, scratch.offset+PAYLOAD.length, payloadBytes, 0, payloadBytes.length);
postings.payloads[k] = new BytesRef(payloadBytes);
}
}
}
if (offsets) {
@ -222,26 +240,8 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
}
@Override
public FieldsEnum iterator() throws IOException {
return new FieldsEnum() {
private Iterator<Map.Entry<String,SimpleTVTerms>> iterator = fields.entrySet().iterator();
private Map.Entry<String,SimpleTVTerms> current = null;
@Override
public String next() {
if (!iterator.hasNext()) {
return null;
} else {
current = iterator.next();
return current.getKey();
}
}
@Override
public Terms terms() {
return current.getValue();
}
};
public Iterator<String> iterator() {
return new UnmodifiableIterator<String>(fields.keySet().iterator());
}
@Override
@ -257,8 +257,14 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
private static class SimpleTVTerms extends Terms {
final SortedMap<BytesRef,SimpleTVPostings> terms;
final boolean hasOffsets;
final boolean hasPositions;
final boolean hasPayloads;
SimpleTVTerms() {
SimpleTVTerms(boolean hasOffsets, boolean hasPositions, boolean hasPayloads) {
this.hasOffsets = hasOffsets;
this.hasPositions = hasPositions;
this.hasPayloads = hasPayloads;
terms = new TreeMap<BytesRef,SimpleTVPostings>();
}
@ -292,6 +298,21 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
public int getDocCount() throws IOException {
return 1;
}
@Override
public boolean hasOffsets() {
return hasOffsets;
}
@Override
public boolean hasPositions() {
return hasPositions;
}
@Override
public boolean hasPayloads() {
return hasPayloads;
}
}
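The new hasOffsets()/hasPositions()/hasPayloads() methods let consumers probe a term vector's capabilities up front instead of probing for a null DocsAndPositionsEnum. A hedged sketch, assuming an open AtomicReader and a hypothetical "body" field indexed with term vectors:
// Sketch: check the new Terms capability flags before asking for positions.
static void inspectVector(AtomicReader reader, int docID) throws IOException {
  Terms vector = reader.getTermVector(docID, "body");
  if (vector == null || !vector.hasPositions()) {
    return; // no vector for this doc/field, or positions were not recorded
  }
  boolean carriesPayloads = vector.hasPayloads();
  TermsEnum termsEnum = vector.iterator(null);
  BytesRef term;
  while ((term = termsEnum.next()) != null) {
    // hasPositions() guarantees a non-null positions enum here
    DocsAndPositionsEnum positions = termsEnum.docsAndPositions(null, null);
    // walk positions, consulting getPayload() only if carriesPayloads
  }
}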
private static class SimpleTVPostings {
@ -299,6 +320,7 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
private int positions[];
private int startOffsets[];
private int endOffsets[];
private BytesRef payloads[];
}
private static class SimpleTVTermsEnum extends TermsEnum {
@ -372,7 +394,7 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
}
// TODO: reuse
SimpleTVDocsAndPositionsEnum e = new SimpleTVDocsAndPositionsEnum();
e.reset(liveDocs, postings.positions, postings.startOffsets, postings.endOffsets);
e.reset(liveDocs, postings.positions, postings.startOffsets, postings.endOffsets, postings.payloads);
return e;
}
@ -433,6 +455,7 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
private int nextPos;
private Bits liveDocs;
private int[] positions;
private BytesRef[] payloads;
private int[] startOffsets;
private int[] endOffsets;
@ -470,11 +493,12 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
}
}
public void reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets) {
public void reset(Bits liveDocs, int[] positions, int[] startOffsets, int[] endOffsets, BytesRef payloads[]) {
this.liveDocs = liveDocs;
this.positions = positions;
this.startOffsets = startOffsets;
this.endOffsets = endOffsets;
this.payloads = payloads;
this.doc = -1;
didNext = false;
nextPos = 0;
@ -482,12 +506,7 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
@Override
public BytesRef getPayload() {
return null;
}
@Override
public boolean hasPayload() {
return false;
return payloads == null ? null : payloads[nextPos-1];
}
@Override


@ -45,10 +45,12 @@ public class SimpleTextTermVectorsWriter extends TermVectorsWriter {
static final BytesRef FIELDNAME = new BytesRef(" name ");
static final BytesRef FIELDPOSITIONS = new BytesRef(" positions ");
static final BytesRef FIELDOFFSETS = new BytesRef(" offsets ");
static final BytesRef FIELDPAYLOADS = new BytesRef(" payloads ");
static final BytesRef FIELDTERMCOUNT = new BytesRef(" numterms ");
static final BytesRef TERMTEXT = new BytesRef(" term ");
static final BytesRef TERMFREQ = new BytesRef(" freq ");
static final BytesRef POSITION = new BytesRef(" position ");
static final BytesRef PAYLOAD = new BytesRef(" payload ");
static final BytesRef STARTOFFSET = new BytesRef(" startoffset ");
static final BytesRef ENDOFFSET = new BytesRef(" endoffset ");
@ -61,6 +63,7 @@ public class SimpleTextTermVectorsWriter extends TermVectorsWriter {
private final BytesRef scratch = new BytesRef();
private boolean offsets;
private boolean positions;
private boolean payloads;
public SimpleTextTermVectorsWriter(Directory directory, String segment, IOContext context) throws IOException {
this.directory = directory;
@ -89,7 +92,7 @@ public class SimpleTextTermVectorsWriter extends TermVectorsWriter {
}
@Override
public void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets) throws IOException {
public void startField(FieldInfo info, int numTerms, boolean positions, boolean offsets, boolean payloads) throws IOException {
write(FIELD);
write(Integer.toString(info.number));
newLine();
@ -106,12 +109,17 @@ public class SimpleTextTermVectorsWriter extends TermVectorsWriter {
write(Boolean.toString(offsets));
newLine();
write(FIELDPAYLOADS);
write(Boolean.toString(payloads));
newLine();
write(FIELDTERMCOUNT);
write(Integer.toString(numTerms));
newLine();
this.positions = positions;
this.offsets = offsets;
this.payloads = payloads;
}
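Together with the addPosition() change just below, a field section in the simpletext term-vector format now carries payload information roughly like this (layout illustrative; the payload line is written for every position once the field's flag is true, and an empty value is read back as null):
 payloads true          <- new FIELDPAYLOADS flag in each field header
 ...
 position 5
 payload <raw bytes>    <- new PAYLOAD line following each position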
@Override
@ -126,13 +134,22 @@ public class SimpleTextTermVectorsWriter extends TermVectorsWriter {
}
@Override
public void addPosition(int position, int startOffset, int endOffset) throws IOException {
public void addPosition(int position, int startOffset, int endOffset, BytesRef payload) throws IOException {
assert positions || offsets;
if (positions) {
write(POSITION);
write(Integer.toString(position));
newLine();
if (payloads) {
write(PAYLOAD);
if (payload != null) {
assert payload.length > 0;
write(payload);
}
newLine();
}
}
if (offsets) {


@ -39,6 +39,7 @@ public class FieldType implements IndexableFieldType {
private boolean storeTermVectors;
private boolean storeTermVectorOffsets;
private boolean storeTermVectorPositions;
private boolean storeTermVectorPayloads;
private boolean omitNorms;
private IndexOptions indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
private DocValues.Type docValueType;
@ -53,6 +54,7 @@ public class FieldType implements IndexableFieldType {
this.storeTermVectors = ref.storeTermVectors();
this.storeTermVectorOffsets = ref.storeTermVectorOffsets();
this.storeTermVectorPositions = ref.storeTermVectorPositions();
this.storeTermVectorPayloads = ref.storeTermVectorPayloads();
this.omitNorms = ref.omitNorms();
this.indexOptions = ref.indexOptions();
this.docValueType = ref.docValueType();
@ -132,6 +134,15 @@ public class FieldType implements IndexableFieldType {
this.storeTermVectorPositions = value;
}
public boolean storeTermVectorPayloads() {
return this.storeTermVectorPayloads;
}
public void setStoreTermVectorPayloads(boolean value) {
checkIfFrozen();
this.storeTermVectorPayloads = value;
}
public boolean omitNorms() {
return this.omitNorms;
}
@ -198,24 +209,19 @@ public class FieldType implements IndexableFieldType {
result.append(",");
result.append("indexed");
if (tokenized()) {
if (result.length() > 0)
result.append(",");
result.append("tokenized");
result.append(",tokenized");
}
if (storeTermVectors()) {
if (result.length() > 0)
result.append(",");
result.append("termVector");
result.append(",termVector");
}
if (storeTermVectorOffsets()) {
if (result.length() > 0)
result.append(",");
result.append("termVectorOffsets");
result.append(",termVectorOffsets");
}
if (storeTermVectorPositions()) {
if (result.length() > 0)
result.append(",");
result.append("termVectorPosition");
result.append(",termVectorPosition");
if (storeTermVectorPayloads()) {
result.append(",termVectorPayloads");
}
}
if (omitNorms()) {
result.append(",omitNorms");
@ -232,7 +238,9 @@ public class FieldType implements IndexableFieldType {
}
}
if (docValueType != null) {
result.append(",docValueType=");
if (result.length() > 0)
result.append(",");
result.append("docValueType=");
result.append(docValueType);
}


@ -685,12 +685,7 @@ public class CheckIndex {
DocsAndPositionsEnum postings = null;
String lastField = null;
final FieldsEnum fieldsEnum = fields.iterator();
while(true) {
final String field = fieldsEnum.next();
if (field == null) {
break;
}
for (String field : fields) {
// MultiFieldsEnum relies upon this order...
if (lastField != null && field.compareTo(lastField) <= 0) {
throw new RuntimeException("fields out of order: lastField=" + lastField + " field=" + field);
@ -713,11 +708,16 @@ public class CheckIndex {
// assert fields.terms(field) != null;
computedFieldCount++;
final Terms terms = fieldsEnum.terms();
final Terms terms = fields.terms(field);
if (terms == null) {
continue;
}
final boolean hasPositions = terms.hasPositions();
final boolean hasOffsets = terms.hasOffsets();
// term vectors cannot omit TF
final boolean hasFreqs = isVectors || fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
final TermsEnum termsEnum = terms.iterator(null);
boolean hasOrd = true;
@ -777,17 +777,10 @@ public class CheckIndex {
status.termCount++;
final DocsEnum docs2;
final boolean hasPositions;
// if we are checking vectors, we have freqs implicitly
final boolean hasFreqs = isVectors || fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
// if we are checking vectors, offsets are a free-for-all anyway
final boolean hasOffsets = isVectors || fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
if (postings != null) {
docs2 = postings;
hasPositions = true;
} else {
docs2 = docs;
hasPositions = false;
}
int lastDoc = -1;
@ -824,22 +817,17 @@ public class CheckIndex {
if (hasPositions) {
for(int j=0;j<freq;j++) {
final int pos = postings.nextPosition();
// NOTE: pos=-1 is allowed because of ancient bug
// (LUCENE-1542) whereby IndexWriter could
// write pos=-1 when first token's posInc is 0
// (separately: analyzers should not give
// posInc=0 to first token); also, term
// vectors are allowed to return pos=-1 if
// they indexed offset but not positions:
if (pos < -1) {
if (pos < 0) {
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds");
}
if (pos < lastPos) {
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " < lastPos " + lastPos);
}
lastPos = pos;
if (postings.hasPayload()) {
postings.getPayload();
BytesRef payload = postings.getPayload();
if (payload != null && payload.length < 1) {
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " payload length is out of bounds " + payload.length);
}
if (hasOffsets) {
int startOffset = postings.startOffset();
@ -924,14 +912,8 @@ public class CheckIndex {
int lastOffset = 0;
for(int posUpto=0;posUpto<freq;posUpto++) {
final int pos = postings.nextPosition();
// NOTE: pos=-1 is allowed because of ancient bug
// (LUCENE-1542) whereby IndexWriter could
// write pos=-1 when first token's posInc is 0
// (separately: analyzers should not give
// posInc=0 to first token); also, term
// vectors are allowed to return pos=-1 if
// they indexed offset but not positions:
if (pos < -1) {
if (pos < 0) {
throw new RuntimeException("position " + pos + " is out of bounds");
}
if (pos < lastPosition) {
@ -1000,11 +982,7 @@ public class CheckIndex {
// only happen if it's a ghost field (field with
// no terms, eg there used to be terms but all
// docs got deleted and then merged away):
// make sure TermsEnum is empty:
final Terms fieldTerms2 = fieldsEnum.terms();
if (fieldTerms2 != null && fieldTerms2.iterator(null).next() != null) {
throw new RuntimeException("Fields.terms(field=" + field + ") returned null yet the field appears to have terms");
}
} else {
if (fieldTerms instanceof BlockTreeTermsReader.FieldReader) {
final BlockTreeTermsReader.Stats stats = ((BlockTreeTermsReader.FieldReader) fieldTerms).computeStats();
@ -1415,9 +1393,7 @@ public class CheckIndex {
status.docCount++;
}
FieldsEnum fieldsEnum = tfv.iterator();
String field = null;
while((field = fieldsEnum.next()) != null) {
for(String field : tfv) {
if (doStats) {
status.totVectors++;
}
@ -1432,6 +1408,8 @@ public class CheckIndex {
Terms terms = tfv.terms(field);
termsEnum = terms.iterator(termsEnum);
final boolean postingsHasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
final boolean postingsHasPayload = fieldInfo.hasPayloads();
final boolean vectorsHasPayload = terms.hasPayloads();
Terms postingsTerms = postingsFields.terms(field);
if (postingsTerms == null) {
@ -1439,19 +1417,18 @@ public class CheckIndex {
}
postingsTermsEnum = postingsTerms.iterator(postingsTermsEnum);
final boolean hasProx = terms.hasOffsets() || terms.hasPositions();
BytesRef term = null;
while ((term = termsEnum.next()) != null) {
final boolean hasProx;
// Try positions:
postings = termsEnum.docsAndPositions(null, postings);
if (postings == null) {
hasProx = false;
// Try docIDs & freqs:
docs = termsEnum.docs(null, docs);
if (hasProx) {
postings = termsEnum.docsAndPositions(null, postings);
assert postings != null;
docs = null;
} else {
hasProx = true;
docs = termsEnum.docs(null, docs);
assert docs != null;
postings = null;
}
final DocsEnum docs2;
@ -1504,7 +1481,7 @@ public class CheckIndex {
int pos = postings.nextPosition();
if (postingsPostings != null) {
int postingsPos = postingsPostings.nextPosition();
if (pos != -1 && postingsPos != -1 && pos != postingsPos) {
if (terms.hasPositions() && pos != postingsPos) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": pos=" + pos + " differs from postings pos=" + postingsPos);
}
}
@ -1535,6 +1512,34 @@ public class CheckIndex {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + ": endOffset=" + endOffset + " differs from postings endOffset=" + postingsEndOffset);
}
}
BytesRef payload = postings.getPayload();
if (payload != null) {
assert vectorsHasPayload;
}
if (postingsHasPayload && vectorsHasPayload) {
assert postingsPostings != null;
if (payload == null) {
// we have payloads, but not at this position.
// postings has payloads too, it should not have one at this position
if (postingsPostings.getPayload() != null) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has no payload but postings does: " + postingsPostings.getPayload());
}
} else {
// we have payloads, and one at this position
// postings should also have one at this position, with the same bytes.
if (postingsPostings.getPayload() == null) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has payload=" + payload + " but postings does not.");
}
BytesRef postingsPayload = postingsPostings.getPayload();
if (!payload.equals(postingsPayload)) {
throw new RuntimeException("vector term=" + term + " field=" + field + " doc=" + j + " has payload=" + payload + " but differs from postings payload=" + postingsPayload);
}
}
}
}
}
}
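These stricter checks (pos < 0 instead of pos < -1, payload length bounds, vector/postings payload agreement) all run during an ordinary CheckIndex pass. A minimal invocation sketch, with the index path as a placeholder:
// Sketch: a plain CheckIndex run exercises the new validation.
Directory dir = FSDirectory.open(new File("/path/to/index"));  // placeholder
CheckIndex checker = new CheckIndex(dir);
checker.setInfoStream(System.out);
CheckIndex.Status status = checker.checkIndex();
if (!status.clean) {
  System.out.println("index has problems");
}
dir.close();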


@ -24,7 +24,7 @@ import java.util.List;
import java.util.Map;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.MergedIterator;
import org.apache.lucene.index.BufferedDeletesStream.QueryAndLimit;
class CoalescedDeletes {
@ -48,13 +48,14 @@ class CoalescedDeletes {
public Iterable<Term> termsIterable() {
return new Iterable<Term>() {
@SuppressWarnings("unchecked")
@Override
public Iterator<Term> iterator() {
ArrayList<Iterator<Term>> subs = new ArrayList<Iterator<Term>>(iterables.size());
for (Iterable<Term> iterable : iterables) {
subs.add(iterable.iterator());
Iterator<Term> subs[] = new Iterator[iterables.size()];
for (int i = 0; i < iterables.size(); i++) {
subs[i] = iterables.get(i).iterator();
}
return mergedIterator(subs);
return new MergedIterator<Term>(subs);
}
};
}
@ -86,106 +87,4 @@ class CoalescedDeletes {
}
};
}
/** provides a merged view across multiple iterators */
static Iterator<Term> mergedIterator(final List<Iterator<Term>> iterators) {
return new Iterator<Term>() {
Term current;
TermMergeQueue queue = new TermMergeQueue(iterators.size());
SubIterator[] top = new SubIterator[iterators.size()];
int numTop;
{
int index = 0;
for (Iterator<Term> iterator : iterators) {
if (iterator.hasNext()) {
SubIterator sub = new SubIterator();
sub.current = iterator.next();
sub.iterator = iterator;
sub.index = index++;
queue.add(sub);
}
}
}
public boolean hasNext() {
if (queue.size() > 0) {
return true;
}
for (int i = 0; i < numTop; i++) {
if (top[i].iterator.hasNext()) {
return true;
}
}
return false;
}
public Term next() {
// restore queue
pushTop();
// gather equal top fields
if (queue.size() > 0) {
pullTop();
} else {
current = null;
}
return current;
}
public void remove() {
throw new UnsupportedOperationException();
}
private void pullTop() {
// extract all subs from the queue that have the same top term
assert numTop == 0;
while (true) {
top[numTop++] = queue.pop();
if (queue.size() == 0
|| !(queue.top()).current.equals(top[0].current)) {
break;
}
}
current = top[0].current;
}
private void pushTop() {
// call next() on each top, and put back into queue
for (int i = 0; i < numTop; i++) {
if (top[i].iterator.hasNext()) {
top[i].current = top[i].iterator.next();
queue.add(top[i]);
} else {
// no more terms
top[i].current = null;
}
}
numTop = 0;
}
};
}
private static class SubIterator {
Iterator<Term> iterator;
Term current;
int index;
}
private static class TermMergeQueue extends PriorityQueue<SubIterator> {
TermMergeQueue(int size) {
super(size);
}
@Override
protected boolean lessThan(SubIterator a, SubIterator b) {
final int cmp = a.current.compareTo(b.current);
if (cmp != 0) {
return cmp < 0;
} else {
return a.index < b.index;
}
}
}
}
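The inline merging machinery removed above now lives in org.apache.lucene.util.MergedIterator, which the iterator() method instantiates directly. A hedged usage sketch matching the new call site; sortedA and sortedB stand in for any sorted Term collections:
// Sketch: MergedIterator exposes several sorted Iterator<Term>s as one
// sorted stream, collapsing equal terms across sub-iterators just as the
// removed inline implementation did.
@SuppressWarnings("unchecked")
Iterator<Term>[] subs = new Iterator[] { sortedA.iterator(), sortedB.iterator() };
Iterator<Term> merged = new MergedIterator<Term>(subs);
while (merged.hasNext()) {
  Term term = merged.next();  // each distinct term is seen once, in order
}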


@ -105,7 +105,7 @@ public abstract class DocValues implements Closeable {
* <p>
* {@link Source} instances obtained from this method are closed / released
* from the cache once this {@link DocValues} instance is closed by the
* {@link IndexReader}, {@link Fields} or {@link FieldsEnum} the
* {@link IndexReader}, {@link Fields} or the
* {@link DocValues} was created from.
*/
public Source getSource() throws IOException {


@ -48,11 +48,8 @@ public abstract class DocsAndPositionsEnum extends DocsEnum {
public abstract int endOffset() throws IOException;
/** Returns the payload at this position, or null if no
* payload was indexed. Only call this once per
* position. You should not modify anything (neither
* members of the returned BytesRef nor bytes in the
* byte[]). */
* payload was indexed. You should not modify anything
* (neither members of the returned BytesRef nor bytes
* in the byte[]). */
public abstract BytesRef getPayload() throws IOException;
public abstract boolean hasPayload();
}
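With hasPayload() removed, a null return from getPayload() is now the signal that the current position carries no payload, and the one-call-per-position restriction is gone. A hedged consumption sketch; the two-argument docsAndPositions(liveDocs, reuse) form matches the call sites elsewhere in this commit:
// Sketch: the new payload contract -- null means "no payload here".
static void consumePayloads(TermsEnum termsEnum) throws IOException {
  DocsAndPositionsEnum postings = termsEnum.docsAndPositions(null, null);
  if (postings == null) {
    return; // field was indexed without positions
  }
  while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
    final int freq = postings.freq();
    for (int i = 0; i < freq; i++) {
      postings.nextPosition();
      BytesRef payload = postings.getPayload(); // replaces hasPayload() check
      if (payload != null) {
        // bytes live in payload.bytes[payload.offset .. offset + length)
      }
    }
  }
}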


@ -18,15 +18,16 @@ package org.apache.lucene.index;
*/
import java.io.IOException;
import java.util.Iterator;
/** Flex API for access to fields and terms
* @lucene.experimental */
public abstract class Fields {
public abstract class Fields implements Iterable<String> {
/** Returns an iterator that will step through all field
* names. This will not return null. */
public abstract FieldsEnum iterator() throws IOException;
public abstract Iterator<String> iterator();
/** Get the {@link Terms} for this field. This will return
* null if the field does not exist. */
@ -45,12 +46,7 @@ public abstract class Fields {
// TODO: deprecate?
public long getUniqueTermCount() throws IOException {
long numTerms = 0;
FieldsEnum it = iterator();
while(true) {
String field = it.next();
if (field == null) {
break;
}
for (String field : this) {
Terms terms = terms(field);
if (terms != null) {
final long termCount = terms.size();
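Since Fields now implements Iterable<String>, the enhanced for loop above replaces the old FieldsEnum next()/terms() dance; the same pattern appears in the CheckIndex changes in this commit. A minimal sketch over the public API:
// Sketch: walk all fields through the new Iterable<String> contract.
static long countAllTerms(Fields fields) throws IOException {
  long total = 0;
  for (String field : fields) {          // no more FieldsEnum
    Terms terms = fields.terms(field);   // may be null for a termless field
    if (terms != null && terms.size() != -1) {
      total += terms.size();
    }
  }
  return total;
}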


@ -1,79 +0,0 @@
package org.apache.lucene.index;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.util.AttributeSource;
/** Enumerates indexed fields. You must first call {@link
* #next} before calling {@link #terms}.
*
* @lucene.experimental */
public abstract class FieldsEnum {
// TODO: maybe allow retrieving FieldInfo for current
// field, as optional method?
private AttributeSource atts = null;
/**
* Returns the related attributes.
*/
public AttributeSource attributes() {
if (atts == null) {
atts = new AttributeSource();
}
return atts;
}
/** Increments the enumeration to the next field. Returns
* null when there are no more fields.*/
public abstract String next() throws IOException;
// TODO: would be nice to require/fix all impls so they
// never return null here... we have to fix the writers to
// never write 0-terms fields... or maybe allow a non-null
// Terms instance in just this case
/** Get {@link Terms} for the current field. After {@link #next} returns
* null this method should not be called. This method may
* return null in some cases, which means the provided
* field does not have any terms. */
public abstract Terms terms() throws IOException;
// TODO: should we allow pulling Terms as well? not just
// the iterator?
public final static FieldsEnum[] EMPTY_ARRAY = new FieldsEnum[0];
/** Provides zero fields */
public final static FieldsEnum EMPTY = new FieldsEnum() {
@Override
public String next() {
return null;
}
@Override
public Terms terms() {
throw new IllegalStateException("this method should never be called");
}
};
}


@ -24,6 +24,7 @@ import org.apache.lucene.util.automaton.CompiledAutomaton;
import java.io.IOException;
import java.util.Comparator;
import java.util.Iterator;
/** A <code>FilterAtomicReader</code> contains another AtomicReader, which it
* uses as its basic source of data, possibly transforming the data along the
@ -46,7 +47,7 @@ public class FilterAtomicReader extends AtomicReader {
}
@Override
public FieldsEnum iterator() throws IOException {
public Iterator<String> iterator() {
return in.iterator();
}
@ -109,28 +110,20 @@ public class FilterAtomicReader extends AtomicReader {
public TermsEnum intersect(CompiledAutomaton automaton, BytesRef bytes) throws java.io.IOException {
return in.intersect(automaton, bytes);
}
}
/** Base class for filtering {@link TermsEnum} implementations. */
public static class FilterFieldsEnum extends FieldsEnum {
protected final FieldsEnum in;
public FilterFieldsEnum(FieldsEnum in) {
this.in = in;
@Override
public boolean hasOffsets() {
return in.hasOffsets();
}
@Override
public String next() throws IOException {
return in.next();
}
@Override
public Terms terms() throws IOException {
return in.terms();
public boolean hasPositions() {
return in.hasPositions();
}
@Override
public AttributeSource attributes() {
return in.attributes();
public boolean hasPayloads() {
return in.hasPayloads();
}
}
@ -292,11 +285,6 @@ public class FilterAtomicReader extends AtomicReader {
public BytesRef getPayload() throws IOException {
return in.getPayload();
}
@Override
public boolean hasPayload() {
return in.hasPayload();
}
@Override
public AttributeSource attributes() {


@ -173,7 +173,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
postings.lastDocCodes[termID] = docState.docID;
} else {
postings.lastDocCodes[termID] = docState.docID << 1;
postings.docFreqs[termID] = 1;
postings.termFreqs[termID] = 1;
if (hasProx) {
writeProx(termID, fieldState.position);
if (hasOffsets) {
@ -194,10 +194,10 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
assert !hasFreq || postings.docFreqs[termID] > 0;
assert !hasFreq || postings.termFreqs[termID] > 0;
if (!hasFreq) {
assert postings.docFreqs == null;
assert postings.termFreqs == null;
if (docState.docID != postings.lastDocIDs[termID]) {
assert docState.docID > postings.lastDocIDs[termID];
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
@ -212,13 +212,13 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
// Now that we know doc freq for previous doc,
// write it & lastDocCode
if (1 == postings.docFreqs[termID]) {
if (1 == postings.termFreqs[termID]) {
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]|1);
} else {
termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]);
termsHashPerField.writeVInt(0, postings.docFreqs[termID]);
termsHashPerField.writeVInt(0, postings.termFreqs[termID]);
}
postings.docFreqs[termID] = 1;
postings.termFreqs[termID] = 1;
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
postings.lastDocIDs[termID] = docState.docID;
@ -233,7 +233,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
}
fieldState.uniqueTermCount++;
} else {
fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.docFreqs[termID]);
fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.termFreqs[termID]);
if (hasProx) {
writeProx(termID, fieldState.position-postings.lastPositions[termID]);
}
@ -252,7 +252,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
public FreqProxPostingsArray(int size, boolean writeFreqs, boolean writeProx, boolean writeOffsets) {
super(size);
if (writeFreqs) {
docFreqs = new int[size];
termFreqs = new int[size];
}
lastDocIDs = new int[size];
lastDocCodes = new int[size];
@ -267,7 +267,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
//System.out.println("PA init freqs=" + writeFreqs + " pos=" + writeProx + " offs=" + writeOffsets);
}
int docFreqs[]; // # times this term occurs in the current doc
int termFreqs[]; // # times this term occurs in the current doc
int lastDocIDs[]; // Last docID where this term occurred
int lastDocCodes[]; // Code for prior doc
int lastPositions[]; // Last position where this term occurred
@ -275,7 +275,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
@Override
ParallelPostingsArray newInstance(int size) {
return new FreqProxPostingsArray(size, docFreqs != null, lastPositions != null, lastOffsets != null);
return new FreqProxPostingsArray(size, termFreqs != null, lastPositions != null, lastOffsets != null);
}
@Override
@ -295,9 +295,9 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
assert to.lastOffsets != null;
System.arraycopy(lastOffsets, 0, to.lastOffsets, 0, numToCopy);
}
if (docFreqs != null) {
assert to.docFreqs != null;
System.arraycopy(docFreqs, 0, to.docFreqs, 0, numToCopy);
if (termFreqs != null) {
assert to.termFreqs != null;
System.arraycopy(termFreqs, 0, to.termFreqs, 0, numToCopy);
}
}
@ -310,7 +310,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
if (lastOffsets != null) {
bytes += RamUsageEstimator.NUM_BYTES_INT;
}
if (docFreqs != null) {
if (termFreqs != null) {
bytes += RamUsageEstimator.NUM_BYTES_INT;
}
@ -416,21 +416,21 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
// Now termStates has numToMerge FieldMergeStates
// which all share the same term. Now we must
// interleave the docID streams.
int numDocs = 0;
int docFreq = 0;
long totTF = 0;
int docID = 0;
while(true) {
//System.out.println(" cycle");
final int termDocFreq;
final int termFreq;
if (freq.eof()) {
if (postings.lastDocCodes[termID] != -1) {
// Return last doc
docID = postings.lastDocIDs[termID];
if (readTermFreq) {
termDocFreq = postings.docFreqs[termID];
termFreq = postings.termFreqs[termID];
} else {
termDocFreq = -1;
termFreq = -1;
}
postings.lastDocCodes[termID] = -1;
} else {
@ -441,20 +441,20 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
final int code = freq.readVInt();
if (!readTermFreq) {
docID += code;
termDocFreq = -1;
termFreq = -1;
} else {
docID += code >>> 1;
if ((code & 1) != 0) {
termDocFreq = 1;
termFreq = 1;
} else {
termDocFreq = freq.readVInt();
termFreq = freq.readVInt();
}
}
assert docID != postings.lastDocIDs[termID];
}
numDocs++;
docFreq++;
assert docID < state.segmentInfo.getDocCount(): "doc=" + docID + " maxDoc=" + state.segmentInfo.getDocCount();
// NOTE: we could check here if the docID was
@ -469,7 +469,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
// 2nd sweep does the real flush, but I suspect
// that'd add too much time to flush.
visitedDocs.set(docID);
postingsConsumer.startDoc(docID, writeTermFreq ? termDocFreq : -1);
postingsConsumer.startDoc(docID, writeTermFreq ? termFreq : -1);
if (docID < delDocLimit) {
// Mark it deleted. TODO: we could also skip
// writing its postings; this would be
@ -485,7 +485,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
}
}
totTF += termDocFreq;
totTF += termFreq;
// Carefully copy over the prox + payload info,
// changing the format to match Lucene's segment
@ -495,7 +495,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
// we did record positions (& maybe payload) and/or offsets
int position = 0;
int offset = 0;
for(int j=0;j<termDocFreq;j++) {
for(int j=0;j<termFreq;j++) {
final BytesRef thisPayload;
if (readPositions) {
@ -542,9 +542,9 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
}
postingsConsumer.finishDoc();
}
termsConsumer.finishTerm(text, new TermStats(numDocs, writeTermFreq ? totTF : -1));
termsConsumer.finishTerm(text, new TermStats(docFreq, writeTermFreq ? totTF : -1));
sumTotalTermFreq += totTF;
sumDocFreq += numDocs;
sumDocFreq += docFreq;
}
termsConsumer.finish(writeTermFreq ? sumTotalTermFreq : -1, sumDocFreq, visitedDocs.cardinality());
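To make the renamed stats concrete: if a term appears 3 times in doc 1 and once in doc 7, the flush loop above sees termFreq = 3 and then termFreq = 1, increments docFreq to 2, and accumulates totTF = 4; docFreq feeds sumDocFreq while totTF feeds sumTotalTermFreq.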


@ -2312,9 +2312,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit {
}
SegmentInfos sis = new SegmentInfos(); // read infos from dir
sis.read(dir);
final Set<String> dsFilesCopied = new HashSet<String>();
final Map<String, String> dsNames = new HashMap<String, String>();
final Set<String> copiedFiles = new HashSet<String>();
for (SegmentInfoPerCommit info : sis) {
assert !infos.contains(info): "dup info dir=" + info.info.dir + " name=" + info.info.name;
@ -2327,7 +2325,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit {
IOContext context = new IOContext(new MergeInfo(info.info.getDocCount(), info.info.sizeInBytes(), true, -1));
infos.add(copySegmentAsIs(info, newSegName, dsNames, dsFilesCopied, context, copiedFiles));
infos.add(copySegmentAsIs(info, newSegName, context));
}
}
@ -2463,25 +2461,9 @@ public class IndexWriter implements Closeable, TwoPhaseCommit {
}
/** Copies the segment files as-is into the IndexWriter's directory. */
// TODO: this can be substantially simplified now that 3.x support/shared docstores is removed!
private SegmentInfoPerCommit copySegmentAsIs(SegmentInfoPerCommit info, String segName,
Map<String, String> dsNames, Set<String> dsFilesCopied, IOContext context,
Set<String> copiedFiles)
private SegmentInfoPerCommit copySegmentAsIs(SegmentInfoPerCommit info, String segName, IOContext context)
throws IOException {
// Determine if the doc store of this segment needs to be copied. It's
// only relevant for segments that share doc store with others,
// because the DS might have been copied already, in which case we
// just want to update the DS name of this SegmentInfo.
final String dsName = info.info.name;
assert dsName != null;
final String newDsName;
if (dsNames.containsKey(dsName)) {
newDsName = dsNames.get(dsName);
} else {
dsNames.put(dsName, segName);
newDsName = segName;
}
// note: we don't really need this fis (its copied), but we load it up
// so we don't pass a null value to the si writer
FieldInfos fis = getFieldInfos(info.info);
@ -2496,7 +2478,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit {
}
//System.out.println("copy seg=" + info.info.name + " version=" + info.info.getVersion());
// Same SI as before but we change directory, name and docStoreSegment:
// Same SI as before but we change directory and name
SegmentInfo newInfo = new SegmentInfo(directory, info.info.getVersion(), segName, info.info.getDocCount(),
info.info.getUseCompoundFile(),
info.info.getCodec(), info.info.getDiagnostics(), attributes);
@ -2513,16 +2495,10 @@ public class IndexWriter implements Closeable, TwoPhaseCommit {
}
newInfo.setFiles(segFiles);
// We must rewrite the SI file because it references
// segment name (its own name, if its 3.x, and doc
// store segment name):
// We must rewrite the SI file because it references segment name in its list of files, etc
TrackingDirectoryWrapper trackingDir = new TrackingDirectoryWrapper(directory);
try {
newInfo.getCodec().segmentInfoFormat().getSegmentInfoWriter().write(trackingDir, newInfo, fis, context);
} catch (UnsupportedOperationException uoe) {
// OK: 3x codec cannot write a new SI file;
// SegmentInfos will write this on commit
}
newInfo.getCodec().segmentInfoFormat().getSegmentInfoWriter().write(trackingDir, newInfo, fis, context);
final Collection<String> siFiles = trackingDir.getCreatedFiles();
@ -2537,8 +2513,7 @@ public class IndexWriter implements Closeable, TwoPhaseCommit {
}
assert !directory.fileExists(newFileName): "file \"" + newFileName + "\" already exists; siFiles=" + siFiles;
assert !copiedFiles.contains(file): "file \"" + file + "\" is being copied more than once";
copiedFiles.add(file);
info.info.dir.copy(directory, file, newFileName, context);
}
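copySegmentAsIs() backs IndexWriter.addIndexes(Directory...); with the 3.x shared-doc-store bookkeeping gone, it reduces to a per-segment file copy plus a rewritten SI file. A hedged caller-side sketch; the directories and analyzer are placeholders:
// Sketch: the addIndexes(Directory...) path served by the code above.
IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_40, analyzer);
IndexWriter writer = new IndexWriter(destDir, conf);
writer.addIndexes(srcDir1, srcDir2);  // each source segment is copied as-is
writer.commit();
writer.close();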


@ -42,6 +42,9 @@ public interface IndexableFieldType {
/** True if term vector positions should be indexed */
public boolean storeTermVectorPositions();
/** True if term vector payloads should be indexed */
public boolean storeTermVectorPayloads();
/** True if norms should not be indexed */
public boolean omitNorms();


@ -199,6 +199,7 @@ public class MergeState {
// and we could make a codec(wrapper) to do all of this privately so IW is uninvolved
public PayloadProcessorProvider payloadProcessorProvider;
public ReaderPayloadProcessor[] readerPayloadProcessor;
public ReaderPayloadProcessor currentReaderPayloadProcessor;
public PayloadProcessor[] currentPayloadProcessor;
// TODO: get rid of this? it tells you which segments are 'aligned' (e.g. for bulk merging)
