SOLR-2452: merged with trunk up to r1144161; applied the svn movement script and the latest version of the post-svn-movement patch

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/solr2452@1144174 13f79535-47bb-0310-9956-ffa450edef68
Steven Rowe 2011-07-08 06:41:23 +00:00
commit 4505c08643
211 changed files with 4993 additions and 2547 deletions

View File

@@ -51,13 +51,13 @@
<classpathentry kind="src" path="modules/queries/src/test"/>
<classpathentry kind="src" path="modules/suggest/src/java"/>
<classpathentry kind="src" path="modules/suggest/src/test"/>
<classpathentry kind="src" path="solr/src/java"/>
<classpathentry kind="src" path="solr/src/webapp/src"/>
<classpathentry kind="src" path="solr/src/common"/>
<classpathentry kind="src" path="solr/src/solrj"/>
<classpathentry kind="src" path="solr/src/test-framework"/>
<classpathentry kind="src" path="solr/src/test"/>
<classpathentry kind="src" path="solr/src/test-files"/>
<classpathentry kind="src" path="solr/core/src/java"/>
<classpathentry kind="src" path="solr/core/src/test"/>
<classpathentry kind="src" path="solr/core/src/test-files"/>
<classpathentry kind="src" path="solr/solrj/src/java"/>
<classpathentry kind="src" path="solr/solrj/src/test"/>
<classpathentry kind="src" path="solr/solrj/src/test-files"/>
<classpathentry kind="src" path="solr/test-framework/src/java"/>
<classpathentry kind="src" path="solr/contrib/analysis-extras/src/java"/>
<classpathentry kind="src" path="solr/contrib/analysis-extras/src/test"/>
<classpathentry kind="src" path="solr/contrib/analysis-extras/src/test-files"/>

View File

@@ -25,11 +25,15 @@
<buildFile url="file://$PROJECT_DIR$/modules/join/build.xml" />
<buildFile url="file://$PROJECT_DIR$/modules/suggest/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/core/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/analysis-extras/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/clustering/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/dataimporthandler/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/dataimporthandler-extras/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/extraction/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/contrib/uima/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/solrj/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/test-framework/build.xml" />
<buildFile url="file://$PROJECT_DIR$/solr/webapp/build.xml" />
</component>
</project>

View File

@@ -10,7 +10,7 @@
<module filepath="$PROJECT_DIR$/lucene/contrib/memory/memory.iml" />
<module filepath="$PROJECT_DIR$/lucene/contrib/misc/misc.iml" />
<module filepath="$PROJECT_DIR$/lucene/contrib/queries/queries-contrib.iml" />
<module filepath="$PROJECT_DIR$/lucene/contrib/queryparser/queryparser.iml" />
<module filepath="$PROJECT_DIR$/lucene/contrib/queryparser/queryparser-contrib.iml" />
<module filepath="$PROJECT_DIR$/lucene/contrib/spatial/spatial.iml" />
<module filepath="$PROJECT_DIR$/lucene/contrib/wordnet/wordnet.iml" />
<module filepath="$PROJECT_DIR$/lucene/contrib/xml-query-parser/xml-query-parser.iml" />

View File

@@ -121,7 +121,7 @@
<option name="TEST_SEARCH_SCOPE"><value defaultName="singleModule" /></option>
</configuration>
<configuration default="false" name="queryparser contrib" type="JUnit" factoryName="JUnit">
<module name="queryparser" />
<module name="queryparser-contrib" />
<option name="TEST_OBJECT" value="package" />
<option name="WORKING_DIRECTORY" value="file://$PROJECT_DIR$/lucene/build/contrib/queryparser" />
<option name="VM_PARAMETERS" value="-ea -DtempDir=temp" />

View File

@@ -1,8 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_6" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/../../build/contrib/analysis-extras/classes" />
<output-test url="file://$MODULE_DIR$/../../build/contrib/analysis-extras/test-classes" />
<output url="file://$MODULE_DIR$/../../build/contrib/analysis-extras/classes/java" />
<output-test url="file://$MODULE_DIR$/../../build/contrib/analysis-extras/classes/test" />
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />

View File

@@ -1,8 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_6" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/../../build/contrib/clustering/classes" />
<output-test url="file://$MODULE_DIR$/../../build/contrib/clustering/test-classes" />
<output url="file://$MODULE_DIR$/../../build/contrib/clustering/classes/java" />
<output-test url="file://$MODULE_DIR$/../../build/contrib/clustering/classes/test" />
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />

View File

@@ -1,8 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_6" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/../../build/contrib/dataimporthandler-extras/classes" />
<output-test url="file://$MODULE_DIR$/../../build/contrib/dataimporthandler-extras/test-classes" />
<output url="file://$MODULE_DIR$/../../build/contrib/dataimporthandler-extras/classes/java" />
<output-test url="file://$MODULE_DIR$/../../build/contrib/dataimporthandler-extras/classes/test" />
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />

View File

@@ -1,8 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_6" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/../../build/contrib/dataimporthandler/classes" />
<output-test url="file://$MODULE_DIR$/../../build/contrib/dataimporthandler/test-classes" />
<output url="file://$MODULE_DIR$/../../build/contrib/dataimporthandler/classes/java" />
<output-test url="file://$MODULE_DIR$/../../build/contrib/dataimporthandler/classes/test" />
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />

View File

@@ -1,8 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_6" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/../../build/contrib/extraction/classes" />
<output-test url="file://$MODULE_DIR$/../../build/contrib/extraction/test-classes" />
<output url="file://$MODULE_DIR$/../../build/contrib/extraction/classes/java" />
<output-test url="file://$MODULE_DIR$/../../build/contrib/extraction/classes/test" />
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />

View File

@@ -1,8 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_6" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/../../build/contrib/uima/classes" />
<output-test url="file://$MODULE_DIR$/../../build/contrib/uima/test-classes" />
<output url="file://$MODULE_DIR$/../../build/contrib/uima/classes/java" />
<output-test url="file://$MODULE_DIR$/../../build/contrib/uima/classes/test" />
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />

View File

@@ -1,18 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_6" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/build/solr" />
<output-test url="file://$MODULE_DIR$/build/tests" />
<output url="file://$MODULE_DIR$/../build/solr-idea/classes/java" />
<output-test url="file://$MODULE_DIR$/../build/solr-idea/classes/test" />
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/solrj" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/common" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/webapp/src" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/webapp/web" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
<sourceFolder url="file://$MODULE_DIR$/src/test-files" isTestSource="true" />
<sourceFolder url="file://$MODULE_DIR$/src/test-framework" isTestSource="true" />
<sourceFolder url="file://$MODULE_DIR$/core/src/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/core/src/test" isTestSource="true" />
<sourceFolder url="file://$MODULE_DIR$/core/src/test-files" isTestSource="true" />
<sourceFolder url="file://$MODULE_DIR$/solrj/src/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/solrj/src/test" isTestSource="true" />
<sourceFolder url="file://$MODULE_DIR$/solrj/src/test-files" isTestSource="true" />
<sourceFolder url="file://$MODULE_DIR$/test-framework/src/java" isTestSource="true" />
<excludeFolder url="file://$MODULE_DIR$/build" />
<excludeFolder url="file://$MODULE_DIR$/dist" />
<excludeFolder url="file://$MODULE_DIR$/package" />

View File

@@ -27,7 +27,7 @@
<relativePath>../../pom.xml</relativePath>
</parent>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<artifactId>lucene-queryparser-contrib</artifactId>
<packaging>jar</packaging>
<name>Lucene Query Parser</name>
<description>

View File

@@ -91,8 +91,8 @@
</dependencies>
<build>
<directory>${build-directory}</directory>
<outputDirectory>${build-directory}/classes</outputDirectory>
<testOutputDirectory>${build-directory}/test-classes</testOutputDirectory>
<outputDirectory>${build-directory}/classes/java</outputDirectory>
<testOutputDirectory>${build-directory}/classes/test</testOutputDirectory>
<sourceDirectory>src/java</sourceDirectory>
<testSourceDirectory>src/test</testSourceDirectory>
<testResources>
@@ -100,7 +100,7 @@
<directory>src/test-files</directory>
</testResource>
<testResource>
<directory>../../src/test-files</directory>
<directory>../../core/src/test-files</directory>
</testResource>
</testResources>
<plugins>

View File

@@ -116,8 +116,8 @@
</dependencies>
<build>
<directory>${build-directory}</directory>
<outputDirectory>${build-directory}/classes</outputDirectory>
<testOutputDirectory>${build-directory}/test-classes</testOutputDirectory>
<outputDirectory>${build-directory}/classes/java</outputDirectory>
<testOutputDirectory>${build-directory}/classes/test</testOutputDirectory>
<sourceDirectory>src/java</sourceDirectory>
<testSourceDirectory>src/test</testSourceDirectory>
<testResources>
@@ -125,7 +125,7 @@
<directory>src/test-files</directory>
</testResource>
<testResource>
<directory>../../src/test-files</directory>
<directory>../../core/src/test-files</directory>
</testResource>
</testResources>
<plugins>

View File

@@ -102,8 +102,8 @@
</dependencies>
<build>
<directory>${build-directory}</directory>
<outputDirectory>${build-directory}/classes</outputDirectory>
<testOutputDirectory>${build-directory}/test-classes</testOutputDirectory>
<outputDirectory>${build-directory}/classes/java</outputDirectory>
<testOutputDirectory>${build-directory}/classes/test</testOutputDirectory>
<sourceDirectory>src/java</sourceDirectory>
<testSourceDirectory>src/test</testSourceDirectory>
<testResources>
@@ -111,7 +111,7 @@
<directory>src/test-files</directory>
</testResource>
<testResource>
<directory>../../src/test-files</directory>
<directory>../../core/src/test-files</directory>
</testResource>
</testResources>
<plugins>

View File

@@ -99,8 +99,8 @@
</dependencies>
<build>
<directory>${build-directory}</directory>
<outputDirectory>${build-directory}/classes</outputDirectory>
<testOutputDirectory>${build-directory}/test-classes</testOutputDirectory>
<outputDirectory>${build-directory}/classes/java</outputDirectory>
<testOutputDirectory>${build-directory}/classes/test</testOutputDirectory>
<sourceDirectory>src/java</sourceDirectory>
<testSourceDirectory>src/test</testSourceDirectory>
<testResources>
@@ -108,7 +108,7 @@
<directory>src/test-files</directory>
</testResource>
<testResource>
<directory>../../src/test-files</directory>
<directory>../../core/src/test-files</directory>
</testResource>
</testResources>
<plugins>

View File

@@ -91,8 +91,8 @@
</dependencies>
<build>
<directory>${build-directory}</directory>
<outputDirectory>${build-directory}/classes</outputDirectory>
<testOutputDirectory>${build-directory}/test-classes</testOutputDirectory>
<outputDirectory>${build-directory}/classes/java</outputDirectory>
<testOutputDirectory>${build-directory}/classes/test</testOutputDirectory>
<sourceDirectory>src/java</sourceDirectory>
<testSourceDirectory>src/test</testSourceDirectory>
<testResources>
@@ -100,7 +100,7 @@
<directory>src/test-files</directory>
</testResource>
<testResource>
<directory>../../src/test-files</directory>
<directory>../../core/src/test-files</directory>
</testResource>
</testResources>
<plugins>

View File

@@ -103,8 +103,8 @@
</dependencies>
<build>
<directory>${build-directory}</directory>
<outputDirectory>${build-directory}/classes</outputDirectory>
<testOutputDirectory>${build-directory}/test-classes</testOutputDirectory>
<outputDirectory>${build-directory}/classes/java</outputDirectory>
<testOutputDirectory>${build-directory}/classes/test</testOutputDirectory>
<sourceDirectory>src/java</sourceDirectory>
<testSourceDirectory>src/test</testSourceDirectory>
<resources>

View File

@@ -33,7 +33,7 @@
<description>Apache Solr Core</description>
<properties>
<module-directory>solr</module-directory>
<build-directory>../build</build-directory>
<build-directory>../build/solr-maven</build-directory>
</properties>
<dependencies>
<dependency>
@@ -184,34 +184,19 @@
</dependencies>
<build>
<directory>${build-directory}</directory>
<outputDirectory>${build-directory}/solr</outputDirectory>
<testOutputDirectory>${build-directory}/tests</testOutputDirectory>
<sourceDirectory>java</sourceDirectory>
<testSourceDirectory>test</testSourceDirectory>
<outputDirectory>${build-directory}/classes/java</outputDirectory>
<testOutputDirectory>${build-directory}/classes/test</testOutputDirectory>
<sourceDirectory>src/java</sourceDirectory>
<testSourceDirectory>src/test</testSourceDirectory>
<testResources>
<testResource>
<directory>test-files</directory>
<directory>src/test-files</directory>
</testResource>
<testResource>
<directory>../solrj/src/test-files</directory>
</testResource>
</testResources>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<executions>
<execution>
<id>add-source</id>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>webapp/src</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
@@ -257,14 +242,15 @@
<artifactId>build-helper-maven-plugin</artifactId>
<executions>
<execution>
<id>add-test-source</id>
<id>add-solrj-and-test-framework</id>
<phase>generate-test-sources</phase>
<goals>
<goal>add-test-source</goal>
</goals>
<configuration>
<sources>
<source>test-framework</source>
<source>../test-framework/src/java</source>
<source>../solrj/src/test</source> <!-- solrj tests are run from solr-core -->
</sources>
</configuration>
</execution>

View File

@@ -32,10 +32,10 @@
<name>Apache Solr parent POM</name>
<description>Apache Solr parent POM</description>
<modules>
<module>src</module>
<module>src/solrj</module>
<module>src/webapp</module>
<module>src/test-framework</module>
<module>core</module>
<module>solrj</module>
<module>webapp</module>
<module>test-framework</module>
<module>contrib</module>
</modules>
<properties>

View File

@@ -24,7 +24,7 @@
<groupId>org.apache.solr</groupId>
<artifactId>solr-parent</artifactId>
<version>@version@</version>
<relativePath>../../pom.xml</relativePath>
<relativePath>../pom.xml</relativePath>
</parent>
<groupId>org.apache.solr</groupId>
<artifactId>solr-solrj</artifactId>
@@ -32,8 +32,8 @@
<name>Apache Solr Solrj</name>
<description>Apache Solr Solrj</description>
<properties>
<module-directory>solr/src/solrj</module-directory>
<build-directory>../../build/solrj</build-directory>
<module-directory>solr/solrj</module-directory>
<build-directory>../build/solr-solrj</build-directory>
</properties>
<dependencies>
<dependency>
@@ -41,6 +41,12 @@
<artifactId>lucene-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-test-framework</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
@@ -80,37 +86,17 @@
</dependencies>
<build>
<directory>${build-directory}</directory>
<outputDirectory>${build-directory}/classes</outputDirectory>
<sourceDirectory>.</sourceDirectory>
<outputDirectory>${build-directory}/classes/java</outputDirectory>
<testOutputDirectory/>
<sourceDirectory>src/java</sourceDirectory>
<testSourceDirectory/>
<testResources/>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>build-helper-maven-plugin</artifactId>
<executions>
<execution>
<id>add-source</id>
<phase>generate-sources</phase>
<goals>
<goal>add-source</goal>
</goals>
<configuration>
<sources>
<source>../common</source>
</sources>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<configuration>
<!-- Solrj tests are run from the solr-core build; -->
<!-- otherwise there would be a cyclic dependency, -->
<!-- since Solrj's tests depend on solr-core, and -->
<!-- solr-core depends on Solrj. -->
<skipTests>true</skipTests>
<skip>true</skip> <!-- Tests are run from solr-core module -->
</configuration>
</plugin>
</plugins>

View File

@@ -24,7 +24,7 @@
<groupId>org.apache.solr</groupId>
<artifactId>solr-parent</artifactId>
<version>@version@</version>
<relativePath>../../pom.xml</relativePath>
<relativePath>../pom.xml</relativePath>
</parent>
<groupId>org.apache.solr</groupId>
<artifactId>solr-test-framework</artifactId>
@@ -32,8 +32,8 @@
<name>Apache Solr Test Framework</name>
<description>Apache Solr Test Framework</description>
<properties>
<module-directory>solr/src/test-framework</module-directory>
<build-directory>../../build</build-directory>
<module-directory>solr/test-framework</module-directory>
<build-directory>../build/solr-test-framework</build-directory>
</properties>
<dependencies>
<dependency>
@@ -53,8 +53,10 @@
</dependencies>
<build>
<directory>${build-directory}</directory>
<outputDirectory>${build-directory}/classes/test-framework</outputDirectory>
<sourceDirectory>.</sourceDirectory>
<outputDirectory>${build-directory}/classes/java</outputDirectory>
<testOutputDirectory>${build-directory}/classes/test</testOutputDirectory>
<sourceDirectory>src/java</sourceDirectory>
<testSourceDirectory/>
<resources>
<resource>
<directory>.</directory>

View File

@@ -24,7 +24,7 @@
<groupId>org.apache.solr</groupId>
<artifactId>solr-parent</artifactId>
<version>@version@</version>
<relativePath>../../pom.xml</relativePath>
<relativePath>../pom.xml</relativePath>
</parent>
<groupId>org.apache.solr</groupId>
<artifactId>solr</artifactId>
@@ -32,8 +32,8 @@
<name>Apache Solr Search Server</name>
<description>Apache Solr Search Server</description>
<properties>
<module-directory>solr/src/webapp</module-directory>
<build-directory>../../build/web</build-directory>
<module-directory>solr/webapp</module-directory>
<build-directory>../build</build-directory>
</properties>
<dependencies>
<dependency>
@@ -107,6 +107,11 @@
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>solr-solrj</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.codehaus.woodstox</groupId>
<artifactId>wstx-asl</artifactId>

View File

@@ -156,6 +156,12 @@ Changes in backwards compatibility policy
the queries module and can be found at o.a.l.queries.function. See MIGRATE.txt
for more information (Chris Male)
* LUCENE-2392: Decoupled vector space scoring from Query/Weight/Scorer. If you
extended Similarity directly before, you should extend TFIDFSimilarity instead.
Similarity is now a lower-level API to implement other scoring algorithms.
See MIGRATE.txt for more details.
(David Nemeskey, Simon Willnauer, Mike McCandless, Robert Muir)
Changes in Runtime Behavior
* LUCENE-2846: omitNorms now behaves like omitTermFrequencyAndPositions, if you

View File

@@ -382,3 +382,13 @@ LUCENE-1458, LUCENE-2111: Flexible Indexing
- o.a.l.search.function.ShortFieldSource -> o.a.l.queries.function.valuesource.ShortFieldSource
- o.a.l.search.function.ValueSource -> o.a.l.queries.function.ValueSource
- o.a.l.search.function.ValueSourceQuery -> o.a.l.queries.function.FunctionQuery
* LUCENE-2392: Enable flexible scoring:
The existing "Similarity" api is now TFIDFSimilarity, if you were extending
Similarity before, you should likely extend this instead.
Weight.normalize no longer takes a norm value that incorporates the top-level
boost from outer queries such as BooleanQuery, instead it takes 2 parameters,
the outer boost (topLevelBoost) and the norm. Weight.sumOfSquaredWeights has
been renamed to Weight.getValueForNormalization().

View File

@@ -286,7 +286,7 @@
<patternset refid="binary.build.dist.patterns"/>
</zipfileset>
</zip>
<lucene-checksum file="${dist.dir}/lucene-${version}.zip"/>
<make-checksums file="${dist.dir}/lucene-${version}.zip"/>
</target>
<!-- ================================================================== -->
@@ -311,7 +311,7 @@
<gzip zipfile="${dist.dir}/lucene-${version}.tgz"
src="${build.dir}/lucene-${version}.tar"
/>
<lucene-checksum file="${dist.dir}/lucene-${version}.tgz"/>
<make-checksums file="${dist.dir}/lucene-${version}.tgz"/>
</target>
<!-- ================================================================== -->
@@ -340,70 +340,21 @@
<mkdir dir="${maven.dist.dir}"/>
</target>
<property name="svn.export.dir" location="${build.dir}/svn-export"/>
<!-- ================================================================== -->
<!-- Packages the sources from "svn export" with tar-gzip -->
<!-- ================================================================== -->
<target name="package-tgz-src" depends="init, init-dist, svn-export-source"
description="--> Generates the Lucene distribution as .tgz">
<target name="package-tgz-src" depends="init, init-dist"
description="--> Generates the Lucene source distribution from 'svn export' as .tgz">
<property name="source.package.file"
value="${dist.dir}/lucene-${version}-src.tgz"/>
<delete file="${source.package.file}"/>
<svn-export-source source.dir="."/>
<build-changes changes.src.dir="${svn.export.dir}/src/site/changes"
changes.target.dir="${svn.export.dir}/docs/changes"/>
<tar tarfile="${source.package.file}" compression="gzip" longfile="gnu">
<tarfileset prefix="lucene-${version}" dir="${svn.export.dir}"/>
</tar>
<lucene-checksum file="${source.package.file}"/>
</target>
<!-- ================================================================== -->
<!-- Runs "svn export" in build/svn-export/ with the same URL -->
<!-- and revision as the current working copy. -->
<!-- ================================================================== -->
<target name="svn-export-source" depends="get-svn-info">
<delete dir="${svn.export.dir}" includeemptydirs="true" failonerror="false"/>
<mkdir dir="${build.dir}"/>
<exec dir="." executable="${svn.exe}" failonerror="true">
<arg value="export"/>
<arg value="--native-eol"/>
<arg value="LF"/>
<arg value="-r"/>
<arg value="${svn.Revision}"/>
<arg value="${svn.URL}"/>
<arg value="${svn.export.dir}"/>
</exec>
<build-changes changes.src.dir="${svn.export.dir}/src/site/changes" changes.target.dir="${svn.export.dir}/docs/changes"/>
</target>
<!-- ================================================================== -->
<!-- Populates properties svn.URL and svn.Revision using "svn info". -->
<!-- ================================================================== -->
<target name="get-svn-info" depends="check-svn">
<exec dir="." executable="${svn.exe}" outputproperty="svn.info" failonerror="true">
<arg value="info"/>
</exec>
<loadproperties>
<propertyresource name="svn.info"/>
<filterchain>
<linecontainsregexp>
<regexp pattern="(URL|Revision):"/>
</linecontainsregexp>
<replacestring from=": " to="="/>
<prefixlines prefix="svn."/>
</filterchain>
</loadproperties>
</target>
<target name="check-svn">
<sequential>
<exec dir="." executable="${svnversion.exe}" outputproperty="svn.ver"/>
<fail message="A subversion checkout is required for this target">
<condition>
<equals arg1="${svn.ver}" arg2="exported"/>
</condition>
</fail>
</sequential>
<make-checksums file="${source.package.file}"/>
</target>
<!-- ================================================================== -->
@@ -413,28 +364,8 @@
<target name="dist-all" depends="dist, dist-src"/>
<property name="rc" value="rc0"/>
<property name="remote.staging.dir" value="public_html/staging_area/${rc}/${version}"/>
<property name="keyfile" value="${user.home}/.ssh/id_rsa"/>
<property name="scp.user" value="${user.name}"/>
<!--keys.dir is the location of the https://svn.apache.org/repos/asf/lucene/java/dist/ directory-->
<property name="keys.dir" value="${common.dir}/../../dist"/>
<target name="copy-to-stage">
<sshexec host="people.apache.org"
username="${scp.user}"
keyfile="${keyfile}"
command="mkdir -p ${remote.staging.dir}"/>
<echo>Uploading artifacts to ${scp.user}@people.apache.org:${remote.staging.dir}</echo>
<scp todir="${scp.user}@people.apache.org:${remote.staging.dir}"
username="${scp.user}"
keyfile="${keyfile}"
verbose="true"
>
<fileset dir="dist"/>
<fileset dir="${keys.dir}">
<include name="KEYS"/>
</fileset>
</scp>
<copy-to-stage-macro artifacts.dir="${dist.dir}"/>
</target>
<target name="prepare-release" depends="clean, dist-all, generate-maven-artifacts, sign-artifacts"/>
@@ -477,43 +408,8 @@
<!-- ================================================================== -->
<!-- support for signing the artifacts using gpg -->
<!-- ================================================================== -->
<target name="clean-dist-signatures">
<delete failonerror="false">
<fileset dir="${dist.dir}">
<include name="**/*.asc"/>
</fileset>
</delete>
</target>
<target name="sign-artifacts" depends="clean-dist-signatures">
<available property="gpg.input.handler" classname="org.apache.tools.ant.input.SecureInputHandler"
value="org.apache.tools.ant.input.SecureInputHandler"/>
<!--else:--><property name="gpg.input.handler" value="org.apache.tools.ant.input.DefaultInputHandler"/>
<input message="Enter GPG keystore password: >" addproperty="gpg.passphrase">
<handler classname="${gpg.input.handler}" />
</input>
<apply executable="${gpg.exe}" inputstring="${gpg.passphrase}"
dest="${dist.dir}" type="file" maxparallel="1" verbose="yes">
<arg value="--passphrase-fd"/>
<arg value="0"/>
<arg value="--batch"/>
<arg value="--armor"/>
<arg value="--default-key"/>
<arg value="${gpg.key}"/>
<arg value="--output"/>
<targetfile/>
<arg value="--detach-sig"/>
<srcfile/>
<fileset dir="${dist.dir}">
<include name="**/*.jar"/>
<include name="**/*.zip"/>
<include name="**/*.tgz"/>
<include name="**/*.pom"/>
</fileset>
<globmapper from="*" to="*.asc"/>
</apply>
<target name="sign-artifacts">
<sign-artifacts-macro artifacts.dir="${dist.dir}"/>
</target>
<!-- ================================================================== -->
@@ -602,19 +498,6 @@
</exec>
</target>
<macrodef name="contrib-crawl">
<attribute name="target" default=""/>
<attribute name="failonerror" default="true"/>
<sequential>
<subant target="@{target}" failonerror="@{failonerror}">
<property name="core.compiled" value="true"/>
<fileset dir="."
includes="contrib/*/build.xml"
/>
</subant>
</sequential>
</macrodef>
<target name="build-contrib" depends="compile-test"
description="Builds all contrib modules and their tests">
<contrib-crawl target="build-artifacts-and-tests"/>
@@ -624,16 +507,6 @@
<contrib-crawl target="test" failonerror="true"/>
</target>
<!-- Macro for building checksum files -->
<macrodef name="lucene-checksum">
<attribute name="file"/>
<sequential>
<echo>Building checksums for '@{file}'</echo>
<checksum file="@{file}" algorithm="md5" format="MD5SUM" forceoverwrite="yes" readbuffersize="65536"/>
<checksum file="@{file}" algorithm="sha1" format="MD5SUM" forceoverwrite="yes" readbuffersize="65536"/>
</sequential>
</macrodef>
<!--
compile changes.txt into an html file
-->

View File

@@ -26,7 +26,7 @@
<dirname file="${ant.file.common}" property="common.dir"/>
<property name="dev-tools.dir" value="${common.dir}/../dev-tools"/>
<property name="prettify.dir" value="${common.dir}/src/tools/prettify"/>
<property name="maven.build.dir" value="${common.dir}/build/maven"/>
<property name="maven.build.dir" value="${build.dir}/maven"/>
<!-- Give user a chance to override without editing this file
(and without typing -D each time it compiles) -->
@@ -39,6 +39,8 @@
<format property="current.year" pattern="yyyy"/>
<format property="DSTAMP" pattern="yyyy-MM-dd"/>
<format property="TSTAMP" pattern="HH:mm:ss"/>
<!-- datetime format that is safe to treat as part of a dotted version -->
<format property="dateversion" pattern="yyyy.MM.dd.HH.mm.ss" />
</tstamp>
<property name="name" value="${ant.project.name}"/>
@@ -201,6 +203,11 @@
</and>
</condition>
<propertyset id="uptodate.and.compiled.properties" dynamic="true">
<propertyref regex=".*\.uptodate$$"/>
<propertyref regex=".*\.compiled$$"/>
</propertyset>
<target name="clean"
description="Removes contents of build and dist directories">
<delete dir="${build.dir}"/>
@@ -325,13 +332,14 @@
<macrodef name="m2-deploy" description="Builds a Maven artifact">
<element name="artifact-attachments" optional="yes"/>
<attribute name="pom.xml" default="pom.xml"/>
<attribute name="jar.file" default="${build.dir}/${final.name}.jar"/>
<sequential>
<artifact:install-provider artifactId="wagon-ssh" version="1.0-beta-7"/>
<artifact:pom id="maven.project" file="@{pom.xml}"/>
<artifact:deploy file="${build.dir}/${maven.project.build.finalName}.jar">
<artifact:deploy file="@{jar.file}">
<artifact-attachments/>
<remoteRepository url="${m2.repository.url}">
<authentication username="${m2.repository.username}" privateKey="${m2.repository.private.key}"/>
<authentication username="${m2.repository.username}" privateKey="${m2.repository.private.key}" password="${m2.repository.password}"/>
</remoteRepository>
<pom refid="maven.project"/>
</artifact:deploy>
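The m2-deploy macro now takes an explicit jar.file attribute (defaulting to ${build.dir}/${final.name}.jar) instead of deriving the jar name from the POM's finalName. A hypothetical call site, with the macro's own defaults spelled out:

<!-- Illustrative only: both attribute values shown are the macro's defaults. -->
<m2-deploy pom.xml="pom.xml"
           jar.file="${build.dir}/${final.name}.jar"/>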
@@ -342,16 +350,16 @@
<attribute name="pom.xml"/>
<attribute name="jar.file"/>
<sequential>
<copy file="@{pom.xml}" tofile="${maven.build.dir}/@{pom.xml}">
<copy file="@{pom.xml}" tofile="${maven.build.dir}/pom.xml">
<filterset begintoken="@" endtoken="@">
<filter token="version" value="${version}"/>
</filterset>
</copy>
<artifact:install-provider artifactId="wagon-ssh" version="1.0-beta-7"/>
<artifact:pom id="maven.project" file="${maven.build.dir}/@{pom.xml}" />
<artifact:pom id="maven.project" file="${maven.build.dir}/pom.xml" />
<artifact:deploy file="@{jar.file}">
<remoteRepository url="${m2.repository.url}">
<authentication username="${m2.repository.username}" privateKey="${m2.repository.private.key}"/>
<authentication username="${m2.repository.username}" privateKey="${m2.repository.private.key}" password="${m2.repository.password}"/>
</remoteRepository>
<pom refid="maven.project"/>
</artifact:deploy>
@@ -359,35 +367,58 @@
</macrodef>
<macrodef name="build-manifest" description="Builds a manifest file">
<attribute name="title" default="Lucene Search Engine: ${ant.project.name}" />
<sequential>
<attribute name="title"/>
<attribute name="implementation.title"/>
<attribute name="spec.version"/>
<sequential>
<manifest file="${manifest.file}">
<!--
http://java.sun.com/j2se/1.5.0/docs/guide/jar/jar.html#JAR%20Manifest
http://java.sun.com/j2se/1.5.0/docs/guide/versioning/spec/versioning2.html
http://java.sun.com/j2se/1.5.0/docs/api/java/lang/Package.html
http://java.sun.com/j2se/1.5.0/docs/api/java/util/jar/package-summary.html
http://java.sun.com/developer/Books/javaprogramming/JAR/basics/manifest.html
-->
<!-- Don't set 'Manifest-Version' it identifies the version of the
manifest file format, and should always be 1.0 (the default)
Don't set 'Created-by' attribute, its purpose is
to identify the version of java used to build the jar,
which ant will do by default.
Ant will happily override these with bogus strings if you
tell it to, so don't.
NOTE: we don't use section info because all of our manifest data
applies to the entire jar/war ... no package specific info.
-->
<attribute name="Extension-Name" value="@{implementation.title}"/>
<attribute name="Specification-Title" value="@{title}"/>
<!-- spec version must match "digit+{.digit+}*" -->
<attribute name="Specification-Version" value="${spec.version}"/>
<attribute name="Specification-Vendor"
value="The Apache Software Foundation"/>
<attribute name="Implementation-Title" value="org.apache.lucene"/>
<!-- impl version can be any string -->
<attribute name="Implementation-Version"
value="${version} ${svnversion} - ${DSTAMP} ${TSTAMP}"/>
<attribute name="Implementation-Vendor"
value="The Apache Software Foundation"/>
<attribute name="X-Compile-Source-JDK"
value="${javac.source}"/>
<attribute name="X-Compile-Target-JDK"
value="${javac.target}"/>
</manifest>
</sequential>
<!-- spec version must match "digit+{.digit+}*" -->
<attribute name="Specification-Version" value="@{spec.version}"/>
<attribute name="Specification-Vendor"
value="The Apache Software Foundation"/>
<attribute name="Implementation-Title" value="@{implementation.title}"/>
<!-- impl version can be any string -->
<attribute name="Implementation-Version"
value="${version} ${svnversion} - ${user.name} - ${DSTAMP} ${TSTAMP}"/>
<attribute name="Implementation-Vendor"
value="The Apache Software Foundation"/>
<attribute name="X-Compile-Source-JDK" value="${javac.source}"/>
<attribute name="X-Compile-Target-JDK" value="${javac.target}"/>
</manifest>
</sequential>
</macrodef>
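Since build-manifest no longer defaults its title and now also requires implementation.title and spec.version, a direct invocation has to pass all three. A hypothetical call, mirroring the values jarify supplies below:

<build-manifest title="Lucene Search Engine: ${ant.project.name}"
                implementation.title="org.apache.lucene"
                spec.version="${spec.version}"/>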
<macrodef name="jarify" description="Builds a JAR file">
<attribute name="basedir" default="${build.dir}/classes/java"/>
<attribute name="destfile" default="${build.dir}/${final.name}.jar"/>
<attribute name="title" default="Lucene Search Engine: ${ant.project.name}"/>
<attribute name="excludes" default="**/pom.xml"/>
<element name="manifest-attributes" optional="yes"/>
<element name="metainf-includes" optional="yes"/>
<attribute name="excludes" default="**/pom.xml,**/*.iml"/>
<attribute name="metainf.source.dir" default="${common.dir}"/>
<attribute name="implementation.title" default="org.apache.lucene"/>
<attribute name="spec.version" default="${spec.version}"/>
<element name="nested" optional="true" implicit="true"/>
<sequential>
<!-- If possible, include the svnversion -->
<exec dir="." executable="${svnversion.exe}"
@@ -395,21 +426,16 @@
<arg line="."/>
</exec>
<build-manifest title="@{title}"/>
<build-manifest title="@{title}"
implementation.title="@{implementation.title}"
spec.version="@{spec.version}"/>
<jar
destfile="@{destfile}"
basedir="@{basedir}"
manifest="${manifest.file}"
excludes="@{excludes}">
<manifest>
<manifest-attributes/>
</manifest>
<metainf dir="${common.dir}">
<include name="LICENSE.txt"/>
<include name="NOTICE.txt"/>
</metainf>
<metainf-includes/>
<jar destfile="@{destfile}"
basedir="@{basedir}"
manifest="${manifest.file}"
excludes="@{excludes}">
<metainf dir="@{metainf.source.dir}" includes="LICENSE.txt,NOTICE.txt"/>
<nested />
</jar>
</sequential>
</macrodef>
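As a sketch of the reworked jarify macro (attribute values here are hypothetical), a module can override the implementation title, point META-INF inclusion at a different directory, and pass extra jar content through the implicit <nested> element, which lands inside the <jar> task:

<jarify implementation.title="org.apache.lucene.analyzers"
        metainf.source.dir="${common.dir}/..">
  <metainf dir="." includes="README.txt"/>
</jarify>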
@@ -776,7 +802,7 @@
<record name="@{destdir}/log_javadoc.txt" action="start" append="no"/>
<javadoc
overview="@{overview}"
packagenames="org.apache.lucene.*"
packagenames="org.apache.lucene.*,org.apache.solr.*"
destdir="@{destdir}"
access="${javadoc.access}"
encoding="${build.encoding}"
@@ -830,6 +856,16 @@
</sequential>
</macrodef>
<macrodef name="contrib-crawl">
<attribute name="target" default=""/>
<attribute name="failonerror" default="true"/>
<sequential>
<subant target="@{target}" failonerror="@{failonerror}" inheritall="false">
<propertyset refid="uptodate.and.compiled.properties"/>
<fileset dir="." includes="contrib/*/build.xml"/>
</subant>
</sequential>
</macrodef>
<!-- VALIDATION work -->
@@ -851,10 +887,127 @@
<target name="validate-lucene" depends="check-legal-lucene" unless="validated-lucene"/>
<!-- Generic placeholder target for if we add other validation tasks -->
<target name="validate" depends="validate-lucene"/>
<property name="svn.export.dir" location="${build.dir}/svn-export"/>
<macrodef name="svn-export-source"
description="Runs 'svn export' with the same URL and revision as the current working copy.">
<attribute name="source.dir"/>
<sequential>
<delete dir="${svn.export.dir}" includeemptydirs="true" failonerror="false"/>
<get-svn-info directory="@{source.dir}"/>
<exec dir="@{source.dir}" executable="${svn.exe}" failonerror="true">
<arg value="export"/>
<arg value="--native-eol"/>
<arg value="LF"/>
<arg value="-r"/>
<arg value="${svn.Revision}"/>
<arg value="${svn.URL}"/>
<arg value="${svn.export.dir}"/>
</exec>
</sequential>
</macrodef>
<macrodef name="get-svn-info"
description="Populates properties svn.URL and svn.Revision using 'svn info'.">
<attribute name="directory"/>
<sequential>
<exec dir="." executable="${svnversion.exe}" outputproperty="svn.ver"/>
<fail message="A subversion checkout is required for this target">
<condition>
<equals arg1="${svn.ver}" arg2="exported"/>
</condition>
</fail>
<exec dir="@{directory}" executable="${svn.exe}" outputproperty="svn.info" failonerror="true">
<arg value="info"/>
</exec>
<loadproperties>
<propertyresource name="svn.info"/>
<filterchain>
<linecontainsregexp>
<regexp pattern="(URL|Revision):"/>
</linecontainsregexp>
<replacestring from=": " to="="/>
<prefixlines prefix="svn."/>
</filterchain>
</loadproperties>
</sequential>
</macrodef>
<macrodef name="make-checksums" description="Macro for building checksum files">
<attribute name="file"/>
<sequential>
<echo>Building checksums for '@{file}'</echo>
<checksum file="@{file}" algorithm="md5" format="MD5SUM" forceoverwrite="yes" readbuffersize="65536"/>
<checksum file="@{file}" algorithm="sha1" format="MD5SUM" forceoverwrite="yes" readbuffersize="65536"/>
</sequential>
</macrodef>
<macrodef name="sign-artifacts-macro">
<attribute name="artifacts.dir"/>
<sequential>
<delete failonerror="false">
<fileset dir="@{artifacts.dir}">
<include name="**/*.asc"/>
</fileset>
</delete>
<available property="gpg.input.handler" classname="org.apache.tools.ant.input.SecureInputHandler"
value="org.apache.tools.ant.input.SecureInputHandler"/>
<!--else:--><property name="gpg.input.handler" value="org.apache.tools.ant.input.DefaultInputHandler"/>
<input message="Enter GPG keystore password: >" addproperty="gpg.passphrase">
<handler classname="${gpg.input.handler}" />
</input>
<apply executable="${gpg.exe}" inputstring="${gpg.passphrase}"
dest="@{artifacts.dir}" type="file" maxparallel="1" verbose="yes">
<arg value="--passphrase-fd"/>
<arg value="0"/>
<arg value="--batch"/>
<arg value="--armor"/>
<arg value="--default-key"/>
<arg value="${gpg.key}"/>
<arg value="--output"/>
<targetfile/>
<arg value="--detach-sig"/>
<srcfile/>
<fileset dir="@{artifacts.dir}">
<include name="**/*.jar"/>
<include name="**/*.zip"/>
<include name="**/*.tgz"/>
<include name="**/*.pom"/>
</fileset>
<globmapper from="*" to="*.asc"/>
</apply>
</sequential>
</macrodef>
<property name="rc" value="rc0"/>
<property name="remote.staging.dir" value="public_html/staging_area/${rc}/${version}"/>
<property name="keyfile" value="${user.home}/.ssh/id_rsa"/>
<property name="scp.user" value="${user.name}"/>
<!--keys.dir is the location of the https://svn.apache.org/repos/asf/lucene/java/dist/ directory-->
<property name="keys.dir" value="${common.dir}/../../dist"/>
<macrodef name="copy-to-stage-macro">
<attribute name="artifacts.dir"/>
<sequential>
<sshexec host="people.apache.org"
username="${scp.user}"
keyfile="${keyfile}"
command="mkdir -p ${remote.staging.dir}"/>
<echo>Uploading artifacts to ${scp.user}@people.apache.org:${remote.staging.dir}</echo>
<scp todir="${scp.user}@people.apache.org:${remote.staging.dir}"
username="${scp.user}"
keyfile="${keyfile}"
verbose="true">
<fileset dir="${artifacts.dir}"/>
<fileset dir="${keys.dir}">
<include name="KEYS"/>
</fileset>
</scp>
</sequential>
</macrodef>
</project>
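Taken together, the macros relocated into common-build.xml let a top-level build drive a source release with calls like the following sketch (the target name and file names are illustrative; compare the package-tgz-src, sign-artifacts, and copy-to-stage targets in lucene/build.xml above):

<target name="stage-src-release-example" depends="init, init-dist">
  <svn-export-source source.dir="."/>
  <tar tarfile="${dist.dir}/lucene-${version}-src.tgz"
       compression="gzip" longfile="gnu">
    <tarfileset prefix="lucene-${version}" dir="${svn.export.dir}"/>
  </tar>
  <make-checksums file="${dist.dir}/lucene-${version}-src.tgz"/>
  <sign-artifacts-macro artifacts.dir="${dist.dir}"/>
  <copy-to-stage-macro artifacts.dir="${dist.dir}"/>
</target>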

View File

@@ -0,0 +1,860 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="common" xmlns:artifact="antlib:org.apache.maven.artifact.ant">
<description>
This file is designed for importing into a main build file, and not intended
for standalone use.
</description>
<dirname file="${ant.file.common}" property="common.dir"/>
<property name="dev-tools.dir" value="${common.dir}/../dev-tools"/>
<property name="prettify.dir" value="${common.dir}/src/tools/prettify"/>
<property name="maven.build.dir" value="${common.dir}/build/maven"/>
<!-- Give user a chance to override without editing this file
(and without typing -D each time it compiles) -->
<property file="${user.home}/lucene.build.properties"/>
<property file="${user.home}/build.properties"/>
<property file="${basedir}/build.properties"/>
<property file="${common.dir}/build.properties"/>
<tstamp>
<format property="current.year" pattern="yyyy"/>
<format property="DSTAMP" pattern="yyyy-MM-dd"/>
<format property="TSTAMP" pattern="HH:mm:ss"/>
</tstamp>
<property name="name" value="${ant.project.name}"/>
<property name="Name" value="Lucene"/>
<property name="dev.version" value="4.0-SNAPSHOT"/>
<property name="tests.luceneMatchVersion" value="4.0"/>
<property name="version" value="${dev.version}"/>
<property name="spec.version" value="${version}"/>
<property name="year" value="2000-${current.year}"/>
<property name="final.name" value="lucene-${name}-${version}"/>
<property name="junit.jar" value="junit-4.7.jar"/>
<property name="junit-location.jar" value="${common.dir}/lib/${junit.jar}"/>
<path id="junit-path">
<pathelement location="${junit-location.jar}"/>
</path>
<path id="ant-path">
<fileset dir="${common.dir}/lib" includes="ant-*.jar"/>
</path>
<path id="tools.runtime.classpath">
<pathelement location="${common.dir}/build/classes/tools"/>
</path>
<path id="maven-ant-tasks.classpath">
<fileset dir="${common.dir}/lib">
<include name="maven-ant-tasks-*.jar"/>
</fileset>
</path>
<!-- default arguments to pass to JVM executing tests -->
<property name="testmethod" value=""/>
<property name="args" value=""/>
<property name="tests.threadspercpu" value="1" />
<condition property="tests.sequential">
<or>
<isset property="testcase"/>
<equals arg1="${tests.threadspercpu}" arg2="0"/>
</or>
</condition>
<property name="tests.multiplier" value="1" />
<property name="tests.codec" value="randomPerField" />
<property name="tests.codecprovider" value="random" />
<property name="tests.locale" value="random" />
<property name="tests.timezone" value="random" />
<property name="tests.directory" value="random" />
<property name="tests.linedocsfile" value="europarl.lines.txt.gz" />
<property name="tests.iter" value="1" />
<property name="tests.iter.min" value="${tests.iter}" />
<property name="tests.seed" value="random" />
<property name="tests.loggingfile" value="/dev/null"/>
<property name="tests.nightly" value="false" />
<property name="javac.deprecation" value="off"/>
<property name="javac.debug" value="on"/>
<property name="javac.source" value="1.6"/>
<property name="javac.target" value="1.6"/>
<property name="javac.source.backwards" value="1.6"/>
<property name="javac.target.backwards" value="1.6"/>
<!-- clover wants to run with -lib, otherwise we prefer a repeatable
classpath -->
<property name="javac.includeAntRuntime" value="${run.clover}"/>
<property name="javadoc.link" value="http://download.oracle.com/javase/6/docs/api/"/>
<property name="javadoc.access" value="protected"/>
<property name="javadoc.charset" value="utf-8"/>
<property name="javadoc.dir" value="${common.dir}/build/docs/api"/>
<property name="javadoc.maxmemory" value="512m" />
<!-- Javadoc classpath -->
<path id="javadoc.classpath">
<path refid="classpath"/>
<pathelement location="${ant.home}/lib/ant.jar"/>
<fileset dir=".">
<exclude name="build/**/*.jar"/>
<include name="**/lib/*.jar"/>
</fileset>
</path>
<property name="changes.src.dir" value="${common.dir}/src/site/changes"/>
<property name="changes.target.dir" value="${common.dir}/build/docs/changes"/>
<property name="project.name" value="site"/> <!-- todo: is this used by anakia or something else? -->
<property name="build.encoding" value="utf-8"/>
<property name="src.dir" location="src/java"/>
<property name="tests.src.dir" location="src/test"/>
<property name="tests-framework.src.dir" location="${common.dir}/src/test-framework"/>
<property name="build.dir" location="build"/>
<!-- Needed in case a contrib needs the original build, also for compile-tools to be called from contrib -->
<property name="common.build.dir" location="${common.dir}/build"/>
<property name="dist.dir" location="dist"/>
<property name="maven.dist.dir" location="dist/maven"/>
<property name="m2.repository.url" value="file://${maven.dist.dir}"/>
<property name="m2.repository.private.key" value="${user.home}/.ssh/id_dsa"/>
<property name="javacc.home" location="${common.dir}"/>
<property name="jflex.home" location="${common.dir}"/>
<path id="jflex.classpath">
<fileset dir="${jflex.home}/">
<!-- for a JFlex trunk checkout: -->
<include name="jflex/target/*.jar"/>
<!-- for a JFlex distribution (not yet available): -->
<include name="lib/*.jar"/>
</fileset>
</path>
<path id="javacc.classpath">
<fileset dir="${javacc.home}/">
<include name="bin/lib/*.jar"/>
</fileset>
</path>
<property name="backwards.dir" location="backwards"/>
<property name="build.dir.backwards" location="${build.dir}/backwards"/>
<property name="junit.output.dir" location="${build.dir}/test"/>
<property name="junit.output.dir.backwards" location="${build.dir.backwards}/test"/>
<property name="junit.reports" location="${build.dir}/test/reports"/>
<property name="junit.reports.backwards" location="${build.dir.backwards}/test/reports"/>
<property name="junit.excludes" value=""/>
<condition property="junit.details.formatter"
value="org.apache.tools.ant.taskdefs.optional.junit.BriefJUnitResultFormatter"
else="org.apache.lucene.util.LuceneJUnitResultFormatter">
<isset property="tests.sequential"/>
</condition>
<property name="junit.parallel.selector" value="org.apache.lucene.util.LuceneJUnitDividingSelector"/>
<property name="manifest.file" location="${common.dir}/build/MANIFEST.MF"/>
<!--
we attempt to exec svnversion to get details build information
for jar manifests. this property can be set at runtime to an
explicit path as needed, or ant will just try to find it in the
default PATH. (this is useful for Hudson)
-->
<property name="svnversion.exe" value="svnversion" />
<property name="svn.exe" value="svn" />
<property name="hg.exe" value="hg" />
<property name="moman.url" value="https://bitbucket.org/jpbarrette/moman" />
<property name="moman.rev" value="115" />
<property name="python.exe" value="python" />
<property name="gpg.exe" value="gpg" />
<property name="gpg.key" value="CODE SIGNING KEY" />
<property name="clover.db.dir" location="${common.dir}/build/test/clover/db"/>
<property name="clover.report.dir" location="${common.dir}/build/test/clover/reports"/>
<available
property="clover.present"
classname="com.cenqua.clover.tasks.CloverReportTask"
/>
<condition property="clover.enabled">
<and>
<isset property="run.clover"/>
<isset property="clover.present"/>
</and>
</condition>
<target name="clean"
description="Removes contents of build and dist directories">
<delete dir="${build.dir}"/>
<delete dir="${dist.dir}"/>
<delete file="velocity.log"/>
</target>
<!-- TODO: maybe make JavaCC checking specific to only the projects
that use it (Lucene core and contrib/misc
-->
<target name="javacc-uptodate-check">
<uptodate property="javacc.files.uptodate">
<srcfiles dir="${src.dir}" includes="**/*.jj" />
<mapper type="glob" from="*.jj" to="*.java"/>
</uptodate>
</target>
<target name="javacc-notice" depends="javacc-uptodate-check" unless="javacc.files.uptodate">
<echo>
One or more of the JavaCC .jj files is newer than its corresponding
.java file. Run the "javacc" target to regenerate the artifacts.
</echo>
</target>
<target name="init">
<!-- currently empty -->
</target>
<target name="jflex-uptodate-check">
<uptodate property="jflex.files.uptodate">
<srcfiles dir="${src.dir}" includes="**/*.jflex" />
<mapper type="glob" from="*.jflex" to="*.java"/>
</uptodate>
</target>
<target name="jflex-notice" depends="jflex-uptodate-check" unless="jflex.files.uptodate">
<echo>
One or more of the JFlex .jflex files is newer than its corresponding
.java file. Run the "jflex" target to regenerate the artifacts.
</echo>
</target>
<target name="javacc-check">
<available property="javacc.present" classname="org.javacc.parser.Main">
<classpath refid="javacc.classpath"/>
</available>
<fail unless="javacc.present">
##################################################################
JavaCC not found.
JavaCC Home: ${javacc.home}
Please download and install JavaCC 4.1 from:
&lt;http://javacc.dev.java.net&gt;
Then, create a build.properties file either in your home
directory, or within the Lucene directory and set the javacc.home
property to the path where JavaCC is installed. For example,
if you installed JavaCC in /usr/local/java/javacc-4.1, then set the
javacc.home property to:
javacc.home=/usr/local/java/javacc-4.1
If you get an error like the one below, then you have not installed
things correctly. Please check all your paths and try again.
java.lang.NoClassDefFoundError: org.javacc.parser.Main
##################################################################
</fail>
</target>
<target name="jflex-check">
<available property="jflex.present" classname="jflex.anttask.JFlexTask">
<classpath refid="jflex.classpath"/>
</available>
<fail unless="jflex.present">
##################################################################
JFlex not found.
JFlex Home: ${jflex.home}
Please install the jFlex 1.5 version (currently not released)
from its SVN repository:
svn co http://jflex.svn.sourceforge.net/svnroot/jflex/trunk jflex
cd jflex
mvn install
Then, create a build.properties file either in your home
directory, or within the Lucene directory and set the jflex.home
property to the path where the JFlex trunk checkout is located
(in the above example it's the directory called "jflex").
##################################################################
</fail>
</target>
<target name="compile-core" depends="init, clover"
description="Compiles core classes">
<compile
srcdir="${src.dir}"
destdir="${build.dir}/classes/java">
<classpath refid="classpath"/>
</compile>
<!-- Copy the resources folder (if existent) -->
<copy todir="${build.dir}/classes/java" includeEmptyDirs="false">
<globmapper from="resources/*" to="*" handledirsep="yes"/>
<fileset dir="src" includes="resources/**"/>
</copy>
</target>
<target name="compile" depends="compile-core">
<!-- convenience target to compile core -->
</target>
<target name="jar-core" depends="compile-core"
description="Packages the JAR file">
<jarify/>
</target>
<macrodef name="m2-deploy" description="Builds a Maven artifact">
<element name="artifact-attachments" optional="yes"/>
<attribute name="pom.xml" default="pom.xml"/>
<sequential>
<artifact:install-provider artifactId="wagon-ssh" version="1.0-beta-7"/>
<artifact:pom id="maven.project" file="@{pom.xml}"/>
<artifact:deploy file="${build.dir}/${maven.project.build.finalName}.jar">
<artifact-attachments/>
<remoteRepository url="${m2.repository.url}">
<authentication username="${m2.repository.username}" privateKey="${m2.repository.private.key}" password="${m2.repository.password}"/>
</remoteRepository>
<pom refid="maven.project"/>
</artifact:deploy>
</sequential>
</macrodef>
<macrodef name="m2-deploy-with-pom-template" description="Builds a Maven artifact given a POM template">
<attribute name="pom.xml"/>
<attribute name="jar.file"/>
<sequential>
<copy file="@{pom.xml}" tofile="${maven.build.dir}/@{pom.xml}">
<filterset begintoken="@" endtoken="@">
<filter token="version" value="${version}"/>
</filterset>
</copy>
<artifact:install-provider artifactId="wagon-ssh" version="1.0-beta-7"/>
<artifact:pom id="maven.project" file="${maven.build.dir}/@{pom.xml}" />
<artifact:deploy file="@{jar.file}">
<remoteRepository url="${m2.repository.url}">
<authentication username="${m2.repository.username}" privateKey="${m2.repository.private.key}" password="${m2.repository.password}"/>
</remoteRepository>
<pom refid="maven.project"/>
</artifact:deploy>
</sequential>
</macrodef>
<macrodef name="build-manifest" description="Builds a manifest file">
<attribute name="title" default="Lucene Search Engine: ${ant.project.name}" />
<sequential>
<manifest file="${manifest.file}">
<attribute name="Specification-Title" value="@{title}"/>
<!-- spec version must match "digit+{.digit+}*" -->
<attribute name="Specification-Version" value="${spec.version}"/>
<attribute name="Specification-Vendor"
value="The Apache Software Foundation"/>
<attribute name="Implementation-Title" value="org.apache.lucene"/>
<!-- impl version can be any string -->
<attribute name="Implementation-Version"
value="${version} ${svnversion} - ${DSTAMP} ${TSTAMP}"/>
<attribute name="Implementation-Vendor"
value="The Apache Software Foundation"/>
<attribute name="X-Compile-Source-JDK"
value="${javac.source}"/>
<attribute name="X-Compile-Target-JDK"
value="${javac.target}"/>
</manifest>
</sequential>
</macrodef>
<macrodef name="jarify" description="Builds a JAR file">
<attribute name="basedir" default="${build.dir}/classes/java"/>
<attribute name="destfile" default="${build.dir}/${final.name}.jar"/>
<attribute name="title" default="Lucene Search Engine: ${ant.project.name}"/>
<attribute name="excludes" default="**/pom.xml"/>
<element name="manifest-attributes" optional="yes"/>
<element name="metainf-includes" optional="yes"/>
<sequential>
<!-- If possible, include the svnversion -->
<exec dir="." executable="${svnversion.exe}"
outputproperty="svnversion" failifexecutionfails="false">
<arg line="."/>
</exec>
<build-manifest title="@{title}"/>
<jar
destfile="@{destfile}"
basedir="@{basedir}"
manifest="${manifest.file}"
excludes="@{excludes}">
<manifest>
<manifest-attributes/>
</manifest>
<metainf dir="${common.dir}">
<include name="LICENSE.txt"/>
<include name="NOTICE.txt"/>
</metainf>
<metainf-includes/>
</jar>
</sequential>
</macrodef>
<target name="compile-test-framework" depends="compile-core">
<compile-test-macro srcdir="${tests-framework.src.dir}" destdir="${common.dir}/build/classes/test-framework"
test.classpath="test.classpath"/>
</target>
<target name="compile-tools">
<compile
srcdir="${common.dir}/src/tools/java"
destdir="${common.build.dir}/classes/tools">
<classpath refid="classpath"/>
</compile>
</target>
<target name="compile-test" depends="compile-test-framework">
<compile-test-macro srcdir="${tests.src.dir}" destdir="${build.dir}/classes/test"
test.classpath="test.classpath"/>
</target>
<property name="tests.verbose" value="false"/>
<macrodef name="compile-test-macro" description="Compiles junit tests.">
<attribute name="srcdir"/>
<attribute name="destdir"/>
<attribute name="test.classpath"/>
<attribute name="javac.source" default="${javac.source}"/>
<attribute name="javac.target" default="${javac.target}"/>
<sequential>
<compile
srcdir="@{srcdir}"
destdir="@{destdir}"
javac.source="@{javac.source}"
javac.target="@{javac.source}">
<classpath refid="@{test.classpath}"/>
</compile>
<!-- Copy any data files present to the classpath -->
<copy todir="@{destdir}">
<fileset dir="@{srcdir}" excludes="**/*.java"/>
</copy>
</sequential>
</macrodef>
<macrodef name="test-macro" description="Executes junit tests.">
<attribute name="junit.output.dir" default="${junit.output.dir}"/>
<attribute name="junit.classpath" default="junit.classpath"/>
<attribute name="dataDir" default="${tests.src.dir}"/>
<attribute name="tempDir" default="${build.dir}/test"/>
<attribute name="threadNum" default="1"/>
<attribute name="threadTotal" default="1"/>
<sequential>
<condition property="runall">
<not><or>
<isset property="testcase" />
<isset property="testpackage" />
<isset property="testpackageroot" />
</or></not>
</condition>
<!-- <mkdir dir="@{tempDir}/@{pattern}"/>
 This is very loud and obnoxious, so we abuse touch instead for a "quiet" mkdir.
 -->
<touch file="@{tempDir}/@{threadNum}/quiet.ant" verbose="false" mkdirs="true"/>
<junit printsummary="off" haltonfailure="no" maxmemory="512M" tempdir="@{tempDir}/@{threadNum}"
errorProperty="tests.failed" failureProperty="tests.failed" forkmode="perBatch" dir="@{tempDir}/@{threadNum}">
<classpath refid="@{junit.classpath}"/>
<assertions>
<enable package="org.apache.lucene"/>
<enable package="org.apache.solr"/>
</assertions>
<jvmarg line="${args}"/>
<!-- allow tests to control debug prints -->
<sysproperty key="tests.verbose" value="${tests.verbose}"/>
<!-- set the codec tests should run with -->
<sysproperty key="tests.codec" value="${tests.codec}"/>
<!-- set the codec provider tests should run with -->
<sysproperty key="tests.codecprovider" value="${tests.codecprovider}"/>
<!-- set the locale tests should run with -->
<sysproperty key="tests.locale" value="${tests.locale}"/>
<!-- set the timezone tests should run with -->
<sysproperty key="tests.timezone" value="${tests.timezone}"/>
<!-- set the directory tests should run with -->
<sysproperty key="tests.directory" value="${tests.directory}"/>
<!-- set the line file source for oal.util.LineFileDocs -->
<sysproperty key="tests.linedocsfile" value="${tests.linedocsfile}"/>
<!-- set the number of times tests should run -->
<sysproperty key="tests.iter" value="${tests.iter}"/>
<!-- set the minimum number of times tests should run unless failure -->
<sysproperty key="tests.iter.min" value="${tests.iter.min}"/>
<!-- set the test seed -->
<sysproperty key="tests.seed" value="${tests.seed}"/>
<!-- set the Version that tests should run against -->
<sysproperty key="tests.luceneMatchVersion" value="${tests.luceneMatchVersion}"/>
<!-- For Lucene itself we can be strict, and we don't want false failures even across methods.
 Make this a param so that only lucene-core/contrib tests use it?
 <sysproperty key="tests.cleanthreads" value="perMethod"/>
 -->
<sysproperty key="tests.cleanthreads" value="perClass"/>
<!-- logging config file -->
<sysproperty key="java.util.logging.config.file" value="${tests.loggingfile}"/>
<!-- set whether or not nightly tests should run -->
<sysproperty key="tests.nightly" value="${tests.nightly}"/>
<!-- TODO: create propertyset for test properties, so each project can have its own set -->
<sysproperty key="tests.multiplier" value="${tests.multiplier}"/>
<sysproperty key="tempDir" file="@{tempDir}/@{threadNum}"/>
<sysproperty key="lucene.version" value="${dev.version}"/>
<sysproperty key="testmethod" value="${testmethod}"/>
<sysproperty key="jetty.testMode" value="1"/>
<sysproperty key="jetty.insecurerandom" value="1"/>
<sysproperty key="solr.directoryFactory" value="org.apache.solr.core.MockDirectoryFactory"/>
<formatter type="xml"/>
<formatter classname="${junit.details.formatter}" usefile="false"/>
<batchtest fork="yes" todir="@{junit.output.dir}" if="runall">
<fileset dir="@{dataDir}" includes="**/Test*.java,**/*Test.java" excludes="${junit.excludes}">
<custom classname="${junit.parallel.selector}" classpathref="@{junit.classpath}">
<param name="divisor" value="@{threadTotal}" />
<param name="part" value="@{threadNum}" />
</custom>
</fileset>
</batchtest>
<batchtest fork="yes" todir="@{junit.output.dir}" if="testpackage">
<fileset dir="@{dataDir}" includes="**/${testpackage}/**/Test*.java,**/${testpackage}/**/*Test.java" excludes="${junit.excludes}">
<custom classname="${junit.parallel.selector}" classpathref="@{junit.classpath}">
<param name="divisor" value="@{threadTotal}" />
<param name="part" value="@{threadNum}" />
</custom>
</fileset>
</batchtest>
<batchtest fork="yes" todir="@{junit.output.dir}" if="testpackageroot">
<fileset dir="@{dataDir}" includes="**/${testpackageroot}/Test*.java,**/${testpackageroot}/*Test.java" excludes="${junit.excludes}">
<custom classname="${junit.parallel.selector}" classpathref="@{junit.classpath}">
<param name="divisor" value="@{threadTotal}" />
<param name="part" value="@{threadNum}" />
</custom>
</fileset>
</batchtest>
<batchtest fork="yes" todir="@{junit.output.dir}" if="testcase">
<fileset dir="@{dataDir}" includes="**/${testcase}.java"/>
</batchtest>
</junit>
<fail if="tests.failed">Tests failed!</fail>
</sequential>
</macrodef>
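<!-- Editor's sketch (class name and seed are hypothetical; property names as
     defined above): typical command-line overrides look like
       ant test -Dtestcase=TestDemo
       ant test -Dtestpackage=search -Dtests.iter=10
       ant test -Dtests.seed=1234567890 -Dtests.verbose=true
-->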
<target name="test" depends="compile-test,validate-lucene,junit-mkdir,junit-sequential,junit-parallel" description="Runs unit tests"/>
<target name="junit-mkdir">
<mkdir dir="${junit.output.dir}"/>
</target>
<target name="junit-sequential" if="tests.sequential">
<test-macro/>
</target>
<target name="junit-parallel" unless="tests.sequential">
<parallel threadsPerProcessor="${tests.threadspercpu}">
<test-macro threadNum="1" threadTotal="8"/>
<test-macro threadNum="2" threadTotal="8"/>
<test-macro threadNum="3" threadTotal="8"/>
<test-macro threadNum="4" threadTotal="8"/>
<test-macro threadNum="5" threadTotal="8"/>
<test-macro threadNum="6" threadTotal="8"/>
<test-macro threadNum="7" threadTotal="8"/>
<test-macro threadNum="8" threadTotal="8"/>
</parallel>
</target>
<!--
 If you want Clover test code coverage, run this before the tests. You need clover.jar and the license in your Ant classpath, and you need to specify -Drun.clover=true on the command line.
 See http://issues.apache.org/jira/browse/LUCENE-721
 -->
<target name="clover" depends="clover.setup, clover.info" description="Instrument the Unit tests using Clover. Requires a Clover 2.x license and clover.jar in the ANT classpath. To use, specify -Drun.clover=true on the command line."/>
<target name="clover.setup" if="clover.enabled">
<taskdef resource="cloverlib.xml"/>
<mkdir dir="${clover.db.dir}"/>
<clover-setup initString="${clover.db.dir}/lucene_coverage.db" encoding="${build.encoding}">
<fileset dir="${src.dir}">
<include name="org/apache/**/*.java" />
</fileset>
<testsources dir="${tests-framework.src.dir}">
<include name="org/apache/**/*.java" />
</testsources>
<testsources dir="${tests.src.dir}">
<include name="org/apache/**/*.java" />
</testsources>
</clover-setup>
</target>
<target name="clover.info" unless="clover.present">
<echo>
Clover not found. Code coverage reports disabled.
</echo>
</target>
<target name="clover.check">
<fail unless="clover.present">
##################################################################
Clover not found.
Please make sure clover.jar is in ANT_HOME/lib, or made available
to Ant using other mechanisms like -lib or CLASSPATH.
##################################################################
</fail>
</target>
<!--
Run after Junit tests.
-->
<target name="generate-clover-reports" depends="clover.check, clover">
<mkdir dir="${clover.report.dir}"/>
<!-- This extra path is needed because, from a top-level ant run, the contrib tests would not be visible (report generation is only done at the top level, not via subants) -->
<fileset dir="contrib" id="clover.contrib.test.src.files">
<include name="**/test/**/*.java"/>
</fileset>
<fileset dir="${tests-framework.src.dir}" id="clover.test.src.files">
<include name="**/*.java" />
</fileset>
<fileset dir="${tests.src.dir}" id="clover.test.src.files">
<include name="**/*.java" />
</fileset>
<fileset dir="${build.dir}" id="clover.test.result.files">
<include name="**/test/TEST-*.xml" />
<!-- do not include BW tests -->
<exclude name="backwards/**"/>
</fileset>
<clover-report>
<current outfile="${clover.report.dir}" title="${final.name}" numThreads="0">
<format type="html" filter="assert"/>
<testsources refid="clover.test.src.files"/>
<testsources refid="clover.contrib.test.src.files"/>
<testresults refid="clover.test.result.files"/>
</current>
<current outfile="${clover.report.dir}/clover.xml" title="${final.name}">
<format type="xml" filter="assert"/>
<testsources refid="clover.test.src.files"/>
<testsources refid="clover.contrib.test.src.files"/>
<testresults refid="clover.test.result.files"/>
</current>
</clover-report>
</target>
<target name="generate-test-reports" description="Generates test reports">
<mkdir dir="${junit.reports}"/>
<junitreport todir="${junit.output.dir}">
<!-- this fileset lets the task work for individual contribs,
 as well as for the project as a whole
 -->
<fileset dir="${build.dir}">
<include name="**/test/TEST-*.xml"/>
</fileset>
<report format="frames" todir="${junit.reports}"/>
</junitreport>
<mkdir dir="${junit.reports.backwards}"/>
<junitreport todir="${junit.output.dir.backwards}">
<!-- this fileset lets the task work for individual contribs,
 as well as for the project as a whole
 -->
<fileset dir="${build.dir.backwards}">
<include name="**/test/TEST-*.xml"/>
</fileset>
<report format="frames" todir="${junit.reports.backwards}"/>
</junitreport>
</target>
<target name="jar" depends="jar-core">
<!-- convenience target to package core JAR -->
</target>
<target name="jar-src" depends="init">
<jarify basedir="${src.dir}" destfile="${build.dir}/${final.name}-src.jar"/>
</target>
<target name="default" depends="jar-core"/>
<target name="rat-sources-typedef">
<typedef resource="org/apache/rat/anttasks/antlib.xml" uri="antlib:rat.anttasks">
<classpath>
<fileset dir="." includes="rat*.jar"/>
</classpath>
</typedef>
</target>
<target name="rat-sources" depends="rat-sources-typedef"
description="runs the tasks over source and test files">
<rat:report xmlns:rat="antlib:org.apache.rat.anttasks">
<fileset dir="${src.dir}"/>
<fileset dir="${tests-framework.src.dir}"/>
<fileset dir="${tests.src.dir}"/>
</rat:report>
</target>
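<!-- To run (sketch): drop the Apache RAT jar into this directory so the
     rat*.jar fileset above picks it up, then invoke
       ant rat-sources
-->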
<!--+
| M A C R O S
+-->
<macrodef name="compile">
<attribute name="srcdir"/>
<attribute name="destdir"/>
<attribute name="javac.source" default="${javac.source}"/>
<attribute name="javac.target" default="${javac.target}"/>
<element name="nested" implicit="yes" optional="yes"/>
<sequential>
<mkdir dir="@{destdir}"/>
<javac
includeAntRuntime="${javac.includeAntRuntime}"
encoding="${build.encoding}"
srcdir="@{srcdir}"
destdir="@{destdir}"
deprecation="${javac.deprecation}"
debug="${javac.debug}"
source="@{javac.source}"
target="@{javac.target}">
<nested/>
<!-- <compilerarg line="-Xmaxwarns 10000000"/>
<compilerarg line="-Xmaxerrs 10000000"/> -->
<!-- for generics in Java 1.5: -->
<compilerarg line="-Xlint -Xlint:-deprecation -Xlint:-serial"/>
</javac>
</sequential>
</macrodef>
<macrodef name="invoke-javacc">
<attribute name="target"/>
<attribute name="outputDir"/>
<sequential>
<mkdir dir="@{outputDir}"/>
<javacc
target="@{target}"
outputDirectory="@{outputDir}"
debugTokenManager="${javacc.debug.tokenmgr}"
debugParser="${javacc.debug.parser}"
debuglookahead="${javacc.debug.lookahead}"
javacchome="${javacc.home}"
jdkversion="${javac.source}"
/>
<fixcrlf srcdir="@{outputDir}" includes="*.java" encoding="UTF-8">
<containsregexp expression="Generated.*By.*JavaCC"/>
</fixcrlf>
</sequential>
</macrodef>
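<!-- Hypothetical usage from a module that owns a .jj grammar (paths are
     illustrative only):
     <invoke-javacc target="src/java/org/apache/lucene/queryParser/QueryParser.jj"
                    outputDir="src/java/org/apache/lucene/queryParser"/>
-->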
<property name="failonjavadocwarning" value="true"/>
<macrodef name="invoke-javadoc">
<element name="sources" optional="yes"/>
<attribute name="destdir"/>
<attribute name="title" default="${Name} ${version} API"/>
<attribute name="overview" default="${src.dir}/overview.html"/>
<sequential>
<copy todir="@{destdir}/../prettify" overwrite="false">
<fileset dir="${prettify.dir}"/>
</copy>
<record name="@{destdir}/log_javadoc.txt" action="start" append="no"/>
<javadoc
overview="@{overview}"
packagenames="org.apache.lucene.*"
destdir="@{destdir}"
access="${javadoc.access}"
encoding="${build.encoding}"
charset="${javadoc.charset}"
docencoding="${javadoc.charset}"
author="true"
version="true"
use="true"
failonerror="true"
source="${ant.java.version}"
link="${javadoc.link}"
windowtitle="${Name} ${version} API"
doctitle="@{title}"
stylesheetfile="@{destdir}/../prettify/stylesheet+prettify.css"
maxmemory="${javadoc.maxmemory}"
bottom="Copyright &amp;copy; ${year} Apache Software Foundation. All Rights Reserved.">
<tag name="lucene.experimental"
description="WARNING: This API is experimental and might change in incompatible ways in the next release."/>
<tag name="lucene.internal"
description="NOTE: This API is for Lucene internal purposes only and might change in incompatible ways in the next release."/>
<link offline="true" packagelistLoc="${javadoc.dir}"/>
<header><![CDATA[
<script src="{@docRoot}/../prettify/prettify.js" type="text/javascript"></script>
<script language="JavaScript">window.onload=function(){windowTitle();prettyPrint();}</script>
]]></header>
<sources />
<classpath refid="javadoc.classpath"/>
</javadoc>
<record name="@{destdir}/log_javadoc.txt" action="stop"/>
<delete>
<fileset file="@{destdir}/log_javadoc.txt">
<not>
<containsregexp expression="\[javadoc\]\s*[1-9][0-9]*[\s]*warning"/>
</not>
</fileset>
</delete>
<fail message="Javadocs warnings were found!">
<condition>
<and>
<available file="@{destdir}/log_javadoc.txt"/>
<istrue value="${failonjavadocwarning}"/>
</and>
</condition>
</fail>
</sequential>
</macrodef>
<!-- VALIDATION work -->
<target name="check-legal-lucene" depends="compile-tools">
<java classname="org.apache.lucene.validation.DependencyChecker" failonerror="true" fork="true">
<classpath>
<path refid="tools.runtime.classpath" />
</classpath>
<!-- TODO: it might be better to just automatically find all directories that contain jar files, but that could take a
long time. This should be faster, but we could miss a directory
-->
<!-- Lucene -->
<arg value="-c" />
<arg value="${basedir}/lib" />
</java>
</target>
<target name="check-legal" depends="check-legal-lucene"/>
<target name="validate-lucene" depends="check-legal-lucene" unless="validated-lucene"/>
<!-- Generic placeholder target for if we add other validation tasks -->
<target name="validate" depends="validate-lucene"/>
</project>

View File

@ -55,7 +55,9 @@
</path>
<target name="build-lucene" unless="core.compiled">
<ant dir="${common.dir}" target="compile-test" inheritAll="false"/>
<ant dir="${common.dir}" target="compile-test" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
<!-- set the property for this ant execution to speed up later tasks depending on this -->
<property name="core.compiled" value="true"/>
</target>

View File

@ -37,7 +37,9 @@
</uptodate>
</target>
<target name="jar-lucene" depends="build-lucene" unless="lucene.jar.uptodate">
<ant dir="${common.dir}" target="jar-core" inheritAll="false"/>
<ant dir="${common.dir}" target="jar-core" inheritAll="false">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<target name="init" depends="contrib-build.init,lucene-jar-uptodate,jar-lucene"/>

View File

@ -38,11 +38,15 @@
<target name="build-memory" unless="memory.uptodate">
<echo>Highlighter building dependency contrib/memory</echo>
<ant antfile="../memory/build.xml" target="default" inheritall="false" dir="../memory" />
<ant antfile="../memory/build.xml" target="default" inheritall="false" dir="../memory">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
<target name="build-queries" unless="queries.uptodate">
<echo>Highlighter building dependency contrib/queries</echo>
<ant antfile="../queries/build.xml" target="default" inheritall="false" dir="../queries" />
<ant antfile="../queries/build.xml" target="default" inheritall="false" dir="../queries">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
</target>
</project>

View File

@ -240,8 +240,7 @@ public class InstantiatedIndexWriter implements Closeable {
final FieldInvertState invertState = new FieldInvertState();
invertState.setBoost(eFieldTermDocInfoFactoriesByTermText.getKey().boost * document.getDocument().getBoost());
invertState.setLength(eFieldTermDocInfoFactoriesByTermText.getKey().fieldLength);
final float norm = similarityProvider.get(fieldName).computeNorm(invertState);
normsByFieldNameAndDocumentNumber.get(fieldName)[document.getDocumentNumber()] = similarityProvider.get(fieldName).encodeNormValue(norm);
normsByFieldNameAndDocumentNumber.get(fieldName)[document.getDocumentNumber()] = similarityProvider.get(fieldName).computeNorm(invertState);
} else {
System.currentTimeMillis();
}

View File

@ -51,7 +51,6 @@ import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorMapper;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.codecs.PerDocValues;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
@ -1202,15 +1201,14 @@ public class MemoryIndex {
int numOverlapTokens = info != null ? info.numOverlapTokens : 0;
float boost = info != null ? info.getBoost() : 1.0f;
FieldInvertState invertState = new FieldInvertState(0, numTokens, numOverlapTokens, 0, boost);
float n = fieldSim.computeNorm(invertState);
byte norm = fieldSim.encodeNormValue(n);
byte norm = fieldSim.computeNorm(invertState);
norms = new byte[] {norm};
// cache it for future reuse
cachedNorms = norms;
cachedFieldName = fieldName;
cachedSimilarity = sim;
if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldName + ":" + n + ":" + norm + ":" + numTokens);
if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldName + ":" + norm + ":" + numTokens);
}
return norms;
}

View File

@ -147,7 +147,7 @@ public class FieldNormModifier {
for (int d = 0; d < termCounts.length; d++) {
if (liveDocs == null || liveDocs.get(d)) {
invertState.setLength(termCounts[d]);
subReader.setNorm(d, field, fieldSim.encodeNormValue(fieldSim.computeNorm(invertState)));
subReader.setNorm(d, field, fieldSim.computeNorm(invertState));
}
}
}

View File

@ -25,7 +25,6 @@ import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.codecs.DocValuesConsumer;
import org.apache.lucene.index.codecs.DefaultDocValuesProducer;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.FieldsProducer;
@ -58,7 +57,7 @@ public class AppendingCodec extends Codec {
public static String CODEC_NAME = "Appending";
public AppendingCodec() {
name = CODEC_NAME;
super(CODEC_NAME);
}
@Override
@ -138,22 +137,22 @@ public class AppendingCodec extends Codec {
StandardPostingsReader.files(dir, segmentInfo, codecId, files);
BlockTermsReader.files(dir, segmentInfo, codecId, files);
FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files);
DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files);
DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files, getDocValuesUseCFS());
}
@Override
public void getExtensions(Set<String> extensions) {
StandardCodec.getStandardExtensions(extensions);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS());
}
@Override
public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator());
return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS());
}
@Override
public PerDocValues docsProducer(SegmentReadState state) throws IOException {
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId);
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator());
}
}

View File

@ -106,7 +106,7 @@ public class SweetSpotSimilarity extends DefaultSimilarity {
* discountOverlaps is true by default or true for this
* specific field. */
@Override
public float computeNorm(FieldInvertState state) {
public byte computeNorm(FieldInvertState state) {
final int numTokens;
if (discountOverlaps)
@ -114,7 +114,7 @@ public class SweetSpotSimilarity extends DefaultSimilarity {
else
numTokens = state.getLength();
return state.getBoost() * computeLengthNorm(numTokens);
return encodeNormValue(state.getBoost() * computeLengthNorm(numTokens));
}
/**

View File

@ -49,8 +49,8 @@ public class TestFieldNormModifier extends LuceneTestCase {
public Similarity get(String field) {
return new DefaultSimilarity() {
@Override
public float computeNorm(FieldInvertState state) {
return state.getBoost() * (discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength());
public byte computeNorm(FieldInvertState state) {
return encodeNormValue(state.getBoost() * (discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength()));
}
};
}

View File

@ -21,6 +21,7 @@ package org.apache.lucene.misc;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.DefaultSimilarityProvider;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TFIDFSimilarity;
import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.index.FieldInvertState;
@ -58,15 +59,15 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
invertState.setLength(i);
assertEquals("3,10: spot i="+i,
1.0f,
s.computeNorm(invertState),
ss.decodeNormValue(s.computeNorm(invertState)),
0.0f);
}
for (int i = 10; i < 1000; i++) {
invertState.setLength(i-9);
final float normD = d.computeNorm(invertState);
final byte normD = d.computeNorm(invertState);
invertState.setLength(i);
final float normS = s.computeNorm(invertState);
final byte normS = s.computeNorm(invertState);
assertEquals("3,10: 10<x : i="+i,
normD,
normS,
@ -104,14 +105,14 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
invertState.setLength(i);
assertEquals("f: 3,10: spot i="+i,
1.0f,
sp.get("foo").computeNorm(invertState),
ss.decodeNormValue(sp.get("foo").computeNorm(invertState)),
0.0f);
}
for (int i = 10; i < 1000; i++) {
invertState.setLength(i-9);
final float normD = d.computeNorm(invertState);
final byte normD = d.computeNorm(invertState);
invertState.setLength(i);
final float normS = sp.get("foo").computeNorm(invertState);
final byte normS = sp.get("foo").computeNorm(invertState);
assertEquals("f: 3,10: 10<x : i="+i,
normD,
normS,
@ -121,21 +122,21 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
invertState.setLength(i);
assertEquals("f: 8,13: spot i="+i,
1.0f,
sp.get("bar").computeNorm(invertState),
ss.decodeNormValue(sp.get("bar").computeNorm(invertState)),
0.0f);
}
for (int i = 6; i <=9; i++) {
invertState.setLength(i);
assertEquals("f: 6,9: spot i="+i,
1.0f,
sp.get("yak").computeNorm(invertState),
ss.decodeNormValue(sp.get("yak").computeNorm(invertState)),
0.0f);
}
for (int i = 13; i < 1000; i++) {
invertState.setLength(i-12);
final float normD = d.computeNorm(invertState);
final byte normD = d.computeNorm(invertState);
invertState.setLength(i);
final float normS = sp.get("bar").computeNorm(invertState);
final byte normS = sp.get("bar").computeNorm(invertState);
assertEquals("f: 8,13: 13<x : i="+i,
normD,
normS,
@ -143,9 +144,9 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
}
for (int i = 9; i < 1000; i++) {
invertState.setLength(i-8);
final float normD = d.computeNorm(invertState);
final byte normD = d.computeNorm(invertState);
invertState.setLength(i);
final float normS = sp.get("yak").computeNorm(invertState);
final byte normS = sp.get("yak").computeNorm(invertState);
assertEquals("f: 6,9: 9<x : i="+i,
normD,
normS,
@ -157,8 +158,8 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
for (int i = 9; i < 1000; i++) {
invertState.setLength(i);
final float normSS = sp.get("a").computeNorm(invertState);
final float normS = sp.get("b").computeNorm(invertState);
final byte normSS = sp.get("a").computeNorm(invertState);
final byte normS = sp.get("b").computeNorm(invertState);
assertTrue("s: i="+i+" : a="+normSS+
" < b="+normS,
normSS < normS);
@ -170,8 +171,8 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
SweetSpotSimilarity ss = new SweetSpotSimilarity();
Similarity d = new DefaultSimilarity();
Similarity s = ss;
TFIDFSimilarity d = new DefaultSimilarity();
TFIDFSimilarity s = ss;
// tf equal
@ -222,7 +223,7 @@ public class SweetSpotSimilarityTest extends LuceneTestCase {
};
ss.setHyperbolicTfFactors(3.3f, 7.7f, Math.E, 5.0f);
Similarity s = ss;
TFIDFSimilarity s = ss;
for (int i = 1; i <=1000; i++) {
assertTrue("MIN tf: i="+i+" : s="+s.tf(i),

View File

@ -54,8 +54,8 @@ public class TestLengthNormModifier extends LuceneTestCase {
public Similarity get(String field) {
return new DefaultSimilarity() {
@Override
public float computeNorm(FieldInvertState state) {
return state.getBoost() * (discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength());
public byte computeNorm(FieldInvertState state) {
return encodeNormValue(state.getBoost() * (discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength()));
}
};
}
@ -175,8 +175,8 @@ public class TestLengthNormModifier extends LuceneTestCase {
public Similarity get(String field) {
return new DefaultSimilarity() {
@Override
public float computeNorm(FieldInvertState state) {
return state.getBoost() * (discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength());
public byte computeNorm(FieldInvertState state) {
return encodeNormValue(state.getBoost() * (discountOverlaps ? state.getLength() - state.getNumOverlap() : state.getLength()));
}
};
}

View File

@ -51,7 +51,11 @@ import org.apache.lucene.util.PriorityQueue;
*/
public class FuzzyLikeThisQuery extends Query
{
static Similarity sim=new DefaultSimilarity();
// TODO: generalize this query (at least it should not reuse this static sim!)
// a better way might be to convert this into multitermquery rewrite methods.
// the rewrite method can 'average' the TermContext's term statistics (docfreq,totalTermFreq)
// provided to TermQuery, so that the general idea is agnostic to any scoring system...
static TFIDFSimilarity sim=new DefaultSimilarity();
Query rewrittenQuery=null;
ArrayList<FieldVals> fieldVals=new ArrayList<FieldVals>();
Analyzer analyzer;

View File

@ -44,6 +44,7 @@ import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TFIDFSimilarity;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
@ -285,7 +286,7 @@ public final class MoreLikeThis {
/**
* For idf() calculations.
*/
private Similarity similarity;// = new DefaultSimilarity();
private TFIDFSimilarity similarity;// = new DefaultSimilarity();
/**
* IndexReader to use
@ -320,17 +321,17 @@ public final class MoreLikeThis {
this(ir, new DefaultSimilarity());
}
public MoreLikeThis(IndexReader ir, Similarity sim){
public MoreLikeThis(IndexReader ir, TFIDFSimilarity sim){
this.ir = ir;
this.similarity = sim;
}
public Similarity getSimilarity() {
public TFIDFSimilarity getSimilarity() {
return similarity;
}
public void setSimilarity(Similarity similarity) {
public void setSimilarity(TFIDFSimilarity similarity) {
this.similarity = similarity;
}
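// Illustration only, not part of this patch: after this change callers supply a
// TFIDFSimilarity implementation, e.g. (reader is any open IndexReader)
//   MoreLikeThis mlt = new MoreLikeThis(reader, new DefaultSimilarity());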

View File

@ -17,7 +17,7 @@
limitations under the License.
-->
<project name="queryparser" default="default">
<project name="queryparser-contrib" default="default">
<description>
Flexible Query Parser

View File

@ -36,7 +36,10 @@
<target name="build-queries" unless="queries.uptodate">
<echo>Misc building dependency ${queries.jar}</echo>
<ant antfile="../queries/build.xml" target="default" inheritall="false" dir="../queries" />
<ant antfile="../queries/build.xml" target="default" inheritall="false" dir="../queries">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
<property name="queries.uptodate" value="true"/>
</target>
</project>

View File

@ -47,7 +47,10 @@
<target name="build-queries" unless="queries.uptodate">
<echo>XML Parser building dependency ${queries.jar}</echo>
<ant antfile="../queries/build.xml" target="default" inheritall="false" dir="../queries" />
<ant antfile="../queries/build.xml" target="default" inheritall="false" dir="../queries">
<propertyset refid="uptodate.and.compiled.properties"/>
</ant>
<property name="queries.uptodate" value="true"/>
</target>
<!-- override contrib-build.xml target to also build web demo -->

View File

@ -81,13 +81,13 @@ public abstract class AbstractField implements Fieldable {
* default, in the {@link
* org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)} method, the boost value is multiplied
* by the length normalization factor and then
* rounded by {@link org.apache.lucene.search.Similarity#encodeNormValue(float)} before it is stored in the
* rounded by {@link org.apache.lucene.search.DefaultSimilarity#encodeNormValue(float)} before it is stored in the
* index. One should attempt to ensure that this product does not overflow
* the range of that encoding.
*
* @see org.apache.lucene.document.Document#setBoost(float)
* @see org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)
* @see org.apache.lucene.search.Similarity#encodeNormValue(float)
* @see org.apache.lucene.search.DefaultSimilarity#encodeNormValue(float)
*/
public void setBoost(float boost) {
this.boost = boost;

View File

@ -48,13 +48,13 @@ public interface Fieldable {
* default, in the {@link
* org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)} method, the boost value is multiplied
* by the length normalization factor
* and then rounded by {@link org.apache.lucene.search.Similarity#encodeNormValue(float)} before it is stored in the
* and then rounded by {@link org.apache.lucene.search.DefaultSimilarity#encodeNormValue(float)} before it is stored in the
* index. One should attempt to ensure that this product does not overflow
* the range of that encoding.
*
* @see org.apache.lucene.document.Document#setBoost(float)
* @see org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)
* @see org.apache.lucene.search.Similarity#encodeNormValue(float)
* @see org.apache.lucene.search.DefaultSimilarity#encodeNormValue(float)
*/
void setBoost(float boost);

View File

@ -30,9 +30,12 @@ import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.CloseableThreadLocal;
import org.apache.lucene.util.IOUtils;
import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
/**
* Class responsible for access to stored document fields.
@ -41,7 +44,7 @@ import java.io.Reader;
*
* @lucene.internal
*/
public final class FieldsReader implements Cloneable {
public final class FieldsReader implements Cloneable, Closeable {
private final static int FORMAT_SIZE = 4;
private final FieldInfos fieldInfos;
@ -179,21 +182,11 @@ public final class FieldsReader implements Cloneable {
*/
public final void close() throws IOException {
if (!closed) {
if (fieldsStream != null) {
fieldsStream.close();
}
if (isOriginal) {
if (cloneableFieldsStream != null) {
cloneableFieldsStream.close();
}
if (cloneableIndexStream != null) {
cloneableIndexStream.close();
}
IOUtils.closeSafely(false, fieldsStream, indexStream, fieldsStreamTL, cloneableFieldsStream, cloneableIndexStream);
} else {
IOUtils.closeSafely(false, fieldsStream, indexStream, fieldsStreamTL);
}
if (indexStream != null) {
indexStream.close();
}
fieldsStreamTL.close();
closed = true;
}
}
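// Editor's gloss: closeSafely(false, ...) attempts to close every argument and,
// since exception suppression is disabled, rethrows the first exception
// encountered once all closeables have been tried.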

View File

@ -1025,7 +1025,7 @@ public abstract class IndexReader implements Cloneable,Closeable {
public abstract byte[] norms(String field) throws IOException;
/** Expert: Resets the normalization factor for the named field of the named
* document. The norm represents the product of the field's {@link
* document. By default, the norm represents the product of the field's {@link
* org.apache.lucene.document.Fieldable#setBoost(float) boost} and its
* length normalization. Thus, to preserve the length normalization
* values when resetting this, one should base the new value upon the old.
@ -1034,7 +1034,8 @@ public abstract class IndexReader implements Cloneable,Closeable {
* this method throws {@link IllegalStateException}.
*
* @see #norms(String)
* @see Similarity#decodeNormValue(byte)
* @see Similarity#computeNorm(FieldInvertState)
* @see org.apache.lucene.search.DefaultSimilarity#decodeNormValue(byte)
* @throws StaleReaderException if the index has changed
* since this reader was opened
* @throws CorruptIndexException if the index is corrupt

View File

@ -28,6 +28,7 @@ import org.apache.lucene.index.values.IndexDocValues;
import org.apache.lucene.index.values.MultiIndexDocValues;
import org.apache.lucene.index.values.ValueType;
import org.apache.lucene.index.values.MultiIndexDocValues.DocValuesIndex;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.ReaderUtil;
import org.apache.lucene.util.ReaderUtil.Gather;
@ -151,20 +152,7 @@ public class MultiPerDocValues extends PerDocValues {
}
public void close() throws IOException {
final PerDocValues[] perDocValues = this.subs;
IOException ex = null;
for (PerDocValues values : perDocValues) {
try {
values.close();
} catch (IOException e) {
if (ex == null) {
ex = e;
}
}
}
if (ex != null) {
throw ex;
}
IOUtils.closeSafely(false, this.subs);
}
@Override

View File

@ -72,8 +72,7 @@ final class NormsWriterPerField extends InvertedDocEndConsumerPerField implement
assert norms.length == upto;
norms = ArrayUtil.grow(norms, 1+upto);
}
final float norm = similarity.computeNorm(fieldState);
norms[upto] = similarity.encodeNormValue(norm);
norms[upto] = similarity.computeNorm(fieldState);
docIDs[upto] = docState.docID;
upto++;
}

View File

@ -51,7 +51,7 @@ final class PerFieldCodecWrapper extends Codec {
private final SegmentCodecs segmentCodecs;
PerFieldCodecWrapper(SegmentCodecs segmentCodecs) {
name = "PerField";
super("PerField");
this.segmentCodecs = segmentCodecs;
}

View File

@ -25,6 +25,7 @@ import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.index.codecs.PerDocValues;
import org.apache.lucene.store.CompoundFileDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
/** Holds core readers that are shared (unchanged) when
* SegmentReader is cloned or reopened */
@ -119,33 +120,9 @@ final class SegmentCoreReaders {
}
synchronized void decRef() throws IOException {
if (ref.decrementAndGet() == 0) {
if (fields != null) {
fields.close();
}
if (perDocProducer != null) {
perDocProducer.close();
}
if (termVectorsReaderOrig != null) {
termVectorsReaderOrig.close();
}
if (fieldsReaderOrig != null) {
fieldsReaderOrig.close();
}
if (cfsReader != null) {
cfsReader.close();
}
if (storeCFSReader != null) {
storeCFSReader.close();
}
IOUtils.closeSafely(false, fields, perDocProducer, termVectorsReaderOrig,
fieldsReaderOrig, cfsReader, storeCFSReader);
// Now, notify any ReaderFinished listeners:
if (owner != null) {
owner.notifyReaderFinishedListeners();

View File

@ -115,6 +115,7 @@ final class SegmentMerger {
mergedDocs = mergeFields();
mergeTerms();
mergePerDoc();
mergeNorms();
if (fieldInfos.hasVectors())
@ -483,17 +484,10 @@ final class SegmentMerger {
int docBase = 0;
final List<Fields> fields = new ArrayList<Fields>();
final List<ReaderUtil.Slice> slices = new ArrayList<ReaderUtil.Slice>();
final List<Bits> bits = new ArrayList<Bits>();
final List<Integer> bitsStarts = new ArrayList<Integer>();
// TODO: move this into its own method - this merges currently only docvalues
final List<PerDocValues> perDocProducers = new ArrayList<PerDocValues>();
final List<ReaderUtil.Slice> perDocSlices = new ArrayList<ReaderUtil.Slice>();
final List<Bits> perDocBits = new ArrayList<Bits>();
final List<Integer> perDocBitsStarts = new ArrayList<Integer>();
for(IndexReader r : readers) {
final Fields f = r.fields();
final int maxDoc = r.maxDoc();
@ -503,18 +497,10 @@ final class SegmentMerger {
bits.add(r.getLiveDocs());
bitsStarts.add(docBase);
}
final PerDocValues producer = r.perDocValues();
if (producer != null) {
perDocSlices.add(new ReaderUtil.Slice(docBase, maxDoc, fields.size()));
perDocProducers.add(producer);
perDocBits.add(r.getLiveDocs());
perDocBitsStarts.add(docBase);
}
docBase += maxDoc;
}
bitsStarts.add(docBase);
perDocBitsStarts.add(docBase);
// we may gather more readers than mergeState.readerCount
mergeState = new MergeState();
@ -580,19 +566,45 @@ final class SegmentMerger {
} finally {
consumer.close();
}
}
private void mergePerDoc() throws IOException {
final List<PerDocValues> perDocProducers = new ArrayList<PerDocValues>();
final List<ReaderUtil.Slice> perDocSlices = new ArrayList<ReaderUtil.Slice>();
final List<Bits> perDocBits = new ArrayList<Bits>();
final List<Integer> perDocBitsStarts = new ArrayList<Integer>();
int docBase = 0;
for (IndexReader r : readers) {
final int maxDoc = r.maxDoc();
final PerDocValues producer = r.perDocValues();
if (producer != null) {
perDocSlices.add(new ReaderUtil.Slice(docBase, maxDoc, perDocProducers
.size()));
perDocProducers.add(producer);
perDocBits.add(r.getLiveDocs());
perDocBitsStarts.add(docBase);
}
docBase += maxDoc;
}
perDocBitsStarts.add(docBase);
if (!perDocSlices.isEmpty()) {
mergeState.multiLiveDocs = new MultiBits(perDocBits, perDocBitsStarts, true);
mergeState.multiLiveDocs = new MultiBits(perDocBits, perDocBitsStarts,
true);
final PerDocConsumer docsConsumer = codec
.docsConsumer(new PerDocWriteState(segmentWriteState));
boolean success = false;
try {
final MultiPerDocValues multiPerDocValues = new MultiPerDocValues(perDocProducers
.toArray(PerDocValues.EMPTY_ARRAY), perDocSlices
.toArray(ReaderUtil.Slice.EMPTY_ARRAY));
final MultiPerDocValues multiPerDocValues = new MultiPerDocValues(
perDocProducers.toArray(PerDocValues.EMPTY_ARRAY),
perDocSlices.toArray(ReaderUtil.Slice.EMPTY_ARRAY));
docsConsumer.merge(mergeState, multiPerDocValues);
success = true;
} finally {
docsConsumer.close();
IOUtils.closeSafely(!success, docsConsumer);
}
}
/* don't close the perDocProducers here since they are private segment producers
* and will be closed once the SegmentReader goes out of scope */
}
private MergeState mergeState;

View File

@ -22,11 +22,13 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import java.io.Closeable;
import java.io.IOException;
import java.util.Arrays;
class TermVectorsReader implements Cloneable {
class TermVectorsReader implements Cloneable, Closeable {
// NOTE: if you make a new format, it must be larger than
// the current format
@ -192,14 +194,8 @@ class TermVectorsReader implements Cloneable {
return format;
}
void close() throws IOException {
// make all effort to close up. Keep the first exception
// and throw it as a new one.
IOException keep = null;
if (tvx != null) try { tvx.close(); } catch (IOException e) { keep = e; }
if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (tvf != null) try { tvf.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (keep != null) throw (IOException) keep.fillInStackTrace();
public void close() throws IOException {
IOUtils.closeSafely(false, tvx, tvd, tvf);
}
/**

View File

@ -18,6 +18,7 @@ package org.apache.lucene.index.codecs;
*/
import java.io.IOException;
import java.util.Comparator;
import java.util.Set;
import org.apache.lucene.index.PerDocWriteState;
@ -25,13 +26,21 @@ import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
/** @lucene.experimental */
public abstract class Codec {
public static final Codec[] EMPTY = new Codec[0];
/** Unique name that's used to retrieve this codec when
* reading the index */
public String name;
public final String name;
private boolean dvUseCompoundFile = true;
private Comparator<BytesRef> docValuesSortComparator = BytesRef
.getUTF8SortedAsUnicodeComparator();
protected Codec(String name) {
this.name = name;
}
/** Writes a new segment */
public abstract FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException;
@ -69,6 +78,47 @@ public abstract class Codec {
/** Records all file extensions this codec uses */
public abstract void getExtensions(Set<String> extensions);
/**
* If set to <code>true</code> this codec will use a compound file for
* IndexDocValues, otherwise each IndexDocValues field will create up to 2
* files per segment.
* <p>
* NOTE: The default value is <code>true</code>.
*/
public void setDocValuesUseCFS(boolean docValuesUseCFS) {
this.dvUseCompoundFile = docValuesUseCFS;
}
/**
* Returns <code>true</code> iff compound file should be used for
* IndexDocValues, otherwise <code>false</code>.
*
* @see #setDocValuesUseCFS(boolean)
* @return <code>true</code> iff compound file should be used for
* IndexDocValues, otherwise <code>false</code>.
*/
public boolean getDocValuesUseCFS() {
return dvUseCompoundFile;
}
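// Editor's sketch, not part of the patch: a codec instance can opt out of the
// doc values compound file through these accessors, e.g.
//   Codec codec = new StandardCodec();
//   codec.setDocValuesUseCFS(false);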
/**
* Sets the {@link BytesRef} comparator for sorted IndexDocValue variants. The
* default is {@link BytesRef#getUTF8SortedAsUnicodeComparator()}.
*/
public void setDocValuesSortComparator(
Comparator<BytesRef> docValuesSortComparator) {
this.docValuesSortComparator = docValuesSortComparator;
}
/**
* Returns the {@link BytesRef} comparator for sorted IndexDocValue variants.
* The default is {@link BytesRef#getUTF8SortedAsUnicodeComparator()}.
*/
public Comparator<BytesRef> getDocValuesSortComparator() {
return docValuesSortComparator;
}
@Override
public String toString() {
return name;

View File

@ -44,7 +44,7 @@ public class CoreCodecProvider extends CodecProvider {
public CoreCodecProvider() {
register(new StandardCodec());
register(new PreFlexCodec());
register(new PulsingCodec(1));
register(new PulsingCodec());
register(new SimpleTextCodec());
register(new MemoryCodec());
}

View File

@ -31,79 +31,102 @@ import org.apache.lucene.index.values.Writer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
/**
*
* @lucene.experimental
*/
public class DefaultDocValuesConsumer extends PerDocConsumer {
private final String segmentName;
private final int codecId;
private final Directory directory;
private final AtomicLong bytesUsed;
private final Comparator<BytesRef> comparator;
private boolean useCompoundFile;
public DefaultDocValuesConsumer(PerDocWriteState state, Comparator<BytesRef> comparator) {
public DefaultDocValuesConsumer(PerDocWriteState state, Comparator<BytesRef> comparator, boolean useCompoundFile) throws IOException {
this.segmentName = state.segmentName;
this.codecId = state.codecId;
this.bytesUsed = state.bytesUsed;
this.directory = state.directory;
//TODO maybe we should enable a global CFS that all codecs can pull on demand to further reduce the number of files?
this.directory = useCompoundFile ? state.directory.createCompoundOutput(IndexFileNames.segmentFileName(segmentName, state.codecId, IndexFileNames.COMPOUND_FILE_EXTENSION)) : state.directory;
this.comparator = comparator;
this.useCompoundFile = useCompoundFile;
}
public void close() throws IOException {
if (useCompoundFile) {
this.directory.close();
}
}
@Override
public DocValuesConsumer addValuesField(FieldInfo field) throws IOException {
return Writer.create(field.getDocValues(),
docValuesId(segmentName, codecId, field.number),
// TODO can we have a compound file per segment and codec for
// docvalues?
directory, comparator, bytesUsed);
}
@SuppressWarnings("fallthrough")
public static void files(Directory dir, SegmentInfo segmentInfo, int codecId,
Set<String> files) throws IOException {
Set<String> files, boolean useCompoundFile) throws IOException {
FieldInfos fieldInfos = segmentInfo.getFieldInfos();
for (FieldInfo fieldInfo : fieldInfos) {
if (fieldInfo.getCodecId() == codecId && fieldInfo.hasDocValues()) {
String filename = docValuesId(segmentInfo.name, codecId,
fieldInfo.number);
switch (fieldInfo.getDocValues()) {
case BYTES_FIXED_DEREF:
case BYTES_VAR_DEREF:
case BYTES_VAR_SORTED:
case BYTES_FIXED_SORTED:
case BYTES_VAR_STRAIGHT:
files.add(IndexFileNames.segmentFileName(filename, "",
Writer.INDEX_EXTENSION));
assert dir.fileExists(IndexFileNames.segmentFileName(filename, "",
Writer.INDEX_EXTENSION));
// until here all types use an index
case BYTES_FIXED_STRAIGHT:
case FLOAT_32:
case FLOAT_64:
case VAR_INTS:
case FIXED_INTS_16:
case FIXED_INTS_32:
case FIXED_INTS_64:
case FIXED_INTS_8:
files.add(IndexFileNames.segmentFileName(filename, "",
Writer.DATA_EXTENSION));
assert dir.fileExists(IndexFileNames.segmentFileName(filename, "",
Writer.DATA_EXTENSION));
break;
if (useCompoundFile) {
files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, IndexFileNames.COMPOUND_FILE_EXTENSION));
files.add(IndexFileNames.segmentFileName(segmentInfo.name, codecId, IndexFileNames.COMPOUND_FILE_ENTRIES_EXTENSION));
assert dir.fileExists(IndexFileNames.segmentFileName(segmentInfo.name, codecId, IndexFileNames.COMPOUND_FILE_ENTRIES_EXTENSION));
assert dir.fileExists(IndexFileNames.segmentFileName(segmentInfo.name, codecId, IndexFileNames.COMPOUND_FILE_EXTENSION));
return;
} else {
switch (fieldInfo.getDocValues()) {
case BYTES_FIXED_DEREF:
case BYTES_VAR_DEREF:
case BYTES_VAR_SORTED:
case BYTES_FIXED_SORTED:
case BYTES_VAR_STRAIGHT:
files.add(IndexFileNames.segmentFileName(filename, "",
Writer.INDEX_EXTENSION));
assert dir.fileExists(IndexFileNames.segmentFileName(filename, "",
Writer.INDEX_EXTENSION));
// until here all types use an index
case BYTES_FIXED_STRAIGHT:
case FLOAT_32:
case FLOAT_64:
case VAR_INTS:
case FIXED_INTS_16:
case FIXED_INTS_32:
case FIXED_INTS_64:
case FIXED_INTS_8:
files.add(IndexFileNames.segmentFileName(filename, "",
Writer.DATA_EXTENSION));
assert dir.fileExists(IndexFileNames.segmentFileName(filename, "",
Writer.DATA_EXTENSION));
break;
default:
assert false;
default:
assert false;
}
}
}
}
}
static String docValuesId(String segmentsName, int codecID, int fieldId) {
return segmentsName + "_" + codecID + "-" + fieldId;
}
public static void getDocValuesExtensions(Set<String> extensions) {
extensions.add(Writer.DATA_EXTENSION);
extensions.add(Writer.INDEX_EXTENSION);
public static void getDocValuesExtensions(Set<String> extensions, boolean useCompoundFile) {
if (useCompoundFile) {
extensions.add(IndexFileNames.COMPOUND_FILE_ENTRIES_EXTENSION);
extensions.add(IndexFileNames.COMPOUND_FILE_EXTENSION);
} else {
extensions.add(Writer.DATA_EXTENSION);
extensions.add(Writer.INDEX_EXTENSION);
}
}
}

View File

@ -16,12 +16,16 @@ package org.apache.lucene.index.codecs;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.TreeMap;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.values.Bytes;
import org.apache.lucene.index.values.IndexDocValues;
@ -29,6 +33,8 @@ import org.apache.lucene.index.values.Floats;
import org.apache.lucene.index.values.Ints;
import org.apache.lucene.index.values.ValueType;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
/**
* Abstract base class for FieldsProducer implementations supporting
@ -39,8 +45,12 @@ import org.apache.lucene.store.Directory;
public class DefaultDocValuesProducer extends PerDocValues {
protected final TreeMap<String, IndexDocValues> docValues;
private final boolean useCompoundFile;
private final Closeable cfs;
private final Comparator<BytesRef> sortComparator;
/**
*
* Creates a new {@link DefaultDocValuesProducer} instance and loads all
* {@link IndexDocValues} instances for this segment and codec.
*
@ -52,12 +62,27 @@ public class DefaultDocValuesProducer extends PerDocValues {
* the {@link FieldInfos}
* @param codecId
* the codec ID
* @param useCompoundFile
* if <code>true</code> this producer opens a compound file to read
* IndexDocValues fields, otherwise each field defines its own set of
* files.
* @param sortComparator
* defines the sort order for sorted IndexDocValues variants
* @throws IOException
* if an {@link IOException} occurs
*/
public DefaultDocValuesProducer(SegmentInfo si, Directory dir,
FieldInfos fieldInfo, int codecId) throws IOException {
docValues = load(fieldInfo, si.name, si.docCount, dir, codecId);
FieldInfos fieldInfo, int codecId, boolean useCompoundFile, Comparator<BytesRef> sortComparator) throws IOException {
this.useCompoundFile = useCompoundFile;
this.sortComparator = sortComparator;
final Directory directory;
if (useCompoundFile) {
cfs = directory = dir.openCompoundInput(IndexFileNames.segmentFileName(si.name, codecId, IndexFileNames.COMPOUND_FILE_EXTENSION), 1024);
} else {
cfs = null;
directory = dir;
}
docValues = load(fieldInfo, si.name, si.docCount, directory, codecId);
}
/**
@ -85,14 +110,14 @@ public class DefaultDocValuesProducer extends PerDocValues {
final String id = DefaultDocValuesConsumer.docValuesId(segment,
codecId, fieldInfo.number);
values.put(field,
loadDocValues(docCount, dir, id, fieldInfo.getDocValues()));
loadDocValues(docCount, dir, id, fieldInfo.getDocValues(), sortComparator));
}
}
success = true;
} finally {
if (!success) {
// if we fail we must close all opened resources if there are any
closeDocValues(values.values());
closeInternal(values.values());
}
}
return values;
@ -112,6 +137,7 @@ public class DefaultDocValuesProducer extends PerDocValues {
* the unique file ID within the segment
* @param type
* the type to load
* @param sortComparator byte comparator used by sorted variants
* @return a {@link IndexDocValues} instance for the given type
* @throws IOException
* if an {@link IOException} occurs
@ -119,7 +145,7 @@ public class DefaultDocValuesProducer extends PerDocValues {
* if the given {@link ValueType} is not supported
*/
protected IndexDocValues loadDocValues(int docCount, Directory dir, String id,
ValueType type) throws IOException {
ValueType type, Comparator<BytesRef> sortComparator) throws IOException {
switch (type) {
case FIXED_INTS_16:
case FIXED_INTS_32:
@ -132,39 +158,36 @@ public class DefaultDocValuesProducer extends PerDocValues {
case FLOAT_64:
return Floats.getValues(dir, id, docCount);
case BYTES_FIXED_STRAIGHT:
return Bytes.getValues(dir, id, Bytes.Mode.STRAIGHT, true, docCount);
return Bytes.getValues(dir, id, Bytes.Mode.STRAIGHT, true, docCount, sortComparator);
case BYTES_FIXED_DEREF:
return Bytes.getValues(dir, id, Bytes.Mode.DEREF, true, docCount);
return Bytes.getValues(dir, id, Bytes.Mode.DEREF, true, docCount, sortComparator);
case BYTES_FIXED_SORTED:
return Bytes.getValues(dir, id, Bytes.Mode.SORTED, true, docCount);
return Bytes.getValues(dir, id, Bytes.Mode.SORTED, true, docCount, sortComparator);
case BYTES_VAR_STRAIGHT:
return Bytes.getValues(dir, id, Bytes.Mode.STRAIGHT, false, docCount);
return Bytes.getValues(dir, id, Bytes.Mode.STRAIGHT, false, docCount, sortComparator);
case BYTES_VAR_DEREF:
return Bytes.getValues(dir, id, Bytes.Mode.DEREF, false, docCount);
return Bytes.getValues(dir, id, Bytes.Mode.DEREF, false, docCount, sortComparator);
case BYTES_VAR_SORTED:
return Bytes.getValues(dir, id, Bytes.Mode.SORTED, false, docCount);
return Bytes.getValues(dir, id, Bytes.Mode.SORTED, false, docCount, sortComparator);
default:
throw new IllegalStateException("unrecognized index values mode " + type);
}
}
public void close() throws IOException {
closeDocValues(docValues.values());
closeInternal(docValues.values());
}
private void closeDocValues(final Collection<IndexDocValues> values)
throws IOException {
IOException ex = null;
for (IndexDocValues docValues : values) {
try {
docValues.close();
} catch (IOException e) {
ex = e;
}
}
if (ex != null) {
throw ex;
private void closeInternal(Collection<? extends Closeable> closeables) throws IOException {
final Collection<? extends Closeable> toClose;
if (useCompoundFile) {
final ArrayList<Closeable> list = new ArrayList<Closeable>(closeables);
list.add(cfs);
toClose = list;
} else {
toClose = closeables;
}
IOUtils.closeSafely(false, toClose);
}
@Override

View File

@ -78,7 +78,7 @@ import org.apache.lucene.util.fst.FST;
public class MemoryCodec extends Codec {
public MemoryCodec() {
name = "Memory";
super("Memory");
}
private static final boolean VERBOSE = false;
@ -778,22 +778,22 @@ public class MemoryCodec extends Codec {
@Override
public void files(Directory dir, SegmentInfo segmentInfo, int id, Set<String> files) throws IOException {
files.add(IndexFileNames.segmentFileName(segmentInfo.name, id, EXTENSION));
DefaultDocValuesConsumer.files(dir, segmentInfo, id, files);
DefaultDocValuesConsumer.files(dir, segmentInfo, id, files, getDocValuesUseCFS());
}
@Override
public void getExtensions(Set<String> extensions) {
extensions.add(EXTENSION);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS());
}
@Override
public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator());
return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS());
}
@Override
public PerDocValues docsProducer(SegmentReadState state) throws IOException {
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId);
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator());
}
}

View File

@ -55,7 +55,7 @@ public class PreFlexCodec extends Codec {
public static final String PROX_EXTENSION = "prx";
public PreFlexCodec() {
name = "PreFlex";
super("PreFlex");
}
@Override

View File

@ -43,7 +43,6 @@ import org.apache.lucene.index.codecs.TermsIndexReaderBase;
import org.apache.lucene.index.codecs.TermsIndexWriterBase;
import org.apache.lucene.index.codecs.standard.StandardCodec;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
/** This codec "inlines" the postings for terms that have
@ -58,10 +57,19 @@ public class PulsingCodec extends Codec {
private final int freqCutoff;
/**
* Creates a {@link PulsingCodec} with <tt>freqCutoff = 1</tt>
*
* @see PulsingCodec#PulsingCodec(int)
*/
public PulsingCodec() {
this(1);
}
/** Terms with freq <= freqCutoff are inlined into terms
* dict. */
public PulsingCodec(int freqCutoff) {
name = "Pulsing";
super("Pulsing");
this.freqCutoff = freqCutoff;
}
@ -157,22 +165,22 @@ public class PulsingCodec extends Codec {
StandardPostingsReader.files(dir, segmentInfo, id, files);
BlockTermsReader.files(dir, segmentInfo, id, files);
VariableGapTermsIndexReader.files(dir, segmentInfo, id, files);
DefaultDocValuesConsumer.files(dir, segmentInfo, id, files);
DefaultDocValuesConsumer.files(dir, segmentInfo, id, files, getDocValuesUseCFS());
}
@Override
public void getExtensions(Set<String> extensions) {
StandardCodec.getStandardExtensions(extensions);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS());
}
@Override
public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator());
return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS());
}
@Override
public PerDocValues docsProducer(SegmentReadState state) throws IOException {
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId);
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator());
}
}
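A brief usage sketch under the new constructors: both chain to super("Pulsing"), and the cutoff controls which terms are inlined into the terms dictionary. The values here are illustrative.
Codec defaultPulsing = new PulsingCodec();      // freqCutoff = 1, via the delegating constructor
Codec aggressivePulsing = new PulsingCodec(10); // inline postings for terms with freq <= 10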

View File

@ -33,7 +33,6 @@ import org.apache.lucene.index.codecs.PerDocConsumer;
import org.apache.lucene.index.codecs.DefaultDocValuesConsumer;
import org.apache.lucene.index.codecs.PerDocValues;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
/** For debugging, curiosity, transparency only!! Do not
* use this codec in production.
@ -46,9 +45,10 @@ import org.apache.lucene.util.BytesRef;
public class SimpleTextCodec extends Codec {
public SimpleTextCodec() {
name = "SimpleText";
super("SimpleText");
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
return new SimpleTextFieldsWriter(state);
@ -69,23 +69,23 @@ public class SimpleTextCodec extends Codec {
@Override
public void files(Directory dir, SegmentInfo segmentInfo, int id, Set<String> files) throws IOException {
files.add(getPostingsFileName(segmentInfo.name, id));
DefaultDocValuesConsumer.files(dir, segmentInfo, id, files);
DefaultDocValuesConsumer.files(dir, segmentInfo, id, files, getDocValuesUseCFS());
}
@Override
public void getExtensions(Set<String> extensions) {
extensions.add(POSTINGS_EXTENSION);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS());
}
// TODO: would be great if these used a plain text impl
@Override
public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator());
return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS());
}
@Override
public PerDocValues docsProducer(SegmentReadState state) throws IOException {
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId);
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator());
}
}

View File

@ -40,14 +40,13 @@ import org.apache.lucene.index.codecs.BlockTermsWriter;
import org.apache.lucene.index.codecs.BlockTermsReader;
import org.apache.lucene.index.codecs.DefaultDocValuesProducer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
/** Default codec.
* @lucene.experimental */
public class StandardCodec extends Codec {
public StandardCodec() {
name = "Standard";
super("Standard");
}
@Override
@ -140,13 +139,13 @@ public class StandardCodec extends Codec {
StandardPostingsReader.files(dir, segmentInfo, id, files);
BlockTermsReader.files(dir, segmentInfo, id, files);
VariableGapTermsIndexReader.files(dir, segmentInfo, id, files);
DefaultDocValuesConsumer.files(dir, segmentInfo, id, files);
DefaultDocValuesConsumer.files(dir, segmentInfo, id, files, getDocValuesUseCFS());
}
@Override
public void getExtensions(Set<String> extensions) {
getStandardExtensions(extensions);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS());
}
public static void getStandardExtensions(Set<String> extensions) {
@ -158,11 +157,11 @@ public class StandardCodec extends Codec {
@Override
public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator());
return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS());
}
@Override
public PerDocValues docsProducer(SegmentReadState state) throws IOException {
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId);
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator());
}
}

View File

@ -151,12 +151,13 @@ public final class Bytes {
* otherwise <code>false</code>
* @param maxDoc
* the number of document values stored for the given ID
* @param sortComparator byte comparator used by sorted variants
* @return an initialized {@link IndexDocValues} instance.
* @throws IOException
* if an {@link IOException} occurs
*/
public static IndexDocValues getValues(Directory dir, String id, Mode mode,
boolean fixedSize, int maxDoc) throws IOException {
boolean fixedSize, int maxDoc, Comparator<BytesRef> sortComparator) throws IOException {
// TODO -- I can peek @ header to determine fixed/mode?
if (fixedSize) {
if (mode == Mode.STRAIGHT) {
@ -172,7 +173,7 @@ public final class Bytes {
} else if (mode == Mode.DEREF) {
return new VarDerefBytesImpl.Reader(dir, id, maxDoc);
} else if (mode == Mode.SORTED) {
return new VarSortedBytesImpl.Reader(dir, id, maxDoc);
return new VarSortedBytesImpl.Reader(dir, id, maxDoc, sortComparator);
}
}
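A minimal call sketch for the widened signature, assuming dir, id and maxDoc as in the surrounding code; passing the UTF-8 BytesRef comparator mirrors what the codecs above supply for their sorted variants.
IndexDocValues values = Bytes.getValues(dir, id, Bytes.Mode.SORTED,
false /* fixedSize */, maxDoc, BytesRef.getUTF8SortedAsUnicodeComparator());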

View File

@ -131,6 +131,18 @@ public abstract class IndexDocValues implements Closeable {
return cache.loadSorted(this, comparator);
}
/**
* Returns a {@link SortedSource} instance using a default {@link BytesRef}
* comparator for this {@link IndexDocValues} field instance like
* {@link #getSource()}.
* <p>
* This method will return null iff this {@link IndexDocValues} represents a
* {@link Source} instead of a {@link SortedSource}.
*/
public SortedSource getSortedSorted() throws IOException {
return getSortedSorted(null);
}
/**
* Loads and returns a {@link SortedSource} instance for this
* {@link IndexDocValues} field instance like {@link #load()}.
@ -143,6 +155,18 @@ public abstract class IndexDocValues implements Closeable {
throw new UnsupportedOperationException();
}
/**
* Loads and returns a {@link SortedSource} instance using a default
* {@link BytesRef} comparator for this {@link IndexDocValues} field instance
* like {@link #load()}.
* <p>
* This method will return null iff this {@link IndexDocValues} represents a
* {@link Source} instead of a {@link SortedSource}.
*/
public SortedSource loadSorted() throws IOException {
return loadSorted(null);
}
/**
* Returns the {@link ValueType} of this {@link IndexDocValues} instance
*/
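A hedged usage sketch of the two new convenience methods: each delegates to its comparator-taking variant with null, which implementations resolve to their default BytesRef comparator (see VarSortedBytesImpl.Reader below). Here reader is assumed to be an open IndexReader exposing docValues(String).
IndexDocValues docValues = reader.docValues("field");
SortedSource cached = docValues.getSortedSorted(); // cached source, default comparator
SortedSource loaded = docValues.loadSorted();      // freshly loaded, default comparator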

View File

@ -167,14 +167,16 @@ class VarSortedBytesImpl {
public static class Reader extends BytesReaderBase {
Reader(Directory dir, String id, int maxDoc) throws IOException {
private final Comparator<BytesRef> defaultComp;
Reader(Directory dir, String id, int maxDoc, Comparator<BytesRef> comparator) throws IOException {
super(dir, id, CODEC_NAME, VERSION_START, true);
this.defaultComp = comparator;
}
@Override
public org.apache.lucene.index.values.IndexDocValues.Source load()
throws IOException {
return loadSorted(null);
return loadSorted(defaultComp);
}
@Override

View File

@ -183,14 +183,11 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
public Query getQuery() { return BooleanQuery.this; }
@Override
public float getValue() { return getBoost(); }
@Override
public float sumOfSquaredWeights() throws IOException {
public float getValueForNormalization() throws IOException {
float sum = 0.0f;
for (int i = 0 ; i < weights.size(); i++) {
// call getValueForNormalization for all clauses in case of side effects
float s = weights.get(i).sumOfSquaredWeights(); // sum sub weights
float s = weights.get(i).getValueForNormalization(); // sum sub weights
if (!clauses.get(i).isProhibited())
// only add to sum for non-prohibited clauses
sum += s;
@ -206,11 +203,11 @@ public class BooleanQuery extends Query implements Iterable<BooleanClause> {
}
@Override
public void normalize(float norm) {
norm *= getBoost(); // incorporate boost
public void normalize(float norm, float topLevelBoost) {
topLevelBoost *= getBoost(); // incorporate boost
for (Weight w : weights) {
// normalize all clauses (even if prohibited, in case of side effects)
w.normalize(norm);
w.normalize(norm, topLevelBoost);
}
}

View File

@ -27,7 +27,7 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.PerReaderTermState;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
@ -77,7 +77,7 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
}
@Override
protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/, PerReaderTermState states) {
protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost /*ignored*/, TermContext states) {
topLevel.add(new TermQuery(term, states), BooleanClause.Occur.SHOULD);
}
@ -140,9 +140,9 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
assert termState != null;
if (pos < 0) {
pos = (-pos)-1;
array.termState[pos].register(termState, readerContext.ord, termsEnum.docFreq());
array.termState[pos].register(termState, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
} else {
array.termState[pos] = new PerReaderTermState(topReaderContext, termState, readerContext.ord, termsEnum.docFreq());
array.termState[pos] = new TermContext(topReaderContext, termState, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
}
return true;
}
@ -183,9 +183,9 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
return true;
}
/** Special implementation of BytesStartArray that keeps parallel arrays for {@link PerReaderTermState} */
/** Special implementation of BytesStartArray that keeps parallel arrays for {@link TermContext} */
static final class TermStateByteStart extends DirectBytesStartArray {
PerReaderTermState[] termState;
TermContext[] termState;
public TermStateByteStart(int initSize) {
super(initSize);
@ -194,7 +194,7 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
@Override
public int[] init() {
final int[] ord = super.init();
termState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
termState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
assert termState.length >= ord.length;
return ord;
}
@ -203,7 +203,7 @@ class ConstantScoreAutoRewrite extends TermCollectingRewrite<BooleanQuery> {
public int[] grow() {
final int[] ord = super.grow();
if (termState.length < ord.length) {
PerReaderTermState[] tmpTermState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
TermContext[] tmpTermState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(termState, 0, tmpTermState, 0, termState.length);
termState = tmpTermState;
}
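A short sketch of the renamed class in use, assuming a top-level ReaderContext and a Term as in the rewrite code above: a TermContext is built once against the top reader and now carries totalTermFreq alongside docFreq for each registered per-segment state.
TermContext states = TermContext.build(topReaderContext, term, true); // as MultiPhraseQuery does below
TermQuery query = new TermQuery(term, states); // reuses the cached per-segment term states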

View File

@ -110,24 +110,19 @@ public class ConstantScoreQuery extends Query {
}
@Override
public float getValue() {
return queryWeight;
}
@Override
public float sumOfSquaredWeights() throws IOException {
public float getValueForNormalization() throws IOException {
// we calculate the normalization value of the inner weight, but ignore it (just to initialize everything)
if (innerWeight != null) innerWeight.sumOfSquaredWeights();
if (innerWeight != null) innerWeight.getValueForNormalization();
queryWeight = getBoost();
return queryWeight * queryWeight;
}
@Override
public void normalize(float norm) {
this.queryNorm = norm;
public void normalize(float norm, float topLevelBoost) {
this.queryNorm = norm * topLevelBoost;
queryWeight *= this.queryNorm;
// we normalize the inner weight, but ignore it (just to initialize everything)
if (innerWeight != null) innerWeight.normalize(norm);
if (innerWeight != null) innerWeight.normalize(norm, topLevelBoost);
}
@Override
@ -148,7 +143,7 @@ public class ConstantScoreQuery extends Query {
if (disi == null) {
return null;
}
return new ConstantScorer(disi, this);
return new ConstantScorer(disi, this, queryWeight);
}
@Override
@ -181,9 +176,9 @@ public class ConstantScoreQuery extends Query {
final DocIdSetIterator docIdSetIterator;
final float theScore;
public ConstantScorer(DocIdSetIterator docIdSetIterator, Weight w) throws IOException {
public ConstantScorer(DocIdSetIterator docIdSetIterator, Weight w, float theScore) throws IOException {
super(w);
theScore = w.getValue();
this.theScore = theScore;
this.docIdSetIterator = docIdSetIterator;
}
@ -212,7 +207,7 @@ public class ConstantScoreQuery extends Query {
@Override
public void setScorer(Scorer scorer) throws IOException {
// we must wrap again here, but using the scorer passed in as parameter:
collector.setScorer(new ConstantScorer(scorer, ConstantScorer.this.weight));
collector.setScorer(new ConstantScorer(scorer, ConstantScorer.this.weight, ConstantScorer.this.theScore));
}
@Override

View File

@ -20,7 +20,7 @@ import org.apache.lucene.index.FieldInvertState;
*/
/** Expert: Default scoring implementation. */
public class DefaultSimilarity extends Similarity {
public class DefaultSimilarity extends TFIDFSimilarity {
/** Implemented as
* <code>state.getBoost()*lengthNorm(numTerms)</code>, where
@ -31,13 +31,13 @@ public class DefaultSimilarity extends Similarity {
*
* @lucene.experimental */
@Override
public float computeNorm(FieldInvertState state) {
public byte computeNorm(FieldInvertState state) {
final int numTerms;
if (discountOverlaps)
numTerms = state.getLength() - state.getNumOverlap();
else
numTerms = state.getLength();
return state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms)));
return encodeNormValue(state.getBoost() * ((float) (1.0 / Math.sqrt(numTerms))));
}
/** Implemented as <code>sqrt(freq)</code>. */
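A minimal sketch, assuming only the API visible in this hunk: since computeNorm now returns the encoded byte itself, a subclass that wants to drop length normalization can encode just the index-time boost. The class name is hypothetical.
import org.apache.lucene.index.FieldInvertState;
public class BoostOnlySimilarity extends DefaultSimilarity {
@Override
public byte computeNorm(FieldInvertState state) {
// skip the 1/sqrt(numTerms) length factor; keep only the field boost
return encodeNormValue(state.getBoost());
}
}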

View File

@ -110,16 +110,12 @@ public class DisjunctionMaxQuery extends Query implements Iterable<Query> {
@Override
public Query getQuery() { return DisjunctionMaxQuery.this; }
/** Return our boost */
@Override
public float getValue() { return getBoost(); }
/** Compute the sum of squared weights of us applied to our subqueries. Used for normalization. */
@Override
public float sumOfSquaredWeights() throws IOException {
public float getValueForNormalization() throws IOException {
float max = 0.0f, sum = 0.0f;
for (Weight currentWeight : weights) {
float sub = currentWeight.sumOfSquaredWeights();
float sub = currentWeight.getValueForNormalization();
sum += sub;
max = Math.max(max, sub);
@ -130,10 +126,10 @@ public class DisjunctionMaxQuery extends Query implements Iterable<Query> {
/** Apply the computed normalization factor to our subqueries */
@Override
public void normalize(float norm) {
norm *= getBoost(); // Incorporate our boost
public void normalize(float norm, float topLevelBoost) {
topLevelBoost *= getBoost(); // Incorporate our boost
for (Weight wt : weights) {
wt.normalize(norm);
wt.normalize(norm, topLevelBoost);
}
}

View File

@ -23,12 +23,6 @@ import java.util.Arrays;
import org.apache.lucene.index.*;
final class ExactPhraseScorer extends Scorer {
private final byte[] norms;
private final float value;
private static final int SCORE_CACHE_SIZE = 32;
private final float[] scoreCache = new float[SCORE_CACHE_SIZE];
private final int endMinus1;
private final static int CHUNK = 4096;
@ -60,14 +54,12 @@ final class ExactPhraseScorer extends Scorer {
private int docID = -1;
private int freq;
private final Similarity similarity;
private final Similarity.ExactDocScorer docScorer;
ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
Similarity similarity, byte[] norms) throws IOException {
Similarity.ExactDocScorer docScorer) throws IOException {
super(weight);
this.similarity = similarity;
this.norms = norms;
this.value = weight.getValue();
this.docScorer = docScorer;
chunkStates = new ChunkState[postings.length];
@ -88,10 +80,6 @@ final class ExactPhraseScorer extends Scorer {
return;
}
}
for (int i = 0; i < SCORE_CACHE_SIZE; i++) {
scoreCache[i] = similarity.tf((float) i) * value;
}
}
@Override
@ -206,13 +194,7 @@ final class ExactPhraseScorer extends Scorer {
@Override
public float score() throws IOException {
final float raw; // raw score
if (freq < SCORE_CACHE_SIZE) {
raw = scoreCache[freq];
} else {
raw = similarity.tf((float) freq) * value;
}
return norms == null ? raw : raw * similarity.decodeNormValue(norms[docID]); // normalize
return docScorer.score(docID, freq);
}
private int phraseFreq() throws IOException {
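A hedged fragment showing the delegation that replaces the removed tf() score cache and norm decoding above, with similarity, stats, field and context as in the phrase weights later in this diff:
Similarity.ExactDocScorer docScorer = similarity.exactDocScorer(stats, field, context);
float score = docScorer.score(docID, freq); // tf, norms and boosts are now the scorer's concern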

View File

@ -125,25 +125,4 @@ public class Explanation {
return buffer.toString();
}
/**
* Small Util class used to pass both an idf factor as well as an
* explanation for that factor.
*
* This class will likely be held on a {@link Weight}, so be aware
* before storing any large or un-serializable fields.
*
*/
public static abstract class IDFExplanation {
/**
* @return the idf factor
*/
public abstract float getIdf();
/**
* This should be calculated lazily if possible.
*
* @return the explanation for the idf factor.
*/
public abstract String explain();
}
}

View File

@ -63,21 +63,15 @@ extends Query {
public Weight createWeight(final IndexSearcher searcher) throws IOException {
final Weight weight = query.createWeight (searcher);
return new Weight() {
private float value;
// pass these methods through to enclosed query's weight
@Override
public float getValue() { return value; }
@Override
public float sumOfSquaredWeights() throws IOException {
return weight.sumOfSquaredWeights() * getBoost() * getBoost();
public float getValueForNormalization() throws IOException {
return weight.getValueForNormalization() * getBoost() * getBoost();
}
@Override
public void normalize (float v) {
weight.normalize(v);
value = weight.getValue() * getBoost();
public void normalize (float norm, float topLevelBoost) {
weight.normalize(norm, topLevelBoost);
}
@Override

View File

@ -674,11 +674,11 @@ public class IndexSearcher {
public Weight createNormalizedWeight(Query query) throws IOException {
query = rewrite(query);
Weight weight = query.createWeight(this);
float sum = weight.sumOfSquaredWeights();
float norm = getSimilarityProvider().queryNorm(sum);
float v = weight.getValueForNormalization();
float norm = getSimilarityProvider().queryNorm(v);
if (Float.isInfinite(norm) || Float.isNaN(norm))
norm = 1.0f;
weight.normalize(norm);
weight.normalize(norm, 1.0f);
return weight;
}
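A usage sketch of the two-phase contract above; the comments spell out what createNormalizedWeight now does, with a top-level boost of 1.0f at the root of the query tree.
Weight weight = searcher.createNormalizedWeight(query);
// equivalent to:
//   Weight w = query.createWeight(searcher);
//   float norm = searcher.getSimilarityProvider().queryNorm(w.getValueForNormalization());
//   w.normalize(norm, 1.0f);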

View File

@ -32,35 +32,17 @@ import java.io.IOException;
*/
public class MatchAllDocsQuery extends Query {
public MatchAllDocsQuery() {
this(null);
}
private final String normsField;
/**
* @param normsField Field used for normalization factor (document boost). Null if nothing.
*/
public MatchAllDocsQuery(String normsField) {
this.normsField = normsField;
}
private class MatchAllScorer extends Scorer {
final float score;
final byte[] norms;
private int doc = -1;
private final int maxDoc;
private final Bits liveDocs;
private final Similarity similarity;
MatchAllScorer(IndexReader reader, Similarity similarity, Weight w,
byte[] norms) throws IOException {
MatchAllScorer(IndexReader reader, Weight w, float score) throws IOException {
super(w);
this.similarity = similarity;
liveDocs = reader.getLiveDocs();
score = w.getValue();
this.score = score;
maxDoc = reader.maxDoc();
this.norms = norms;
}
@Override
@ -82,7 +64,7 @@ public class MatchAllDocsQuery extends Query {
@Override
public float score() {
return norms == null ? score : score * similarity.decodeNormValue(norms[docID()]);
return score;
}
@Override
@ -93,12 +75,10 @@ public class MatchAllDocsQuery extends Query {
}
private class MatchAllDocsWeight extends Weight {
private Similarity similarity;
private float queryWeight;
private float queryNorm;
public MatchAllDocsWeight(IndexSearcher searcher) {
this.similarity = normsField == null ? null : searcher.getSimilarityProvider().get(normsField);
}
@Override
@ -112,33 +92,27 @@ public class MatchAllDocsQuery extends Query {
}
@Override
public float getValue() {
return queryWeight;
}
@Override
public float sumOfSquaredWeights() {
public float getValueForNormalization() {
queryWeight = getBoost();
return queryWeight * queryWeight;
}
@Override
public void normalize(float queryNorm) {
this.queryNorm = queryNorm;
public void normalize(float queryNorm, float topLevelBoost) {
this.queryNorm = queryNorm * topLevelBoost;
queryWeight *= this.queryNorm;
}
@Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
return new MatchAllScorer(context.reader, similarity, this,
normsField != null ? context.reader.norms(normsField) : null);
return new MatchAllScorer(context.reader, this, queryWeight);
}
@Override
public Explanation explain(AtomicReaderContext context, int doc) {
// explain query weight
Explanation queryExpl = new ComplexExplanation
(true, getValue(), "MatchAllDocsQuery, product of:");
(true, queryWeight, "MatchAllDocsQuery, product of:");
if (getBoost() != 1.0f) {
queryExpl.addDetail(new Explanation(getBoost(),"boost"));
}
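A minimal usage note: with the norms-field variant removed, the query is constructed only via its no-arg constructor, and every matching document receives the normalized query weight as its score.
Query all = new MatchAllDocsQuery();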

View File

@ -22,12 +22,14 @@ import java.util.*;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.Bits;
@ -129,45 +131,35 @@ public class MultiPhraseQuery extends Query {
private class MultiPhraseWeight extends Weight {
private Similarity similarity;
private float value;
private final IDFExplanation idfExp;
private float idf;
private float queryNorm;
private float queryWeight;
private final Similarity similarity;
private final Similarity.Stats stats;
public MultiPhraseWeight(IndexSearcher searcher)
throws IOException {
this.similarity = searcher.getSimilarityProvider().get(field);
final ReaderContext context = searcher.getTopReaderContext();
// collect per-term contexts and compute stats
ArrayList<Term> allTerms = new ArrayList<Term>();
ArrayList<TermContext> allTerms = new ArrayList<TermContext>();
for(final Term[] terms: termArrays) {
for (Term term: terms) {
allTerms.add(term);
allTerms.add(TermContext.build(context, term, true));
}
}
idfExp = similarity.idfExplain(allTerms, searcher);
idf = idfExp.getIdf();
stats = similarity.computeStats(searcher, field, getBoost(), allTerms.toArray(new TermContext[allTerms.size()]));
}
@Override
public Query getQuery() { return MultiPhraseQuery.this; }
@Override
public float getValue() { return value; }
@Override
public float sumOfSquaredWeights() {
queryWeight = idf * getBoost(); // compute query weight
return queryWeight * queryWeight; // square it
public float getValueForNormalization() {
return stats.getValueForNormalization();
}
@Override
public void normalize(float queryNorm) {
this.queryNorm = queryNorm;
queryWeight *= queryNorm; // normalize query weight
value = queryWeight * idf; // idf for document
public void normalize(float queryNorm, float topLevelBoost) {
stats.normalize(queryNorm, topLevelBoost);
}
@Override
@ -222,8 +214,7 @@ public class MultiPhraseQuery extends Query {
}
if (slop == 0) {
ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity,
reader.norms(field));
ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.exactDocScorer(stats, field, context));
if (s.noDocs) {
return null;
} else {
@ -231,84 +222,29 @@ public class MultiPhraseQuery extends Query {
}
} else {
return new SloppyPhraseScorer(this, postingsFreqs, similarity,
slop, reader.norms(field));
slop, similarity.sloppyDocScorer(stats, field, context));
}
}
@Override
public Explanation explain(AtomicReaderContext context, int doc)
throws IOException {
ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
Explanation idfExpl = new Explanation(idf, "idf(" + field + ":" + idfExp.explain() +")");
// explain query weight
Explanation queryExpl = new Explanation();
queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:");
Explanation boostExpl = new Explanation(getBoost(), "boost");
if (getBoost() != 1.0f)
queryExpl.addDetail(boostExpl);
queryExpl.addDetail(idfExpl);
Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm");
queryExpl.addDetail(queryNormExpl);
queryExpl.setValue(boostExpl.getValue() *
idfExpl.getValue() *
queryNormExpl.getValue());
result.addDetail(queryExpl);
// explain field weight
ComplexExplanation fieldExpl = new ComplexExplanation();
fieldExpl.setDescription("fieldWeight("+getQuery()+" in "+doc+
"), product of:");
public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
Scorer scorer = scorer(context, ScorerContext.def());
if (scorer == null) {
return new Explanation(0.0f, "no matching docs");
if (scorer != null) {
int newDoc = scorer.advance(doc);
if (newDoc == doc) {
float freq = scorer.freq();
SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, field, context);
ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));
result.addDetail(scoreExplanation);
result.setValue(scoreExplanation.getValue());
result.setMatch(true);
return result;
}
}
Explanation tfExplanation = new Explanation();
int d = scorer.advance(doc);
float phraseFreq;
if (d == doc) {
phraseFreq = scorer.freq();
} else {
phraseFreq = 0.0f;
}
tfExplanation.setValue(similarity.tf(phraseFreq));
tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")");
fieldExpl.addDetail(tfExplanation);
fieldExpl.addDetail(idfExpl);
Explanation fieldNormExpl = new Explanation();
byte[] fieldNorms = context.reader.norms(field);
float fieldNorm =
fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f;
fieldNormExpl.setValue(fieldNorm);
fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")");
fieldExpl.addDetail(fieldNormExpl);
fieldExpl.setMatch(Boolean.valueOf(tfExplanation.isMatch()));
fieldExpl.setValue(tfExplanation.getValue() *
idfExpl.getValue() *
fieldNormExpl.getValue());
result.addDetail(fieldExpl);
result.setMatch(fieldExpl.getMatch());
// combine them
result.setValue(queryExpl.getValue() * fieldExpl.getValue());
if (queryExpl.getValue() == 1.0f)
return fieldExpl;
return result;
return new ComplexExplanation(false, 0.0f, "no matching term");
}
}
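A hedged end-to-end sketch of the Similarity.Stats lifecycle that replaces the old idf/queryWeight bookkeeping in this weight; variable names are illustrative, and every call appears in the hunks above.
Similarity sim = searcher.getSimilarityProvider().get(field);
Similarity.Stats stats = sim.computeStats(searcher, field, boost, termContexts);
float v = stats.getValueForNormalization();   // reported via Weight.getValueForNormalization()
stats.normalize(queryNorm, topLevelBoost);    // applied via Weight.normalize()
SloppyDocScorer docScorer = sim.sloppyDocScorer(stats, field, readerContext);
float score = docScorer.score(doc, phraseFreq);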

View File

@ -25,7 +25,7 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.PerReaderTermState;
import org.apache.lucene.util.TermContext;
/**
* An abstract {@link Query} that matches documents
@ -154,7 +154,7 @@ public abstract class MultiTermQuery extends Query {
}
@Override
protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost, PerReaderTermState states) {
protected void addClause(BooleanQuery topLevel, Term term, int docCount, float boost, TermContext states) {
final TermQuery tq = new TermQuery(term, states);
tq.setBoost(boost);
topLevel.add(tq, BooleanClause.Occur.SHOULD);
@ -195,7 +195,7 @@ public abstract class MultiTermQuery extends Query {
}
@Override
protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost, PerReaderTermState states) {
protected void addClause(BooleanQuery topLevel, Term term, int docFreq, float boost, TermContext states) {
final Query q = new ConstantScoreQuery(new TermQuery(term, states));
q.setBoost(boost);
topLevel.add(q, BooleanClause.Occur.SHOULD);

View File

@ -22,10 +22,16 @@ import java.util.Set;
import java.util.ArrayList;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
@ -171,18 +177,17 @@ public class PhraseQuery extends Query {
private class PhraseWeight extends Weight {
private final Similarity similarity;
private float value;
private float idf;
private float queryNorm;
private float queryWeight;
private IDFExplanation idfExp;
private final Similarity.Stats stats;
private transient TermContext states[];
public PhraseWeight(IndexSearcher searcher)
throws IOException {
this.similarity = searcher.getSimilarityProvider().get(field);
idfExp = similarity.idfExplain(terms, searcher);
idf = idfExp.getIdf();
final ReaderContext context = searcher.getTopReaderContext();
states = new TermContext[terms.size()];
for (int i = 0; i < terms.size(); i++)
states[i] = TermContext.build(context, terms.get(i), true);
stats = similarity.computeStats(searcher, field, getBoost(), states);
}
@Override
@ -192,19 +197,13 @@ public class PhraseQuery extends Query {
public Query getQuery() { return PhraseQuery.this; }
@Override
public float getValue() { return value; }
@Override
public float sumOfSquaredWeights() {
queryWeight = idf * getBoost(); // compute query weight
return queryWeight * queryWeight; // square it
public float getValueForNormalization() {
return stats.getValueForNormalization();
}
@Override
public void normalize(float queryNorm) {
this.queryNorm = queryNorm;
queryWeight *= queryNorm; // normalize query weight
value = queryWeight * idf; // idf for document
public void normalize(float queryNorm, float topLevelBoost) {
stats.normalize(queryNorm, topLevelBoost);
}
@Override
@ -216,21 +215,26 @@ public class PhraseQuery extends Query {
PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[terms.size()];
for (int i = 0; i < terms.size(); i++) {
final Term t = terms.get(i);
final TermState state = states[i].get(context.ord);
if (state == null) { /* term doesn't exist in this segment */
assert termNotInReader(reader, field, t.bytes()) : "no termstate found but term exists in reader";
return null;
}
DocsAndPositionsEnum postingsEnum = reader.termPositionsEnum(liveDocs,
t.field(),
t.bytes());
t.bytes(),
state);
// PhraseQuery on a field that did not index
// positions.
if (postingsEnum == null) {
if (reader.termDocsEnum(liveDocs, t.field(), t.bytes()) != null) {
// term does exist, but has no positions
throw new IllegalStateException("field \"" + t.field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + t.text() + ")");
} else {
// term does not exist
return null;
}
assert (reader.termDocsEnum(liveDocs, t.field(), t.bytes(), state) != null) : "termstate found but no term exists in reader";
// term does exist, but has no positions
throw new IllegalStateException("field \"" + t.field() + "\" was indexed with Field.omitTermFreqAndPositions=true; cannot run PhraseQuery (term=" + t.text() + ")");
}
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, reader.docFreq(t.field(), t.bytes()), positions.get(i).intValue(), t);
// get the docFreq without seeking
TermsEnum te = reader.fields().terms(field).getThreadTermsEnum();
te.seekExact(t.bytes(), state);
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, te.docFreq(), positions.get(i).intValue(), t);
}
// sort by increasing docFreq order
@ -239,8 +243,7 @@ public class PhraseQuery extends Query {
}
if (slop == 0) { // optimize exact case
ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity,
reader.norms(field));
ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.exactDocScorer(stats, field, context));
if (s.noDocs) {
return null;
} else {
@ -248,96 +251,35 @@ public class PhraseQuery extends Query {
}
} else {
return
new SloppyPhraseScorer(this, postingsFreqs, similarity, slop,
reader.norms(field));
new SloppyPhraseScorer(this, postingsFreqs, similarity, slop, similarity.sloppyDocScorer(stats, field, context));
}
}
private boolean termNotInReader(IndexReader reader, String field, BytesRef bytes) throws IOException {
// only called from assert
final Terms terms = reader.terms(field);
return terms == null || terms.docFreq(bytes) == 0;
}
@Override
public Explanation explain(AtomicReaderContext context, int doc)
throws IOException {
ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
StringBuilder docFreqs = new StringBuilder();
StringBuilder query = new StringBuilder();
query.append('\"');
docFreqs.append(idfExp.explain());
for (int i = 0; i < terms.size(); i++) {
if (i != 0) {
query.append(" ");
}
Term term = terms.get(i);
query.append(term.text());
}
query.append('\"');
Explanation idfExpl =
new Explanation(idf, "idf(" + field + ":" + docFreqs + ")");
// explain query weight
Explanation queryExpl = new Explanation();
queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:");
Explanation boostExpl = new Explanation(getBoost(), "boost");
if (getBoost() != 1.0f)
queryExpl.addDetail(boostExpl);
queryExpl.addDetail(idfExpl);
Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm");
queryExpl.addDetail(queryNormExpl);
queryExpl.setValue(boostExpl.getValue() *
idfExpl.getValue() *
queryNormExpl.getValue());
result.addDetail(queryExpl);
// explain field weight
Explanation fieldExpl = new Explanation();
fieldExpl.setDescription("fieldWeight("+field+":"+query+" in "+doc+
"), product of:");
public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
Scorer scorer = scorer(context, ScorerContext.def());
if (scorer == null) {
return new Explanation(0.0f, "no matching docs");
}
Explanation tfExplanation = new Explanation();
int d = scorer.advance(doc);
float phraseFreq;
if (d == doc) {
phraseFreq = scorer.freq();
} else {
phraseFreq = 0.0f;
if (scorer != null) {
int newDoc = scorer.advance(doc);
if (newDoc == doc) {
float freq = scorer.freq();
SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, field, context);
ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));
result.addDetail(scoreExplanation);
result.setValue(scoreExplanation.getValue());
result.setMatch(true);
return result;
}
}
tfExplanation.setValue(similarity.tf(phraseFreq));
tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")");
fieldExpl.addDetail(tfExplanation);
fieldExpl.addDetail(idfExpl);
Explanation fieldNormExpl = new Explanation();
byte[] fieldNorms = context.reader.norms(field);
float fieldNorm =
fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f;
fieldNormExpl.setValue(fieldNorm);
fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")");
fieldExpl.addDetail(fieldNormExpl);
fieldExpl.setValue(tfExplanation.getValue() *
idfExpl.getValue() *
fieldNormExpl.getValue());
result.addDetail(fieldExpl);
// combine them
result.setValue(queryExpl.getValue() * fieldExpl.getValue());
result.setMatch(tfExplanation.isMatch());
return result;
return new ComplexExplanation(false, 0.0f, "no matching term");
}
}
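A hedged fragment of the new explain() shape shared by both phrase queries: the Similarity's doc scorer builds the detailed explanation from the raw phrase frequency, with doc, freq, stats, field and context as in the method above.
SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, field, context);
Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));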

View File

@ -30,9 +30,6 @@ import java.io.IOException;
* means a match.
*/
abstract class PhraseScorer extends Scorer {
protected byte[] norms;
protected float value;
private boolean firstTime = true;
private boolean more = true;
protected PhraseQueue pq;
@ -40,14 +37,12 @@ abstract class PhraseScorer extends Scorer {
private float freq; //phrase frequency in current doc as computed by phraseFreq().
protected final Similarity similarity;
protected final Similarity.SloppyDocScorer docScorer;
PhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
Similarity similarity, byte[] norms) {
Similarity.SloppyDocScorer docScorer) throws IOException {
super(weight);
this.similarity = similarity;
this.norms = norms;
this.value = weight.getValue();
this.docScorer = docScorer;
// convert tps to a list of phrase positions.
// note: phrase-position differs from term-position in that its position
@ -107,9 +102,7 @@ abstract class PhraseScorer extends Scorer {
@Override
public float score() throws IOException {
//System.out.println("scoring " + first.doc);
float raw = similarity.tf(freq) * value; // raw score
return norms == null ? raw : raw * similarity.decodeNormValue(norms[first.doc]); // normalize
return docScorer.score(first.doc, freq);
}
@Override

View File

@ -28,7 +28,7 @@ import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.PerReaderTermState;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.BytesRefHash.DirectBytesStartArray;
@ -56,7 +56,7 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
@Override
protected void addClause(BooleanQuery topLevel, Term term, int docCount,
float boost, PerReaderTermState states) {
float boost, TermContext states) {
final TermQuery tq = new TermQuery(term, states);
tq.setBoost(boost);
topLevel.add(tq, BooleanClause.Occur.SHOULD);
@ -117,7 +117,7 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
if (size > 0) {
final int sort[] = col.terms.sort(col.termsEnum.getComparator());
final float[] boost = col.array.boost;
final PerReaderTermState[] termStates = col.array.termState;
final TermContext[] termStates = col.array.termState;
for (int i = 0; i < size; i++) {
final int pos = sort[i];
final Term term = new Term(query.getField(), col.terms.get(pos, new BytesRef()));
@ -150,12 +150,12 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
if (e < 0 ) {
// duplicate term: update docFreq
final int pos = (-e)-1;
array.termState[pos].register(state, readerContext.ord, termsEnum.docFreq());
array.termState[pos].register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
assert array.boost[pos] == boostAtt.getBoost() : "boost should be equal in all segment TermsEnums";
} else {
// new entry: we populate the entry initially
array.boost[e] = boostAtt.getBoost();
array.termState[e] = new PerReaderTermState(topReaderContext, state, readerContext.ord, termsEnum.docFreq());
array.termState[e] = new TermContext(topReaderContext, state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
ScoringRewrite.this.checkMaxClauseCount(terms.size());
}
return true;
@ -165,7 +165,7 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
/** Special implementation of BytesStartArray that keeps parallel arrays for boost and docFreq */
static final class TermFreqBoostByteStart extends DirectBytesStartArray {
float[] boost;
PerReaderTermState[] termState;
TermContext[] termState;
public TermFreqBoostByteStart(int initSize) {
super(initSize);
@ -175,7 +175,7 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
public int[] init() {
final int[] ord = super.init();
boost = new float[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_FLOAT)];
termState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
termState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
assert termState.length >= ord.length && boost.length >= ord.length;
return ord;
}
@ -185,7 +185,7 @@ public abstract class ScoringRewrite<Q extends Query> extends TermCollectingRewr
final int[] ord = super.grow();
boost = ArrayUtil.grow(boost, ord.length);
if (termState.length < ord.length) {
PerReaderTermState[] tmpTermState = new PerReaderTermState[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
TermContext[] tmpTermState = new TermContext[ArrayUtil.oversize(ord.length, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(termState, 0, tmpTermState, 0, termState.length);
termState = tmpTermState;
}

View File

@ -19,594 +19,111 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.document.IndexDocValuesField; // javadoc
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.util.SmallFloat;
import org.apache.lucene.index.IndexReader; // javadoc
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Terms; // javadoc
import org.apache.lucene.search.spans.SpanQuery; // javadoc
import org.apache.lucene.util.SmallFloat; // javadoc
import org.apache.lucene.util.TermContext;
/**
* Similarity defines the components of Lucene scoring.
* <p>
* Expert: Scoring API.
*
* <p>Similarity defines the components of Lucene scoring.
* Overriding computation of these components is a convenient
* way to alter Lucene scoring.
*
* <p>Suggested reading:
* <a href="http://nlp.stanford.edu/IR-book/html/htmledition/queries-as-vectors-1.html">
* Introduction To Information Retrieval, Chapter 6</a>.
*
* <p>The following describes how Lucene scoring evolves from
* underlying information retrieval models to (efficient) implementation.
* We first brief on <i>VSM Score</i>,
* then derive from it <i>Lucene's Conceptual Scoring Formula</i>,
* from which, finally, evolves <i>Lucene's Practical Scoring Function</i>
* (the latter is connected directly with Lucene classes and methods).
*
* <p>Lucene combines
* <a href="http://en.wikipedia.org/wiki/Standard_Boolean_model">
* Boolean model (BM) of Information Retrieval</a>
* with
* <a href="http://en.wikipedia.org/wiki/Vector_Space_Model">
* Vector Space Model (VSM) of Information Retrieval</a> -
* documents "approved" by BM are scored by VSM.
*
* <p>In VSM, documents and queries are represented as
* weighted vectors in a multi-dimensional space,
* where each distinct index term is a dimension,
* and weights are
* <a href="http://en.wikipedia.org/wiki/Tfidf">Tf-idf</a> values.
*
* <p>VSM does not require weights to be <i>Tf-idf</i> values,
* but <i>Tf-idf</i> values are believed to produce search results of high quality,
* and so Lucene is using <i>Tf-idf</i>.
* <i>Tf</i> and <i>Idf</i> are described in more detail below,
* but for now, for completeness, let's just say that
* for given term <i>t</i> and document (or query) <i>x</i>,
* <i>Tf(t,x)</i> varies with the number of occurrences of term <i>t</i> in <i>x</i>
* (when one increases so does the other) and
* <i>idf(t)</i> similarly varies with the inverse of the
* number of index documents containing term <i>t</i>.
*
* <p><i>VSM score</i> of document <i>d</i> for query <i>q</i> is the
* <a href="http://en.wikipedia.org/wiki/Cosine_similarity">
* Cosine Similarity</a>
* of the weighted query vectors <i>V(q)</i> and <i>V(d)</i>:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr><td>
* <table cellpadding="1" cellspacing="0" border="1" align="center">
* <tr><td>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* cosine-similarity(q,d) &nbsp; = &nbsp;
* </td>
* <td valign="middle" align="center">
* <table>
* <tr><td align="center"><small>V(q)&nbsp;&middot;&nbsp;V(d)</small></td></tr>
* <tr><td align="center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
* <tr><td align="center"><small>|V(q)|&nbsp;|V(d)|</small></td></tr>
* </table>
* </td>
* </tr>
* </table>
* </td></tr>
* </table>
* </td></tr>
* <tr><td>
* <center><font size="-1"><u>VSM Score</u></font></center>
* </td></tr>
* </table>
* <br>&nbsp;<br>
*
*
* Where <i>V(q)</i> &middot; <i>V(d)</i> is the
* <a href="http://en.wikipedia.org/wiki/Dot_product">dot product</a>
* of the weighted vectors,
* and <i>|V(q)|</i> and <i>|V(d)|</i> are their
* <a href="http://en.wikipedia.org/wiki/Euclidean_norm#Euclidean_norm">Euclidean norms</a>.
*
* <p>Note: the above equation can be viewed as the dot product of
* the normalized weighted vectors, in the sense that dividing
* <i>V(q)</i> by its euclidean norm is normalizing it to a unit vector.
*
* <p>Lucene refines <i>VSM score</i> for both search quality and usability:
* <ul>
* <li>Normalizing <i>V(d)</i> to the unit vector is known to be problematic in that
* it removes all document length information.
* For some documents removing this info is probably ok,
* e.g. a document made by duplicating a certain paragraph <i>10</i> times,
* especially if that paragraph is made of distinct terms.
* But for a document which contains no duplicated paragraphs,
* this might be wrong.
* To avoid this problem, a different document length normalization
* factor is used, which normalizes to a vector equal to or larger
* than the unit vector: <i>doc-len-norm(d)</i>.
* </li>
*
* <li>At indexing, users can specify that certain documents are more
* important than others, by assigning a document boost.
* For this, the score of each document is also multiplied by its boost value
* <i>doc-boost(d)</i>.
* </li>
*
* <li>Lucene is field based, hence each query term applies to a single
* field; document length normalization is by the length of that field,
* and in addition to document boost there are also document field boosts.
* </li>
*
* <li>The same field can be added to a document during indexing several times,
* and so the boost of that field is the multiplication of the boosts of
* the separate additions (or parts) of that field within the document.
* </li>
*
* <li>At search time users can specify boosts to each query, sub-query, and
* each query term, hence the contribution of a query term to the score of
* a document is multiplied by the boost of that query term <i>query-boost(q)</i>.
* </li>
*
* <li>A document may match a multi term query without containing all
* the terms of that query (this is correct for some of the queries),
* and users can further reward documents matching more query terms
* through a coordination factor, which is usually larger when
* more terms are matched: <i>coord-factor(q,d)</i>.
* </li>
* </ul>
*
* <p>Under the simplifying assumption of a single field in the index,
* we get <i>Lucene's Conceptual scoring formula</i>:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr><td>
* <table cellpadding="1" cellspacing="0" border="1" align="center">
* <tr><td>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* score(q,d) &nbsp; = &nbsp;
* <font color="#FF9933">coord-factor(q,d)</font> &middot; &nbsp;
* <font color="#CCCC00">query-boost(q)</font> &middot; &nbsp;
* </td>
* <td valign="middle" align="center">
* <table>
* <tr><td align="center"><small><font color="#993399">V(q)&nbsp;&middot;&nbsp;V(d)</font></small></td></tr>
* <tr><td align="center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
* <tr><td align="center"><small><font color="#FF33CC">|V(q)|</font></small></td></tr>
* </table>
* </td>
* <td valign="middle" align="right" rowspan="1">
* &nbsp; &middot; &nbsp; <font color="#3399FF">doc-len-norm(d)</font>
* &nbsp; &middot; &nbsp; <font color="#3399FF">doc-boost(d)</font>
* </td>
* </tr>
* </table>
* </td></tr>
* </table>
* </td></tr>
* <tr><td>
* <center><font size="-1"><u>Lucene Conceptual Scoring Formula</u></font></center>
* </td></tr>
* </table>
* <br>&nbsp;<br>
*
* <p>The conceptual formula is a simplification in the sense that (1) terms and documents
* are fielded and (2) boosts are usually per query term rather than per query.
*
* <p>We now describe how Lucene implements this conceptual scoring formula, and
* derive from it <i>Lucene's Practical Scoring Function</i>.
*
* <p>For efficient score computation some scoring components
* are computed and aggregated in advance:
*
* <ul>
* <li><i>Query-boost</i> for the query (actually for each query term)
* is known when search starts.
* </li>
*
* <li>Query Euclidean norm <i>|V(q)|</i> can be computed when search starts,
* as it is independent of the document being scored.
* From a search optimization perspective, it is a valid question
* why bother to normalize the query at all, because all
* scored documents will be multiplied by the same <i>|V(q)|</i>,
* and hence document ranks (their order by score) will not
* be affected by this normalization.
* There are two good reasons to keep this normalization:
* <ul>
* <li>Recall that
* <a href="http://en.wikipedia.org/wiki/Cosine_similarity">
* Cosine Similarity</a> can be used to find how similar
* two documents are. One can use Lucene for e.g.
* clustering, and use a document as a query to compute
* its similarity to other documents.
* In this use case it is important that the score of document <i>d3</i>
* for query <i>d1</i> is comparable to the score of document <i>d3</i>
* for query <i>d2</i>. In other words, scores of a document for two
* distinct queries should be comparable.
* There are other applications that may require this.
* And this is exactly what normalizing the query vector <i>V(q)</i>
* provides: comparability (to a certain extent) of two or more queries.
* </li>
*
* <li>Applying query normalization on the scores helps to keep the
* scores around the unit vector, hence preventing loss of score data
* because of floating point precision limitations.
* </li>
* </ul>
* </li>
*
* <li>Document length norm <i>doc-len-norm(d)</i> and document
* boost <i>doc-boost(d)</i> are known at indexing time.
* They are computed in advance and their multiplication
* is saved as a single value in the index: <i>norm(d)</i>.
* (In the equations below, <i>norm(t in d)</i> means <i>norm(field(t) in doc d)</i>
* where <i>field(t)</i> is the field associated with term <i>t</i>.)
* </li>
* </ul>
*
* <p><i>Lucene's Practical Scoring Function</i> is derived from the above.
* The color codes demonstrate how it relates
* to those of the <i>conceptual</i> formula:
*
* <P>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr><td>
* <table cellpadding="" cellspacing="2" border="2" align="center">
* <tr><td>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* score(q,d) &nbsp; = &nbsp;
* <A HREF="#formula_coord"><font color="#FF9933">coord(q,d)</font></A> &nbsp;&middot;&nbsp;
* <A HREF="#formula_queryNorm"><font color="#FF33CC">queryNorm(q)</font></A> &nbsp;&middot;&nbsp;
* </td>
* <td valign="bottom" align="center" rowspan="1">
* <big><big><big>&sum;</big></big></big>
* </td>
* <td valign="middle" align="right" rowspan="1">
* <big><big>(</big></big>
* <A HREF="#formula_tf"><font color="#993399">tf(t in d)</font></A> &nbsp;&middot;&nbsp;
* <A HREF="#formula_idf"><font color="#993399">idf(t)</font></A><sup>2</sup> &nbsp;&middot;&nbsp;
* <A HREF="#formula_termBoost"><font color="#CCCC00">t.getBoost()</font></A>&nbsp;&middot;&nbsp;
* <A HREF="#formula_norm"><font color="#3399FF">norm(t,d)</font></A>
* <big><big>)</big></big>
* </td>
* </tr>
* <tr valign="top">
* <td></td>
* <td align="center"><small>t in q</small></td>
* <td></td>
* </tr>
* </table>
* </td></tr>
* </table>
* </td></tr>
* <tr><td>
* <center><font size="-1"><u>Lucene Practical Scoring Function</u></font></center>
* </td></tr>
* </table>
*
* <p> where
* <p>
* This is a low-level API; you should only extend it if you want to implement
* an information retrieval <i>model</i>. If you are instead looking for a convenient way
* to alter Lucene's scoring, consider extending a higher-level implementation
* such as {@link TFIDFSimilarity}, which implements the vector space model with this API, or
* just tweaking the default implementation: {@link DefaultSimilarity}.
* <p>
* Similarity determines how Lucene weights terms, and Lucene interacts with
* this class at both <a href="#indextime">index-time</a> and
* <a href="#querytime">query-time</a>.
* <p>
* <a name="indextime"/>
* At indexing time, the indexer calls {@link #computeNorm(FieldInvertState)}, allowing
* the Similarity implementation to return a per-document byte for the field that will
* be later accessible via {@link IndexReader#norms(String)}. Lucene makes no assumption
* about what is in this byte, but it is most useful for encoding length normalization
* information.
* <p>
* Implementations should carefully consider how the normalization byte is encoded: while
* Lucene's classical {@link TFIDFSimilarity} encodes a combination of index-time boost
* and length normalization information with {@link SmallFloat}, this might not be suitable
* for all purposes.
* <p>
* Many formulas require the use of average document length, which can be computed via a
* combination of {@link Terms#getSumTotalTermFreq()} and {@link IndexReader#maxDoc()}.
* <p>
* Because index-time boost is handled entirely at the application level anyway,
* an application can alternatively store the index-time boost separately using an
* {@link IndexDocValuesField}, and access this at query-time with
* {@link IndexReader#docValues(String)}.
* <p>
* Finally, using index-time boosts (either via folding into the normalization byte or
* via IndexDocValues) is an inefficient way to boost the scores of different fields if the
* boost will be the same for every document; instead the Similarity can simply take a constant
* boost parameter <i>C</i>, and the SimilarityProvider can return different instances with
* different boosts depending upon field name.
* <p>
* <a name="querytime"/>
* At query-time, Queries interact with the Similarity via these steps:
* <ol>
* <li>
* <A NAME="formula_tf"></A>
* <b><i>tf(t in d)</i></b>
* correlates to the term's <i>frequency</i>,
* defined as the number of times term <i>t</i> appears in the currently scored document <i>d</i>.
* Documents that have more occurrences of a given term receive a higher score.
* Note that <i>tf(t in q)</i> is assumed to be <i>1</i> and therefore it does not appear in this equation.
* However, if a query contains the same term twice, there will be
* two term-queries with that same term and hence the computation would still be correct (although
* not very efficient).
* The default computation for <i>tf(t in d)</i> in
* {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)} &nbsp; = &nbsp;
* </td>
* <td valign="top" align="center" rowspan="1">
* frequency<sup><big>&frac12;</big></sup>
* </td>
* </tr>
* </table>
* <br>&nbsp;<br>
* </li>
*
* <li>
* <A NAME="formula_idf"></A>
* <b><i>idf(t)</i></b> stands for Inverse Document Frequency. This value
* correlates to the inverse of <i>docFreq</i>
* (the number of documents in which the term <i>t</i> appears).
* This means rarer terms give higher contribution to the total score.
* <i>idf(t)</i> appears for <i>t</i> in both the query and the document,
* hence it is squared in the equation.
* The default computation for <i>idf(t)</i> in
* {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right">
* {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)}&nbsp; = &nbsp;
* </td>
* <td valign="middle" align="center">
* 1 + log <big>(</big>
* </td>
* <td valign="middle" align="center">
* <table>
* <tr><td align="center"><small>numDocs</small></td></tr>
* <tr><td align="center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
* <tr><td align="center"><small>docFreq+1</small></td></tr>
* </table>
* </td>
* <td valign="middle" align="center">
* <big>)</big>
* </td>
* </tr>
* </table>
* <br>&nbsp;<br>
* </li>
*
* <li>
* <A NAME="formula_coord"></A>
* <b><i>coord(q,d)</i></b>
* is a score factor based on how many of the query terms are found in the specified document.
* Typically, a document that contains more of the query's terms will receive a higher score
* than another document with fewer query terms.
* This is a search time factor computed in
* {@link SimilarityProvider#coord(int, int) coord(q,d)}
* by the SimilarityProvider in effect at search time.
* <br>&nbsp;<br>
* </li>
*
* <li><b>
* <A NAME="formula_queryNorm"></A>
* <i>queryNorm(q)</i>
* </b>
* is a normalizing factor used to make scores between queries comparable.
* This factor does not affect document ranking (since all ranked documents are multiplied by the same factor),
* but rather just attempts to make scores from different queries (or even different indexes) comparable.
* This is a search time factor computed by the SimilarityProvider in effect at search time.
*
* The default computation in
* {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) DefaultSimilarityProvider}
* produces a <a href="http://en.wikipedia.org/wiki/Euclidean_norm#Euclidean_norm">Euclidean norm</a>:
* <br>&nbsp;<br>
* <table cellpadding="1" cellspacing="0" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* queryNorm(q) &nbsp; = &nbsp;
* {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) queryNorm(sumOfSquaredWeights)}
* &nbsp; = &nbsp;
* </td>
* <td valign="middle" align="center" rowspan="1">
* <table>
* <tr><td align="center"><big>1</big></td></tr>
* <tr><td align="center"><big>
* &ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;
* </big></td></tr>
* <tr><td align="center">sumOfSquaredWeights<sup><big>&frac12;</big></sup></td></tr>
* </table>
* </td>
* </tr>
* </table>
* <br>&nbsp;<br>
*
* The sum of squared weights (of the query terms) is
* computed by the query {@link org.apache.lucene.search.Weight} object.
* For example, a {@link org.apache.lucene.search.BooleanQuery}
* computes this value as:
*
* <br>&nbsp;<br>
* <table cellpadding="1" cellspacing="0" border="0"n align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* {@link org.apache.lucene.search.Weight#sumOfSquaredWeights() sumOfSquaredWeights} &nbsp; = &nbsp;
* {@link org.apache.lucene.search.Query#getBoost() q.getBoost()} <sup><big>2</big></sup>
* &nbsp;&middot;&nbsp;
* </td>
* <td valign="bottom" align="center" rowspan="1">
* <big><big><big>&sum;</big></big></big>
* </td>
* <td valign="middle" align="right" rowspan="1">
* <big><big>(</big></big>
* <A HREF="#formula_idf">idf(t)</A> &nbsp;&middot;&nbsp;
* <A HREF="#formula_termBoost">t.getBoost()</A>
* <big><big>) <sup>2</sup> </big></big>
* </td>
* </tr>
* <tr valign="top">
* <td></td>
* <td align="center"><small>t in q</small></td>
* <td></td>
* </tr>
* </table>
* <br>&nbsp;<br>
*
* </li>
*
* <li>
* <A NAME="formula_termBoost"></A>
* <b><i>t.getBoost()</i></b>
* is a search time boost of term <i>t</i> in the query <i>q</i> as
* specified in the query text
* (see <A HREF="../../../../../../queryparsersyntax.html#Boosting a Term">query syntax</A>),
* or as set by application calls to
* {@link org.apache.lucene.search.Query#setBoost(float) setBoost()}.
* Notice that there is really no direct API for accessing a boost of one term in a multi term query,
* but rather multiple terms are represented in a query as multiple
* {@link org.apache.lucene.search.TermQuery TermQuery} objects,
* and so the boost of a term in the query is accessible by calling the sub-query
* {@link org.apache.lucene.search.Query#getBoost() getBoost()}.
* <br>&nbsp;<br>
* </li>
*
* <li>
* <A NAME="formula_norm"></A>
* <b><i>norm(t,d)</i></b> encapsulates a few (indexing time) boost and length factors:
*
* <ul>
* <li><b>Document boost</b> - set by calling
* {@link org.apache.lucene.document.Document#setBoost(float) doc.setBoost()}
* before adding the document to the index.
* </li>
* <li><b>Field boost</b> - set by calling
* {@link org.apache.lucene.document.Fieldable#setBoost(float) field.setBoost()}
* before adding the field to a document.
* </li>
* <li><b>lengthNorm</b> - computed
* when the document is added to the index in accordance with the number of tokens
* of this field in the document, so that shorter fields contribute more to the score.
* LengthNorm is computed by the Similarity class in effect at indexing.
* </li>
* </ul>
* The {@link #computeNorm} method is responsible for
* combining all of these factors into a single float.
*
* <p>
* When a document is added to the index, all the above factors are multiplied.
* If the document has multiple fields with the same name, all their boosts are multiplied together:
*
* <br>&nbsp;<br>
* <table cellpadding="1" cellspacing="0" border="0"n align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* norm(t,d) &nbsp; = &nbsp;
* {@link org.apache.lucene.document.Document#getBoost() doc.getBoost()}
* &nbsp;&middot;&nbsp;
* lengthNorm
* &nbsp;&middot;&nbsp;
* </td>
* <td valign="bottom" align="center" rowspan="1">
* <big><big><big>&prod;</big></big></big>
* </td>
* <td valign="middle" align="right" rowspan="1">
* {@link org.apache.lucene.document.Fieldable#getBoost() f.getBoost}()
* </td>
* </tr>
* <tr valign="top">
* <td></td>
* <td align="center"><small>field <i><b>f</b></i> in <i>d</i> named as <i><b>t</b></i></small></td>
* <td></td>
* </tr>
* </table>
* <br>&nbsp;<br>
* However, the resulting <i>norm</i> value is {@link #encodeNormValue(float) encoded} as a single byte
* before being stored.
* At search time, the norm byte value is read from the index
* {@link org.apache.lucene.store.Directory directory} and
* {@link #decodeNormValue(byte) decoded} back to a float <i>norm</i> value.
* This encoding/decoding, while reducing index size, comes with the price of
* precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>.
* For instance, <i>decode(encode(0.89)) = 0.75</i>.
* <br>&nbsp;<br>
* Compression of norm values to a single byte saves memory at search time,
* because once a field is referenced at search time, its norms - for
* all documents - are maintained in memory.
* <br>&nbsp;<br>
* The rationale supporting such lossy compression of norm values is that,
* given the difficulty (and inaccuracy) with which users express their true information
* need in a query, only big differences matter.
* <br>&nbsp;<br>
* Last, note that search time is too late to modify this <i>norm</i> part of scoring, e.g. by
* using a different {@link Similarity} for search.
* <br>&nbsp;<br>
* </li>
* <li>The {@link #computeStats(IndexSearcher, String, float, TermContext...)} method is called a single time,
* allowing the implementation to compute any statistics (such as IDF, average document length, etc)
* across <i>the entire collection</i>. The {@link TermContext}s passed in are already positioned
* to the terms involved with the raw statistics involved, so a Similarity can freely use any combination
* of term statistics without causing any additional I/O. Lucene makes no assumption about what is
* stored in the returned {@link Similarity.Stats} object.
* <li>The query normalization process occurs a single time: {@link Similarity.Stats#getValueForNormalization()}
* is called for each query leaf node, {@link SimilarityProvider#queryNorm(float)} is called for the top-level
* query, and finally {@link Similarity.Stats#normalize(float, float)} passes down the normalization value
* and any top-level boosts (e.g. from enclosing {@link BooleanQuery}s).
* <li>For each segment in the index, the Query creates a {@link #exactDocScorer(Stats, String, IndexReader.AtomicReaderContext)}
* (for queries with exact frequencies such as TermQuerys and exact PhraseQueries) or a
* {@link #sloppyDocScorer(Stats, String, IndexReader.AtomicReaderContext)} (for queries with sloppy frequencies such as
* SpanQuerys and sloppy PhraseQueries). The score() method is called for each matching document.
* </ol>
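* <p>
* A condensed sketch of that sequence (variable names are hypothetical, error handling omitted):
* <pre>
*   Similarity sim = searcher.getSimilarityProvider().get(field);
*   Similarity.Stats stats = sim.computeStats(searcher, field, query.getBoost(), termContext);
*   float norm = searcher.getSimilarityProvider().queryNorm(stats.getValueForNormalization());
*   stats.normalize(norm, 1.0f); // no enclosing top-level boost
*   Similarity.ExactDocScorer scorer = sim.exactDocScorer(stats, field, readerContext);
*   float score = scorer.score(docID, freq);
* </pre>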
* <p>
* <a name="explaintime"></a>
* When {@link IndexSearcher#explain(Query, int)} is called, queries consult the Similarity's DocScorer for an
* explanation of how it computed its score. The query passes in the document id and an explanation of how the frequency
* was computed.
*
* @see org.apache.lucene.index.IndexWriterConfig#setSimilarityProvider(SimilarityProvider)
* @see IndexSearcher#setSimilarityProvider(SimilarityProvider)
* @lucene.experimental
*/
public abstract class Similarity {
public static final int NO_DOC_ID_PROVIDED = -1;
/** Cache of decoded bytes. */
private static final float[] NORM_TABLE = new float[256];
static {
for (int i = 0; i < 256; i++)
NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
}
/** Decodes a normalization factor stored in an index.
* @see #encodeNormValue(float)
*/
public float decodeNormValue(byte b) {
return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
}
/**
* Computes the normalization value for a field, given the accumulated
* state of term processing for this field (see {@link FieldInvertState}).
*
* <p>Implementations should calculate a float value based on the field
* state and then return that value.
*
* <p>Matches in longer fields are less precise, so implementations of this
* method usually return smaller values when <code>state.getLength()</code> is large,
* and larger values when <code>state.getLength()</code> is small.
*
* <p>Note that the return values are computed under
* {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document)}
* and then stored using
* {@link #encodeNormValue(float)}.
* Thus they have limited precision, and documents
* must be re-indexed if this method is altered.
*
* @lucene.experimental
*
* @param state current processing state for this field
* @return the calculated float norm
*/
public abstract float computeNorm(FieldInvertState state);
/** Encodes a normalization factor for storage in an index.
*
* <p>The encoding uses a three-bit mantissa, a five-bit exponent, and
* the zero-exponent point at 15, thus
* representing values from around 7x10^9 to 2x10^-9 with about one
* significant decimal digit of accuracy. Zero is also represented.
* Negative numbers are rounded up to zero. Values too large to represent
* are rounded down to the largest representable value. Positive values too
* small to represent are rounded up to the smallest positive representable
* value.
* @see org.apache.lucene.document.Field#setBoost(float)
* @see org.apache.lucene.util.SmallFloat
*/
public byte encodeNormValue(float f) {
return SmallFloat.floatToByte315(f);
}
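/** A minimal illustration (not part of this patch's API) of the lossy round trip
 * described above: with the default encoding, decode(encode(0.89f)) == 0.75f. */
final void demoNormRoundTrip() {
byte encoded = encodeNormValue(0.89f); // stored in the index as a single byte
float decoded = decodeNormValue(encoded); // 0.75f, not 0.89f
assert decoded == 0.75f;
}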
/** Computes a score factor based on a term or phrase's frequency in a
* document. This value is multiplied by the {@link #idf(int, int)}
* factor for each term in the query and these products are then summed to
* form the initial score for a document.
*
* <p>Terms and phrases repeated in a document indicate the topic of the
* document, so implementations of this method usually return larger values
* when <code>freq</code> is large, and smaller values when <code>freq</code>
* is small.
*
* <p>The default implementation calls {@link #tf(float)}.
*
* @param freq the frequency of a term within a document
* @return a score factor based on a term's within-document frequency
*/
public float tf(int freq) {
return tf((float)freq);
}
/** Computes the amount of a sloppy phrase match, based on an edit distance.
* This value is summed for each sloppy phrase match in a document to form
* the frequency that is passed to {@link #tf(float)}.
*
* <p>A phrase match with a small edit distance to a document passage more
* closely matches the document, so implementations of this method usually
@ -619,124 +136,6 @@ public abstract class Similarity {
*/
public abstract float sloppyFreq(int distance);
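/* A sketch (hypothetical subclass code) of a sloppyFreq implementation in the
 * spirit of DefaultSimilarity, where phrase matches with a smaller edit
 * distance contribute more:
 *
 *   public float sloppyFreq(int distance) {
 *     return 1.0f / (distance + 1);
 *   }
 */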
/** Computes a score factor based on a term or phrase's frequency in a
* document. This value is multiplied by the {@link #idf(int, int)}
* factor for each term in the query and these products are then summed to
* form the initial score for a document.
*
* <p>Terms and phrases repeated in a document indicate the topic of the
* document, so implementations of this method usually return larger values
* when <code>freq</code> is large, and smaller values when <code>freq</code>
* is small.
*
* @param freq the frequency of a term within a document
* @return a score factor based on a term's within-document frequency
*/
public abstract float tf(float freq);
/**
* Computes a score factor for a simple term and returns an explanation
* for that score factor.
*
* <p>
* The default implementation uses:
*
* <pre>
* idf(docFreq, searcher.maxDoc());
* </pre>
*
* Note that {@link IndexSearcher#maxDoc()} is used instead of
* {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because
* {@link IndexSearcher#docFreq(Term)} is also used, and when the latter
* is inaccurate, so is {@link IndexSearcher#maxDoc()}, and in the same direction.
* In addition, {@link IndexSearcher#maxDoc()} is more efficient to compute.
*
* @param term the term in question
* @param searcher the document collection being searched
* @param docFreq externally computed docFreq for this term
* @return an IDFExplanation object that includes both an idf score factor
* and an explanation for the term.
* @throws IOException
*/
public IDFExplanation idfExplain(final Term term, final IndexSearcher searcher, int docFreq) throws IOException {
final int df = docFreq;
final int max = searcher.maxDoc();
final float idf = idf(df, max);
return new IDFExplanation() {
@Override
public String explain() {
return "idf(docFreq=" + df +
", maxDocs=" + max + ")";
}
@Override
public float getIdf() {
return idf;
}};
}
/**
* This method forwards to {@link
* #idfExplain(Term,IndexSearcher,int)} by passing
* <code>searcher.docFreq(term)</code> as the docFreq.
*/
public IDFExplanation idfExplain(final Term term, final IndexSearcher searcher) throws IOException {
return idfExplain(term, searcher, searcher.docFreq(term));
}
/**
* Computes a score factor for a phrase.
*
* <p>
* The default implementation sums the idf factor for
* each term in the phrase.
*
* @param terms the terms in the phrase
* @param searcher the document collection being searched
* @return an IDFExplanation object that includes both an idf
* score factor for the phrase and an explanation
* for each term.
* @throws IOException
*/
public IDFExplanation idfExplain(Collection<Term> terms, IndexSearcher searcher) throws IOException {
final int max = searcher.maxDoc();
float idf = 0.0f;
final StringBuilder exp = new StringBuilder();
for (final Term term : terms ) {
final int df = searcher.docFreq(term);
idf += idf(df, max);
exp.append(" ");
exp.append(term.text());
exp.append("=");
exp.append(df);
}
final float fIdf = idf;
return new IDFExplanation() {
@Override
public float getIdf() {
return fIdf;
}
@Override
public String explain() {
return exp.toString();
}
};
}
/** Computes a score factor based on a term's document frequency (the number
* of documents which contain the term). This value is multiplied by the
* {@link #tf(int)} factor for each term in the query and these products are
* then summed to form the initial score for a document.
*
* <p>Terms that occur in fewer documents are better indicators of topic, so
* implementations of this method usually return larger values for rare terms,
* and smaller values for common terms.
*
* @param docFreq the number of documents which contain the term
* @param numDocs the total number of documents in the collection
* @return a score factor based on the term's document frequency
*/
public abstract float idf(int docFreq, int numDocs);
/**
* Calculate a scoring factor based on the data in the payload. Overriding implementations
* are responsible for interpreting what is in the payload. Lucene makes no assumptions about
@ -759,4 +158,100 @@ public abstract class Similarity {
return 1;
}
/**
* Compute any collection-level stats (e.g. IDF, average document length, etc) needed for scoring a query.
*/
public abstract Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException;
/**
* Returns a new {@link Similarity.ExactDocScorer}.
*/
public abstract ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException;
/**
* Returns a new {@link Similarity.SloppyDocScorer}.
*/
public abstract SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException;
/**
* API for scoring exact queries such as {@link TermQuery} and
* exact {@link PhraseQuery}.
* <p>
* Term frequencies are integers (the term or phrase's tf)
*/
public abstract class ExactDocScorer {
/**
* Score a single document
* @param doc document id
* @param freq term frequency
* @return document's score
*/
public abstract float score(int doc, int freq);
/**
* Explain the score for a single document
* @param doc document id
* @param freq Explanation of how the term frequency was computed
* @return document's score
*/
public Explanation explain(int doc, Explanation freq) {
Explanation result = new Explanation(score(doc, (int)freq.getValue()),
"score(doc=" + doc + ",freq=" + freq.getValue() +"), with freq of:");
result.addDetail(freq);
return result;
}
}
/**
* API for scoring "sloppy" queries such as {@link SpanQuery} and
* sloppy {@link PhraseQuery}.
* <p>
* Term frequencies are floating point values.
*/
public abstract class SloppyDocScorer {
/**
* Score a single document
* @param doc document id
* @param freq sloppy term frequency
* @return document's score
*/
public abstract float score(int doc, float freq);
/**
* Explain the score for a single document
* @param doc document id
* @param freq Explanation of how the sloppy term frequency was computed
* @return document's score
*/
public Explanation explain(int doc, Explanation freq) {
Explanation result = new Explanation(score(doc, freq.getValue()),
"score(doc=" + doc + ",freq=" + freq.getValue() +"), with freq of:");
result.addDetail(freq);
return result;
}
}
/** Stores the statistics for the indexed collection. This abstract
* implementation is empty; descendants of {@code Similarity} should
* subclass {@code Stats} and define the statistics they require in the
* subclass. Examples include idf, average field length, etc.
*/
public static abstract class Stats {
/** The value for normalization of contained query clauses (e.g. sum of squared weights).
* <p>
* NOTE: a Similarity implementation might not use any query normalization at all,
* it's not required. However, if it wants to participate in query normalization,
* it can return a value here.
*/
public abstract float getValueForNormalization();
/** Assigns the query normalization factor and boost from parent queries to this.
* <p>
* NOTE: a Similarity implementation might not use this normalized value at all,
* it's not required. However, it's usually a good idea to at least incorporate
* the topLevelBoost (e.g. from an outer BooleanQuery) into its score.
*/
public abstract void normalize(float queryNorm, float topLevelBoost);
}
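/* A sketch (hypothetical) of a concrete Stats subclass carrying a single
 * weight statistic and participating in query normalization:
 *
 *   public static class SimpleStats extends Stats {
 *     private float weight; // e.g. idf * queryBoost
 *     public SimpleStats(float weight) { this.weight = weight; }
 *     public float getValueForNormalization() { return weight * weight; }
 *     public void normalize(float queryNorm, float topLevelBoost) {
 *       weight *= queryNorm * topLevelBoost;
 *     }
 *   }
 */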
}

View File

@ -25,11 +25,13 @@ final class SloppyPhraseScorer extends PhraseScorer {
private PhrasePositions repeats[];
private PhrasePositions tmpPos[]; // for flipping repeating pps.
private boolean checkedRepeats;
private final Similarity similarity;
SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, Similarity similarity,
int slop, byte[] norms) {
super(weight, postings, similarity, norms);
int slop, Similarity.SloppyDocScorer docScorer) throws IOException {
super(weight, postings, docScorer);
this.slop = slop;
this.similarity = similarity;
}
/**

View File

@ -0,0 +1,831 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.SmallFloat;
/**
* Implementation of {@link Similarity} with the Vector Space Model.
* <p>
* Expert: Scoring API.
* <p>TFIDFSimilarity defines the components of Lucene scoring.
* Overriding computation of these components is a convenient
* way to alter Lucene scoring.
*
* <p>Suggested reading:
* <a href="http://nlp.stanford.edu/IR-book/html/htmledition/queries-as-vectors-1.html">
* Introduction To Information Retrieval, Chapter 6</a>.
*
* <p>The following describes how Lucene scoring evolves from
* underlying information retrieval models to (efficient) implementation.
* We first brief on <i>VSM Score</i>,
* then derive from it <i>Lucene's Conceptual Scoring Formula</i>,
* from which, finally, evolves <i>Lucene's Practical Scoring Function</i>
* (the latter is connected directly with Lucene classes and methods).
*
* <p>Lucene combines
* <a href="http://en.wikipedia.org/wiki/Standard_Boolean_model">
* Boolean model (BM) of Information Retrieval</a>
* with
* <a href="http://en.wikipedia.org/wiki/Vector_Space_Model">
* Vector Space Model (VSM) of Information Retrieval</a> -
* documents "approved" by BM are scored by VSM.
*
* <p>In VSM, documents and queries are represented as
* weighted vectors in a multi-dimensional space,
* where each distinct index term is a dimension,
* and weights are
* <a href="http://en.wikipedia.org/wiki/Tfidf">Tf-idf</a> values.
*
* <p>VSM does not require weights to be <i>Tf-idf</i> values,
* but <i>Tf-idf</i> values are believed to produce search results of high quality,
* and so Lucene uses <i>Tf-idf</i>.
* <i>Tf</i> and <i>Idf</i> are described in more detail below,
* but for now, for completeness, let's just say that
* for given term <i>t</i> and document (or query) <i>x</i>,
* <i>Tf(t,x)</i> varies with the number of occurrences of term <i>t</i> in <i>x</i>
* (when one increases so does the other) and
* <i>idf(t)</i> similarly varies with the inverse of the
* number of index documents containing term <i>t</i>.
*
* <p><i>VSM score</i> of document <i>d</i> for query <i>q</i> is the
* <a href="http://en.wikipedia.org/wiki/Cosine_similarity">
* Cosine Similarity</a>
* of the weighted query vectors <i>V(q)</i> and <i>V(d)</i>:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr><td>
* <table cellpadding="1" cellspacing="0" border="1" align="center">
* <tr><td>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* cosine-similarity(q,d) &nbsp; = &nbsp;
* </td>
* <td valign="middle" align="center">
* <table>
* <tr><td align="center"><small>V(q)&nbsp;&middot;&nbsp;V(d)</small></td></tr>
* <tr><td align="center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
* <tr><td align="center"><small>|V(q)|&nbsp;|V(d)|</small></td></tr>
* </table>
* </td>
* </tr>
* </table>
* </td></tr>
* </table>
* </td></tr>
* <tr><td>
* <center><font size="-1"><u>VSM Score</u></font></center>
* </td></tr>
* </table>
* <br>&nbsp;<br>
*
*
* Where <i>V(q)</i> &middot; <i>V(d)</i> is the
* <a href="http://en.wikipedia.org/wiki/Dot_product">dot product</a>
* of the weighted vectors,
* and <i>|V(q)|</i> and <i>|V(d)|</i> are their
* <a href="http://en.wikipedia.org/wiki/Euclidean_norm#Euclidean_norm">Euclidean norms</a>.
*
* <p>Note: the above equation can be viewed as the dot product of
* the normalized weighted vectors, in the sense that dividing
* <i>V(q)</i> by its Euclidean norm normalizes it to a unit vector.
*
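* A tiny plain-Java illustration of the formula above (the weight maps are hypothetical):
* <pre>
*   static float cosine(Map&lt;String,Float&gt; q, Map&lt;String,Float&gt; d) {
*     float dot = 0, qNorm = 0, dNorm = 0;
*     for (Map.Entry&lt;String,Float&gt; e : q.entrySet()) {
*       Float w = d.get(e.getKey());
*       if (w != null) dot += e.getValue() * w;
*       qNorm += e.getValue() * e.getValue();
*     }
*     for (float w : d.values()) dNorm += w * w;
*     return dot / (float)(Math.sqrt(qNorm) * Math.sqrt(dNorm));
*   }
* </pre>
*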
* <p>Lucene refines <i>VSM score</i> for both search quality and usability:
* <ul>
* <li>Normalizing <i>V(d)</i> to the unit vector is known to be problematic in that
* it removes all document length information.
* For some documents removing this information is probably fine,
* e.g. a document made by duplicating a certain paragraph <i>10</i> times,
* especially if that paragraph is made of distinct terms.
* But for a document which contains no duplicated paragraphs,
* this might be wrong.
* To avoid this problem, a different document length normalization
* factor is used, which normalizes to a vector equal to or larger
* than the unit vector: <i>doc-len-norm(d)</i>.
* </li>
*
* <li>At indexing, users can specify that certain documents are more
* important than others, by assigning a document boost.
* For this, the score of each document is also multiplied by its boost value
* <i>doc-boost(d)</i>.
* </li>
*
* <li>Lucene is field based, hence each query term applies to a single
* field; document length normalization is by the length of that field,
* and in addition to document boost there are also document field boosts.
* </li>
*
* <li>The same field can be added to a document during indexing several times,
* and so the boost of that field is the product of the boosts of
* the separate additions (or parts) of that field within the document.
* </li>
*
* <li>At search time users can specify boosts to each query, sub-query, and
* each query term, hence the contribution of a query term to the score of
* a document is multiplied by the boost of that query term <i>query-boost(q)</i>.
* </li>
*
* <li>A document may match a multi term query without containing all
* the terms of that query (this is valid for some query types),
* and users can further reward documents matching more query terms
* through a coordination factor, which is usually larger when
* more terms are matched: <i>coord-factor(q,d)</i>.
* </li>
* </ul>
*
* <p>Under the simplifying assumption of a single field in the index,
* we get <i>Lucene's Conceptual scoring formula</i>:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr><td>
* <table cellpadding="1" cellspacing="0" border="1" align="center">
* <tr><td>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* score(q,d) &nbsp; = &nbsp;
* <font color="#FF9933">coord-factor(q,d)</font> &middot; &nbsp;
* <font color="#CCCC00">query-boost(q)</font> &middot; &nbsp;
* </td>
* <td valign="middle" align="center">
* <table>
* <tr><td align="center"><small><font color="#993399">V(q)&nbsp;&middot;&nbsp;V(d)</font></small></td></tr>
* <tr><td align="center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
* <tr><td align="center"><small><font color="#FF33CC">|V(q)|</font></small></td></tr>
* </table>
* </td>
* <td valign="middle" align="right" rowspan="1">
* &nbsp; &middot; &nbsp; <font color="#3399FF">doc-len-norm(d)</font>
* &nbsp; &middot; &nbsp; <font color="#3399FF">doc-boost(d)</font>
* </td>
* </tr>
* </table>
* </td></tr>
* </table>
* </td></tr>
* <tr><td>
* <center><font size="-1"><u>Lucene Conceptual Scoring Formula</u></font></center>
* </td></tr>
* </table>
* <br>&nbsp;<br>
*
* <p>The conceptual formula is a simplification in the sense that (1) terms and documents
* are fielded and (2) boosts are usually per query term rather than per query.
*
* <p>We now describe how Lucene implements this conceptual scoring formula, and
* derive from it <i>Lucene's Practical Scoring Function</i>.
*
* <p>For efficient score computation some scoring components
* are computed and aggregated in advance:
*
* <ul>
* <li><i>Query-boost</i> for the query (actually for each query term)
* is known when search starts.
* </li>
*
* <li>Query Euclidean norm <i>|V(q)|</i> can be computed when search starts,
* as it is independent of the document being scored.
* From a search optimization perspective, it is a valid question
* why we bother to normalize the query at all, because all
* scored documents will be multiplied by the same <i>|V(q)|</i>,
* and hence document ranks (their order by score) will not
* be affected by this normalization.
* There are two good reasons to keep this normalization:
* <ul>
* <li>Recall that
* <a href="http://en.wikipedia.org/wiki/Cosine_similarity">
* Cosine Similarity</a> can be used to find how similar
* two documents are. One can use Lucene for e.g.
* clustering, and use a document as a query to compute
* its similarity to other documents.
* In this use case it is important that the score of document <i>d3</i>
* for query <i>d1</i> is comparable to the score of document <i>d3</i>
* for query <i>d2</i>. In other words, scores of a document for two
* distinct queries should be comparable.
* There are other applications that may require this.
* And this is exactly what normalizing the query vector <i>V(q)</i>
* provides: comparability (to a certain extent) of two or more queries.
* </li>
*
* <li>Applying query normalization on the scores helps to keep the
* scores around the unit vector, hence preventing loss of score data
* because of floating point precision limitations.
* </li>
* </ul>
* </li>
*
* <li>Document length norm <i>doc-len-norm(d)</i> and document
* boost <i>doc-boost(d)</i> are known at indexing time.
* They are computed in advance and their multiplication
* is saved as a single value in the index: <i>norm(d)</i>.
* (In the equations below, <i>norm(t in d)</i> means <i>norm(field(t) in doc d)</i>
* where <i>field(t)</i> is the field associated with term <i>t</i>.)
* </li>
* </ul>
*
* <p><i>Lucene's Practical Scoring Function</i> is derived from the above.
* The color codes demonstrate how it relates
* to those of the <i>conceptual</i> formula:
*
* <P>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr><td>
* <table cellpadding="" cellspacing="2" border="2" align="center">
* <tr><td>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* score(q,d) &nbsp; = &nbsp;
* <A HREF="#formula_coord"><font color="#FF9933">coord(q,d)</font></A> &nbsp;&middot;&nbsp;
* <A HREF="#formula_queryNorm"><font color="#FF33CC">queryNorm(q)</font></A> &nbsp;&middot;&nbsp;
* </td>
* <td valign="bottom" align="center" rowspan="1">
* <big><big><big>&sum;</big></big></big>
* </td>
* <td valign="middle" align="right" rowspan="1">
* <big><big>(</big></big>
* <A HREF="#formula_tf"><font color="#993399">tf(t in d)</font></A> &nbsp;&middot;&nbsp;
* <A HREF="#formula_idf"><font color="#993399">idf(t)</font></A><sup>2</sup> &nbsp;&middot;&nbsp;
* <A HREF="#formula_termBoost"><font color="#CCCC00">t.getBoost()</font></A>&nbsp;&middot;&nbsp;
* <A HREF="#formula_norm"><font color="#3399FF">norm(t,d)</font></A>
* <big><big>)</big></big>
* </td>
* </tr>
* <tr valign="top">
* <td></td>
* <td align="center"><small>t in q</small></td>
* <td></td>
* </tr>
* </table>
* </td></tr>
* </table>
* </td></tr>
* <tr><td>
* <center><font size="-1"><u>Lucene Practical Scoring Function</u></font></center>
* </td></tr>
* </table>
*
* <p> where
* <ol>
* <li>
* <A NAME="formula_tf"></A>
* <b><i>tf(t in d)</i></b>
* correlates to the term's <i>frequency</i>,
* defined as the number of times term <i>t</i> appears in the currently scored document <i>d</i>.
* Documents that have more occurrences of a given term receive a higher score.
* Note that <i>tf(t in q)</i> is assumed to be <i>1</i> and therefore it does not appear in this equation.
* However, if a query contains the same term twice, there will be
* two term-queries with that same term, and hence the computation is still correct (although
* not very efficient).
* The default computation for <i>tf(t in d)</i> in
* {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)} &nbsp; = &nbsp;
* </td>
* <td valign="top" align="center" rowspan="1">
* frequency<sup><big>&frac12;</big></sup>
* </td>
* </tr>
* </table>
* <br>&nbsp;<br>
* </li>
*
* <li>
* <A NAME="formula_idf"></A>
* <b><i>idf(t)</i></b> stands for Inverse Document Frequency. This value
* correlates to the inverse of <i>docFreq</i>
* (the number of documents in which the term <i>t</i> appears).
* This means rarer terms contribute more to the total score.
* <i>idf(t)</i> appears for <i>t</i> in both the query and the document,
* hence it is squared in the equation.
* The default computation for <i>idf(t)</i> in
* {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right">
* {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)}&nbsp; = &nbsp;
* </td>
* <td valign="middle" align="center">
* 1 + log <big>(</big>
* </td>
* <td valign="middle" align="center">
* <table>
* <tr><td align="center"><small>numDocs</small></td></tr>
* <tr><td align="center">&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;</td></tr>
* <tr><td align="center"><small>docFreq+1</small></td></tr>
* </table>
* </td>
* <td valign="middle" align="center">
* <big>)</big>
* </td>
* </tr>
* </table>
* <br>&nbsp;<br>
* </li>
*
* <li>
* <A NAME="formula_coord"></A>
* <b><i>coord(q,d)</i></b>
* is a score factor based on how many of the query terms are found in the specified document.
* Typically, a document that contains more of the query's terms will receive a higher score
* than another document with fewer query terms.
* This is a search time factor computed in
* {@link SimilarityProvider#coord(int, int) coord(q,d)}
* by the SimilarityProvider in effect at search time.
* <br>&nbsp;<br>
* </li>
*
* <li><b>
* <A NAME="formula_queryNorm"></A>
* <i>queryNorm(q)</i>
* </b>
* is a normalizing factor used to make scores between queries comparable.
* This factor does not affect document ranking (since all ranked documents are multiplied by the same factor),
* but rather just attempts to make scores from different queries (or even different indexes) comparable.
* This is a search time factor computed by the SimilarityProvider in effect at search time.
*
* The default computation in
* {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) DefaultSimilarityProvider}
* produces a <a href="http://en.wikipedia.org/wiki/Euclidean_norm#Euclidean_norm">Euclidean norm</a>:
* <br>&nbsp;<br>
* <table cellpadding="1" cellspacing="0" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* queryNorm(q) &nbsp; = &nbsp;
* {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) queryNorm(sumOfSquaredWeights)}
* &nbsp; = &nbsp;
* </td>
* <td valign="middle" align="center" rowspan="1">
* <table>
* <tr><td align="center"><big>1</big></td></tr>
* <tr><td align="center"><big>
* &ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;&ndash;
* </big></td></tr>
* <tr><td align="center">sumOfSquaredWeights<sup><big>&frac12;</big></sup></td></tr>
* </table>
* </td>
* </tr>
* </table>
* <br>&nbsp;<br>
*
* The sum of squared weights (of the query terms) is
* computed by the query {@link org.apache.lucene.search.Weight} object.
* For example, a {@link org.apache.lucene.search.BooleanQuery}
* computes this value as:
*
* <br>&nbsp;<br>
* <table cellpadding="1" cellspacing="0" border="0"n align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* {@link org.apache.lucene.search.Weight#getValueForNormalization() sumOfSquaredWeights} &nbsp; = &nbsp;
* {@link org.apache.lucene.search.Query#getBoost() q.getBoost()} <sup><big>2</big></sup>
* &nbsp;&middot;&nbsp;
* </td>
* <td valign="bottom" align="center" rowspan="1">
* <big><big><big>&sum;</big></big></big>
* </td>
* <td valign="middle" align="right" rowspan="1">
* <big><big>(</big></big>
* <A HREF="#formula_idf">idf(t)</A> &nbsp;&middot;&nbsp;
* <A HREF="#formula_termBoost">t.getBoost()</A>
* <big><big>) <sup>2</sup> </big></big>
* </td>
* </tr>
* <tr valign="top">
* <td></td>
* <td align="center"><small>t in q</small></td>
* <td></td>
* </tr>
* </table>
* <br>&nbsp;<br>
*
* </li>
*
* <li>
* <A NAME="formula_termBoost"></A>
* <b><i>t.getBoost()</i></b>
* is a search time boost of term <i>t</i> in the query <i>q</i> as
* specified in the query text
* (see <A HREF="../../../../../../queryparsersyntax.html#Boosting a Term">query syntax</A>),
* or as set by application calls to
* {@link org.apache.lucene.search.Query#setBoost(float) setBoost()}.
* Notice that there is really no direct API for accessing a boost of one term in a multi term query,
* but rather multiple terms are represented in a query as multiple
* {@link org.apache.lucene.search.TermQuery TermQuery} objects,
* and so the boost of a term in the query is accessible by calling the sub-query
* {@link org.apache.lucene.search.Query#getBoost() getBoost()}.
* <br>&nbsp;<br>
* </li>
*
* <li>
* <A NAME="formula_norm"></A>
* <b><i>norm(t,d)</i></b> encapsulates a few (indexing time) boost and length factors:
*
* <ul>
* <li><b>Document boost</b> - set by calling
* {@link org.apache.lucene.document.Document#setBoost(float) doc.setBoost()}
* before adding the document to the index.
* </li>
* <li><b>Field boost</b> - set by calling
* {@link org.apache.lucene.document.Fieldable#setBoost(float) field.setBoost()}
* before adding the field to a document.
* </li>
* <li><b>lengthNorm</b> - computed
* when the document is added to the index in accordance with the number of tokens
* of this field in the document, so that shorter fields contribute more to the score.
* LengthNorm is computed by the Similarity class in effect at indexing.
* </li>
* </ul>
* The {@link #computeNorm} method is responsible for
* combining all of these factors into a single float.
*
* <p>
* When a document is added to the index, all the above factors are multiplied.
* If the document has multiple fields with the same name, all their boosts are multiplied together:
*
* <br>&nbsp;<br>
* <table cellpadding="1" cellspacing="0" border="0"n align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* norm(t,d) &nbsp; = &nbsp;
* {@link org.apache.lucene.document.Document#getBoost() doc.getBoost()}
* &nbsp;&middot;&nbsp;
* lengthNorm
* &nbsp;&middot;&nbsp;
* </td>
* <td valign="bottom" align="center" rowspan="1">
* <big><big><big>&prod;</big></big></big>
* </td>
* <td valign="middle" align="right" rowspan="1">
* {@link org.apache.lucene.document.Fieldable#getBoost() f.getBoost}()
* </td>
* </tr>
* <tr valign="top">
* <td></td>
* <td align="center"><small>field <i><b>f</b></i> in <i>d</i> named as <i><b>t</b></i></small></td>
* <td></td>
* </tr>
* </table>
* <br>&nbsp;<br>
* However, the resulting <i>norm</i> value is {@link #encodeNormValue(float) encoded} as a single byte
* before being stored.
* At search time, the norm byte value is read from the index
* {@link org.apache.lucene.store.Directory directory} and
* {@link #decodeNormValue(byte) decoded} back to a float <i>norm</i> value.
* This encoding/decoding, while reducing index size, comes with the price of
* precision loss - it is not guaranteed that <i>decode(encode(x)) = x</i>.
* For instance, <i>decode(encode(0.89)) = 0.75</i>.
* <br>&nbsp;<br>
* Compression of norm values to a single byte saves memory at search time,
* because once a field is referenced at search time, its norms - for
* all documents - are maintained in memory.
* <br>&nbsp;<br>
* The rationale supporting such lossy compression of norm values is that,
* given the difficulty (and inaccuracy) with which users express their true information
* need in a query, only big differences matter.
* <br>&nbsp;<br>
* Last, note that search time is too late to modify this <i>norm</i> part of scoring, e.g. by
* using a different {@link Similarity} for search.
* <br>&nbsp;<br>
* </li>
* </ol>
*
* @see org.apache.lucene.index.IndexWriterConfig#setSimilarityProvider(SimilarityProvider)
* @see IndexSearcher#setSimilarityProvider(SimilarityProvider)
*/
public abstract class TFIDFSimilarity extends Similarity {
/** Computes a score factor based on a term or phrase's frequency in a
* document. This value is multiplied by the {@link #idf(int, int)}
* factor for each term in the query and these products are then summed to
* form the initial score for a document.
*
* <p>Terms and phrases repeated in a document indicate the topic of the
* document, so implementations of this method usually return larger values
* when <code>freq</code> is large, and smaller values when <code>freq</code>
* is small.
*
* <p>The default implementation calls {@link #tf(float)}.
*
* @param freq the frequency of a term within a document
* @return a score factor based on a term's within-document frequency
*/
public float tf(int freq) {
return tf((float)freq);
}
/** Computes a score factor based on a term or phrase's frequency in a
* document. This value is multiplied by the {@link #idf(int, int)}
* factor for each term in the query and these products are then summed to
* form the initial score for a document.
*
* <p>Terms and phrases repeated in a document indicate the topic of the
* document, so implementations of this method usually return larger values
* when <code>freq</code> is large, and smaller values when <code>freq</code>
* is small.
*
* @param freq the frequency of a term within a document
* @return a score factor based on a term's within-document frequency
*/
public abstract float tf(float freq);
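/* A sketch (hypothetical subclass code) of a tf implementation matching the
 * documented default, tf(t in d) = frequency^(1/2):
 *
 *   public float tf(float freq) {
 *     return (float)Math.sqrt(freq);
 *   }
 */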
/**
* Computes a score factor for a simple term and returns an explanation
* for that score factor.
*
* <p>
* The default implementation uses:
*
* <pre>
* idf(docFreq, searcher.maxDoc());
* </pre>
*
* Note that {@link IndexSearcher#maxDoc()} is used instead of
* {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because
* {@link IndexSearcher#docFreq(Term)} is also used, and when the latter
* is inaccurate, so is {@link IndexSearcher#maxDoc()}, and in the same direction.
* In addition, {@link IndexSearcher#maxDoc()} is more efficient to compute.
*
* @param stats statistics of the term in question
* @param searcher the document collection being searched
* @return an Explanation object that includes both an idf score factor
* and an explanation for the term.
* @throws IOException
*/
public Explanation idfExplain(TermContext stats, final IndexSearcher searcher) throws IOException {
final int df = stats.docFreq();
final int max = searcher.maxDoc();
final float idf = idf(df, max);
return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
}
/**
* Computes a score factor for a phrase.
*
* <p>
* The default implementation sums the idf factor for
* each term in the phrase.
*
* @param stats statistics of the terms in the phrase
* @param searcher the document collection being searched
* @return an Explanation object that includes both an idf
* score factor for the phrase and an explanation
* for each term.
* @throws IOException
*/
public Explanation idfExplain(final TermContext stats[], IndexSearcher searcher) throws IOException {
final int max = searcher.maxDoc();
float idf = 0.0f;
final Explanation exp = new Explanation();
exp.setDescription("idf(), sum of:");
for (final TermContext stat : stats ) {
final int df = stat.docFreq();
final float termIdf = idf(df, max);
exp.addDetail(new Explanation(termIdf, "idf(docFreq=" + df + ", maxDocs=" + max + ")"));
idf += termIdf;
}
exp.setValue(idf);
return exp;
}
/** Computes a score factor based on a term's document frequency (the number
* of documents which contain the term). This value is multiplied by the
* {@link #tf(int)} factor for each term in the query and these products are
* then summed to form the initial score for a document.
*
* <p>Terms that occur in fewer documents are better indicators of topic, so
* implementations of this method usually return larger values for rare terms,
* and smaller values for common terms.
*
* @param docFreq the number of documents which contain the term
* @param numDocs the total number of documents in the collection
* @return a score factor based on the term's document frequency
*/
public abstract float idf(int docFreq, int numDocs);
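/* A sketch (hypothetical subclass code) of an idf implementation matching the
 * documented default, idf(t) = 1 + log(numDocs / (docFreq + 1)):
 *
 *   public float idf(int docFreq, int numDocs) {
 *     return (float)(Math.log(numDocs / (double)(docFreq + 1)) + 1.0);
 *   }
 */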
/** Cache of decoded bytes. */
private static final float[] NORM_TABLE = new float[256];
static {
for (int i = 0; i < 256; i++)
NORM_TABLE[i] = SmallFloat.byte315ToFloat((byte)i);
}
/** Decodes a normalization factor stored in an index.
* @see #encodeNormValue(float)
*/
public float decodeNormValue(byte b) {
return NORM_TABLE[b & 0xFF]; // & 0xFF maps negative bytes to positive above 127
}
/** Encodes a normalization factor for storage in an index.
*
* <p>The encoding uses a three-bit mantissa, a five-bit exponent, and
* the zero-exponent point at 15, thus
* representing values from around 7x10^9 to 2x10^-9 with about one
* significant decimal digit of accuracy. Zero is also represented.
* Negative numbers are rounded up to zero. Values too large to represent
* are rounded down to the largest representable value. Positive values too
* small to represent are rounded up to the smallest positive representable
* value.
* @see org.apache.lucene.document.Field#setBoost(float)
* @see org.apache.lucene.util.SmallFloat
*/
public byte encodeNormValue(float f) {
return SmallFloat.floatToByte315(f);
}
@Override
public final Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost,
TermContext... termContexts) throws IOException {
final Explanation idf = termContexts.length == 1
? idfExplain(termContexts[0], searcher)
: idfExplain(termContexts, searcher);
return new IDFStats(idf, queryBoost);
}
@Override
public final ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
return new ExactTFIDFDocScorer((IDFStats)stats, context.reader.norms(fieldName));
}
@Override
public final SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
return new SloppyTFIDFDocScorer((IDFStats)stats, context.reader.norms(fieldName));
}
// TODO: we can specialize these for omitNorms up front, but we should test that it doesn't confuse stupid hotspot.
private final class ExactTFIDFDocScorer extends ExactDocScorer {
private final IDFStats stats;
private final float weightValue;
private final byte[] norms;
private static final int SCORE_CACHE_SIZE = 32;
private float[] scoreCache = new float[SCORE_CACHE_SIZE];
ExactTFIDFDocScorer(IDFStats stats, byte norms[]) {
this.stats = stats;
this.weightValue = stats.value;
this.norms = norms;
for (int i = 0; i < SCORE_CACHE_SIZE; i++)
scoreCache[i] = tf(i) * weightValue;
}
@Override
public float score(int doc, int freq) {
final float raw = // compute tf(f)*weight
freq < SCORE_CACHE_SIZE // check cache
? scoreCache[freq] // cache hit
: tf(freq)*weightValue; // cache miss
return norms == null ? raw : raw * decodeNormValue(norms[doc]); // normalize for field
}
@Override
public Explanation explain(int doc, Explanation freq) {
return explainScore(doc, freq, stats, norms);
}
}
private final class SloppyTFIDFDocScorer extends SloppyDocScorer {
private final IDFStats stats;
private final float weightValue;
private final byte[] norms;
SloppyTFIDFDocScorer(IDFStats stats, byte norms[]) {
this.stats = stats;
this.weightValue = stats.value;
this.norms = norms;
}
@Override
public float score(int doc, float freq) {
final float raw = tf(freq) * weightValue; // compute tf(f)*weight
return norms == null ? raw : raw * decodeNormValue(norms[doc]); // normalize for field
}
@Override
public Explanation explain(int doc, Explanation freq) {
return explainScore(doc, freq, stats, norms);
}
}
/** Collection statistics for the TF-IDF model. The only statistic of interest
* to this model is idf. */
private static class IDFStats extends Stats {
/** The idf and its explanation */
private final Explanation idf;
private float queryNorm;
private float queryWeight;
private final float queryBoost;
private float value;
public IDFStats(Explanation idf, float queryBoost) {
// TODO: Validate?
this.idf = idf;
this.queryBoost = queryBoost;
this.queryWeight = idf.getValue() * queryBoost; // compute query weight
}
@Override
public float getValueForNormalization() {
// TODO: (sorta LUCENE-1907) make non-static class and expose this squaring via a nice method to subclasses?
return queryWeight * queryWeight; // sum of squared weights
}
@Override
public void normalize(float queryNorm, float topLevelBoost) {
this.queryNorm = queryNorm * topLevelBoost;
queryWeight *= this.queryNorm; // normalize query weight
value = queryWeight * idf.getValue(); // idf for document
}
}
private Explanation explainScore(int doc, Explanation freq, IDFStats stats, byte[] norms) {
Explanation result = new Explanation();
result.setDescription("score(doc="+doc+",freq="+freq+"), product of:");
// explain query weight
Explanation queryExpl = new Explanation();
queryExpl.setDescription("queryWeight, product of:");
Explanation boostExpl = new Explanation(stats.queryBoost, "boost");
if (stats.queryBoost != 1.0f)
queryExpl.addDetail(boostExpl);
queryExpl.addDetail(stats.idf);
Explanation queryNormExpl = new Explanation(stats.queryNorm,"queryNorm");
queryExpl.addDetail(queryNormExpl);
queryExpl.setValue(boostExpl.getValue() *
stats.idf.getValue() *
queryNormExpl.getValue());
result.addDetail(queryExpl);
// explain field weight
Explanation fieldExpl = new Explanation();
fieldExpl.setDescription("fieldWeight in "+doc+
", product of:");
Explanation tfExplanation = new Explanation();
tfExplanation.setValue(tf(freq.getValue()));
tfExplanation.setDescription("tf(freq="+freq.getValue()+"), with freq of:");
tfExplanation.addDetail(freq);
fieldExpl.addDetail(tfExplanation);
fieldExpl.addDetail(stats.idf);
Explanation fieldNormExpl = new Explanation();
float fieldNorm =
norms!=null ? decodeNormValue(norms[doc]) : 1.0f;
fieldNormExpl.setValue(fieldNorm);
fieldNormExpl.setDescription("fieldNorm(doc="+doc+")");
fieldExpl.addDetail(fieldNormExpl);
fieldExpl.setValue(tfExplanation.getValue() *
stats.idf.getValue() *
fieldNormExpl.getValue());
result.addDetail(fieldExpl);
// combine them
result.setValue(queryExpl.getValue() * fieldExpl.getValue());
if (queryExpl.getValue() == 1.0f)
return fieldExpl;
return result;
}
}

View File

@ -29,7 +29,7 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PerReaderTermState;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.ReaderUtil;
abstract class TermCollectingRewrite<Q extends Query> extends MultiTermQuery.RewriteMethod {
@ -43,7 +43,7 @@ abstract class TermCollectingRewrite<Q extends Query> extends MultiTermQuery.Rew
addClause(topLevel, term, docCount, boost, null);
}
protected abstract void addClause(Q topLevel, Term term, int docCount, float boost, PerReaderTermState states) throws IOException;
protected abstract void addClause(Q topLevel, Term term, int docCount, float boost, TermContext states) throws IOException;
protected final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException {

View File

@ -27,9 +27,9 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.search.Similarity.ExactDocScorer;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PerReaderTermState;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.ReaderUtil;
import org.apache.lucene.util.ToStringUtils;
@ -39,28 +39,19 @@ import org.apache.lucene.util.ToStringUtils;
public class TermQuery extends Query {
private final Term term;
private int docFreq;
private transient PerReaderTermState perReaderTermState;
private transient TermContext perReaderTermState;
private class TermWeight extends Weight {
private final Similarity similarity;
private float value;
private final float idf;
private float queryNorm;
private float queryWeight;
private final IDFExplanation idfExp;
private transient PerReaderTermState termStates;
private final Similarity.Stats stats;
private transient TermContext termStates;
public TermWeight(IndexSearcher searcher, PerReaderTermState termStates, int docFreq)
public TermWeight(IndexSearcher searcher, TermContext termStates)
throws IOException {
assert termStates != null : "PerReaderTermState must not be null";
assert termStates != null : "TermContext must not be null";
this.termStates = termStates;
this.similarity = searcher.getSimilarityProvider().get(term.field());
if (docFreq != -1) {
idfExp = similarity.idfExplain(term, searcher, docFreq);
} else {
idfExp = similarity.idfExplain(term, searcher);
}
idf = idfExp.getIdf();
this.stats = similarity.computeStats(searcher, term.field(), getBoost(), termStates);
}
@Override
@ -70,19 +61,13 @@ public class TermQuery extends Query {
public Query getQuery() { return TermQuery.this; }
@Override
public float getValue() { return value; }
@Override
public float sumOfSquaredWeights() {
queryWeight = idf * getBoost(); // compute query weight
return queryWeight * queryWeight; // square it
public float getValueForNormalization() {
return stats.getValueForNormalization();
}
@Override
public void normalize(float queryNorm) {
this.queryNorm = queryNorm;
queryWeight *= queryNorm; // normalize query weight
value = queryWeight * idf; // idf for document
public void normalize(float queryNorm, float topLevelBoost) {
stats.normalize(queryNorm, topLevelBoost);
}
@Override
@ -97,7 +82,7 @@ public class TermQuery extends Query {
}
final DocsEnum docs = reader.termDocsEnum(reader.getLiveDocs(), field, term.bytes(), state);
assert docs != null;
return new TermScorer(this, docs, similarity, context.reader.norms(field));
return new TermScorer(this, docs, similarity.exactDocScorer(stats, field, context));
}
private boolean termNotInReader(IndexReader reader, String field, BytesRef bytes) throws IOException {
@ -107,79 +92,25 @@ public class TermQuery extends Query {
}
@Override
public Explanation explain(AtomicReaderContext context, int doc)
throws IOException {
final IndexReader reader = context.reader;
ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
Explanation expl = new Explanation(idf, idfExp.explain());
// explain query weight
Explanation queryExpl = new Explanation();
queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:");
Explanation boostExpl = new Explanation(getBoost(), "boost");
if (getBoost() != 1.0f)
queryExpl.addDetail(boostExpl);
queryExpl.addDetail(expl);
Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm");
queryExpl.addDetail(queryNormExpl);
queryExpl.setValue(boostExpl.getValue() *
expl.getValue() *
queryNormExpl.getValue());
result.addDetail(queryExpl);
// explain field weight
String field = term.field();
ComplexExplanation fieldExpl = new ComplexExplanation();
fieldExpl.setDescription("fieldWeight("+term+" in "+doc+
"), product of:");
Explanation tfExplanation = new Explanation();
int tf = 0;
public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
IndexReader reader = context.reader;
DocsEnum docs = reader.termDocsEnum(context.reader.getLiveDocs(), term.field(), term.bytes());
if (docs != null) {
int newDoc = docs.advance(doc);
if (newDoc == doc) {
tf = docs.freq();
}
tfExplanation.setValue(similarity.tf(tf));
tfExplanation.setDescription("tf(termFreq("+term+")="+tf+")");
} else {
tfExplanation.setValue(0.0f);
tfExplanation.setDescription("no matching term");
int newDoc = docs.advance(doc);
if (newDoc == doc) {
int freq = docs.freq();
ExactDocScorer docScorer = similarity.exactDocScorer(stats, term.field(), context);
ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "termFreq=" + freq));
result.addDetail(scoreExplanation);
result.setValue(scoreExplanation.getValue());
result.setMatch(true);
return result;
}
}
fieldExpl.addDetail(tfExplanation);
fieldExpl.addDetail(expl);
Explanation fieldNormExpl = new Explanation();
final byte[] fieldNorms = reader.norms(field);
float fieldNorm =
fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f;
fieldNormExpl.setValue(fieldNorm);
fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")");
fieldExpl.addDetail(fieldNormExpl);
fieldExpl.setMatch(Boolean.valueOf(tfExplanation.isMatch()));
fieldExpl.setValue(tfExplanation.getValue() *
expl.getValue() *
fieldNormExpl.getValue());
result.addDetail(fieldExpl);
result.setMatch(fieldExpl.getMatch());
// combine them
result.setValue(queryExpl.getValue() * fieldExpl.getValue());
if (queryExpl.getValue() == 1.0f)
return fieldExpl;
return result;
return new ComplexExplanation(false, 0.0f, "no matching term");
}
}
@@ -200,7 +131,7 @@ public class TermQuery extends Query {
/** Expert: constructs a TermQuery that will use the
* provided docFreq instead of looking up the docFreq
* against the searcher. */
public TermQuery(Term t, PerReaderTermState states) {
public TermQuery(Term t, TermContext states) {
assert states != null;
term = t;
docFreq = states.docFreq();
@@ -213,20 +144,20 @@
@Override
public Weight createWeight(IndexSearcher searcher) throws IOException {
final ReaderContext context = searcher.getTopReaderContext();
final int weightDocFreq;
final PerReaderTermState termState;
final TermContext termState;
if (perReaderTermState == null || perReaderTermState.topReaderContext != context) {
// make TermQuery single-pass if we don't have a PRTS or if the context differs!
termState = PerReaderTermState.build(context, term, true); // cache term lookups!
// we must not ignore the given docFreq - if set use the given value
weightDocFreq = docFreq == -1 ? termState.docFreq() : docFreq;
termState = TermContext.build(context, term, true); // cache term lookups!
} else {
// PRTS was pre-built for this IndexSearcher
termState = this.perReaderTermState;
weightDocFreq = docFreq;
}
return new TermWeight(searcher, termState, weightDocFreq);
// we must not ignore the given docFreq - if set use the given value (lie)
if (docFreq != -1)
termState.setDocFreq(docFreq);
return new TermWeight(searcher, termState);
}
@Override
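A hedged usage sketch for the reusable-TermContext path above; the searcher, field name, and term are illustrative assumptions, not code from this patch:

// Build the TermContext once, then let createWeight reuse it (single pass):
ReaderContext top = searcher.getTopReaderContext();
Term term = new Term("body", "lucene");                  // illustrative field/term
TermContext states = TermContext.build(top, term, true); // cache term lookups per leaf
TermQuery query = new TermQuery(term, states);           // docFreq taken from the context
Weight weight = query.createWeight(searcher);            // skips the per-leaf re-seek above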


@@ -25,20 +25,16 @@ import org.apache.lucene.index.DocsEnum;
*/
final class TermScorer extends Scorer {
private DocsEnum docsEnum;
private byte[] norms;
private float weightValue;
private int doc = -1;
private int freq;
private int pointer;
private int pointerMax;
private static final int SCORE_CACHE_SIZE = 32;
private float[] scoreCache = new float[SCORE_CACHE_SIZE];
private int[] docs;
private int[] freqs;
private final DocsEnum.BulkReadResult bulkResult;
private final Similarity similarity;
private final Similarity.ExactDocScorer docScorer;
/**
* Construct a <code>TermScorer</code>.
@@ -47,22 +43,15 @@ final class TermScorer extends Scorer {
* The weight of the <code>Term</code> in the query.
* @param td
* An iterator over the documents matching the <code>Term</code>.
* @param similarity
* The <code>Similarity</code> implementation to be used for score
* computations.
* @param norms
* The field norms of the document fields for the <code>Term</code>.
* @param docScorer
* The <code>Similarity.ExactDocScorer</code> implementation
* to be used for score computations.
*/
TermScorer(Weight weight, DocsEnum td, Similarity similarity, byte[] norms) {
TermScorer(Weight weight, DocsEnum td, Similarity.ExactDocScorer docScorer) throws IOException {
super(weight);
this.similarity = similarity;
this.docScorer = docScorer;
this.docsEnum = td;
this.norms = norms;
this.weightValue = weight.getValue();
bulkResult = td.getBulkResult();
for (int i = 0; i < SCORE_CACHE_SIZE; i++)
scoreCache[i] = similarity.tf(i) * weightValue;
}
@Override
@@ -134,12 +123,7 @@ final class TermScorer extends Scorer {
@Override
public float score() {
assert doc != NO_MORE_DOCS;
float raw = // compute tf(f)*weight
freq < SCORE_CACHE_SIZE // check cache
? scoreCache[freq] // cache hit
: similarity.tf(freq)*weightValue; // cache miss
return norms == null ? raw : raw * similarity.decodeNormValue(norms[doc]); // normalize for field
return docScorer.score(doc, freq);
}
/**


@@ -29,7 +29,7 @@ import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PerReaderTermState;
import org.apache.lucene.util.TermContext;
/**
* Base rewrite method for collecting only the top terms
@@ -80,7 +80,7 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
this.termComp = termsEnum.getComparator();
// lazy init the initial ScoreTerm because comparator is not known on ctor:
if (st == null)
st = new ScoreTerm(this.termComp, new PerReaderTermState(topReaderContext));
st = new ScoreTerm(this.termComp, new TermContext(topReaderContext));
boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class);
}
@@ -101,14 +101,14 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
if (t != null) {
// if the term is already in the PQ, only update docFreq of term in PQ
assert t.boost == boost : "boost should be equal in all segment TermsEnums";
t.termState.register(state, readerContext.ord, termsEnum.docFreq());
t.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
} else {
// add new entry in PQ, we must clone the term, else it may get overwritten!
st.bytes.copy(bytes);
st.boost = boost;
visitedTerms.put(st.bytes, st);
assert st.termState.docFreq() == 0;
st.termState.register(state, readerContext.ord, termsEnum.docFreq());
st.termState.register(state, readerContext.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
stQueue.offer(st);
// possibly drop entries from queue
if (stQueue.size() > maxSize) {
@@ -116,7 +116,7 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
visitedTerms.remove(st.bytes);
st.termState.clear(); // reset the termstate!
} else {
st = new ScoreTerm(termComp, new PerReaderTermState(topReaderContext));
st = new ScoreTerm(termComp, new TermContext(topReaderContext));
}
assert stQueue.size() <= maxSize : "the PQ size must be limited to maxSize";
// set maxBoostAtt with values to help FuzzyTermsEnum to optimize
@@ -171,8 +171,8 @@ public abstract class TopTermsRewrite<Q extends Query> extends TermCollectingRew
public final Comparator<BytesRef> termComp;
public final BytesRef bytes = new BytesRef();
public float boost;
public final PerReaderTermState termState;
public ScoreTerm(Comparator<BytesRef> termComp, PerReaderTermState termState) {
public final TermContext termState;
public ScoreTerm(Comparator<BytesRef> termComp, TermContext termState) {
this.termComp = termComp;
this.termState = termState;
}


@@ -41,11 +41,11 @@ import org.apache.lucene.index.IndexReader.ReaderContext;
* <ol>
* <li>A <code>Weight</code> is constructed by a top-level query, given a
* <code>IndexSearcher</code> ({@link Query#createWeight(IndexSearcher)}).
* <li>The {@link #sumOfSquaredWeights()} method is called on the
* <li>The {@link #getValueForNormalization()} method is called on the
* <code>Weight</code> to compute the query normalization factor
* {@link SimilarityProvider#queryNorm(float)} of the query clauses contained in the
* query.
* <li>The query normalization factor is passed to {@link #normalize(float)}. At
* <li>The query normalization factor is passed to {@link #normalize(float, float)}. At
* this point the weighting is complete.
* <li>A <code>Scorer</code> is constructed by
* {@link #scorer(IndexReader.AtomicReaderContext, ScorerContext)}.
@@ -68,11 +68,11 @@ public abstract class Weight {
/** The query that this concerns. */
public abstract Query getQuery();
/** The weight for this query. */
public abstract float getValue();
/** The value for normalization of contained query clauses (e.g. sum of squared weights). */
public abstract float getValueForNormalization() throws IOException;
/** Assigns the query normalization factor to this. */
public abstract void normalize(float norm);
/** Assigns the query normalization factor and boost from parent queries to this. */
public abstract void normalize(float norm, float topLevelBoost);
/**
* Returns a {@link Scorer} which scores documents in/out-of order according
@@ -94,9 +94,6 @@
*/
public abstract Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException;
/** The sum of squared weights of contained query clauses. */
public abstract float sumOfSquaredWeights() throws IOException;
/**
* Returns true iff this implementation scores docs only out of order. This
* method is used in conjunction with {@link Collector}'s
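To make the revised lifecycle concrete, a minimal searcher-side driver sketch, assuming an IndexSearcher with a SimilarityProvider and an AtomicReaderContext named leafContext; the top-level boost of 1.0f reflects typical root-level use and is not code from this commit:

Weight weight = query.createWeight(searcher);                     // step 1
float v = weight.getValueForNormalization();                      // step 2: e.g. sum of squared weights
float norm = searcher.getSimilarityProvider().queryNorm(v);       // step 3: provider-defined query norm
weight.normalize(norm, 1.0f);                                     // step 4: topLevelBoost is 1.0 at the root
Scorer scorer = weight.scorer(leafContext, ScorerContext.def());  // step 5: per-leaf scorer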


@@ -18,11 +18,13 @@ package org.apache.lucene.search.payloads;
*/
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.ComplexExplanation;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.search.spans.NearSpansOrdered;
import org.apache.lucene.search.spans.NearSpansUnordered;
import org.apache.lucene.search.spans.SpanNearQuery;
@@ -145,7 +147,35 @@ public class PayloadNearQuery extends SpanNearQuery {
@Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
return new PayloadNearSpanScorer(query.getSpans(context), this,
similarity, context.reader.norms(query.getField()));
similarity, similarity.sloppyDocScorer(stats, query.getField(), context));
}
@Override
public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
PayloadNearSpanScorer scorer = (PayloadNearSpanScorer) scorer(context, ScorerContext.def());
if (scorer != null) {
int newDoc = scorer.advance(doc);
if (newDoc == doc) {
float freq = scorer.freq();
SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, query.getField(), context);
Explanation expl = new Explanation();
expl.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));
expl.addDetail(scoreExplanation);
expl.setValue(scoreExplanation.getValue());
// now the payloads part
Explanation payloadExpl = function.explain(doc, scorer.payloadsSeen, scorer.payloadScore);
// combined
ComplexExplanation result = new ComplexExplanation();
result.addDetail(expl);
result.addDetail(payloadExpl);
result.setValue(expl.getValue() * payloadExpl.getValue());
result.setDescription("PayloadNearQuery, product of:");
return result;
}
}
return new ComplexExplanation(false, 0.0f, "no matching term");
}
}
@@ -155,8 +185,8 @@ public class PayloadNearQuery extends SpanNearQuery {
private int payloadsSeen;
protected PayloadNearSpanScorer(Spans spans, Weight weight,
Similarity similarity, byte[] norms) throws IOException {
super(spans, weight, similarity, norms);
Similarity similarity, Similarity.SloppyDocScorer docScorer) throws IOException {
super(spans, weight, similarity, docScorer);
this.spans = spans;
}
@@ -225,20 +255,6 @@
return super.score()
* function.docScore(doc, fieldName, payloadsSeen, payloadScore);
}
@Override
protected Explanation explain(int doc) throws IOException {
Explanation result = new Explanation();
// Add detail about tf/idf...
Explanation nonPayloadExpl = super.explain(doc);
result.addDetail(nonPayloadExpl);
// Add detail about payload
Explanation payloadExpl = function.explain(doc, payloadsSeen, payloadScore);
result.addDetail(payloadExpl);
result.setValue(nonPayloadExpl.getValue() * payloadExpl.getValue());
result.setDescription("PayloadNearQuery, product of:");
return result;
}
}
}


@@ -26,6 +26,9 @@ import org.apache.lucene.search.Weight;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.ComplexExplanation;
import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.search.Weight.ScorerContext;
import org.apache.lucene.search.payloads.PayloadNearQuery.PayloadNearSpanScorer;
import org.apache.lucene.search.spans.TermSpans;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanWeight;
@@ -76,7 +79,7 @@ public class PayloadTermQuery extends SpanTermQuery {
@Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
return new PayloadTermSpanScorer((TermSpans) query.getSpans(context),
this, similarity, context.reader.norms(query.getField()));
this, similarity, similarity.sloppyDocScorer(stats, query.getField(), context));
}
protected class PayloadTermSpanScorer extends SpanScorer {
@@ -86,8 +89,8 @@ public class PayloadTermQuery extends SpanTermQuery {
private final TermSpans termSpans;
public PayloadTermSpanScorer(TermSpans spans, Weight weight,
Similarity similarity, byte[] norms) throws IOException {
super(spans, weight, similarity, norms);
Similarity similarity, Similarity.SloppyDocScorer docScorer) throws IOException {
super(spans, weight, similarity, docScorer);
termSpans = spans;
}
@@ -173,29 +176,40 @@
protected float getPayloadScore() {
return function.docScore(doc, term.field(), payloadsSeen, payloadScore);
}
}
@Override
protected Explanation explain(final int doc) throws IOException {
ComplexExplanation result = new ComplexExplanation();
Explanation nonPayloadExpl = super.explain(doc);
result.addDetail(nonPayloadExpl);
// QUESTION: Is there a way to avoid this skipTo call? We need to know
// whether to load the payload or not
Explanation payloadBoost = new Explanation();
result.addDetail(payloadBoost);
float payloadScore = getPayloadScore();
payloadBoost.setValue(payloadScore);
// GSI: I suppose we could toString the payload, but I don't think that
// would be a good idea
payloadBoost.setDescription("scorePayload(...)");
result.setValue(nonPayloadExpl.getValue() * payloadScore);
result.setDescription("btq, product of:");
result.setMatch(nonPayloadExpl.getValue() == 0 ? Boolean.FALSE
: Boolean.TRUE); // LUCENE-1303
return result;
@Override
public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
PayloadTermSpanScorer scorer = (PayloadTermSpanScorer) scorer(context, ScorerContext.def());
if (scorer != null) {
int newDoc = scorer.advance(doc);
if (newDoc == doc) {
float freq = scorer.freq();
SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, query.getField(), context);
Explanation expl = new Explanation();
expl.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));
expl.addDetail(scoreExplanation);
expl.setValue(scoreExplanation.getValue());
// now the payloads part
// QUESTION: Is there a way to avoid this skipTo call? We need to know
// whether to load the payload or not
// GSI: I suppose we could toString the payload, but I don't think that
// would be a good idea
Explanation payloadExpl = new Explanation(scorer.getPayloadScore(), "scorePayload(...)");
payloadExpl.setValue(scorer.getPayloadScore());
// combined
ComplexExplanation result = new ComplexExplanation();
result.addDetail(expl);
result.addDetail(payloadExpl);
result.setValue(expl.getValue() * payloadExpl.getValue());
result.setDescription("btq, product of:");
result.setMatch(expl.getValue() == 0 ? Boolean.FALSE : Boolean.TRUE); // LUCENE-1303
return result;
}
}
return new ComplexExplanation(false, 0.0f, "no matching term");
}
}
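A hedged sketch of exercising the rewritten explain path; the MaxPayloadFunction, field name, and doc id are illustrative choices, not part of this patch:

PayloadTermQuery btq = new PayloadTermQuery(
    new Term("body", "lucene"), new MaxPayloadFunction());
Explanation e = searcher.explain(btq, docID);
// e renders as "btq, product of:" with a weight(...) detail and a scorePayload(...) detail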


@@ -27,7 +27,7 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopTermsRewrite;
import org.apache.lucene.search.ScoringRewrite;
import org.apache.lucene.search.BooleanClause.Occur; // javadocs only
import org.apache.lucene.util.PerReaderTermState;
import org.apache.lucene.util.TermContext;
/**
* Wraps any {@link MultiTermQuery} as a {@link SpanQuery},
@@ -155,7 +155,7 @@ public class SpanMultiTermQueryWrapper<Q extends MultiTermQuery> extends SpanQue
}
@Override
protected void addClause(SpanOrQuery topLevel, Term term, int docCount, float boost, PerReaderTermState states) {
protected void addClause(SpanOrQuery topLevel, Term term, int docCount, float boost, TermContext states) {
final SpanTermQuery q = new SpanTermQuery(term);
q.setBoost(boost);
topLevel.addClause(q);
@@ -204,7 +204,7 @@ public class SpanMultiTermQueryWrapper<Q extends MultiTermQuery> extends SpanQue
}
@Override
protected void addClause(SpanOrQuery topLevel, Term term, int docFreq, float boost, PerReaderTermState states) {
protected void addClause(SpanOrQuery topLevel, Term term, int docFreq, float boost, TermContext states) {
final SpanTermQuery q = new SpanTermQuery(term);
q.setBoost(boost);
topLevel.addClause(q);
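For orientation, a hedged sketch of typical wrapper usage; the wildcard term is an illustrative choice:

WildcardQuery wildcard = new WildcardQuery(new Term("body", "luc*"));
SpanQuery spanWildcard = new SpanMultiTermQueryWrapper<WildcardQuery>(wildcard);
// on rewrite, each surviving term reaches the addClause(...) hook above
// together with its TermContext, so the SpanTermQuery clauses share cached states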


@@ -20,6 +20,7 @@ package org.apache.lucene.search.spans;
import java.io.IOException;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TFIDFSimilarity;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Similarity;
@@ -29,22 +30,21 @@ import org.apache.lucene.search.Similarity;
*/
public class SpanScorer extends Scorer {
protected Spans spans;
protected byte[] norms;
protected float value;
protected boolean more = true;
protected int doc;
protected float freq;
protected final Similarity similarity;
protected final Similarity.SloppyDocScorer docScorer;
protected SpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms)
protected SpanScorer(Spans spans, Weight weight, Similarity similarity, Similarity.SloppyDocScorer docScorer)
throws IOException {
super(weight);
this.similarity = similarity;
this.docScorer = docScorer;
this.spans = spans;
this.norms = norms;
this.value = weight.getValue();
if (this.spans.next()) {
doc = -1;
} else {
@@ -94,27 +94,11 @@ public class SpanScorer extends Scorer {
@Override
public float score() throws IOException {
float raw = similarity.tf(freq) * value; // raw score
return norms == null? raw : raw * similarity.decodeNormValue(norms[doc]); // normalize
return docScorer.score(doc, freq);
}
@Override
public float freq() throws IOException {
return freq;
}
/** This method is no longer an official member of {@link Scorer},
* but it is needed by SpanWeight to build an explanation. */
protected Explanation explain(final int doc) throws IOException {
Explanation tfExplanation = new Explanation();
int expDoc = advance(doc);
float phraseFreq = (expDoc == doc) ? freq : 0.0f;
tfExplanation.setValue(similarity.tf(phraseFreq));
tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")");
return tfExplanation;
}
}


@@ -18,125 +18,76 @@ package org.apache.lucene.search.spans;
*/
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.search.Explanation.IDFExplanation;
import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.util.TermContext;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.TreeSet;
/**
* Expert-only. Public for use by other weight implementations
*/
public class SpanWeight extends Weight {
protected Similarity similarity;
protected float value;
protected float idf;
protected float queryNorm;
protected float queryWeight;
protected Set<Term> terms;
protected SpanQuery query;
private IDFExplanation idfExp;
protected Similarity.Stats stats;
public SpanWeight(SpanQuery query, IndexSearcher searcher)
throws IOException {
this.similarity = searcher.getSimilarityProvider().get(query.getField());
this.query = query;
terms=new HashSet<Term>();
terms=new TreeSet<Term>();
query.extractTerms(terms);
idfExp = similarity.idfExplain(terms, searcher);
idf = idfExp.getIdf();
final ReaderContext context = searcher.getTopReaderContext();
final TermContext states[] = new TermContext[terms.size()];
int i = 0;
for (Term term : terms)
states[i++] = TermContext.build(context, term, true);
stats = similarity.computeStats(searcher, query.getField(), query.getBoost(), states);
}
@Override
public Query getQuery() { return query; }
@Override
public float getValue() { return value; }
@Override
public float sumOfSquaredWeights() throws IOException {
queryWeight = idf * query.getBoost(); // compute query weight
return queryWeight * queryWeight; // square it
public float getValueForNormalization() throws IOException {
return stats.getValueForNormalization();
}
@Override
public void normalize(float queryNorm) {
this.queryNorm = queryNorm;
queryWeight *= queryNorm; // normalize query weight
value = queryWeight * idf; // idf for document
public void normalize(float queryNorm, float topLevelBoost) {
stats.normalize(queryNorm, topLevelBoost);
}
@Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
return new SpanScorer(query.getSpans(context), this, similarity, context.reader
.norms(query.getField()));
return new SpanScorer(query.getSpans(context), this, similarity, similarity.sloppyDocScorer(stats, query.getField(), context));
}
@Override
public Explanation explain(AtomicReaderContext context, int doc)
throws IOException {
public Explanation explain(AtomicReaderContext context, int doc) throws IOException {
Scorer scorer = scorer(context, ScorerContext.def());
if (scorer != null) {
int newDoc = scorer.advance(doc);
if (newDoc == doc) {
float freq = scorer.freq();
SloppyDocScorer docScorer = similarity.sloppyDocScorer(stats, query.getField(), context);
ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+") [" + similarity.getClass().getSimpleName() + "], result of:");
Explanation scoreExplanation = docScorer.explain(doc, new Explanation(freq, "phraseFreq=" + freq));
result.addDetail(scoreExplanation);
result.setValue(scoreExplanation.getValue());
result.setMatch(true);
return result;
}
}
ComplexExplanation result = new ComplexExplanation();
result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
String field = ((SpanQuery)getQuery()).getField();
Explanation idfExpl =
new Explanation(idf, "idf(" + field + ": " + idfExp.explain() + ")");
// explain query weight
Explanation queryExpl = new Explanation();
queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:");
Explanation boostExpl = new Explanation(getQuery().getBoost(), "boost");
if (getQuery().getBoost() != 1.0f)
queryExpl.addDetail(boostExpl);
queryExpl.addDetail(idfExpl);
Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm");
queryExpl.addDetail(queryNormExpl);
queryExpl.setValue(boostExpl.getValue() *
idfExpl.getValue() *
queryNormExpl.getValue());
result.addDetail(queryExpl);
// explain field weight
ComplexExplanation fieldExpl = new ComplexExplanation();
fieldExpl.setDescription("fieldWeight("+field+":"+query.toString(field)+
" in "+doc+"), product of:");
Explanation tfExpl = ((SpanScorer)scorer(context, ScorerContext.def())).explain(doc);
fieldExpl.addDetail(tfExpl);
fieldExpl.addDetail(idfExpl);
Explanation fieldNormExpl = new Explanation();
byte[] fieldNorms = context.reader.norms(field);
float fieldNorm =
fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f;
fieldNormExpl.setValue(fieldNorm);
fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")");
fieldExpl.addDetail(fieldNormExpl);
fieldExpl.setMatch(Boolean.valueOf(tfExpl.isMatch()));
fieldExpl.setValue(tfExpl.getValue() *
idfExpl.getValue() *
fieldNormExpl.getValue());
result.addDetail(fieldExpl);
result.setMatch(fieldExpl.getMatch());
// combine them
result.setValue(queryExpl.getValue() * fieldExpl.getValue());
if (queryExpl.getValue() == 1.0f)
return fieldExpl;
return result;
return new ComplexExplanation(false, 0.0f, "no matching term");
}
}


@@ -60,7 +60,7 @@ public abstract class CompoundFileDirectory extends Directory {
* NOTE: subclasses must call {@link #initForRead(Map)} before the directory can be used.
*/
public CompoundFileDirectory(Directory directory, String fileName, int readBufferSize) throws IOException {
assert !(directory instanceof CompoundFileDirectory) : "compound file inside of compound file: " + fileName;
this.directory = directory;
this.fileName = fileName;
this.readBufferSize = readBufferSize;
@@ -75,9 +75,11 @@
}
protected final void initForWrite() {
assert !(directory instanceof CompoundFileDirectory) : "compound file inside of compound file: " + fileName;
this.entries = SENTINEL;
this.openForWrite = true;
this.isOpen = true;
writer = new CompoundFileWriter(directory, fileName);
}
/** Helper method that reads CFS entries from an input stream */
@@ -173,7 +175,11 @@
@Override
public synchronized void close() throws IOException {
ensureOpen();
if (!isOpen) {
// allow double close - usually to be consistent with other closeables
assert entries == null;
return; // already closed
}
entries = null;
isOpen = false;
if (writer != null) {
@@ -269,7 +275,6 @@
@Override
public IndexOutput createOutput(String name) throws IOException {
ensureOpen();
initWriter();
return writer.createOutput(name);
}
@@ -285,12 +290,13 @@
throw new UnsupportedOperationException();
}
/** Not implemented
* @throws UnsupportedOperationException */
@Override
public final CompoundFileDirectory openCompoundInput(String name, int bufferSize) throws IOException {
// NOTE: final to make nested compounding impossible.
throw new UnsupportedOperationException();
public CompoundFileDirectory openCompoundInput(String name, int bufferSize) throws IOException {
FileEntry fileEntry = this.entries.get(IndexFileNames.stripSegmentName(name));
if (fileEntry == null) {
throw new FileNotFoundException("file " + name + " does not exist in this CFS");
}
return new NestedCompoundFileDirectory(name, bufferSize, fileEntry.offset, fileEntry.length);
}
/** Not implemented
@@ -298,16 +304,36 @@
@Override
public CompoundFileDirectory createCompoundOutput(String name)
throws IOException {
// NOTE: final to make nested compounding impossible.
throw new UnsupportedOperationException();
throw new UnsupportedOperationException("can not create nested CFS, create seperately and use Directory.copy instead");
}
private final void initWriter() {
assert openForWrite;
assert entries == SENTINEL;
if (writer == null) {
writer = new CompoundFileWriter(directory, fileName);
private class NestedCompoundFileDirectory extends CompoundFileDirectory {
private final long cfsOffset;
private final long cfsLength;
public NestedCompoundFileDirectory(String fileName, int readBufferSize, long offset, long length)
throws IOException {
super(directory, fileName, readBufferSize);
this.cfsOffset = offset;
this.cfsLength = length;
IndexInput input = null;
try {
input = CompoundFileDirectory.this.openInput(fileName, 128);
initForRead(CompoundFileDirectory.readEntries(input,
CompoundFileDirectory.this, fileName));
} finally {
IOUtils.closeSafely(false, input);
}
}
@Override
public IndexInput openInputSlice(String id, long offset, long length,
int readBufferSize) throws IOException {
assert offset + length <= cfsLength;
return CompoundFileDirectory.this.openInputSlice(id, cfsOffset + offset, length, readBufferSize);
}
}
}
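A hedged sketch of the new nested-CFS read path; the file names are illustrative, and the Directory-level openCompoundInput entry point is an assumption about the surrounding API, not shown in this hunk:

CompoundFileDirectory cfs = dir.openCompoundInput("_1.cfs", 1024);      // outer CFS (assumed entry point)
CompoundFileDirectory nested = cfs.openCompoundInput("_1_x.cfs", 1024); // slice-backed view
IndexInput in = nested.openInput("_1_x.dat");                           // reads resolve via openInputSlice
try {
  long version = in.readLong(); // illustrative read against the nested file
} finally {
  IOUtils.closeSafely(false, in, nested, cfs); // double close of a CFS is now tolerated
}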


@@ -17,6 +17,7 @@ package org.apache.lucene.store;
* limitations under the License.
*/
import java.io.Closeable;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Collection;
@@ -55,7 +56,7 @@ import org.apache.lucene.util.IOUtils;
*
* @lucene.internal
*/
final class CompoundFileWriter {
final class CompoundFileWriter implements Closeable {
private static final class FileEntry {
/** source file */
@@ -89,8 +90,8 @@ final class CompoundFileWriter {
private boolean closed = false;
private volatile IndexOutput dataOut;
private final AtomicBoolean outputTaken = new AtomicBoolean(false);
private final String entryTableName;
private final String dataFileName;
final String entryTableName;
final String dataFileName;
/**
* Create the compound stream in the specified file. The file name is the
@@ -128,17 +129,14 @@
* if close() had been called before or if no file has been added to
* this object
*/
void close() throws IOException {
public void close() throws IOException {
if (closed) {
throw new IllegalStateException("already closed");
}
IOException priorException = null;
IndexOutput entryTableOut = null;
try {
if (entries.isEmpty()) {
throw new IllegalStateException("CFS has no entries");
}
initDataOut();
if (!pendingEntries.isEmpty() || outputTaken.get()) {
throw new IllegalStateException("CFS has pending open files");
}
@@ -147,12 +145,18 @@
assert dataOut != null;
long finalLength = dataOut.getFilePointer();
assert assertFileLength(finalLength, dataOut);
} catch (IOException e) {
priorException = e;
} finally {
IOUtils.closeSafely(priorException, dataOut);
}
try {
entryTableOut = directory.createOutput(entryTableName);
writeEntryTable(entries.values(), entryTableOut);
} catch (IOException e) {
priorException = e;
} finally {
IOUtils.closeSafely(priorException, dataOut, entryTableOut);
IOUtils.closeSafely(priorException, entryTableOut);
}
}
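A minimal usage sketch of the writer around the two-phase close above (data file first, then entry table); the member file name and payload are illustrative:

CompoundFileWriter w = new CompoundFileWriter(dir, "_1.cfs");
IndexOutput out = w.createOutput("_1.frq"); // hypothetical member file
try {
  out.writeVInt(42);                        // illustrative payload
} finally {
  out.close();
}
w.close(); // flushes the data file, then writes the entry table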
@@ -321,6 +325,7 @@
closed = true;
entry.length = writtenBytes;
if (isSeparate) {
delegate.close();
// we are a separate file - push into the pending entries
pendingEntries.add(entry);
} else {


@@ -692,6 +692,39 @@ public class OpenBitSet extends DocIdSet implements Bits, Cloneable {
return -1;
}
/** Returns the index of the first set bit at or before
* the index specified.
* -1 is returned if there is no such bit.
*/
public long prevSetBit(long index) {
int i = (int) (index >> 6);
final int subIndex;
long word;
if (i >= wlen) {
i = wlen - 1;
if (i < 0) return -1;
subIndex = 63; // last possible bit
word = bits[i];
} else {
if (i < 0) return -1;
subIndex = (int)index & 0x3f; // index within the word
word = (bits[i] << (63-subIndex)); // skip all the bits to the left of index
}
if (word != 0) {
return (((long)i)<<6) + subIndex - Long.numberOfLeadingZeros(word); // See LUCENE-3197
}
while (--i >= 0) {
word = bits[i];
if (word != 0) {
return (((long)i)<<6) + 63 - Long.numberOfLeadingZeros(word);
}
}
return -1;
}
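A few assertions to pin down the new method's contract, as a sketch:

OpenBitSet bits = new OpenBitSet(128);
bits.set(10);
bits.set(70);
assert bits.prevSetBit(70) == 70;  // the start index itself counts
assert bits.prevSetBit(69) == 10;  // first set bit at or before 69
assert bits.prevSetBit(9) == -1;   // nothing set at or below 9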
@Override
public Object clone() {
try {


@@ -28,25 +28,27 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.TermsEnum.SeekStatus;
/**
* Maintains an {@link IndexReader} {@link TermState} view over
* {@link IndexReader} instances containing a single term. The
* {@link PerReaderTermState} doesn't track if the given {@link TermState}
* {@link TermContext} doesn't track if the given {@link TermState}
* objects are valid, nor whether the {@link TermState} instances refer to the
* same terms in the associated readers.
*
* @lucene.experimental
*/
public final class PerReaderTermState {
public final class TermContext {
public final ReaderContext topReaderContext; // for asserting!
private final TermState[] states;
private int docFreq;
private long totalTermFreq;
/**
* Creates an empty {@link PerReaderTermState} from a {@link ReaderContext}
* Creates an empty {@link TermContext} from a {@link ReaderContext}
*/
public PerReaderTermState(ReaderContext context) {
public TermContext(ReaderContext context) {
assert context != null && context.isTopLevel;
topReaderContext = context;
docFreq = 0;
@@ -60,28 +62,28 @@ public final class PerReaderTermState {
}
/**
* Creates a {@link PerReaderTermState} with an initial {@link TermState},
* Creates a {@link TermContext} with an initial {@link TermState},
* {@link IndexReader} pair.
*/
public PerReaderTermState(ReaderContext context, TermState state, int ord, int docFreq) {
public TermContext(ReaderContext context, TermState state, int ord, int docFreq, long totalTermFreq) {
this(context);
register(state, ord, docFreq);
register(state, ord, docFreq, totalTermFreq);
}
/**
* Creates a {@link PerReaderTermState} from a top-level {@link ReaderContext} and the
* Creates a {@link TermContext} from a top-level {@link ReaderContext} and the
* given {@link Term}. This method will lookup the given term in all context's leaf readers
* and register each of the readers containing the term in the returned {@link PerReaderTermState}
* and register each of the readers containing the term in the returned {@link TermContext}
* using the leaf reader's ordinal.
* <p>
* Note: the given context must be a top-level context.
*/
public static PerReaderTermState build(ReaderContext context, Term term, boolean cache)
public static TermContext build(ReaderContext context, Term term, boolean cache)
throws IOException {
assert context != null && context.isTopLevel;
final String field = term.field();
final BytesRef bytes = term.bytes();
final PerReaderTermState perReaderTermState = new PerReaderTermState(context);
final TermContext perReaderTermState = new TermContext(context);
final AtomicReaderContext[] leaves = ReaderUtil.leaves(context);
for (int i = 0; i < leaves.length; i++) {
final Fields fields = leaves[i].reader.fields();
@@ -91,7 +93,7 @@ public final class PerReaderTermState {
final TermsEnum termsEnum = terms.getThreadTermsEnum(); // thread-private don't share!
if (termsEnum.seekExact(bytes, cache)) {
final TermState termState = termsEnum.termState();
perReaderTermState.register(termState, leaves[i].ord, termsEnum.docFreq());
perReaderTermState.register(termState, leaves[i].ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
}
}
}
@@ -100,7 +102,7 @@
}
/**
* Clears the {@link PerReaderTermState} internal state and removes all
* Clears the {@link TermContext} internal state and removes all
* registered {@link TermState}s
*/
public void clear() {
@@ -112,12 +114,16 @@
* Registers and associates a {@link TermState} with a leaf ordinal. The leaf ordinal
* should be derived from a {@link ReaderContext}'s leaf ord.
*/
public void register(TermState state, final int ord, final int docFreq) {
public void register(TermState state, final int ord, final int docFreq, final long totalTermFreq) {
assert state != null : "state must not be null";
assert ord >= 0 && ord < states.length;
assert states[ord] == null : "state for ord: " + ord
+ " already registered";
this.docFreq += docFreq;
if (this.totalTermFreq >= 0 && totalTermFreq >= 0)
this.totalTermFreq += totalTermFreq;
else
this.totalTermFreq = -1;
states[ord] = state;
}
@@ -137,11 +143,27 @@
/**
* Returns the accumulated document frequency of all {@link TermState}
* instances passed to {@link #register(TermState, int, int)}.
* instances passed to {@link #register(TermState, int, int, long)}.
* @return the accumulated document frequency of all {@link TermState}
* instances passed to {@link #register(TermState, int, int)}.
* instances passed to {@link #register(TermState, int, int, long)}.
*/
public int docFreq() {
return docFreq;
}
/**
* Returns the accumulated term frequency of all {@link TermState}
* instances passed to {@link #register(TermState, int, int, long)}.
* @return the accumulated term frequency of all {@link TermState}
* instances passed to {@link #register(TermState, int, int, long)}.
*/
public long totalTermFreq() {
return totalTermFreq;
}
/** Expert: only available for queries that want to lie about docFreq
* @lucene.internal */
public void setDocFreq(int docFreq) {
this.docFreq = docFreq;
}
}
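A hedged sketch of the accumulation semantics of register(); the TermState instances, ordinals, and a topContext with at least three leaves are assumed placeholders:

TermContext ctx = new TermContext(topContext);
ctx.register(stateA, 0, 5, 9L);    // leaf 0: docFreq=5, totalTermFreq=9
ctx.register(stateB, 2, 3, -1L);   // leaf 2 cannot report totalTermFreq
assert ctx.docFreq() == 8;         // docFreqs always accumulate
assert ctx.totalTermFreq() == -1;  // a single -1 makes the total unknown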


@@ -32,6 +32,7 @@ import org.apache.lucene.index.codecs.sep.IntIndexInput;
import org.apache.lucene.index.codecs.sep.IntIndexOutput;
import org.apache.lucene.index.codecs.sep.SepPostingsReaderImpl;
import org.apache.lucene.index.codecs.sep.SepPostingsWriterImpl;
import org.apache.lucene.index.codecs.standard.StandardCodec;
import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexInput;
import org.apache.lucene.index.codecs.intblock.FixedIntBlockIndexOutput;
import org.apache.lucene.index.codecs.DefaultDocValuesProducer;
@@ -46,7 +47,6 @@ import org.apache.lucene.index.codecs.BlockTermsReader;
import org.apache.lucene.index.codecs.BlockTermsWriter;
import org.apache.lucene.index.codecs.TermsIndexReaderBase;
import org.apache.lucene.index.codecs.TermsIndexWriterBase;
import org.apache.lucene.index.codecs.standard.StandardCodec;
import org.apache.lucene.store.*;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
@@ -62,8 +62,8 @@ public class MockFixedIntBlockCodec extends Codec {
private final int blockSize;
public MockFixedIntBlockCodec(int blockSize) {
super("MockFixedIntBlock");
this.blockSize = blockSize;
name = "MockFixedIntBlock";
}
@Override
@@ -206,7 +206,7 @@
SepPostingsReaderImpl.files(segmentInfo, codecId, files);
BlockTermsReader.files(dir, segmentInfo, codecId, files);
FixedGapTermsIndexReader.files(dir, segmentInfo, codecId, files);
DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files);
DefaultDocValuesConsumer.files(dir, segmentInfo, codecId, files, getDocValuesUseCFS());
}
@Override
@@ -214,16 +214,16 @@
SepPostingsWriterImpl.getExtensions(extensions);
BlockTermsReader.getExtensions(extensions);
FixedGapTermsIndexReader.getIndexExtensions(extensions);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions);
DefaultDocValuesConsumer.getDocValuesExtensions(extensions, getDocValuesUseCFS());
}
@Override
public PerDocConsumer docsConsumer(PerDocWriteState state) throws IOException {
return new DefaultDocValuesConsumer(state, BytesRef.getUTF8SortedAsUnicodeComparator());
return new DefaultDocValuesConsumer(state, getDocValuesSortComparator(), getDocValuesUseCFS());
}
@Override
public PerDocValues docsProducer(SegmentReadState state) throws IOException {
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId);
return new DefaultDocValuesProducer(state.segmentInfo, state.dir, state.fieldInfos, state.codecId, getDocValuesUseCFS(), getDocValuesSortComparator());
}
}
