LUCENE-3079: faceting module (port to trunk)

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1141246 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-06-29 21:02:07 +00:00
parent 4401ef4dae
commit 4291f52c70
263 changed files with 40433 additions and 1 deletions

View File

@ -42,6 +42,9 @@
<classpathentry kind="src" path="modules/benchmark/src/test"/>
<classpathentry kind="src" path="modules/common/src/java"/>
<classpathentry kind="src" path="modules/common/src/test"/>
<classpathentry kind="src" path="modules/facet/src/java"/>
<classpathentry kind="src" path="modules/facet/src/examples"/>
<classpathentry kind="src" path="modules/facet/src/test"/>
<classpathentry kind="src" path="modules/grouping/src/java"/>
<classpathentry kind="src" path="modules/grouping/src/test"/>
<classpathentry kind="src" path="modules/queries/src/java"/>

View File

@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- IntelliJ IDEA module definition: production sources live under src/java,
     tests under src/test; compiled output goes to build/classes/java and
     build/classes/test; the "work" folder is excluded from indexing.
     The module depends on the project JDK, the project-level JUnit library
     (test scope only), and the "lucene" module. -->
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/build/classes/java" />
<output-test url="file://$MODULE_DIR$/build/classes/test" />
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
<excludeFolder url="file://$MODULE_DIR$/work" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" scope="TEST" name="JUnit" level="project" />
<orderEntry type="module" module-name="lucene" />
</component>
</module>

View File

@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- IntelliJ IDEA module definition: production sources live under src/java,
     tests under src/test; compiled output goes to build/classes/java and
     build/classes/test; the "work" folder is excluded from indexing.
     The module depends on the project JDK, the project-level JUnit library
     (test scope only), and the "lucene" module. -->
<module type="JAVA_MODULE" version="4">
<component name="NewModuleRootManager" inherit-compiler-output="false">
<output url="file://$MODULE_DIR$/build/classes/java" />
<output-test url="file://$MODULE_DIR$/build/classes/test" />
<exclude-output />
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$/src/java" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/src/test" isTestSource="true" />
<excludeFolder url="file://$MODULE_DIR$/work" />
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" scope="TEST" name="JUnit" level="project" />
<orderEntry type="module" module-name="lucene" />
</component>
</module>

View File

@ -0,0 +1,73 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-parent</artifactId>
<!-- @version@ is a placeholder token, presumably substituted by the
     release build before this POM is deployed - TODO confirm -->
<version>@version@</version>
<relativePath>../../pom.xml</relativePath>
</parent>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-facet</artifactId>
<packaging>jar</packaging>
<name>Lucene Facets</name>
<description>
Package for Faceted Indexing and Search
</description>
<!-- Location of this module within the source tree, and where its build
     artifacts are written (mirrors the Ant build's layout). -->
<properties>
<module-directory>modules/facet</module-directory>
<build-directory>build</build-directory>
</properties>
<dependencies>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>lucene-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>lucene-test-framework</artifactId>
<version>${project.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<!-- Non-standard source layout: sources under src/java, tests under
     src/test, output under build/ - kept consistent with the Ant build. -->
<build>
<directory>${build-directory}</directory>
<outputDirectory>${build-directory}/classes/java</outputDirectory>
<testOutputDirectory>${build-directory}/classes/test</testOutputDirectory>
<sourceDirectory>src/java</sourceDirectory>
<testSourceDirectory>src/test</testSourceDirectory>
<testResources>
<testResource>
<!-- Copy any non-.java files living under src/test (e.g. test data)
     onto the test classpath. -->
<directory>${project.build.testSourceDirectory}</directory>
<excludes>
<exclude>**/*.java</exclude>
</excludes>
</testResource>
</testResources>
</build>
</project>

View File

@ -65,7 +65,13 @@ New Features
* LUCENE-3234: provide a limit on phrase analysis in FastVectorHighlighter for
highlighting speed up. Use FastVectorHighlighter.setPhraseLimit() to set limit
(e.g. 5000). (Mike Sokolov via Koji Sekiguchi)
* LUCENE-3079: a new facet module which provides faceted indexing & search
capabilities. It allows managing a taxonomy of categories, and index them
with documents. It also provides search API for aggregating (e.g. count)
the weights of the categories that are relevant to the search results.
(Shai Erera)
* LUCENE-3171: Added BlockJoinQuery and BlockJoinCollector, under the
new contrib/join module, to enable searches that require joining
between parent and child documents. Joined (children + parent)

View File

@ -18,12 +18,14 @@
-->
<project name="modules" default="test" basedir=".">
<!-- TODO: at some point we should probably iterate like contrib-crawl -->
<target name="test" description="Test all modules">
<sequential>
<subant target="test" inheritall="false" failonerror="true">
<fileset dir="analysis" includes="build.xml" />
<fileset dir="benchmark" includes="build.xml" />
<fileset dir="common" includes="build.xml" />
<fileset dir="facet" includes="build.xml" />
<fileset dir="grouping" includes="build.xml" />
<fileset dir="queries" includes="build.xml" />
<fileset dir="join" includes="build.xml" />
@ -38,6 +40,7 @@
<fileset dir="analysis" includes="build.xml" />
<fileset dir="benchmark" includes="build.xml" />
<fileset dir="common" includes="build.xml" />
<fileset dir="facet" includes="build.xml" />
<fileset dir="grouping" includes="build.xml" />
<fileset dir="queries" includes="build.xml" />
<fileset dir="join" includes="build.xml" />
@ -52,6 +55,7 @@
<fileset dir="analysis" includes="build.xml" />
<fileset dir="benchmark" includes="build.xml" />
<fileset dir="common" includes="build.xml" />
<fileset dir="facet" includes="build.xml" />
<fileset dir="grouping" includes="build.xml" />
<fileset dir="queries" includes="build.xml" />
<fileset dir="join" includes="build.xml" />
@ -66,6 +70,7 @@
<fileset dir="analysis" includes="build.xml" />
<fileset dir="benchmark" includes="build.xml" />
<fileset dir="common" includes="build.xml" />
<fileset dir="facet" includes="build.xml" />
<fileset dir="grouping" includes="build.xml" />
<fileset dir="queries" includes="build.xml" />
<fileset dir="join" includes="build.xml" />
@ -81,6 +86,7 @@
<fileset dir="analysis" includes="build.xml" />
<fileset dir="benchmark" includes="build.xml" />
<fileset dir="common" includes="build.xml" />
<fileset dir="facet" includes="build.xml" />
<fileset dir="grouping" includes="build.xml" />
<fileset dir="queries" includes="build.xml" />
<fileset dir="join" includes="build.xml" />
@ -94,6 +100,7 @@
<fileset dir="analysis" includes="build.xml" />
<fileset dir="benchmark" includes="build.xml" />
<fileset dir="common" includes="build.xml" />
<fileset dir="facet" includes="build.xml" />
<fileset dir="grouping" includes="build.xml" />
<fileset dir="queries" includes="build.xml" />
<fileset dir="join" includes="build.xml" />
@ -109,6 +116,7 @@
<fileset dir="analysis" includes="build.xml" />
<fileset dir="benchmark" includes="build.xml" />
<fileset dir="common" includes="build.xml" />
<fileset dir="facet" includes="build.xml" />
<fileset dir="grouping" includes="build.xml" />
<fileset dir="queries" includes="build.xml" />
<fileset dir="join" includes="build.xml" />

202
modules/facet/LICENSE.txt Normal file
View File

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

5
modules/facet/NOTICE.txt Normal file
View File

@ -0,0 +1,5 @@
Apache Lucene Facets
Copyright 2011 The Apache Software Foundation
This product includes software developed by
The Apache Software Foundation (http://www.apache.org/).

75
modules/facet/build.xml Normal file
View File

@ -0,0 +1,75 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="facet" default="default">

  <description>
    Faceted search module
  </description>

  <!-- build.dir must be defined before the import so that contrib-build.xml
       sees this module's value; Ant properties are immutable, so a second
       declaration after the import would be a no-op and has been removed. -->
  <property name="build.dir" location="build/" />

  <import file="../../lucene/contrib/contrib-build.xml"/>

  <property name="dist.dir" location="dist/" />

  <!-- TODO, cut over tests to MockAnalyzer etc and nuke this dependency -->
  <module-uptodate name="analysis/common" jarfile="${common.dir}/../modules/analysis/build/common/lucene-analyzers-common-${version}.jar"
    property="analyzers-common.uptodate" classpath.property="analyzers-common.jar"/>

  <!-- classpath for compiling the examples: core classpath + compiled
       facet classes + analyzers-common jar -->
  <path id="examples.classpath">
    <path refid="classpath" />
    <pathelement location="${build.dir}/classes/java" />
    <pathelement path="${analyzers-common.jar}" />
  </path>

  <!-- tests also run against the compiled examples -->
  <path id="test.classpath">
    <path refid="test.base.classpath" />
    <pathelement location="${build.dir}/classes/examples" />
    <pathelement path="${analyzers-common.jar}" />
  </path>

  <path id="classpath">
    <pathelement path="${analyzers-common.jar}" />
    <path refid="base.classpath"/>
  </path>

  <target name="compile-examples" description="Compiles Facets examples">
    <compile srcdir="src/examples" destdir="${build.dir}/classes/examples">
      <classpath refid="examples.classpath" />
    </compile>
  </target>

  <target name="jar-examples" depends="compile-examples">
    <jarify basedir="${build.dir}/classes/examples"
            destfile="${build.dir}/${final.name}-examples.jar"
            title="Lucene Search Engine: ${ant.project.name}-examples" />
  </target>

  <!-- build analyzers-common only when its jar is out of date -->
  <target name="jar-analyzers-common" unless="analyzers-common.uptodate">
    <subant target="jar-core">
      <fileset dir="${common.dir}/../modules/analysis/common" includes="build.xml"/>
    </subant>
  </target>

  <target name="compile-core" depends="jar-analyzers-common,common.compile-core,compile-examples" description="Compiles facet classes" />
  <target name="jar-core" depends="common.jar-core,jar-examples" />
</project>

View File

@ -0,0 +1,49 @@
package org.apache.lucene.facet.example;
import java.util.List;
import org.apache.lucene.facet.search.results.FacetResult;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Holds the outcome of running an example program.
 * A generic container that lets a test execute an example
 * and then inspect the facet results it produced.
 *
 * @lucene.experimental
 */
public class ExampleResult {

  // facet results produced by the example run
  private List<FacetResult> results;

  /**
   * @return the facet results
   */
  public List<FacetResult> getFacetResults() {
    return results;
  }

  /**
   * @param facetResults the facet results to set
   */
  public void setFacetResults(List<FacetResult> facetResults) {
    results = facetResults;
  }
}

View File

@ -0,0 +1,38 @@
package org.apache.lucene.facet.example;
import org.apache.lucene.util.Version;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Shared constants and helpers for the example programs.
 *
 * @lucene.experimental
 */
public class ExampleUtils {

  /** True when the "tests.verbose" system property is set; enables {@link #log}. */
  public static final boolean VERBOSE = Boolean.getBoolean("tests.verbose");

  /** The Lucene {@link Version} used by the example code. */
  public static final Version EXAMPLE_VER = Version.LUCENE_31;

  /** Print {@code msg} to stdout, but only when {@link #VERBOSE} is enabled. */
  public static void log(Object msg) {
    if (!VERBOSE) {
      return;
    }
    System.out.println(msg.toString());
  }
}

View File

@ -0,0 +1,67 @@
package org.apache.lucene.facet.example.adaptive;
import java.util.List;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.facet.example.ExampleResult;
import org.apache.lucene.facet.example.ExampleUtils;
import org.apache.lucene.facet.example.simple.SimpleIndexer;
import org.apache.lucene.facet.example.simple.SimpleSearcher;
import org.apache.lucene.facet.search.AdaptiveFacetsAccumulator;
import org.apache.lucene.facet.search.results.FacetResult;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Driver for the adaptive sample, using the {@link AdaptiveFacetsAccumulator}.
 * Indexing is the same as in {@link SimpleSearcher}
 *
 * @lucene.experimental
 */
public class AdaptiveMain {

  /**
   * Driver for the adaptive sample.
   * @throws Exception on error (no detailed exception handling here for sample simplicity)
   */
  public static void main(String[] args) throws Exception {
    new AdaptiveMain().runSample();
    ExampleUtils.log("DONE");
  }

  public ExampleResult runSample() throws Exception {
    // in-memory Directories backing the search index and the taxonomy index
    Directory indexDir = new RAMDirectory();
    Directory taxoDir = new RAMDirectory();

    // index the sample documents
    ExampleUtils.log("index the adaptive sample documents...");
    SimpleIndexer.index(indexDir, taxoDir);

    // run the faceted search over them
    ExampleUtils.log("search the adaptive sample documents...");
    List<FacetResult> searchResults = AdaptiveSearcher.searchWithFacets(indexDir, taxoDir);

    // package the facet results for the caller (e.g. a test)
    ExampleResult result = new ExampleResult();
    result.setFacetResults(searchResults);
    return result;
  }
}

View File

@ -0,0 +1,103 @@
package org.apache.lucene.facet.example.adaptive;
import java.util.List;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.facet.example.ExampleUtils;
import org.apache.lucene.facet.example.simple.SimpleUtils;
import org.apache.lucene.facet.search.AdaptiveFacetsAccumulator;
import org.apache.lucene.facet.search.ScoredDocIdCollector;
import org.apache.lucene.facet.search.params.CountFacetRequest;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Search with facets through the {@link AdaptiveFacetsAccumulator}
 *
 * @lucene.experimental
 */
public class AdaptiveSearcher {

  /**
   * Search with facets through the {@link AdaptiveFacetsAccumulator}
   * @param indexDir Directory of the search index.
   * @param taxoDir Directory of the taxonomy index.
   * @throws Exception on error (no detailed exception handling here for sample simplicity)
   * @return facet results
   */
  public static List<FacetResult> searchWithFacets (Directory indexDir, Directory taxoDir) throws Exception {
    // open the taxonomy and the search index, and a searcher over the latter
    TaxonomyReader taxoReader = new LuceneTaxonomyReader(taxoDir);
    IndexReader reader = IndexReader.open(indexDir);
    IndexSearcher searcher = new IndexSearcher(reader);

    // faceted search works in two phases:
    //   1. collect the documents matching the query
    //   2. aggregate facets for the collected documents and build the
    //      requested facet results from those aggregates

    // phase 1: run the query into two collectors at once
    Query q = new TermQuery(new Term(SimpleUtils.TEXT,"white"));
    ExampleUtils.log("Query: "+q);

    // regular collector, scores/ranks the matching documents
    TopScoreDocCollector topDocsCollector = TopScoreDocCollector.create(10, true);
    // doc-id collector guiding facet accumulation (scoring disabled)
    ScoredDocIdCollector docIdsCollector = ScoredDocIdCollector.create(reader.maxDoc(), false);

    // the facet requests state which facets we are interested in
    FacetSearchParams facetSearchParams = new FacetSearchParams();
    facetSearchParams.addFacetRequest(new CountFacetRequest(new CategoryPath("root","a"), 10));

    // note: if only facet accumulation is required, the topDocsCollector
    // part could be dropped entirely
    searcher.search(q, MultiCollector.wrap(topDocsCollector, docIdsCollector));

    // phase 2: accumulate the facets and print the results
    AdaptiveFacetsAccumulator accumulator = new AdaptiveFacetsAccumulator(facetSearchParams, reader, taxoReader);
    List<FacetResult> res = accumulator.accumulate(docIdsCollector.getScoredDocIDs());
    for (int i = 0; i < res.size(); i++) {
      ExampleUtils.log("Res "+i+": "+res.get(i));
    }

    // we're done, release the index reader and the taxonomy
    reader.close();
    taxoReader.close();
    return res;
  }
}

View File

@ -0,0 +1,132 @@
package org.apache.lucene.facet.example.association;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.facet.enhancements.EnhancementsDocumentBuilder;
import org.apache.lucene.facet.enhancements.association.AssociationProperty;
import org.apache.lucene.facet.example.ExampleUtils;
import org.apache.lucene.facet.example.simple.SimpleUtils;
import org.apache.lucene.facet.index.CategoryContainer;
import org.apache.lucene.facet.index.CategoryDocumentBuilder;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Sample indexer creates an index, and adds to it sample documents with
* categories, which can be simple or contain associations.
*
* @lucene.experimental
*/
public class AssociationIndexer {

  /**
   * Creates an index and adds to it the sample documents, attaching to each
   * document both plain categories and categories carrying associations
   * (int occurrence counts and float confidence levels).
   *
   * @param indexDir
   *          Directory in which the index should be created.
   * @param taxoDir
   *          Directory in which the taxonomy index should be created.
   * @throws Exception
   *           on error (no detailed exception handling here for sample
   *           simplicity).
   */
  public static void index(Directory indexDir, Directory taxoDir) throws Exception {

    // create and open an index writer
    IndexWriter iw = new IndexWriter(indexDir, new IndexWriterConfig(ExampleUtils.EXAMPLE_VER, SimpleUtils.analyzer));
    // create and open a taxonomy writer
    TaxonomyWriter taxo = new LuceneTaxonomyWriter(taxoDir, OpenMode.CREATE);

    int nDocsAdded = 0;
    int nFacetsAdded = 0;
    try {
      // loop over sample documents
      for (int docNum = 0; docNum < SimpleUtils.docTexts.length; docNum++) {
        ExampleUtils.log(" ++++ DOC ID: " + docNum);

        // obtain the sample categories for current document
        CategoryContainer categoryContainer = new CategoryContainer();
        for (CategoryPath path : SimpleUtils.categories[docNum]) {
          categoryContainer.addCategory(path);
          ExampleUtils.log("\t ++++ PATH: " + path);
        }
        // and also those with associations
        CategoryPath[] associationsPaths = AssociationUtils.categories[docNum];
        AssociationProperty[] associationProps = AssociationUtils.associations[docNum];
        for (int i = 0; i < associationsPaths.length; i++) {
          categoryContainer.addCategory(associationsPaths[i], associationProps[i]);
          ExampleUtils.log("\t $$$$ Association: ("
              + associationsPaths[i] + "," + associationProps[i]
              + ")");
        }

        // we do not alter indexing parameters!
        // a category document builder will add the categories to a document
        // once build() is called
        CategoryDocumentBuilder categoryDocBuilder = new EnhancementsDocumentBuilder(
            taxo, AssociationUtils.assocIndexingParams);
        categoryDocBuilder.setCategories(categoryContainer);

        // create a plain Lucene document and add some regular Lucene fields
        // to it
        Document doc = new Document();
        doc.add(new Field(SimpleUtils.TITLE, SimpleUtils.docTitles[docNum],
            Store.YES, Index.ANALYZED));
        doc.add(new Field(SimpleUtils.TEXT, SimpleUtils.docTexts[docNum],
            Store.NO, Index.ANALYZED));

        // invoke the category document builder for adding categories to the
        // document and, as required, to the taxonomy index
        categoryDocBuilder.build(doc);

        // finally add the document to the index
        iw.addDocument(doc);

        nDocsAdded++;
        nFacetsAdded += categoryContainer.size();
      }

      // commit changes.
      // we commit changes to the taxonomy index prior to committing them to
      // the search index. this is important, so that all facets referred to
      // by documents in the search index will indeed exist in the taxonomy
      // index.
      taxo.commit();
      iw.commit();
    } finally {
      // FIX: close both writers even if indexing throws - previously an
      // exception anywhere above leaked the writers (and their index locks).
      // close the taxonomy index before the search index, mirroring the
      // commit order above.
      taxo.close();
      iw.close();
    }

    ExampleUtils.log("Indexed " + nDocsAdded + " documents with overall "
        + nFacetsAdded + " facets.");
  }
}

View File

@ -0,0 +1,82 @@
package org.apache.lucene.facet.example.association;
import java.util.List;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.facet.example.ExampleResult;
import org.apache.lucene.facet.example.ExampleUtils;
import org.apache.lucene.facet.search.results.FacetResult;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Driver for the association samples.
*
* @lucene.experimental
*/
public class AssociationMain {

  /**
   * Runs both association samples (int-sum and float-sum) one after another.
   * @throws Exception on error (no detailed exception handling here for sample simplicity)
   */
  public static void main(String[] args) throws Exception {
    new AssociationMain().runSumIntAssociationSample();
    new AssociationMain().runSumFloatAssociationSample();
    ExampleUtils.log("DONE");
  }

  /** Index the sample documents and facet-search them, summing int associations. */
  public ExampleResult runSumIntAssociationSample() throws Exception {
    return runSample(true);
  }

  /** Index the sample documents and facet-search them, summing float associations. */
  public ExampleResult runSumFloatAssociationSample() throws Exception {
    return runSample(false);
  }

  /**
   * Shared driver: indexes the sample documents into fresh in-memory
   * directories and then runs the requested association search over them.
   *
   * @param sumInt true for the int-sum search, false for the float-sum search
   */
  private ExampleResult runSample(boolean sumInt) throws Exception {
    // create Directories for the search index and for the taxonomy index
    Directory indexDir = new RAMDirectory();
    Directory taxoDir = new RAMDirectory();

    // index the sample documents
    ExampleUtils.log("index the sample documents...");
    AssociationIndexer.index(indexDir, taxoDir);

    ExampleUtils.log("search the sample documents...");
    List<FacetResult> facetRes = sumInt
        ? AssociationSearcher.searchSumIntAssociation(indexDir, taxoDir)
        : AssociationSearcher.searchSumFloatAssociation(indexDir, taxoDir);

    ExampleResult result = new ExampleResult();
    result.setFacetResults(facetRes);
    return result;
  }
}

View File

@ -0,0 +1,81 @@
package org.apache.lucene.facet.example.association;
import java.util.List;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.facet.example.simple.SimpleSearcher;
import org.apache.lucene.facet.search.params.association.AssociationFloatSumFacetRequest;
import org.apache.lucene.facet.search.params.association.AssociationIntSumFacetRequest;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* AssociationSearcher searches index with facets, evaluating the facets with
 * their associated int or float values.
*
* @lucene.experimental
*/
public class AssociationSearcher {

  /**
   * Searches the index, ranking the children of the "tags" dimension by the
   * sum of their int associations.
   */
  public static List<FacetResult> searchSumIntAssociation(Directory indexDir,
      Directory taxoDir) throws Exception {
    // open the search index and the matching taxonomy index
    IndexReader reader = IndexReader.open(indexDir);
    TaxonomyReader taxoReader = new LuceneTaxonomyReader(taxoDir);

    // request the top-10 "tags" children, aggregated by int-association sum
    AssociationIntSumFacetRequest request = new AssociationIntSumFacetRequest(
        new CategoryPath("tags"), 10);
    List<FacetResult> results = SimpleSearcher.searchWithRequest(reader, taxoReader,
        AssociationUtils.assocIndexingParams, request);

    // release both readers before returning
    taxoReader.close();
    reader.close();
    return results;
  }

  /**
   * Searches the index, ranking the children of the "genre" dimension by the
   * sum of their float associations.
   */
  public static List<FacetResult> searchSumFloatAssociation(Directory indexDir,
      Directory taxoDir) throws Exception {
    // open the search index and the matching taxonomy index
    IndexReader reader = IndexReader.open(indexDir);
    TaxonomyReader taxoReader = new LuceneTaxonomyReader(taxoDir);

    // request the top-10 "genre" children, aggregated by float-association sum
    AssociationFloatSumFacetRequest request = new AssociationFloatSumFacetRequest(
        new CategoryPath("genre"), 10);
    List<FacetResult> results = SimpleSearcher.searchWithRequest(reader, taxoReader,
        AssociationUtils.assocIndexingParams, request);

    // release both readers before returning
    taxoReader.close();
    reader.close();
    return results;
  }
}

View File

@ -0,0 +1,79 @@
package org.apache.lucene.facet.example.association;
import org.apache.lucene.facet.enhancements.association.AssociationEnhancement;
import org.apache.lucene.facet.enhancements.association.AssociationFloatProperty;
import org.apache.lucene.facet.enhancements.association.AssociationIntProperty;
import org.apache.lucene.facet.enhancements.association.AssociationProperty;
import org.apache.lucene.facet.enhancements.params.DefaultEnhancementsIndexingParams;
import org.apache.lucene.facet.taxonomy.CategoryPath;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* @lucene.experimental
*/
public class AssociationUtils {

  /**
   * Categories: categories[D][N] == category-path with association no. N for
   * document no. D.
   * <p>
   * Note: the entries here are index-aligned with {@link #associations} -
   * categories[D][N] is the path that associations[D][N] belongs to.
   */
  public static CategoryPath[][] categories = {
    // Doc #1
    { new CategoryPath("tags", "lucene") ,
      new CategoryPath("genre", "computing")
    },
    // Doc #2
    { new CategoryPath("tags", "lucene"),
      new CategoryPath("tags", "solr"),
      new CategoryPath("genre", "computing"),
      new CategoryPath("genre", "software")
    }
  };

  /**
   * Associations: associations[D][N] == the association value attached to
   * categories[D][N]. "tags" entries carry int occurrence counts, "genre"
   * entries carry float confidence levels.
   */
  public static AssociationProperty[][] associations = {
    // Doc #1 associations
    {
      /* 3 occurrences for tag 'lucene' */
      new AssociationIntProperty(3),
      /* 87% confidence level of genre 'computing' */
      new AssociationFloatProperty(0.87f)
    },
    // Doc #2 associations
    {
      /* 1 occurrence for tag 'lucene' */
      new AssociationIntProperty(1),
      /* 2 occurrences for tag 'solr' */
      new AssociationIntProperty(2),
      /* 75% confidence level of genre 'computing' */
      new AssociationFloatProperty(0.75f),
      /* 34% confidence level of genre 'software' */
      new AssociationFloatProperty(0.34f),
    }
  };

  /**
   * Indexing Params: the indexing params to use when dealing with
   * associations. Shared by the indexer and the searchers so that both sides
   * agree on how association payloads are encoded.
   */
  public static final DefaultEnhancementsIndexingParams assocIndexingParams =
    new DefaultEnhancementsIndexingParams(new AssociationEnhancement());
}

View File

@ -0,0 +1,102 @@
package org.apache.lucene.facet.example.merge;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.PayloadProcessorProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.facet.example.ExampleUtils;
import org.apache.lucene.facet.index.FacetsPayloadProcessorProvider;
import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter.DiskOrdinalMap;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter.MemoryOrdinalMap;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter.OrdinalMap;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* @lucene.experimental
*/
public class TaxonomyMergeUtils {

  /**
   * Merges the given taxonomy and index directories. Note that this method
   * opens {@link LuceneTaxonomyWriter} and {@link IndexWriter} on the
   * respective destination indexes. Therefore if you have a writer open on any
   * of them, it should be closed, or you should use
   * {@link #merge(Directory, Directory, IndexWriter, LuceneTaxonomyWriter)}
   * instead.
   *
   * @see #merge(Directory, Directory, IndexWriter, LuceneTaxonomyWriter)
   */
  public static void merge(Directory srcIndexDir, Directory srcTaxDir,
      Directory destIndexDir, Directory destTaxDir) throws IOException {
    // open destination writers; the analyzer is null since documents are
    // only copied via addIndexes and never re-tokenized here
    IndexWriter destIndexWriter = new IndexWriter(destIndexDir,
        new IndexWriterConfig(ExampleUtils.EXAMPLE_VER, null));
    LuceneTaxonomyWriter destTaxWriter = new LuceneTaxonomyWriter(destTaxDir);
    merge(srcIndexDir, srcTaxDir, new MemoryOrdinalMap(), destIndexWriter, destTaxWriter);
    destTaxWriter.close();
    destIndexWriter.close();
  }

  /**
   * Merges the given taxonomy and index directories and commits the changes to
   * the given writers. This method uses {@link MemoryOrdinalMap} to store the
   * mapped ordinals. If you cannot afford the memory, you can use
   * {@link #merge(Directory, Directory, OrdinalMap, IndexWriter, LuceneTaxonomyWriter)}
   * by passing {@link DiskOrdinalMap}.
   *
   * @see #merge(Directory, Directory, OrdinalMap, IndexWriter, LuceneTaxonomyWriter)
   */
  public static void merge(Directory srcIndexDir, Directory srcTaxDir,
      IndexWriter destIndexWriter,
      LuceneTaxonomyWriter destTaxWriter) throws IOException {
    merge(srcIndexDir, srcTaxDir, new MemoryOrdinalMap(), destIndexWriter, destTaxWriter);
  }

  /**
   * Merges the given taxonomy and index directories and commits the changes to
   * the given writers.
   * <p>
   * The statement order below is significant: the taxonomy is merged first so
   * that 'map' is populated before the index documents are copied, and the
   * taxonomy is committed before the index so that every category referenced
   * by committed documents exists in the committed taxonomy.
   */
  public static void merge(Directory srcIndexDir, Directory srcTaxDir,
      OrdinalMap map, IndexWriter destIndexWriter,
      LuceneTaxonomyWriter destTaxWriter) throws IOException {
    // merge the taxonomies; 'map' is filled with the source-to-destination
    // ordinal mapping as a side effect
    destTaxWriter.addTaxonomies(new Directory[] { srcTaxDir }, new OrdinalMap[] { map });
    // install a payload processor built from the ordinal map so category
    // payloads copied from srcIndexDir are rewritten during addIndexes.
    // NOTE(review): this passes DefaultFacetIndexingParams - presumably the
    // source index must have been built with the default params; confirm.
    PayloadProcessorProvider payloadProcessor = new FacetsPayloadProcessorProvider(
        srcIndexDir, map.getMap(), new DefaultFacetIndexingParams());
    destIndexWriter.setPayloadProcessorProvider(payloadProcessor);
    IndexReader reader = IndexReader.open(srcIndexDir);
    try {
      destIndexWriter.addIndexes(reader);
      // commit changes to taxonomy and index respectively.
      destTaxWriter.commit();
      destIndexWriter.commit();
    } finally {
      // always release the source reader, even if addIndexes/commit fail
      reader.close();
    }
  }
}

View File

@ -0,0 +1,209 @@
package org.apache.lucene.facet.example.multiCL;
import java.util.List;
import java.util.Random;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.DocumentBuilder;
import org.apache.lucene.facet.example.ExampleUtils;
import org.apache.lucene.facet.example.simple.SimpleUtils;
import org.apache.lucene.facet.index.CategoryDocumentBuilder;
import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.index.params.PerDimensionIndexingParams;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Sample indexer creates an index, and adds to it sample documents and facets
 * with multiple category lists, so that different facet dimensions are
 * stored in different category lists.
*
* @lucene.experimental
*/
public class MultiCLIndexer {
// Number of documents to index
public static int NUM_DOCS = 100;
// Number of facets to add per document
public static int NUM_FACETS_PER_DOC = 10;
// Number of tokens in title
public static int TITLE_LENGTH = 5;
// Number of tokens in text
public static int TEXT_LENGTH = 100;
// Lorum ipsum to use as content - this will be tokenized and used for document
// titles/text.
static String words = "Sed ut perspiciatis unde omnis iste natus error sit "
+ "voluptatem accusantium doloremque laudantium totam rem aperiam "
+ "eaque ipsa quae ab illo inventore veritatis et quasi architecto "
+ "beatae vitae dicta sunt explicabo Nemo enim ipsam voluptatem "
+ "quia voluptas sit aspernatur aut odit aut fugit sed quia consequuntur "
+ "magni dolores eos qui ratione voluptatem sequi nesciunt Neque porro "
+ "quisquam est qui dolorem ipsum quia dolor sit amet consectetur adipisci velit "
+ "sed quia non numquam eius modi tempora incidunt ut labore et dolore "
+ "magnam aliquam quaerat voluptatem Ut enim ad minima veniam "
+ "quis nostrum exercitationem ullam corporis suscipit laboriosam "
+ "nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure"
+ "reprehenderit qui in ea voluptate velit esse quam nihil molestiae "
+ "consequatur vel illum qui dolorem eum fugiat quo voluptas nulla pariatur";
// PerDimensionIndexingParams for multiple category lists
public static PerDimensionIndexingParams MULTI_IPARAMS = new PerDimensionIndexingParams();
// Initialize PerDimensionIndexingParams
static {
MULTI_IPARAMS.addCategoryListParams(new CategoryPath("0"),
new CategoryListParams(new Term("$Digits", "Zero")));
MULTI_IPARAMS.addCategoryListParams(new CategoryPath("1"),
new CategoryListParams(new Term("$Digits", "One")));
MULTI_IPARAMS.addCategoryListParams(new CategoryPath("2"),
new CategoryListParams(new Term("$Digits", "Two")));
MULTI_IPARAMS.addCategoryListParams(new CategoryPath("3"),
new CategoryListParams(new Term("$Digits", "Three")));
MULTI_IPARAMS.addCategoryListParams(new CategoryPath("4"),
new CategoryListParams(new Term("$Digits", "Four")));
MULTI_IPARAMS.addCategoryListParams(new CategoryPath("5"),
new CategoryListParams(new Term("$Digits", "Five")));
}
/**
* Create an index, and adds to it sample documents and facets.
* @param indexDir Directory in which the index should be created.
* @param taxoDir Directory in which the taxonomy index should be created.
* @throws Exception on error (no detailed exception handling here for sample simplicity
*/
public static void index(Directory indexDir, Directory taxoDir)
throws Exception {
Random random = new Random(2003);
String[] docTitles = new String[NUM_DOCS];
String[] docTexts = new String[NUM_DOCS];
CategoryPath[][] cPaths = new CategoryPath[NUM_DOCS][NUM_FACETS_PER_DOC];
String[] tokens = words.split(" ");
for (int docNum = 0; docNum < NUM_DOCS; docNum++) {
String title = "";
String text = "";
for (int j = 0; j < TITLE_LENGTH; j++) {
title = title + tokens[random.nextInt(tokens.length)] + " ";
}
docTitles[docNum] = title;
for (int j = 0; j < TEXT_LENGTH; j++) {
text = text + tokens[random.nextInt(tokens.length)] + " ";
}
docTexts[docNum] = text;
for (int facetNum = 0; facetNum < NUM_FACETS_PER_DOC; facetNum++) {
cPaths[docNum][facetNum] = new CategoryPath(Integer
.toString(random.nextInt(7)), Integer.toString(random.nextInt(10)));
}
}
index(indexDir, taxoDir, MULTI_IPARAMS, docTitles, docTexts, cPaths);
}
/**
* More advanced method for specifying custom indexing params, doc texts,
* doc titles and category paths.
*/
public static void index(Directory indexDir, Directory taxoDir,
FacetIndexingParams iParams, String[] docTitles,
String[] docTexts, CategoryPath[][] cPaths) throws Exception {
// create and open an index writer
IndexWriter iw = new IndexWriter(indexDir, new IndexWriterConfig(
ExampleUtils.EXAMPLE_VER, SimpleUtils.analyzer).setOpenMode(OpenMode.CREATE));
// create and open a taxonomy writer
LuceneTaxonomyWriter taxo = new LuceneTaxonomyWriter(taxoDir, OpenMode.CREATE);
index(iw, taxo, iParams, docTitles, docTexts, cPaths);
}
/**
* More advanced method for specifying custom indexing params, doc texts,
* doc titles and category paths.
* <p>
* Create an index, and adds to it sample documents and facets.
* @throws Exception
* on error (no detailed exception handling here for sample
* simplicity
*/
public static void index(IndexWriter iw, LuceneTaxonomyWriter taxo,
FacetIndexingParams iParams, String[] docTitles,
String[] docTexts, CategoryPath[][] cPaths) throws Exception {
// loop over sample documents
int nDocsAdded = 0;
int nFacetsAdded = 0;
for (int docNum = 0; docNum < SimpleUtils.docTexts.length; docNum++) {
List<CategoryPath> facetList = SimpleUtils.categoryPathArrayToList(cPaths[docNum]);
// we do not alter indexing parameters!
// a category document builder will add the categories to a document
// once build() is called
DocumentBuilder categoryDocBuilder = new CategoryDocumentBuilder(
taxo, iParams).setCategoryPaths(facetList);
// create a plain Lucene document and add some regular Lucene fields
// to it
Document doc = new Document();
doc.add(new Field(SimpleUtils.TITLE, docTitles[docNum], Store.YES, Index.ANALYZED));
doc.add(new Field(SimpleUtils.TEXT, docTexts[docNum], Store.NO, Index.ANALYZED));
// finally add the document to the index
categoryDocBuilder.build(doc);
iw.addDocument(doc);
nDocsAdded++;
nFacetsAdded += facetList.size();
}
// commit changes.
// we commit changes to the taxonomy index prior to committing them to
// the search index.
// this is important, so that all facets referred to by documents in the
// search index
// will indeed exist in the taxonomy index.
taxo.commit();
iw.commit();
// close the taxonomy index and the index - all modifications are
// now safely in the provided directories: indexDir and taxoDir.
taxo.close();
iw.close();
ExampleUtils.log("Indexed " + nDocsAdded + " documents with overall "
+ nFacetsAdded + " facets.");
}
public static void main(String[] args) throws Exception {
index(new RAMDirectory(), new RAMDirectory());
}
}

View File

@ -0,0 +1,65 @@
package org.apache.lucene.facet.example.multiCL;
import java.util.List;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.facet.example.ExampleResult;
import org.apache.lucene.facet.example.ExampleUtils;
import org.apache.lucene.facet.search.results.FacetResult;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* @lucene.experimental
*/
public class MultiCLMain {

  /**
   * Entry point: runs the multi-category-list sample once.
   *
   * @throws Exception
   *           on error (no detailed exception handling here for sample
   *           simplicity)
   */
  public static void main(String[] args) throws Exception {
    new MultiCLMain().runSample();
    ExampleUtils.log("DONE");
  }

  /** Indexes the sample documents into RAM directories, then facet-searches them. */
  public ExampleResult runSample() throws Exception {
    // fresh in-memory directories for the search index and the taxonomy index
    Directory indexDir = new RAMDirectory();
    Directory taxoDir = new RAMDirectory();

    // index the sample documents
    ExampleUtils.log("index the sample documents...");
    MultiCLIndexer.index(indexDir, taxoDir);

    ExampleUtils.log("search the sample documents...");
    List<FacetResult> facetRes =
        MultiCLSearcher.searchWithFacets(indexDir, taxoDir, MultiCLIndexer.MULTI_IPARAMS);

    ExampleResult result = new ExampleResult();
    result.setFacetResults(facetRes);
    return result;
  }
}

View File

@ -0,0 +1,128 @@
package org.apache.lucene.facet.example.multiCL;
import java.util.List;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.facet.example.ExampleUtils;
import org.apache.lucene.facet.example.simple.SimpleUtils;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.search.FacetsCollector;
import org.apache.lucene.facet.search.params.CountFacetRequest;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * MultiCLSearcher searches an index with facets, over an index with multiple
* category lists.
*
* @lucene.experimental
*/
public class MultiCLSearcher {

  /**
   * Opens the search index and taxonomy from the given directories, runs the
   * faceted search over them, and closes both readers before returning.
   *
   * @param indexDir
   *          Directory of the search index.
   * @param taxoDir
   *          Directory of the taxonomy index.
   * @param iParams
   *          the indexing params the index was built with.
   * @return facet results
   * @throws Exception
   *           on error (no detailed exception handling here for sample
   *           simplicity)
   */
  public static List<FacetResult> searchWithFacets(Directory indexDir,
      Directory taxoDir, FacetIndexingParams iParams) throws Exception {
    // open the search index and its matching taxonomy
    IndexReader reader = IndexReader.open(indexDir);
    TaxonomyReader taxoReader = new LuceneTaxonomyReader(taxoDir);

    List<FacetResult> results = searchWithFacets(reader, taxoReader, iParams);

    // we're done - release the index reader and the taxonomy
    reader.close();
    taxoReader.close();
    return results;
  }

  /**
   * Runs the faceted search against already-open readers. The caller keeps
   * ownership of the readers and must close them.
   */
  public static List<FacetResult> searchWithFacets(IndexReader indexReader,
      TaxonomyReader taxo, FacetIndexingParams iParams) throws Exception {
    // prepare searcher to search against
    IndexSearcher searcher = new IndexSearcher(indexReader);

    // faceted search works in two steps:
    // 1. collect the matching documents
    // 2. aggregate facets over the collected documents and build the
    //    requested faceted results from the aggregated facets

    // step 1: a query selecting the documents over which facets accumulate
    Query q = new TermQuery(new Term(SimpleUtils.TEXT, "Quis"));
    ExampleUtils.log("Query: " + q);
    TopScoreDocCollector topDocsCollector = TopScoreDocCollector.create(10, true);

    // the facet search params state which facets we are interested in
    FacetSearchParams facetSearchParams = new FacetSearchParams(iParams);
    facetSearchParams.addFacetRequest(new CountFacetRequest(
        new CategoryPath("5"), 10));
    facetSearchParams.addFacetRequest(new CountFacetRequest(
        new CategoryPath("5", "5"), 10));
    facetSearchParams.addFacetRequest(new CountFacetRequest(
        new CategoryPath("6", "2"), 10));

    // FacetsCollector is the simplest interface for faceted search. It is
    // sufficient for many applications; when tighter control over faceted
    // search behavior is needed, other lower-level interfaces are available,
    // as demonstrated in other search examples.
    FacetsCollector facetsCollector = new FacetsCollector(
        facetSearchParams, indexReader, taxo);

    // run document search and facet accumulation in a single pass
    searcher.search(q, MultiCollector.wrap(topDocsCollector, facetsCollector));

    // obtain the facet results and print them
    List<FacetResult> results = facetsCollector.getFacetResults();
    int i = 0;
    for (FacetResult facetResult : results) {
      ExampleUtils.log("Res " + (i++) + ": " + facetResult);
    }
    return results;
  }
}

View File

@ -0,0 +1,102 @@
package org.apache.lucene.facet.example.simple;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.DocumentBuilder;
import org.apache.lucene.facet.example.ExampleUtils;
import org.apache.lucene.facet.index.CategoryDocumentBuilder;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Sample indexer creates an index, and adds to it sample documents and facets.
*
* @lucene.experimental
*/
public class SimpleIndexer {

  /**
   * Create an index, and adds to it sample documents and facets.
   * @param indexDir Directory in which the index should be created.
   * @param taxoDir Directory in which the taxonomy index should be created.
   * @throws Exception on error (no detailed exception handling here for sample simplicity)
   */
  public static void index (Directory indexDir, Directory taxoDir) throws Exception {

    // create and open an index writer
    IndexWriter iw = new IndexWriter(indexDir, new IndexWriterConfig(ExampleUtils.EXAMPLE_VER, SimpleUtils.analyzer));

    // create and open a taxonomy writer (CREATE mode: any previous taxonomy is discarded)
    TaxonomyWriter taxo = new LuceneTaxonomyWriter(taxoDir, OpenMode.CREATE);

    // loop over sample documents
    int nDocsAdded = 0;
    int nFacetsAdded = 0;
    for (int docNum=0; docNum<SimpleUtils.docTexts.length; docNum++) {

      // obtain the sample facets for current document
      List<CategoryPath> facetList = SimpleUtils.categoryPathArrayToList(SimpleUtils.categories[docNum]);

      // we do not alter indexing parameters!
      // a category document builder will add the categories to a document once build() is called
      DocumentBuilder categoryDocBuilder = new CategoryDocumentBuilder(taxo).setCategoryPaths(facetList);

      // create a plain Lucene document and add some regular Lucene fields to it
      Document doc = new Document();
      doc.add(new Field(SimpleUtils.TITLE, SimpleUtils.docTitles[docNum], Store.YES, Index.ANALYZED));
      doc.add(new Field(SimpleUtils.TEXT, SimpleUtils.docTexts[docNum], Store.NO, Index.ANALYZED));

      // invoke the category document builder for adding categories to the document and,
      // as required, to the taxonomy index
      categoryDocBuilder.build(doc);

      // finally add the document to the index
      iw.addDocument(doc);

      nDocsAdded ++;
      nFacetsAdded += facetList.size();
    }

    // commit changes.
    // we commit changes to the taxonomy index prior to committing them to the search index.
    // this is important, so that all facets referred to by documents in the search index
    // will indeed exist in the taxonomy index.
    taxo.commit();
    iw.commit();

    // close the taxonomy index and the index - all modifications are
    // now safely in the provided directories: indexDir and taxoDir.
    taxo.close();
    iw.close();

    ExampleUtils.log("Indexed "+nDocsAdded+" documents with overall "+nFacetsAdded+" facets.");
  }
}

View File

@ -0,0 +1,99 @@
package org.apache.lucene.facet.example.simple;
import java.util.List;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.facet.example.ExampleResult;
import org.apache.lucene.facet.example.ExampleUtils;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Driver for the simple sample.
*
* @lucene.experimental
*/
public class SimpleMain {

  /**
   * Driver for the simple sample.
   * @throws Exception on error (no detailed exception handling here for sample simplicity)
   */
  public static void main(String[] args) throws Exception {
    new SimpleMain().runSimple();
    new SimpleMain().runDrillDown().getFacetResults();
    ExampleUtils.log("DONE");
  }

  /** Index the sample documents and run the plain faceted search over them. */
  public ExampleResult runSimple() throws Exception {
    return run(false);
  }

  /** Index the sample documents and run the drill-down faceted search over them. */
  public ExampleResult runDrillDown() throws Exception {
    return run(true);
  }

  /**
   * Shared driver logic for both sample runs: index the sample documents into
   * fresh RAM directories, search them, and wrap the facet results.
   * (Extracted because runSimple/runDrillDown were identical except for the
   * search call.)
   * @param drillDown whether to run the drill-down search variant instead of
   *        the plain faceted search
   * @return the facet results of the chosen search
   * @throws Exception on error (no detailed exception handling here for sample simplicity)
   */
  private ExampleResult run(boolean drillDown) throws Exception {
    // create Directories for the search index and for the taxonomy index
    Directory indexDir = new RAMDirectory();
    Directory taxoDir = new RAMDirectory();

    // index the sample documents
    ExampleUtils.log("index the sample documents...");
    SimpleIndexer.index(indexDir, taxoDir);

    // open readers
    TaxonomyReader taxo = new LuceneTaxonomyReader(taxoDir);
    IndexReader indexReader = IndexReader.open(indexDir, true);

    ExampleUtils.log("search the sample documents...");
    List<FacetResult> facetRes = drillDown
        ? SimpleSearcher.searchWithDrillDown(indexReader, taxo)
        : SimpleSearcher.searchWithFacets(indexReader, taxo);

    // close readers
    taxo.close();
    indexReader.close();

    ExampleResult res = new ExampleResult();
    res.setFacetResults(facetRes);
    return res;
  }
}

View File

@ -0,0 +1,168 @@
package org.apache.lucene.facet.example.simple;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.facet.example.ExampleUtils;
import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.search.DrillDown;
import org.apache.lucene.facet.search.FacetsCollector;
import org.apache.lucene.facet.search.params.CountFacetRequest;
import org.apache.lucene.facet.search.params.FacetRequest;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.search.results.FacetResultNode;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * SimpleSearcher searches an index with facets.
*
* @lucene.experimental
*/
public class SimpleSearcher {

  /**
   * Search an index with facets.
   * @param indexReader index reader.
   * @param taxoReader taxonomy reader.
   * @throws Exception on error (no detailed exception handling here for sample simplicity)
   * @return facet results
   */
  public static List<FacetResult> searchWithFacets (IndexReader indexReader,
      TaxonomyReader taxoReader) throws Exception {
    // count the "root/a" subtree, asking for at most 10 labels
    return searchWithRequest(indexReader, taxoReader, null,
        new CountFacetRequest(new CategoryPath("root", "a"), 10));
  }

  /**
   * Search an index with facets for given facet requests.
   * @param indexReader index reader.
   * @param taxoReader taxonomy reader.
   * @param indexingParams the facet indexing params
   * @param facetRequests facet requests of interest
   * @throws Exception on error (no detailed exception handling here for sample simplicity)
   * @return facet results
   */
  public static List<FacetResult> searchWithRequest(IndexReader indexReader,
      TaxonomyReader taxoReader, FacetIndexingParams indexingParams,
      FacetRequest... facetRequests) throws Exception {
    // fixed sample query: all documents containing "white" in the text field
    Query query = new TermQuery(new Term(SimpleUtils.TEXT, "white"));
    return searchWithRequestAndQuery(query, indexReader, taxoReader,
        indexingParams, facetRequests);
  }

  /**
   * Search an index with facets for given query and facet requests.
   * @param q query of interest
   * @param indexReader index reader.
   * @param taxoReader taxonomy reader.
   * @param indexingParams the facet indexing params (may be null, in which
   *        case default indexing params are used)
   * @param facetRequests facet requests of interest
   * @throws Exception on error (no detailed exception handling here for sample simplicity)
   * @return facet results
   */
  public static List<FacetResult> searchWithRequestAndQuery(Query q,
      IndexReader indexReader, TaxonomyReader taxoReader,
      FacetIndexingParams indexingParams, FacetRequest... facetRequests)
      throws Exception {

    ExampleUtils.log("Query: " + q);

    // searcher over the given reader, plus a collector for the top-10 matches
    IndexSearcher searcher = new IndexSearcher(indexReader);
    TopScoreDocCollector topDocs = TopScoreDocCollector.create(10, true);

    // fall back to default indexing params when none were supplied
    FacetIndexingParams params =
        (indexingParams == null) ? new DefaultFacetIndexingParams() : indexingParams;

    // register every facet request of interest with the search params
    FacetSearchParams searchParams = new FacetSearchParams(params);
    for (FacetRequest request : facetRequests) {
      searchParams.addFacetRequest(request);
    }

    // the facets collector is the simplest interface for faceted search;
    // it accumulates facet counts while documents are collected
    FacetsCollector facetsCollector = new FacetsCollector(searchParams, indexReader, taxoReader);

    // run the search; matching docs feed both the top-docs collector and
    // the facet accumulation
    searcher.search(q, MultiCollector.wrap(topDocs, facetsCollector));

    // fetch the accumulated facet results and log each one
    List<FacetResult> results = facetsCollector.getFacetResults();
    int i = 0;
    for (FacetResult facetResult : results) {
      ExampleUtils.log("Res " + (i++) + ": " + facetResult);
    }
    return results;
  }

  /**
   * Search an index with facets drill-down.
   * @param indexReader index reader.
   * @param taxoReader taxonomy reader.
   * @throws Exception on error (no detailed exception handling here for sample simplicity)
   * @return facet results
   */
  public static List<FacetResult> searchWithDrillDown(IndexReader indexReader,
      TaxonomyReader taxoReader) throws Exception {

    // base query the user is interested in
    Query baseQuery = new TermQuery(new Term(SimpleUtils.TEXT, "white"));

    // facet of interest
    CountFacetRequest facetRequest = new CountFacetRequest(new CategoryPath("root", "a"), 10);

    // initial search - all docs matching the base query contribute to the accumulation
    List<FacetResult> initialResults = searchWithRequest(indexReader, taxoReader, null, facetRequest);

    // a single result (because there was a single request)
    FacetResult topLevel = initialResults.get(0);

    // assume the user is interested in the second sub-result
    // (just take the second sub-result from the iterator - we know there are 3 results!)
    Iterator<? extends FacetResultNode> subResults =
        topLevel.getFacetResultNode().getSubResults().iterator();
    subResults.next(); // skip first result
    CategoryPath categoryOfInterest = subResults.next().getLabel();

    // drill-down preparation: turn the base query into a drill-down query for
    // the category of interest; only documents both matching the base query
    // AND containing the category will contribute to the new accumulation
    Query drillDownQuery = DrillDown.query(baseQuery, categoryOfInterest);
    return searchWithRequestAndQuery(drillDownQuery, indexReader, taxoReader, null, facetRequest);
  }
}

View File

@ -0,0 +1,87 @@
package org.apache.lucene.facet.example.simple;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.facet.example.ExampleUtils;
import org.apache.lucene.facet.taxonomy.CategoryPath;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Some definitions for the Simple Sample.
*
* @lucene.experimental
*/
public class SimpleUtils {

  /** Documents text field. */
  public static final String TEXT = "text";

  /** Documents title field. */
  public static final String TITLE = "title";

  /** sample documents text (for the text field). */
  public static String[] docTexts = {
    "the white car is the one I want.",
    "the white dog does not belong to anyone.",
  };

  /** sample documents titles (for the title field). */
  public static String[] docTitles = {
    "white car",
    "white dog",
  };

  /** Categories: categories[D][N] == category-path no. N for document no. D. */
  public static CategoryPath[][] categories = {
    { new CategoryPath("root","a","f1"), new CategoryPath("root","a","f2") },
    { new CategoryPath("root","a","f1"), new CategoryPath("root","a","f3") },
  };

  /** Analyzer used in the simple sample. */
  public static final Analyzer analyzer = new WhitespaceAnalyzer(ExampleUtils.EXAMPLE_VER);

  /**
   * Utility method: List of category paths out of an array of them...
   * @param categoryPaths input array of category paths.
   */
  public static List<CategoryPath> categoryPathArrayToList (CategoryPath...categoryPaths) {
    // copy into a list presized to the input length
    ArrayList<CategoryPath> pathList = new ArrayList<CategoryPath>(categoryPaths.length);
    for (int i = 0; i < categoryPaths.length; i++) {
      pathList.add(categoryPaths[i]);
    }
    return pathList;
  }
}

View File

@ -0,0 +1,17 @@
<html>
<head>
<title>Simple faceted indexing and search sample</title>
</head>
<body>
<h1>Simple faceted indexing and search sample</h1>
A simple faceted example, showing how to:
<ol>
<li>Create an index.</li>
<li>Add documents with facets to the index.</li>
<li>Search the index.</li>
</ol>
For more complex examples see the other sample code packages.
</body>
</html>

View File

@ -0,0 +1,77 @@
package org.apache.lucene;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* An interface which standardizes the process of building an indexable
* {@link Document}.
* <p>
* The idea is that implementations implement {@link #build(Document doc)},
* which adds to the given Document whatever {@link Field}s it wants to add. A
* DocumentBuilder is also allowed to inspect or change existing Fields in the
* Document, if it wishes to.
* <p>
* Implementations should normally have a constructor with parameters which
* determine what {@link #build(Document)} will add to doc.<br>
* To allow reuse of the DocumentBuilder object, implementations are also
* encouraged to have a setter method, which remembers its parameters just like
* the constructor. This setter method cannot be described in this interface,
* because it will take different parameters in each implementation.
* <p>
* The interface defines a builder pattern, which allows applications to invoke
* several document builders in the following way:
*
* <pre>
* builder1.build(builder2.build(builder3.build(new Document())));
* </pre>
*
* @lucene.experimental
*/
public interface DocumentBuilder {

  /** An exception thrown from {@link DocumentBuilder}'s build(). */
  public static class DocumentBuilderException extends Exception {

    /** Creates an exception with neither detail message nor cause. */
    public DocumentBuilderException() {
      super();
    }

    /** Creates an exception with the given detail message. */
    public DocumentBuilderException(String message) {
      super(message);
    }

    /** Creates an exception with the given detail message and cause. */
    public DocumentBuilderException(String message, Throwable cause) {
      super(message, cause);
    }

    /** Creates an exception with the given cause. */
    public DocumentBuilderException(Throwable cause) {
      super(cause);
    }
  }

  /**
   * Adds to the given document whatever {@link Field}s the implementation needs
   * to add. Returns the document instance to allow for chaining calls.
   */
  public Document build(Document doc) throws DocumentBuilderException;
}

View File

@ -0,0 +1,46 @@
package org.apache.lucene.facet;
import java.io.IOException;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A parent class for exceptions thrown by the Facets code.
*
* @lucene.experimental
*/
public class FacetException extends IOException {

  /** Creates an exception with neither detail message nor cause. */
  public FacetException() {
    super();
  }

  /** Creates an exception with the given detail message and no cause. */
  public FacetException(String message) {
    super(message);
  }

  /**
   * Creates an exception with the given detail message and cause.
   * Uses {@link IOException#IOException(String, Throwable)} directly instead
   * of a separate {@link #initCause} call; the observable message and cause
   * are unchanged.
   */
  public FacetException(String message, Throwable cause) {
    super(message, cause);
  }

  /**
   * Creates an exception with the given cause and a {@code null} detail
   * message. Deliberately uses {@link #initCause} rather than
   * {@code IOException(Throwable)}, which would set the message to
   * {@code cause.toString()} and thus change existing behavior.
   */
  public FacetException(Throwable cause) {
    initCause(cause);
  }
}

View File

@ -0,0 +1,127 @@
package org.apache.lucene.facet.enhancements;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.facet.enhancements.params.EnhancementsIndexingParams;
import org.apache.lucene.facet.index.attributes.CategoryAttribute;
import org.apache.lucene.facet.index.attributes.CategoryProperty;
import org.apache.lucene.facet.index.streaming.CategoryListTokenizer;
import org.apache.lucene.facet.index.streaming.CategoryParentsStream;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* This interface allows easy addition of enhanced category features. Usually, a
* {@link CategoryEnhancement} will correspond to a {@link CategoryProperty}.
* <p>
* A category enhancement can contribute to the index in two possible ways:
* <ol>
* <li>To each category with data relevant to the enhancement, add this data to
* the category's token payload, through
* {@link #getCategoryTokenBytes(CategoryAttribute)}. This data will be read
* during search using {@link #extractCategoryTokenData(byte[], int, int)}.</li>
* <li>To each document which contains categories with data relevant to the
* enhancement, add a {@link CategoryListTokenizer} through
* {@link #getCategoryListTokenizer(TokenStream, EnhancementsIndexingParams, TaxonomyWriter)}
* . The {@link CategoryListTokenizer} should add a single token which includes
* all the enhancement relevant data from the categories. The category list
* token's text is defined by {@link #getCategoryListTermText()}.</li>
* </ol>
*
* @lucene.experimental
*/
public interface CategoryEnhancement {

  /**
   * Get the bytes to be added to the category token payload for this
   * enhancement.
   * <p>
   * <b>NOTE</b>: The returned array is copied, it is recommended to allocate
   * a new one each time.
   * <p>
   * The bytes generated by this method are the input of
   * {@link #extractCategoryTokenData(byte[], int, int)}.
   *
   * @param categoryAttribute
   *            The attribute of the category.
   * @return The bytes to be added to the category token payload for this
   *         enhancement.
   */
  byte[] getCategoryTokenBytes(CategoryAttribute categoryAttribute);

  /**
   * Get the data of this enhancement from a category token payload.
   * <p>
   * The input bytes for this method are generated in
   * {@link #getCategoryTokenBytes(CategoryAttribute)}.
   *
   * @param buffer
   *            The payload buffer.
   * @param offset
   *            The offset of this enhancement's data in the buffer.
   * @param length
   *            The length of this enhancement's data (bytes).
   * @return An Object containing the data.
   */
  Object extractCategoryTokenData(byte[] buffer, int offset, int length);

  /**
   * Declarative method to indicate whether this enhancement generates
   * separate category list.
   *
   * @return {@code true} if generates category list, else {@code false}.
   */
  boolean generatesCategoryList();

  /**
   * Returns the text of this enhancement's category list term.
   *
   * @return The text of this enhancement's category list term.
   */
  String getCategoryListTermText();

  /**
   * Get the {@link CategoryListTokenizer} which generates the category list
   * for this enhancement. If {@link #generatesCategoryList()} returns
   * {@code false} this method will not be called.
   *
   * @param tokenizer
   *            The input stream containing categories.
   * @param indexingParams
   *            The indexing params to use.
   * @param taxonomyWriter
   *            The taxonomy to add categories and get their ordinals.
   * @return A {@link CategoryListTokenizer} generating the category list for
   *         this enhancement, with {@code tokenizer} as it's input.
   */
  CategoryListTokenizer getCategoryListTokenizer(TokenStream tokenizer,
      EnhancementsIndexingParams indexingParams,
      TaxonomyWriter taxonomyWriter);

  /**
   * Get a {@link CategoryProperty} class to be retained when creating
   * {@link CategoryParentsStream}.
   *
   * @return the {@link CategoryProperty} class to be retained when creating
   *         {@link CategoryParentsStream}, or {@code null} if there is no
   *         such property.
   */
  Class<? extends CategoryProperty> getRetainableProperty();
}

View File

@ -0,0 +1,121 @@
package org.apache.lucene.facet.enhancements;
import java.io.IOException;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.facet.enhancements.params.EnhancementsIndexingParams;
import org.apache.lucene.facet.index.streaming.CategoryTokenizer;
import org.apache.lucene.util.Vint8;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A tokenizer which adds to each category token payload according to the
* {@link CategoryEnhancement}s defined in the given
* {@link EnhancementsIndexingParams}.
*
* @lucene.experimental
*/
public class EnhancementsCategoryTokenizer extends CategoryTokenizer {

  /**
   * The data buffer used for payload instance; starts with the enhancement
   * count written once in the constructor, and is grown on demand in
   * {@link #setPayload()}.
   */
  protected byte[] payloadBytes;

  /**
   * The category enhancements to handle
   */
  protected List<CategoryEnhancement> enhancements;

  /**
   * Buffers for enhancement payload bytes
   */
  protected byte[][] enhancementBytes;

  // number of bytes occupied by the enhancement-count header written once in
  // the constructor; every payload's per-category data starts at this offset
  private int nStart;

  /**
   * Constructor.
   *
   * @param input
   *            The stream of category tokens.
   * @param indexingParams
   *            The indexing params to use.
   * @throws IOException
   */
  public EnhancementsCategoryTokenizer(TokenStream input,
      EnhancementsIndexingParams indexingParams) throws IOException {
    super(input, indexingParams);
    // initial capacity: one max-size varint per enhancement length plus one
    // for the count header.
    // NOTE(review): getCategoryEnhancements() is dereferenced here before the
    // null check below - if it can ever return null this line throws NPE
    // first; confirm the intended contract.
    payloadBytes = new byte[Vint8.MAXIMUM_BYTES_NEEDED
        * (indexingParams.getCategoryEnhancements().size() + 1)];
    enhancements = indexingParams.getCategoryEnhancements();
    if (enhancements != null) {
      // create array of bytes per enhancement
      enhancementBytes = new byte[enhancements.size()][];
      // write once the number of enhancements in the payload bytes
      nStart = Vint8.encode(enhancements.size(), payloadBytes, 0);
    }
  }

  /**
   * Builds the payload for the current category token:
   * [count][len_1]...[len_n][bytes_1]...[bytes_n], where each length is a
   * Vint8-encoded varint and a null byte array from an enhancement is encoded
   * as length 0 with no data bytes.
   */
  @Override
  protected void setPayload() {
    this.payloadAttribute.setPayload(null);
    if (enhancements == null) {
      return;
    }
    // clear previous payload content: restart right after the count header
    int nBytes = nStart;
    int i = 0;
    int nEnhancementBytes = 0;
    for (CategoryEnhancement enhancement : enhancements) {
      // get payload bytes from each enhancement
      enhancementBytes[i] = enhancement
          .getCategoryTokenBytes(categoryAttribute);
      // write the number of bytes in the payload
      if (enhancementBytes[i] == null) {
        nBytes += Vint8.encode(0, payloadBytes, nBytes);
      } else {
        nBytes += Vint8.encode(enhancementBytes[i].length,
            payloadBytes, nBytes);
        nEnhancementBytes += enhancementBytes[i].length;
      }
      i++;
    }
    // when no enhancement produced any bytes the payload stays null (cleared
    // above); otherwise append the data sections after the length headers
    if (nEnhancementBytes > 0) {
      // make sure we have space for all bytes
      if (payloadBytes.length < nBytes + nEnhancementBytes) {
        byte[] temp = new byte[(nBytes + nEnhancementBytes) * 2];
        System.arraycopy(payloadBytes, 0, temp, 0, nBytes);
        payloadBytes = temp;
      }
      for (i = 0; i < enhancementBytes.length; i++) {
        // add the enhancement payload bytes after the existing bytes
        if (enhancementBytes[i] != null) {
          System.arraycopy(enhancementBytes[i], 0, payloadBytes,
              nBytes, enhancementBytes[i].length);
          nBytes += enhancementBytes[i].length;
        }
      }
      payload.setData(payloadBytes, 0, nBytes);
      payloadAttribute.setPayload(payload);
    }
  }
}

View File

@ -0,0 +1,93 @@
package org.apache.lucene.facet.enhancements;
import java.io.IOException;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.facet.enhancements.params.EnhancementsIndexingParams;
import org.apache.lucene.facet.index.CategoryDocumentBuilder;
import org.apache.lucene.facet.index.attributes.CategoryProperty;
import org.apache.lucene.facet.index.streaming.CategoryAttributesStream;
import org.apache.lucene.facet.index.streaming.CategoryListTokenizer;
import org.apache.lucene.facet.index.streaming.CategoryParentsStream;
import org.apache.lucene.facet.index.streaming.CategoryTokenizer;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* An {@link EnhancementsDocumentBuilder} is a {@link CategoryDocumentBuilder}
* which adds categories to documents according to the list of
* {@link CategoryEnhancement}s from {@link EnhancementsIndexingParams}. The
* additions over {@link CategoryDocumentBuilder} could be in both category
* tokens, and additional category lists.
*
* @lucene.experimental
*/
public class EnhancementsDocumentBuilder extends CategoryDocumentBuilder {

  /**
   * @param taxonomyWriter
   * @param params
   *            Indexing params which include {@link CategoryEnhancement}s.
   * @throws IOException
   */
  public EnhancementsDocumentBuilder(TaxonomyWriter taxonomyWriter,
      EnhancementsIndexingParams params) throws IOException {
    super(taxonomyWriter, params);
  }

  /**
   * Builds the parents stream; when the params declare retainable properties,
   * a {@link CategoryParentsStream} configured with each of them is used
   * instead of the default parents stream.
   */
  @Override
  protected TokenStream getParentsStream(CategoryAttributesStream categoryAttributesStream) {
    EnhancementsIndexingParams enhancementsParams = (EnhancementsIndexingParams) indexingParams;
    List<Class<? extends CategoryProperty>> retainableProperties =
        enhancementsParams.getRetainableProperties();
    if (retainableProperties == null) {
      return super.getParentsStream(categoryAttributesStream);
    }
    CategoryParentsStream parentsStream = new CategoryParentsStream(
        categoryAttributesStream, taxonomyWriter, indexingParams);
    for (Class<? extends CategoryProperty> property : retainableProperties) {
      parentsStream.addRetainableProperty(property);
    }
    return parentsStream;
  }

  /**
   * Chains a {@link CategoryListTokenizer} for every enhancement that
   * generates its own category list on top of the default tokenizer.
   */
  @Override
  protected CategoryListTokenizer getCategoryListTokenizer(TokenStream categoryStream) {
    EnhancementsIndexingParams enhancementsParams = (EnhancementsIndexingParams) indexingParams;
    CategoryListTokenizer tokenizer = super.getCategoryListTokenizer(categoryStream);
    for (CategoryEnhancement enhancement : enhancementsParams.getCategoryEnhancements()) {
      if (enhancement.generatesCategoryList()) {
        tokenizer = enhancement.getCategoryListTokenizer(tokenizer,
            enhancementsParams, taxonomyWriter);
      }
    }
    return tokenizer;
  }

  /** Uses the payload-enhancing category tokenizer instead of the default one. */
  @Override
  protected CategoryTokenizer getCategoryTokenizer(TokenStream categoryStream)
      throws IOException {
    return new EnhancementsCategoryTokenizer(categoryStream,
        (EnhancementsIndexingParams) indexingParams);
  }
}

View File

@ -0,0 +1,105 @@
package org.apache.lucene.facet.enhancements;
import java.io.IOException;
import java.util.List;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.facet.search.PayloadIterator;
import org.apache.lucene.util.Vint8;
import org.apache.lucene.util.Vint8.Position;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A {@link PayloadIterator} for iterating over category posting lists generated
* using {@link EnhancementsCategoryTokenizer}.
*
* @lucene.experimental
*/
public class EnhancementsPayloadIterator extends PayloadIterator {

  // Renamed from 'EnhancedCategories' to follow lowerCamelCase field naming.
  private CategoryEnhancement[] enhancedCategories;
  int nEnhancements;
  private int[] enhancementLength;
  private int[] enhancementStart;

  /**
   * Constructor.
   *
   * @param enhancementsList
   *            A list of the {@link CategoryEnhancement}s from the indexing
   *            params.
   * @param indexReader
   *            A reader of the index.
   * @param term
   *            The category term to iterate.
   * @throws IOException
   *             on error opening the underlying payload iterator
   */
  public EnhancementsPayloadIterator(
      List<CategoryEnhancement> enhancementsList,
      IndexReader indexReader, Term term) throws IOException {
    super(indexReader, term);
    enhancedCategories = enhancementsList
        .toArray(new CategoryEnhancement[enhancementsList.size()]);
    enhancementLength = new int[enhancedCategories.length];
    enhancementStart = new int[enhancedCategories.length];
  }

  @Override
  public boolean setdoc(int docId) throws IOException {
    if (!super.setdoc(docId)) {
      return false;
    }

    // Read header - number of enhancements and their lengths.
    Position position = new Position();
    nEnhancements = Vint8.decode(buffer, position);
    for (int i = 0; i < nEnhancements; i++) {
      enhancementLength[i] = Vint8.decode(buffer, position);
    }

    // Compute each enhancement's start offset within the payload.
    // Guard against an empty header: without it, enhancementStart[0] throws
    // ArrayIndexOutOfBoundsException when the iterator was built with an
    // empty enhancements list (zero-length arrays).
    if (nEnhancements > 0) {
      enhancementStart[0] = position.pos;
      for (int i = 1; i < nEnhancements; i++) {
        enhancementStart[i] = enhancementStart[i - 1] + enhancementLength[i - 1];
      }
    }
    return true;
  }

  /**
   * Get the data of the current category and document for a certain
   * enhancement, or {@code null} if no such enhancement exists.
   *
   * @param enhancedCategory
   *            The category enhancement to apply.
   * @return the data of the current category and document for a certain
   *         enhancement, or {@code null} if no such enhancement exists.
   */
  public Object getCategoryData(CategoryEnhancement enhancedCategory) {
    for (int i = 0; i < nEnhancements; i++) {
      if (enhancedCategory.equals(enhancedCategories[i])) {
        return enhancedCategory.extractCategoryTokenData(buffer,
            enhancementStart[i], enhancementLength[i]);
      }
    }
    return null;
  }
}

View File

@ -0,0 +1,153 @@
package org.apache.lucene.facet.enhancements.association;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.facet.enhancements.CategoryEnhancement;
import org.apache.lucene.facet.enhancements.params.EnhancementsIndexingParams;
import org.apache.lucene.facet.index.attributes.CategoryAttribute;
import org.apache.lucene.facet.index.attributes.CategoryProperty;
import org.apache.lucene.facet.index.streaming.CategoryListTokenizer;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.util.Vint8;
import org.apache.lucene.util.Vint8.Position;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A {@link CategoryEnhancement} for adding associations data to the index
* (categories with {@link AssociationProperty}s).
*
* @lucene.experimental
*/
public class AssociationEnhancement implements CategoryEnhancement {

  // Term text of the dedicated posting list that carries association data.
  static final String CATEGORY_LIST_TERM_TEXT = "CATEGORY_ASSOCIATION_LIST";

  /** Property Classes which extend AssociationProperty */
  private static final HashSet<Class<? extends CategoryProperty>> ASSOCIATION_PROPERTY_CLASSES;

  /** Property Classes which do not extend AssociationProperty */
  private static final HashSet<Class<? extends CategoryProperty>> NON_ASSOCIATION_PROPERTY_CLASSES;

  static {
    ASSOCIATION_PROPERTY_CLASSES = new HashSet<Class<? extends CategoryProperty>>();
    NON_ASSOCIATION_PROPERTY_CLASSES = new HashSet<Class<? extends CategoryProperty>>();
  }

  /**
   * For a given class which extends a CategoryProperty, answers whether it is
   * an instance of AssociationProperty (AP) or not. <br>
   * This method is a cheaper replacement for a call to
   * <code>instanceof</code>. It has two HashSets - one for classes which are
   * an extension to AP and one for the classes which are not. Whenever a
   * property class is introduced:
   * <ul>
   * <li>if it is known as a property class extending AP (contained in the
   * validHashSet)- returns true</li>
   * <li>if it is known as a property class NOT extending AP - returns false</li>
   * <li>
   * If it was not matched against both sets, it calls 'instanceof' to find
   * out if it extends AP, puts it in the matching Set and returning true or
   * false accordingly</li>
   *</ul>
   *
   * NOTE: 'instanceof' is only called once per a Class (not instance) of a
   * property. And as there are few properties (currently 4 concrete
   * implementations) the two sets would be rather small
   * <p>
   * NOTE(review): the two static HashSets are mutated here without any
   * synchronization; if this is reachable from concurrent indexing threads
   * that is a data race - confirm the threading model or switch to a
   * concurrent set.
   */
  public static boolean isAssociationProperty(Class<? extends CategoryProperty> clazz) {
    if (ASSOCIATION_PROPERTY_CLASSES.contains(clazz)) {
      return true;
    }
    if (NON_ASSOCIATION_PROPERTY_CLASSES.contains(clazz)) {
      return false;
    }
    // First sighting of this class: classify once, cache the answer.
    if (AssociationProperty.class.isAssignableFrom(clazz)) {
      ASSOCIATION_PROPERTY_CLASSES.add(clazz);
      return true;
    }
    NON_ASSOCIATION_PROPERTY_CLASSES.add(clazz);
    return false;
  }

  /** Associations are written to a dedicated category list. */
  public boolean generatesCategoryList() {
    return true;
  }

  /** Returns the term text of the associations category list. */
  public String getCategoryListTermText() {
    return CATEGORY_LIST_TERM_TEXT;
  }

  /**
   * Returns a tokenizer which appends this enhancement's association
   * category-list token to the given stream.
   */
  public CategoryListTokenizer getCategoryListTokenizer(
      TokenStream tokenizer, EnhancementsIndexingParams indexingParams,
      TaxonomyWriter taxonomyWriter) {
    return new AssociationListTokenizer(tokenizer, indexingParams, this);
  }

  /**
   * Encodes the category's association value as Vint8 bytes for the category
   * token payload, or returns {@code null} when the category carries no
   * association property.
   */
  public byte[] getCategoryTokenBytes(CategoryAttribute categoryAttribute) {

    AssociationProperty property = getAssociationProperty(categoryAttribute);

    if (property == null) {
      return null;
    }

    int association = property.getAssociation();
    int bytesNeeded = Vint8.bytesNeeded(association);

    byte[] buffer = new byte[bytesNeeded];
    Vint8.encode(association, buffer, 0);

    return buffer;
  }

  /**
   * Finds the first {@link AssociationProperty} attached to the given
   * category attribute, or {@code null} if the attribute has no property
   * classes or none of them is an association.
   */
  public static AssociationProperty getAssociationProperty(
      CategoryAttribute categoryAttribute) {
    AssociationProperty property = null;
    Set<Class<? extends CategoryProperty>> propertyClasses = categoryAttribute
        .getPropertyClasses();
    if (propertyClasses == null) {
      return null;
    }
    for (Class<? extends CategoryProperty> clazz : propertyClasses) {
      if (isAssociationProperty(clazz)) {
        property = (AssociationProperty) categoryAttribute
            .getProperty(clazz);
        break;
      }
    }
    return property;
  }

  /**
   * Decodes the Vint8-encoded association value written by
   * {@link #getCategoryTokenBytes(CategoryAttribute)}; {@code null} when the
   * token carried no association data (length 0).
   */
  public Object extractCategoryTokenData(byte[] buffer, int offset, int length) {
    if (length == 0) {
      return null;
    }
    Integer i = Integer.valueOf(Vint8.decode(buffer, new Position(offset)));
    return i;
  }

  /** No property of this enhancement needs retaining; see interface docs. */
  public Class<? extends CategoryProperty> getRetainableProperty() {
    return null;
  }
}

View File

@ -0,0 +1,74 @@
package org.apache.lucene.facet.enhancements.association;
import org.apache.lucene.facet.index.attributes.CategoryProperty;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* An {@link AssociationProperty} which treats the association as float - the
* association bits are actually float bits, and thus merging two associations
* is done by float summation.
*
* @lucene.experimental
*/
public class AssociationFloatProperty extends AssociationProperty {

  /**
   * Creates a float association; the float's raw bits are stored as the
   * underlying integer association value.
   *
   * @param value
   *            The association value.
   */
  public AssociationFloatProperty(float value) {
    super(Float.floatToIntBits(value));
  }

  @Override
  public boolean equals(Object other) {
    if (this == other) {
      return true;
    }
    if (!(other instanceof AssociationFloatProperty)) {
      return false;
    }
    return ((AssociationFloatProperty) other).association == association;
  }

  @Override
  public int hashCode() {
    return "AssociationFloatProperty".hashCode() * 31 + (int) association;
  }

  /** Merges by float summation: this association becomes this + other. */
  public void merge(CategoryProperty other) {
    AssociationFloatProperty that = (AssociationFloatProperty) other;
    float mine = Float.intBitsToFloat((int) association);
    float theirs = Float.intBitsToFloat((int) that.association);
    association = Float.floatToIntBits(mine + theirs);
  }

  /** Returns the association value interpreted as a float. */
  public float getFloatAssociation() {
    return Float.intBitsToFloat((int) association);
  }

  @Override
  public String toString() {
    return getClass().getSimpleName() + ": " + Float.intBitsToFloat(getAssociation());
  }
}

View File

@ -0,0 +1,60 @@
package org.apache.lucene.facet.enhancements.association;
import org.apache.lucene.facet.index.attributes.CategoryProperty;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* An {@link AssociationProperty} which treats the association as int - merges
* two associations by summation.
*
* @lucene.experimental
*/
public class AssociationIntProperty extends AssociationProperty {

  /**
   * @param value
   *            The association value.
   */
  public AssociationIntProperty(int value) {
    super(value);
  }

  @Override
  public boolean equals(Object other) {
    if (this == other) {
      return true;
    }
    if (!(other instanceof AssociationIntProperty)) {
      return false;
    }
    return ((AssociationIntProperty) other).association == association;
  }

  @Override
  public int hashCode() {
    return "AssociationIntProperty".hashCode() * 31 + (int) association;
  }

  /** Merges two int associations by summation. */
  public void merge(CategoryProperty other) {
    association += ((AssociationIntProperty) other).association;
  }
}

View File

@ -0,0 +1,90 @@
package org.apache.lucene.facet.enhancements.association;
import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.facet.enhancements.CategoryEnhancement;
import org.apache.lucene.facet.enhancements.params.EnhancementsIndexingParams;
import org.apache.lucene.facet.index.CategoryListPayloadStream;
import org.apache.lucene.facet.index.attributes.OrdinalProperty;
import org.apache.lucene.facet.index.streaming.CategoryListTokenizer;
import org.apache.lucene.util.encoding.SimpleIntEncoder;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Tokenizer for associations of a category
*
* @lucene.experimental
*/
public class AssociationListTokenizer extends CategoryListTokenizer {

  // Accumulates (ordinal, association) int pairs for the current document;
  // null until the first association is encountered.
  protected CategoryListPayloadStream payloadStream;

  // Term text under which the accumulated associations payload is emitted.
  private String categoryListTermText;

  public AssociationListTokenizer(TokenStream input,
      EnhancementsIndexingParams indexingParams, CategoryEnhancement enhancement) {
    super(input, indexingParams);
    categoryListTermText = enhancement.getCategoryListTermText();
  }

  @Override
  protected void handleStartOfInput() throws IOException {
    // Drop any state accumulated for the previous document.
    payloadStream = null;
  }

  @Override
  public final boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      if (categoryAttribute != null) {
        AssociationProperty associationProperty = AssociationEnhancement
            .getAssociationProperty(categoryAttribute);
        if (associationProperty != null
            && associationProperty.hasBeenSet()) {
          // An association is meaningless without the category's ordinal.
          OrdinalProperty ordinalProperty = (OrdinalProperty) categoryAttribute
              .getProperty(OrdinalProperty.class);
          if (ordinalProperty == null) {
            throw new IOException(
                "Error: Association without ordinal");
          }

          // Lazily create the payload stream on the first association seen.
          if (payloadStream == null) {
            payloadStream = new CategoryListPayloadStream(
                new SimpleIntEncoder());
          }
          // Append the pair: ordinal first, then its association value.
          payloadStream.appendIntToStream(ordinalProperty
              .getOrdinal());
          payloadStream.appendIntToStream(associationProperty
              .getAssociation());
        }
      }
      return true;
    }
    // Input exhausted: emit one trailing token that carries all accumulated
    // pairs as its payload, then clear state and signal end of stream on the
    // following call.
    if (payloadStream != null) {
      termAttribute.setEmpty().append(categoryListTermText);
      payload.setData(payloadStream.convertStreamToByteArray());
      payloadAttribute.setPayload(payload);
      payloadStream = null;
      return true;
    }
    return false;
  }
}

View File

@ -0,0 +1,73 @@
package org.apache.lucene.facet.enhancements.association;
import org.apache.lucene.facet.index.attributes.CategoryAttribute;
import org.apache.lucene.facet.index.attributes.CategoryProperty;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A {@link CategoryProperty} associating a single integer value to a
* {@link CategoryAttribute}. It should be used to describe the association
* between the category and the document.
* <p>
 * This class leaves the definition of the
 * {@link #merge(CategoryProperty)} policy for the integer associations to
 * extending classes.
* <p>
* <B>Note:</B> The association value is added both to a special category list,
* and to the category tokens.
*
* @see AssociationEnhancement
* @lucene.experimental
*/
public abstract class AssociationProperty implements CategoryProperty {

  // Sentinel meaning "no association set": 2^31, deliberately outside the
  // int range. Must be computed in long arithmetic ("+ 1L"): the previous
  // "Integer.MAX_VALUE + 1" overflowed as an int to Integer.MIN_VALUE,
  // which IS inside the int range, so hasBeenSet() reported true even for
  // an unset association.
  protected long association = Integer.MAX_VALUE + 1L;

  /**
   * Construct an {@link AssociationProperty}.
   *
   * @param value
   *            The association value.
   */
  public AssociationProperty(int value) {
    this.association = value;
  }

  /**
   * Returns the association value.
   *
   * @return The association value.
   */
  public int getAssociation() {
    return (int) association;
  }

  /**
   * Returns whether this attribute has been set (not all categories have an
   * association). True exactly when the stored value fits in an int, i.e.
   * it is not the out-of-range sentinel.
   */
  public boolean hasBeenSet() {
    return this.association <= Integer.MAX_VALUE;
  }

  @Override
  public String toString() {
    return getClass().getSimpleName() + ": " + association;
  }
}

View File

@ -0,0 +1,235 @@
package org.apache.lucene.facet.enhancements.association;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.search.PayloadIntDecodingIterator;
import org.apache.lucene.util.collections.IntIterator;
import org.apache.lucene.util.collections.IntToIntMap;
import org.apache.lucene.util.encoding.SimpleIntDecoder;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Allows easy iteration over the associations payload, decoding it and
 * breaking it into (ordinal, value) pairs, stored in a hash.
*
* @lucene.experimental
*/
public class AssociationsPayloadIterator {

  /**
   * Default Term for associations
   */
  public static final Term ASSOCIATION_POSTING_TERM = new Term(
      CategoryListParams.DEFAULT_TERM.field(),
      AssociationEnhancement.CATEGORY_LIST_TERM_TEXT);

  /**
   * Hash mapping ordinals to the associated int value for the current
   * document (repopulated on every {@link #setNextDoc(int)}).
   */
  private IntToIntMap ordinalToAssociationMap;

  /**
   * An inner payload decoder which actually goes through the posting and
   * decodes the ints representing the ordinals and the values
   */
  private PayloadIntDecodingIterator associationPayloadIter;

  /**
   * Marking whether there are associations (at all) in the given index
   */
  private boolean hasAssociations = false;

  /**
   * The long-special-value returned for ordinals which have no associated int
   * value. It is not in the int range of values making it a valid mark.
   * <p>
   * Must be computed in long arithmetic ("+ 1L"): the previous
   * {@code Integer.MAX_VALUE + 1} overflowed as an int to
   * {@code Integer.MIN_VALUE}, which IS a valid int and could collide with a
   * real association value.
   */
  public final static long NO_ASSOCIATION = Integer.MAX_VALUE + 1L;

  /**
   * Construct a new association-iterator, initializing the inner payload
   * iterator, with the supplied term and checking whether there are any
   * associations within the given index
   *
   * @param reader
   *            a reader containing the postings to be iterated
   * @param field
   *            the field containing the relevant associations list term
   */
  public AssociationsPayloadIterator(IndexReader reader, String field)
      throws IOException {
    // Initialize the payloadDecodingIterator
    associationPayloadIter = new PayloadIntDecodingIterator(
        reader,
        // TODO (Facet): should consolidate with AssociationListTokenizer which
        // uses AssociationEnhancement.getCatTermText()
        new Term(field, AssociationEnhancement.CATEGORY_LIST_TERM_TEXT),
        new SimpleIntDecoder());

    // Check whether there are any associations
    hasAssociations = associationPayloadIter.init();

    ordinalToAssociationMap = new IntToIntMap();
  }

  /**
   * Skipping to the next document, fetching its associations and populating
   * the map.
   *
   * @param docId
   *            document id to be skipped to
   * @return true if the document contains associations and they were fetched
   *         correctly. false otherwise.
   * @throws IOException
   *             on error
   */
  public boolean setNextDoc(int docId) throws IOException {
    ordinalToAssociationMap.clear();
    boolean docContainsAssociations = false;
    try {
      docContainsAssociations = fetchAssociations(docId);
    } catch (IOException e) {
      // Wrap with document context, preserving the original cause.
      IOException ioe = new IOException(
          "An error occurred while reading a document's associations payload (docId="
              + docId + ")");
      ioe.initCause(e);
      throw ioe;
    }

    return docContainsAssociations;
  }

  /**
   * Get int association value for the given ordinal. <br>
   * The return is either an int value casted as long if the ordinal has an
   * associated value. Otherwise the returned value would be
   * {@link #NO_ASSOCIATION} which is 'pure long' value (e.g not in the int
   * range of values)
   *
   * @param ordinal
   *            for which the association value is requested
   * @return the associated int value (encapsulated in a long) if the ordinal
   *         had an associated value, or {@link #NO_ASSOCIATION} otherwise
   */
  public long getAssociation(int ordinal) {
    if (ordinalToAssociationMap.containsKey(ordinal)) {
      return ordinalToAssociationMap.get(ordinal);
    }

    return NO_ASSOCIATION;
  }

  /**
   * Get an iterator over the ordinals which have an association for the
   * document set by {@link #setNextDoc(int)}.
   */
  public IntIterator getAssociatedOrdinals() {
    return ordinalToAssociationMap.keyIterator();
  }

  /**
   * Skips to the given docId, getting the values in pairs of (ordinal, value)
   * and populating the map
   *
   * @param docId
   *            document id owning the associations
   * @return true if associations were fetched successfully, false otherwise
   * @throws IOException
   *             on error
   */
  private boolean fetchAssociations(int docId) throws IOException {
    // No associations at all? don't bother trying to seek the docID in the
    // posting
    if (!hasAssociations) {
      return false;
    }

    // No associations for this document? well, nothing to decode then,
    // return false
    if (!associationPayloadIter.skipTo(docId)) {
      return false;
    }

    // loop over all the values decoded from the payload in pairs.
    for (;;) {
      // Get the ordinal
      long ordinal = associationPayloadIter.nextCategory();

      // if no ordinal - it's the end of data, break the loop
      if (ordinal > Integer.MAX_VALUE) {
        break;
      }

      // get the associated value
      long association = associationPayloadIter.nextCategory();
      // If we're at this step - it means we have an ordinal, do we have
      // an association for it?
      if (association > Integer.MAX_VALUE) {
        // No association - a broken (ordinal, value) pair: fail loudly.
        throw new IOException(
            "ERROR! Associations should come in pairs of (ordinal, value), yet this payload has an odd number of values! (docId="
                + docId + ")");
      }

      // Populate the map with the given ordinal and association pair
      ordinalToAssociationMap.put((int) ordinal, (int) association);
    }

    return true;
  }

  @Override
  public int hashCode() {
    final int prime = 31;
    int result = 1;
    result = prime
        * result
        + ((associationPayloadIter == null) ? 0
            : associationPayloadIter.hashCode());
    return result;
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (obj == null) {
      return false;
    }
    if (getClass() != obj.getClass()) {
      return false;
    }
    AssociationsPayloadIterator other = (AssociationsPayloadIterator) obj;
    if (associationPayloadIter == null) {
      if (other.associationPayloadIter != null) {
        return false;
      }
    } else if (!associationPayloadIter.equals(other.associationPayloadIter)) {
      return false;
    }
    return true;
  }
}

View File

@ -0,0 +1,13 @@
<html>
<head>
<title>Association category enhancements</title>
</head>
<body>
<h1>Association category enhancements</h1>
A {@link org.apache.lucene.facet.enhancements.CategoryEnhancement CategoryEnhancement}
for adding associations data to the index (categories with
{@link org.apache.lucene.facet.enhancements.association.AssociationProperty AssociationProperty}s).
</body>
</html>

View File

@ -0,0 +1,32 @@
<html>
<head>
<title>Enhanced category features</title>
</head>
<body>
<h1>Enhanced category features</h1>
Mechanisms for addition of enhanced category features.
<p>A {@link org.apache.lucene.facet.enhancements.CategoryEnhancement CategoryEnhancement}
(which can correspond to a
{@link org.apache.lucene.facet.index.attributes.CategoryProperty CategoryProperty})
can contribute to the index in two possible ways:
<ol>
<li>To each category with data relevant to the enhancement,
add this data to the category's token payload, through
{@link org.apache.lucene.facet.enhancements.CategoryEnhancement#getCategoryTokenBytes(CategoryAttribute) CategoryEnhancement.getCategoryTokenBytes()}.
This data will be read during search using
{@link org.apache.lucene.facet.enhancements.CategoryEnhancement#extractCategoryTokenData(byte[], int, int) CategoryEnhancement.extractCategoryTokenData()}.
</li>
<li>To each document which contains categories with data relevant to the enhancement, add a
{@link org.apache.lucene.facet.index.streaming.CategoryListTokenizer CategoryListTokenizer} through
{@link org.apache.lucene.facet.enhancements.CategoryEnhancement#getCategoryListTokenizer CategoryEnhancement.getCategoryListTokenizer()} .
The
{@link org.apache.lucene.facet.index.streaming.CategoryListTokenizer CategoryListTokenizer}
should add a single token which includes all the enhancement relevant data from the categories.
The category list token's text is defined by
{@link org.apache.lucene.facet.enhancements.CategoryEnhancement#getCategoryListTermText() CategoryEnhancement.getCategoryListTermText()}.
</li>
</ol>
</body>
</html>

View File

@ -0,0 +1,98 @@
package org.apache.lucene.facet.enhancements.params;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.facet.enhancements.CategoryEnhancement;
import org.apache.lucene.facet.index.attributes.CategoryProperty;
import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.index.params.PerDimensionIndexingParams;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Default implementation of {@link EnhancementsIndexingParams}
*
* @lucene.experimental
*/
public class DefaultEnhancementsIndexingParams extends
    PerDimensionIndexingParams implements EnhancementsIndexingParams {

  // Enhancements in registration order; order dictates payload layout
  // (see EnhancementsIndexingParams#getCategoryEnhancements()).
  private List<CategoryEnhancement> enhancedCategories;

  /**
   * Construct with a certain {@link CategoryEnhancement enhancement}
   *
   * @param enhancements
   *            the enhancements to register; at least one is required
   * @throws IllegalArgumentException if no enhancements are provided
   */
  public DefaultEnhancementsIndexingParams(CategoryEnhancement... enhancements) {
    super();
    validateParams(enhancements);
    addCategoryEnhancements(enhancements);
  }

  // Renamed from 'validateparams' to follow lowerCamelCase method naming.
  private void validateParams(CategoryEnhancement... enhancements) {
    if (enhancements == null || enhancements.length < 1) {
      throw new IllegalArgumentException("at least one enhancement is required");
    }
  }

  /**
   * Construct with certain {@link CategoryEnhancement enhancements}
   * and {@link CategoryListParams}
   *
   * @param categoryListParams
   *            the category list parameters to use
   * @param enhancements
   *            the enhancements to register; at least one is required
   * @throws IllegalArgumentException if no enhancements are provided
   */
  public DefaultEnhancementsIndexingParams(
      CategoryListParams categoryListParams,
      CategoryEnhancement... enhancements) {
    super(categoryListParams);
    validateParams(enhancements);
    addCategoryEnhancements(enhancements);
  }

  public void addCategoryEnhancements(CategoryEnhancement... enhancements) {
    if (enhancedCategories == null) {
      enhancedCategories = new ArrayList<CategoryEnhancement>();
    }
    for (CategoryEnhancement categoryEnhancement : enhancements) {
      enhancedCategories.add(categoryEnhancement);
    }
  }

  public List<CategoryEnhancement> getCategoryEnhancements() {
    if (enhancedCategories == null || enhancedCategories.isEmpty()) {
      return null;
    }
    return enhancedCategories;
  }

  public List<Class<? extends CategoryProperty>> getRetainableProperties() {
    if (enhancedCategories == null) {
      return null;
    }
    // Collect the retainable property class (if any) of each enhancement.
    List<Class<? extends CategoryProperty>> retainableProperties =
        new ArrayList<Class<? extends CategoryProperty>>();
    for (CategoryEnhancement enhancement : enhancedCategories) {
      if (enhancement.getRetainableProperty() != null) {
        retainableProperties.add(enhancement.getRetainableProperty());
      }
    }
    if (retainableProperties.isEmpty()) {
      return null;
    }
    return retainableProperties;
  }
}

View File

@ -0,0 +1,66 @@
package org.apache.lucene.facet.enhancements.params;
import java.util.List;
import org.apache.lucene.facet.enhancements.CategoryEnhancement;
import org.apache.lucene.facet.enhancements.EnhancementsDocumentBuilder;
import org.apache.lucene.facet.index.attributes.CategoryProperty;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.index.streaming.CategoryParentsStream;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* {@link FacetIndexingParams Facet indexing parameters} for defining
* {@link CategoryEnhancement category enhancements}. It must contain at least
* one enhancement, otherwise nothing is "enhanced" about it. When there are
* more than one, the order matters - see {@link #getCategoryEnhancements()}.
*
* @see EnhancementsDocumentBuilder
* @lucene.experimental
*/
public interface EnhancementsIndexingParams extends FacetIndexingParams {

  /**
   * Add {@link CategoryEnhancement}s to the indexing parameters.
   *
   * @param enhancements enhancements to add
   */
  public void addCategoryEnhancements(CategoryEnhancement... enhancements);

  /**
   * Get a list of the active category enhancements. If no enhancements exist
   * return {@code null}. The order of enhancements in the returned list
   * dictates the order in which the enhancements data appear in the category
   * tokens payload.
   *
   * @return A list of the active category enhancements, or {@code null} if
   *         there are no enhancements.
   */
  public List<CategoryEnhancement> getCategoryEnhancements();

  /**
   * Get a list of {@link CategoryProperty} classes to be retained when
   * creating {@link CategoryParentsStream}.
   *
   * @return the list of {@link CategoryProperty} classes to be retained when
   *         creating {@link CategoryParentsStream}, or {@code null} if there
   *         are no such properties.
   */
  public List<Class<? extends CategoryProperty>> getRetainableProperties();
}

View File

@ -0,0 +1,16 @@
<html>
<head>
<title>Enhanced category features</title>
</head>
<body>
<h1>Enhanced category features</h1>
{@link org.apache.lucene.facet.index.params.FacetIndexingParams FacetIndexingParams}
used by
{@link org.apache.lucene.facet.enhancements.EnhancementsDocumentBuilder EnhancementsDocumentBuilder}
for adding
{@link org.apache.lucene.facet.enhancements.CategoryEnhancement CategoryEnhancement}s
to the indexing parameters, and accessing them during indexing and search.
</body>
</html>

View File

@ -0,0 +1,282 @@
package org.apache.lucene.facet.index;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.facet.FacetException;
import org.apache.lucene.facet.index.attributes.CategoryAttribute;
import org.apache.lucene.facet.index.attributes.CategoryAttributeImpl;
import org.apache.lucene.facet.index.attributes.CategoryProperty;
import org.apache.lucene.facet.taxonomy.CategoryPath;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * A container to add categories which are to be introduced to
 * {@link CategoryDocumentBuilder#setCategories(Iterable)}. Categories can be
 * added with Properties.
 * <p>
 * Each distinct {@link CategoryPath} is held once; adding the same path again
 * merges the new properties into the existing {@link CategoryAttribute}.
 *
 * @lucene.experimental
 */
public class CategoryContainer implements Iterable<CategoryAttribute>, Serializable {

  // Maps each category path to its single CategoryAttribute. Transient
  // because Lucene Attributes are not guaranteed Serializable; the custom
  // writeObject/readObject methods below persist the contents instead.
  protected transient Map<CategoryPath, CategoryAttribute> map;

  /**
   * Constructor. Creates an empty container.
   */
  public CategoryContainer() {
    map = new HashMap<CategoryPath, CategoryAttribute>();
  }

  /**
   * Add a category.
   *
   * @param categoryPath
   *            The path of the category.
   * @return The {@link CategoryAttribute} of the category.
   */
  public CategoryAttribute addCategory(CategoryPath categoryPath) {
    return mapCategoryAttribute(categoryPath);
  }

  /**
   * Add a category with a property.
   *
   * @param categoryPath
   *            The path of the category.
   * @param property
   *            The property to associate to the category.
   * @return The {@link CategoryAttribute} of the category.
   */
  public CategoryAttribute addCategory(CategoryPath categoryPath,
      CategoryProperty property) {
    /*
     * This method is a special case of addCategory with multiple
     * properties, but it is kept here for two reasons: 1) Using the array
     * version has some performance cost, and 2) it is expected that most
     * calls will be for this version (single property).
     */
    CategoryAttribute ca = mapCategoryAttribute(categoryPath);
    ca.addProperty(property);
    return ca;
  }

  /**
   * Add a category with multiple properties.
   *
   * @param categoryPath
   *            The path of the category.
   * @param properties
   *            The properties to associate to the category.
   * @return The {@link CategoryAttribute} of the category.
   * @throws FacetException
   *             When the category already has a property of the same type as
   *             one of the new properties, and merging for this property type
   *             is prohibited.
   */
  public CategoryAttribute addCategory(CategoryPath categoryPath,
      CategoryProperty... properties) throws FacetException {
    CategoryAttribute ca = mapCategoryAttribute(categoryPath);
    for (CategoryProperty attribute : properties) {
      ca.addProperty(attribute);
    }
    return ca;
  }

  /**
   * Add an entire {@link CategoryAttribute}.
   * The properties of the given attribute are merged into the attribute
   * already held for the same category path, if one exists.
   *
   * @param categoryAttribute
   *            The {@link CategoryAttribute} to add.
   * @return The {@link CategoryAttribute} of the category (could be different
   *         from the one provided).
   * @throws FacetException
   */
  public CategoryAttribute addCategory(CategoryAttribute categoryAttribute)
      throws FacetException {
    CategoryAttribute ca = mapCategoryAttribute(categoryAttribute
        .getCategoryPath());
    Set<Class<? extends CategoryProperty>> propertyClasses = categoryAttribute
        .getPropertyClasses();
    if (propertyClasses != null) {
      for (Class<? extends CategoryProperty> propertyClass : propertyClasses) {
        ca.addProperty(categoryAttribute.getProperty(propertyClass));
      }
    }
    return ca;
  }

  /**
   * Get the {@link CategoryAttribute} object for a specific
   * {@link CategoryPath}, from the map.
   * Creates and registers a new attribute if the path is not yet present
   * (get-or-create).
   */
  private final CategoryAttribute mapCategoryAttribute(
      CategoryPath categoryPath) {
    CategoryAttribute ca = map.get(categoryPath);
    if (ca == null) {
      ca = new CategoryAttributeImpl(categoryPath);
      map.put(categoryPath, ca);
    }
    return ca;
  }

  /**
   * Get the {@link CategoryAttribute} this container has for a certain
   * category, or {@code null} if the category is not in the container.
   * Unlike {@link #addCategory(CategoryPath)}, this never creates an entry.
   *
   * @param categoryPath
   *            The category path of the requested category.
   */
  public CategoryAttribute getCategoryAttribute(CategoryPath categoryPath) {
    return map.get(categoryPath);
  }

  // Iteration order is unspecified (backed by a HashMap).
  public Iterator<CategoryAttribute> iterator() {
    return map.values().iterator();
  }

  /**
   * Remove all categories.
   */
  public void clear() {
    map.clear();
  }

  /**
   * Add the categories from another {@link CategoryContainer} to this one.
   *
   * @param other
   *            The {@link CategoryContainer} to take categories from.
   * @throws FacetException
   *             If any prohibited merge of category properties is attempted.
   */
  public void merge(CategoryContainer other) throws FacetException {
    for (CategoryAttribute categoryAttribute : other.map.values()) {
      addCategory(categoryAttribute);
    }
  }

  /**
   * Get the number of categories in the container.
   *
   * @return The number of categories in the container.
   */
  public int size() {
    return map.size();
  }

  @Override
  public String toString() {
    StringBuilder builder = new StringBuilder("CategoryContainer");
    for (CategoryAttribute ca : map.values()) {
      builder.append('\n');
      builder.append('\t');
      builder.append(ca.toString());
    }
    return builder.toString();
  }

  /**
   * Serialize object content to given {@link ObjectOutputStream}.
   * Part of the custom serialization protocol required because {@link #map}
   * is transient; must remain private with this exact signature.
   */
  private void writeObject(ObjectOutputStream out) throws IOException {
    out.defaultWriteObject();
    // write the number of categories
    out.writeInt(size());
    // write the category attributes
    for (CategoryAttribute ca : this) {
      serializeCategoryAttribute(out, ca);
    }
  }

  /**
   * Serialize each of the {@link CategoryAttribute}s to the given
   * {@link ObjectOutputStream}.<br>
   * NOTE: {@link CategoryProperty}s are {@link Serializable}, but do not
   * assume that Lucene's {@link Attribute}s are as well
   * @throws IOException
   */
  protected void serializeCategoryAttribute(ObjectOutputStream out,
      CategoryAttribute ca) throws IOException {
    out.writeObject(ca.getCategoryPath());
    Set<Class<? extends CategoryProperty>> propertyClasses = ca.getPropertyClasses();
    if (propertyClasses != null) {
      out.writeInt(propertyClasses.size());
      for (Class<? extends CategoryProperty> clazz : propertyClasses) {
        out.writeObject(ca.getProperty(clazz));
      }
    } else {
      // no properties: record an explicit zero count for the reader
      out.writeInt(0);
    }
  }

  /**
   * Deserialize object from given {@link ObjectInputStream}.
   * Counterpart of {@link #writeObject(ObjectOutputStream)}; rebuilds the
   * transient {@link #map} before replaying the serialized categories.
   */
  private void readObject(ObjectInputStream in) throws IOException,
      ClassNotFoundException {
    in.defaultReadObject();
    map = new HashMap<CategoryPath, CategoryAttribute>();
    int size = in.readInt();
    for (int i = 0; i < size; i++) {
      deserializeCategoryAttribute(in);
    }
  }

  /**
   * De-Serialize each of the {@link CategoryAttribute}s from the given
   * {@link ObjectInputStream}.
   * Each property is re-added through {@link #addCategory(CategoryPath, CategoryProperty)},
   * so all properties of one category merge back into a single attribute.
   */
  protected void deserializeCategoryAttribute(ObjectInputStream in)
      throws IOException, ClassNotFoundException {
    CategoryPath cp = (CategoryPath) in.readObject();
    int nProperties = in.readInt();
    if (nProperties == 0) {
      addCategory(cp);
    } else {
      for (int j = 0; j < nProperties; j++) {
        CategoryProperty property = (CategoryProperty) in.readObject();
        addCategory(cp, property);
      }
    }
  }

  @Override
  public boolean equals(Object o) {
    if (! (o instanceof CategoryContainer)) {
      return false;
    }
    // equality is defined solely by the category->attribute map contents
    CategoryContainer that = (CategoryContainer)o;
    return this.map.equals(that.map);
  }

  @Override
  public int hashCode() {
    return map.hashCode();
  }
}

View File

@ -0,0 +1,298 @@
package org.apache.lucene.facet.index;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.DocumentBuilder;
import org.apache.lucene.facet.index.attributes.CategoryAttribute;
import org.apache.lucene.facet.index.attributes.CategoryAttributesIterable;
import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy;
import org.apache.lucene.facet.index.categorypolicy.PathPolicy;
import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.index.streaming.CategoryAttributesStream;
import org.apache.lucene.facet.index.streaming.CategoryListTokenizer;
import org.apache.lucene.facet.index.streaming.CategoryParentsStream;
import org.apache.lucene.facet.index.streaming.CategoryTokenizer;
import org.apache.lucene.facet.index.streaming.CountingListTokenizer;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * A utility class which allows attachment of {@link CategoryPath}s or
 * {@link CategoryAttribute}s to a given document using a taxonomy.<br>
 * Construction could be done with either a given {@link FacetIndexingParams} or
 * the default implementation {@link DefaultFacetIndexingParams}.<br>
 * A CategoryDocumentBuilder can be reused by repeatedly setting the categories
 * and building the document. Categories are provided either as
 * {@link CategoryAttribute} elements through {@link #setCategories(Iterable)},
 * or as {@link CategoryPath} elements through
 * {@link #setCategoryPaths(Iterable)}.
 * <p>
 * Note that both {@link #setCategories(Iterable)} and
 * {@link #setCategoryPaths(Iterable)} return this
 * {@link CategoryDocumentBuilder}, allowing the following pattern: {@code new
 * CategoryDocumentBuilder(taxonomy,
 * params).setCategories(categories).build(doc)}.
 *
 * @lucene.experimental
 */
public class CategoryDocumentBuilder implements DocumentBuilder {

  /**
   * A {@link TaxonomyWriter} for adding categories and retrieving their
   * ordinals.
   */
  protected final TaxonomyWriter taxonomyWriter;

  /**
   * Parameters to be used when indexing categories.
   */
  protected final FacetIndexingParams indexingParams;

  /**
   * A list of fields which is filled at ancestors' construction and used
   * during {@link CategoryDocumentBuilder#build(Document)}.
   */
  protected final ArrayList<Field> fieldList = new ArrayList<Field>();

  // Maps a category-list field name to the categories indexed into it;
  // rebuilt on every setCategories() call (see fillCategoriesMap).
  protected Map<String, List<CategoryAttribute>> categoriesMap;

  /**
   * Creating a facets document builder with default facet indexing
   * parameters.<br>
   * See:
   * {@link #CategoryDocumentBuilder(TaxonomyWriter, FacetIndexingParams)}
   *
   * @param taxonomyWriter
   *            to which new categories will be added, as well as translating
   *            known categories to ordinals
   * @throws IOException
   *
   */
  public CategoryDocumentBuilder(TaxonomyWriter taxonomyWriter)
      throws IOException {
    this(taxonomyWriter, new DefaultFacetIndexingParams());
  }

  /**
   * Creating a facets document builder with a given facet indexing parameters
   * object.<br>
   *
   * @param taxonomyWriter
   *            to which new categories will be added, as well as translating
   *            known categories to ordinals
   * @param params
   *            holds all parameters the indexing process should use such as
   *            category-list parameters
   * @throws IOException
   */
  public CategoryDocumentBuilder(TaxonomyWriter taxonomyWriter,
      FacetIndexingParams params) throws IOException {
    this.taxonomyWriter = taxonomyWriter;
    this.indexingParams = params;
    this.categoriesMap = new HashMap<String, List<CategoryAttribute>>();
  }

  /**
   * Set the categories of the document builder from an {@link Iterable} of
   * {@link CategoryPath} objects.
   *
   * @param categoryPaths
   *            An iterable of CategoryPath objects which holds the categories
   *            (facets) which will be added to the document at
   *            {@link #build(Document)}
   * @return This CategoryDocumentBuilder, to enable this one line call:
   *         {@code new} {@link #CategoryDocumentBuilder(TaxonomyWriter)}.
   *         {@link #setCategoryPaths(Iterable)}.{@link #build(Document)}.
   * @throws IOException
   */
  public CategoryDocumentBuilder setCategoryPaths(
      Iterable<CategoryPath> categoryPaths) throws IOException {
    if (categoryPaths == null) {
      // null means "no categories": drop any previously prepared fields
      fieldList.clear();
      return this;
    }
    // wrap the paths as CategoryAttributes and delegate to setCategories
    return setCategories(new CategoryAttributesIterable(categoryPaths));
  }

  /**
   * Set the categories of the document builder from an {@link Iterable} of
   * {@link CategoryAttribute} objects.
   *
   * @param categories
   *            An iterable of {@link CategoryAttribute} objects which holds
   *            the categories (facets) which will be added to the document at
   *            {@link #build(Document)}
   * @return This CategoryDocumentBuilder, to enable this one line call:
   *         {@code new} {@link #CategoryDocumentBuilder(TaxonomyWriter)}.
   *         {@link #setCategories(Iterable)}.{@link #build(Document)}.
   * @throws IOException
   */
  public CategoryDocumentBuilder setCategories(
      Iterable<CategoryAttribute> categories) throws IOException {
    fieldList.clear();
    if (categories == null) {
      return this;
    }
    // get field-name to a list of facets mapping as different facets could
    // be added to different category-lists on different fields
    fillCategoriesMap(categories);
    // creates a different stream for each different field
    for (Entry<String, List<CategoryAttribute>> e : categoriesMap
        .entrySet()) {
      // create a category attributes stream for the array of facets
      CategoryAttributesStream categoryAttributesStream = new CategoryAttributesStream(
          e.getValue());
      // Set a suitable {@link TokenStream} using
      // CategoryParentsStream, followed by CategoryListTokenizer and
      // CategoryTokenizer composition (the ordering of the last two is
      // not mandatory).
      CategoryParentsStream parentsStream = (CategoryParentsStream) getParentsStream(categoryAttributesStream);
      CategoryListTokenizer categoryListTokenizer = getCategoryListTokenizer(parentsStream);
      CategoryTokenizer stream = getCategoryTokenizer(categoryListTokenizer);
      // Finally creating a suitable field with stream and adding it to a
      // master field-list, used during the build process (see
      // super.build())
      fieldList.add(new Field(e.getKey(), stream));
    }
    return this;
  }

  /**
   * Get a stream of categories which includes the parents, according to
   * policies defined in indexing parameters.
   *
   * @param categoryAttributesStream
   *            The input stream
   * @return The parents stream.
   * @see OrdinalPolicy OrdinalPolicy (for policy of adding category tokens for parents)
   * @see PathPolicy PathPolicy (for policy of adding category <b>list</b> tokens for parents)
   */
  protected TokenStream getParentsStream(
      CategoryAttributesStream categoryAttributesStream) {
    return new CategoryParentsStream(categoryAttributesStream,
        taxonomyWriter, indexingParams);
  }

  /**
   * Fills the categories mapping between a field name and a list of
   * categories that belongs to it according to this builder's
   * {@link FacetIndexingParams} object
   *
   * @param categories
   *            Iterable over the category attributes
   */
  protected void fillCategoriesMap(Iterable<CategoryAttribute> categories)
      throws IOException {
    categoriesMap.clear();
    // for-each category
    for (CategoryAttribute category : categories) {
      // extracting the field-name to which this category belongs
      String fieldName = indexingParams.getCategoryListParams(
          category.getCategoryPath()).getTerm().field();
      // getting the list of categories which belongs to that field
      List<CategoryAttribute> list = categoriesMap.get(fieldName);
      // if no such list exists
      if (list == null) {
        // adding a new one to the map
        list = new ArrayList<CategoryAttribute>();
        categoriesMap.put(fieldName, list);
      }
      // adding the new category to the list; cloned so later mutation of
      // the caller's attribute does not affect this builder
      list.add(category.clone());
    }
  }

  /**
   * Get a category list tokenizer (or a series of such tokenizers) to create
   * the <b>category list tokens</b>.
   *
   * @param categoryStream
   *            A stream containing {@link CategoryAttribute} with the
   *            relevant data.
   * @return The category list tokenizer (or series of tokenizers) to be used
   *         in creating category list tokens.
   */
  protected CategoryListTokenizer getCategoryListTokenizer(
      TokenStream categoryStream) {
    return getCountingListTokenizer(categoryStream);
  }

  /**
   * Get a {@link CountingListTokenizer} for creating counting list token.
   *
   * @param categoryStream
   *            A stream containing {@link CategoryAttribute}s with the
   *            relevant data.
   * @return A counting list tokenizer to be used in creating counting list
   *         token.
   */
  protected CountingListTokenizer getCountingListTokenizer(
      TokenStream categoryStream) {
    return new CountingListTokenizer(categoryStream, indexingParams);
  }

  /**
   * Get a {@link CategoryTokenizer} to create the <b>category tokens</b>.
   * This method can be overridden for adding more attributes to the category
   * tokens.
   *
   * @param categoryStream
   *            A stream containing {@link CategoryAttribute} with the
   *            relevant data.
   * @return The {@link CategoryTokenizer} to be used in creating category
   *         tokens.
   * @throws IOException
   */
  protected CategoryTokenizer getCategoryTokenizer(TokenStream categoryStream)
      throws IOException {
    return new CategoryTokenizer(categoryStream, indexingParams);
  }

  /**
   * Adds the fields created in one of the "set" methods to the document
   */
  public Document build(Document doc) {
    for (Field f : fieldList) {
      // facet fields are not used for scoring, so norms are unnecessary
      f.setOmitNorms(true);
      doc.add(f);
    }
    return doc;
  }
}

View File

@ -0,0 +1,65 @@
package org.apache.lucene.facet.index;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import org.apache.lucene.util.encoding.IntEncoder;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Accumulates category IDs for a single document, for writing in byte array
 * form, for example, to a Lucene Payload.
 *
 * @lucene.experimental
 */
public class CategoryListPayloadStream {

  // Backing buffer for the encoded bytes. 50 bytes suits typical
  // per-document payloads; the buffer grows automatically if needed.
  // Never reassigned (reset() reuses it via reInit), hence final.
  private final ByteArrayOutputStream baos = new ByteArrayOutputStream(50);

  // Encoder that writes appended integers into baos; never reassigned.
  private final IntEncoder encoder;

  /**
   * Creates a Payload stream using the specified encoder.
   *
   * @param encoder the {@link IntEncoder} used to encode appended integers
   */
  public CategoryListPayloadStream(IntEncoder encoder) {
    this.encoder = encoder;
    this.encoder.reInit(baos);
  }

  /**
   * Appends an integer to the stream.
   *
   * @param intValue the value to encode
   * @throws IOException propagated from the encoder
   */
  public void appendIntToStream(int intValue) throws IOException {
    encoder.encode(intValue);
  }

  /**
   * Returns the streamed bytes so far accumulated, as an array of bytes.
   * Closes the encoder first so any buffered values are flushed.
   */
  public byte[] convertStreamToByteArray() {
    try {
      encoder.close();
      return baos.toByteArray();
    } catch (IOException e) {
      // This cannot happen, because of BAOS (no I/O).
      return new byte[0];
    }
  }

  /**
   * Resets this stream to begin building a new payload: flushes and closes
   * the encoder, clears the buffer, and re-initializes the encoder on it.
   */
  public void reset() throws IOException {
    encoder.close();
    baos.reset();
    encoder.reInit(baos);
  }
}

View File

@ -0,0 +1,188 @@
package org.apache.lucene.facet.index;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.index.PayloadProcessorProvider;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter.OrdinalMap;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.encoding.IntDecoder;
import org.apache.lucene.util.encoding.IntEncoder;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * A {@link PayloadProcessorProvider} for updating facets ordinal references,
 * based on an ordinal map. You should use this code in conjunction with merging
 * taxonomies - after you merge taxonomies, you receive an {@link OrdinalMap}
 * which maps the 'old' payloads to the 'new' ones. You can use that map to
 * re-map the payloads which contain the facets information (ordinals) either
 * before or while merging the indexes.
 * <p>
 * For re-mapping the ordinals before you merge the indexes, do the following:
 *
 * <pre>
 * // merge the old taxonomy with the new one.
 * OrdinalMap map = LuceneTaxonomyWriter.addTaxonomies();
 * int[] ordmap = map.getMap();
 *
 * // re-map the ordinals on the old directory.
 * Directory oldDir;
 * // fip = the FacetIndexingParams that were used when indexing oldDir
 * FacetsPayloadProcessorProvider fppp = new FacetsPayloadProcessorProvider(
 *     oldDir, ordmap, fip);
 * IndexWriterConfig conf = new IndexWriterConfig(VER, ANALYZER);
 * conf.setMergePolicy(new ForceOptimizeMergePolicy());
 * IndexWriter writer = new IndexWriter(oldDir, conf);
 * writer.setPayloadProcessorProvider(fppp);
 * writer.optimize();
 * writer.close();
 *
 * // merge that directory with the new index.
 * IndexWriter newWriter; // opened on the 'new' Directory
 * newWriter.addIndexes(oldDir);
 * newWriter.commit();
 * </pre>
 *
 * For re-mapping the ordinals during index merge, do the following:
 *
 * <pre>
 * // merge the old taxonomy with the new one.
 * OrdinalMap map = LuceneTaxonomyWriter.addTaxonomies();
 * int[] ordmap = map.getMap();
 *
 * // Add the index and re-map ordinals on the go
 * IndexReader r = IndexReader.open(oldDir);
 * IndexWriterConfig conf = new IndexWriterConfig(VER, ANALYZER);
 * IndexWriter writer = new IndexWriter(newDir, conf);
 * writer.setPayloadProcessorProvider(fppp);
 * writer.addIndexes(r);
 * writer.commit();
 * </pre>
 * <p>
 * <b>NOTE:</b> while the second example looks simpler, IndexWriter may trigger
 * a long merge due to addIndexes. The first example avoids this perhaps
 * unneeded merge, as well as can be done separately (e.g. on another node)
 * before the index is merged.
 *
 * @lucene.experimental
 */
public class FacetsPayloadProcessorProvider extends PayloadProcessorProvider {

  // The only Directory whose payloads should be processed (compared by
  // identity in getDirProcessor).
  private final Directory workDir;

  // Single processor instance shared across all calls for workDir.
  private final DirPayloadProcessor dirProcessor;

  /**
   * Construct FacetsPayloadProcessorProvider with FacetIndexingParams
   *
   * @param dir the {@link Directory} containing the segments to update
   * @param ordinalMap an array mapping previous facets ordinals to new ones
   * @param indexingParams the facets indexing parameters
   */
  public FacetsPayloadProcessorProvider(Directory dir, int[] ordinalMap,
      FacetIndexingParams indexingParams) {
    workDir = dir;
    dirProcessor = new FacetsDirPayloadProcessor(indexingParams, ordinalMap);
  }

  @Override
  public DirPayloadProcessor getDirProcessor(Directory dir) throws IOException {
    // identity comparison on purpose: only the exact Directory instance
    // passed to the constructor gets its payloads re-mapped
    if (workDir != dir) {
      return null;
    }
    return dirProcessor;
  }

  /** A {@link DirPayloadProcessor} that selects a per-term processor for facet category-list terms. */
  public static class FacetsDirPayloadProcessor extends DirPayloadProcessor {

    // Maps each category-list Term to its CategoryListParams, so we can
    // recognize which terms carry facet ordinal payloads.
    private final Map<Term, CategoryListParams> termMap = new HashMap<Term, CategoryListParams>(1);

    private final int[] ordinalMap;

    /**
     * Construct FacetsDirPayloadProcessor with custom FacetIndexingParams
     * @param ordinalMap an array mapping previous facets ordinals to new ones
     * @param indexingParams the facets indexing parameters
     */
    protected FacetsDirPayloadProcessor(FacetIndexingParams indexingParams, int[] ordinalMap) {
      this.ordinalMap = ordinalMap;
      for (CategoryListParams params: indexingParams.getAllCategoryListParams()) {
        termMap.put(params.getTerm(), params);
      }
    }

    @Override
    public PayloadProcessor getProcessor(String field, BytesRef bytes) throws IOException {
      // TODO (Facet): don't create terms
      CategoryListParams params = termMap.get(new Term(field, bytes));
      if (params == null) {
        // not a facet category-list term - leave its payload untouched
        return null;
      }
      return new FacetsPayloadProcessor(params, ordinalMap);
    }

  }

  /** A PayloadProcessor for updating facets ordinal references, based on an ordinal map */
  public static class FacetsPayloadProcessor extends PayloadProcessor {

    private final IntEncoder encoder;
    private final IntDecoder decoder;
    private final int[] ordinalMap;
    // reusable output buffer for the re-encoded payload
    private final ByteArrayOutputStream os = new ByteArrayOutputStream();

    /**
     * @param params defines the encoding of facet ordinals as payload
     * @param ordinalMap an array mapping previous facets ordinals to new ones
     */
    protected FacetsPayloadProcessor(CategoryListParams params, int[] ordinalMap) {
      encoder = params.createEncoder();
      // decoder must match the encoding used at indexing time
      decoder = encoder.createMatchingDecoder();
      this.ordinalMap = ordinalMap;
    }

    @Override
    public void processPayload(BytesRef payload) throws IOException {
      // decode each old ordinal, translate through ordinalMap, re-encode
      InputStream is = new ByteArrayInputStream(payload.bytes, payload.offset, payload.length);
      decoder.reInit(is);
      os.reset();
      encoder.reInit(os);
      long ordinal;
      while ((ordinal = decoder.decode()) != IntDecoder.EOS) {
        int newOrdinal = ordinalMap[(int)ordinal];
        encoder.encode(newOrdinal);
      }
      encoder.close();
      // TODO (Facet): avoid copy?
      byte out[] = os.toByteArray();
      // rewrite the BytesRef in place to point at the re-encoded bytes
      payload.bytes = out;
      payload.offset = 0;
      payload.length = out.length;
    }
  }

}

View File

@ -0,0 +1,129 @@
package org.apache.lucene.facet.index.attributes;
import java.util.Collection;
import java.util.Set;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.facet.taxonomy.CategoryPath;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * An attribute which contains for a certain category the {@link CategoryPath}
 * and additional properties.
 *
 * @lucene.experimental
 */
public interface CategoryAttribute extends Attribute {

  /**
   * Set the content of this {@link CategoryAttribute} from another
   * {@link CategoryAttribute} object.
   *
   * @param other
   *            The {@link CategoryAttribute} to take the content from.
   */
  public void set(CategoryAttribute other);

  /**
   * Sets the category path value of this attribute.
   *
   * @param cp
   *            A category path. May not be null.
   */
  public void setCategoryPath(CategoryPath cp);

  /**
   * Returns the value of this attribute: a category path.
   *
   * @return The category path last assigned to this attribute, or null if
   *         none has been assigned.
   */
  public CategoryPath getCategoryPath();

  /**
   * Add a property. The property can be later retrieved using
   * {@link #getProperty(Class)} with this property class.<br>
   * Adding multiple properties of the same class is forbidden.
   *
   * @param property
   *            The property to add.
   * @throws UnsupportedOperationException
   *             When attempting to add a property of a class that was added
   *             before and merge is prohibited.
   */
  public void addProperty(CategoryProperty property)
      throws UnsupportedOperationException;

  /**
   * Get a property of a certain property class.
   *
   * @param propertyClass
   *            The required property class.
   * @return The property of the given class, or null if no such property
   *         exists.
   */
  public CategoryProperty getProperty(
      Class<? extends CategoryProperty> propertyClass);

  /**
   * Get a property of one of the given property classes.
   *
   * @param propertyClasses
   *            The property classes.
   * @return A property matching one of the given classes, or null if no such
   *         property exists.
   */
  public CategoryProperty getProperty(
      Collection<Class<? extends CategoryProperty>> propertyClasses);

  /**
   * Get all the active property classes.
   *
   * @return A set containing the active property classes, or {@code null} if
   *         there are no properties.
   */
  public Set<Class<? extends CategoryProperty>> getPropertyClasses();

  /**
   * Clone this {@link CategoryAttribute}.
   *
   * @return A clone of this {@link CategoryAttribute}.
   */
  public CategoryAttribute clone();

  /**
   * Resets this attribute to its initial value: a null category path and no
   * properties.
   */
  public void clear();

  /**
   * Clear all properties.
   */
  public void clearProperties();

  /**
   * Remove a property of a certain property class.
   *
   * @param propertyClass
   *            The required property class.
   */
  public void remove(Class<? extends CategoryProperty> propertyClass);
}

View File

@ -0,0 +1,192 @@
package org.apache.lucene.facet.index.attributes;
import java.util.Collection;
import java.util.HashMap;
import java.util.Set;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.facet.taxonomy.CategoryPath;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * An implementation of {@link CategoryAttribute}.
 * <p>
 * Holds a single {@link CategoryPath} plus an optional, lazily allocated map
 * of {@link CategoryProperty} instances, keyed by their concrete class.
 * 
 * @lucene.experimental
 */
public final class CategoryAttributeImpl extends AttributeImpl implements
    CategoryAttribute {

  /**
   * The category path instance. May be null until one is assigned.
   */
  protected CategoryPath categoryPath;

  /**
   * A map of properties associated to the current category path. Allocated
   * lazily on the first {@link #addProperty(CategoryProperty)} call; null
   * means no property was ever added.
   */
  protected HashMap<Class<? extends CategoryProperty>, CategoryProperty> properties;

  /**
   * Construct an empty CategoryAttributeImpl.
   */
  public CategoryAttributeImpl() {
    // do nothing
  }

  /**
   * Construct a CategoryAttributeImpl with the given CategoryPath.
   * 
   * @param categoryPath
   *            The category path to use.
   */
  public CategoryAttributeImpl(CategoryPath categoryPath) {
    setCategoryPath(categoryPath);
  }

  public void set(CategoryAttribute other) {
    ((CategoryAttributeImpl) other).copyTo(this);
  }

  /**
   * Returns the category path value.
   * 
   * @return The category path last assigned to this attribute, or null if
   *         none has been assigned.
   */
  public CategoryPath getCategoryPath() {
    return categoryPath;
  }

  public void setCategoryPath(CategoryPath cp) {
    categoryPath = cp;
  }

  public void addProperty(CategoryProperty property)
      throws UnsupportedOperationException {
    if (properties == null) {
      properties = new HashMap<Class<? extends CategoryProperty>, CategoryProperty>();
    }
    CategoryProperty existing = properties.get(property.getClass());
    if (existing == null) {
      properties.put(property.getClass(), property);
    } else {
      // A property of this class was already added: merge the new one into
      // it instead of replacing (may throw if merging is unsupported).
      existing.merge(property);
    }
  }

  public CategoryProperty getProperty(
      Class<? extends CategoryProperty> propertyClass) {
    if (properties == null) {
      return null;
    }
    return properties.get(propertyClass);
  }

  public CategoryProperty getProperty(
      Collection<Class<? extends CategoryProperty>> propertyClasses) {
    if (properties == null) {
      return null;
    }
    // Return the first matching property, in the collection's iteration order.
    for (Class<? extends CategoryProperty> propertyClass : propertyClasses) {
      CategoryProperty categoryProperty = properties.get(propertyClass);
      if (categoryProperty != null) {
        return categoryProperty;
      }
    }
    return null;
  }

  @Override
  public void copyTo(AttributeImpl target) {
    // NOTE: shallow copy - the target ends up sharing both the CategoryPath
    // and the properties map with this instance.
    ((CategoryAttributeImpl) target).categoryPath = this.categoryPath;
    ((CategoryAttributeImpl) target).properties = this.properties;
  }

  @SuppressWarnings("unchecked")
  @Override
  public CategoryAttribute clone() {
    CategoryAttributeImpl ca = (CategoryAttributeImpl) super.clone();
    if (categoryPath != null) {
      ca.categoryPath = (CategoryPath) categoryPath.clone();
    }
    if (properties != null && !properties.isEmpty()) {
      // The map itself is cloned, but the CategoryProperty values are
      // shared; see the CategoryProperty class javadoc.
      ca.properties = (HashMap<Class<? extends CategoryProperty>, CategoryProperty>) properties
          .clone();
    }
    return ca;
  }

  @Override
  public void clear() {
    categoryPath = null;
    clearProperties();
  }

  public void clearProperties() {
    if (properties != null) {
      properties.clear();
    }
  }

  @Override
  public boolean equals(Object o) {
    if (o == this) {
      return true;
    }
    if (!(o instanceof CategoryAttributeImpl)) {
      return false;
    }
    CategoryAttributeImpl other = (CategoryAttributeImpl) o;
    if (categoryPath == null) {
      return (other.categoryPath == null);
    }
    if (!categoryPath.equals(other.categoryPath)) {
      return false;
    }
    // A null map and an empty map are considered equivalent.
    if (properties == null || properties.isEmpty()) {
      return (other.properties == null || other.properties.isEmpty());
    }
    return properties.equals(other.properties);
  }

  @Override
  public int hashCode() {
    if (categoryPath == null) {
      return 0;
    }
    int hashCode = categoryPath.hashCode();
    if (properties != null && !properties.isEmpty()) {
      hashCode ^= properties.hashCode();
    }
    return hashCode;
  }

  public Set<Class<? extends CategoryProperty>> getPropertyClasses() {
    if (properties == null || properties.isEmpty()) {
      return null;
    }
    return properties.keySet();
  }

  public void remove(Class<? extends CategoryProperty> propertyClass) {
    // Guard against NPE: the properties map is allocated lazily and may
    // still be null if no property was ever added to this attribute.
    if (properties != null) {
      properties.remove(propertyClass);
    }
  }
}

View File

@ -0,0 +1,69 @@
package org.apache.lucene.facet.index.attributes;
import java.util.Iterator;
import org.apache.lucene.facet.index.streaming.CategoryAttributesStream;
import org.apache.lucene.facet.taxonomy.CategoryPath;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * This class transforms an {@link Iterable} of {@link CategoryPath} objects
 * into an {@link Iterable} of {@link CategoryAttribute} objects, which can be
 * used to construct a {@link CategoryAttributesStream}.
 *
 * @lucene.experimental
 */
public class CategoryAttributesIterable implements Iterable<CategoryAttribute> {

  // The wrapped category paths; each iterator() call starts a fresh
  // iteration over this underlying iterable.
  private Iterable<CategoryPath> inputIterable;

  /**
   * @param inputIterable
   *            The category paths to expose as {@link CategoryAttribute}s.
   */
  public CategoryAttributesIterable(Iterable<CategoryPath> inputIterable) {
    this.inputIterable = inputIterable;
  }

  public Iterator<CategoryAttribute> iterator() {
    return new CategoryAttributesIterator(this.inputIterable);
  }

  private static class CategoryAttributesIterator implements Iterator<CategoryAttribute> {

    private Iterator<CategoryPath> internalIterator;
    // Single attribute instance, reused across next() calls; see note below.
    private CategoryAttributeImpl categoryAttributeImpl;

    public CategoryAttributesIterator(Iterable<CategoryPath> inputIterable) {
      this.internalIterator = inputIterable.iterator();
      this.categoryAttributeImpl = new CategoryAttributeImpl();
    }

    public boolean hasNext() {
      return this.internalIterator.hasNext();
    }

    // NOTE: the same CategoryAttributeImpl instance is returned on every
    // call, updated in place with the next category path. Callers must not
    // hold on to a previously returned value across next() calls.
    public CategoryAttribute next() {
      this.categoryAttributeImpl.setCategoryPath(this.internalIterator
          .next());
      return this.categoryAttributeImpl;
    }

    // Removal is delegated to the underlying CategoryPath iterator.
    public void remove() {
      this.internalIterator.remove();
    }
  }
}

View File

@ -0,0 +1,51 @@
package org.apache.lucene.facet.index.attributes;
import java.io.Serializable;
import org.apache.lucene.facet.index.CategoryContainer;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Property that can be added to {@link CategoryAttribute}s during indexing.
 * Note that properties are put in a map and could be shallow copied during
 * {@link CategoryAttributeImpl#clone()}, therefore reuse of
 * {@link CategoryProperty} objects is not recommended. Also extends
 * {@link Serializable}, making the {@link CategoryContainer} serialization more
 * elegant.
 * 
 * @lucene.experimental
 */
public interface CategoryProperty extends Serializable {

  /**
   * When adding categories with properties to a certain document, it is
   * possible that the same category will be added more than once with
   * different instances of the same property. This method defines how to
   * treat such cases, by merging the newly added property into the one
   * previously added. Implementing classes can assume that this method will
   * be called only with a property of the same class.
   * 
   * @param other
   *            The category property to merge.
   * @throws UnsupportedOperationException
   *             If merging is prohibited for this property.
   */
  public void merge(CategoryProperty other)
      throws UnsupportedOperationException;
}

View File

@ -0,0 +1,71 @@
package org.apache.lucene.facet.index.attributes;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * A {@link CategoryProperty} holding the ordinal from the taxonomy of the
 * current category in {@link CategoryAttribute}.
 * <p>
 * Ordinal properties are added internally during processing of category
 * streams, and it is recommended not to use it externally.
 * 
 * @lucene.experimental
 */
public class OrdinalProperty implements CategoryProperty {

  /** The taxonomy ordinal; -1 denotes "not yet set". */
  protected int ordinal = -1;

  /** Returns the ordinal last assigned, or -1 if none was assigned. */
  public int getOrdinal() {
    return ordinal;
  }

  /** Returns true if a (non-negative) ordinal has been assigned. */
  public boolean hasBeenSet() {
    return ordinal >= 0;
  }

  /** Assigns the ordinal value. */
  public void setOrdinal(int value) {
    ordinal = value;
  }

  /** Resets this property to its unset state. */
  public void clear() {
    ordinal = -1;
  }

  @Override
  public boolean equals(Object other) {
    if (this == other) {
      return true;
    }
    return (other instanceof OrdinalProperty)
        && ((OrdinalProperty) other).ordinal == ordinal;
  }

  @Override
  public int hashCode() {
    return ordinal;
  }

  /** Merging is not supported for ordinal properties; always throws. */
  public void merge(CategoryProperty other) {
    throw new UnsupportedOperationException(
        "Merging ordinal attributes is prohibited");
  }
}

View File

@ -0,0 +1,13 @@
<html>
<head>
<title>Category attributes and their properties for indexing</title>
</head>
<body>
<h1>Category attributes and their properties for indexing</h1>
Attributes for a {@link org.apache.lucene.facet.taxonomy.CategoryPath category},
possibly containing
{@link org.apache.lucene.facet.index.attributes.CategoryProperty category properties}.
</body>
</html>

View File

@ -0,0 +1,43 @@
package org.apache.lucene.facet.index.categorypolicy;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * This class filters out the ROOT category ID. For more information see
 * {@link OrdinalPolicy}.
 * 
 * @lucene.experimental
 */
public class DefaultOrdinalPolicy implements OrdinalPolicy {

  /**
   * Filters out (returns false) ordinals equal or less than
   * {@link TaxonomyReader#ROOT_ORDINAL}. true otherwise.
   */
  public boolean shouldAdd(int ordinal) {
    return ordinal > TaxonomyReader.ROOT_ORDINAL;
  }

  /**
   * Implemented as a no-op, as the default policy is not taxonomy dependent.
   */
  public void init(TaxonomyWriter taxonomyWriter) { }
}

View File

@ -0,0 +1,38 @@
package org.apache.lucene.facet.index.categorypolicy;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * This class filters out the ROOT category path. For more information see
 * {@link PathPolicy}.
 * 
 * @lucene.experimental
 */
public class DefaultPathPolicy implements PathPolicy {

  /**
   * Filters out (returns false) the empty category path, which denotes the
   * taxonomy ROOT. Returns true for any path with at least one component.
   */
  public boolean shouldAdd(CategoryPath categoryPath) {
    return categoryPath.length() > 0;
  }
}

View File

@ -0,0 +1,71 @@
package org.apache.lucene.facet.index.categorypolicy;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Filter out any "top level" category ordinals. <br> {@link #shouldAdd(int)}.
 * 
 * @lucene.experimental
 */
public class NonTopLevelOrdinalPolicy implements OrdinalPolicy {

  /**
   * The taxonomyWriter with which the given ordinals' parent is determined.
   * Remains null until {@link #init(TaxonomyWriter)} is called.
   */
  private TaxonomyWriter taxonomyWriter;

  /**
   * Constructs a new non-top-level-ordinal-filter. The policy is not usable
   * until {@link #init(TaxonomyWriter)} supplies a taxonomy writer.
   */
  public NonTopLevelOrdinalPolicy() {
    this.taxonomyWriter = null;
  }

  /**
   * @param taxonomyWriter
   *            A relevant taxonomyWriter object, with which ordinals sent to
   *            {@link #shouldAdd(int)} are examined.
   */
  public void init(TaxonomyWriter taxonomyWriter) {
    this.taxonomyWriter = taxonomyWriter;
  }

  /**
   * Filters out ordinals which are ROOT or whose parent is ROOT. In order to
   * determine if a parent is root, there's a need for
   * {@link TaxonomyWriter#getParent(int)}.
   */
  public boolean shouldAdd(int ordinal) {
    if (ordinal > TaxonomyReader.ROOT_ORDINAL) {
      try {
        if (this.taxonomyWriter.getParent(ordinal) > TaxonomyReader.ROOT_ORDINAL) {
          return true;
        }
      } catch (Exception e) {
        // NOTE(review): any failure here - including the NPE thrown when
        // init() was never called - is silently treated as "do not add".
        // Confirm this best-effort behavior is intended.
        return false;
      }
    }
    return false;
  }
}

View File

@ -0,0 +1,43 @@
package org.apache.lucene.facet.index.categorypolicy;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * This class filters out the ROOT category path and its direct descendants.
 * For more information see {@link PathPolicy}.
 * 
 * @lucene.experimental
 */
public class NonTopLevelPathPolicy implements PathPolicy {

  /**
   * The shortest path length delivered is two components (root + one child).
   * <p>
   * NOTE(review): promoted from a per-instance field to a class constant;
   * the value is fixed and instance state is not needed.
   */
  public static final int DEFAULT_MINIMAL_SUBPATH_LENGTH = 2;

  /**
   * Filters out (returns false) category paths shorter than
   * {@link #DEFAULT_MINIMAL_SUBPATH_LENGTH} components, i.e. the ROOT path
   * and all top-level categories. Returns true otherwise.
   */
  public boolean shouldAdd(CategoryPath categoryPath) {
    return categoryPath.length() >= DEFAULT_MINIMAL_SUBPATH_LENGTH;
  }
}

View File

@ -0,0 +1,56 @@
package org.apache.lucene.facet.index.categorypolicy;
import java.io.Serializable;
import org.apache.lucene.facet.index.streaming.CategoryParentsStream;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Filtering category ordinals in {@link CategoryParentsStream}, where a given
 * category ordinal is added to the stream, and then its parents are added
 * one after the other using {@link TaxonomyWriter#getParent(int)}. <br>
 * That loop should have a stop point - the default approach (excluding the
 * ROOT) is implemented in {@link DefaultOrdinalPolicy}.
 * 
 * @lucene.experimental
 */
public interface OrdinalPolicy extends Serializable {

  /**
   * Check whether a given category ordinal should be added to the stream.
   * 
   * @param ordinal
   *            A given category ordinal which is to be tested for stream
   *            addition.
   * @return <code>true</code> if the category should be added.
   *         <code>false</code> otherwise.
   */
  public abstract boolean shouldAdd(int ordinal);

  /**
   * Initialize the policy with a TaxonomyWriter. This method can be
   * implemented as a no-op if the ordinal policy is not taxonomy dependent.
   * 
   * @param taxonomyWriter
   *            A relevant taxonomyWriter object, with which ordinals sent to
   *            {@link #shouldAdd(int)} are examined.
   */
  public abstract void init(TaxonomyWriter taxonomyWriter);
}

View File

@ -0,0 +1,47 @@
package org.apache.lucene.facet.index.categorypolicy;
import java.io.Serializable;
import org.apache.lucene.facet.index.streaming.CategoryParentsStream;
import org.apache.lucene.facet.taxonomy.CategoryPath;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Filtering category paths in {@link CategoryParentsStream}, where a given
 * category is added to the stream, and then all its parents are added
 * one after the other by successively removing the last component. <br>
 * That loop should have a stop point - the default approach (excluding the
 * ROOT) is implemented in {@link DefaultPathPolicy}.
 * 
 * @lucene.experimental
 */
public interface PathPolicy extends Serializable {

  /**
   * Check whether a given category path should be added to the stream.
   * 
   * @param categoryPath
   *            A given category path which is to be tested for stream
   *            addition.
   * @return <code>true</code> if the category path should be added.
   *         <code>false</code> otherwise.
   */
  public abstract boolean shouldAdd(CategoryPath categoryPath);
}

View File

@ -0,0 +1,21 @@
<html>
<head>
<title>Policies for indexing categories</title>
</head>
<body>
<h1>Policies for indexing categories</h1>
There are two kinds of policies:
<ul>
<li>Path policies are based on the path of the category.</li>
<li>Ordinal policies are based on the ordinal of the category.</li>
</ul>
Policies are consulted during indexing, to decide whether a category should
be added to the index or not. The two kinds of policies can be used for different purposes.
For example, path policies dictate which categories can participate in a drill-down operation,
while ordinal policies affect which can be accumulated (e.g. counted).
</body>
</html>

View File

@ -0,0 +1,15 @@
<html>
<head>
<title>Indexing of document categories</title>
</head>
<body>
<h1>Indexing of document categories</h1>
Attachment of
{@link org.apache.lucene.facet.taxonomy.CategoryPath CategoryPath}s
or {@link org.apache.lucene.facet.index.attributes.CategoryAttribute CategoryAttribute}s
to a given document using a
{@link org.apache.lucene.facet.taxonomy.TaxonomyWriter Taxonomy}.
</body>
</html>

View File

@ -0,0 +1,149 @@
package org.apache.lucene.facet.index.params;
import java.io.IOException;
import java.io.Serializable;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.facet.search.CategoryListIterator;
import org.apache.lucene.facet.search.PayloadIntDecodingIterator;
import org.apache.lucene.facet.search.TotalFacetCounts;
import org.apache.lucene.facet.util.PartitionsUtils;
import org.apache.lucene.util.encoding.DGapIntEncoder;
import org.apache.lucene.util.encoding.IntDecoder;
import org.apache.lucene.util.encoding.IntEncoder;
import org.apache.lucene.util.encoding.SortingIntEncoder;
import org.apache.lucene.util.encoding.UniqueValuesIntEncoder;
import org.apache.lucene.util.encoding.VInt8IntEncoder;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Contains parameters for a category list.
 * 
 * @lucene.experimental
 */
public class CategoryListParams implements Serializable {

  /** The default term used to store the facets information. */
  public static final Term DEFAULT_TERM = new Term("$facets", "$fulltree$");

  private final Term term;

  // Cached eagerly: instances are immutable, so this saves time on the
  // many comparisons performed later.
  private final int hashCode;

  /**
   * Constructs a default category list parameters object, using
   * {@link #DEFAULT_TERM}.
   */
  public CategoryListParams() {
    this(DEFAULT_TERM);
  }

  /**
   * Constructs a category list parameters object, using the given {@link Term}.
   * @param term whose payload holds the category-list.
   */
  public CategoryListParams(Term term) {
    this.term = term;
    this.hashCode = term.hashCode();
  }

  /**
   * A {@link Term} whose payload holds the category-list.
   */
  public final Term getTerm() {
    return term;
  }

  /**
   * Allows to override how categories are encoded and decoded. A matching
   * {@link IntDecoder} is provided by the {@link IntEncoder}.
   * <p>
   * Default implementation creates a new Sorting(<b>Unique</b>(DGap)) encoder.
   * Uniqueness in this regard means that when the same category appears twice
   * in a document, only one appearance is encoded; this affects facet
   * counting results.
   * <p>
   * Possible considerations when overriding: an application may "know" that
   * all its categories are unique, making the unique filter unnecessary;
   * another may wish to count multiple occurrences of the same category, or
   * trade space for a faster encoding. When changing this value, make sure
   * you know what you are doing and test the results - e.g. counts, if the
   * application counts facets.
   */
  public IntEncoder createEncoder() {
    return new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder())));
  }

  /**
   * Equality is defined by the 'term' that defines this category list.
   * Sub-classes should override this method if a more complex calculation
   * is needed to ensure equality.
   */
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (!(o instanceof CategoryListParams)) {
      return false;
    }
    CategoryListParams other = (CategoryListParams) o;
    // Cheap pre-check first: different cached hash codes can never be equal.
    // Colliding hash codes still require direct term equality to settle it.
    return this.hashCode == other.hashCode && this.term.equals(other.term);
  }

  /**
   * Hashcode is similar to {@link #equals(Object)}, in that it uses
   * the term that defines this category list to derive the hashcode.
   * Subclasses need to ensure that equality/hashcode is correctly defined,
   * or there could be side-effects in the {@link TotalFacetCounts} caching
   * mechanism (as the filename for a Total Facet Counts array cache
   * is dependent on the hashCode, so it should consistently return the same
   * hash for identity).
   */
  @Override
  public int hashCode() {
    return this.hashCode;
  }

  /**
   * Create the category list iterator for the specified partition.
   */
  public CategoryListIterator createCategoryListIterator(IndexReader reader,
      int partition) throws IOException {
    String partitionTermText = PartitionsUtils.partitionName(this, partition);
    Term payloadTerm = new Term(term.field(), partitionTermText);
    IntDecoder decoder = createEncoder().createMatchingDecoder();
    return new PayloadIntDecodingIterator(reader, payloadTerm, decoder);
  }
}

View File

@ -0,0 +1,196 @@
package org.apache.lucene.facet.index.params;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.facet.index.categorypolicy.DefaultOrdinalPolicy;
import org.apache.lucene.facet.index.categorypolicy.DefaultPathPolicy;
import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy;
import org.apache.lucene.facet.index.categorypolicy.PathPolicy;
import org.apache.lucene.facet.taxonomy.CategoryPath;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Default implementation for {@link FacetIndexingParams}.
 * <p>
 * Getters for <em>partition-size</em>, {@link OrdinalPolicy} and
 * {@link PathPolicy} are all final, and so the proper way to modify them when
 * extending this class is through {@link #fixedPartitionSize()},
 * {@link #fixedOrdinalPolicy()} or {@link #fixedPathPolicy()} accordingly.
 * 
 * @lucene.experimental
 */
public class DefaultFacetIndexingParams implements FacetIndexingParams {

  /**
   * Delimiter between categories in a path, e.g. Products FACET_DELIM
   * Consumer FACET_DELIM Tv. This should be a character not found in any path
   * component.
   */
  public static final char DEFAULT_FACET_DELIM_CHAR = '\uF749';

  // All four fields are computed once in the constructor via the fixed*()
  // hooks and never change afterwards (the getters are final).
  private final CategoryListParams clpParams;
  private final OrdinalPolicy ordinalPolicy;
  private final PathPolicy pathPolicy;
  private final int partitionSize;

  /** Constructs with a default {@link CategoryListParams}. */
  public DefaultFacetIndexingParams() {
    this(new CategoryListParams());
  }

  /**
   * @param categoryListParams
   *            The single category list all categories are written to.
   */
  public DefaultFacetIndexingParams(CategoryListParams categoryListParams) {
    clpParams = categoryListParams;
    ordinalPolicy = fixedOrdinalPolicy();
    pathPolicy = fixedPathPolicy();
    partitionSize = fixedPartitionSize();
  }

  // All categories share the same (single) category list in this default
  // implementation; the 'category' argument is ignored.
  public CategoryListParams getCategoryListParams(CategoryPath category) {
    return clpParams;
  }

  // Writes the delimited drill-down term text of 'path' into 'buffer',
  // returning the number of characters written.
  public int drillDownTermText(CategoryPath path, char[] buffer) {
    return path.copyToCharArray(buffer, 0, -1, getFacetDelimChar());
  }

  /**
   * "fixed" partition size.
   * @see #getPartitionSize()
   */
  protected int fixedPartitionSize() {
    return Integer.MAX_VALUE;
  }

  /**
   * "fixed" ordinal policy.
   * @see #getOrdinalPolicy()
   */
  protected OrdinalPolicy fixedOrdinalPolicy() {
    return new DefaultOrdinalPolicy();
  }

  /**
   * "fixed" path policy.
   * @see #getPathPolicy()
   */
  protected PathPolicy fixedPathPolicy() {
    return new DefaultPathPolicy();
  }

  public final int getPartitionSize() {
    return partitionSize;
  }

  /*
   * (non-Javadoc)
   * 
   * @see
   * org.apache.lucene.facet.index.params.FacetIndexingParams#getAllCategoryListParams
   * ()
   */
  public Iterable<CategoryListParams> getAllCategoryListParams() {
    // Returns a fresh single-element list on every call.
    List<CategoryListParams> res = new ArrayList<CategoryListParams>();
    res.add(clpParams);
    return res;
  }

  public final OrdinalPolicy getOrdinalPolicy() {
    return ordinalPolicy;
  }

  public final PathPolicy getPathPolicy() {
    return pathPolicy;
  }

  /* (non-Javadoc)
   * @see java.lang.Object#hashCode()
   */
  @Override
  public int hashCode() {
    // NOTE: clpParams contributes twice - once via the prime-multiplied
    // term below and once via the XOR loop over all category list params.
    final int prime = 31;
    int result = 1;
    result = prime * result
        + ((clpParams == null) ? 0 : clpParams.hashCode());
    result = prime * result
        + ((ordinalPolicy == null) ? 0 : ordinalPolicy.hashCode());
    result = prime * result + partitionSize;
    result = prime * result
        + ((pathPolicy == null) ? 0 : pathPolicy.hashCode());
    for (CategoryListParams clp: getAllCategoryListParams()) {
      result ^= clp.hashCode();
    }
    return result;
  }

  /* (non-Javadoc)
   * @see java.lang.Object#equals(java.lang.Object)
   */
  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (obj == null) {
      return false;
    }
    if (!(obj instanceof DefaultFacetIndexingParams)) {
      return false;
    }
    DefaultFacetIndexingParams other = (DefaultFacetIndexingParams) obj;
    if (clpParams == null) {
      if (other.clpParams != null) {
        return false;
      }
    } else if (!clpParams.equals(other.clpParams)) {
      return false;
    }
    if (ordinalPolicy == null) {
      if (other.ordinalPolicy != null) {
        return false;
      }
    } else if (!ordinalPolicy.equals(other.ordinalPolicy)) {
      return false;
    }
    if (partitionSize != other.partitionSize) {
      return false;
    }
    if (pathPolicy == null) {
      if (other.pathPolicy != null) {
        return false;
      }
    } else if (!pathPolicy.equals(other.pathPolicy)) {
      return false;
    }
    // NOTE(review): compares the two Iterables with equals(); this works
    // here because getAllCategoryListParams() returns an ArrayList, but a
    // subclass returning a non-List Iterable would break it - confirm.
    Iterable<CategoryListParams> cLs = getAllCategoryListParams();
    Iterable<CategoryListParams> otherCLs = other.getAllCategoryListParams();
    return cLs.equals(otherCLs);
  }

  /**
   * Use {@link #DEFAULT_FACET_DELIM_CHAR} as the delimiter.
   */
  public char getFacetDelimChar() {
    return DEFAULT_FACET_DELIM_CHAR;
  }
}

View File

@ -0,0 +1,98 @@
package org.apache.lucene.facet.index.params;
import java.io.Serializable;
import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy;
import org.apache.lucene.facet.index.categorypolicy.PathPolicy;
import org.apache.lucene.facet.taxonomy.CategoryPath;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Parameters on how facets are to be written to the index.
* For example, which fields and terms are used to refer to the indexed posting list.
* <P>
* If non-default parameters were used during indexing, the same parameters
* must also be passed during faceted search. This requirement is analogous
* to the requirement during search to know which fields were indexed, and which
* Analyzer was used on the text.
*
* @lucene.experimental
*/
public interface FacetIndexingParams extends Serializable {

  /**
   * Returns the {@link CategoryListParams} describing the category list that
   * the given category is written to, or null if this category should not be
   * aggregatable.
   * <P>
   * By default, all categories are written to the same category list, but
   * applications which know in advance that in some situations only parts
   * of the category hierarchy needs to be counted can divide the categories
   * into two or more different category lists.
   * <P>
   * If null is returned for a category, it means that this category should
   * not appear in any category list, and thus counts for it cannot be
   * aggregated. This category can still be used for drill-down, even though
   * the count for it is not known.
   *
   * @param category the category whose list parameters are requested
   */
  public CategoryListParams getCategoryListParams(CategoryPath category);

  /**
   * Returns info about all category lists in the index.
   *
   * @see #getCategoryListParams(CategoryPath)
   */
  public Iterable<CategoryListParams> getAllCategoryListParams();

  // TODO (Facet): Add special cases of exact/non-exact category term-text

  /**
   * Writes the drill-down term text for the given path into
   * <code>buffer</code> without allocating, and returns the number of chars
   * set.
   * <p>
   * Note: Make sure <code>buffer</code> is large enough.
   *
   * @see CategoryPath#charsNeededForFullPath()
   */
  public int drillDownTermText(CategoryPath path, char[] buffer);

  /**
   * Returns the partition size.
   * The same value should be used during the life time of an index.
   * At search time this value is compared with the actual taxonomy size and
   * their minimum is used.
   */
  public int getPartitionSize();

  /**
   * Returns the policy for indexing category <b>paths</b>,
   * used for deciding how "high" to climb in the taxonomy
   * from a category when ingesting its category paths.
   */
  public PathPolicy getPathPolicy();

  /**
   * Returns the policy for indexing category <b>ordinals</b>,
   * used for deciding how "high" to climb in the taxonomy
   * from a category when ingesting its ordinals.
   */
  public OrdinalPolicy getOrdinalPolicy();

  /**
   * Returns the delimiter character used internally for drill-down terms.
   */
  public char getFacetDelimChar();
}

View File

@ -0,0 +1,32 @@
package org.apache.lucene.facet.index.params;
import org.apache.lucene.facet.FacetException;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Thrown when the facets params are missing a property.
*
* @lucene.experimental
*/
public class FacetParamsMissingPropertyException extends FacetException {

  /**
   * @param key the key of the missing property; it is embedded in the
   *        exception message for diagnostics.
   */
  public FacetParamsMissingPropertyException(String key) {
    super("Property with key \"" + key + "\" not found");
  }
}

View File

@ -0,0 +1,105 @@
package org.apache.lucene.facet.index.params;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.facet.taxonomy.CategoryPath;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A FacetIndexingParams that utilizes different category lists, defined by the
* dimension specified CategoryPaths (see
* {@link PerDimensionIndexingParams#addCategoryListParams(CategoryPath, CategoryListParams)}
* <p>
* A 'dimension' is defined as the first or "zero-th" component in a
* CategoryPath. For example, if a CategoryPath is defined as
* "/Author/American/Mark Twain", then the dimension is "Author".
* <p>
* This class also uses the 'default' CategoryListParams (as specified by
* {@link CategoryListParams#CategoryListParams()} when
* {@link #getCategoryListParams(CategoryPath)} is called for a CategoryPath
* whose dimension component has not been specifically defined.
*
* @lucene.experimental
*/
public class PerDimensionIndexingParams extends DefaultFacetIndexingParams {

  // Maps a dimension (the "zero-th" path component) to its CategoryListParams.
  private final Map<String, CategoryListParams> clParamsMap = new HashMap<String, CategoryListParams>();

  /**
   * Create with the default {@link CategoryListParams} serving every
   * CategoryPath whose dimension has not been explicitly registered.
   */
  public PerDimensionIndexingParams() {
    this(new CategoryListParams());
  }

  /**
   * Create with the given {@link CategoryListParams} serving every
   * CategoryPath whose dimension has not been explicitly registered.
   *
   * @param categoryListParams
   *            the default categoryListParams to use
   */
  public PerDimensionIndexingParams(CategoryListParams categoryListParams) {
    super(categoryListParams);
  }

  /**
   * Returns all the registered categoryListParams, followed by those of the
   * superclass (including the default one).
   */
  @Override
  public Iterable<CategoryListParams> getAllCategoryListParams() {
    ArrayList<CategoryListParams> all =
      new ArrayList<CategoryListParams>(clParamsMap.values());
    for (CategoryListParams params : super.getAllCategoryListParams()) {
      all.add(params);
    }
    return all;
  }

  /**
   * Resolves the CategoryListParams by the dimension ("zero-th" component)
   * of the given CategoryPath, falling back to the superclass default when
   * the category is null or its dimension is unknown.
   */
  @Override
  public CategoryListParams getCategoryListParams(CategoryPath category) {
    if (category == null) {
      return super.getCategoryListParams(category);
    }
    CategoryListParams params = clParamsMap.get(category.getComponent(0));
    return (params != null) ? params : super.getCategoryListParams(category);
  }

  /**
   * Registers a CategoryListParams for the given CategoryPath's dimension,
   * i.e. its "zero-th" component.
   *
   * @param category
   * @param clParams
   */
  public void addCategoryListParams(CategoryPath category, CategoryListParams clParams) {
    clParamsMap.put(category.getComponent(0), clParams);
  }
}

View File

@ -0,0 +1,12 @@
<html>
<head>
<title>Indexing-time specifications for handling facets</title>
</head>
<body>
<h1>Indexing-time specifications for handling facets</h1>
Parameters on how facets are to be written to the index,
such as which fields and terms are used to refer to the facets posting list.
</body>
</html>

View File

@ -0,0 +1,81 @@
package org.apache.lucene.facet.index.streaming;
import java.io.IOException;
import java.util.Iterator;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.facet.index.attributes.CategoryAttribute;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* An attribute stream built from an {@link Iterable} of
* {@link CategoryAttribute}. This stream should then be passed through several
* filters (see {@link CategoryParentsStream}, {@link CategoryListTokenizer} and
* {@link CategoryTokenizer}) until a token stream is produced that can be
* indexed by Lucene.
* <P>
* A CategoryAttributesStream object can be reused for producing more than one
* stream. To do that, the user should cause the underlying
* Iterable<CategoryAttribute> object to return a new set of categories, and
* then call {@link #reset()} to allow this stream to be used again.
*
* @lucene.experimental
*/
public class CategoryAttributesStream extends TokenStream {

  /** The attribute populated with the current category on each token. */
  protected CategoryAttribute categoryAttribute;

  // Source of categories. Never reassigned after construction, hence final;
  // callers reuse the stream by repopulating this Iterable and calling reset().
  private final Iterable<CategoryAttribute> iterable;

  // Current iteration state; null until the first incrementToken() call and
  // again after reset().
  private Iterator<CategoryAttribute> iterator;

  /**
   * Constructor
   *
   * @param iterable
   *            {@link Iterable} of {@link CategoryAttribute}, from which
   *            categories are taken.
   */
  public CategoryAttributesStream(Iterable<CategoryAttribute> iterable) {
    this.iterable = iterable;
    this.iterator = null;
    this.categoryAttribute = this.addAttribute(CategoryAttribute.class);
  }

  @Override
  public final boolean incrementToken() throws IOException {
    // The iterator is obtained lazily so that after reset() the stream picks
    // up whatever categories the Iterable currently holds.
    if (iterator == null) {
      if (iterable == null) {
        return false;
      }
      iterator = iterable.iterator();
    }
    if (iterator.hasNext()) {
      categoryAttribute.set(iterator.next());
      return true;
    }
    return false;
  }

  @Override
  public void reset() {
    // Drop iteration state; a fresh iterator is obtained on the next
    // incrementToken() call, making the stream reusable.
    this.iterator = null;
  }
}

View File

@ -0,0 +1,67 @@
package org.apache.lucene.facet.index.streaming;
import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A base class for category list tokenizers, which add category list tokens to
* category streams.
*
* @lucene.experimental
*/
public abstract class CategoryListTokenizer extends CategoryTokenizerBase {

  /**
   * @see CategoryTokenizerBase#CategoryTokenizerBase(TokenStream, FacetIndexingParams)
   */
  public CategoryListTokenizer(TokenStream input,
      FacetIndexingParams indexingParams) {
    super(input, indexingParams);
  }

  /**
   * A method invoked once when the input stream begins, for subclass-specific
   * processing. Subclass implementations must invoke this one, too!
   */
  protected void handleStartOfInput() throws IOException {
    // In this class, we do nothing.
  }

  /**
   * A method invoked once when the input stream ends, for subclass-specific
   * processing.
   */
  protected void handleEndOfInput() throws IOException {
    // In this class, we do nothing.
  }

  @Override
  public void reset() throws IOException {
    // Reset the wrapped stream first, then give subclasses their
    // start-of-input hook so per-stream state is rebuilt.
    super.reset();
    handleStartOfInput();
  }

  @Override
  public abstract boolean incrementToken() throws IOException;
}

View File

@ -0,0 +1,189 @@
package org.apache.lucene.facet.index.streaming;
import java.io.IOException;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.facet.index.attributes.CategoryAttribute;
import org.apache.lucene.facet.index.attributes.CategoryProperty;
import org.apache.lucene.facet.index.attributes.OrdinalProperty;
import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy;
import org.apache.lucene.facet.index.categorypolicy.PathPolicy;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* This class adds parents to a {@link CategoryAttributesStream}. The parents
* are added according to the {@link PathPolicy} and {@link OrdinalPolicy} from
* the {@link FacetIndexingParams} given in the constructor.<br>
* By default, category properties are removed when creating parents of a
* certain category. However, it is possible to retain certain property types
* using {@link #addRetainableProperty(Class)}.
*
* @lucene.experimental
*/
public class CategoryParentsStream extends TokenFilter {

  /**
   * A {@link TaxonomyWriter} for adding categories and retrieving their
   * ordinals.
   */
  protected TaxonomyWriter taxonomyWriter;

  /** An attribute containing all data related to the category */
  protected CategoryAttribute categoryAttribute;

  /** A category property containing the category ordinal */
  protected OrdinalProperty ordinalProperty;

  /**
   * A set of property classes that are to be retained when creating a parent
   * token. Lazily allocated by {@link #addRetainableProperty(Class)}.
   */
  private Set<Class<? extends CategoryProperty>> retainableProperties;

  /** A {@link PathPolicy} for the category's parents' category paths. */
  private final PathPolicy pathPolicy;

  /** An {@link OrdinalPolicy} for the category's parents' ordinals. */
  private final OrdinalPolicy ordinalPolicy;

  /**
   * Constructor.
   *
   * @param input
   *            The input stream to handle, must be derived from
   *            {@link CategoryAttributesStream}.
   * @param taxonomyWriter
   *            The taxonomy writer to use for adding categories and
   *            retrieving their ordinals.
   * @param indexingParams
   *            The indexing params used for filtering parents.
   */
  public CategoryParentsStream(CategoryAttributesStream input,
      TaxonomyWriter taxonomyWriter, FacetIndexingParams indexingParams) {
    super(input);
    this.categoryAttribute = this.addAttribute(CategoryAttribute.class);
    this.taxonomyWriter = taxonomyWriter;
    this.pathPolicy = indexingParams.getPathPolicy();
    this.ordinalPolicy = indexingParams.getOrdinalPolicy();
    this.ordinalPolicy.init(taxonomyWriter);
    this.ordinalProperty = new OrdinalProperty();
  }

  @Override
  public final boolean incrementToken() throws IOException {
    if (this.categoryAttribute.getCategoryPath() != null) {
      // try adding the parent of the current category to the stream
      clearCategoryProperties();
      boolean added = false;
      // set the parent's ordinal, if illegal set -1
      int ordinal = this.ordinalProperty.getOrdinal();
      if (ordinal != -1) {
        ordinal = this.taxonomyWriter.getParent(ordinal);
        if (this.ordinalPolicy.shouldAdd(ordinal)) {
          this.ordinalProperty.setOrdinal(ordinal);
          try {
            this.categoryAttribute.addProperty(ordinalProperty);
          } catch (UnsupportedOperationException e) {
            throw wrapAsIOException(e);
          }
          added = true;
        } else {
          this.ordinalProperty.setOrdinal(-1);
        }
      }
      // set the parent's category path, if illegal set null
      CategoryPath cp = this.categoryAttribute.getCategoryPath();
      if (cp != null) {
        cp.trim(1);
        // if ordinal added, must also have category paths
        if (added || this.pathPolicy.shouldAdd(cp)) {
          this.categoryAttribute.setCategoryPath(cp);
          added = true;
        } else {
          this.categoryAttribute.clear();
        }
      }
      if (added) {
        // a legal parent exists
        return true;
      }
    }
    // no more parents - get new category
    if (input.incrementToken()) {
      int ordinal = taxonomyWriter.addCategory(this.categoryAttribute.getCategoryPath());
      this.ordinalProperty.setOrdinal(ordinal);
      try {
        this.categoryAttribute.addProperty(this.ordinalProperty);
      } catch (UnsupportedOperationException e) {
        throw wrapAsIOException(e);
      }
      return true;
    }
    return false;
  }

  /**
   * Wraps the given exception as an {@link IOException}, preserving the
   * original as the cause so its stack trace is not lost.
   */
  private static IOException wrapAsIOException(UnsupportedOperationException e) {
    IOException ioe = new IOException(e.getLocalizedMessage());
    ioe.initCause(e);
    return ioe;
  }

  /**
   * Clear the properties of the current {@link CategoryAttribute} attribute
   * before setting the parent attributes. <br>
   * It is possible to retain properties of certain types the parent tokens,
   * using {@link #addRetainableProperty(Class)}.
   */
  protected void clearCategoryProperties() {
    if (this.retainableProperties == null
        || this.retainableProperties.isEmpty()) {
      // nothing to retain - drop all properties at once
      this.categoryAttribute.clearProperties();
    } else {
      // collect first, then remove, to avoid mutating the property set
      // while iterating over its classes
      List<Class<? extends CategoryProperty>> propertyClassesToRemove =
        new LinkedList<Class<? extends CategoryProperty>>();
      for (Class<? extends CategoryProperty> propertyClass : this.categoryAttribute
          .getPropertyClasses()) {
        if (!this.retainableProperties.contains(propertyClass)) {
          propertyClassesToRemove.add(propertyClass);
        }
      }
      for (Class<? extends CategoryProperty> propertyClass : propertyClassesToRemove) {
        this.categoryAttribute.remove(propertyClass);
      }
    }
  }

  /**
   * Add a {@link CategoryProperty} class which is retained when creating
   * parent tokens.
   *
   * @param toRetain
   *            The property class to retain.
   */
  public void addRetainableProperty(Class<? extends CategoryProperty> toRetain) {
    if (this.retainableProperties == null) {
      this.retainableProperties = new HashSet<Class<? extends CategoryProperty>>();
    }
    this.retainableProperties.add(toRetain);
  }
}

View File

@ -0,0 +1,67 @@
package org.apache.lucene.facet.index.streaming;
import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.taxonomy.CategoryPath;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Basic class for setting the {@link CharTermAttribute}s and
* {@link PayloadAttribute}s of category tokens.
*
* @lucene.experimental
*/
public class CategoryTokenizer extends CategoryTokenizerBase {

  /**
   * @see CategoryTokenizerBase#CategoryTokenizerBase(TokenStream,
   *      FacetIndexingParams)
   */
  public CategoryTokenizer(TokenStream input,
      FacetIndexingParams indexingParams) {
    super(input, indexingParams);
  }

  @Override
  public final boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    if (categoryAttribute != null) {
      CategoryPath path = categoryAttribute.getCategoryPath();
      if (path != null) {
        // write the drill-down term text straight into the term buffer,
        // sized to hold the full path, then stamp the payload
        char[] buffer = termAttribute.resizeBuffer(path.charsNeededForFullPath());
        int length = indexingParams.drillDownTermText(path, buffer);
        termAttribute.setLength(length);
        setPayload();
      }
    }
    return true;
  }

  /**
   * Set the payload of the current category token. This base implementation
   * sets none; subclasses override to attach payloads.
   */
  protected void setPayload() {
  }
}

View File

@ -0,0 +1,78 @@
package org.apache.lucene.facet.index.streaming;
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Payload;
import org.apache.lucene.facet.index.CategoryDocumentBuilder;
import org.apache.lucene.facet.index.attributes.CategoryAttribute;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A base class for all token filters which add term and payload attributes to
* tokens and are to be used in {@link CategoryDocumentBuilder}. Contains three
* attributes: {@link CategoryAttribute}, {@link CharTermAttribute} and
* {@link PayloadAttribute}.
*
* @lucene.experimental
*/
public abstract class CategoryTokenizerBase extends TokenFilter {

  /** The stream's category attributes. */
  protected CategoryAttribute categoryAttribute;

  /** The stream's payload attribute. */
  protected PayloadAttribute payloadAttribute;

  /** The stream's term attribute. */
  protected CharTermAttribute termAttribute;

  /** The object used for constructing payloads. */
  protected Payload payload = new Payload();

  /** Indexing params for creating term text **/
  protected FacetIndexingParams indexingParams;

  /**
   * Constructor.
   *
   * @param input
   *            The input stream, either {@link CategoryParentsStream} or an
   *            extension of {@link CategoryTokenizerBase}.
   * @param indexingParams
   *            The indexing params to use.
   */
  public CategoryTokenizerBase(TokenStream input,
      FacetIndexingParams indexingParams) {
    super(input);
    this.categoryAttribute = this.addAttribute(CategoryAttribute.class);
    this.termAttribute = this.addAttribute(CharTermAttribute.class);
    this.payloadAttribute = this.addAttribute(PayloadAttribute.class);
    this.indexingParams = indexingParams;
  }

  @Override
  public abstract boolean incrementToken() throws IOException;
}

View File

@ -0,0 +1,125 @@
package org.apache.lucene.facet.index.streaming;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map.Entry;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.facet.index.CategoryListPayloadStream;
import org.apache.lucene.facet.index.attributes.OrdinalProperty;
import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.util.PartitionsUtils;
import org.apache.lucene.util.encoding.IntEncoder;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* {@link CategoryListTokenizer} for facet counting
*
* @lucene.experimental
*/
public class CountingListTokenizer extends CategoryListTokenizer {

  /**
   * A table for retrieving payload streams by category-list name.
   * Initialized here once; handleStartOfInput() clears it per stream.
   */
  protected HashMap<String, CategoryListPayloadStream> payloadStreamsByName =
    new HashMap<String, CategoryListPayloadStream>();

  /** An iterator over the payload streams */
  protected Iterator<Entry<String, CategoryListPayloadStream>> payloadStreamIterator;

  public CountingListTokenizer(TokenStream input,
      FacetIndexingParams indexingParams) {
    super(input, indexingParams);
    // payloadStreamsByName is allocated at its declaration; the redundant
    // second allocation that used to be here was removed.
  }

  @Override
  protected void handleStartOfInput() throws IOException {
    payloadStreamsByName.clear();
    payloadStreamIterator = null;
  }

  @Override
  public final boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      if (this.categoryAttribute != null) {
        OrdinalProperty ordinalProperty = (OrdinalProperty) this.categoryAttribute
            .getProperty(OrdinalProperty.class);
        if (ordinalProperty != null && legalCategory()) {
          CategoryPath categoryPath = this.categoryAttribute
              .getCategoryPath();
          int ordinal = ordinalProperty.getOrdinal();
          CategoryListPayloadStream payloadStream = getPayloadStream(
              categoryPath, ordinal);
          int partitionSize = indexingParams.getPartitionSize();
          // only the ordinal's offset within its partition is encoded
          payloadStream.appendIntToStream(ordinal % partitionSize);
        }
      }
      return true;
    }
    // input exhausted: emit one token per accumulated payload stream
    if (this.payloadStreamIterator == null) {
      this.handleEndOfInput();
      this.payloadStreamIterator = this.payloadStreamsByName.entrySet()
          .iterator();
    }
    if (this.payloadStreamIterator.hasNext()) {
      Entry<String, CategoryListPayloadStream> entry = this.payloadStreamIterator
          .next();
      String countingListName = entry.getKey();
      int length = countingListName.length();
      this.termAttribute.resizeBuffer(length);
      countingListName.getChars(0, length, termAttribute.buffer(), 0);
      this.termAttribute.setLength(length);
      CategoryListPayloadStream payloadStream = entry.getValue();
      payload.setData(payloadStream.convertStreamToByteArray());
      this.payloadAttribute.setPayload(payload);
      return true;
    }
    return false;
  }

  /**
   * A method which allows extending classes to filter the categories going
   * into the counting list.
   *
   * @return By default returns {@code true}, meaning the current category is
   *         to be part of the counting list. For categories that should be
   *         filtered, return {@code false}.
   */
  protected boolean legalCategory() {
    return true;
  }

  /**
   * Returns the payload stream for the counting list that the given
   * category/ordinal belongs to, creating and caching it on first use.
   */
  protected CategoryListPayloadStream getPayloadStream(
      CategoryPath categoryPath, int ordinal) throws IOException {
    CategoryListParams clParams = this.indexingParams.getCategoryListParams(categoryPath);
    String name = PartitionsUtils.partitionNameByOrdinal(indexingParams, clParams, ordinal);
    CategoryListPayloadStream fps = payloadStreamsByName.get(name);
    if (fps == null) {
      IntEncoder encoder = clParams.createEncoder();
      fps = new CategoryListPayloadStream(encoder);
      payloadStreamsByName.put(name, fps);
    }
    return fps;
  }
}

View File

@ -0,0 +1,19 @@
<html>
<head>
<title>Expert: attributes streaming definition for indexing facets</title>
</head>
<body>
<h1>Expert: attributes streaming definition for indexing facets</h1>
Streaming of facets attributes is a low level indexing interface with Lucene indexing.
There are two types of category related streams:
<ul>
<li><b>Category tokenizer stream</b> handles tokenization for a single category,
e.g. for creating drill-down tokens.</li>
<li><b>Category list tokenizer stream</b> handles tokenization for multiple categories,
e.g. for creating a counting list token, representing all the categories of
a certain document.</li>
</ul>
</body>
</html>

View File

@ -0,0 +1,8 @@
<html>
<head>
<title>Faceted Indexing and Search</title>
</head>
<body>
Provides faceted indexing and search capabilities.
</body>
</html>

View File

@ -0,0 +1,116 @@
package org.apache.lucene.facet.search;
import java.io.IOException;
import java.util.List;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.search.results.FacetResultNode;
import org.apache.lucene.facet.search.sampling.Sampler;
import org.apache.lucene.facet.search.sampling.SamplingAccumulator;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* {@link FacetsAccumulator} whose behavior regarding complements, sampling,
* etc. is not set up front but rather is determined at accumulation time
* according to the statistics of the accumulated set of documents and the
* index.
* <p>
* Note: Sampling accumulation (Accumulation over a sampled-set of the results),
* does not guarantee accurate values for
* {@link FacetResult#getNumValidDescendants()} &amp;
* {@link FacetResultNode#getResidue()}.
*
* @lucene.experimental
*/
public final class AdaptiveFacetsAccumulator extends StandardFacetsAccumulator {

  // Sampler consulted at accumulation time; may be replaced via setSampler().
  private Sampler sampler = new Sampler();

  /**
   * Create an {@link AdaptiveFacetsAccumulator}
   * @see StandardFacetsAccumulator#StandardFacetsAccumulator(FacetSearchParams, IndexReader, TaxonomyReader)
   */
  public AdaptiveFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader,
      TaxonomyReader taxonomyReader) {
    super(searchParams, indexReader, taxonomyReader);
  }

  /**
   * Create an {@link AdaptiveFacetsAccumulator}
   * @see StandardFacetsAccumulator#StandardFacetsAccumulator(FacetSearchParams, IndexReader, TaxonomyReader,
   *      IntArrayAllocator, FloatArrayAllocator)
   */
  public AdaptiveFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader,
      TaxonomyReader taxonomyReader, IntArrayAllocator intArrayAllocator,
      FloatArrayAllocator floatArrayAllocator) {
    super(searchParams, indexReader, taxonomyReader, intArrayAllocator, floatArrayAllocator);
  }

  /**
   * Set the sampler.
   * @param sampler sampler to set
   */
  public void setSampler(Sampler sampler) {
    this.sampler = sampler;
  }

  @Override
  public List<FacetResult> accumulate(ScoredDocIDs docids) throws IOException {
    // choose the accumulator per this doc-set; fall back to standard
    // accumulation when no adaptation applies
    FacetsAccumulator delegee = appropriateFacetCountingAccumulator(docids);
    if (delegee == this) {
      return super.accumulate(docids);
    }
    return delegee.accumulate(docids);
  }

  /**
   * Compute the appropriate facet accumulator to use.
   * If no special/clever adaptation is possible/needed return this (self).
   */
  private FacetsAccumulator appropriateFacetCountingAccumulator(ScoredDocIDs docids) {
    // Verify that searchParams permit sampling/complement/etc... otherwise do default
    if (!mayComplement()) {
      return this;
    }
    // Now we're sure we can use the sampling methods as we're in a counting only mode
    // Verify that sampling is enabled and required ... otherwise do default
    if (sampler == null || !sampler.shouldSample(docids)) {
      return this;
    }
    SamplingAccumulator samplingAccumulator = new SamplingAccumulator(sampler, searchParams, indexReader, taxonomyReader);
    // carry over the complement threshold so behavior matches this accumulator
    samplingAccumulator.setComplementThreshold(getComplementThreshold());
    return samplingAccumulator;
  }

  /**
   * @return the sampler in effect
   */
  public final Sampler getSampler() {
    return sampler;
  }
}

View File

@ -0,0 +1,69 @@
package org.apache.lucene.facet.search;
import java.io.IOException;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * An interface for iterating over a "category list", i.e., the list of
 * categories per document.
 * <p>
 * <b>NOTE:</b>
 * <ul>
 * <li>Implementations may be used as keys in a Map, so an appropriate
 * implementation of <code>hashCode()</code> and <code>equals()</code> must be
 * provided.
 * <li>{@link #init()} must be called before you consume any categories, or call
 * {@link #skipTo(int)}.
 * <li>{@link #skipTo(int)} must be called before any calls to
 * {@link #nextCategory()}.
 * <li>{@link #nextCategory()} returns values &lt; {@link Integer#MAX_VALUE} for
 * real categories; a value larger than {@link Integer#MAX_VALUE} (possible
 * because the return type is <code>long</code>) signals exhaustion and can be
 * used as a stop condition.
 * </ul>
 *
 * @lucene.experimental
 */
public interface CategoryListIterator {
  /**
   * Initializes the iterator. This method must be called before any calls to
   * {@link #skipTo(int)}, and its return value indicates whether there are
   * any relevant documents for this iterator. If it returns false, any call
   * to {@link #skipTo(int)} will return false as well.<br>
   * <b>NOTE:</b> calling this method twice may result in skipping over
   * documents for some implementations. Also, calling it again after all
   * documents were consumed may yield unexpected behavior.
   */
  public boolean init() throws IOException;
  /**
   * Skips forward to document docId. Returns true iff this document exists
   * and has any categories. This method must be called before calling
   * {@link #nextCategory()} for a particular document.<br>
   * <b>NOTE:</b> Users should call this method with increasing docIds, and
   * implementations can assume that this is the case.
   */
  public boolean skipTo(int docId) throws IOException;
  /**
   * Returns the next category for the current document that is set through
   * {@link #skipTo(int)}, or a number higher than {@link Integer#MAX_VALUE}
   * when the document's categories are exhausted.
   * No assumptions can be made on the order of the categories.
   */
  public long nextCategory() throws IOException;
}

View File

@ -0,0 +1,110 @@
package org.apache.lucene.facet.search;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.taxonomy.CategoryPath;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Creation of drill down term or query.
 *
 * @lucene.experimental
 */
public final class DrillDown {

  /**
   * @see #term(FacetIndexingParams, CategoryPath)
   */
  public static final Term term(FacetSearchParams sParams, CategoryPath path) {
    return term(sParams.getFacetIndexingParams(), path);
  }

  /**
   * Return a term for drilling down into a category.
   */
  public static final Term term(FacetIndexingParams iParams, CategoryPath path) {
    final CategoryListParams clp = iParams.getCategoryListParams(path);
    // Render the full category path into a char buffer sized exactly for it.
    final char[] drillDownText = new char[path.charsNeededForFullPath()];
    iParams.drillDownTermText(path, drillDownText);
    return new Term(clp.getTerm().field(), String.valueOf(drillDownText));
  }

  /**
   * Return a query for drilling down into all given categories (AND).
   * @see #term(FacetSearchParams, CategoryPath)
   * @see #query(FacetSearchParams, Query, CategoryPath...)
   */
  public static final Query query(FacetIndexingParams iParams, CategoryPath... paths) {
    if (paths == null || paths.length == 0) {
      throw new IllegalArgumentException("Empty category path not allowed for drill down query!");
    }
    // A single category needs no boolean wrapper.
    if (paths.length == 1) {
      return new TermQuery(term(iParams, paths[0]));
    }
    final BooleanQuery conjunction = new BooleanQuery();
    for (int i = 0; i < paths.length; i++) {
      conjunction.add(new TermQuery(term(iParams, paths[i])), Occur.MUST);
    }
    return conjunction;
  }

  /**
   * Return a query for drilling down into all given categories (AND).
   * @see #term(FacetSearchParams, CategoryPath)
   * @see #query(FacetSearchParams, Query, CategoryPath...)
   */
  public static final Query query(FacetSearchParams sParams, CategoryPath... paths) {
    return query(sParams.getFacetIndexingParams(), paths);
  }

  /**
   * Turn a base query into a drilling-down query for all given category paths (AND).
   * @see #query(FacetIndexingParams, CategoryPath...)
   */
  public static final Query query(FacetIndexingParams iParams, Query baseQuery, CategoryPath... paths) {
    final BooleanQuery drillDownQuery = new BooleanQuery();
    drillDownQuery.add(baseQuery, Occur.MUST);
    drillDownQuery.add(query(iParams, paths), Occur.MUST);
    return drillDownQuery;
  }

  /**
   * Turn a base query into a drilling-down query for all given category paths (AND).
   * @see #query(FacetSearchParams, CategoryPath...)
   */
  public static final Query query(FacetSearchParams sParams, Query baseQuery, CategoryPath... paths) {
    return query(sParams.getFacetIndexingParams(), baseQuery, paths);
  }

  /**
   * Turn a base query into a drilling-down query using the default {@link FacetSearchParams}
   * @see #query(FacetSearchParams, Query, CategoryPath...)
   */
  public static final Query query(Query baseQuery, CategoryPath... paths) {
    return query(new FacetSearchParams(), baseQuery, paths);
  }
}

View File

@ -0,0 +1,91 @@
package org.apache.lucene.facet.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Provider of arrays used for facet operations such as counting.
 *
 * @lucene.experimental
 */
public class FacetArrays {

  private IntArrayAllocator intAllocator;
  private FloatArrayAllocator floatAllocator;

  // Arrays are obtained lazily from the allocators and released in free().
  private int[] ints;
  private float[] floats;
  private int length;

  /**
   * Create a FacetArrays with certain array allocators.
   * @param intArrayAllocator allocator for int arrays.
   * @param floatArrayAllocator allocator for float arrays.
   */
  public FacetArrays(IntArrayAllocator intArrayAllocator,
                      FloatArrayAllocator floatArrayAllocator) {
    this.intAllocator = intArrayAllocator;
    this.floatAllocator = floatArrayAllocator;
  }

  /**
   * Notify allocators that they can free arrays allocated
   * on behalf of this FacetArrays object.
   */
  public void free() {
    if (intAllocator != null) {
      intAllocator.free(ints);
      // Give up the handle to the array now that it is freed.
      ints = null;
    }
    if (floatAllocator != null) {
      floatAllocator.free(floats);
      // Give up the handle to the array now that it is freed.
      floats = null;
    }
    length = 0;
  }

  /**
   * Obtain an int array, e.g. for facet counting.
   */
  public int[] getIntArray() {
    if (ints == null) {
      ints = intAllocator.allocate();
      length = ints.length;
    }
    return ints;
  }

  /** Obtain a float array, e.g. for evaluating facet association values. */
  public float[] getFloatArray() {
    if (floats == null) {
      floats = floatAllocator.allocate();
      length = floats.length;
    }
    return floats;
  }

  /**
   * Return the arrays length
   */
  public int getArraysLength() {
    return length;
  }
}

View File

@ -0,0 +1,161 @@
package org.apache.lucene.facet.search;
import java.io.IOException;
import org.apache.lucene.facet.search.params.FacetRequest;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.search.results.FacetResultNode;
import org.apache.lucene.facet.search.results.IntermediateFacetResult;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Handler for facet results.
 * <p>
 * The facet results handler provided by the {@link FacetRequest} to
 * a {@link FacetsAccumulator}.
 * <p>
 * First it is used by {@link FacetsAccumulator} to obtain a temporary
 * facet result for each partition and to merge results of several partitions.
 * <p>
 * Later the accumulator invokes the handler to render the results, creating
 * {@link FacetResult} objects.
 * <p>
 * Last the accumulator invokes the handler to label final results.
 *
 * @lucene.experimental
 */
public abstract class FacetResultsHandler {
  /** Taxonomy for which facets are handled */
  protected final TaxonomyReader taxonomyReader;
  /**
   * Facet request served by this handler.
   */
  protected final FacetRequest facetRequest;
  /**
   * Create a faceted search handler.
   * @param taxonomyReader See {@link #getTaxonomyReader()}.
   * @param facetRequest See {@link #getFacetRequest()}.
   */
  public FacetResultsHandler(TaxonomyReader taxonomyReader,
      FacetRequest facetRequest) {
    this.taxonomyReader = taxonomyReader;
    this.facetRequest = facetRequest;
  }
  /**
   * Fetch results of a single partition, given facet arrays for that partition,
   * and based on the matching documents and faceted search parameters.
   *
   * @param arrays
   *          facet arrays for the certain partition
   * @param offset
   *          offset in input arrays where partition starts
   * @return temporary facet result, potentially, to be passed back to
   *         <b>this</b> result handler for merging, or <b>null</b> in case that
   *         constructor parameter, <code>facetRequest</code>, requests an
   *         illegal FacetResult, like, e.g., a root node category path that
   *         does not exist in constructor parameter <code>taxonomyReader</code>
   *         .
   * @throws IOException
   *           on error
   */
  public abstract IntermediateFacetResult fetchPartitionResult(FacetArrays arrays, int offset) throws IOException;
  /**
   * Merge results of several facet partitions. Logic of the merge is undefined
   * and open for interpretations. For example, a merge implementation could
   * keep top K results. Passed {@link IntermediateFacetResult} must be ones
   * that were created by this handler otherwise a {@link ClassCastException} is
   * thrown. In addition, all passed {@link IntermediateFacetResult} must have
   * the same {@link FacetRequest} otherwise an {@link IllegalArgumentException}
   * is thrown.
   *
   * @param tmpResults one or more temporary results created by <b>this</b>
   *        handler.
   * @return temporary facet result that represents the union, as specified by
   *         <b>this</b> handler, of the input temporary facet results.
   * @throws IOException on error.
   * @throws ClassCastException if the temporary result passed was not created
   *         by this handler
   * @throws IllegalArgumentException if passed <code>facetResults</code> do not
   *         have the same {@link FacetRequest}
   * @see IntermediateFacetResult#getFacetRequest()
   */
  public abstract IntermediateFacetResult mergeResults(IntermediateFacetResult... tmpResults)
      throws IOException, ClassCastException, IllegalArgumentException;
  /**
   * Create a facet result from the temporary result.
   * @param tmpResult temporary result to be rendered as a {@link FacetResult}
   * @throws IOException on error.
   */
  public abstract FacetResult renderFacetResult(IntermediateFacetResult tmpResult) throws IOException ;
  /**
   * Perform any rearrangement as required on a facet result that has changed after
   * it was rendered.
   * <P>
   * Possible use case: a sampling facets accumulator invoked another
   * facets accumulator on a sample set of documents, obtained
   * rendered facet results, fixed their counts, and now it is needed
   * to sort the results differently according to the fixed counts.
   * @param facetResult result to be rearranged.
   * @see FacetResultNode#setValue(double)
   */
  public abstract FacetResult rearrangeFacetResult(FacetResult facetResult);
  /**
   * Label results according to settings in {@link FacetRequest},
   * such as {@link FacetRequest#getNumLabel()}.
   * Usually invoked by {@link FacetsAccumulator#accumulate(ScoredDocIDs)}
   * @param facetResult facet result to be labeled.
   * @throws IOException on error
   */
  public abstract void labelResult (FacetResult facetResult) throws IOException;
  /** Return taxonomy reader used for current facets accumulation operation. */
  public final TaxonomyReader getTaxonomyReader() {
    return this.taxonomyReader;
  }
  /** Return the facet request served by this handler. */
  public final FacetRequest getFacetRequest() {
    return this.facetRequest;
  }
  /**
   * Check whether the given facet arrays hold the partition which contains the
   * given ordinal. Each partition covers {@code getArraysLength()} consecutive
   * ordinals, so two ordinals are in the same partition iff their integer
   * division by the partition size agrees.
   *
   * @param ordinal
   *          checked facet
   * @param facetArrays
   *          facet arrays for the certain partition
   * @param offset
   *          offset in input arrays where partition starts
   */
  protected boolean isSelfPartition (int ordinal, FacetArrays facetArrays, int offset) {
    int partitionSize = facetArrays.getArraysLength();
    return ordinal / partitionSize == offset / partitionSize;
  }
}

View File

@ -0,0 +1,153 @@
package org.apache.lucene.facet.search;
import java.io.IOException;
import java.util.List;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.search.params.FacetRequest;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Driver for Accumulating facets of faceted search requests over given
 * documents.
 *
 * @lucene.experimental
 */
public abstract class FacetsAccumulator {
  /**
   * Default threshold for using the complements optimization.
   * If accumulating facets for a document set larger than this ratio of the
   * index size, then perform the complement optimization.
   * @see #setComplementThreshold(double) for more info on the complements optimization.
   */
  public static final double DEFAULT_COMPLEMENT_THRESHOLD = 0.6;
  /**
   * Passing this to {@link #setComplementThreshold(double)} will disable using complement optimization.
   */
  public static final double DISABLE_COMPLEMENT = Double.POSITIVE_INFINITY; // > 1 actually
  /**
   * Passing this to {@link #setComplementThreshold(double)} will force using complement optimization.
   */
  public static final double FORCE_COMPLEMENT = 0; // <=0
  // Current threshold; compared against the ratio of matching docs to index size.
  private double complementThreshold = DEFAULT_COMPLEMENT_THRESHOLD;
  protected final TaxonomyReader taxonomyReader;
  protected final IndexReader indexReader;
  protected FacetSearchParams searchParams;
  // Whether this accumulator labels its results; see isAllowLabeling().
  private boolean allowLabeling = true;
  public FacetsAccumulator(FacetSearchParams searchParams,
                            IndexReader indexReader,
                            TaxonomyReader taxonomyReader) {
    this.indexReader = indexReader;
    this.taxonomyReader = taxonomyReader;
    this.searchParams = searchParams;
  }
  /**
   * Accumulate facets over given documents, according to facet requests in effect.
   * @param docids documents (and their scores) for which facets are Accumulated.
   * @return Accumulated facets.
   * @throws IOException on error.
   */
  // internal API note: it was considered to move the docids into the constructor as well,
  // but this prevents nice extension capabilities, especially in the way that
  // Sampling Accumulator works with the (any) delegated accumulator.
  public abstract List<FacetResult> accumulate(ScoredDocIDs docids) throws IOException;
  /**
   * @return the complement threshold
   * @see #setComplementThreshold(double)
   */
  public double getComplementThreshold() {
    return complementThreshold;
  }
  /**
   * Set the complement threshold.
   * This threshold will dictate whether the complements optimization is applied.
   * The optimization is to count for less documents. It is useful when the same
   * FacetSearchParams are used for varying sets of documents. The first time
   * complements is used the "total counts" are computed - counting for all the
   * documents in the collection. Then, only the complementing set of documents
   * is considered, and used to decrement from the overall counts, thereby
   * walking through less documents, which is faster.
   * <p>
   * Note that this optimization is only available when searching an index
   * whose {@link IndexReader} implements both
   * {@link IndexReader#directory()} and {@link IndexReader#getVersion()}
   * otherwise the optimization is silently disabled regardless of
   * the complement threshold settings.
   * <p>
   * For the default settings see {@link #DEFAULT_COMPLEMENT_THRESHOLD}.
   * <p>
   * To force complements in all cases pass {@link #FORCE_COMPLEMENT}.
   * This is mostly useful for testing purposes, as forcing complements when only
   * a tiny fraction of available documents match the query does not make sense and
   * would incur performance degradations.
   * <p>
   * To disable complements pass {@link #DISABLE_COMPLEMENT}.
   * @param complementThreshold the complement threshold to set
   */
  public void setComplementThreshold(double complementThreshold) {
    this.complementThreshold = complementThreshold;
  }
  /**
   * Check if labeling is allowed for this accumulator.
   * <p>
   * By default labeling is allowed.
   * This allows one accumulator to invoke other accumulators for accumulation
   * but keep to itself the responsibility of labeling.
   * This might be handy since labeling is a costly operation.
   * @return true if labeling is allowed for this accumulator
   * @see #setAllowLabeling(boolean)
   */
  protected boolean isAllowLabeling() {
    return allowLabeling;
  }
  /**
   * Set whether labeling is allowed for this accumulator.
   * @param allowLabeling new setting for allow labeling
   * @see #isAllowLabeling()
   */
  protected void setAllowLabeling(boolean allowLabeling) {
    this.allowLabeling = allowLabeling;
  }
  /** Check if all facet requests in effect support the complements optimization. */
  protected boolean mayComplement() {
    for (FacetRequest freq:searchParams.getFacetRequests()) {
      if (!freq.supportsComplements()) {
        return false;
      }
    }
    return true;
  }
}

View File

@ -0,0 +1,137 @@
package org.apache.lucene.facet.search;
import java.io.IOException;
import java.util.List;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.facet.search.params.FacetRequest;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Collector for facet accumulation.
 * <p>
 * Collects the matching documents during search (via an internal
 * {@link ScoredDocIdCollector}) and lazily accumulates facet results for them
 * on the first call to {@link #getFacetResults()}.
 *
 * @lucene.experimental
 */
public class FacetsCollector extends Collector {

  /** Accumulator used to compute facet results over the collected documents. */
  protected final FacetsAccumulator facetsAccumulator;

  /**
   * Collects matching doc ids (and scores, when any facet request needs them).
   * Released (set to null) once facet results have been computed.
   */
  private ScoredDocIdCollector scoreDocIdCollector;

  /** Lazily computed facet results; guarded by {@link #resultsGuard}. */
  private List<FacetResult> results;

  /**
   * Lock guarding the lazy creation of {@link #results}. Declared final and
   * initialized at declaration so the lock object can never be replaced,
   * which would silently break the synchronization in
   * {@link #getFacetResults()}.
   */
  private final Object resultsGuard = new Object();

  /**
   * Create a collector for accumulating facets while collecting documents
   * during search.
   *
   * @param facetSearchParams
   *          faceted search parameters defining which facets are required and
   *          how.
   * @param indexReader
   *          searched index.
   * @param taxonomyReader
   *          taxonomy containing the facets.
   */
  public FacetsCollector(FacetSearchParams facetSearchParams,
      IndexReader indexReader, TaxonomyReader taxonomyReader) {
    facetsAccumulator = initFacetsAccumulator(facetSearchParams, indexReader, taxonomyReader);
    scoreDocIdCollector = initScoredDocCollector(facetSearchParams, indexReader, taxonomyReader);
  }

  /**
   * Create a {@link ScoredDocIdCollector} to be used as the first phase of
   * the facet collection. If all facetRequests do not require the
   * document score, a ScoredDocIdCollector which does not store the document
   * scores would be returned. Otherwise a SDIC which does store the documents
   * will be returned, having an initial allocated space for 1000 such
   * documents' scores.
   */
  protected ScoredDocIdCollector initScoredDocCollector(
      FacetSearchParams facetSearchParams, IndexReader indexReader,
      TaxonomyReader taxonomyReader) {
    // A single score-requiring request forces score storage for all.
    for (FacetRequest frq : facetSearchParams.getFacetRequests()) {
      if (frq.requireDocumentScore()) {
        return ScoredDocIdCollector.create(1000, true);
      }
    }
    return ScoredDocIdCollector.create(indexReader.maxDoc(), false);
  }

  /**
   * Create the {@link FacetsAccumulator} to be used. Default is
   * {@link StandardFacetsAccumulator}. Called once at the constructor of the collector.
   *
   * @param facetSearchParams
   *          The search params.
   * @param indexReader
   *          A reader to the index to search in.
   * @param taxonomyReader
   *          A reader to the active taxonomy.
   * @return The {@link FacetsAccumulator} to use.
   */
  protected FacetsAccumulator initFacetsAccumulator(FacetSearchParams facetSearchParams,
                                                    IndexReader indexReader,
                                                    TaxonomyReader taxonomyReader) {
    return new StandardFacetsAccumulator(facetSearchParams, indexReader, taxonomyReader);
  }

  /**
   * Return accumulated facets results (according to faceted search parameters)
   * for collected documents. Computed once; subsequent calls return the same
   * list.
   * @throws IOException on error
   */
  public List<FacetResult> getFacetResults() throws IOException {
    synchronized (resultsGuard) { // over protection
      if (results == null) {
        // lazy creation but just once
        results = facetsAccumulator.accumulate(scoreDocIdCollector.getScoredDocIDs());
        // Release the collector - it is no longer needed and may hold scores.
        scoreDocIdCollector = null;
      }
      return results;
    }
  }

  @Override
  public boolean acceptsDocsOutOfOrder() {
    return false;
  }

  @Override
  public void collect(int doc) throws IOException {
    scoreDocIdCollector.collect(doc);
  }

  @Override
  public void setNextReader(AtomicReaderContext context) throws IOException {
    scoreDocIdCollector.setNextReader(context);
  }

  @Override
  public void setScorer(Scorer scorer) throws IOException {
    scoreDocIdCollector.setScorer(scorer);
  }
}

View File

@ -0,0 +1,68 @@
package org.apache.lucene.facet.search;
import java.util.Arrays;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * A FloatArrayAllocator is an object which manages float array objects
 * of a certain size. These float arrays are needed temporarily during
 * faceted search (see {@link FacetsAccumulator}) and can be reused across searches
 * instead of being allocated afresh on every search.
 * <P>
 * A FloatArrayAllocator is thread-safe.
 *
 * @lucene.experimental
 */
public final class FloatArrayAllocator extends TemporaryObjectAllocator<float[]> {

  // The fixed size of every float array managed by this allocator.
  // Final: set once in the constructor and never changed.
  private final int size;

  /**
   * Construct an allocator for float arrays of size <CODE>size</CODE>,
   * keeping around a pool of up to <CODE>maxArrays</CODE> old arrays.
   * <P>
   * Note that the pool size only restricts the number of arrays that hang
   * around when not needed, but <I>not</I> the maximum number of arrays
   * that are allocated when actually in use: If a number of concurrent
   * threads ask for an allocation, all of them will get a counter array,
   * even if their number is greater than maxArrays. If an application wants
   * to limit the number of concurrent threads making allocations, it needs
   * to do so on its own - for example by blocking new threads until the
   * existing ones have finished.
   * <P>
   * In particular, when maxArrays=0, this object behaves as a trivial
   * allocator, always allocating a new array and never reusing an old one.
   */
  public FloatArrayAllocator(int size, int maxArrays) {
    super(maxArrays);
    this.size = size;
  }

  /** Allocate a fresh, zero-filled array of the configured size. */
  @Override
  public float[] create() {
    return new float[size];
  }

  /** Zero an array before it is returned to the pool for reuse. */
  @Override
  public void clear(float[] array) {
    Arrays.fill(array, 0);
  }
}

View File

@ -0,0 +1,56 @@
package org.apache.lucene.facet.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Declares an interface for heap (and heap alike) structures,
 * handling a given type T
 *
 * @lucene.experimental
 */
public interface Heap<T> {
  /**
   * Get and remove the top of the Heap <BR>
   * NOTE: Once {@link #pop()} is called no other {@link #add(Object)} or
   * {@link #insertWithOverflow(Object)} should be called.
   */
  public T pop();
  /** Get (but do not remove) the top of the Heap */
  public T top();
  /**
   * Insert a new value, returning the overflowed object <br>
   * NOTE: This method should not be called after invoking {@link #pop()}
   */
  public T insertWithOverflow(T value);
  /**
   * Add a new value to the heap, return the new top(). <br>
   * Some implementations may choose to not implement this functionality.
   * In such a case <code>null</code> should be returned. <BR>
   * NOTE: This method should not be called after invoking {@link #pop()}
   */
  public T add(T frn);
  /** Clear the heap */
  public void clear();
  /** Return the amount of objects currently in the heap */
  public int size();
}

View File

@ -0,0 +1,68 @@
package org.apache.lucene.facet.search;
import java.util.Arrays;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* An IntArrayAllocator is an object which manages counter array objects
* of a certain length. These counter arrays are needed temporarily during
* faceted search (see {@link FacetsAccumulator} and can be reused across searches
* instead of being allocated afresh on every search.
* <P>
* An IntArrayAllocator is thread-safe.
*
* @lucene.experimental
*/
public final class IntArrayAllocator extends TemporaryObjectAllocator<int[]> {

  // Fixed length of every counter array handed out by this allocator.
  // Assigned once in the constructor and never changed, hence final.
  private final int length;

  /**
   * Construct an allocator for counter arrays of length <CODE>length</CODE>,
   * keeping around a pool of up to <CODE>maxArrays</CODE> old arrays.
   * <P>
   * Note that the pool size only restricts the number of arrays that hang
   * around when not needed, but <I>not</I> the maximum number of arrays
   * that are allocated when actually is use: If a number of concurrent
   * threads ask for an allocation, all of them will get a counter array,
   * even if their number is greater than maxArrays. If an application wants
   * to limit the number of concurrent threads making allocations, it needs
   * to do so on its own - for example by blocking new threads until the
   * existing ones have finished.
   * <P>
   * In particular, when maxArrays=0, this object behaves as a trivial
   * allocator, always allocating a new array and never reusing an old one.
   *
   * @param length length of each counter array managed by this allocator
   * @param maxArrays maximum number of idle arrays kept for reuse
   */
  public IntArrayAllocator(int length, int maxArrays) {
    super(maxArrays);
    this.length = length;
  }

  /** Allocate a fresh, zero-filled counter array of the configured length. */
  @Override
  public int[] create() {
    return new int[length];
  }

  /**
   * Reset an array before it goes back to the pool, so that a reused array
   * always starts out with all counters at zero.
   */
  @Override
  public void clear(int[] array) {
    Arrays.fill(array, 0);
  }

}

View File

@ -0,0 +1,117 @@
package org.apache.lucene.facet.search;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.UnsafeByteArrayInputStream;
import org.apache.lucene.util.encoding.IntDecoder;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A payload deserializer comes with its own working space (buffer). One need to
* define the {@link IndexReader} and {@link Term} in which the payload resides.
* The iterator then consumes the payload information of each document and
* decodes it into categories. A typical use case of this class is:
*
* <pre>
* IndexReader reader = [open your reader];
* Term t = new Term(&quot;field&quot;, &quot;where-payload-exists&quot;);
* CategoryListIterator cli = new PayloadIntDecodingIterator(reader, t);
* if (!cli.init()) {
* // it means there are no payloads / documents associated with that term.
* // Usually a sanity check. However, init() must be called.
* }
* DocIdSetIterator disi = [you usually iterate on something else, such as a Scorer];
* int doc;
* while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
* cli.setdoc(doc);
* long category;
* while ((category = cli.nextCategory()) &lt; Integer.MAX_VALUE) {
* }
* }
* </pre>
*
* @lucene.experimental
*/
public class PayloadIntDecodingIterator implements CategoryListIterator {
private final UnsafeByteArrayInputStream ubais;
private final IntDecoder decoder;
private final IndexReader indexReader;
private final Term term;
private final PayloadIterator pi;
private final int hashCode;
public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder)
throws IOException {
this(indexReader, term, decoder, new byte[1024]);
}
public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder,
byte[] buffer) throws IOException {
pi = new PayloadIterator(indexReader, term, buffer);
ubais = new UnsafeByteArrayInputStream();
this.decoder = decoder;
hashCode = indexReader.hashCode() ^ term.hashCode();
this.term = term;
this.indexReader = indexReader;
}
@Override
public boolean equals(Object other) {
if (!(other instanceof PayloadIntDecodingIterator)) {
return false;
}
PayloadIntDecodingIterator that = (PayloadIntDecodingIterator) other;
if (hashCode != that.hashCode) {
return false;
}
// Hash codes are the same, check equals() to avoid cases of hash-collisions.
return indexReader.equals(that.indexReader) && term.equals(that.term);
}
@Override
public int hashCode() {
return hashCode;
}
public boolean init() throws IOException {
return pi.init();
}
public long nextCategory() throws IOException {
return decoder.decode();
}
public boolean skipTo(int docId) throws IOException {
if (!pi.setdoc(docId)) {
return false;
}
// Initializing the decoding mechanism with the new payload data
ubais.reInit(pi.getBuffer(), 0, pi.getPayloadLength());
decoder.reInit(ubais);
return true;
}
}

View File

@ -0,0 +1,138 @@
package org.apache.lucene.facet.search;
import java.io.IOException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A utility class for iterating through a posting list of a given term and
* retrieving the payload of the first occurrence in every document. Comes with
* its own working space (buffer).
*
* @lucene.experimental
*/
public class PayloadIterator {

  // Working space into which the payload of the current document is copied.
  // Replaced by a larger array (never shrunk) if a payload does not fit.
  protected byte[] buffer;
  // Number of valid payload bytes currently held in {@link #buffer}.
  protected int payloadLength;

  // Positions enum over the term's posting list; null if the term is absent.
  DocsAndPositionsEnum tp;

  // Becomes false once the posting list is exhausted; from then on
  // setdoc() always returns false.
  private boolean hasMore;

  /** Creates an iterator with a default 1KB working buffer. */
  public PayloadIterator(IndexReader indexReader, Term term)
      throws IOException {
    this(indexReader, term, new byte[1024]);
  }

  /**
   * Creates an iterator using the caller-supplied working buffer.
   * NOTE(review): the buffer is kept by reference, not copied - presumably
   * callers must not share it across concurrent iterators; confirm.
   */
  public PayloadIterator(IndexReader indexReader, Term term, byte[] buffer)
      throws IOException {
    this.buffer = buffer;
    // TODO (Facet): avoid Multi*?
    Bits deletedDocs = MultiFields.getDeletedDocs(indexReader);
    this.tp = MultiFields.getTermPositionsEnum(indexReader, deletedDocs, term.field(), term.bytes());
  }

  /**
   * (re)initialize the iterator. Should be done before the first call to
   * {@link #setdoc(int)}. Returns false if there is no category list found
   * (in which case setdoc() will never return true).
   */
  public boolean init() throws IOException {
    hasMore = tp != null && tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS;
    return hasMore;
  }

  /**
   * Skip forward to document docId. Return true if this document exists and
   * has any payload.
   * <P>
   * Users should call this method with increasing docIds, and implementations
   * can assume that this is the case.
   */
  public boolean setdoc(int docId) throws IOException {
    if (!hasMore) {
      return false;
    }

    // Posting list is already past docId (caller skipped it, or asked for
    // docIds out of increasing order); docId therefore has no payload here.
    if (tp.docID() > docId) {
      return false;
    }

    // making sure we have the requested document
    if (tp.docID() < docId) {
      // Skipping to requested document
      if (tp.advance(docId) == DocIdSetIterator.NO_MORE_DOCS) {
        this.hasMore = false;
        return false;
      }

      // If document not found (advance() skipped past docId)
      if (tp.docID() != docId) {
        return false;
      }
    }

    // Prepare for payload extraction: move to the first position, whose
    // payload is the only one this iterator reads per document.
    tp.nextPosition();

    // TODO: fix bug in SepCodec and then remove this check (the null check should be enough)
    if (!tp.hasPayload()) {
      return false;
    }

    BytesRef br = tp.getPayload();

    if (br == null || br.length == 0) {
      return false;
    }

    this.payloadLength = br.length;

    if (this.payloadLength > this.buffer.length) {
      // Growing if necessary.
      this.buffer = new byte[this.payloadLength * 2 + 1];
    }
    // Loading the payload
    System.arraycopy(br.bytes, br.offset, this.buffer, 0, payloadLength);
    return true;
  }

  /**
   * Get the buffer with the content of the last read payload.
   */
  public byte[] getBuffer() {
    return buffer;
  }

  /**
   * Get the length of the last read payload.
   */
  public int getPayloadLength() {
    return payloadLength;
  }

}

View File

@ -0,0 +1,118 @@
package org.apache.lucene.facet.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.search.results.FacetResultNode;
import org.apache.lucene.facet.search.sampling.Sampler;
import org.apache.lucene.facet.search.sampling.Sampler.SampleResult;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Wrap any Facets Accumulator with sampling.
* <p>
* Note: Sampling accumulation (Accumulation over a sampled-set of the results),
* does not guarantee accurate values for
* {@link FacetResult#getNumValidDescendants()} &
* {@link FacetResultNode#getResidue()}.
*
* @lucene.experimental
*/
public class SamplingWrapper extends FacetsAccumulator {

  // The accumulator that performs the actual (sampled) accumulation.
  private final FacetsAccumulator delegee;
  // Supplies the sample set and fixes/trims the sampled results.
  private final Sampler sampler;

  public SamplingWrapper(FacetsAccumulator delegee, Sampler sampler) {
    super(delegee.searchParams, delegee.indexReader, delegee.taxonomyReader);
    this.delegee = delegee;
    this.sampler = sampler;
  }

  @Override
  public List<FacetResult> accumulate(ScoredDocIDs docids) throws IOException {
    // first let delegee accumulate without labeling at all (though
    // currently it doesn't matter because we have to label all returned anyhow)
    boolean origAllowLabeling = isAllowLabeling();
    setAllowLabeling(false);

    // Replacing the original searchParams with the over-sampled (and without statistics-compute)
    FacetSearchParams original = delegee.searchParams;
    delegee.searchParams = sampler.overSampledSearchParams(original);

    // try/finally so the delegee's searchParams and the labeling flag are
    // restored even if sampling/accumulation/fixing throws; otherwise the
    // wrapper (and delegee) would be left unusable after a failure.
    try {
      SampleResult sampleSet = sampler.getSampleSet(docids);

      List<FacetResult> sampleRes = delegee.accumulate(sampleSet.docids);
      setAllowLabeling(origAllowLabeling);

      List<FacetResult> fixedRes = new ArrayList<FacetResult>();
      for (FacetResult fres : sampleRes) {
        // for sure fres is not null because this is guaranteed by the delegee.
        FacetResultsHandler frh = fres.getFacetRequest().createFacetResultsHandler(taxonomyReader);

        // fix the result of current request
        sampler.getSampleFixer(indexReader, taxonomyReader, searchParams)
            .fixResult(docids, fres);

        fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any rearranging it needs

        // Using the sampler to trim the extra (over-sampled) results
        fres = sampler.trimResult(fres);

        // final labeling if allowed (because labeling is a costly operation)
        if (isAllowLabeling()) {
          frh.labelResult(fres);
        }
        fixedRes.add(fres); // add to final results
      }
      return fixedRes;
    } finally {
      // Redundant on the success path (same value), but guarantees restoration
      // when the delegee throws before the flag was reset above.
      setAllowLabeling(origAllowLabeling);
      delegee.searchParams = original; // Back to original params
    }
  }

  /**
   * @see FacetsAccumulator#getComplementThreshold()
   */
  @Override
  public double getComplementThreshold() {
    return delegee.getComplementThreshold();
  }

  /**
   * @param complementThreshold
   * @see FacetsAccumulator#setComplementThreshold(double)
   */
  @Override
  public void setComplementThreshold(double complementThreshold) {
    delegee.setComplementThreshold(complementThreshold);
  }

  @Override
  protected boolean isAllowLabeling() {
    return delegee.isAllowLabeling();
  }

  @Override
  protected void setAllowLabeling(boolean allowLabeling) {
    delegee.setAllowLabeling(allowLabeling);
  }

}

View File

@ -0,0 +1,42 @@
package org.apache.lucene.facet.search;
import java.io.IOException;
import org.apache.lucene.search.DocIdSet;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Document IDs with scores for each, driving facets accumulation. Document
* scores are optionally used in the process of facets scoring.
*
* @see FacetsAccumulator#accumulate(ScoredDocIDs)
* @lucene.experimental
*/
public interface ScoredDocIDs {

  /**
   * Returns an iterator over the document IDs and their scores.
   * @throws IOException if the underlying doc-ID set cannot be iterated
   */
  public ScoredDocIDsIterator iterator() throws IOException;

  /** Returns the underlying set of doc IDs (without scores). */
  public DocIdSet getDocIDs();

  /** Returns the number of scored documents in this set. */
  public int size();

}

View File

@ -0,0 +1,43 @@
package org.apache.lucene.facet.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Iterator over document IDs and their scores. Each {@link #next()} retrieves
* the next docID and its score which can be later be retrieved by
* {@link #getDocID()} and {@link #getScore()}. <b>NOTE:</b> you must call
* {@link #next()} before {@link #getDocID()} and/or {@link #getScore()}, or
* otherwise the returned values are unexpected.
*
* @lucene.experimental
*/
public interface ScoredDocIDsIterator {

  /** Default score used in case scoring is disabled. */
  public static final float DEFAULT_SCORE = 1.0f;

  /** Iterate to the next document/score pair. Returns true iff there is such a pair. */
  public abstract boolean next();

  /** Returns the ID of the current document. Valid only after {@link #next()} returned true. */
  public abstract int getDocID();

  /** Returns the score of the current document. Valid only after {@link #next()} returned true. */
  public abstract float getScore();

}

View File

@ -0,0 +1,224 @@
package org.apache.lucene.facet.search;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.OpenBitSet;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A {@link Collector} which stores all docIDs and their scores in a
* {@link ScoredDocIDs} instance. If scoring is not enabled, then the default
* score as set in {@link #setDefaultScore(float)} (or
* {@link ScoredDocIDsIterator#DEFAULT_SCORE}) will be set for all documents.
*
* @lucene.experimental
*/
public abstract class ScoredDocIdCollector extends Collector {

  /**
   * Variant used when scoring is disabled: only doc IDs are recorded, and
   * every document reports the (mutable) default score.
   */
  private static final class NonScoringDocIdCollector extends ScoredDocIdCollector {

    float defaultScore = ScoredDocIDsIterator.DEFAULT_SCORE;

    @SuppressWarnings("synthetic-access")
    public NonScoringDocIdCollector(int maxDoc) {
      super(maxDoc);
    }

    // No per-doc state beyond the bitset, so out-of-order collection is fine.
    @Override
    public boolean acceptsDocsOutOfOrder() { return true; }

    @Override
    public void collect(int doc) throws IOException {
      // docBase maps the segment-relative doc to its global ID.
      docIds.fastSet(docBase + doc);
      ++numDocIds;
    }

    @Override
    public float getDefaultScore() {
      return defaultScore;
    }

    @Override
    public ScoredDocIDsIterator scoredDocIdsIterator() throws IOException {
      return new ScoredDocIDsIterator() {

        private DocIdSetIterator docIdsIter = docIds.iterator();
        private int nextDoc;

        public int getDocID() { return nextDoc; }
        // Scoring is disabled: every document reports the default score.
        public float getScore() { return defaultScore; }

        public boolean next() {
          try {
            nextDoc = docIdsIter.nextDoc();
            return nextDoc != DocIdSetIterator.NO_MORE_DOCS;
          } catch (IOException e) {
            // This should not happen as we're iterating over an OpenBitSet. For
            // completeness, terminate iteration
            nextDoc = DocIdSetIterator.NO_MORE_DOCS;
            return false;
          }
        }
      };
    }

    @Override
    public void setDefaultScore(float defaultScore) {
      this.defaultScore = defaultScore;
    }

    // No scorer needed when scoring is disabled.
    @Override
    public void setScorer(Scorer scorer) throws IOException {}
  }

  /**
   * Variant used when scoring is enabled: records each collected document's
   * score, in collection order, parallel to the order of set bits in docIds.
   */
  private static final class ScoringDocIdCollector extends ScoredDocIdCollector {

    // scores[i] is the score of the i-th collected document; grown on demand.
    float[] scores;
    private Scorer scorer;

    @SuppressWarnings("synthetic-access")
    public ScoringDocIdCollector(int maxDoc) {
      super(maxDoc);
      scores = new float[maxDoc];
    }

    // In-order collection is required so scores line up with ascending doc IDs.
    @Override
    public boolean acceptsDocsOutOfOrder() { return false; }

    @Override
    public void collect(int doc) throws IOException {
      docIds.fastSet(docBase + doc);

      float score = this.scorer.score();
      if (numDocIds >= scores.length) {
        float[] newScores = new float[ArrayUtil.oversize(numDocIds + 1, 4)];
        System.arraycopy(scores, 0, newScores, 0, numDocIds);
        scores = newScores;
      }
      scores[numDocIds] = score;
      ++numDocIds;
    }

    @Override
    public ScoredDocIDsIterator scoredDocIdsIterator() throws IOException {
      return new ScoredDocIDsIterator() {

        private DocIdSetIterator docIdsIter = docIds.iterator();
        private int nextDoc;
        // Index into scores[], advanced in lock-step with the bitset iterator.
        private int scoresIdx = -1;

        public int getDocID() { return nextDoc; }
        public float getScore() { return scores[scoresIdx]; }

        public boolean next() {
          try {
            nextDoc = docIdsIter.nextDoc();
            if (nextDoc == DocIdSetIterator.NO_MORE_DOCS) {
              return false;
            }
            ++scoresIdx;
            return true;
          } catch (IOException e) {
            // This should not happen as we're iterating over an OpenBitSet. For
            // completeness, terminate iteration
            nextDoc = DocIdSetIterator.NO_MORE_DOCS;
            return false;
          }
        }
      };
    }

    @Override
    public float getDefaultScore() { return ScoredDocIDsIterator.DEFAULT_SCORE; }

    // No-op: with scoring enabled, real scores are always recorded.
    @Override
    public void setDefaultScore(float defaultScore) {}

    @Override
    public void setScorer(Scorer scorer) throws IOException {
      this.scorer = scorer;
    }
  }

  // Number of collect() calls so far (not bitset cardinality if a doc were
  // ever collected twice).
  protected int numDocIds;
  // Doc-ID base of the current segment, set by setNextReader().
  protected int docBase;
  // Global (index-wide) doc IDs of all collected documents.
  protected final OpenBitSet docIds;

  /**
   * Creates a new {@link ScoredDocIdCollector} with the given parameters.
   *
   * @param maxDoc the number of documents that are expected to be collected.
   *        Note that if more documents are collected, unexpected exceptions may
   *        be thrown. Usually you should pass {@link IndexReader#maxDoc()} of
   *        the same IndexReader with which the search is executed.
   * @param enableScoring if scoring is enabled, a score will be computed for
   *        every matching document, which might be expensive. Therefore if you
   *        do not require scoring, it is better to set it to <i>false</i>.
   */
  public static ScoredDocIdCollector create(int maxDoc, boolean enableScoring) {
    return enableScoring ? new ScoringDocIdCollector(maxDoc)
        : new NonScoringDocIdCollector(maxDoc);
  }

  private ScoredDocIdCollector(int maxDoc) {
    numDocIds = 0;
    docIds = new OpenBitSet(maxDoc);
  }

  /** Returns the default score used when scoring is disabled. */
  public abstract float getDefaultScore();

  /** Set the default score. Only applicable if scoring is disabled. */
  public abstract void setDefaultScore(float defaultScore);

  /** Returns an iterator over the collected documents and their scores. */
  public abstract ScoredDocIDsIterator scoredDocIdsIterator() throws IOException;

  /** Exposes the collected documents as a {@link ScoredDocIDs} view. */
  public ScoredDocIDs getScoredDocIDs() {
    return new ScoredDocIDs() {

      public ScoredDocIDsIterator iterator() throws IOException {
        return scoredDocIdsIterator();
      }

      public DocIdSet getDocIDs() {
        return docIds;
      }

      public int size() {
        return numDocIds;
      }

    };
  }

  @Override
  public void setNextReader(AtomicReaderContext context) throws IOException {
    this.docBase = context.docBase;
  }
}

View File

@ -0,0 +1,338 @@
package org.apache.lucene.facet.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.facet.search.aggregator.Aggregator;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.search.params.FacetRequest;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.search.results.IntermediateFacetResult;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.util.PartitionsUtils;
import org.apache.lucene.facet.util.ScoredDocIdsUtils;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Standard implementation for {@link FacetsAccumulator}, utilizing partitions to save on memory.
* <p>
* Why partitions? Because if there are say 100M categories out of which
* only top K are required, we must first compute value for all 100M categories
* (going over all documents) and only then could we select top K.
* This is made easier on memory by working in partitions of distinct categories:
* Once a values for a partition are found, we take the top K for that
* partition and work on the next partition, them merge the top K of both,
* and so forth, thereby computing top K with RAM needs for the size of
* a single partition rather than for the size of all the 100M categories.
* <p>
* Decision on partitions size is done at indexing time, and the facet information
* for each partition is maintained separately.
* <p>
* <u>Implementation detail:</u> Since facets information of each partition is
* maintained in a separate "category list", we can be more efficient
* at search time, because only the facet info for a single partition
* need to be read while processing that partition.
*
* @lucene.experimental
*/
public class StandardFacetsAccumulator extends FacetsAccumulator {
private static final Logger logger = Logger.getLogger(StandardFacetsAccumulator.class.getName());
protected final IntArrayAllocator intArrayAllocator;
protected final FloatArrayAllocator floatArrayAllocator;
protected int partitionSize;
protected int maxPartitions;
protected boolean isUsingComplements;
private TotalFacetCounts totalFacetCounts;
private Object accumulateGuard;
public StandardFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader,
TaxonomyReader taxonomyReader, IntArrayAllocator intArrayAllocator,
FloatArrayAllocator floatArrayAllocator) {
super(searchParams,indexReader,taxonomyReader);
int realPartitionSize = intArrayAllocator == null || floatArrayAllocator == null
? PartitionsUtils.partitionSize(searchParams, taxonomyReader) : -1; // -1 if not needed.
this.intArrayAllocator = intArrayAllocator != null
? intArrayAllocator
// create a default one if null was provided
: new IntArrayAllocator(realPartitionSize, 1);
this.floatArrayAllocator = floatArrayAllocator != null
? floatArrayAllocator
// create a default one if null provided
: new FloatArrayAllocator(realPartitionSize, 1);
// can only be computed later when docids size is known
isUsingComplements = false;
partitionSize = PartitionsUtils.partitionSize(searchParams, taxonomyReader);
maxPartitions = (int) Math.ceil(this.taxonomyReader.getSize() / (double) partitionSize);
accumulateGuard = new Object();
}
public StandardFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader,
TaxonomyReader taxonomyReader) {
this(searchParams, indexReader, taxonomyReader, null, null);
}
@Override
public List<FacetResult> accumulate(ScoredDocIDs docids) throws IOException {
// synchronize to prevent calling two accumulate()'s at the same time.
// We decided not to synchronize the method because that might mislead
// users to feel encouraged to call this method simultaneously.
synchronized (accumulateGuard) {
// only now we can compute this
isUsingComplements = shouldComplement(docids);
if (isUsingComplements) {
try {
totalFacetCounts = TotalFacetCountsCache.getSingleton()
.getTotalCounts(indexReader, taxonomyReader,
searchParams.getFacetIndexingParams(), searchParams.getClCache());
if (totalFacetCounts != null) {
docids = ScoredDocIdsUtils.getComplementSet(docids, indexReader);
} else {
isUsingComplements = false;
}
} catch (UnsupportedOperationException e) {
// TODO (Facet): this exception is thrown from TotalCountsKey if the
// IndexReader used does not support getVersion(). We should re-think
// this: is this tiny detail worth disabling total counts completely
// for such readers? Currently, it's not supported by Parallel and
// MultiReader, which might be problematic for several applications.
// We could, for example, base our "isCurrent" logic on something else
// than the reader's version. Need to think more deeply about it.
if (logger.isLoggable(Level.FINEST)) {
logger.log(Level.FINEST, "IndexReader used does not support completents: ", e);
}
isUsingComplements = false;
} catch (IOException e) {
if (logger.isLoggable(Level.FINEST)) {
logger.log(Level.FINEST, "Failed to load/calculate total counts (complement counting disabled): ", e);
}
// silently fail if for some reason failed to load/save from/to dir
isUsingComplements = false;
} catch (Exception e) {
// give up: this should not happen!
IOException ioEx = new IOException(
"PANIC: Got unexpected exception while trying to get/calculate total counts: "
+e.getMessage());
ioEx.initCause(e);
throw ioEx;
}
}
docids = actualDocsToAccumulate(docids);
FacetArrays facetArrays = new FacetArrays(intArrayAllocator, floatArrayAllocator);
HashMap<FacetRequest, IntermediateFacetResult> fr2tmpRes = new HashMap<FacetRequest, IntermediateFacetResult>();
try {
for (int part = 0; part < maxPartitions; part++) {
// fill arrays from category lists
fillArraysForPartition(docids, facetArrays, part);
int offset = part * partitionSize;
// for each partition we go over all requests and handle
// each, where
// the request maintains the merged result.
// In this implementation merges happen after each
// partition,
// but other impl could merge only at the end.
for (FacetRequest fr : searchParams.getFacetRequests()) {
FacetResultsHandler frHndlr = fr.createFacetResultsHandler(taxonomyReader);
IntermediateFacetResult res4fr = frHndlr.fetchPartitionResult(facetArrays, offset);
IntermediateFacetResult oldRes = fr2tmpRes.get(fr);
if (oldRes != null) {
res4fr = frHndlr.mergeResults(oldRes, res4fr);
}
fr2tmpRes.put(fr, res4fr);
}
}
} finally {
facetArrays.free();
}
// gather results from all requests into a list for returning them
List<FacetResult> res = new ArrayList<FacetResult>();
for (FacetRequest fr : searchParams.getFacetRequests()) {
FacetResultsHandler frHndlr = fr.createFacetResultsHandler(taxonomyReader);
IntermediateFacetResult tmpResult = fr2tmpRes.get(fr);
if (tmpResult == null) {
continue; // do not add a null to the list.
}
FacetResult facetRes = frHndlr.renderFacetResult(tmpResult);
// final labeling if allowed (because labeling is a costly operation)
if (isAllowLabeling()) {
frHndlr.labelResult(facetRes);
}
res.add(facetRes);
}
return res;
}
}
/**
* Set the actual set of documents over which accumulation should take place.
* <p>
* Allows to override the set of documents to accumulate for. Invoked just
* before actual accumulating starts. From this point that set of documents
* remains unmodified. Default implementation just returns the input
* unchanged.
*
* @param docids
* candidate documents to accumulate for
* @return actual documents to accumulate for
*/
protected ScoredDocIDs actualDocsToAccumulate(ScoredDocIDs docids) throws IOException {
return docids;
}
/** Check if it is worth to use complements */
protected boolean shouldComplement(ScoredDocIDs docids) {
return
mayComplement() &&
(docids.size() > indexReader.numDocs() * getComplementThreshold()) ;
}
/**
* Iterate over the documents for this partition and fill the facet arrays with the correct
* count/complement count/value.
* @param internalCollector
* @param facetArrays
* @param part
* @throws IOException
*/
private final void fillArraysForPartition(ScoredDocIDs docids,
FacetArrays facetArrays, int partition) throws IOException {
if (isUsingComplements) {
initArraysByTotalCounts(facetArrays, partition, docids.size());
} else {
facetArrays.free(); // to get a cleared array for this partition
}
HashMap<CategoryListIterator, Aggregator> categoryLists = getCategoryListMap(
facetArrays, partition);
for (Entry<CategoryListIterator, Aggregator> entry : categoryLists.entrySet()) {
CategoryListIterator categoryList = entry.getKey();
if (!categoryList.init()) {
continue;
}
Aggregator categorator = entry.getValue();
ScoredDocIDsIterator iterator = docids.iterator();
while (iterator.next()) {
int docID = iterator.getDocID();
if (!categoryList.skipTo(docID)) {
continue;
}
categorator.setNextDoc(docID, iterator.getScore());
long ordinal;
while ((ordinal = categoryList.nextCategory()) <= Integer.MAX_VALUE) {
categorator.aggregate((int) ordinal);
}
}
}
}
/**
 * Init arrays for a partition from the pre-computed total counts,
 * optionally applying the factor of {@link #getTotalCountsFactor()}.
 */
private final void initArraysByTotalCounts(FacetArrays facetArrays, int partition, int nAccumulatedDocs) {
  int[] counts = facetArrays.getIntArray();
  totalFacetCounts.fillTotalCountsForPartition(counts, partition);
  double factor = getTotalCountsFactor();
  // adjust the totals only when the factor would have a meaningful effect
  if (factor < 0.99999) {
    int shift = nAccumulatedDocs + 1;
    for (int i = 0; i < counts.length; i++) {
      counts[i] *= factor;
      // translate upwards to prevent loss of non-positive values due to
      // complement sampling (i.e. if sampled docs all decremented a category)
      counts[i] += shift;
    }
  }
}
/**
 * Expert: factor by which counts should be multiplied when initializing
 * the count arrays from total counts.
 * The default implementation returns 1, which is a no-op.
 * @return a factor by which total counts should be multiplied
 */
protected double getTotalCountsFactor() {
  return 1.0;
}
/**
 * Create an {@link Aggregator} and a {@link CategoryListIterator} for each
 * and every {@link FacetRequest}, generating a map that matches each
 * category list iterator to its aggregator.
 * <p>
 * If two CategoryListIterators are served by the same aggregator, a single
 * aggregator is returned for both.
 *
 * <b>NOTE: </b>If a given category list iterator is needed with two different
 * aggregators (e.g counting and association) - an exception is thrown as this
 * functionality is not supported at this time.
 */
protected HashMap<CategoryListIterator, Aggregator> getCategoryListMap(FacetArrays facetArrays,
    int partition) throws IOException {
  HashMap<CategoryListIterator, Aggregator> categoryLists = new HashMap<CategoryListIterator, Aggregator>();

  for (FacetRequest facetRequest : searchParams.getFacetRequests()) {
    Aggregator aggregator = facetRequest.createAggregator(
        isUsingComplements, facetArrays, indexReader, taxonomyReader);

    CategoryListIterator cli =
        facetRequest.createCategoryListIterator(indexReader, taxonomyReader, searchParams, partition);

    // Two requests may share one category list only if they also share the
    // aggregator; mapping the same iterator to a different aggregator is
    // unsupported.
    Aggregator previous = categoryLists.put(cli, aggregator);
    if (previous != null && !previous.equals(aggregator)) {
      // TODO (Facet): create a more meaningful RE class, and throw it.
      throw new RuntimeException(
          "Overriding existing category list with different aggregator. THAT'S A NO NO!");
    }
  }
  return categoryLists;
}
}

View File

@ -0,0 +1,114 @@
package org.apache.lucene.facet.search;
import java.util.concurrent.ConcurrentLinkedQueue;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* An TemporaryObjectAllocator is an object which manages large, reusable,
* temporary objects needed during multiple concurrent computations. The idea
* is to remember some of the previously allocated temporary objects, and
* reuse them if possible to avoid constant allocation and garbage-collection
* of these objects.
* <P>
* This technique is useful for temporary counter arrays in faceted search
* (see {@link FacetsAccumulator}), which can be reused across searches instead
* of being allocated afresh on every search.
* <P>
* A TemporaryObjectAllocator is thread-safe.
*
* @lucene.experimental
*/
public abstract class TemporaryObjectAllocator<T> {
// In the "pool" we hold up to "maxObjects" old objects, and if the pool
// is not empty, we return one of its objects rather than allocating a new
// one.
ConcurrentLinkedQueue<T> pool = new ConcurrentLinkedQueue<T>();
int maxObjects;
/**
* Construct an allocator for objects of a certain type, keeping around a
* pool of up to <CODE>maxObjects</CODE> old objects.
* <P>
* Note that the pool size only restricts the number of objects that hang
* around when not needed, but <I>not</I> the maximum number of objects
* that are allocated when actually is use: If a number of concurrent
* threads ask for an allocation, all of them will get an object, even if
* their number is greater than maxObjects. If an application wants to
* limit the number of concurrent threads making allocations, it needs to
* do so on its own - for example by blocking new threads until the
* existing ones have finished. If more than maxObjects are freed, only
* maxObjects of them will be kept in the pool - the rest will not and
* will eventually be garbage-collected by Java.
* <P>
* In particular, when maxObjects=0, this object behaves as a trivial
* allocator, always allocating a new array and never reusing an old one.
*/
public TemporaryObjectAllocator(int maxObjects) {
this.maxObjects = maxObjects;
}
/**
* Subclasses must override this method to actually create a new object
* of the desired type.
*
*/
protected abstract T create();
/**
* Subclasses must override this method to clear an existing object of
* the desired type, to prepare it for reuse. Note that objects will be
* cleared just before reuse (on allocation), not when freed.
*/
protected abstract void clear(T object);
/**
* Allocate a new object. If there's a previously allocated object in our
* pool, we return it immediately. Otherwise, a new object is allocated.
* <P>
* Don't forget to call {@link #free(Object)} when you're done with the object,
* to return it to the pool. If you don't, memory is <I>not</I> leaked,
* but the pool will remain empty and a new object will be allocated each
* time (just like the maxArrays=0 case).
*/
public final T allocate() {
T object = pool.poll();
if (object==null) {
return create();
}
clear(object);
return object;
}
/**
* Return a no-longer-needed object back to the pool. If we already have
* enough objects in the pool (maxObjects as specified in the constructor),
* the array will not be saved, and Java will eventually garbage collect
* it.
* <P>
* In particular, when maxArrays=0, the given array is never saved and
* free does nothing.
*/
public final void free(T object) {
if (pool.size() < maxObjects && object != null) {
pool.add(object);
}
}
}

View File

@ -0,0 +1,292 @@
package org.apache.lucene.facet.search;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.lucene.facet.search.params.FacetRequest;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.search.results.FacetResultNode;
import org.apache.lucene.facet.search.results.MutableFacetResultNode;
import org.apache.lucene.facet.search.results.IntermediateFacetResult;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyReader.ChildrenArrays;
import org.apache.lucene.facet.util.ResultSortUtils;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Generate Top-K results for a particular FacetRequest.
 * <p>
 * K is global (among all results) and is defined by {@link FacetRequest#getNumResults()}.
 * <p>
 * Note: Values of 0 (Zero) are ignored by this results handler.
 *
 * @lucene.experimental
 */
public class TopKFacetResultsHandler extends FacetResultsHandler {

  /**
   * Construct top-K results handler.
   * @param taxonomyReader taxonomy reader
   * @param facetRequest facet request being served
   */
  public TopKFacetResultsHandler(TaxonomyReader taxonomyReader,
      FacetRequest facetRequest) {
    super(taxonomyReader, facetRequest);
  }

  // fetch top K for specific partition.
  @Override
  public IntermediateFacetResult fetchPartitionResult(FacetArrays facetArrays, int offset)
      throws IOException {
    TopKFacetResult res = null;
    int ordinal = taxonomyReader.getOrdinal(facetRequest.getCategoryPath());
    if (ordinal != TaxonomyReader.INVALID_ORDINAL) {
      double value = 0;
      // include the root facet's own value only when its ordinal falls in this partition
      if (isSelfPartition(ordinal, facetArrays, offset)) {
        int partitionSize = facetArrays.getArraysLength();
        value = facetRequest.getValueOf(facetArrays, ordinal % partitionSize);
      }
      // TODO (Facet): should initial value of "residue" depend on aggregator if not sum?
      MutableFacetResultNode parentResultNode =
          new MutableFacetResultNode(ordinal, value);

      Heap<FacetResultNode> heap = ResultSortUtils.createSuitableHeap(facetRequest);
      int totalFacets = heapDescendants(ordinal, heap, parentResultNode, facetArrays, offset);
      res = new TopKFacetResult(facetRequest, parentResultNode, totalFacets);
      res.setHeap(heap);
    }
    return res;
  }

  // merge given top K results into current
  @Override
  public IntermediateFacetResult mergeResults(IntermediateFacetResult... tmpResults) throws IOException {

    int ordinal = taxonomyReader.getOrdinal(facetRequest.getCategoryPath());
    MutableFacetResultNode resNode = new MutableFacetResultNode(ordinal, 0);

    int totalFacets = 0;
    Heap<FacetResultNode> heap = null;

    // merge other results in queue
    for (IntermediateFacetResult tmpFres : tmpResults) {
      // cast should succeed
      TopKFacetResult fres = (TopKFacetResult) tmpFres;
      totalFacets += fres.getNumValidDescendants();
      // set the value for the result node representing the facet request
      resNode.increaseValue(fres.getFacetResultNode().getValue());
      Heap<FacetResultNode> tmpHeap = fres.getHeap();
      if (heap == null) {
        // the first non-null heap becomes the merge target; the rest are drained into it
        heap = tmpHeap;
        continue;
      }
      // bring sub results from heap of tmp res into result heap
      for (int i = tmpHeap.size(); i > 0; i--) {
        FacetResultNode a = heap.insertWithOverflow(tmpHeap.pop());
        if (a != null) {
          // a node pushed out of the top K contributes to the residue instead
          resNode.increaseResidue(a.getResidue());
        }
      }
    }

    TopKFacetResult res = new TopKFacetResult(facetRequest, resNode, totalFacets);
    res.setHeap(heap);
    return res;
  }

  /**
   * Finds the top K descendants of ordinal, which are at most facetRequest.getDepth()
   * deeper than facetRequest.getCategoryPath (whose ordinal is input parameter ordinal).
   * Candidates are restricted to current "counting list" and current "partition",
   * they join the overall priority queue pq of size K.
   * @return total number of descendants considered here by pq, excluding ordinal itself.
   */
  private int heapDescendants(int ordinal, Heap<FacetResultNode> pq,
      MutableFacetResultNode parentResultNode, FacetArrays facetArrays, int offset) {
    int partitionSize = facetArrays.getArraysLength();
    int endOffset = offset + partitionSize;
    ChildrenArrays childrenArray = taxonomyReader.getChildrenArrays();
    int[] youngestChild = childrenArray.getYoungestChildArray();
    int[] olderSibling = childrenArray.getOlderSiblingArray();
    FacetResultNode reusable = null;
    int localDepth = 0;
    int depth = facetRequest.getDepth();
    int[] ordinalStack = new int[2 + Math.min(Short.MAX_VALUE, depth)];
    int childrenCounter = 0;

    int tosOrdinal; // top of stack element

    int yc = youngestChild[ordinal];
    // skip children residing in partitions that follow this one
    while (yc >= endOffset) {
      yc = olderSibling[yc];
    }
    // make use of the fact that TaxonomyReader.INVALID_ORDINAL == -1, < endOffset
    // and it, too, can stop the loop.
    ordinalStack[++localDepth] = yc;

    /*
     * stack holds input parameter ordinal in position 0.
     * Other elements are < endoffset.
     * Only top of stack can be TaxonomyReader.INVALID_ORDINAL, and this if and only if
     * the element below it exhausted all its children: has them all processed.
     *
     * stack elements are processed (counted and accumulated) only if they
     * belong to current partition (between offset and endoffset) and first time
     * they are on top of stack
     *
     * loop as long as stack is not empty of elements other than input ordinal, or for a little while -- it sibling
     */
    while (localDepth > 0) {
      tosOrdinal = ordinalStack[localDepth];
      if (tosOrdinal == TaxonomyReader.INVALID_ORDINAL) {
        // element below tos has all its children, and itself, all processed
        // need to proceed to its sibling
        localDepth--;
        // change element now on top of stack to its sibling.
        ordinalStack[localDepth] = olderSibling[ordinalStack[localDepth]];
        continue;
      }
      // top of stack is not invalid, this is the first time we see it on top of stack.
      // collect it, if belongs to current partition, and then push its kids on itself, if applicable
      if (tosOrdinal >= offset) { // tosOrdinal resides in current partition
        int relativeOrdinal = tosOrdinal % partitionSize;
        double value = facetRequest.getValueOf(facetArrays, relativeOrdinal);
        if (value != 0 && !Double.isNaN(value)) {
          // Count current ordinal -- the TOS
          if (reusable == null) {
            reusable = new MutableFacetResultNode(tosOrdinal, value);
          } else {
            // it is safe to cast since reusable was created here.
            ((MutableFacetResultNode) reusable).reset(tosOrdinal, value);
          }
          ++childrenCounter;
          reusable = pq.insertWithOverflow(reusable);
          if (reusable != null) {
            // TODO (Facet): is other logic (not add) needed, per aggregator?
            parentResultNode.increaseResidue(reusable.getValue());
          }
        }
      }
      if (localDepth < depth) {
        // push kid of current tos
        yc = youngestChild[tosOrdinal];
        while (yc >= endOffset) {
          yc = olderSibling[yc];
        }
        ordinalStack[++localDepth] = yc;
      } else { // localDepth == depth; current tos exhausted its possible children, mark this by pushing INVALID_ORDINAL
        ordinalStack[++localDepth] = TaxonomyReader.INVALID_ORDINAL;
      }
    } // endof while stack is not empty

    return childrenCounter; // we're done
  }

  /** Pops the heap into the result node's sub-results, in heap (pop) order. */
  @Override
  public FacetResult renderFacetResult(IntermediateFacetResult tmpResult) {
    TopKFacetResult res = (TopKFacetResult) tmpResult; // cast is safe by contract of this class
    if (res != null) {
      Heap<FacetResultNode> heap = res.getHeap();
      MutableFacetResultNode resNode = (MutableFacetResultNode) res.getFacetResultNode(); // cast safe too
      for (int i = heap.size(); i > 0; i--) {
        resNode.insertSubResult(heap.pop());
      }
    }
    return res;
  }

  /** Re-sorts the sub-results through the heap, e.g. after labeling. */
  @Override
  public FacetResult rearrangeFacetResult(FacetResult facetResult) {
    TopKFacetResult res = (TopKFacetResult) facetResult; // cast is safe by contract of this class
    Heap<FacetResultNode> heap = res.getHeap();
    heap.clear(); // just to be safe
    MutableFacetResultNode topFrn = (MutableFacetResultNode) res.getFacetResultNode(); // safe cast
    for (FacetResultNode frn : topFrn.getSubResults()) {
      heap.add(frn);
    }
    int size = heap.size();
    ArrayList<FacetResultNode> subResults = new ArrayList<FacetResultNode>(size);
    // heap pops worst-first, so prepend each popped node to get best-first order
    for (int i = heap.size(); i > 0; i--) {
      subResults.add(0, heap.pop());
    }
    topFrn.setSubResults(subResults);
    return res;
  }

  @Override
  // label top K sub results
  public void labelResult(FacetResult facetResult) throws IOException {
    if (facetResult != null) { // any result to label?
      FacetResultNode facetResultNode = facetResult.getFacetResultNode();
      if (facetResultNode != null) { // any result to label?
        facetResultNode.getLabel(taxonomyReader);
        // label only the first getNumLabel() sub results
        int num2label = facetRequest.getNumLabel();
        for (FacetResultNode frn : facetResultNode.getSubResults()) {
          if (--num2label < 0) {
            break;
          }
          frn.getLabel(taxonomyReader);
        }
      }
    }
  }

  ////////////////////////////////////////////////////////////////////////////////////
  ////////////////////////////////////////////////////////////////////////////////////

  /**
   * Private Mutable implementation of result of faceted search.
   */
  private static class TopKFacetResult extends FacetResult implements IntermediateFacetResult {

    // TODO (Facet): is it worth to override PriorityQueue.getSentinelObject()
    // for any of our PQs?
    private Heap<FacetResultNode> heap;

    /**
     * Create a Facet Result.
     * @param facetRequest Request for which this result was obtained.
     * @param facetResultNode top result node for this facet result.
     * @param totalFacets - number of children of the targetFacet, up till the requested depth.
     */
    TopKFacetResult(FacetRequest facetRequest, MutableFacetResultNode facetResultNode, int totalFacets) {
      super(facetRequest, facetResultNode, totalFacets);
    }

    /**
     * @return the heap
     */
    public Heap<FacetResultNode> getHeap() {
      return heap;
    }

    /**
     * Set the heap for this result.
     * @param heap heap top be set.
     */
    public void setHeap(Heap<FacetResultNode> heap) {
      this.heap = heap;
    }
  }

  //////////////////////////////////////////////////////
}

View File

@ -0,0 +1,797 @@
package org.apache.lucene.facet.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.facet.search.params.FacetRequest;
import org.apache.lucene.facet.search.params.FacetRequest.SortOrder;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.search.results.FacetResultNode;
import org.apache.lucene.facet.search.results.MutableFacetResultNode;
import org.apache.lucene.facet.search.results.IntermediateFacetResult;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyReader.ChildrenArrays;
import org.apache.lucene.util.collections.IntIterator;
import org.apache.lucene.util.collections.IntToObjectMap;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Generates {@link FacetResult} from the count arrays aggregated for a particular
* {@link FacetRequest}.
* The generated {@link FacetResult} is a subtree of the taxonomy tree.
* Its root node, {@link FacetResult#getFacetResultNode()},
* is the facet specified by {@link FacetRequest#getCategoryPath()},
* and the enumerated children, {@link FacetResultNode#getSubResults()}, of each node in that
* {@link FacetResult} are the top K ( = {@link FacetRequest#getNumResults()}) among its children
* in the taxonomy.
* Top in the sense {@link FacetRequest#getSortBy()},
* which can be by the values aggregated in the count arrays, or by ordinal numbers;
* also specified is the sort order, {@link FacetRequest#getSortOrder()},
* ascending or descending, of these values or ordinals before their top K are selected.
* The depth (number of levels excluding the root) of the
* {@link FacetResult} tree is specified by {@link FacetRequest#getDepth()}.
* <p>
* Because the number of selected children of each node is restricted,
* and not the overall number of nodes in the {@link FacetResult}, facets not selected
* into {@link FacetResult} might have better values, or ordinals, (typically,
* higher counts), than facets that are selected into the {@link FacetResult}.
* <p>
* The generated {@link FacetResult} also provides with
* {@link FacetResult#getNumValidDescendants()}, which returns the total number of facets
* that are descendants of the root node, no deeper than {@link FacetRequest#getDepth()}, and
* which have valid value. The rootnode itself is not counted here.
* Valid value is determined by the {@link FacetResultsHandler}.
* {@link TopKInEachNodeHandler} defines valid as != 0.
* <p>
* <b>NOTE:</b> this code relies on the assumption that {@link TaxonomyReader#INVALID_ORDINAL} == -1, a smaller
* value than any valid ordinal.
*
* @lucene.experimental
*/
public class TopKInEachNodeHandler extends FacetResultsHandler {
/**
 * Construct a handler serving the given facet request, resolving and
 * labeling categories against the given taxonomy reader.
 */
public TopKInEachNodeHandler(TaxonomyReader taxonomyReader,
    FacetRequest facetRequest) {
  super(taxonomyReader, facetRequest);
}
/**
 * Recursively explore all facets that can be potentially included in the
 * {@link FacetResult} to be generated, and that belong to the given
 * partition, so that values can be examined and collected. For each such
 * node, gather its top K ({@link FacetRequest#getNumResults()}) children
 * among its children that are encountered in the given particular partition
 * (aka current counting list).
 *
 * @return {@link IntermediateFacetResult} consisting of
 *         {@link IntToObjectMap} that maps potential
 *         {@link FacetResult} nodes to their top K children encountered in
 *         the current partition. Note that the mapped potential tree nodes
 *         need not belong to the given partition, only the top K children
 *         mapped to. The aim is to identify nodes that are certainly excluded
 *         from the {@link FacetResult} to be eventually (after going through
 *         all the partitions) returned by this handler, because they have K
 *         better siblings, already identified in this partition. For the
 *         identified excluded nodes, we only count number of their
 *         descendants in the subtree (to be included in
 *         {@link FacetResult#getNumValidDescendants()}), but not bother with
 *         selecting top K in these generations, which, by definition, are,
 *         too, excluded from the FacetResult tree.
 * @param arrays the already filled in count array, potentially only covering
 *        one partition: the ordinals ranging from
 * @param offset to <code>offset</code> + the length of the count arrays
 *        within <code>arrays</code> (exclusive)
 * @throws IOException in case
 *         {@link TaxonomyReader#getOrdinal(org.apache.lucene.facet.taxonomy.CategoryPath)}
 *         does.
 * @see FacetResultsHandler#fetchPartitionResult(FacetArrays, int)
 */
@Override
public IntermediateFacetResult fetchPartitionResult(FacetArrays arrays, int offset) throws IOException {

  // get the root of the result tree to be returned, and the depth of that result tree
  // (depth means number of node levels excluding the root).
  int rootNode = this.taxonomyReader.getOrdinal(this.facetRequest.getCategoryPath());
  if (rootNode == TaxonomyReader.INVALID_ORDINAL) {
    return null;
  }

  int K = Math.min(facetRequest.getNumResults(), taxonomyReader.getSize()); // number of best results in each node

  // this will grow into the returned IntermediateFacetResult
  IntToObjectMap<AACO> AACOsOfOnePartition = new IntToObjectMap<AACO>();

  int partitionSize = arrays.getArraysLength(); // all partitions, except, possibly, the last,
  // have the same length. Hence modulo is OK.

  int depth = facetRequest.getDepth();

  if (depth == 0) {
    // Need to only have root node.
    IntermediateFacetResultWithHash tempFRWH = new IntermediateFacetResultWithHash(
        facetRequest, AACOsOfOnePartition);
    if (isSelfPartition(rootNode, arrays, offset)) {
      tempFRWH.isRootNodeIncluded = true;
      tempFRWH.rootNodeValue = this.facetRequest.getValueOf(arrays, rootNode % partitionSize);
    }
    return tempFRWH;
  }

  if (depth > Short.MAX_VALUE - 3) {
    // the stacks below are sized depth+2; clamp depth so that fits comfortably
    depth = Short.MAX_VALUE - 3;
  }

  int endOffset = offset + partitionSize; // one past the largest ordinal in the partition
  ChildrenArrays childrenArray = taxonomyReader.getChildrenArrays();
  int[] youngestChild = childrenArray.getYoungestChildArray();
  int[] olderSibling = childrenArray.getOlderSiblingArray();
  int totalNumOfDescendantsConsidered = 0; // total number of facets with value != 0,
  // in the tree. These include those selected as top K in each node, and all the others that
  // were not. Not including rootNode

  // the following priority queue will be used again and again for each node recursed into
  // to select its best K children among its children encountered in the given partition
  PriorityQueue<AggregatedCategory> pq =
      new AggregatedCategoryHeap(K, this.getSuitableACComparator());

  // reusables will feed the priority queue in each use
  AggregatedCategory[] reusables = new AggregatedCategory[2 + K];
  for (int i = 0; i < reusables.length; i++) {
    reusables[i] = new AggregatedCategory(1, 0);
  }

  /*
   * The returned map is built by a recursive visit of potential tree nodes. Nodes
   * determined to be excluded from the FacetResult are not recursively explored as others,
   * they are only recursed in order to count the number of their descendants.
   * Also, nodes that they and any of their descendants can not be mapped into facets encountered
   * in this partition, are, too, explored no further. These are facets whose ordinal
   * numbers are greater than the ordinals of the given partition. (recall that the Taxonomy
   * maintains that a parent ordinal is smaller than any of its descendants' ordinals).
   * So, when scanning over all children of a potential tree node n: (1) all children with ordinal number
   * greater than those in the given partition are skipped over, (2) among the children of n residing
   * in this partition, the best K children are selected (using pq) for usual further recursion
   * and the rest (those rejected out from the pq) are only recursed for counting total number
   * of descendants, and (3) all the children of ordinal numbers smaller than the given partition
   * are further explored in the usual way, since these may lead to descendants residing in this partition.
   *
   * ordinalStack drives the recursive descent.
   * Top of stack holds the current node which we recurse from.
   * ordinalStack[0] holds the root of the facetRequest, and
   * it is always maintained that parent(ordianlStack[i]) = ordinalStack[i-1].
   * localDepth points to the current top of ordinalStack.
   * Only top of ordinalStack can be TaxonomyReader.INVALID_ORDINAL, and this if and only if
   * the element below it explored all its relevant children.
   */
  int[] ordinalStack = new int[depth + 2]; // for 0 and for invalid on top
  ordinalStack[0] = rootNode;
  int localDepth = 0;

  /*
   * bestSignlingsStack[i] maintains the best K children of ordinalStack[i-1], namely,
   * the best K siblings of ordinalStack[i], best K among those residing in the given partition.
   * Note that the residents of ordinalStack need not belong
   * to the current partition, only the residents of bestSignlingsStack.
   * When exploring the children of ordianlStack[i-1] that reside in the current partition
   * (after the top K of them have been determined and stored into bestSignlingsStack[i]),
   * siblingExplored[i] points into bestSignlingsStack[i], to the child now explored, hence
   * residing in ordinalStack[i], and firstToTheLeftOfPartition[i] holds the largest ordinal of
   * a sibling smaller than the ordinals in the partition.
   * When siblingExplored[i] == max int, the top K siblings of ordinalStack[i] among those siblings
   * that reside in this partition have not been determined yet.
   * if siblingExplored[i] < 0, the node in ordinalStack[i] is to the left of partition
   * (i.e. of a smaller ordinal than the current partition)
   * (step (3) above is executed for the children of ordianlStack[i-1])
   */
  int[][] bestSignlingsStack = new int[depth + 2][];
  int[] siblingExplored = new int[depth + 2];
  int[] firstToTheLeftOfPartition = new int[depth + 2];

  int tosOrdinal; // top of stack element, the ordinal at the top of stack

  /*
   * to start the loop, complete the datastructures for root node:
   * push its youngest child to ordinalStack; make a note in siblingExplored[] that the children
   * of rootNode, which reside in the current partition have not been read yet to select the top
   * K of them. Also, make rootNode as if, related to its parent, rootNode belongs to the children
   * of ordinal numbers smaller than those of the current partition (this will ease on end condition --
   * we can continue to the older sibling of rootNode once the localDepth goes down, before we verify that
   * it went that down)
   */
  ordinalStack[++localDepth] = youngestChild[rootNode];
  siblingExplored[localDepth] = Integer.MAX_VALUE; // we have not verified position wrt current partition
  siblingExplored[0] = -1; // as if rootNode resides to the left of current position

  /*
   * now the whole recursion: loop as long as stack is not empty of elements descendants of
   * facetRequest's root.
   */
  while (localDepth > 0) {
    tosOrdinal = ordinalStack[localDepth];
    if (tosOrdinal == TaxonomyReader.INVALID_ORDINAL) {
      // the brotherhood that has been occupying the top of stack is all exhausted.
      // Hence, element below tos, namely, father of tos, has all its children,
      // and itself, all explored.
      localDepth--;
      // replace this father, now on top of stack, by this father's sibling:
      // this parent's ordinal can not be greater than current partition, as otherwise
      // its child, now just removed, would not have been pushed on it.
      // so the father is either inside the partition, or smaller ordinal
      if (siblingExplored[localDepth] < 0) {
        ordinalStack[localDepth] = olderSibling[ordinalStack[localDepth]];
        continue;
      }
      // in this point, siblingExplored[localDepth] between 0 and number of bestSiblings
      // it can not be max int
      siblingExplored[localDepth]--;
      if (siblingExplored[localDepth] == -1) {
        //siblings residing in the partition have been all processed, we now move
        // to those of ordinal numbers smaller than the partition
        ordinalStack[localDepth] = firstToTheLeftOfPartition[localDepth];
      } else {
        // still explore siblings residing in the partition
        // just move to the next one
        ordinalStack[localDepth] = bestSignlingsStack[localDepth][siblingExplored[localDepth]];
      }
      continue;
    } // endof tosOrdinal is invalid, and hence removed, and its parent was replaced by this
    // parent's sibling

    // now try to push a kid, but first look at tos whether it 'deserves' its kids explored:
    // it is not to the right of current partition, and we know whether to only count or to
    // select best K siblings.
    if (siblingExplored[localDepth] == Integer.MAX_VALUE) {
      //tosOrdinal was not examined yet for its position relative to current partition
      // and the best K of current partition, among its siblings, have not been determined yet
      while (tosOrdinal >= endOffset) {
        tosOrdinal = olderSibling[tosOrdinal];
      }
      // now it is inside. Run it and all its siblings inside the partition through a heap
      // and in doing so, count them, find best K, and sum into residue
      double residue = 0f; // the sum of all the siblings from this partition that do not make
      // it to top K
      pq.clear();

      //reusables are consumed as from a stack. The stack starts full and returns full.
      int tosReuslables = reusables.length - 1;

      while (tosOrdinal >= offset) { // while tosOrdinal belongs to the given partition; here, too, we use the fact
        // that TaxonomyReader.INVALID_ORDINAL == -1 < offset
        double value = facetRequest.getValueOf(arrays, tosOrdinal % partitionSize);
        if (value != 0) { // the value of yc is not 0, it is to be considered.
          totalNumOfDescendantsConsidered++;

          // consume one reusable, and push to the priority queue
          AggregatedCategory ac = reusables[tosReuslables--];
          ac.ordinal = tosOrdinal;
          ac.value = value;
          ac = pq.insertWithOverflow(ac);
          if (null != ac) {
            residue += ac.value;
            // TODO (Facet): could it be that we need to do something
            // else, not add, depending on the aggregator?

            /* when a facet is excluded from top K, because already in this partition it has
             * K better siblings, it is only recursed for count only.
             */
            // update totalNumOfDescendants by the now excluded node and all its descendants
            totalNumOfDescendantsConsidered--; // reduce the 1 earned when the excluded node entered the heap
            // and now return it and all its descendants. These will never make it to FacetResult
            totalNumOfDescendantsConsidered += countOnly(ac.ordinal, youngestChild,
                olderSibling, arrays, partitionSize, offset, endOffset, localDepth, depth);
            reusables[++tosReuslables] = ac;
          }
        }
        tosOrdinal = olderSibling[tosOrdinal];
      }
      // now pq has best K children of ordinals that belong to the given partition.
      // Populate a new AACO with them.
      // tosOrdinal is now first sibling smaller than partition, make a note of that
      firstToTheLeftOfPartition[localDepth] = tosOrdinal;
      int aaci = pq.size();
      int[] ords = new int[aaci];
      double[] vals = new double[aaci];
      while (aaci > 0) {
        AggregatedCategory ac = pq.pop();
        ords[--aaci] = ac.ordinal;
        vals[aaci] = ac.value;
        reusables[++tosReuslables] = ac;
      }
      // if more than 0 ordinals, add this AACO to the map to be returned,
      // and add ords to sibling stack, and make a note in siblingExplored that these are to
      // be visited now
      if (ords.length > 0) {
        AACOsOfOnePartition.put(ordinalStack[localDepth - 1], new AACO(ords, vals, residue));
        bestSignlingsStack[localDepth] = ords;
        siblingExplored[localDepth] = ords.length - 1;
        ordinalStack[localDepth] = ords[ords.length - 1];
      } else {
        // no ordinals siblings of tosOrdinal in current partition, move to the left of it
        // tosOrdinal is already there (to the left of partition).
        // make a note of it in siblingExplored
        ordinalStack[localDepth] = tosOrdinal;
        siblingExplored[localDepth] = -1;
      }
      continue;
    } // endof we did not check the position of a valid ordinal wrt partition

    // now tosOrdinal is a valid ordinal, inside partition or to the left of it, we need
    // to push its kids on top of it, if not too deep.
    // Make a note that we did not check them yet
    if (localDepth >= depth) {
      // localDepth == depth; current tos exhausted its possible children, mark this by pushing INVALID_ORDINAL
      ordinalStack[++localDepth] = TaxonomyReader.INVALID_ORDINAL;
      continue;
    }
    ordinalStack[++localDepth] = youngestChild[tosOrdinal];
    siblingExplored[localDepth] = Integer.MAX_VALUE;
  } // endof loop while stack is not empty

  // now generate a TempFacetResult from AACOsOfOnePartition, and consider self.
  IntermediateFacetResultWithHash tempFRWH = new IntermediateFacetResultWithHash(
      facetRequest, AACOsOfOnePartition);
  if (isSelfPartition(rootNode, arrays, offset)) {
    tempFRWH.isRootNodeIncluded = true;
    tempFRWH.rootNodeValue = this.facetRequest.getValueOf(arrays, rootNode % partitionSize);
  }
  tempFRWH.totalNumOfFacetsConsidered = totalNumOfDescendantsConsidered;
  return tempFRWH;
}
/**
* Recursively count <code>ordinal</code>, whose depth is <code>currentDepth</code>,
* and all its descendants down to <code>maxDepth</code> (including),
* descendants whose value in the count arrays, <code>arrays</code>, is != 0.
* The count arrays only includes the current partition, from <code>offset</code>, to (exclusive)
* <code>endOffset</code>.
* It is assumed that <code>ordinal</code> < <code>endOffset</code>,
* otherwise, not <code>ordinal</code>, and none of its descendants, reside in
* the current partition. <code>ordinal</code> < <code>offset</code> is allowed,
* as ordinal's descendants might be >= <code>offeset</code>.
*
* @param ordinal a facet ordinal.
* @param youngestChild mapping a given ordinal to its youngest child in the taxonomy (of largest ordinal number),
* or to -1 if has no children.
* @param olderSibling mapping a given ordinal to its older sibling, or to -1
* @param arrays values for the ordinals in the given partition
* @param offset the first (smallest) ordinal in the given partition
* @param partitionSize number of ordinals in the given partition
* @param endOffset one larger than the largest ordinal that belong to this partition
* @param currentDepth the depth or ordinal in the TaxonomyTree (relative to rootnode of the facetRequest)
* @param maxDepth maximal depth of descendants to be considered here (measured relative to rootnode of the
* facetRequest).
*
* @return the number of nodes, from ordinal down its descendants, of depth <= maxDepth,
* which reside in the current partition, and whose value != 0
*/
private int countOnly(int ordinal, int[] youngestChild, int[] olderSibling,
FacetArrays arrays, int partitionSize, int offset,
int endOffset, int currentDepth, int maxDepth) {
int ret = 0;
if (offset <= ordinal) {
// ordinal belongs to the current partition
if (0 != facetRequest.getValueOf(arrays, ordinal % partitionSize)) {
ret++;
}
}
// now consider children of ordinal, if not too deep
if (currentDepth >= maxDepth) {
return ret;
}
int yc = youngestChild[ordinal];
while (yc >= endOffset) {
yc = olderSibling[yc];
}
while (yc > TaxonomyReader.INVALID_ORDINAL) { // assuming this is -1, smaller than any legal ordinal
ret += countOnly (yc, youngestChild, olderSibling, arrays,
partitionSize, offset, endOffset, currentDepth+1, maxDepth);
yc = olderSibling[yc];
}
return ret;
}
  /**
   * Merge several partitions' {@link IntermediateFacetResult}-s into one of the
   * same format.
   * <p>
   * The merge is destructive: the first non-null input serves as the accumulator
   * and is returned. For each parent ordinal, at most K
   * ( = {@link FacetRequest#getNumResults()}) best children are kept, and the
   * values of children rejected from the top K are folded into the residue.
   *
   * @see FacetResultsHandler#mergeResults(IntermediateFacetResult...)
   */
  @Override
  public IntermediateFacetResult mergeResults(IntermediateFacetResult... tmpResults)
  throws ClassCastException, IllegalArgumentException {

    if (tmpResults.length == 0) {
      return null;
    }

    int i=0;
    // skip over null tmpResults
    for (; (i < tmpResults.length)&&(tmpResults[i] == null); i++) {}
    if (i == tmpResults.length) {
      // all inputs are null
      return null;
    }
    // i points to the first non-null input
    int K = this.facetRequest.getNumResults(); // number of best results in each node
    IntermediateFacetResultWithHash tmpToReturn = (IntermediateFacetResultWithHash)tmpResults[i++];
    // now loop over the rest of tmpResults and merge each into tmpToReturn
    for ( ; i < tmpResults.length; i++) {
      IntermediateFacetResultWithHash tfr = (IntermediateFacetResultWithHash)tmpResults[i];
      tmpToReturn.totalNumOfFacetsConsidered += tfr.totalNumOfFacetsConsidered;
      if (tfr.isRootNodeIncluded) {
        tmpToReturn.isRootNodeIncluded = true;
        tmpToReturn.rootNodeValue = tfr.rootNodeValue;
      }
      // now merge the HashMap of tfr into that of tmpToReturn
      IntToObjectMap<AACO> tmpToReturnMapToACCOs = tmpToReturn.mapToAACOs;
      IntToObjectMap<AACO> tfrMapToACCOs = tfr.mapToAACOs;
      IntIterator tfrIntIterator = tfrMapToACCOs.keyIterator();
      // iterate over all ordinals in tfr that map to their children (and the residue
      // over the non-included children)
      while (tfrIntIterator.hasNext()) {
        int tfrkey = tfrIntIterator.next();
        AACO tmpToReturnAACO = null;
        if (null == (tmpToReturnAACO = tmpToReturnMapToACCOs.get(tfrkey))) {
          // if tmpToReturn does not have any kids of tfrkey, map all the kids
          // from tfr to it as one package, along with their residue
          tmpToReturnMapToACCOs.put(tfrkey, tfrMapToACCOs.get(tfrkey));
        } else {
          // merge the best K children of tfrkey as they appear in tmpToReturn and in tfr
          AACO tfrAACO = tfrMapToACCOs.get(tfrkey);
          int resLength = tfrAACO.ordinals.length + tmpToReturnAACO.ordinals.length;
          if (K < resLength) {
            resLength = K; // keep no more than the K best of the union
          }
          int[] resOrds = new int [resLength];
          double[] resVals = new double [resLength];
          // residues always accumulate, regardless of which children survive
          double resResidue = tmpToReturnAACO.residue + tfrAACO.residue;
          int indexIntoTmpToReturn = 0;
          int indexIntoTFR = 0;
          ACComparator merger = getSuitableACComparator(); // by facet request
          // classic two-way merge: both AACO arrays are already sorted best-first
          for (int indexIntoRes = 0; indexIntoRes < resLength; indexIntoRes++) {
            if (indexIntoTmpToReturn >= tmpToReturnAACO.ordinals.length) {
              // tmpToReturnAACO (former result to return) ran out of indices;
              // it is all merged into resOrds and resVals
              resOrds[indexIntoRes] = tfrAACO.ordinals[indexIntoTFR];
              resVals[indexIntoRes] = tfrAACO.values[indexIntoTFR];
              indexIntoTFR++;
              continue;
            }
            if (indexIntoTFR >= tfrAACO.ordinals.length) {
              // tfr ran out of indices
              resOrds[indexIntoRes] = tmpToReturnAACO.ordinals[indexIntoTmpToReturn];
              resVals[indexIntoRes] = tmpToReturnAACO.values[indexIntoTmpToReturn];
              indexIntoTmpToReturn++;
              continue;
            }
            // select which goes now to res: next (ord, value) from tmpToReturn or from tfr:
            if (merger.leftGoesNow( tmpToReturnAACO.ordinals[indexIntoTmpToReturn],
                                    tmpToReturnAACO.values[indexIntoTmpToReturn],
                                    tfrAACO.ordinals[indexIntoTFR],
                                    tfrAACO.values[indexIntoTFR])) {
              resOrds[indexIntoRes] = tmpToReturnAACO.ordinals[indexIntoTmpToReturn];
              resVals[indexIntoRes] = tmpToReturnAACO.values[indexIntoTmpToReturn];
              indexIntoTmpToReturn++;
            } else {
              resOrds[indexIntoRes] = tfrAACO.ordinals[indexIntoTFR];
              resVals[indexIntoRes] = tfrAACO.values[indexIntoTFR];
              indexIntoTFR++;
            }
          } // end of merge of best kids of tfrkey that appear in tmpToReturn and its kids
            // that appear in tfr, altogether yielding no more than the best K kids for
            // tfrkey, to appear in the new shape of tmpToReturn
          // kids that did not make it into the best K are folded into the residue
          while (indexIntoTmpToReturn < tmpToReturnAACO.ordinals.length) {
            resResidue += tmpToReturnAACO.values[indexIntoTmpToReturn++];
          }
          while (indexIntoTFR < tfrAACO.ordinals.length) {
            resResidue += tfrAACO.values[indexIntoTFR++];
          }
          // update the list of best kids of tfrkey as they appear in tmpToReturn
          tmpToReturnMapToACCOs.put(tfrkey, new AACO(resOrds, resVals, resResidue));
        } // end of need to merge both AACO -- children and residue for same ordinal
      } // end of loop over all ordinals in tfr
    } // end of loop over all temporary facet results to merge
    return tmpToReturn;
  }
private static class AggregatedCategoryHeap extends PriorityQueue<AggregatedCategory> {
private ACComparator merger;
public AggregatedCategoryHeap(int size, ACComparator merger) {
super(size);
this.merger = merger;
}
@Override
protected boolean lessThan(AggregatedCategory arg1, AggregatedCategory arg2) {
return merger.leftGoesNow(arg2.ordinal, arg2.value, arg1.ordinal, arg1.value);
}
}
private static class ResultNodeHeap extends PriorityQueue<FacetResultNode> {
private ACComparator merger;
public ResultNodeHeap(int size, ACComparator merger) {
super(size);
this.merger = merger;
}
@Override
protected boolean lessThan(FacetResultNode arg1, FacetResultNode arg2) {
return merger.leftGoesNow(arg2.getOrdinal(), arg2.getValue(), arg1.getOrdinal(), arg1.getValue());
}
}
/**
* @return the {@link ACComparator} that reflects the order,
* expressed in the {@link FacetRequest}, of
* facets in the {@link FacetResult}.
*/
private ACComparator getSuitableACComparator() {
if (facetRequest.getSortOrder() == SortOrder.ASCENDING) {
switch (facetRequest.getSortBy()) {
case VALUE:
return new AscValueACComparator();
case ORDINAL:
return new AscOrdACComparator();
}
} else {
switch (facetRequest.getSortBy()) {
case VALUE:
return new DescValueACComparator();
case ORDINAL:
return new DescOrdACComparator();
}
}
return null;
}
  /**
   * A comparator of two Aggregated Categories according to the order
   * (ascending / descending) and item (ordinal or value) specified in the
   * FacetRequest for the FacetResult to be generated.
   */
  private static abstract class ACComparator {
    ACComparator() { }
    /**
     * @return true if the (ord1, val1) category should be emitted before the
     *         (ord2, val2) category in the requested result order.
     */
    protected abstract boolean leftGoesNow (int ord1, double val1, int ord2, double val2);
  }
private static final class AscValueACComparator extends ACComparator {
AscValueACComparator() { }
@Override
protected boolean leftGoesNow (int ord1, double val1, int ord2, double val2) {
return (val1 < val2);
}
}
private static final class DescValueACComparator extends ACComparator {
DescValueACComparator() { }
@Override
protected boolean leftGoesNow (int ord1, double val1, int ord2, double val2) {
return (val1 > val2);
}
}
private static final class AscOrdACComparator extends ACComparator {
AscOrdACComparator() { }
@Override
protected boolean leftGoesNow (int ord1, double val1, int ord2, double val2) {
return (ord1 < ord2);
}
}
private static final class DescOrdACComparator extends ACComparator {
DescOrdACComparator() { }
@Override
protected boolean leftGoesNow (int ord1, double val1, int ord2, double val2) {
return (ord1 > ord2);
}
}
/**
* Intermediate result to hold counts from one or more partitions processed
* thus far. Its main field, constructor parameter <i>mapToAACOs</i>, is a map
* from ordinals to AACOs. The AACOs mapped to contain ordinals and values
* encountered in the count arrays of the partitions processed thus far. The
* ordinals mapped from are their parents, and they may be not contained in
* the partitions processed thus far. All nodes belong to the taxonomy subtree
* defined at the facet request, constructor parameter <i>facetReq</i>, by its
* root and depth.
*/
public static class IntermediateFacetResultWithHash implements IntermediateFacetResult {
protected IntToObjectMap<AACO> mapToAACOs;
FacetRequest facetRequest;
boolean isRootNodeIncluded; // among the ordinals in the partitions
// processed thus far
double rootNodeValue; // the value of it, in case encountered.
int totalNumOfFacetsConsidered; // total number of facets
// which belong to facetRequest subtree and have value != 0,
// and have been encountered thus far in the partitions processed.
// root node of result tree is not included in this count.
public IntermediateFacetResultWithHash(FacetRequest facetReq,
IntToObjectMap<AACO> mapToAACOs) {
this.mapToAACOs = mapToAACOs;
this.facetRequest = facetReq;
this.isRootNodeIncluded = false;
this.rootNodeValue = 0.0;
this.totalNumOfFacetsConsidered = 0;
}
public FacetRequest getFacetRequest() {
return this.facetRequest;
}
} // endof FacetResultWithHash
/**
* Maintains info of one entry in the filled up count array:
* an ordinal number of a category and the value aggregated for it
* (typically, that value is the count for that ordinal).
*/
private static final class AggregatedCategory {
int ordinal;
double value;
AggregatedCategory(int ord, double val) {
this.ordinal = ord;
this.value = val;
}
}
/**
* Maintains an array of {@link AggregatedCategory}. For space consideration, this is implemented as
* a pair of arrays, <i>ordinals</i> and <i>values</i>, rather than one array of pairs.
* Enumerated in <i>ordinals</i> are siblings,
* potential nodes of the {@link FacetResult} tree
* (i.e., the descendants of the root node, no deeper than the specified depth).
* No more than K ( = {@link FacetRequest#getNumResults()})
* siblings are enumerated, and
* <i>residue</i> holds the sum of values of the siblings rejected from the
* enumerated top K.
*/
private static final class AACO {
int [] ordinals; // ordinals of the best K children, sorted from best to least
double [] values; // the respective values for these children
double residue; // sum of values of all other children, that did not get into top K
AACO (int[] ords, double[] vals, double r) {
this.ordinals = ords;
this.values = vals;
this.residue = r;
}
}
@Override
/**
* Recursively label the first facetRequest.getNumLabel() sub results
* of the root of a given {@link FacetResult}, or of an already labeled node in it.
* I.e., a node is labeled only if it is the root or all its ancestors are labeled.
*/
public void labelResult(FacetResult facetResult) throws IOException {
if (facetResult == null) {
return; // any result to label?
}
FacetResultNode rootNode = facetResult.getFacetResultNode();
recursivelyLabel(rootNode, facetRequest.getNumLabel());
}
private void recursivelyLabel(FacetResultNode node, int numToLabel) throws IOException {
if (node == null) {
return;
}
node.getLabel(this.taxonomyReader); // attach a label -- category path -- to the node
if (null == node.getSubResults()) {
return; // if node has no children -- done
}
// otherwise, label the first numToLabel of these children, and recursively -- their children.
int numLabeled = 0;
for (FacetResultNode frn : node.getSubResults()) {
// go over the children of node from first to last, no more than numToLable of them
recursivelyLabel(frn, numToLabel);
if (++numLabeled >= numToLabel) {
return;
}
}
}
@Override
// verifies that the children of each node are sorted by the order
// specified by the facetRequest.
// the values in these nodes may have changed due to a re-count, for example
// following the accumulation by Sampling.
// so now we test and re-order if necessary.
public FacetResult rearrangeFacetResult(FacetResult facetResult) {
PriorityQueue<FacetResultNode> nodesHeap =
new ResultNodeHeap(this.facetRequest.getNumResults(), this.getSuitableACComparator());
MutableFacetResultNode topFrn = (MutableFacetResultNode) facetResult.getFacetResultNode(); // safe cast
rearrangeChilrenOfNode(topFrn, nodesHeap);
return facetResult;
}
private void rearrangeChilrenOfNode(FacetResultNode node,
PriorityQueue<FacetResultNode> nodesHeap) {
nodesHeap.clear(); // just to be safe
for (FacetResultNode frn : node.getSubResults()) {
nodesHeap.add(frn);
}
int size = nodesHeap.size();
ArrayList<FacetResultNode> subResults = new ArrayList<FacetResultNode>(size);
while (nodesHeap.size()>0) {
subResults.add(0,nodesHeap.pop());
}
((MutableFacetResultNode)node).setSubResults(subResults);
for (FacetResultNode frn : node.getSubResults()) {
rearrangeChilrenOfNode(frn, nodesHeap);
}
}
@Override
public FacetResult renderFacetResult(IntermediateFacetResult tmpResult) throws IOException {
IntermediateFacetResultWithHash tmp = (IntermediateFacetResultWithHash) tmpResult;
int ordinal = this.taxonomyReader.getOrdinal(this.facetRequest.getCategoryPath());
if ((tmp == null) || (ordinal == TaxonomyReader.INVALID_ORDINAL)) {
return null;
}
double value = Double.NaN;
if (tmp.isRootNodeIncluded) {
value = tmp.rootNodeValue;
}
MutableFacetResultNode root = generateNode (ordinal, value, tmp.mapToAACOs);
return new FacetResult (tmp.facetRequest, root, tmp.totalNumOfFacetsConsidered);
}
private MutableFacetResultNode generateNode (int ordinal, double val, IntToObjectMap<AACO> mapToAACOs) {
MutableFacetResultNode node = new MutableFacetResultNode(ordinal, val);
AACO aaco = mapToAACOs.get(ordinal);
if (null == aaco) {
return node;
}
List<FacetResultNode> list = new ArrayList<FacetResultNode>();
for (int i = 0; i < aaco.ordinals.length; i++) {
list.add(generateNode(aaco.ordinals[i], aaco.values[i], mapToAACOs));
}
node.setSubResults(list);
node.setResidue(aaco.residue);
return node;
}
}

View File

@ -0,0 +1,188 @@
package org.apache.lucene.facet.search;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.search.aggregator.Aggregator;
import org.apache.lucene.facet.search.aggregator.CountingAggregator;
import org.apache.lucene.facet.search.cache.CategoryListCache;
import org.apache.lucene.facet.search.cache.CategoryListData;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.util.PartitionsUtils;
import org.apache.lucene.facet.util.ScoredDocIdsUtils;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Maintain Total Facet Counts per partition, for given parameters:
* <ul>
* <li>Index reader of an index</li>
* <li>Taxonomy index reader</li>
* <li>Facet indexing params (and particularly the category list params)</li>
* <li></li>
* </ul>
* The total facet counts are maintained as an array of arrays of integers,
* where a separate array is kept for each partition.
*
* @lucene.experimental
*/
public class TotalFacetCounts {

  /** total facet counts per partition: totalCounts[partition][ordinal%partitionLength] */
  private int[][] totalCounts = null;

  private final TaxonomyReader taxonomy;
  private final FacetIndexingParams facetIndexingParams;

  // Monotonic generation stamp; lets tests distinguish instances from one another.
  private final static AtomicInteger atomicGen4Test = new AtomicInteger(1);
  /** Creation type for test purposes */
  enum CreationType { Computed, Loaded } // for testing

  final int gen4test;
  final CreationType createType4test;

  /**
   * Construct from already computed or loaded counts.
   * <p>
   * Fix: this private constructor previously declared
   * {@code throws IOException, LockObtainFailedException}, although it cannot
   * throw either. The bogus clause has been removed; being private, no
   * external callers are affected.
   *
   * @param taxonomy taxonomy reader these counts were computed/loaded against
   * @param facetIndexingParams facet indexing parameters (define the partitioning)
   * @param counts per-partition count arrays (entries may be null)
   * @param createType4Test whether the counts were computed or loaded (tests only)
   */
  private TotalFacetCounts (TaxonomyReader taxonomy, FacetIndexingParams facetIndexingParams,
      int[][] counts, CreationType createType4Test) {
    this.taxonomy = taxonomy;
    this.facetIndexingParams = facetIndexingParams;
    this.totalCounts = counts;
    this.createType4test = createType4Test;
    this.gen4test = atomicGen4Test.incrementAndGet();
  }

  /**
   * Fill a partition's array with the TotalCountsArray values.
   * @param partitionArray array to fill; its length defines the partition size
   * @param partition number of required partition
   */
  public void fillTotalCountsForPartition(int[] partitionArray, int partition) {
    int partitionSize = partitionArray.length;
    int[] countArray = totalCounts[partition];
    if (countArray == null) {
      // lazily materialize an all-zero array for a partition never counted into
      countArray = new int[partitionSize];
      totalCounts[partition] = countArray;
    }
    int length = Math.min(partitionSize, countArray.length);
    System.arraycopy(countArray, 0, partitionArray, 0, length);
  }

  /**
   * Return the total count of an input category.
   * @param ordinal ordinal of category whose total count is required
   */
  public int getTotalCount(int ordinal) {
    int partition = PartitionsUtils.partitionNumber(facetIndexingParams,ordinal);
    int offset = ordinal % PartitionsUtils.partitionSize(facetIndexingParams, taxonomy);
    return totalCounts[partition][offset];
  }

  /**
   * Read a {@link TotalFacetCounts} from a file previously written by
   * {@link #storeToFile(File, TotalFacetCounts)}.
   * <p>
   * File layout: number of partitions, then for each partition its length
   * (-1 marking a null array) followed by that many ints.
   */
  static TotalFacetCounts loadFromFile(File inputFile, TaxonomyReader taxonomy,
      FacetIndexingParams facetIndexingParams) throws IOException {
    DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(inputFile)));
    try {
      int[][] counts = new int[dis.readInt()][];
      for (int i=0; i<counts.length; i++) {
        int size = dis.readInt();
        if (size<0) {
          counts[i] = null; // -1 sentinel: this partition was stored as null
        } else {
          counts[i] = new int[size];
          for (int j=0; j<size; j++) {
            counts[i][j] = dis.readInt();
          }
        }
      }
      return new TotalFacetCounts(taxonomy, facetIndexingParams, counts, CreationType.Loaded);
    } finally {
      dis.close();
    }
  }

  /**
   * Write {@code tfc} to {@code outputFile}, in the format read back by
   * {@link #loadFromFile(File, TaxonomyReader, FacetIndexingParams)}.
   */
  static void storeToFile(File outputFile, TotalFacetCounts tfc) throws IOException {
    DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(outputFile)));
    try {
      dos.writeInt(tfc.totalCounts.length);
      for (int[] counts : tfc.totalCounts) {
        if (counts == null) {
          dos.writeInt(-1); // sentinel for a null partition array
        } else {
          dos.writeInt(counts.length);
          for (int i : counts) {
            dos.writeInt(i);
          }
        }
      }
    } finally {
      dos.close();
    }
  }

  /**
   * Compute the total facet counts by aggregating over all documents of
   * {@code indexReader}, counting into one array per partition, with the
   * complements optimization disabled.
   *
   * @param clCache category list cache for faster iteration, may be null
   */
  static TotalFacetCounts compute(final IndexReader indexReader,
      final TaxonomyReader taxonomy, final FacetIndexingParams facetIndexingParams,
      final CategoryListCache clCache) throws IOException {
    int partitionSize = PartitionsUtils.partitionSize(facetIndexingParams, taxonomy);
    final int[][] counts = new int[(int) Math.ceil(taxonomy.getSize() /(float) partitionSize)][partitionSize];
    FacetSearchParams newSearchParams = new FacetSearchParams(facetIndexingParams);
    FacetsAccumulator fe = new StandardFacetsAccumulator(newSearchParams, indexReader, taxonomy) {
      @Override
      protected HashMap<CategoryListIterator, Aggregator> getCategoryListMap(
          FacetArrays facetArrays, int partition) throws IOException {
        // All category lists share one aggregator that counts into this partition's array.
        Aggregator aggregator = new CountingAggregator(counts[partition]);
        HashMap<CategoryListIterator, Aggregator> map = new HashMap<CategoryListIterator, Aggregator>();
        for (CategoryListParams clp: facetIndexingParams.getAllCategoryListParams()) {
          final CategoryListIterator cli = clIteraor(clCache, clp, indexReader, partition);
          map.put(cli, aggregator);
        }
        return map;
      }
    };
    fe.setComplementThreshold(FacetsAccumulator.DISABLE_COMPLEMENT);
    fe.accumulate(ScoredDocIdsUtils.createAllDocsScoredDocIDs(indexReader));
    return new TotalFacetCounts(taxonomy, facetIndexingParams, counts, CreationType.Computed);
  }

  /**
   * Obtain a {@link CategoryListIterator} for the given params and partition,
   * preferring a cached one from {@code clCache} when available.
   * (The method name keeps its historical misspelling of "iterator": it is
   * package-visible and may have callers elsewhere in the module.)
   */
  static CategoryListIterator clIteraor(CategoryListCache clCache, CategoryListParams clp,
      IndexReader indexReader, int partition) throws IOException {
    if (clCache != null) {
      CategoryListData cld = clCache.get(clp);
      if (cld != null) {
        return cld.iterator(partition);
      }
    }
    return clp.createCategoryListIterator(indexReader, partition);
  }
}

View File

@ -0,0 +1,285 @@
package org.apache.lucene.facet.search;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.search.cache.CategoryListCache;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Manage an LRU cache for {@link TotalFacetCounts} per index, taxonomy, and
* facet indexing params.
*
* @lucene.experimental
*/
public final class TotalFacetCountsCache {
  /**
   * Default size of the in-memory cache of computed total facet counts.
   * Set to 2 for the case when an application reopened a reader and
   * the original one is still in use (otherwise the cache would keep
   * switching back and forth between the two).
   */
  public static final int DEFAULT_CACHE_SIZE = 2;

  // Eagerly initialized process-wide singleton; see getSingleton().
  private static final TotalFacetCountsCache singleton = new TotalFacetCountsCache();
  /**
   * Get the single instance of this cache.
   * @return the process-wide singleton {@link TotalFacetCountsCache}
   */
  public static TotalFacetCountsCache getSingleton() {
    return singleton;
  }
  /**
   * In-memory cache of TFCs.
   * <ul>
   * <li>Its size is kept within limits through {@link #trimCache()}.
   * <li>An LRU eviction policy is applied, by maintaining active keys in {@link #lruKeys}.
   * <li>After each addition to the cache, trimCache is called, to remove entries least recently used.
   * </ul>
   * @see #markRecentlyUsed(TFCKey)
   */
  private ConcurrentHashMap<TFCKey,TotalFacetCounts> cache = new ConcurrentHashMap<TFCKey,TotalFacetCounts>();

  /**
   * A queue of active keys for applying LRU policy on eviction from the {@link #cache}.
   * @see #markRecentlyUsed(TFCKey)
   */
  private ConcurrentLinkedQueue<TFCKey> lruKeys = new ConcurrentLinkedQueue<TFCKey>();

  // Upper bound enforced by trimCache(); adjustable via the cache-size setter.
  private int maxCacheSize = DEFAULT_CACHE_SIZE;

  /** private constructor for singleton pattern */
  private TotalFacetCountsCache() {
  }
/**
* Get the total facet counts for a reader/taxonomy pair and facet indexing parameters.
* If not in cache, computed here and added to the cache for later use.
* @param indexReader the documents index
* @param taxonomy the taxonomy index
* @param facetIndexingParams facet indexing parameters
* @param clCache category list cache for faster computation, can be null
* @return the total facet counts.
*/
public TotalFacetCounts getTotalCounts(IndexReader indexReader, TaxonomyReader taxonomy,
FacetIndexingParams facetIndexingParams, CategoryListCache clCache) throws IOException {
// create the key
TFCKey key = new TFCKey(indexReader, taxonomy, facetIndexingParams);
// it is important that this call is not synchronized, so that available TFC
// would not wait for one that needs to be computed.
TotalFacetCounts tfc = cache.get(key);
if (tfc != null) {
markRecentlyUsed(key);
return tfc;
}
return computeAndCache(key, clCache);
}
  /**
   * Mark key as recently used.
   * <p>
   * <b>Implementation notes: Synchronization considerations and the interaction between lruKeys and cache:</b>
   * <ol>
   * <li>A concurrent {@link LinkedHashMap} would have made this class much simpler.
   * But unfortunately, Java does not provide one.
   * Instead, we combine two concurrent objects:
   * <ul>
   * <li>{@link ConcurrentHashMap} for the cached TFCs.
   * <li>{@link ConcurrentLinkedQueue} for active keys
   * </ul>
   * <li>Both {@link #lruKeys} and {@link #cache} are concurrently safe.
   * <li>Checks for a cached item through getTotalCounts() are not synchronized.
   * Therefore, the case that a needed TFC is in the cache is very fast:
   * it does not wait for the computation of other TFCs.
   * <li>computeAndCache() is synchronized, and, has a (double) check of the required
   * TFC, to avoid computing the same TFC twice.
   * <li>A race condition in this method (markRecentlyUsed) might result in two copies
   * of the same 'key' in lruKeys, but this is handled by the loop in trimCache(),
   * where an attempt to remove the same key twice is a no-op.
   * </ol>
   */
  private void markRecentlyUsed(TFCKey key) {
    // remove-then-add moves the key to the tail (most recently used) position
    lruKeys.remove(key);
    lruKeys.add(key);
  }
private synchronized void trimCache() {
// loop until cache is of desired size.
while (cache.size()>maxCacheSize ) {
TFCKey key = lruKeys.poll();
if (key==null) { //defensive
// it is defensive since lruKeys presumably covers the cache keys
key = cache.keys().nextElement();
}
// remove this element. Note that an attempt to remove with the same key again is a no-op,
// which gracefully handles the possible race in markRecentlyUsed().
cache.remove(key);
}
}
/**
* compute TFC and cache it, after verifying it was not just added - for this
* matter this method is synchronized, which is not too bad, because there is
* lots of work done in the computations.
*/
private synchronized TotalFacetCounts computeAndCache(TFCKey key, CategoryListCache clCache) throws IOException {
TotalFacetCounts tfc = cache.get(key);
if (tfc == null) {
tfc = TotalFacetCounts.compute(key.indexReader, key.taxonomy, key.facetIndexingParams, clCache);
lruKeys.add(key);
cache.put(key,tfc);
trimCache();
}
return tfc;
}
/**
* Load {@link TotalFacetCounts} matching input parameters from the provided outputFile
* and add them into the cache for the provided indexReader, taxonomy, and facetIndexingParams.
* If a {@link TotalFacetCounts} for these parameters already exists in the cache, it will be
* replaced by the loaded one.
* @param inputFile file from which to read the data
* @param indexReader the documents index
* @param taxonomy the taxonomy index
* @param facetIndexingParams the facet indexing parameters
* @throws IOException on error
* @see #store(File, IndexReader, TaxonomyReader, FacetIndexingParams, CategoryListCache)
*/
public synchronized void load(File inputFile, IndexReader indexReader, TaxonomyReader taxonomy,
FacetIndexingParams facetIndexingParams) throws IOException {
if (!inputFile.isFile() || !inputFile.exists() || !inputFile.canRead()) {
throw new IllegalArgumentException("Exepecting an existing readable file: "+inputFile);
}
TFCKey key = new TFCKey(indexReader, taxonomy, facetIndexingParams);
TotalFacetCounts tfc = TotalFacetCounts.loadFromFile(inputFile, taxonomy, facetIndexingParams);
cache.put(key,tfc);
trimCache();
markRecentlyUsed(key);
}
/**
 * Store the {@link TotalFacetCounts} matching input parameters into the provided outputFile,
 * making them available for a later call to {@link #load(File, IndexReader, TaxonomyReader, FacetIndexingParams)}.
 * If these {@link TotalFacetCounts} are available in the cache, they are used. But if they are
 * not in the cache, this call will first compute them (which will also add them to the cache).
 * @param outputFile file to store in.
 * @param indexReader the documents index
 * @param taxonomy the taxonomy index
 * @param facetIndexingParams the facet indexing parameters
 * @param clCache category list cache for faster computation, can be null
 * @throws IOException on error
 * @see #load(File, IndexReader, TaxonomyReader, FacetIndexingParams)
 * @see #getTotalCounts(IndexReader, TaxonomyReader, FacetIndexingParams, CategoryListCache)
 */
public void store(File outputFile, IndexReader indexReader, TaxonomyReader taxonomy,
    FacetIndexingParams facetIndexingParams, CategoryListCache clCache) throws IOException {
  File parentFile = outputFile.getParentFile();
  // Reject a path we cannot write to: either an existing non-writable/non-regular file,
  // or a new file in a directory we cannot create it in.
  if ((outputFile.exists() && (!outputFile.isFile() || !outputFile.canWrite()))
      || (!outputFile.exists() && (!parentFile.isDirectory() || !parentFile.canWrite()))) {
    throw new IllegalArgumentException("Expecting a writable file: " + outputFile);
  }
  // Reuses the cached counts when present; otherwise computes (and caches) them.
  TotalFacetCounts tfc = getTotalCounts(indexReader, taxonomy, facetIndexingParams, clCache);
  TotalFacetCounts.storeToFile(outputFile, tfc);
}
/**
 * Cache key identifying a (reader, taxonomy, indexing-params) combination.
 * Readers and taxonomies are compared by identity; a change in the reader's
 * deleted-docs count invalidates equality so stale counts are not reused.
 */
private static class TFCKey {
  final IndexReader indexReader;
  final TaxonomyReader taxonomy;
  private final Iterable<CategoryListParams> clps;
  private final int hashCode;
  private final int nDels; // needed when a reader used for faceted search was just used for deletion.
  final FacetIndexingParams facetIndexingParams;

  public TFCKey(IndexReader indexReader, TaxonomyReader taxonomy,
      FacetIndexingParams facetIndexingParams) {
    this.indexReader = indexReader;
    this.taxonomy = taxonomy;
    this.facetIndexingParams = facetIndexingParams;
    this.clps = facetIndexingParams.getAllCategoryListParams();
    this.nDels = indexReader.numDeletedDocs();
    // Identity-based hash of the two readers; computed once since keys are immutable.
    hashCode = indexReader.hashCode() ^ taxonomy.hashCode();
  }

  @Override
  public int hashCode() {
    return hashCode;
  }

  @Override
  public boolean equals(Object other) {
    if (this == other) {
      return true;
    }
    if (!(other instanceof TFCKey)) { // also handles null
      return false;
    }
    TFCKey o = (TFCKey) other;
    if (indexReader != o.indexReader || taxonomy != o.taxonomy || nDels != o.nDels) {
      return false;
    }
    // Compare the category list params element-wise; trailing hasNext() check
    // ensures both iterables are of the same length.
    Iterator<CategoryListParams> it1 = clps.iterator();
    Iterator<CategoryListParams> it2 = o.clps.iterator();
    while (it1.hasNext() && it2.hasNext()) {
      if (!it1.next().equals(it2.next())) {
        return false;
      }
    }
    return it1.hasNext() == it2.hasNext();
  }
}
/**
 * Clear the cache: discards all cached {@link TotalFacetCounts} and resets
 * the LRU bookkeeping. Synchronized with the other cache-mutating methods.
 */
public synchronized void clear() {
  cache.clear();
  lruKeys.clear();
}
/**
 * Returns the maximal number of {@link TotalFacetCounts} entries kept in this cache.
 * @return the maximal cache size
 */
public int getCacheSize() {
  return maxCacheSize;
}
/**
 * Set the number of TotalFacetCounts arrays that will remain in memory cache.
 * <p>
 * If new size is smaller than current size, the cache is appropriately trimmed.
 * <p>
 * Minimal size is 1, so passing zero or negative size would result in size of 1.
 * @param size new size to set
 */
public void setCacheSize(int size) {
  final int newSize = Math.max(1, size); // enforce the minimum of 1
  final boolean shrinking = newSize < maxCacheSize;
  maxCacheSize = newSize;
  if (shrinking) { // trimming is only needed when the limit was lowered
    trimCache();
  }
}
}

View File

@ -0,0 +1,51 @@
package org.apache.lucene.facet.search.aggregator;
import java.io.IOException;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * An Aggregator is the analogue of Lucene's Collector (see
 * {@link org.apache.lucene.search.Collector}), for processing the categories
 * belonging to a certain document. The Aggregator is responsible for doing
 * whatever it wishes with the categories it is fed, e.g., counting the number
 * of times that each category appears, or performing some computation on their
 * association values.
 * <P>
 * Much of the function of an Aggregator implementation is not described by this
 * interface. This includes the constructor and getter methods to retrieve the
 * results of the aggregation.
 *
 * @lucene.experimental
 */
public interface Aggregator {

  /**
   * Specify the document (and its score in the search) that the following
   * {@link #aggregate(int)} calls will pertain to.
   * @param docid id of the document whose categories are about to be aggregated
   * @param score the document's score in the search
   * @throws IOException on error
   */
  void setNextDoc(int docid, float score) throws IOException;

  /**
   * Collect (and do whatever an implementation deems appropriate) the
   * category given by its ordinal. This category belongs to a document
   * given earlier by {@link #setNextDoc(int, float)}.
   * @param ordinal ordinal of the category to aggregate
   */
  void aggregate(int ordinal);
}

View File

@ -0,0 +1,37 @@
package org.apache.lucene.facet.search.aggregator;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A {@link CountingAggregator} used during complement counting.
*
* @lucene.experimental
*/
/**
 * A {@link CountingAggregator} used during complement counting.
 * Instead of incrementing a category's counter, it decrements it: the
 * counter array starts from the total counts and each aggregated category
 * is subtracted out.
 *
 * @lucene.experimental
 */
public class ComplementCountingAggregator extends CountingAggregator {

  public ComplementCountingAggregator(int[] counterArray) {
    super(counterArray);
  }

  @Override
  public void aggregate(int ordinal) {
    // A zero count here would underflow into a negative value - a bug in complement logic.
    assert counterArray[ordinal] != 0 : "complement aggregation: count is about to become negative for ordinal "+ordinal;
    counterArray[ordinal]--;
  }
}

View File

@ -0,0 +1,59 @@
package org.apache.lucene.facet.search.aggregator;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* A CountingAggregator updates a counter array with the size of the whole
* taxonomy, counting the number of times each category appears in the given set
* of documents.
*
* @lucene.experimental
*/
public class CountingAggregator implements Aggregator {
protected int[] counterArray;
public void aggregate(int ordinal) {
++counterArray[ordinal];
}
public void setNextDoc(int docid, float score) {
// There's nothing for us to do here since we only increment the count by 1
// in this aggregator.
}
public CountingAggregator(int[] counterArray) {
this.counterArray = counterArray;
}
@Override
public boolean equals(Object obj) {
if (obj == null || obj.getClass() != this.getClass()) {
return false;
}
CountingAggregator that = (CountingAggregator) obj;
return that.counterArray == this.counterArray;
}
@Override
public int hashCode() {
int hashCode = counterArray == null ? 0 : counterArray.hashCode();
return hashCode;
}
}

Some files were not shown because too many files have changed in this diff Show More