diff --git a/dev-tools/eclipse/dot.classpath b/dev-tools/eclipse/dot.classpath index ef007b4e412..594167ae471 100644 --- a/dev-tools/eclipse/dot.classpath +++ b/dev-tools/eclipse/dot.classpath @@ -42,6 +42,9 @@ + + + diff --git a/dev-tools/idea/modules/facet/facet.iml b/dev-tools/idea/modules/facet/facet.iml new file mode 100644 index 00000000000..50cb2814087 --- /dev/null +++ b/dev-tools/idea/modules/facet/facet.iml @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/dev-tools/idea/modules/join/join.iml b/dev-tools/idea/modules/join/join.iml new file mode 100644 index 00000000000..50cb2814087 --- /dev/null +++ b/dev-tools/idea/modules/join/join.iml @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/dev-tools/maven/modules/facet/pom.xml.template b/dev-tools/maven/modules/facet/pom.xml.template new file mode 100755 index 00000000000..cf506099866 --- /dev/null +++ b/dev-tools/maven/modules/facet/pom.xml.template @@ -0,0 +1,73 @@ + + + 4.0.0 + + org.apache.lucene + lucene-parent + @version@ + ../../pom.xml + + org.apache.lucene + lucene-facet + jar + Lucene Facets + + Package for Faceted Indexing and Search + + + modules/facet + build + + + + ${project.groupId} + lucene-core + ${project.version} + + + ${project.groupId} + lucene-test-framework + ${project.version} + test + + + junit + junit + test + + + + ${build-directory} + ${build-directory}/classes/java + ${build-directory}/classes/test + src/java + src/test + + + ${project.build.testSourceDirectory} + + **/*.java + + + + + diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 79e0dde45e5..cd72a9865a3 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -65,7 +65,13 @@ New Features * LUCENE-3234: provide a limit on phrase analysis in FastVectorHighlighter for highlighting speed up. Use FastVectorHighlighter.setPhraseLimit() to set limit (e.g. 5000). (Mike Sokolov via Koji Sekiguchi) - + + * LUCENE-3079: a new facet module which provides faceted indexing & search + capabilities. It allows managing a taxonomy of categories, and index them + with documents. It also provides search API for aggregating (e.g. count) + the weights of the categories that are relevant to the search results. + (Shai Erera) + * LUCENE-3171: Added BlockJoinQuery and BlockJoinCollector, under the new contrib/join module, to enable searches that require joining between parent and child documents. Joined (children + parent) diff --git a/modules/build.xml b/modules/build.xml index 7916b4d94cc..da02d8786df 100644 --- a/modules/build.xml +++ b/modules/build.xml @@ -18,12 +18,14 @@ --> + + @@ -38,6 +40,7 @@ + @@ -52,6 +55,7 @@ + @@ -66,6 +70,7 @@ + @@ -81,6 +86,7 @@ + @@ -94,6 +100,7 @@ + @@ -109,6 +116,7 @@ + diff --git a/modules/facet/LICENSE.txt b/modules/facet/LICENSE.txt new file mode 100644 index 00000000000..d6456956733 --- /dev/null +++ b/modules/facet/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/modules/facet/NOTICE.txt b/modules/facet/NOTICE.txt new file mode 100644 index 00000000000..c9685d2b78c --- /dev/null +++ b/modules/facet/NOTICE.txt @@ -0,0 +1,5 @@ +Apache Lucene Facets +Copyright 2011 The Apache Software Foundation + +This product includes software developed by +The Apache Software Foundation (http://www.apache.org/). diff --git a/modules/facet/build.xml b/modules/facet/build.xml new file mode 100644 index 00000000000..dbc6187dee8 --- /dev/null +++ b/modules/facet/build.xml @@ -0,0 +1,75 @@ + + + + + + + + Faceted search module + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/modules/facet/src/examples/org/apache/lucene/facet/example/ExampleResult.java b/modules/facet/src/examples/org/apache/lucene/facet/example/ExampleResult.java new file mode 100644 index 00000000000..25c8d6b3d1b --- /dev/null +++ b/modules/facet/src/examples/org/apache/lucene/facet/example/ExampleResult.java @@ -0,0 +1,49 @@ +package org.apache.lucene.facet.example; + +import java.util.List; + +import org.apache.lucene.facet.search.results.FacetResult; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Result of running an example program. + * This is a general object for allowing to write a test + * that runs an example and verifies its results. + * + * @lucene.experimental + */ +public class ExampleResult { + + private List facetResults; + + /** + * @return the facet results + */ + public List getFacetResults() { + return facetResults; + } + + /** + * @param facetResults the facet results to set + */ + public void setFacetResults(List facetResults) { + this.facetResults = facetResults; + } + +} diff --git a/modules/facet/src/examples/org/apache/lucene/facet/example/ExampleUtils.java b/modules/facet/src/examples/org/apache/lucene/facet/example/ExampleUtils.java new file mode 100644 index 00000000000..2e12c987656 --- /dev/null +++ b/modules/facet/src/examples/org/apache/lucene/facet/example/ExampleUtils.java @@ -0,0 +1,38 @@ +package org.apache.lucene.facet.example; + +import org.apache.lucene.util.Version; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @lucene.experimental + */ +public class ExampleUtils { + + public static final boolean VERBOSE = Boolean.getBoolean("tests.verbose"); + + /** The Lucene {@link Version} used by the example code. */ + public static final Version EXAMPLE_VER = Version.LUCENE_31; + + public static void log(Object msg) { + if (VERBOSE) { + System.out.println(msg.toString()); + } + } + +} diff --git a/modules/facet/src/examples/org/apache/lucene/facet/example/adaptive/AdaptiveMain.java b/modules/facet/src/examples/org/apache/lucene/facet/example/adaptive/AdaptiveMain.java new file mode 100644 index 00000000000..260333a9615 --- /dev/null +++ b/modules/facet/src/examples/org/apache/lucene/facet/example/adaptive/AdaptiveMain.java @@ -0,0 +1,67 @@ +package org.apache.lucene.facet.example.adaptive; + +import java.util.List; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; + +import org.apache.lucene.facet.example.ExampleResult; +import org.apache.lucene.facet.example.ExampleUtils; +import org.apache.lucene.facet.example.simple.SimpleIndexer; +import org.apache.lucene.facet.example.simple.SimpleSearcher; +import org.apache.lucene.facet.search.AdaptiveFacetsAccumulator; +import org.apache.lucene.facet.search.results.FacetResult; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Driver for the adaptive sample, using the {@link AdaptiveFacetsAccumulator}. + * Indexing is the same as in {@link SimpleSearcher} + * + * @lucene.experimental + */ +public class AdaptiveMain { + + /** + * Driver for the adaptive sample. 
+ * @throws Exception on error (no detailed exception handling here for sample simplicity + */ + public static void main(String[] args) throws Exception { + new AdaptiveMain().runSample(); + ExampleUtils.log("DONE"); + } + + public ExampleResult runSample() throws Exception { + + // create Directories for the search index and for the taxonomy index + Directory indexDir = new RAMDirectory(); + Directory taxoDir = new RAMDirectory(); + + // index the sample documents + ExampleUtils.log("index the adaptive sample documents..."); + SimpleIndexer.index(indexDir, taxoDir); + + ExampleUtils.log("search the adaptive sample documents..."); + List facetRes = AdaptiveSearcher.searchWithFacets(indexDir, taxoDir); + + ExampleResult res = new ExampleResult(); + res.setFacetResults(facetRes); + return res; + } + +} diff --git a/modules/facet/src/examples/org/apache/lucene/facet/example/adaptive/AdaptiveSearcher.java b/modules/facet/src/examples/org/apache/lucene/facet/example/adaptive/AdaptiveSearcher.java new file mode 100644 index 00000000000..93326e52dc0 --- /dev/null +++ b/modules/facet/src/examples/org/apache/lucene/facet/example/adaptive/AdaptiveSearcher.java @@ -0,0 +1,103 @@ +package org.apache.lucene.facet.example.adaptive; + +import java.util.List; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.store.Directory; + +import org.apache.lucene.search.MultiCollector; +import org.apache.lucene.facet.example.ExampleUtils; +import org.apache.lucene.facet.example.simple.SimpleUtils; +import org.apache.lucene.facet.search.AdaptiveFacetsAccumulator; +import org.apache.lucene.facet.search.ScoredDocIdCollector; +import org.apache.lucene.facet.search.params.CountFacetRequest; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Search with facets through the {@link AdaptiveFacetsAccumulator} + * + * @lucene.experimental + */ +public class AdaptiveSearcher { + + /** + * Search with facets through the {@link AdaptiveFacetsAccumulator} + * @param indexDir Directory of the search index. + * @param taxoDir Directory of the taxonomy index. 
+ * @throws Exception on error (no detailed exception handling here for sample simplicity + * @return facet results + */ + public static List searchWithFacets (Directory indexDir, Directory taxoDir) throws Exception { + // prepare index reader and taxonomy. + TaxonomyReader taxo = new LuceneTaxonomyReader(taxoDir); + IndexReader indexReader = IndexReader.open(indexDir); + + // prepare searcher to search against + IndexSearcher searcher = new IndexSearcher(indexReader); + + // faceted search is working in 2 steps: + // 1. collect matching documents + // 2. aggregate facets for collected documents and + // generate the requested faceted results from the aggregated facets + + // step 1: collect matching documents into a collector + Query q = new TermQuery(new Term(SimpleUtils.TEXT,"white")); + ExampleUtils.log("Query: "+q); + + // regular collector for scoring matched documents + TopScoreDocCollector topDocsCollector = TopScoreDocCollector.create(10, true); + + // docids collector for guiding facets accumulation (scoring disabled) + ScoredDocIdCollector docIdsCollecor = ScoredDocIdCollector.create(indexReader.maxDoc(), false); + + // Faceted search parameters indicate which facets are we interested in + FacetSearchParams facetSearchParams = new FacetSearchParams(); + facetSearchParams.addFacetRequest(new CountFacetRequest(new CategoryPath("root","a"), 10)); + + // search, into both collectors. note: in case only facets accumulation + // is required, the topDocCollector part can be totally discarded + searcher.search(q, MultiCollector.wrap(topDocsCollector, docIdsCollecor)); + + // Obtain facets results and print them + AdaptiveFacetsAccumulator accumulator = new AdaptiveFacetsAccumulator(facetSearchParams, indexReader, taxo); + List res = accumulator.accumulate(docIdsCollecor.getScoredDocIDs()); + + int i = 0; + for (FacetResult facetResult : res) { + ExampleUtils.log("Res "+(i++)+": "+facetResult); + } + + // we're done, close the index reader and the taxonomy. 
+ indexReader.close(); + taxo.close(); + + return res; + } + +} diff --git a/modules/facet/src/examples/org/apache/lucene/facet/example/association/AssociationIndexer.java b/modules/facet/src/examples/org/apache/lucene/facet/example/association/AssociationIndexer.java new file mode 100644 index 00000000000..c6bebaab08e --- /dev/null +++ b/modules/facet/src/examples/org/apache/lucene/facet/example/association/AssociationIndexer.java @@ -0,0 +1,132 @@ +package org.apache.lucene.facet.example.association; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.store.Directory; + +import org.apache.lucene.facet.enhancements.EnhancementsDocumentBuilder; +import org.apache.lucene.facet.enhancements.association.AssociationProperty; +import org.apache.lucene.facet.example.ExampleUtils; +import org.apache.lucene.facet.example.simple.SimpleUtils; +import org.apache.lucene.facet.index.CategoryContainer; +import org.apache.lucene.facet.index.CategoryDocumentBuilder; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Sample indexer creates an index, and adds to it sample documents with + * categories, which can be simple or contain associations. + * + * @lucene.experimental + */ +public class AssociationIndexer { + + /** + * Create an index, and adds to it sample documents and categories. + * + * @param indexDir + * Directory in which the index should be created. + * @param taxoDir + * Directory in which the taxonomy index should be created. 
+ * @throws Exception + * on error (no detailed exception handling here for sample + * simplicity + */ + public static void index(Directory indexDir, Directory taxoDir) throws Exception { + + // create and open an index writer + IndexWriter iw = new IndexWriter(indexDir, new IndexWriterConfig(ExampleUtils.EXAMPLE_VER, SimpleUtils.analyzer)); + + // create and open a taxonomy writer + TaxonomyWriter taxo = new LuceneTaxonomyWriter(taxoDir, OpenMode.CREATE); + + // loop over sample documents + int nDocsAdded = 0; + int nFacetsAdded = 0; + for (int docNum = 0; docNum < SimpleUtils.docTexts.length; docNum++) { + ExampleUtils.log(" ++++ DOC ID: " + docNum); + // obtain the sample categories for current document + CategoryContainer categoryContainer = new CategoryContainer(); + for (CategoryPath path : SimpleUtils.categories[docNum]) { + categoryContainer.addCategory(path); + ExampleUtils.log("\t ++++ PATH: " + path); + } + // and also those with associations + CategoryPath[] associationsPaths = AssociationUtils.categories[docNum]; + AssociationProperty[] associationProps = AssociationUtils.associations[docNum]; + for (int i = 0; i < associationsPaths.length; i++) { + categoryContainer.addCategory(associationsPaths[i], associationProps[i]); + ExampleUtils.log("\t $$$$ Association: (" + + associationsPaths[i] + "," + associationProps[i] + + ")"); + } + + // we do not alter indexing parameters! + // a category document builder will add the categories to a document + // once build() is called + CategoryDocumentBuilder categoryDocBuilder = new EnhancementsDocumentBuilder( + taxo, AssociationUtils.assocIndexingParams); + categoryDocBuilder.setCategories(categoryContainer); + + // create a plain Lucene document and add some regular Lucene fields + // to it + Document doc = new Document(); + doc.add(new Field(SimpleUtils.TITLE, SimpleUtils.docTitles[docNum], + Store.YES, Index.ANALYZED)); + doc.add(new Field(SimpleUtils.TEXT, SimpleUtils.docTexts[docNum], + Store.NO, Index.ANALYZED)); + + // invoke the category document builder for adding categories to the + // document and, + // as required, to the taxonomy index + categoryDocBuilder.build(doc); + + // finally add the document to the index + iw.addDocument(doc); + + nDocsAdded++; + nFacetsAdded += categoryContainer.size(); + } + + // commit changes. + // we commit changes to the taxonomy index prior to committing them to + // the search index. + // this is important, so that all facets referred to by documents in the + // search index + // will indeed exist in the taxonomy index. + taxo.commit(); + iw.commit(); + + // close the taxonomy index and the index - all modifications are + // now safely in the provided directories: indexDir and taxoDir. 
+ taxo.close(); + iw.close(); + + ExampleUtils.log("Indexed " + nDocsAdded + " documents with overall " + + nFacetsAdded + " facets."); + } + +} diff --git a/modules/facet/src/examples/org/apache/lucene/facet/example/association/AssociationMain.java b/modules/facet/src/examples/org/apache/lucene/facet/example/association/AssociationMain.java new file mode 100644 index 00000000000..f49dc3dbbb2 --- /dev/null +++ b/modules/facet/src/examples/org/apache/lucene/facet/example/association/AssociationMain.java @@ -0,0 +1,82 @@ +package org.apache.lucene.facet.example.association; + +import java.util.List; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; + +import org.apache.lucene.facet.example.ExampleResult; +import org.apache.lucene.facet.example.ExampleUtils; +import org.apache.lucene.facet.search.results.FacetResult; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Driver for the simple sample. + * + * @lucene.experimental + */ +public class AssociationMain { + + /** + * Driver for the simple sample. 
+ * @throws Exception on error (no detailed exception handling here for sample simplicity + */ + public static void main(String[] args) throws Exception { + new AssociationMain().runSumIntAssociationSample(); + new AssociationMain().runSumFloatAssociationSample(); + ExampleUtils.log("DONE"); + } + + public ExampleResult runSumIntAssociationSample() throws Exception { + + // create Directories for the search index and for the taxonomy index + Directory indexDir = new RAMDirectory();//FSDirectory.open(new File("/tmp/111")); + Directory taxoDir = new RAMDirectory(); + + // index the sample documents + ExampleUtils.log("index the sample documents..."); + AssociationIndexer.index(indexDir, taxoDir); + + ExampleUtils.log("search the sample documents..."); + List facetRes = AssociationSearcher.searchSumIntAssociation(indexDir, taxoDir); + + ExampleResult res = new ExampleResult(); + res.setFacetResults(facetRes); + return res; + } + + public ExampleResult runSumFloatAssociationSample() throws Exception { + + // create Directories for the search index and for the taxonomy index + Directory indexDir = new RAMDirectory();//FSDirectory.open(new File("/tmp/111")); + Directory taxoDir = new RAMDirectory(); + + // index the sample documents + ExampleUtils.log("index the sample documents..."); + AssociationIndexer.index(indexDir, taxoDir); + + ExampleUtils.log("search the sample documents..."); + List facetRes = AssociationSearcher.searchSumFloatAssociation(indexDir, taxoDir); + + ExampleResult res = new ExampleResult(); + res.setFacetResults(facetRes); + return res; + } + +} diff --git a/modules/facet/src/examples/org/apache/lucene/facet/example/association/AssociationSearcher.java b/modules/facet/src/examples/org/apache/lucene/facet/example/association/AssociationSearcher.java new file mode 100644 index 00000000000..08a8e715d64 --- /dev/null +++ b/modules/facet/src/examples/org/apache/lucene/facet/example/association/AssociationSearcher.java @@ -0,0 +1,81 @@ +package org.apache.lucene.facet.example.association; + +import java.util.List; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.store.Directory; + +import org.apache.lucene.facet.example.simple.SimpleSearcher; +import org.apache.lucene.facet.search.params.association.AssociationFloatSumFacetRequest; +import org.apache.lucene.facet.search.params.association.AssociationIntSumFacetRequest; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * AssociationSearcher searches index with facets, evaluating the facets with + * their associated $int value + * + * @lucene.experimental + */ +public class AssociationSearcher { + + /** Search an index with a sum of int-association. */ + public static List searchSumIntAssociation(Directory indexDir, + Directory taxoDir) throws Exception { + // prepare index reader + IndexReader indexReader = IndexReader.open(indexDir); + TaxonomyReader taxo = new LuceneTaxonomyReader(taxoDir); + + AssociationIntSumFacetRequest facetRequest = new AssociationIntSumFacetRequest( + new CategoryPath("tags"), 10); + + List res = SimpleSearcher.searchWithRequest(indexReader, taxo, + AssociationUtils.assocIndexingParams, facetRequest); + + // close readers + taxo.close(); + indexReader.close(); + + return res; + } + + /** Search an index with a sum of float-association. */ + public static List searchSumFloatAssociation(Directory indexDir, + Directory taxoDir) throws Exception { + // prepare index reader + IndexReader indexReader = IndexReader.open(indexDir); + TaxonomyReader taxo = new LuceneTaxonomyReader(taxoDir); + + AssociationFloatSumFacetRequest facetRequest = new AssociationFloatSumFacetRequest( + new CategoryPath("genre"), 10); + + List res = SimpleSearcher.searchWithRequest(indexReader, taxo, + AssociationUtils.assocIndexingParams, facetRequest); + + // close readers + taxo.close(); + indexReader.close(); + + return res; + } + +} diff --git a/modules/facet/src/examples/org/apache/lucene/facet/example/association/AssociationUtils.java b/modules/facet/src/examples/org/apache/lucene/facet/example/association/AssociationUtils.java new file mode 100644 index 00000000000..3bc749db5a4 --- /dev/null +++ b/modules/facet/src/examples/org/apache/lucene/facet/example/association/AssociationUtils.java @@ -0,0 +1,79 @@ +package org.apache.lucene.facet.example.association; + +import org.apache.lucene.facet.enhancements.association.AssociationEnhancement; +import org.apache.lucene.facet.enhancements.association.AssociationFloatProperty; +import org.apache.lucene.facet.enhancements.association.AssociationIntProperty; +import org.apache.lucene.facet.enhancements.association.AssociationProperty; +import org.apache.lucene.facet.enhancements.params.DefaultEnhancementsIndexingParams; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @lucene.experimental + */ +public class AssociationUtils { + + /** + * Categories: categories[D][N] == category-path with association no. N for + * document no. D. 
+ */ + public static CategoryPath[][] categories = { + // Doc #1 + { new CategoryPath("tags", "lucene") , + new CategoryPath("genre", "computing") + }, + + // Doc #2 + { new CategoryPath("tags", "lucene"), + new CategoryPath("tags", "solr"), + new CategoryPath("genre", "computing"), + new CategoryPath("genre", "software") + } + }; + + public static AssociationProperty[][] associations = { + // Doc #1 associations + { + /* 3 occurrences for tag 'lucene' */ + new AssociationIntProperty(3), + /* 87% confidence level of genre 'computing' */ + new AssociationFloatProperty(0.87f) + }, + + // Doc #2 associations + { + /* 1 occurrence for tag 'lucene' */ + new AssociationIntProperty(1), + /* 2 occurrences for tag 'solr' */ + new AssociationIntProperty(2), + /* 75% confidence level of genre 'computing' */ + new AssociationFloatProperty(0.75f), + /* 34% confidence level of genre 'software' */ + new AssociationFloatProperty(0.34f), + } + }; + + /** + * Indexing Params: the indexing params to use when dealing with + * associations. + */ + public static final DefaultEnhancementsIndexingParams assocIndexingParams = + new DefaultEnhancementsIndexingParams(new AssociationEnhancement()); + +} diff --git a/modules/facet/src/examples/org/apache/lucene/facet/example/merge/TaxonomyMergeUtils.java b/modules/facet/src/examples/org/apache/lucene/facet/example/merge/TaxonomyMergeUtils.java new file mode 100644 index 00000000000..4b3d346ffc7 --- /dev/null +++ b/modules/facet/src/examples/org/apache/lucene/facet/example/merge/TaxonomyMergeUtils.java @@ -0,0 +1,102 @@ +package org.apache.lucene.facet.example.merge; + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.PayloadProcessorProvider; +import org.apache.lucene.store.Directory; + +import org.apache.lucene.facet.example.ExampleUtils; +import org.apache.lucene.facet.index.FacetsPayloadProcessorProvider; +import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter.DiskOrdinalMap; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter.MemoryOrdinalMap; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter.OrdinalMap; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @lucene.experimental + */ +public class TaxonomyMergeUtils { + + /** + * Merges the given taxonomy and index directories. Note that this method + * opens {@link LuceneTaxonomyWriter} and {@link IndexWriter} on the + * respective destination indexes. 
Therefore if you have a writer open on any + * of them, it should be closed, or you should use + * {@link #merge(Directory, Directory, IndexWriter, LuceneTaxonomyWriter)} + * instead. + * + * @see #merge(Directory, Directory, IndexWriter, LuceneTaxonomyWriter) + */ + public static void merge(Directory srcIndexDir, Directory srcTaxDir, + Directory destIndexDir, Directory destTaxDir) throws IOException { + IndexWriter destIndexWriter = new IndexWriter(destIndexDir, + new IndexWriterConfig(ExampleUtils.EXAMPLE_VER, null)); + LuceneTaxonomyWriter destTaxWriter = new LuceneTaxonomyWriter(destTaxDir); + merge(srcIndexDir, srcTaxDir, new MemoryOrdinalMap(), destIndexWriter, destTaxWriter); + destTaxWriter.close(); + destIndexWriter.close(); + } + + /** + * Merges the given taxonomy and index directories and commits the changes to + * the given writers. This method uses {@link MemoryOrdinalMap} to store the + * mapped ordinals. If you cannot afford the memory, you can use + * {@link #merge(Directory, Directory, OrdinalMap, IndexWriter, LuceneTaxonomyWriter)} + * by passing {@link DiskOrdinalMap}. + * + * @see #merge(Directory, Directory, OrdinalMap, IndexWriter, LuceneTaxonomyWriter) + */ + public static void merge(Directory srcIndexDir, Directory srcTaxDir, + IndexWriter destIndexWriter, + LuceneTaxonomyWriter destTaxWriter) throws IOException { + merge(srcIndexDir, srcTaxDir, new MemoryOrdinalMap(), destIndexWriter, destTaxWriter); + } + + /** + * Merges the given taxonomy and index directories and commits the changes to + * the given writers. + */ + public static void merge(Directory srcIndexDir, Directory srcTaxDir, + OrdinalMap map, IndexWriter destIndexWriter, + LuceneTaxonomyWriter destTaxWriter) throws IOException { + // merge the taxonomies + destTaxWriter.addTaxonomies(new Directory[] { srcTaxDir }, new OrdinalMap[] { map }); + + PayloadProcessorProvider payloadProcessor = new FacetsPayloadProcessorProvider( + srcIndexDir, map.getMap(), new DefaultFacetIndexingParams()); + destIndexWriter.setPayloadProcessorProvider(payloadProcessor); + + IndexReader reader = IndexReader.open(srcIndexDir); + try { + destIndexWriter.addIndexes(reader); + + // commit changes to taxonomy and index respectively. 
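+      // committing the taxonomy first mirrors the ordering used by the example
+      // indexers: every category ordinal referenced by the merged documents
+      // already exists in the destination taxonomy before the index commit.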
+ destTaxWriter.commit(); + destIndexWriter.commit(); + } finally { + reader.close(); + } + } + +} diff --git a/modules/facet/src/examples/org/apache/lucene/facet/example/multiCL/MultiCLIndexer.java b/modules/facet/src/examples/org/apache/lucene/facet/example/multiCL/MultiCLIndexer.java new file mode 100644 index 00000000000..5291fe16dc3 --- /dev/null +++ b/modules/facet/src/examples/org/apache/lucene/facet/example/multiCL/MultiCLIndexer.java @@ -0,0 +1,209 @@ +package org.apache.lucene.facet.example.multiCL; + +import java.util.List; +import java.util.Random; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; + +import org.apache.lucene.DocumentBuilder; +import org.apache.lucene.facet.example.ExampleUtils; +import org.apache.lucene.facet.example.simple.SimpleUtils; +import org.apache.lucene.facet.index.CategoryDocumentBuilder; +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.index.params.PerDimensionIndexingParams; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Sample indexer creates an index, and adds to it sample documents and facets + * with multiple CategoryLists specified for different facets, so there are different + * category lists for different facets. + * + * @lucene.experimental + */ +public class MultiCLIndexer { + + // Number of documents to index + public static int NUM_DOCS = 100; + // Number of facets to add per document + public static int NUM_FACETS_PER_DOC = 10; + // Number of tokens in title + public static int TITLE_LENGTH = 5; + // Number of tokens in text + public static int TEXT_LENGTH = 100; + + // Lorum ipsum to use as content - this will be tokenized and used for document + // titles/text. 
+ static String words = "Sed ut perspiciatis unde omnis iste natus error sit " + + "voluptatem accusantium doloremque laudantium totam rem aperiam " + + "eaque ipsa quae ab illo inventore veritatis et quasi architecto " + + "beatae vitae dicta sunt explicabo Nemo enim ipsam voluptatem " + + "quia voluptas sit aspernatur aut odit aut fugit sed quia consequuntur " + + "magni dolores eos qui ratione voluptatem sequi nesciunt Neque porro " + + "quisquam est qui dolorem ipsum quia dolor sit amet consectetur adipisci velit " + + "sed quia non numquam eius modi tempora incidunt ut labore et dolore " + + "magnam aliquam quaerat voluptatem Ut enim ad minima veniam " + + "quis nostrum exercitationem ullam corporis suscipit laboriosam " + + "nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure" + + "reprehenderit qui in ea voluptate velit esse quam nihil molestiae " + + "consequatur vel illum qui dolorem eum fugiat quo voluptas nulla pariatur"; + // PerDimensionIndexingParams for multiple category lists + public static PerDimensionIndexingParams MULTI_IPARAMS = new PerDimensionIndexingParams(); + + // Initialize PerDimensionIndexingParams + static { + MULTI_IPARAMS.addCategoryListParams(new CategoryPath("0"), + new CategoryListParams(new Term("$Digits", "Zero"))); + MULTI_IPARAMS.addCategoryListParams(new CategoryPath("1"), + new CategoryListParams(new Term("$Digits", "One"))); + MULTI_IPARAMS.addCategoryListParams(new CategoryPath("2"), + new CategoryListParams(new Term("$Digits", "Two"))); + MULTI_IPARAMS.addCategoryListParams(new CategoryPath("3"), + new CategoryListParams(new Term("$Digits", "Three"))); + MULTI_IPARAMS.addCategoryListParams(new CategoryPath("4"), + new CategoryListParams(new Term("$Digits", "Four"))); + MULTI_IPARAMS.addCategoryListParams(new CategoryPath("5"), + new CategoryListParams(new Term("$Digits", "Five"))); + } + + /** + * Create an index, and adds to it sample documents and facets. + * @param indexDir Directory in which the index should be created. + * @param taxoDir Directory in which the taxonomy index should be created. + * @throws Exception on error (no detailed exception handling here for sample simplicity + */ + public static void index(Directory indexDir, Directory taxoDir) + throws Exception { + + Random random = new Random(2003); + + String[] docTitles = new String[NUM_DOCS]; + String[] docTexts = new String[NUM_DOCS]; + CategoryPath[][] cPaths = new CategoryPath[NUM_DOCS][NUM_FACETS_PER_DOC]; + + String[] tokens = words.split(" "); + for (int docNum = 0; docNum < NUM_DOCS; docNum++) { + String title = ""; + String text = ""; + for (int j = 0; j < TITLE_LENGTH; j++) { + title = title + tokens[random.nextInt(tokens.length)] + " "; + } + docTitles[docNum] = title; + + for (int j = 0; j < TEXT_LENGTH; j++) { + text = text + tokens[random.nextInt(tokens.length)] + " "; + } + docTexts[docNum] = text; + + for (int facetNum = 0; facetNum < NUM_FACETS_PER_DOC; facetNum++) { + cPaths[docNum][facetNum] = new CategoryPath(Integer + .toString(random.nextInt(7)), Integer.toString(random.nextInt(10))); + } + } + index(indexDir, taxoDir, MULTI_IPARAMS, docTitles, docTexts, cPaths); + } + + /** + * More advanced method for specifying custom indexing params, doc texts, + * doc titles and category paths. 
+ */ + public static void index(Directory indexDir, Directory taxoDir, + FacetIndexingParams iParams, String[] docTitles, + String[] docTexts, CategoryPath[][] cPaths) throws Exception { + // create and open an index writer + IndexWriter iw = new IndexWriter(indexDir, new IndexWriterConfig( + ExampleUtils.EXAMPLE_VER, SimpleUtils.analyzer).setOpenMode(OpenMode.CREATE)); + // create and open a taxonomy writer + LuceneTaxonomyWriter taxo = new LuceneTaxonomyWriter(taxoDir, OpenMode.CREATE); + index(iw, taxo, iParams, docTitles, docTexts, cPaths); + } + + /** + * More advanced method for specifying custom indexing params, doc texts, + * doc titles and category paths. + *
+ * Create an index, and adds to it sample documents and facets. + * @throws Exception + * on error (no detailed exception handling here for sample + * simplicity + */ + public static void index(IndexWriter iw, LuceneTaxonomyWriter taxo, + FacetIndexingParams iParams, String[] docTitles, + String[] docTexts, CategoryPath[][] cPaths) throws Exception { + + // loop over sample documents + int nDocsAdded = 0; + int nFacetsAdded = 0; + for (int docNum = 0; docNum < SimpleUtils.docTexts.length; docNum++) { + List facetList = SimpleUtils.categoryPathArrayToList(cPaths[docNum]); + + // we do not alter indexing parameters! + // a category document builder will add the categories to a document + // once build() is called + DocumentBuilder categoryDocBuilder = new CategoryDocumentBuilder( + taxo, iParams).setCategoryPaths(facetList); + + // create a plain Lucene document and add some regular Lucene fields + // to it + Document doc = new Document(); + doc.add(new Field(SimpleUtils.TITLE, docTitles[docNum], Store.YES, Index.ANALYZED)); + doc.add(new Field(SimpleUtils.TEXT, docTexts[docNum], Store.NO, Index.ANALYZED)); + + // finally add the document to the index + categoryDocBuilder.build(doc); + iw.addDocument(doc); + + nDocsAdded++; + nFacetsAdded += facetList.size(); + } + + // commit changes. + // we commit changes to the taxonomy index prior to committing them to + // the search index. + // this is important, so that all facets referred to by documents in the + // search index + // will indeed exist in the taxonomy index. + taxo.commit(); + iw.commit(); + + // close the taxonomy index and the index - all modifications are + // now safely in the provided directories: indexDir and taxoDir. + taxo.close(); + iw.close(); + + ExampleUtils.log("Indexed " + nDocsAdded + " documents with overall " + + nFacetsAdded + " facets."); + } + + public static void main(String[] args) throws Exception { + index(new RAMDirectory(), new RAMDirectory()); + } + +} diff --git a/modules/facet/src/examples/org/apache/lucene/facet/example/multiCL/MultiCLMain.java b/modules/facet/src/examples/org/apache/lucene/facet/example/multiCL/MultiCLMain.java new file mode 100644 index 00000000000..6562c3a2dca --- /dev/null +++ b/modules/facet/src/examples/org/apache/lucene/facet/example/multiCL/MultiCLMain.java @@ -0,0 +1,65 @@ +package org.apache.lucene.facet.example.multiCL; + +import java.util.List; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; + +import org.apache.lucene.facet.example.ExampleResult; +import org.apache.lucene.facet.example.ExampleUtils; +import org.apache.lucene.facet.search.results.FacetResult; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @lucene.experimental + */ +public class MultiCLMain { + + /** + * Driver for the multi sample. + * + * @throws Exception + * on error (no detailed exception handling here for sample + * simplicity + */ + public static void main(String[] args) throws Exception { + new MultiCLMain().runSample(); + ExampleUtils.log("DONE"); + } + + public ExampleResult runSample() throws Exception { + + // create Directories for the search index and for the taxonomy index + Directory indexDir = new RAMDirectory(); + Directory taxoDir = new RAMDirectory(); + + // index the sample documents + ExampleUtils.log("index the sample documents..."); + MultiCLIndexer.index(indexDir, taxoDir); + + ExampleUtils.log("search the sample documents..."); + List facetRes = MultiCLSearcher.searchWithFacets(indexDir, + taxoDir, MultiCLIndexer.MULTI_IPARAMS); + + ExampleResult res = new ExampleResult(); + res.setFacetResults(facetRes); + return res; + } + +} diff --git a/modules/facet/src/examples/org/apache/lucene/facet/example/multiCL/MultiCLSearcher.java b/modules/facet/src/examples/org/apache/lucene/facet/example/multiCL/MultiCLSearcher.java new file mode 100644 index 00000000000..8be59ca3791 --- /dev/null +++ b/modules/facet/src/examples/org/apache/lucene/facet/example/multiCL/MultiCLSearcher.java @@ -0,0 +1,128 @@ +package org.apache.lucene.facet.example.multiCL; + +import java.util.List; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.store.Directory; + +import org.apache.lucene.search.MultiCollector; +import org.apache.lucene.facet.example.ExampleUtils; +import org.apache.lucene.facet.example.simple.SimpleUtils; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.search.FacetsCollector; +import org.apache.lucene.facet.search.params.CountFacetRequest; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * MultiSearcher searches index with facets over an index with multiple + * category lists. + * + * @lucene.experimental + */ +public class MultiCLSearcher { + + /** + * Search an index with facets. + * + * @param indexDir + * Directory of the search index. + * @param taxoDir + * Directory of the taxonomy index. 
+ * @throws Exception + * on error (no detailed exception handling here for sample + * simplicity + * @return facet results + */ + public static List searchWithFacets(Directory indexDir, + Directory taxoDir, FacetIndexingParams iParams) throws Exception { + + // prepare index reader and taxonomy. + IndexReader indexReader = IndexReader.open(indexDir); + TaxonomyReader taxo = new LuceneTaxonomyReader(taxoDir); + + // Get results + List results = searchWithFacets(indexReader, taxo, iParams); + + // we're done, close the index reader and the taxonomy. + indexReader.close(); + taxo.close(); + return results; + } + + public static List searchWithFacets(IndexReader indexReader, + TaxonomyReader taxo, FacetIndexingParams iParams) throws Exception { + // prepare searcher to search against + IndexSearcher searcher = new IndexSearcher(indexReader); + + // faceted search is working in 2 steps: + // 1. collect matching documents + // 2. aggregate facets for collected documents and + // generate the requested faceted results from the aggregated facets + + // step 1: create a query for finding matching documents for which we + // accumulate facets + Query q = new TermQuery(new Term(SimpleUtils.TEXT, "Quis")); + ExampleUtils.log("Query: " + q); + + TopScoreDocCollector topDocsCollector = TopScoreDocCollector.create(10, + true); + + // Faceted search parameters indicate which facets are we interested in + FacetSearchParams facetSearchParams = new FacetSearchParams(iParams); + facetSearchParams.addFacetRequest(new CountFacetRequest( + new CategoryPath("5"), 10)); + facetSearchParams.addFacetRequest(new CountFacetRequest( + new CategoryPath("5", "5"), 10)); + facetSearchParams.addFacetRequest(new CountFacetRequest( + new CategoryPath("6", "2"), 10)); + + // Facets collector is the simplest interface for faceted search. + // It provides faceted search functions that are sufficient to many + // application, + // although it is insufficient for tight control on faceted search + // behavior - in those + // situations other, more low-level interfaces are available, as + // demonstrated in other search examples. 
+ FacetsCollector facetsCollector = new FacetsCollector( + facetSearchParams, indexReader, taxo); + + // perform documents search and facets accumulation + searcher.search(q, MultiCollector.wrap(topDocsCollector, facetsCollector)); + + // Obtain facets results and print them + List res = facetsCollector.getFacetResults(); + + int i = 0; + for (FacetResult facetResult : res) { + ExampleUtils.log("Res " + (i++) + ": " + facetResult); + } + return res; + } + +} diff --git a/modules/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleIndexer.java b/modules/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleIndexer.java new file mode 100644 index 00000000000..de6937667d8 --- /dev/null +++ b/modules/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleIndexer.java @@ -0,0 +1,102 @@ +package org.apache.lucene.facet.example.simple; + +import java.util.List; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.store.Directory; + +import org.apache.lucene.DocumentBuilder; +import org.apache.lucene.facet.example.ExampleUtils; +import org.apache.lucene.facet.index.CategoryDocumentBuilder; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Sample indexer creates an index, and adds to it sample documents and facets. + * + * @lucene.experimental + */ +public class SimpleIndexer { + + /** + * Create an index, and adds to it sample documents and facets. + * @param indexDir Directory in which the index should be created. + * @param taxoDir Directory in which the taxonomy index should be created. + * @throws Exception on error (no detailed exception handling here for sample simplicity + */ + public static void index (Directory indexDir, Directory taxoDir) throws Exception { + + // create and open an index writer + IndexWriter iw = new IndexWriter(indexDir, new IndexWriterConfig(ExampleUtils.EXAMPLE_VER, SimpleUtils.analyzer)); + + // create and open a taxonomy writer + TaxonomyWriter taxo = new LuceneTaxonomyWriter(taxoDir, OpenMode.CREATE); + + // loop over sample documents + int nDocsAdded = 0; + int nFacetsAdded = 0; + for (int docNum=0; docNum facetList = SimpleUtils.categoryPathArrayToList(SimpleUtils.categories[docNum]); + + // we do not alter indexing parameters! 
+ // a category document builder will add the categories to a document once build() is called + DocumentBuilder categoryDocBuilder = new CategoryDocumentBuilder(taxo).setCategoryPaths(facetList); + + // create a plain Lucene document and add some regular Lucene fields to it + Document doc = new Document(); + doc.add(new Field(SimpleUtils.TITLE, SimpleUtils.docTitles[docNum], Store.YES, Index.ANALYZED)); + doc.add(new Field(SimpleUtils.TEXT, SimpleUtils.docTexts[docNum], Store.NO, Index.ANALYZED)); + + // invoke the category document builder for adding categories to the document and, + // as required, to the taxonomy index + categoryDocBuilder.build(doc); + + // finally add the document to the index + iw.addDocument(doc); + + nDocsAdded ++; + nFacetsAdded += facetList.size(); + } + + // commit changes. + // we commit changes to the taxonomy index prior to committing them to the search index. + // this is important, so that all facets referred to by documents in the search index + // will indeed exist in the taxonomy index. + taxo.commit(); + iw.commit(); + + // close the taxonomy index and the index - all modifications are + // now safely in the provided directories: indexDir and taxoDir. + taxo.close(); + iw.close(); + + ExampleUtils.log("Indexed "+nDocsAdded+" documents with overall "+nFacetsAdded+" facets."); + } + +} diff --git a/modules/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleMain.java b/modules/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleMain.java new file mode 100644 index 00000000000..da3a5d33706 --- /dev/null +++ b/modules/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleMain.java @@ -0,0 +1,99 @@ +package org.apache.lucene.facet.example.simple; + +import java.util.List; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; + +import org.apache.lucene.facet.example.ExampleResult; +import org.apache.lucene.facet.example.ExampleUtils; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Driver for the simple sample. + * + * @lucene.experimental + */ +public class SimpleMain { + + /** + * Driver for the simple sample. 
+ * @throws Exception on error (no detailed exception handling here for sample simplicity + */ + public static void main(String[] args) throws Exception { + new SimpleMain().runSimple(); + new SimpleMain().runDrillDown().getFacetResults(); + ExampleUtils.log("DONE"); + } + + public ExampleResult runSimple() throws Exception { + // create Directories for the search index and for the taxonomy index + Directory indexDir = new RAMDirectory(); + Directory taxoDir = new RAMDirectory(); + + // index the sample documents + ExampleUtils.log("index the sample documents..."); + SimpleIndexer.index(indexDir, taxoDir); + + // open readers + TaxonomyReader taxo = new LuceneTaxonomyReader(taxoDir); + IndexReader indexReader = IndexReader.open(indexDir, true); + + ExampleUtils.log("search the sample documents..."); + List facetRes = SimpleSearcher.searchWithFacets(indexReader, taxo); + + // close readers + taxo.close(); + indexReader.close(); + + ExampleResult res = new ExampleResult(); + res.setFacetResults(facetRes); + return res; + } + + public ExampleResult runDrillDown() throws Exception { + // create Directories for the search index and for the taxonomy index + Directory indexDir = new RAMDirectory(); + Directory taxoDir = new RAMDirectory(); + + // index the sample documents + ExampleUtils.log("index the sample documents..."); + SimpleIndexer.index(indexDir, taxoDir); + + // open readers + TaxonomyReader taxo = new LuceneTaxonomyReader(taxoDir); + IndexReader indexReader = IndexReader.open(indexDir, true); + + ExampleUtils.log("search the sample documents..."); + List facetRes = SimpleSearcher.searchWithDrillDown(indexReader, taxo); + + // close readers + taxo.close(); + indexReader.close(); + + ExampleResult res = new ExampleResult(); + res.setFacetResults(facetRes); + return res; + } + +} diff --git a/modules/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleSearcher.java b/modules/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleSearcher.java new file mode 100644 index 00000000000..2aa9dafab8c --- /dev/null +++ b/modules/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleSearcher.java @@ -0,0 +1,168 @@ +package org.apache.lucene.facet.example.simple; + +import java.util.Iterator; +import java.util.List; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopScoreDocCollector; + +import org.apache.lucene.search.MultiCollector; +import org.apache.lucene.facet.example.ExampleUtils; +import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.search.DrillDown; +import org.apache.lucene.facet.search.FacetsCollector; +import org.apache.lucene.facet.search.params.CountFacetRequest; +import org.apache.lucene.facet.search.params.FacetRequest; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * SampleSearcer searches index with facets. + * + * @lucene.experimental + */ +public class SimpleSearcher { + + /** + * Search an index with facets. + * @param indexReader index reader. + * @param taxoReader taxonomy reader. + * @throws Exception on error (no detailed exception handling here for sample simplicity + * @return facet results + */ + public static List searchWithFacets (IndexReader indexReader, + TaxonomyReader taxoReader) throws Exception { + CountFacetRequest facetRequest = new CountFacetRequest(new CategoryPath("root","a"), 10); + return searchWithRequest(indexReader, taxoReader, null, facetRequest); + } + + /** + * Search an index with facets for given facet requests. + * @param indexReader index reader. + * @param taxoReader taxonomy reader. + * @param indexingParams the facet indexing params + * @param facetRequests facet requests of interest + * @throws Exception on error (no detailed exception handling here for sample simplicity + * @return facet results + */ + public static List searchWithRequest(IndexReader indexReader, + TaxonomyReader taxoReader, FacetIndexingParams indexingParams, + FacetRequest... facetRequests) throws Exception { + Query q = new TermQuery(new Term(SimpleUtils.TEXT, "white")); + return searchWithRequestAndQuery(q, indexReader, taxoReader, + indexingParams, facetRequests); + } + + /** + * Search an index with facets for given query and facet requests. + * @param q query of interest + * @param indexReader index reader. + * @param taxoReader taxonomy reader. + * @param indexingParams the facet indexing params + * @param facetRequests facet requests of interest + * @throws Exception on error (no detailed exception handling here for sample simplicity + * @return facet results + */ + public static List searchWithRequestAndQuery(Query q, + IndexReader indexReader, TaxonomyReader taxoReader, + FacetIndexingParams indexingParams, FacetRequest... 
facetRequests) + throws Exception { + + ExampleUtils.log("Query: " + q); + // prepare searcher to search against + IndexSearcher searcher = new IndexSearcher(indexReader); + + // collect matching documents into a collector + TopScoreDocCollector topDocsCollector = TopScoreDocCollector.create(10, true); + + if (indexingParams == null) { + indexingParams = new DefaultFacetIndexingParams(); + } + + // Faceted search parameters indicate which facets are we interested in + FacetSearchParams facetSearchParams = new FacetSearchParams(indexingParams); + + // Add the facet requests of interest to the search params + for (FacetRequest frq : facetRequests) { + facetSearchParams.addFacetRequest(frq); + } + + FacetsCollector facetsCollector = new FacetsCollector(facetSearchParams, indexReader, taxoReader); + + // perform documents search and facets accumulation + searcher.search(q, MultiCollector.wrap(topDocsCollector, facetsCollector)); + + // Obtain facets results and print them + List res = facetsCollector.getFacetResults(); + + int i = 0; + for (FacetResult facetResult : res) { + ExampleUtils.log("Res " + (i++) + ": " + facetResult); + } + + return res; + } + + /** + * Search an index with facets drill-down. + * @param indexReader index reader. + * @param taxoReader taxonomy reader. + * @throws Exception on error (no detailed exception handling here for sample simplicity + * @return facet results + */ + public static List searchWithDrillDown(IndexReader indexReader, + TaxonomyReader taxoReader) throws Exception { + + // base query the user is interested in + Query baseQuery = new TermQuery(new Term(SimpleUtils.TEXT, "white")); + + // facet of interest + CountFacetRequest facetRequest = new CountFacetRequest(new CategoryPath("root","a"), 10); + + // initial search - all docs matching the base query will contribute to the accumulation + List res1 = searchWithRequest(indexReader, taxoReader, null, facetRequest); + + // a single result (because there was a single request) + FacetResult fres = res1.get(0); + + // assume the user is interested in the second sub-result + // (just take the second sub-result returned by the iterator - we know there are 3 results!) + Iterator resIterator = fres.getFacetResultNode().getSubResults().iterator(); + resIterator.next(); // skip first result + CategoryPath categoryOfInterest = resIterator.next().getLabel(); + + // drill-down preparation: turn the base query into a drill-down query for the category of interest + Query q2 = DrillDown.query(baseQuery, categoryOfInterest); + + // that's it - search with the new query and we're done! 
+ // only documents both matching the base query AND containing the + // category of interest will contribute to the new accumulation + return searchWithRequestAndQuery(q2, indexReader, taxoReader, null, facetRequest); + } + +} diff --git a/modules/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleUtils.java b/modules/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleUtils.java new file mode 100644 index 00000000000..7765c855489 --- /dev/null +++ b/modules/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleUtils.java @@ -0,0 +1,87 @@ +package org.apache.lucene.facet.example.simple; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; + +import org.apache.lucene.facet.example.ExampleUtils; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Some definitions for the Simple Sample. + * + * @lucene.experimental + */ +public class SimpleUtils { + + /** + * Documents text field. + */ + public static final String TEXT = "text"; + + /** + * Documents title field. + */ + public static final String TITLE = "title"; + + /** + * sample documents text (for the text field). + */ + public static String[] docTexts = { + "the white car is the one I want.", + "the white dog does not belong to anyone.", + }; + + /** + * sample documents titles (for the title field). + */ + public static String[] docTitles = { + "white car", + "white dog", + }; + + /** + * Categories: categories[D][N] == category-path no. N for document no. D. + */ + public static CategoryPath[][] categories = { + { new CategoryPath("root","a","f1"), new CategoryPath("root","a","f2") }, + { new CategoryPath("root","a","f1"), new CategoryPath("root","a","f3") }, + }; + + /** + * Analyzer used in the simple sample. + */ + public static final Analyzer analyzer = new WhitespaceAnalyzer(ExampleUtils.EXAMPLE_VER); + + /** + * Utility method: List of category paths out of an array of them... + * @param categoryPaths input array of category paths. + */ + public static List categoryPathArrayToList (CategoryPath...categoryPaths) { + ArrayList res = new ArrayList(); + for (CategoryPath categoryPath : categoryPaths) { + res.add(categoryPath); + } + return res; + } + +} diff --git a/modules/facet/src/examples/org/apache/lucene/facet/example/simple/package.html b/modules/facet/src/examples/org/apache/lucene/facet/example/simple/package.html new file mode 100644 index 00000000000..11d26676a45 --- /dev/null +++ b/modules/facet/src/examples/org/apache/lucene/facet/example/simple/package.html @@ -0,0 +1,17 @@ + + + Simple faceted indexing and search sample + + +

Simple faceted indexing and search sample

+
+ A simple faceted example, showing how to:
+   1. Create an index.
+   2. Add documents with facets to the index.
+   3. Search the index.
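+
+ In outline, the end-to-end flow of the simple sample is roughly the following
+ sketch (based on SimpleMain and SimpleIndexer, using in-memory directories):
+
+   Directory indexDir = new RAMDirectory();
+   Directory taxoDir = new RAMDirectory();
+   SimpleIndexer.index(indexDir, taxoDir);
+   TaxonomyReader taxo = new LuceneTaxonomyReader(taxoDir);
+   IndexReader indexReader = IndexReader.open(indexDir, true);
+   List<FacetResult> facetRes = SimpleSearcher.searchWithFacets(indexReader, taxo);
+   taxo.close();
+   indexReader.close();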
+ + For more complex examples see the other sample code packages. + + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/DocumentBuilder.java b/modules/facet/src/java/org/apache/lucene/DocumentBuilder.java new file mode 100644 index 00000000000..26cee4b2f2e --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/DocumentBuilder.java @@ -0,0 +1,77 @@ +package org.apache.lucene; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An interface which standardizes the process of building an indexable + * {@link Document}. + *

+ * The idea is that implementations implement {@link #build(Document doc)}, + * which adds to the given Document whatever {@link Field}s it wants to add. A + * DocumentBuilder is also allowed to inspect or change existing Fields in the + * Document, if it wishes to. + *

+ * Implementations should normally have a constructor with parameters which + * determine what {@link #build(Document)} will add to doc.
+ * To allow reuse of the DocumentBuilder object, implementations are also + * encouraged to have a setter method, which remembers its parameters just like + * the constructor. This setter method cannot be described in this interface, + * because it will take different parameters in each implementation. + *
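+ * For example, a minimal reuse sketch with
+ * {@link org.apache.lucene.facet.index.CategoryDocumentBuilder} (variable names
+ * illustrative; assumes the setter replaces the previously set categories):
+ *
+ *   CategoryDocumentBuilder builder = new CategoryDocumentBuilder(taxoWriter);
+ *   builder.setCategoryPaths(categoriesOfDoc1).build(doc1);
+ *   builder.setCategoryPaths(categoriesOfDoc2).build(doc2);
+ *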

+ * The interface defines a builder pattern, which allows applications to invoke + * several document builders in the following way: + * + *

+ * builder1.build(builder2.build(builder3.build(new Document())));
+ * 
+ * + * @lucene.experimental + */ +public interface DocumentBuilder { + + /** An exception thrown from {@link DocumentBuilder}'s build(). */ + public static class DocumentBuilderException extends Exception { + + public DocumentBuilderException() { + super(); + } + + public DocumentBuilderException(String message) { + super(message); + } + + public DocumentBuilderException(String message, Throwable cause) { + super(message, cause); + } + + public DocumentBuilderException(Throwable cause) { + super(cause); + } + + } + + /** + * Adds to the given document whatever {@link Field}s the implementation needs + * to add. Return the docunment instance to allow for chaining calls. + */ + public Document build(Document doc) throws DocumentBuilderException; + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/FacetException.java b/modules/facet/src/java/org/apache/lucene/facet/FacetException.java new file mode 100644 index 00000000000..a03797ca091 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/FacetException.java @@ -0,0 +1,46 @@ +package org.apache.lucene.facet; + +import java.io.IOException; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A parent class for exceptions thrown by the Facets code. + * + * @lucene.experimental + */ +public class FacetException extends IOException { + + public FacetException() { + super(); + } + + public FacetException(String message) { + super(message); + } + + public FacetException(String message, Throwable cause) { + super(message); + initCause(cause); + } + + public FacetException(Throwable cause) { + initCause(cause); + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/CategoryEnhancement.java b/modules/facet/src/java/org/apache/lucene/facet/enhancements/CategoryEnhancement.java new file mode 100644 index 00000000000..fced0cf7393 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/enhancements/CategoryEnhancement.java @@ -0,0 +1,127 @@ +package org.apache.lucene.facet.enhancements; + +import org.apache.lucene.analysis.TokenStream; + +import org.apache.lucene.facet.enhancements.params.EnhancementsIndexingParams; +import org.apache.lucene.facet.index.attributes.CategoryAttribute; +import org.apache.lucene.facet.index.attributes.CategoryProperty; +import org.apache.lucene.facet.index.streaming.CategoryListTokenizer; +import org.apache.lucene.facet.index.streaming.CategoryParentsStream; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This interface allows easy addition of enhanced category features. Usually, a + * {@link CategoryEnhancement} will correspond to a {@link CategoryProperty}. + *

+ * A category enhancement can contribute to the index in two possible ways: + *

    + *
+ *   1. To each category with data relevant to the enhancement, add this data to
+ *      the category's token payload, through
+ *      {@link #getCategoryTokenBytes(CategoryAttribute)}. This data will be read
+ *      during search using {@link #extractCategoryTokenData(byte[], int, int)}.
+ *   2. To each document which contains categories with data relevant to the
+ *      enhancement, add a {@link CategoryListTokenizer} through
+ *      {@link #getCategoryListTokenizer(TokenStream, EnhancementsIndexingParams, TaxonomyWriter)}.
+ *      The {@link CategoryListTokenizer} should add a single token which includes
+ *      all the enhancement relevant data from the categories. The category list
+ *      token's text is defined by {@link #getCategoryListTermText()}.
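+ *
+ * For example, {@link org.apache.lucene.facet.enhancements.association.AssociationEnhancement}
+ * uses both ways: it writes each category's association value into the category
+ * token payload, and it also generates a category list through an
+ * {@link org.apache.lucene.facet.enhancements.association.AssociationListTokenizer},
+ * which emits a single token holding the (ordinal, association) pairs.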
+ * + * @lucene.experimental + */ +public interface CategoryEnhancement { + + /** + * Get the bytes to be added to the category token payload for this + * enhancement. + *

+ * NOTE: The returned array is copied; it is recommended to allocate
+ * a new one each time.
+ *

+ * The bytes generated by this method are the input of + * {@link #extractCategoryTokenData(byte[], int, int)}. + * + * @param categoryAttribute + * The attribute of the category. + * @return The bytes to be added to the category token payload for this + * enhancement. + */ + byte[] getCategoryTokenBytes(CategoryAttribute categoryAttribute); + + /** + * Get the data of this enhancement from a category token payload. + *

+ * The input bytes for this method are generated in + * {@link #getCategoryTokenBytes(CategoryAttribute)}. + * + * @param buffer + * The payload buffer. + * @param offset + * The offset of this enhancement's data in the buffer. + * @param length + * The length of this enhancement's data (bytes). + * @return An Object containing the data. + */ + Object extractCategoryTokenData(byte[] buffer, int offset, int length); + + /** + * Declarative method to indicate whether this enhancement generates + * separate category list. + * + * @return {@code true} if generates category list, else {@code false}. + */ + boolean generatesCategoryList(); + + /** + * Returns the text of this enhancement's category list term. + * + * @return The text of this enhancement's category list term. + */ + String getCategoryListTermText(); + + /** + * Get the {@link CategoryListTokenizer} which generates the category list + * for this enhancement. If {@link #generatesCategoryList()} returns + * {@code false} this method will not be called. + * + * @param tokenizer + * The input stream containing categories. + * @param indexingParams + * The indexing params to use. + * @param taxonomyWriter + * The taxonomy to add categories and get their ordinals. + * @return A {@link CategoryListTokenizer} generating the category list for + * this enhancement, with {@code tokenizer} as it's input. + */ + CategoryListTokenizer getCategoryListTokenizer(TokenStream tokenizer, + EnhancementsIndexingParams indexingParams, + TaxonomyWriter taxonomyWriter); + + /** + * Get a {@link CategoryProperty} class to be retained when creating + * {@link CategoryParentsStream}. + * + * @return the {@link CategoryProperty} class to be retained when creating + * {@link CategoryParentsStream}, or {@code null} if there is no + * such property. + */ + Class getRetainableProperty(); + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/EnhancementsCategoryTokenizer.java b/modules/facet/src/java/org/apache/lucene/facet/enhancements/EnhancementsCategoryTokenizer.java new file mode 100644 index 00000000000..9d401bcd32e --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/enhancements/EnhancementsCategoryTokenizer.java @@ -0,0 +1,121 @@ +package org.apache.lucene.facet.enhancements; + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.analysis.TokenStream; + +import org.apache.lucene.facet.enhancements.params.EnhancementsIndexingParams; +import org.apache.lucene.facet.index.streaming.CategoryTokenizer; +import org.apache.lucene.util.Vint8; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * A tokenizer which adds to each category token payload according to the + * {@link CategoryEnhancement}s defined in the given + * {@link EnhancementsIndexingParams}. + * + * @lucene.experimental + */ +public class EnhancementsCategoryTokenizer extends CategoryTokenizer { + + /** + * The data buffer used for payload instance. + */ + protected byte[] payloadBytes; + + /** + * The category enhancements to handle + */ + protected List enhancements; + + /** + * Buffers for enhancement payload bytes + */ + protected byte[][] enhancementBytes; + + private int nStart; + + /** + * Constructor. + * + * @param input + * The stream of category tokens. + * @param indexingParams + * The indexing params to use. + * @throws IOException + */ + public EnhancementsCategoryTokenizer(TokenStream input, + EnhancementsIndexingParams indexingParams) throws IOException { + super(input, indexingParams); + payloadBytes = new byte[Vint8.MAXIMUM_BYTES_NEEDED + * (indexingParams.getCategoryEnhancements().size() + 1)]; + enhancements = indexingParams.getCategoryEnhancements(); + if (enhancements != null) { + // create array of bytes per enhancement + enhancementBytes = new byte[enhancements.size()][]; + // write once the number of enhancements in the payload bytes + nStart = Vint8.encode(enhancements.size(), payloadBytes, 0); + } + } + + @Override + protected void setPayload() { + this.payloadAttribute.setPayload(null); + if (enhancements == null) { + return; + } + // clear previous payload content + int nBytes = nStart; + int i = 0; + int nEnhancementBytes = 0; + for (CategoryEnhancement enhancement : enhancements) { + // get payload bytes from each enhancement + enhancementBytes[i] = enhancement + .getCategoryTokenBytes(categoryAttribute); + // write the number of bytes in the payload + if (enhancementBytes[i] == null) { + nBytes += Vint8.encode(0, payloadBytes, nBytes); + } else { + nBytes += Vint8.encode(enhancementBytes[i].length, + payloadBytes, nBytes); + nEnhancementBytes += enhancementBytes[i].length; + } + i++; + } + if (nEnhancementBytes > 0) { + // make sure we have space for all bytes + if (payloadBytes.length < nBytes + nEnhancementBytes) { + byte[] temp = new byte[(nBytes + nEnhancementBytes) * 2]; + System.arraycopy(payloadBytes, 0, temp, 0, nBytes); + payloadBytes = temp; + } + for (i = 0; i < enhancementBytes.length; i++) { + // add the enhancement payload bytes after the existing bytes + if (enhancementBytes[i] != null) { + System.arraycopy(enhancementBytes[i], 0, payloadBytes, + nBytes, enhancementBytes[i].length); + nBytes += enhancementBytes[i].length; + } + } + payload.setData(payloadBytes, 0, nBytes); + payloadAttribute.setPayload(payload); + } + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/EnhancementsDocumentBuilder.java b/modules/facet/src/java/org/apache/lucene/facet/enhancements/EnhancementsDocumentBuilder.java new file mode 100644 index 00000000000..0ca364a490c --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/enhancements/EnhancementsDocumentBuilder.java @@ -0,0 +1,93 @@ +package org.apache.lucene.facet.enhancements; + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.analysis.TokenStream; + +import org.apache.lucene.facet.enhancements.params.EnhancementsIndexingParams; +import org.apache.lucene.facet.index.CategoryDocumentBuilder; +import org.apache.lucene.facet.index.attributes.CategoryProperty; +import org.apache.lucene.facet.index.streaming.CategoryAttributesStream; +import 
org.apache.lucene.facet.index.streaming.CategoryListTokenizer; +import org.apache.lucene.facet.index.streaming.CategoryParentsStream; +import org.apache.lucene.facet.index.streaming.CategoryTokenizer; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An {@link EnhancementsDocumentBuilder} is a {@link CategoryDocumentBuilder} + * which adds categories to documents according to the list of + * {@link CategoryEnhancement}s from {@link EnhancementsIndexingParams}. The + * additions over {@link CategoryDocumentBuilder} could be in both category + * tokens, and additional category lists. + * + * @lucene.experimental + */ +public class EnhancementsDocumentBuilder extends CategoryDocumentBuilder { + + /** + * @param taxonomyWriter + * @param params + * Indexing params which include {@link CategoryEnhancement}s. + * @throws IOException + */ + public EnhancementsDocumentBuilder(TaxonomyWriter taxonomyWriter, + EnhancementsIndexingParams params) throws IOException { + super(taxonomyWriter, params); + } + + @Override + protected TokenStream getParentsStream(CategoryAttributesStream categoryAttributesStream) { + List> toRetainList = ((EnhancementsIndexingParams) indexingParams) + .getRetainableProperties(); + if (toRetainList != null) { + CategoryParentsStream categoryParentsStream = new CategoryParentsStream( + categoryAttributesStream, taxonomyWriter, indexingParams); + for (Class toRetain : toRetainList) { + categoryParentsStream.addRetainableProperty(toRetain); + } + return categoryParentsStream; + } + return super.getParentsStream(categoryAttributesStream); + } + + @Override + protected CategoryListTokenizer getCategoryListTokenizer(TokenStream categoryStream) { + CategoryListTokenizer tokenizer = super.getCategoryListTokenizer(categoryStream); + // Add tokenizer for each enhancement that produces category list + for (CategoryEnhancement enhancement : ((EnhancementsIndexingParams) indexingParams) + .getCategoryEnhancements()) { + if (enhancement.generatesCategoryList()) { + tokenizer = enhancement.getCategoryListTokenizer(tokenizer, + (EnhancementsIndexingParams) indexingParams, + taxonomyWriter); + } + } + return tokenizer; + } + + @Override + protected CategoryTokenizer getCategoryTokenizer(TokenStream categoryStream) + throws IOException { + return new EnhancementsCategoryTokenizer(categoryStream, + (EnhancementsIndexingParams) indexingParams); + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/EnhancementsPayloadIterator.java b/modules/facet/src/java/org/apache/lucene/facet/enhancements/EnhancementsPayloadIterator.java new file mode 100644 index 00000000000..7eb69b96f4c --- /dev/null +++ 
b/modules/facet/src/java/org/apache/lucene/facet/enhancements/EnhancementsPayloadIterator.java @@ -0,0 +1,105 @@ +package org.apache.lucene.facet.enhancements; + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; + +import org.apache.lucene.facet.search.PayloadIterator; +import org.apache.lucene.util.Vint8; +import org.apache.lucene.util.Vint8.Position; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A {@link PayloadIterator} for iterating over category posting lists generated + * using {@link EnhancementsCategoryTokenizer}. + * + * @lucene.experimental + */ +public class EnhancementsPayloadIterator extends PayloadIterator { + + private CategoryEnhancement[] EnhancedCategories; + int nEnhancements; + private int[] enhancementLength; + private int[] enhancementStart; + + /** + * Constructor. + * + * @param enhancementsList + * A list of the {@link CategoryEnhancement}s from the indexing + * params. + * @param indexReader + * A reader of the index. + * @param term + * The category term to iterate. + * @throws IOException + */ + public EnhancementsPayloadIterator( + List enhancementsList, + IndexReader indexReader, Term term) throws IOException { + super(indexReader, term); + EnhancedCategories = enhancementsList + .toArray(new CategoryEnhancement[enhancementsList.size()]); + enhancementLength = new int[EnhancedCategories.length]; + enhancementStart = new int[EnhancedCategories.length]; + } + + @Override + public boolean setdoc(int docId) throws IOException { + if (!super.setdoc(docId)) { + return false; + } + + // read header - number of enhancements and their lengths + Position position = new Position(); + nEnhancements = Vint8.decode(buffer, position); + for (int i = 0; i < nEnhancements; i++) { + enhancementLength[i] = Vint8.decode(buffer, position); + } + + // set enhancements start points + enhancementStart[0] = position.pos; + for (int i = 1; i < nEnhancements; i++) { + enhancementStart[i] = enhancementStart[i - 1] + enhancementLength[i - 1]; + } + + return true; + } + + /** + * Get the data of the current category and document for a certain + * enhancement, or {@code null} if no such enhancement exists. + * + * @param enhancedCategory + * The category enhancement to apply. + * @return the data of the current category and document for a certain + * enhancement, or {@code null} if no such enhancement exists. 
+ */ + public Object getCategoryData(CategoryEnhancement enhancedCategory) { + for (int i = 0; i < nEnhancements; i++) { + if (enhancedCategory.equals(EnhancedCategories[i])) { + return enhancedCategory.extractCategoryTokenData(buffer, + enhancementStart[i], enhancementLength[i]); + } + } + return null; + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationEnhancement.java b/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationEnhancement.java new file mode 100644 index 00000000000..db965c10e10 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationEnhancement.java @@ -0,0 +1,153 @@ +package org.apache.lucene.facet.enhancements.association; + +import java.util.HashSet; +import java.util.Set; + +import org.apache.lucene.analysis.TokenStream; + +import org.apache.lucene.facet.enhancements.CategoryEnhancement; +import org.apache.lucene.facet.enhancements.params.EnhancementsIndexingParams; +import org.apache.lucene.facet.index.attributes.CategoryAttribute; +import org.apache.lucene.facet.index.attributes.CategoryProperty; +import org.apache.lucene.facet.index.streaming.CategoryListTokenizer; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.util.Vint8; +import org.apache.lucene.util.Vint8.Position; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A {@link CategoryEnhancement} for adding associations data to the index + * (categories with {@link AssociationProperty}s). + * + * @lucene.experimental + */ +public class AssociationEnhancement implements CategoryEnhancement { + + static final String CATEGORY_LIST_TERM_TEXT = "CATEGORY_ASSOCIATION_LIST"; + + /** Property Classes which extend AssociationProperty */ + private static final HashSet> ASSOCIATION_PROPERTY_CLASSES; + + /** Property Classes which do not extend AssociationProperty */ + private static final HashSet> NON_ASSOCIATION_PROPERTY_CLASSES; + + static { + ASSOCIATION_PROPERTY_CLASSES = new HashSet>(); + NON_ASSOCIATION_PROPERTY_CLASSES = new HashSet>(); + } + + /** + * For a given class which extends a CategoryProperty, answers whether it is + * an instance of AssociationProperty (AP) or not.
+ * This method is a cheaper replacement for a call to + * instanceof. It has two HashSets - one for classes which are + * an extension to AP and one for the classes which are not. Whenever a + * property class is introduced: + *

    + *
+ *   - if it is known as a property class extending AP (contained in the
+ *     validHashSet) - returns true
+ *   - if it is known as a property class NOT extending AP - returns false
+ *   - if it was not matched against either set, it calls 'instanceof' to find
+ *     out if it extends AP, puts it in the matching Set, and returns true or
+ *     false accordingly
+ * + * NOTE: 'instanceof' is only called once per a Class (not instance) of a + * property. And as there are few properties (currently 4 concrete + * implementations) the two sets would be rather small + */ + public static boolean isAssociationProperty(Class clazz) { + if (ASSOCIATION_PROPERTY_CLASSES.contains(clazz)) { + return true; + } + + if (NON_ASSOCIATION_PROPERTY_CLASSES.contains(clazz)) { + return false; + } + + if (AssociationProperty.class.isAssignableFrom(clazz)) { + ASSOCIATION_PROPERTY_CLASSES.add(clazz); + return true; + } + + NON_ASSOCIATION_PROPERTY_CLASSES.add(clazz); + return false; + } + + public boolean generatesCategoryList() { + return true; + } + + public String getCategoryListTermText() { + return CATEGORY_LIST_TERM_TEXT; + } + + public CategoryListTokenizer getCategoryListTokenizer( + TokenStream tokenizer, EnhancementsIndexingParams indexingParams, + TaxonomyWriter taxonomyWriter) { + return new AssociationListTokenizer(tokenizer, indexingParams, this); + } + + public byte[] getCategoryTokenBytes(CategoryAttribute categoryAttribute) { + + AssociationProperty property = getAssociationProperty(categoryAttribute); + + if (property == null) { + return null; + } + + int association = property.getAssociation(); + int bytesNeeded = Vint8.bytesNeeded(association); + byte[] buffer = new byte[bytesNeeded]; + Vint8.encode(association, buffer, 0); + return buffer; + } + + public static AssociationProperty getAssociationProperty( + CategoryAttribute categoryAttribute) { + AssociationProperty property = null; + Set> propertyClasses = categoryAttribute + .getPropertyClasses(); + if (propertyClasses == null) { + return null; + } + for (Class clazz : propertyClasses) { + if (isAssociationProperty(clazz)) { + property = (AssociationProperty) categoryAttribute + .getProperty(clazz); + break; + } + } + return property; + } + + public Object extractCategoryTokenData(byte[] buffer, int offset, int length) { + if (length == 0) { + return null; + } + Integer i = Integer.valueOf(Vint8.decode(buffer, new Position(offset))); + return i; + } + + public Class getRetainableProperty() { + return null; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationFloatProperty.java b/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationFloatProperty.java new file mode 100644 index 00000000000..0af8fa12797 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationFloatProperty.java @@ -0,0 +1,74 @@ +package org.apache.lucene.facet.enhancements.association; + +import org.apache.lucene.facet.index.attributes.CategoryProperty; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * An {@link AssociationProperty} which treats the association as float - the + * association bits are actually float bits, and thus merging two associations + * is done by float summation. + * + * @lucene.experimental + */ +public class AssociationFloatProperty extends AssociationProperty { + + /** + * Constructor. + * + * @param value + * The association value. + */ + public AssociationFloatProperty(float value) { + super(Float.floatToIntBits(value)); + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } + if (!(other instanceof AssociationFloatProperty)) { + return false; + } + AssociationFloatProperty o = (AssociationFloatProperty) other; + return o.association == this.association; + } + + @Override + public int hashCode() { + return "AssociationFloatProperty".hashCode() * 31 + (int) association; + } + + public void merge(CategoryProperty other) { + AssociationFloatProperty o = (AssociationFloatProperty) other; + this.association = Float.floatToIntBits(Float + .intBitsToFloat((int) this.association) + + Float.intBitsToFloat((int) o.association)); + } + + public float getFloatAssociation() { + return Float.intBitsToFloat((int) association); + } + + @Override + public String toString() { + return getClass().getSimpleName() + ": " + Float.intBitsToFloat(getAssociation()); + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationIntProperty.java b/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationIntProperty.java new file mode 100644 index 00000000000..675cb34aded --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationIntProperty.java @@ -0,0 +1,60 @@ +package org.apache.lucene.facet.enhancements.association; + +import org.apache.lucene.facet.index.attributes.CategoryProperty; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An {@link AssociationProperty} which treats the association as int - merges + * two associations by summation. + * + * @lucene.experimental + */ +public class AssociationIntProperty extends AssociationProperty { + + /** + * @param value + * The association value. 
+ */ + public AssociationIntProperty(int value) { + super(value); + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } + if (!(other instanceof AssociationIntProperty)) { + return false; + } + AssociationIntProperty o = (AssociationIntProperty) other; + return o.association == this.association; + } + + @Override + public int hashCode() { + return "AssociationIntProperty".hashCode() * 31 + (int) association; + } + + public void merge(CategoryProperty other) { + AssociationIntProperty o = (AssociationIntProperty) other; + this.association += o.association; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationListTokenizer.java b/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationListTokenizer.java new file mode 100644 index 00000000000..c0a30a8c7a3 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationListTokenizer.java @@ -0,0 +1,90 @@ +package org.apache.lucene.facet.enhancements.association; + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenStream; + +import org.apache.lucene.facet.enhancements.CategoryEnhancement; +import org.apache.lucene.facet.enhancements.params.EnhancementsIndexingParams; +import org.apache.lucene.facet.index.CategoryListPayloadStream; +import org.apache.lucene.facet.index.attributes.OrdinalProperty; +import org.apache.lucene.facet.index.streaming.CategoryListTokenizer; +import org.apache.lucene.util.encoding.SimpleIntEncoder; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * Tokenizer for associations of a category + * + * @lucene.experimental + */ +public class AssociationListTokenizer extends CategoryListTokenizer { + + protected CategoryListPayloadStream payloadStream; + + private String categoryListTermText; + + public AssociationListTokenizer(TokenStream input, + EnhancementsIndexingParams indexingParams, CategoryEnhancement enhancement) { + super(input, indexingParams); + categoryListTermText = enhancement.getCategoryListTermText(); + } + + @Override + protected void handleStartOfInput() throws IOException { + payloadStream = null; + } + + @Override + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (categoryAttribute != null) { + AssociationProperty associationProperty = AssociationEnhancement + .getAssociationProperty(categoryAttribute); + if (associationProperty != null + && associationProperty.hasBeenSet()) { + OrdinalProperty ordinalProperty = (OrdinalProperty) categoryAttribute + .getProperty(OrdinalProperty.class); + if (ordinalProperty == null) { + throw new IOException( + "Error: Association without ordinal"); + } + + if (payloadStream == null) { + payloadStream = new CategoryListPayloadStream( + new SimpleIntEncoder()); + } + payloadStream.appendIntToStream(ordinalProperty + .getOrdinal()); + payloadStream.appendIntToStream(associationProperty + .getAssociation()); + } + } + return true; + } + if (payloadStream != null) { + termAttribute.setEmpty().append(categoryListTermText); + payload.setData(payloadStream.convertStreamToByteArray()); + payloadAttribute.setPayload(payload); + payloadStream = null; + return true; + } + return false; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationProperty.java b/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationProperty.java new file mode 100644 index 00000000000..bdda839ae1a --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationProperty.java @@ -0,0 +1,73 @@ +package org.apache.lucene.facet.enhancements.association; + +import org.apache.lucene.facet.index.attributes.CategoryAttribute; +import org.apache.lucene.facet.index.attributes.CategoryProperty; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A {@link CategoryProperty} associating a single integer value to a + * {@link CategoryAttribute}. It should be used to describe the association + * between the category and the document. + *

+ * This class leaves it to extending classes to define the + * {@link #merge(CategoryProperty)} policy for the integer associations. + *
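+ * For example (an illustrative sketch, not taken from this patch's sources), an
+ * additive merge policy, like the one AssociationIntProperty earlier in this
+ * patch uses, could be written as:
+ *
+ *   public void merge(CategoryProperty other) {
+ *     this.association += ((AssociationProperty) other).getAssociation();
+ *   }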

+ * Note: The association value is added both to a special category list, + * and to the category tokens. + * + * @see AssociationEnhancement + * @lucene.experimental + */ +public abstract class AssociationProperty implements CategoryProperty { + + protected long association = Integer.MAX_VALUE + 1; + + /** + * Construct an {@link AssociationProperty}. + * + * @param value + * The association value. + */ + public AssociationProperty(int value) { + this.association = value; + } + + /** + * Returns the association value. + * + * @return The association value. + */ + public int getAssociation() { + return (int) association; + } + + /** + * Returns whether this attribute has been set (not all categories have an + * association). + */ + public boolean hasBeenSet() { + return this.association <= Integer.MAX_VALUE; + } + + @Override + public String toString() { + return getClass().getSimpleName() + ": " + association; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationsPayloadIterator.java b/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationsPayloadIterator.java new file mode 100644 index 00000000000..bae9a410d64 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/AssociationsPayloadIterator.java @@ -0,0 +1,235 @@ +package org.apache.lucene.facet.enhancements.association; + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; + +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.search.PayloadIntDecodingIterator; +import org.apache.lucene.util.collections.IntIterator; +import org.apache.lucene.util.collections.IntToIntMap; +import org.apache.lucene.util.encoding.SimpleIntDecoder; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Allows easy iteration over the associations payload, decoding and breaking it + * to (ordinal, value) pairs, stored in a hash. 
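+ * A minimal usage sketch (illustrative only; {@code reader}, {@code docId} and
+ * {@code ordinal} are assumed to exist and are not part of this class):
+ *
+ *   AssociationsPayloadIterator api = new AssociationsPayloadIterator(
+ *       reader, CategoryListParams.DEFAULT_TERM.field());
+ *   if (api.setNextDoc(docId)) {
+ *     long association = api.getAssociation(ordinal);
+ *     if (association != AssociationsPayloadIterator.NO_ASSOCIATION) {
+ *       // the ordinal has an associated int value, encapsulated in the long
+ *     }
+ *   }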
+ * + * @lucene.experimental + */ +public class AssociationsPayloadIterator { + + /** + * Default Term for associations + */ + public static final Term ASSOCIATION_POSTING_TERM = new Term( + CategoryListParams.DEFAULT_TERM.field(), + AssociationEnhancement.CATEGORY_LIST_TERM_TEXT); + + /** + * Hash mapping to ordinals to the associated int value + */ + private IntToIntMap ordinalToAssociationMap; + + /** + * An inner payload decoder which actually goes through the posting and + * decode the ints representing the ordinals and the values + */ + private PayloadIntDecodingIterator associationPayloadIter; + + /** + * Marking whether there are associations (at all) in the given index + */ + private boolean hasAssociations = false; + + /** + * The long-special-value returned for ordinals which have no associated int + * value. It is not in the int range of values making it a valid mark. + */ + public final static long NO_ASSOCIATION = Integer.MAX_VALUE + 1; + + /** + * Construct a new association-iterator, initializing the inner payload + * iterator, with the supplied term and checking whether there are any + * associations within the given index + * + * @param reader + * a reader containing the postings to be iterated + * @param field + * the field containing the relevant associations list term + */ + public AssociationsPayloadIterator(IndexReader reader, String field) + throws IOException { + // Initialize the payloadDecodingIterator + associationPayloadIter = new PayloadIntDecodingIterator( + reader, + // TODO (Facet): should consolidate with AssociationListTokenizer which + // uses AssociationEnhancement.getCatTermText() + new Term(field, AssociationEnhancement.CATEGORY_LIST_TERM_TEXT), + new SimpleIntDecoder()); + + // Check whether there are any associations + hasAssociations = associationPayloadIter.init(); + + ordinalToAssociationMap = new IntToIntMap(); + } + + /** + * Skipping to the next document, fetching its associations & populating the + * map. + * + * @param docId + * document id to be skipped to + * @return true if the document contains associations and they were fetched + * correctly. false otherwise. + * @throws IOException + * on error + */ + public boolean setNextDoc(int docId) throws IOException { + ordinalToAssociationMap.clear(); + boolean docContainsAssociations = false; + try { + docContainsAssociations = fetchAssociations(docId); + } catch (IOException e) { + IOException ioe = new IOException( + "An Error occured while reading a document's associations payload (docId=" + + docId + ")"); + ioe.initCause(e); + throw ioe; + } + + return docContainsAssociations; + } + + /** + * Get int association value for the given ordinal.
+ * The return is either an int value casted as long if the ordinal has an + * associated value. Otherwise the returned value would be + * {@link #NO_ASSOCIATION} which is 'pure long' value (e.g not in the int + * range of values) + * + * @param ordinal + * for which the association value is requested + * @return the associated int value (encapsulated in a long) if the ordinal + * had an associated value, or {@link #NO_ASSOCIATION} otherwise + */ + public long getAssociation(int ordinal) { + if (ordinalToAssociationMap.containsKey(ordinal)) { + return ordinalToAssociationMap.get(ordinal); + } + + return NO_ASSOCIATION; + } + + /** + * Get an iterator over the ordinals which has an association for the + * document set by {@link #setNextDoc(int)}. + */ + public IntIterator getAssociatedOrdinals() { + return ordinalToAssociationMap.keyIterator(); + } + + /** + * Skips to the given docId, getting the values in pairs of (ordinal, value) + * and populating the map + * + * @param docId + * document id owning the associations + * @return true if associations were fetched successfully, false otherwise + * @throws IOException + * on error + */ + private boolean fetchAssociations(int docId) throws IOException { + // No associations at all? don't bother trying to seek the docID in the + // posting + if (!hasAssociations) { + return false; + } + + // No associations for this document? well, nothing to decode than, + // return false + if (!associationPayloadIter.skipTo(docId)) { + return false; + } + + // loop over all the values decoded from the payload in pairs. + for (;;) { + // Get the ordinal + long ordinal = associationPayloadIter.nextCategory(); + + // if no ordinal - it's the end of data, break the loop + if (ordinal > Integer.MAX_VALUE) { + break; + } + + // get the associated value + long association = associationPayloadIter.nextCategory(); + // If we're at this step - it means we have an ordinal, do we have + // an association for it? + if (association > Integer.MAX_VALUE) { + // No association!!! A Broken Pair!! PANIC! + throw new IOException( + "ERROR! Associations should come in pairs of (ordinal, value), yet this payload has an odd number of values! (docId=" + + docId + ")"); + } + // Populate the map with the given ordinal and association pair + ordinalToAssociationMap.put((int) ordinal, (int) association); + } + + return true; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime + * result + + ((associationPayloadIter == null) ? 0 + : associationPayloadIter.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + + if (obj == null) { + return false; + } + + if (getClass() != obj.getClass()) { + return false; + } + + AssociationsPayloadIterator other = (AssociationsPayloadIterator) obj; + if (associationPayloadIter == null) { + if (other.associationPayloadIter != null) { + return false; + } + } else if (!associationPayloadIter.equals(other.associationPayloadIter)) { + return false; + } + return true; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/package.html b/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/package.html new file mode 100644 index 00000000000..7c19bb32452 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/enhancements/association/package.html @@ -0,0 +1,13 @@ + + +Association category enhancements + + +

Association category enhancements

+ +A {@link org.apache.lucene.facet.enhancements.CategoryEnhancement CategoryEnhancement} +for adding associations data to the index (categories with +{@link org.apache.lucene.facet.enhancements.association.AssociationProperty AssociationProperty}'s). + + + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/package.html b/modules/facet/src/java/org/apache/lucene/facet/enhancements/package.html new file mode 100644 index 00000000000..f8515111dd2 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/enhancements/package.html @@ -0,0 +1,32 @@ + + +Enhanced category features + + +

Enhanced category features

+ +Mechanisms for addition of enhanced category features. +

A {@link org.apache.lucene.facet.enhancements.CategoryEnhancement CategoryEnhancement} +(which can correspond to a +{@link org.apache.lucene.facet.index.attributes.CategoryProperty CategoryProperty}) +can contribute to the index in two possible ways: +

    +
  1. To each category with data relevant to the enhancement, + add this data to the category's token payload, through + {@link org.apache.lucene.facet.enhancements.CategoryEnhancement#getCategoryTokenBytes(CategoryAttribute) CategoryEnhancement.getCategoryTokenBytes()}. + This data will be read during search using + {@link org.apache.lucene.facet.enhancements.CategoryEnhancement#extractCategoryTokenData(byte[], int, int) CategoryEnhancement.extractCategoryTokenData()}. +
  2. To each document which contains categories with data relevant to the enhancement, add a + {@link org.apache.lucene.facet.index.streaming.CategoryListTokenizer CategoryListTokenizer} through + {@link org.apache.lucene.facet.enhancements.CategoryEnhancement#getCategoryListTokenizer CategoryEnhancement.getCategoryListTokenizer()}. + The + {@link org.apache.lucene.facet.index.streaming.CategoryListTokenizer CategoryListTokenizer} + should add a single token which includes all the enhancement relevant data from the categories. + The category list token's text is defined by + {@link org.apache.lucene.facet.enhancements.CategoryEnhancement#getCategoryListTermText() CategoryEnhancement.getCategoryListTermText()}.
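
A brief sketch of enabling the association enhancement at indexing time
(illustrative only; the no-argument AssociationEnhancement constructor is an
assumption and is not shown in this patch):

  EnhancementsIndexingParams params =
      new DefaultEnhancementsIndexingParams(new AssociationEnhancement());

The resulting parameters are then passed to the
{@link org.apache.lucene.facet.enhancements.EnhancementsDocumentBuilder EnhancementsDocumentBuilder}
used for indexing.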
+ + + diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/params/DefaultEnhancementsIndexingParams.java b/modules/facet/src/java/org/apache/lucene/facet/enhancements/params/DefaultEnhancementsIndexingParams.java new file mode 100644 index 00000000000..57d4580055c --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/enhancements/params/DefaultEnhancementsIndexingParams.java @@ -0,0 +1,98 @@ +package org.apache.lucene.facet.enhancements.params; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.facet.enhancements.CategoryEnhancement; +import org.apache.lucene.facet.index.attributes.CategoryProperty; +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.PerDimensionIndexingParams; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Default implementation of {@link EnhancementsIndexingParams} + * + * @lucene.experimental + */ +public class DefaultEnhancementsIndexingParams extends + PerDimensionIndexingParams implements EnhancementsIndexingParams { + + private List enhancedCategories; + + /** + * Construct with a certain {@link CategoryEnhancement enhancement} + * @throws IllegalArgumentException if no enhancements are provided + */ + public DefaultEnhancementsIndexingParams(CategoryEnhancement... enhancements) { + super(); + validateparams(enhancements); + addCategoryEnhancements(enhancements); + } + + private void validateparams(CategoryEnhancement... enhancements) { + if (enhancements==null || enhancements.length<1) { + throw new IllegalArgumentException("at least one enhancement is required"); + } + } + + /** + * Construct with certain {@link CategoryEnhancement enhancements} + * and {@link CategoryListParams} + * @throws IllegalArgumentException if no enhancements are provided + */ + public DefaultEnhancementsIndexingParams( + CategoryListParams categoryListParams, + CategoryEnhancement... enhancements) { + super(categoryListParams); + validateparams(enhancements); + addCategoryEnhancements(enhancements); + } + + public void addCategoryEnhancements(CategoryEnhancement... 
enhancements) { + if (enhancedCategories == null) { + enhancedCategories = new ArrayList(); + } + for (CategoryEnhancement categoryEnhancement : enhancements) { + enhancedCategories.add(categoryEnhancement); + } + } + + public List getCategoryEnhancements() { + if (enhancedCategories == null || enhancedCategories.isEmpty()) { + return null; + } + return enhancedCategories; + } + + public List> getRetainableProperties() { + if (enhancedCategories == null) { + return null; + } + List> retainableProperties = new ArrayList>(); + for (CategoryEnhancement enhancement : enhancedCategories) { + if (enhancement.getRetainableProperty() != null) { + retainableProperties.add(enhancement.getRetainableProperty()); + } + } + if (retainableProperties.isEmpty()) { + return null; + } + return retainableProperties; + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/params/EnhancementsIndexingParams.java b/modules/facet/src/java/org/apache/lucene/facet/enhancements/params/EnhancementsIndexingParams.java new file mode 100644 index 00000000000..2862e5b5297 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/enhancements/params/EnhancementsIndexingParams.java @@ -0,0 +1,66 @@ +package org.apache.lucene.facet.enhancements.params; + +import java.util.List; + +import org.apache.lucene.facet.enhancements.CategoryEnhancement; +import org.apache.lucene.facet.enhancements.EnhancementsDocumentBuilder; +import org.apache.lucene.facet.index.attributes.CategoryProperty; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.index.streaming.CategoryParentsStream; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * {@link FacetIndexingParams Facet indexing parameters} for defining + * {@link CategoryEnhancement category enhancements}. It must contain at least + * one enhancement, otherwise nothing is "enhanced" about it. When there are + * more than one, the order matters - see {@link #getCategoryEnhancements()}. + * + * @see EnhancementsDocumentBuilder + * @lucene.experimental + */ +public interface EnhancementsIndexingParams extends FacetIndexingParams { + + /** + * Add {@link CategoryEnhancement}s to the indexing parameters + * @param enhancements enhancements to add + */ + public void addCategoryEnhancements(CategoryEnhancement... enhancements); + + /** + * Get a list of the active category enhancements. If no enhancements exist + * return {@code null}. The order of enhancements in the returned list + * dictates the order in which the enhancements data appear in the category + * tokens payload. + * + * @return A list of the active category enhancements, or {@code null} if + * there are no enhancements. 
+ */ + public List getCategoryEnhancements(); + + /** + * Get a list of {@link CategoryProperty} classes to be retained when + * creating {@link CategoryParentsStream}. + * + * @return the list of {@link CategoryProperty} classes to be retained when + * creating {@link CategoryParentsStream}, or {@code null} if there + * are no such properties. + */ + public List> getRetainableProperties(); + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/enhancements/params/package.html b/modules/facet/src/java/org/apache/lucene/facet/enhancements/params/package.html new file mode 100644 index 00000000000..6ebdfab784a --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/enhancements/params/package.html @@ -0,0 +1,16 @@ + + +Enhanced category features + + +

Enhanced category features

+ +{@link org.apache.lucene.facet.index.params.FacetIndexingParams FacetIndexingParams} +used by +{@link org.apache.lucene.facet.enhancements.EnhancementsDocumentBuilder EnhancementsDocumentBuilder} +for adding +{@link org.apache.lucene.facet.enhancements.CategoryEnhancement CategoryEnhancement}'s +to the indexing parameters, and accessing them during indexing and search. + + + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/CategoryContainer.java b/modules/facet/src/java/org/apache/lucene/facet/index/CategoryContainer.java new file mode 100644 index 00000000000..16336e14eac --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/CategoryContainer.java @@ -0,0 +1,282 @@ +package org.apache.lucene.facet.index; + +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.Serializable; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.util.Attribute; + +import org.apache.lucene.facet.FacetException; +import org.apache.lucene.facet.index.attributes.CategoryAttribute; +import org.apache.lucene.facet.index.attributes.CategoryAttributeImpl; +import org.apache.lucene.facet.index.attributes.CategoryProperty; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A container to add categories which are to be introduced to + * {@link CategoryDocumentBuilder#setCategories(Iterable)}. Categories can be + * added with Properties. + * + * @lucene.experimental + */ +public class CategoryContainer implements Iterable, Serializable { + + protected transient Map map; + + /** + * Constructor. + */ + public CategoryContainer() { + map = new HashMap(); + } + + /** + * Add a category. + * + * @param categoryPath + * The path of the category. + * @return The {@link CategoryAttribute} of the category. + */ + public CategoryAttribute addCategory(CategoryPath categoryPath) { + return mapCategoryAttribute(categoryPath); + } + + /** + * Add a category with a property. + * + * @param categoryPath + * The path of the category. + * @param property + * The property to associate to the category. + * @return The {@link CategoryAttribute} of the category. + */ + public CategoryAttribute addCategory(CategoryPath categoryPath, + CategoryProperty property) { + /* + * This method is a special case of addCategory with multiple + * properties, but it is kept here for two reasons: 1) Using the array + * version has some performance cost, and 2) it is expected that most + * calls will be for this version (single property). 
+ */ + CategoryAttribute ca = mapCategoryAttribute(categoryPath); + ca.addProperty(property); + return ca; + } + + /** + * Add a category with multiple properties. + * + * @param categoryPath + * The path of the category. + * @param properties + * The properties to associate to the category. + * @return The {@link CategoryAttribute} of the category. + * @throws FacetException + * When the category already has a property of the same type as + * one of the new properties, and merging for this property type + * is prohibited. + */ + public CategoryAttribute addCategory(CategoryPath categoryPath, + CategoryProperty... properties) throws FacetException { + CategoryAttribute ca = mapCategoryAttribute(categoryPath); + for (CategoryProperty attribute : properties) { + ca.addProperty(attribute); + } + return ca; + } + + /** + * Add an entire {@link CategoryAttribute}. + * + * @param categoryAttribute + * The {@link CategoryAttribute} to add. + * @return The {@link CategoryAttribute} of the category (could be different + * from the one provided). + * @throws FacetException + */ + public CategoryAttribute addCategory(CategoryAttribute categoryAttribute) + throws FacetException { + CategoryAttribute ca = mapCategoryAttribute(categoryAttribute + .getCategoryPath()); + Set> propertyClasses = categoryAttribute + .getPropertyClasses(); + if (propertyClasses != null) { + for (Class propertyClass : propertyClasses) { + ca.addProperty(categoryAttribute.getProperty(propertyClass)); + } + } + return ca; + } + + /** + * Get the {@link CategoryAttribute} object for a specific + * {@link CategoryPath}, from the map. + */ + private final CategoryAttribute mapCategoryAttribute( + CategoryPath categoryPath) { + CategoryAttribute ca = map.get(categoryPath); + if (ca == null) { + ca = new CategoryAttributeImpl(categoryPath); + map.put(categoryPath, ca); + } + return ca; + } + + /** + * Get the {@link CategoryAttribute} this container has for a certain + * category, or {@code null} if the category is not in the container. + * + * @param categoryPath + * The category path of the requested category. + */ + public CategoryAttribute getCategoryAttribute(CategoryPath categoryPath) { + return map.get(categoryPath); + } + + public Iterator iterator() { + return map.values().iterator(); + } + + /** + * Remove all categories. + */ + public void clear() { + map.clear(); + } + + /** + * Add the categories from another {@link CategoryContainer} to this one. + * + * @param other + * The {@link CategoryContainer} to take categories from. + * @throws FacetException + * If any prohibited merge of category properties is attempted. + */ + public void merge(CategoryContainer other) throws FacetException { + for (CategoryAttribute categoryAttribute : other.map.values()) { + addCategory(categoryAttribute); + } + } + + /** + * Get the number of categories in the container. + * + * @return The number of categories in the container. 
+ */ + public int size() { + return map.size(); + } + + @Override + public String toString() { + StringBuilder builder = new StringBuilder("CategoryContainer"); + for (CategoryAttribute ca : map.values()) { + builder.append('\n'); + builder.append('\t'); + builder.append(ca.toString()); + } + return builder.toString(); + } + + /** + * Serialize object content to given {@link ObjectOutputStream} + */ + private void writeObject(ObjectOutputStream out) throws IOException { + out.defaultWriteObject(); + // write the number of categories + out.writeInt(size()); + // write the category attributes + for (CategoryAttribute ca : this) { + serializeCategoryAttribute(out, ca); + } + } + + /** + * Serialize each of the {@link CategoryAttribute}s to the given + * {@link ObjectOutputStream}.
+ * NOTE: {@link CategoryProperty}s are {@link Serializable}, but do not + * assume that Lucene's {@link Attribute}s are as well + * @throws IOException + */ + protected void serializeCategoryAttribute(ObjectOutputStream out, + CategoryAttribute ca) throws IOException { + out.writeObject(ca.getCategoryPath()); + Set> propertyClasses = ca.getPropertyClasses(); + if (propertyClasses != null) { + out.writeInt(propertyClasses.size()); + for (Class clazz : propertyClasses) { + out.writeObject(ca.getProperty(clazz)); + } + } else { + out.writeInt(0); + } + } + + /** + * Deserialize object from given {@link ObjectInputStream} + */ + private void readObject(ObjectInputStream in) throws IOException, + ClassNotFoundException { + in.defaultReadObject(); + map = new HashMap(); + int size = in.readInt(); + for (int i = 0; i < size; i++) { + deserializeCategoryAttribute(in); + } + } + + /** + * De-Serialize each of the {@link CategoryAttribute}s from the given + * {@link ObjectInputStream}. + */ + protected void deserializeCategoryAttribute(ObjectInputStream in) + throws IOException, ClassNotFoundException { + CategoryPath cp = (CategoryPath) in.readObject(); + int nProperties = in.readInt(); + if (nProperties == 0) { + addCategory(cp); + } else { + for (int j = 0; j < nProperties; j++) { + CategoryProperty property = (CategoryProperty) in.readObject(); + addCategory(cp, property); + } + } + } + + @Override + public boolean equals(Object o) { + if (! (o instanceof CategoryContainer)) { + return false; + } + + CategoryContainer that = (CategoryContainer)o; + return this.map.equals(that.map); + } + + @Override + public int hashCode() { + return map.hashCode(); + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/CategoryDocumentBuilder.java b/modules/facet/src/java/org/apache/lucene/facet/index/CategoryDocumentBuilder.java new file mode 100644 index 00000000000..64c1d5a0335 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/CategoryDocumentBuilder.java @@ -0,0 +1,298 @@ +package org.apache.lucene.facet.index; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; + +import org.apache.lucene.DocumentBuilder; +import org.apache.lucene.facet.index.attributes.CategoryAttribute; +import org.apache.lucene.facet.index.attributes.CategoryAttributesIterable; +import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy; +import org.apache.lucene.facet.index.categorypolicy.PathPolicy; +import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.index.streaming.CategoryAttributesStream; +import org.apache.lucene.facet.index.streaming.CategoryListTokenizer; +import org.apache.lucene.facet.index.streaming.CategoryParentsStream; +import org.apache.lucene.facet.index.streaming.CategoryTokenizer; +import org.apache.lucene.facet.index.streaming.CountingListTokenizer; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A utility class which allows attachment of {@link CategoryPath}s or + * {@link CategoryAttribute}s to a given document using a taxonomy.
+ * It can be constructed either with a given {@link FacetIndexingParams} or with + * the default implementation {@link DefaultFacetIndexingParams}.
+ * A CategoryDocumentBuilder can be reused by repeatedly setting the categories + * and building the document. Categories are provided either as + * {@link CategoryAttribute} elements through {@link #setCategories(Iterable)}, + * or as {@link CategoryPath} elements through + * {@link #setCategoryPaths(Iterable)}. + *
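+ * A typical indexing sketch (illustrative only; {@code taxonomyWriter},
+ * {@code indexWriter} and the varargs {@link CategoryPath} constructor are
+ * assumptions, not taken from this class):
+ *
+ *   CategoryPath category = new CategoryPath("author", "Mark Twain");
+ *   Document doc = new Document();
+ *   new CategoryDocumentBuilder(taxonomyWriter)
+ *       .setCategoryPaths(Collections.singletonList(category))
+ *       .build(doc);
+ *   indexWriter.addDocument(doc);
+ *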

+ * Note that both {@link #setCategories(Iterable)} and + * {@link #setCategoryPaths(Iterable)} return this + * {@link CategoryDocumentBuilder}, allowing the following pattern: {@code new + * CategoryDocumentBuilder(taxonomy, + * params).setCategories(categories).build(doc)}. + * + * @lucene.experimental + */ +public class CategoryDocumentBuilder implements DocumentBuilder { + + /** + * A {@link TaxonomyWriter} for adding categories and retrieving their + * ordinals. + */ + protected final TaxonomyWriter taxonomyWriter; + + /** + * Parameters to be used when indexing categories. + */ + protected final FacetIndexingParams indexingParams; + + /** + * A list of fields which is filled at ancestors' construction and used + * during {@link CategoryDocumentBuilder#build(Document)}. + */ + protected final ArrayList fieldList = new ArrayList(); + + protected Map> categoriesMap; + + /** + * Creating a facets document builder with default facet indexing + * parameters.
+ * See: + * {@link #CategoryDocumentBuilder(TaxonomyWriter, FacetIndexingParams)} + * + * @param taxonomyWriter + * to which new categories will be added, as well as translating + * known categories to ordinals + * @throws IOException + * + */ + public CategoryDocumentBuilder(TaxonomyWriter taxonomyWriter) + throws IOException { + this(taxonomyWriter, new DefaultFacetIndexingParams()); + } + + /** + * Creating a facets document builder with a given facet indexing parameters + * object.
+ * + * @param taxonomyWriter + * to which new categories will be added, as well as translating + * known categories to ordinals + * @param params + * holds all parameters the indexing process should use such as + * category-list parameters + * @throws IOException + */ + public CategoryDocumentBuilder(TaxonomyWriter taxonomyWriter, + FacetIndexingParams params) throws IOException { + this.taxonomyWriter = taxonomyWriter; + this.indexingParams = params; + this.categoriesMap = new HashMap>(); + } + + /** + * Set the categories of the document builder from an {@link Iterable} of + * {@link CategoryPath} objects. + * + * @param categoryPaths + * An iterable of CategoryPath objects which holds the categories + * (facets) which will be added to the document at + * {@link #build(Document)} + * @return This CategoryDocumentBuilder, to enable this one line call: + * {@code new} {@link #CategoryDocumentBuilder(TaxonomyWriter)}. + * {@link #setCategoryPaths(Iterable)}.{@link #build(Document)}. + * @throws IOException + */ + public CategoryDocumentBuilder setCategoryPaths( + Iterable categoryPaths) throws IOException { + if (categoryPaths == null) { + fieldList.clear(); + return this; + } + return setCategories(new CategoryAttributesIterable(categoryPaths)); + } + + /** + * Set the categories of the document builder from an {@link Iterable} of + * {@link CategoryAttribute} objects. + * + * @param categories + * An iterable of {@link CategoryAttribute} objects which holds + * the categories (facets) which will be added to the document at + * {@link #build(Document)} + * @return This CategoryDocumentBuilder, to enable this one line call: + * {@code new} {@link #CategoryDocumentBuilder(TaxonomyWriter)}. + * {@link #setCategories(Iterable)}.{@link #build(Document)}. + * @throws IOException + */ + public CategoryDocumentBuilder setCategories( + Iterable categories) throws IOException { + fieldList.clear(); + if (categories == null) { + return this; + } + + // get field-name to a list of facets mapping as different facets could + // be added to different category-lists on different fields + fillCategoriesMap(categories); + + // creates a different stream for each different field + for (Entry> e : categoriesMap + .entrySet()) { + // create a category attributes stream for the array of facets + CategoryAttributesStream categoryAttributesStream = new CategoryAttributesStream( + e.getValue()); + + // Set a suitable {@link TokenStream} using + // CategoryParentsStream, followed by CategoryListTokenizer and + // CategoryTokenizer composition (the ordering of the last two is + // not mandatory). + CategoryParentsStream parentsStream = (CategoryParentsStream) getParentsStream(categoryAttributesStream); + CategoryListTokenizer categoryListTokenizer = getCategoryListTokenizer(parentsStream); + CategoryTokenizer stream = getCategoryTokenizer(categoryListTokenizer); + + // Finally creating a suitable field with stream and adding it to a + // master field-list, used during the build process (see + // super.build()) + fieldList.add(new Field(e.getKey(), stream)); + } + + return this; + } + + /** + * Get a stream of categories which includes the parents, according to + * policies defined in indexing parameters. + * + * @param categoryAttributesStream + * The input stream + * @return The parents stream. 
+ * @see OrdinalPolicy OrdinalPolicy (for policy of adding category tokens for parents) + * @see PathPolicy PathPolicy (for policy of adding category list tokens for parents) + */ + protected TokenStream getParentsStream( + CategoryAttributesStream categoryAttributesStream) { + return new CategoryParentsStream(categoryAttributesStream, + taxonomyWriter, indexingParams); + } + + /** + * Fills the categories mapping between a field name and a list of + * categories that belongs to it according to this builder's + * {@link FacetIndexingParams} object + * + * @param categories + * Iterable over the category attributes + */ + protected void fillCategoriesMap(Iterable categories) + throws IOException { + categoriesMap.clear(); + + // for-each category + for (CategoryAttribute category : categories) { + // extracting the field-name to which this category belongs + String fieldName = indexingParams.getCategoryListParams( + category.getCategoryPath()).getTerm().field(); + + // getting the list of categories which belongs to that field + List list = categoriesMap.get(fieldName); + + // if no such list exists + if (list == null) { + // adding a new one to the map + list = new ArrayList(); + categoriesMap.put(fieldName, list); + } + + // adding the new category to the list + list.add(category.clone()); + } + } + + /** + * Get a category list tokenizer (or a series of such tokenizers) to create + * the category list tokens. + * + * @param categoryStream + * A stream containing {@link CategoryAttribute} with the + * relevant data. + * @return The category list tokenizer (or series of tokenizers) to be used + * in creating category list tokens. + */ + protected CategoryListTokenizer getCategoryListTokenizer( + TokenStream categoryStream) { + return getCountingListTokenizer(categoryStream); + } + + /** + * Get a {@link CountingListTokenizer} for creating counting list token. + * + * @param categoryStream + * A stream containing {@link CategoryAttribute}s with the + * relevant data. + * @return A counting list tokenizer to be used in creating counting list + * token. + */ + protected CountingListTokenizer getCountingListTokenizer( + TokenStream categoryStream) { + return new CountingListTokenizer(categoryStream, indexingParams); + } + + /** + * Get a {@link CategoryTokenizer} to create the category tokens. + * This method can be overridden for adding more attributes to the category + * tokens. + * + * @param categoryStream + * A stream containing {@link CategoryAttribute} with the + * relevant data. + * @return The {@link CategoryTokenizer} to be used in creating category + * tokens. 
+ * @throws IOException + */ + protected CategoryTokenizer getCategoryTokenizer(TokenStream categoryStream) + throws IOException { + return new CategoryTokenizer(categoryStream, indexingParams); + } + + /** + * Adds the fields created in one of the "set" methods to the document + */ + public Document build(Document doc) { + for (Field f : fieldList) { + f.setOmitNorms(true); + doc.add(f); + } + return doc; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/CategoryListPayloadStream.java b/modules/facet/src/java/org/apache/lucene/facet/index/CategoryListPayloadStream.java new file mode 100644 index 00000000000..eb0026b41cc --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/CategoryListPayloadStream.java @@ -0,0 +1,65 @@ +package org.apache.lucene.facet.index; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; + +import org.apache.lucene.util.encoding.IntEncoder; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Accumulates category IDs for a single document, for writing in byte array + * form, for example, to a Lucene Payload. + * + * @lucene.experimental + */ +public class CategoryListPayloadStream { + + private ByteArrayOutputStream baos = new ByteArrayOutputStream(50); + private IntEncoder encoder; + + /** Creates a Payload stream using the specified encoder. */ + public CategoryListPayloadStream(IntEncoder encoder) { + this.encoder = encoder; + this.encoder.reInit(baos); + } + + /** Appends an integer to the stream. */ + public void appendIntToStream(int intValue) throws IOException { + encoder.encode(intValue); + } + + /** Returns the streamed bytes so far accumulated, as an array of bytes. */ + public byte[] convertStreamToByteArray() { + try { + encoder.close(); + return baos.toByteArray(); + } catch (IOException e) { + // This cannot happen, because of BAOS (no I/O). + return new byte[0]; + } + } + + /** Resets this stream to begin building a new payload. 
*/ + public void reset() throws IOException { + encoder.close(); + baos.reset(); + encoder.reInit(baos); + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/FacetsPayloadProcessorProvider.java b/modules/facet/src/java/org/apache/lucene/facet/index/FacetsPayloadProcessorProvider.java new file mode 100644 index 00000000000..881ebf0048f --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/FacetsPayloadProcessorProvider.java @@ -0,0 +1,188 @@ +package org.apache.lucene.facet.index; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.index.PayloadProcessorProvider; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; + +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter.OrdinalMap; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.encoding.IntDecoder; +import org.apache.lucene.util.encoding.IntEncoder; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A {@link PayloadProcessorProvider} for updating facets ordinal references, + * based on an ordinal map. You should use this code in conjunction with merging + * taxonomies - after you merge taxonomies, you receive an {@link OrdinalMap} + * which maps the 'old' payloads to the 'new' ones. You can use that map to + * re-map the payloads which contain the facets information (ordinals) either + * before or while merging the indexes. + *

+ * For re-mapping the ordinals before you merge the indexes, do the following: + * + *

+ * // merge the old taxonomy with the new one.
+ * OrdinalMap map = LuceneTaxonomyWriter.addTaxonomies();
+ * int[] ordmap = map.getMap();
+ * 
+ * // re-map the ordinals on the old directory.
+ * Directory oldDir;
+ * FacetsPayloadProcessorProvider fppp = new FacetsPayloadProcessorProvider(
+ *     oldDir, ordmap, new DefaultFacetIndexingParams());
+ * IndexWriterConfig conf = new IndexWriterConfig(VER, ANALYZER);
+ * conf.setMergePolicy(new ForceOptimizeMergePolicy());
+ * IndexWriter writer = new IndexWriter(oldDir, conf);
+ * writer.setPayloadProcessorProvider(fppp);
+ * writer.optimize();
+ * writer.close();
+ * 
+ * // merge that directory with the new index.
+ * IndexWriter newWriter; // opened on the 'new' Directory
+ * newWriter.addIndexes(oldDir);
+ * newWriter.commit();
+ * 
+ * + * For re-mapping the ordinals during index merge, do the following: + * + *
+ * // merge the old taxonomy with the new one.
+ * OrdinalMap map = LuceneTaxonomyWriter.addTaxonomies();
+ * int[] ordmap = map.getMap();
+ * 
+ * // construct the payload processor provider for the old directory
+ * FacetsPayloadProcessorProvider fppp = new FacetsPayloadProcessorProvider(
+ *     oldDir, ordmap, new DefaultFacetIndexingParams());
+ * 
+ * // Add the index and re-map ordinals on the go
+ * IndexReader r = IndexReader.open(oldDir);
+ * IndexWriterConfig conf = new IndexWriterConfig(VER, ANALYZER);
+ * IndexWriter writer = new IndexWriter(newDir, conf);
+ * writer.setPayloadProcessorProvider(fppp);
+ * writer.addIndexes(r);
+ * writer.commit();
+ * 
+ *

+ * NOTE: while the second example looks simpler, IndexWriter may trigger + * a long merge due to addIndexes. The first example avoids this perhaps + * unneeded merge, as well as can be done separately (e.g. on another node) + * before the index is merged. + * + * @lucene.experimental + */ +public class FacetsPayloadProcessorProvider extends PayloadProcessorProvider { + + private final Directory workDir; + + private final DirPayloadProcessor dirProcessor; + + /** + * Construct FacetsPayloadProcessorProvider with FacetIndexingParams + * + * @param dir the {@link Directory} containing the segments to update + * @param ordinalMap an array mapping previous facets ordinals to new ones + * @param indexingParams the facets indexing parameters + */ + public FacetsPayloadProcessorProvider(Directory dir, int[] ordinalMap, + FacetIndexingParams indexingParams) { + workDir = dir; + dirProcessor = new FacetsDirPayloadProcessor(indexingParams, ordinalMap); + } + + @Override + public DirPayloadProcessor getDirProcessor(Directory dir) throws IOException { + if (workDir != dir) { + return null; + } + return dirProcessor; + } + + public static class FacetsDirPayloadProcessor extends DirPayloadProcessor { + + private final Map termMap = new HashMap(1); + + private final int[] ordinalMap; + + /** + * Construct FacetsDirPayloadProcessor with custom FacetIndexingParams + * @param ordinalMap an array mapping previous facets ordinals to new ones + * @param indexingParams the facets indexing parameters + */ + protected FacetsDirPayloadProcessor(FacetIndexingParams indexingParams, int[] ordinalMap) { + this.ordinalMap = ordinalMap; + for (CategoryListParams params: indexingParams.getAllCategoryListParams()) { + termMap.put(params.getTerm(), params); + } + } + + @Override + public PayloadProcessor getProcessor(String field, BytesRef bytes) throws IOException { + // TODO (Facet): don't create terms + CategoryListParams params = termMap.get(new Term(field, bytes)); + if (params == null) { + return null; + } + return new FacetsPayloadProcessor(params, ordinalMap); + } + + } + + /** A PayloadProcessor for updating facets ordinal references, based on an ordinal map */ + public static class FacetsPayloadProcessor extends PayloadProcessor { + + private final IntEncoder encoder; + private final IntDecoder decoder; + private final int[] ordinalMap; + private final ByteArrayOutputStream os = new ByteArrayOutputStream(); + + /** + * @param params defines the encoding of facet ordinals as payload + * @param ordinalMap an array mapping previous facets ordinals to new ones + */ + protected FacetsPayloadProcessor(CategoryListParams params, int[] ordinalMap) { + encoder = params.createEncoder(); + decoder = encoder.createMatchingDecoder(); + this.ordinalMap = ordinalMap; + } + + @Override + public void processPayload(BytesRef payload) throws IOException { + InputStream is = new ByteArrayInputStream(payload.bytes, payload.offset, payload.length); + decoder.reInit(is); + os.reset(); + encoder.reInit(os); + long ordinal; + while ((ordinal = decoder.decode()) != IntDecoder.EOS) { + int newOrdinal = ordinalMap[(int)ordinal]; + encoder.encode(newOrdinal); + } + encoder.close(); + // TODO (Facet): avoid copy? 
+ byte out[] = os.toByteArray(); + payload.bytes = out; + payload.offset = 0; + payload.length = out.length; + } + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/attributes/CategoryAttribute.java b/modules/facet/src/java/org/apache/lucene/facet/index/attributes/CategoryAttribute.java new file mode 100644 index 00000000000..55ceef7afdc --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/attributes/CategoryAttribute.java @@ -0,0 +1,129 @@ +package org.apache.lucene.facet.index.attributes; + +import java.util.Collection; +import java.util.Set; + +import org.apache.lucene.util.Attribute; + +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An attribute which contains for a certain category the {@link CategoryPath} + * and additional properties. + * + * @lucene.experimental + */ +public interface CategoryAttribute extends Attribute { + + /** + * Set the content of this {@link CategoryAttribute} from another + * {@link CategoryAttribute} object. + * + * @param other + * The {@link CategoryAttribute} to take the content from. + */ + public void set(CategoryAttribute other); + + /** + * Sets the category path value of this attribute. + * + * @param cp + * A category path. May not be null. + */ + public void setCategoryPath(CategoryPath cp); + + /** + * Returns the value of this attribute: a category path. + * + * @return The category path last assigned to this attribute, or null if + * none has been assigned. + */ + public CategoryPath getCategoryPath(); + + /** + * Add a property. The property can be later retrieved using + * {@link #getProperty(Class)} with this property class .
+ * Adding multiple properties of the same class is forbidden. + * + * @param property + * The property to add. + * @throws UnsupportedOperationException + * When attempting to add a property of a class that was added + * before and merge is prohibited. + */ + public void addProperty(CategoryProperty property) + throws UnsupportedOperationException; + + /** + * Get a property of a certain property class. + * + * @param propertyClass + * The required property class. + * @return The property of the given class, or null if no such property + * exists. + */ + public CategoryProperty getProperty( + Class propertyClass); + + /** + * Get a property of one of given property classes. + * + * @param propertyClasses + * The property classes. + * @return A property matching one of the given classes, or null if no such + * property exists. + */ + public CategoryProperty getProperty( + Collection> propertyClasses); + + /** + * Get all the active property classes. + * + * @return A set containing the active property classes, or {@code null} if + * there are no properties. + */ + public Set> getPropertyClasses(); + + /** + * Clone this {@link CategoryAttribute}. + * + * @return A clone of this {@link CategoryAttribute}. + */ + public CategoryAttribute clone(); + + /** + * Resets this attribute to its initial value: a null category path and no + * properties. + */ + public void clear(); + + /** + * Clear all properties. + */ + public void clearProperties(); + + /** + * Remove an property of a certain property class. + * + * @param propertyClass + * The required property class. + */ + public void remove(Class propertyClass); +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/attributes/CategoryAttributeImpl.java b/modules/facet/src/java/org/apache/lucene/facet/index/attributes/CategoryAttributeImpl.java new file mode 100644 index 00000000000..1bfa4d95b62 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/attributes/CategoryAttributeImpl.java @@ -0,0 +1,192 @@ +package org.apache.lucene.facet.index.attributes; + +import java.util.Collection; +import java.util.HashMap; +import java.util.Set; + +import org.apache.lucene.util.AttributeImpl; + +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An implementation of {@link CategoryAttribute}. + * + * @lucene.experimental + */ +public final class CategoryAttributeImpl extends AttributeImpl implements + CategoryAttribute { + + /** + * The category path instance. + */ + protected CategoryPath categoryPath; + + /** + * A map of properties associated to the current category path. + */ + protected HashMap, CategoryProperty> properties; + + /** + * Construct an empty CategoryAttributeImpl. 
+ */ + public CategoryAttributeImpl() { + // do nothing + } + + /** + * Construct a CategoryAttributeImpl with the given CategoryPath. + * + * @param categoryPath + * The category path to use. + */ + public CategoryAttributeImpl(CategoryPath categoryPath) { + setCategoryPath(categoryPath); + } + + public void set(CategoryAttribute other) { + ((CategoryAttributeImpl) other).copyTo(this); + } + + /** + * Returns the category path value. + * + * @return The category path last assigned to this attribute, or null if + * none has been assigned. + */ + public CategoryPath getCategoryPath() { + return categoryPath; + } + + public void setCategoryPath(CategoryPath cp) { + categoryPath = cp; + } + + public void addProperty(CategoryProperty property) + throws UnsupportedOperationException { + if (properties == null) { + properties = new HashMap, CategoryProperty>(); + } + CategoryProperty existing = properties.get(property.getClass()); + if (existing == null) { + properties.put(property.getClass(), property); + } else { + existing.merge(property); + } + } + + public CategoryProperty getProperty( + Class propertyClass) { + if (properties == null) { + return null; + } + return properties.get(propertyClass); + } + + public CategoryProperty getProperty( + Collection> propertyClasses) { + if (properties == null) { + return null; + } + for (Class propertyClass : propertyClasses) { + CategoryProperty categoryProperty = properties.get(propertyClass); + if (categoryProperty != null) { + return categoryProperty; + } + } + return null; + } + + @Override + public void copyTo(AttributeImpl target) { + ((CategoryAttributeImpl) target).categoryPath = this.categoryPath; + ((CategoryAttributeImpl) target).properties = this.properties; + } + + @SuppressWarnings("unchecked") + @Override + public CategoryAttribute clone() { + CategoryAttributeImpl ca = (CategoryAttributeImpl) super.clone(); + if (categoryPath != null) { + ca.categoryPath = (CategoryPath) categoryPath.clone(); + } + if (properties != null && !properties.isEmpty()) { + ca.properties = (HashMap, CategoryProperty>) properties + .clone(); + } + return ca; + } + + @Override + public void clear() { + categoryPath = null; + clearProperties(); + } + + public void clearProperties() { + if (properties != null) { + properties.clear(); + } + } + + @Override + public boolean equals(Object o) { + if (o == this) { + return true; + } + if (!(o instanceof CategoryAttributeImpl)) { + return false; + } + CategoryAttributeImpl other = (CategoryAttributeImpl) o; + if (categoryPath == null) { + return (other.categoryPath == null); + } + if (!categoryPath.equals(other.categoryPath)) { + return false; + } + if (properties == null || properties.isEmpty()) { + return (other.properties == null || other.properties.isEmpty()); + } + return properties.equals(other.properties); + } + + @Override + public int hashCode() { + if (categoryPath == null) { + return 0; + } + int hashCode = categoryPath.hashCode(); + if (properties != null && !properties.isEmpty()) { + hashCode ^= properties.hashCode(); + } + return hashCode; + } + + public Set> getPropertyClasses() { + if (properties == null || properties.isEmpty()) { + return null; + } + return properties.keySet(); + } + + public void remove(Class propertyClass) { + properties.remove(propertyClass); + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/attributes/CategoryAttributesIterable.java b/modules/facet/src/java/org/apache/lucene/facet/index/attributes/CategoryAttributesIterable.java new file mode 100644 index 
00000000000..74760d75b9b --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/attributes/CategoryAttributesIterable.java @@ -0,0 +1,69 @@ +package org.apache.lucene.facet.index.attributes; + +import java.util.Iterator; + +import org.apache.lucene.facet.index.streaming.CategoryAttributesStream; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This class transforms an {@link Iterable} of {@link CategoryPath} objects + * into an {@link Iterable} of {@link CategoryAttribute} objects, which can be + * used to construct a {@link CategoryAttributesStream}. + * + * @lucene.experimental + */ +public class CategoryAttributesIterable implements Iterable { + + private Iterable inputIterable; + + public CategoryAttributesIterable(Iterable inputIterable) { + this.inputIterable = inputIterable; + } + + public Iterator iterator() { + return new CategoryAttributesIterator(this.inputIterable); + } + + private static class CategoryAttributesIterator implements Iterator { + + private Iterator internalIterator; + private CategoryAttributeImpl categoryAttributeImpl; + + public CategoryAttributesIterator(Iterable inputIterable) { + this.internalIterator = inputIterable.iterator(); + this.categoryAttributeImpl = new CategoryAttributeImpl(); + } + + public boolean hasNext() { + return this.internalIterator.hasNext(); + } + + public CategoryAttribute next() { + this.categoryAttributeImpl.setCategoryPath(this.internalIterator + .next()); + return this.categoryAttributeImpl; + } + + public void remove() { + this.internalIterator.remove(); + } + + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/attributes/CategoryProperty.java b/modules/facet/src/java/org/apache/lucene/facet/index/attributes/CategoryProperty.java new file mode 100644 index 00000000000..e9e55ba5df6 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/attributes/CategoryProperty.java @@ -0,0 +1,51 @@ +package org.apache.lucene.facet.index.attributes; + +import java.io.Serializable; + +import org.apache.lucene.facet.index.CategoryContainer; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Property that can be added to {@link CategoryAttribute}s during indexing. + * Note that properties are put in a map and could be shallow copied during + * {@link CategoryAttributeImpl#clone()}, therefore reuse of + * {@link CategoryProperty} objects is not recommended. Also extends + * {@link Serializable}, making the {@link CategoryContainer} serialization more + * elegant. + * + * @lucene.experimental + */ +public interface CategoryProperty extends Serializable { + + /** + * When adding categories with properties to a certain document, it is + * possible that the same category will be added more than once with + * different instances of the same property. This method defined how to + * treat such cases, by merging the newly added property into the one + * previously added. Implementing classes can assume that this method will + * be called only with a property of the same class. + * + * @param other + * The category property to merge. + * @throws UnsupportedOperationException + * If merging is prohibited for this property. + */ + public void merge(CategoryProperty other) + throws UnsupportedOperationException; +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/attributes/OrdinalProperty.java b/modules/facet/src/java/org/apache/lucene/facet/index/attributes/OrdinalProperty.java new file mode 100644 index 00000000000..2511ce9668c --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/attributes/OrdinalProperty.java @@ -0,0 +1,71 @@ +package org.apache.lucene.facet.index.attributes; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A {@link CategoryProperty} holding the ordinal from the taxonomy of the + * current category in {@link CategoryAttribute}. + *

+ * Ordinal properties are added internally during processing of category + * streams, and it is recommended not to use it externally. + * + * @lucene.experimental + */ +public class OrdinalProperty implements CategoryProperty { + + protected int ordinal = -1; + + public int getOrdinal() { + return ordinal; + } + + public boolean hasBeenSet() { + return this.ordinal >= 0; + } + + public void setOrdinal(int value) { + this.ordinal = value; + } + + public void clear() { + this.ordinal = -1; + } + + @Override + public boolean equals(Object other) { + if (other == this) { + return true; + } + if (!(other instanceof OrdinalProperty)) { + return false; + } + OrdinalProperty o = (OrdinalProperty) other; + return o.ordinal == this.ordinal; + } + + @Override + public int hashCode() { + return this.ordinal; + } + + public void merge(CategoryProperty other) { + throw new UnsupportedOperationException( + "Merging ordinal attributes is prohibited"); + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/attributes/package.html b/modules/facet/src/java/org/apache/lucene/facet/index/attributes/package.html new file mode 100644 index 00000000000..8964fafa652 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/attributes/package.html @@ -0,0 +1,13 @@ + + +Category attributes and their properties for indexing + + +

Category attributes and their properties for indexing

+ +Attributes for a {@link org.apache.lucene.facet.taxonomy.CategoryPath category}, +possibly containing +{@link org.apache.lucene.facet.index.attributes.CategoryProperty category property}'s. + + + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/DefaultOrdinalPolicy.java b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/DefaultOrdinalPolicy.java new file mode 100644 index 00000000000..95de238c841 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/DefaultOrdinalPolicy.java @@ -0,0 +1,43 @@ +package org.apache.lucene.facet.index.categorypolicy; + +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This class filters our the ROOT category ID. For more information see + * {@link OrdinalPolicy}. + * + * @lucene.experimental + */ +public class DefaultOrdinalPolicy implements OrdinalPolicy { + + /** + * Filters out (returns false) ordinals equal or less than + * {@link TaxonomyReader#ROOT_ORDINAL}. true otherwise. + */ + public boolean shouldAdd(int ordinal) { + return ordinal > TaxonomyReader.ROOT_ORDINAL; + } + + /** + * Implemented as NO-OP as the default is not taxonomy dependent + */ + public void init(TaxonomyWriter taxonomyWriter) { } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/DefaultPathPolicy.java b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/DefaultPathPolicy.java new file mode 100644 index 00000000000..2fb172dad57 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/DefaultPathPolicy.java @@ -0,0 +1,38 @@ +package org.apache.lucene.facet.index.categorypolicy; + +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * This class filters our the ROOT category path. For more information see + * {@link PathPolicy}. + * + * @lucene.experimental + */ +public class DefaultPathPolicy implements PathPolicy { + + /** + * Filters out (returns false) CategoryPaths equal or less than + * {@link TaxonomyReader#ROOT_ORDINAL}. true otherwise. + */ + public boolean shouldAdd(CategoryPath categoryPath) { + return categoryPath.length() > 0; + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/NonTopLevelOrdinalPolicy.java b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/NonTopLevelOrdinalPolicy.java new file mode 100644 index 00000000000..ee4c6fb89f5 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/NonTopLevelOrdinalPolicy.java @@ -0,0 +1,71 @@ +package org.apache.lucene.facet.index.categorypolicy; + +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Filter out any "top level" category ordinals.
{@link #shouldAdd(int)}. + * + * @lucene.experimental + */ +public class NonTopLevelOrdinalPolicy implements OrdinalPolicy { + + /** + * The taxonomyWriter with which the given ordinals' parent is determined. + */ + private TaxonomyWriter taxonomyWriter; + + /** + * Constructs a new non-top-level-ordinal-filter. With a given + * taxonomyWriter. + * + */ + public NonTopLevelOrdinalPolicy() { + this.taxonomyWriter = null; + } + + /** + * @param taxonomyWriter + * A relevant taxonomyWriter object, with which ordinals sent to + * {@link #shouldAdd(int)} are examined. + */ + public void init(TaxonomyWriter taxonomyWriter) { + this.taxonomyWriter = taxonomyWriter; + } + + /** + * Filters out ordinal which are ROOT or who's parent is ROOT. In order to + * determine if a parent is root, there's a need for + * {@link TaxonomyWriter#getParent(int)}. + */ + public boolean shouldAdd(int ordinal) { + if (ordinal > TaxonomyReader.ROOT_ORDINAL) { + try { + if (this.taxonomyWriter.getParent(ordinal) > TaxonomyReader.ROOT_ORDINAL) { + return true; + } + } catch (Exception e) { + return false; + } + } + return false; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/NonTopLevelPathPolicy.java b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/NonTopLevelPathPolicy.java new file mode 100644 index 00000000000..768c0b20cb5 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/NonTopLevelPathPolicy.java @@ -0,0 +1,43 @@ +package org.apache.lucene.facet.index.categorypolicy; + +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This class filters our the ROOT category, and it's direct descendants. For + * more information see {@link PathPolicy}. + * + * @lucene.experimental + */ +public class NonTopLevelPathPolicy implements PathPolicy { + + /** + * The shortest path length delivered is two components (root + one child). + */ + public final int DEFAULT_MINIMAL_SUBPATH_LENGTH = 2; + + /** + * Filters out (returns false) CategoryPaths equal or less than + * {@link TaxonomyReader#ROOT_ORDINAL}. true otherwise. 
+ */ + public boolean shouldAdd(CategoryPath categoryPath) { + return categoryPath.length() >= DEFAULT_MINIMAL_SUBPATH_LENGTH; + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/OrdinalPolicy.java b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/OrdinalPolicy.java new file mode 100644 index 00000000000..b300a28cfb2 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/OrdinalPolicy.java @@ -0,0 +1,56 @@ +package org.apache.lucene.facet.index.categorypolicy; + +import java.io.Serializable; + +import org.apache.lucene.facet.index.streaming.CategoryParentsStream; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Filtering category ordinals in {@link CategoryParentsStream}, where a given + * category ordinal is added to the stream, and than its parents are being added + * one after the other using {@link TaxonomyWriter#getParent(int)}.
+ * That loop should have a stop point - the default approach (excluding the + * ROOT) is implemented in {@link DefaultOrdinalPolicy}. + * + * @lucene.experimental + */ +public interface OrdinalPolicy extends Serializable { + + /** + * Check whether a given category ordinal should be added to the stream. + * + * @param ordinal + * A given category ordinal which is to be tested for stream + * addition. + * @return true if the category should be added. + * false otherwise. + */ + public abstract boolean shouldAdd(int ordinal); + + /** + * Initialize the policy with a TaxonomyWriter. This method can be + * implemented as noop if the ordinal policy is not taxonomy dependent + * + * @param taxonomyWriter + * A relevant taxonomyWriter object, with which ordinals sent to + * {@link #shouldAdd(int)} are examined. + */ + public abstract void init(TaxonomyWriter taxonomyWriter); +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/PathPolicy.java b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/PathPolicy.java new file mode 100644 index 00000000000..9f49f502b08 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/PathPolicy.java @@ -0,0 +1,47 @@ +package org.apache.lucene.facet.index.categorypolicy; + +import java.io.Serializable; + +import org.apache.lucene.facet.index.streaming.CategoryParentsStream; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Filtering category paths in {@link CategoryParentsStream}, where a given + * category is added to the stream, and than all its parents are being + * added one after the other by successively removing the last component.
+ * That loop should have a stop point - the default approach (excluding the + * ROOT) is implemented in {@link DefaultOrdinalPolicy}. + * + * @lucene.experimental + */ +public interface PathPolicy extends Serializable { + + /** + * Check whether a given category path should be added to the stream. + * + * @param categoryPath + * A given category path which is to be tested for stream + * addition. + * @return true if the category path should be added. + * false otherwise. + */ + public abstract boolean shouldAdd(CategoryPath categoryPath); + +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/package.html b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/package.html new file mode 100644 index 00000000000..b95117ef93a --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/categorypolicy/package.html @@ -0,0 +1,21 @@ + + +Policies for indexing categories + + +

Policies for indexing categories

+ +There are two kinds of policies (a brief custom-policy sketch follows the list): +
  • Path policies are based on the path of the category.
  • Ordinal policies are based on the ordinal of the category.
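A small sketch of a custom policy (hypothetical, for illustration only) that combines both kinds in one class. Per the DefaultFacetIndexingParams documentation later in this patch, such a policy would be wired in by overriding fixedPathPolicy()/fixedOrdinalPolicy() in a DefaultFacetIndexingParams subclass.

package example; // hypothetical sketch

import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy;
import org.apache.lucene.facet.index.categorypolicy.PathPolicy;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;

/** Indexes only paths with at least two components and never emits the ROOT ordinal. */
public class DepthAwarePolicy implements PathPolicy, OrdinalPolicy {

  public boolean shouldAdd(CategoryPath categoryPath) {
    return categoryPath.length() >= 2;              // path policy: affects drill-down terms
  }

  public boolean shouldAdd(int ordinal) {
    return ordinal > TaxonomyReader.ROOT_ORDINAL;   // ordinal policy: affects aggregation
  }

  public void init(TaxonomyWriter taxonomyWriter) {
    // no taxonomy state needed for this policy
  }
}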
+ +Policies are "consulted" during indexing, for deciding whether a category should +be added to the index or not. The two kinds of policies can be used for different purposes. +For example, path policies dictate which categories can participate in a drill-down operation, +while ordinal policies affect which categories can be accumulated (e.g. counted). + + + + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/package.html b/modules/facet/src/java/org/apache/lucene/facet/index/package.html new file mode 100644 index 00000000000..18c67078124 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/package.html @@ -0,0 +1,15 @@ + + +Indexing of document categories + + +

Indexing of document categories

+ +Attachment of +{@link org.apache.lucene.facet.taxonomy.CategoryPath CategoryPath}'s +or {@link org.apache.lucene.facet.index.attributes.CategoryAttribute CategoryAttribute}'s +to a given document using a +{@link org.apache.lucene.facet.taxonomy.TaxonomyWriter Taxonomy}. + + + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java b/modules/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java new file mode 100644 index 00000000000..4ec1a142f4d --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java @@ -0,0 +1,149 @@ +package org.apache.lucene.facet.index.params; + +import java.io.IOException; +import java.io.Serializable; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; + +import org.apache.lucene.facet.search.CategoryListIterator; +import org.apache.lucene.facet.search.PayloadIntDecodingIterator; +import org.apache.lucene.facet.search.TotalFacetCounts; +import org.apache.lucene.facet.util.PartitionsUtils; +import org.apache.lucene.util.encoding.DGapIntEncoder; +import org.apache.lucene.util.encoding.IntDecoder; +import org.apache.lucene.util.encoding.IntEncoder; +import org.apache.lucene.util.encoding.SortingIntEncoder; +import org.apache.lucene.util.encoding.UniqueValuesIntEncoder; +import org.apache.lucene.util.encoding.VInt8IntEncoder; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Contains parameters for a category list * + * + * @lucene.experimental + */ +public class CategoryListParams implements Serializable { + + /** The default term used to store the facets information. */ + public static final Term DEFAULT_TERM = new Term("$facets", "$fulltree$"); + + private final Term term; + + private final int hashCode; + + /** + * Constructs a default category list parameters object, using + * {@link #DEFAULT_TERM}. + */ + public CategoryListParams() { + this(DEFAULT_TERM); + } + + /** + * Constructs a category list parameters object, using the given {@link Term}. + * @param term who's payload hold the category-list. + */ + public CategoryListParams(Term term) { + this.term = term; + // Pre-compute the hashCode because these objects are immutable. Saves + // some time on the comparisons later. + this.hashCode = term.hashCode(); + } + + /** + * A {@link Term} who's payload holds the category-list. + */ + public final Term getTerm() { + return term; + } + + /** + * Allows to override how categories are encoded and decoded. A matching + * {@link IntDecoder} is provided by the {@link IntEncoder}. + *

+ * The default implementation creates a new Sorting(Unique(DGap)) encoder. + * Uniqueness in this regard means that when the same category appears twice in a + * document, only one appearance would be encoded. This affects facet + * counting results. + *

+ * Some possible considerations when overriding may be: + *

  • an application "knows" that all categories are unique, so there is no need to pass through the unique filter.
  • Another application might wish to count multiple occurrences of the same category, or use a faster encoding which will consume more space (a brief override sketch follows this list).
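As an illustration of the second consideration, a hypothetical CategoryListParams subclass could drop the unique filter so that repeated occurrences of a category within a document are kept (sketch only, not part of this patch; whether this is desirable depends on the aggregation the application performs).

package example; // hypothetical sketch

import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.util.encoding.DGapIntEncoder;
import org.apache.lucene.util.encoding.IntEncoder;
import org.apache.lucene.util.encoding.SortingIntEncoder;
import org.apache.lucene.util.encoding.VInt8IntEncoder;

/** Keeps repeated occurrences of a category within a document (no unique filter). */
public class CountingDuplicatesCategoryListParams extends CategoryListParams {
  @Override
  public IntEncoder createEncoder() {
    // Sorting(DGap(VInt8)) instead of the default Sorting(Unique(DGap(VInt8)))
    return new SortingIntEncoder(new DGapIntEncoder(new VInt8IntEncoder()));
  }
}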
+ * In any event when changing this value make sure you know what you are + * doing, and test the results - e.g. counts, if the application is about + * counting facets. + */ + public IntEncoder createEncoder() { + return new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder()))); + } + + /** + * Equality is defined by the 'term' that defines this category list. + * Sub-classes should override this method if a more complex calculation + * is needed to ensure equality. + */ + @Override + public boolean equals(Object o) { + if (o == this) { + return true; + } + if (!(o instanceof CategoryListParams)) { + return false; + } + CategoryListParams other = (CategoryListParams) o; + if (this.hashCode != other.hashCode) { + return false; + } + // The above hashcodes might equal each other in the case of a collision, + // so at this point only directly term equality testing will settle + // the equality test. + return this.term.equals(other.term); + } + + /** + * Hashcode is similar to {@link #equals(Object)}, in that it uses + * the term that defines this category list to derive the hashcode. + * Subclasses need to ensure that equality/hashcode is correctly defined, + * or there could be side-effects in the {@link TotalFacetCounts} caching + * mechanism (as the filename for a Total Facet Counts array cache + * is dependent on the hashCode, so it should consistently return the same + * hash for identity). + */ + @Override + public int hashCode() { + return this.hashCode; + } + + /** + * Create the category list iterator for the specified partition. + */ + public CategoryListIterator createCategoryListIterator(IndexReader reader, + int partition) throws IOException { + String categoryListTermStr = PartitionsUtils.partitionName(this, partition); + Term payloadTerm = new Term(term.field(), categoryListTermStr); + return new PayloadIntDecodingIterator(reader, payloadTerm, + createEncoder().createMatchingDecoder()); + } + +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/params/DefaultFacetIndexingParams.java b/modules/facet/src/java/org/apache/lucene/facet/index/params/DefaultFacetIndexingParams.java new file mode 100644 index 00000000000..557b9180e6b --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/params/DefaultFacetIndexingParams.java @@ -0,0 +1,196 @@ +package org.apache.lucene.facet.index.params; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.facet.index.categorypolicy.DefaultOrdinalPolicy; +import org.apache.lucene.facet.index.categorypolicy.DefaultPathPolicy; +import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy; +import org.apache.lucene.facet.index.categorypolicy.PathPolicy; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Default implementation for {@link FacetIndexingParams}. + *

+ * Getters for partition-size, {@link OrdinalPolicy} and + * {@link PathPolicy} are all final, and so the proper way to modify them when + * extending this class is through {@link #fixedPartitionSize()}, + * {@link #fixedOrdinalPolicy()} or {@link #fixedPathPolicy()} accordingly. + * + * @lucene.experimental + */ +public class DefaultFacetIndexingParams implements FacetIndexingParams { + + /** + * delimiter between a categories in a path, e.g. Products FACET_DELIM + * Consumer FACET_DELIM Tv. This should be a character not found in any path + * component + */ + public static final char DEFAULT_FACET_DELIM_CHAR = '\uF749'; + + private final CategoryListParams clpParams; + private final OrdinalPolicy ordinalPolicy; + private final PathPolicy pathPolicy; + private final int partitionSize; + + public DefaultFacetIndexingParams() { + this(new CategoryListParams()); + } + + public DefaultFacetIndexingParams(CategoryListParams categoryListParams) { + clpParams = categoryListParams; + ordinalPolicy = fixedOrdinalPolicy(); + pathPolicy = fixedPathPolicy(); + partitionSize = fixedPartitionSize(); + } + + public CategoryListParams getCategoryListParams(CategoryPath category) { + return clpParams; + } + + public int drillDownTermText(CategoryPath path, char[] buffer) { + return path.copyToCharArray(buffer, 0, -1, getFacetDelimChar()); + } + + /** + * "fixed" partition size. + * @see #getPartitionSize() + */ + protected int fixedPartitionSize() { + return Integer.MAX_VALUE; + } + + /** + * "fixed" ordinal policy. + * @see #getOrdinalPolicy() + */ + protected OrdinalPolicy fixedOrdinalPolicy() { + return new DefaultOrdinalPolicy(); + } + + /** + * "fixed" path policy. + * @see #getPathPolicy() + */ + protected PathPolicy fixedPathPolicy() { + return new DefaultPathPolicy(); + } + + public final int getPartitionSize() { + return partitionSize; + } + + /* + * (non-Javadoc) + * + * @see + * org.apache.lucene.facet.index.params.FacetIndexingParams#getAllCategoryListParams + * () + */ + public Iterable getAllCategoryListParams() { + List res = new ArrayList(); + res.add(clpParams); + return res; + } + + public final OrdinalPolicy getOrdinalPolicy() { + return ordinalPolicy; + } + + public final PathPolicy getPathPolicy() { + return pathPolicy; + } + + /* (non-Javadoc) + * @see java.lang.Object#hashCode() + */ + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + + ((clpParams == null) ? 0 : clpParams.hashCode()); + result = prime * result + + ((ordinalPolicy == null) ? 0 : ordinalPolicy.hashCode()); + result = prime * result + partitionSize; + result = prime * result + + ((pathPolicy == null) ? 
0 : pathPolicy.hashCode()); + + for (CategoryListParams clp: getAllCategoryListParams()) { + result ^= clp.hashCode(); + } + + return result; + } + + /* (non-Javadoc) + * @see java.lang.Object#equals(java.lang.Object) + */ + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (!(obj instanceof DefaultFacetIndexingParams)) { + return false; + } + DefaultFacetIndexingParams other = (DefaultFacetIndexingParams) obj; + if (clpParams == null) { + if (other.clpParams != null) { + return false; + } + } else if (!clpParams.equals(other.clpParams)) { + return false; + } + if (ordinalPolicy == null) { + if (other.ordinalPolicy != null) { + return false; + } + } else if (!ordinalPolicy.equals(other.ordinalPolicy)) { + return false; + } + if (partitionSize != other.partitionSize) { + return false; + } + if (pathPolicy == null) { + if (other.pathPolicy != null) { + return false; + } + } else if (!pathPolicy.equals(other.pathPolicy)) { + return false; + } + + Iterable cLs = getAllCategoryListParams(); + Iterable otherCLs = other.getAllCategoryListParams(); + + return cLs.equals(otherCLs); + } + + /** + * Use {@link #DEFAULT_FACET_DELIM_CHAR} as the delimiter. + */ + public char getFacetDelimChar() { + return DEFAULT_FACET_DELIM_CHAR; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/params/FacetIndexingParams.java b/modules/facet/src/java/org/apache/lucene/facet/index/params/FacetIndexingParams.java new file mode 100644 index 00000000000..8a91b145de7 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/params/FacetIndexingParams.java @@ -0,0 +1,98 @@ +package org.apache.lucene.facet.index.params; + +import java.io.Serializable; + +import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy; +import org.apache.lucene.facet.index.categorypolicy.PathPolicy; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Parameters on how facets are to be written to the index. + * For example, which fields and terms are used to refer to the indexed posting list. + *

+ * If non-default parameters were used during indexing, the same parameters + * must also be passed during faceted search. This requirement is analogous + * to the requirement during search to know which fields were indexed, and which + * Analyzer was used on the text. + * + * @lucene.experimental + */ +public interface FacetIndexingParams extends Serializable { + + /** + * The name of the category-list to put this category in, or null if this + * category should not be aggregatable. + *

+ * By default, all categories are written to the same category list, but + * applications which know in advance that in some situations only parts + * of the category hierarchy need to be counted can divide the categories + * into two or more different category lists. + *

+ * If null is returned for a category, it means that this category should + * not appear in any category list, and thus counts for it cannot be + * aggregated. This category can still be used for drill-down, even though + * the count for it is not known. + */ + public CategoryListParams getCategoryListParams(CategoryPath category); + + /** + * Return info about all category lists in the index. + * + * @see #getCategoryListParams(CategoryPath) + */ + public Iterable getAllCategoryListParams(); + + // TODO (Facet): Add special cases of exact/non-exact category term-text + + /** + * Return the drilldown Term-Text which does not need to do any allocations. + * The number of chars set is returned. + *
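A small sketch (illustrative only) of using drillDownTermText with a buffer sized via CategoryPath#charsNeededForFullPath(), as the note below suggests; the CategoryPath varargs constructor is assumed.

package example; // hypothetical sketch

import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.taxonomy.CategoryPath;

public class DrillDownTextSketch {
  /** Returns the drill-down term text for the given path. */
  public static String drillDownText(FacetIndexingParams params, CategoryPath path) {
    char[] buffer = new char[path.charsNeededForFullPath()]; // "make sure buffer is large enough"
    int len = params.drillDownTermText(path, buffer);
    return new String(buffer, 0, len); // components joined by params.getFacetDelimChar()
  }

  public static void main(String[] args) {
    System.out.println(drillDownText(new DefaultFacetIndexingParams(),
        new CategoryPath("Author", "Mark Twain")));
  }
}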

+ * Note: Make sure buffer is large enough. + * @see CategoryPath#charsNeededForFullPath() + */ + public int drillDownTermText(CategoryPath path, char[] buffer); + + /** + * Get the partition size. + * Same value should be used during the life time of an index. + * At search time this value is compared with actual taxonomy size and their minimum is used. + */ + public int getPartitionSize(); + + /** + * Get the policy for indexing category paths, + * used for deciding how "high" to climb in taxonomy + * from a category when ingesting its category paths. + */ + public PathPolicy getPathPolicy(); + + /** + * Get the policy for indexing category ordinals, + * used for deciding how "high" to climb in taxonomy + * from a category when ingesting its ordinals + */ + public OrdinalPolicy getOrdinalPolicy(); + + /** + * Get the delimiter character used internally for drill-down terms + */ + public char getFacetDelimChar(); +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/params/FacetParamsMissingPropertyException.java b/modules/facet/src/java/org/apache/lucene/facet/index/params/FacetParamsMissingPropertyException.java new file mode 100644 index 00000000000..adb8181e509 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/params/FacetParamsMissingPropertyException.java @@ -0,0 +1,32 @@ +package org.apache.lucene.facet.index.params; + +import org.apache.lucene.facet.FacetException; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Thrown when the facets params are missing a property. * + * + * @lucene.experimental + */ +public class FacetParamsMissingPropertyException extends FacetException { + + public FacetParamsMissingPropertyException(String key) { + super("Property with key \"" + key + "\" not found"); + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/params/PerDimensionIndexingParams.java b/modules/facet/src/java/org/apache/lucene/facet/index/params/PerDimensionIndexingParams.java new file mode 100644 index 00000000000..1df4c0e7540 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/params/PerDimensionIndexingParams.java @@ -0,0 +1,105 @@ +package org.apache.lucene.facet.index.params; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A FacetIndexingParams that utilizes different category lists, defined by the + * dimension specified CategoryPaths (see + * {@link PerDimensionIndexingParams#addCategoryListParams(CategoryPath, CategoryListParams)} + *

+ * A 'dimension' is defined as the first or "zero-th" component in a + * CategoryPath. For example, if a CategoryPath is defined as + * "/Author/American/Mark Twain", then the dimension is "Author". + *
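For example, a hypothetical setup that routes the "Author" dimension to its own category list might look like the following sketch (the term field and text values are illustrative, not prescribed; unmatched dimensions fall back to the default list as described in the surrounding documentation).

package example; // hypothetical sketch

import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.index.params.PerDimensionIndexingParams;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.index.Term;

public class PerDimensionSketch {
  public static PerDimensionIndexingParams create() {
    PerDimensionIndexingParams params = new PerDimensionIndexingParams();
    // Categories under the "Author" dimension go to their own posting list;
    // everything else uses the default CategoryListParams.
    params.addCategoryListParams(new CategoryPath("Author"),
        new CategoryListParams(new Term("$facets", "$author$")));
    return params;
  }

  public static void main(String[] args) {
    CategoryListParams clp =
        create().getCategoryListParams(new CategoryPath("Author", "Mark Twain"));
    System.out.println(clp.getTerm()); // the "$author$" list
  }
}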

+ * This class also uses the 'default' CategoryListParams (as specified by + * {@link CategoryListParams#CategoryListParams()} when + * {@link #getCategoryListParams(CategoryPath)} is called for a CategoryPath + * whose dimension component has not been specifically defined. + * + * @lucene.experimental + */ +public class PerDimensionIndexingParams extends DefaultFacetIndexingParams { + + // "Root" or "first component" of a Category Path maps to a + // CategoryListParams + private final Map clParamsMap = new HashMap(); + + /** + * Construct with the default {@link CategoryListParams} as the default + * CategoryListParams for unspecified CategoryPaths. + */ + public PerDimensionIndexingParams() { + this(new CategoryListParams()); + } + + /** + * Construct with the included categoryListParams as the default + * CategoryListParams for unspecified CategoryPaths. + * + * @param categoryListParams + * the default categoryListParams to use + */ + public PerDimensionIndexingParams(CategoryListParams categoryListParams) { + super(categoryListParams); + } + + /** + * Get all the categoryListParams, including the default. + */ + @Override + public Iterable getAllCategoryListParams() { + ArrayList vals = + new ArrayList(clParamsMap.values()); + for (CategoryListParams clp : super.getAllCategoryListParams()) { + vals.add(clp); + } + return vals; + } + + /** + * Get the CategoryListParams based on the dimension or "zero-th category" + * of the specified CategoryPath. + */ + @Override + public CategoryListParams getCategoryListParams(CategoryPath category) { + if (category != null) { + CategoryListParams clParams = clParamsMap.get(category.getComponent(0)); + if (clParams != null) { + return clParams; + } + } + return super.getCategoryListParams(category); + } + + /** + * Add a CategoryListParams for a given CategoryPath's dimension or + * "zero-th" category. + * + * @param category + * @param clParams + */ + public void addCategoryListParams(CategoryPath category, CategoryListParams clParams) { + clParamsMap.put(category.getComponent(0), clParams); + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/params/package.html b/modules/facet/src/java/org/apache/lucene/facet/index/params/package.html new file mode 100644 index 00000000000..ad71ae6e53e --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/params/package.html @@ -0,0 +1,12 @@ + + +Indexing-time specifications for handling facets + + +

Indexing-time specifications for handling facets

+ +Parameters on how facets are to be written to the index, +such as which fields and terms are used to refer to the facets posting list. + + + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CategoryAttributesStream.java b/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CategoryAttributesStream.java new file mode 100644 index 00000000000..a869219c378 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CategoryAttributesStream.java @@ -0,0 +1,81 @@ +package org.apache.lucene.facet.index.streaming; + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.lucene.analysis.TokenStream; + +import org.apache.lucene.facet.index.attributes.CategoryAttribute; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An attribute stream built from an {@link Iterable} of + * {@link CategoryAttribute}. This stream should then be passed through several + * filters (see {@link CategoryParentsStream}, {@link CategoryListTokenizer} and + * {@link CategoryTokenizer}) until a token stream is produced that can be + * indexed by Lucene. + *

+ * A CategoryAttributesStream object can be reused for producing more than one + * stream. To do that, the user should cause the underlying + * Iterable object to return a new set of categories, and + * then call {@link #reset()} to allow this stream to be used again. + * + * @lucene.experimental + */ +public class CategoryAttributesStream extends TokenStream { + + protected CategoryAttribute categoryAttribute; + + private Iterable iterable; + private Iterator iterator; + + /** + * Constructor + * + * @param iterable + * {@link Iterable} of {@link CategoryAttribute}, from which + * categories are taken. + */ + public CategoryAttributesStream(Iterable iterable) { + this.iterable = iterable; + this.iterator = null; + this.categoryAttribute = this.addAttribute(CategoryAttribute.class); + } + + @Override + public final boolean incrementToken() throws IOException { + if (iterator == null) { + if (iterable == null) { + return false; + } + iterator = iterable.iterator(); + } + if (iterator.hasNext()) { + categoryAttribute.set(iterator.next()); + return true; + } + return false; + } + + @Override + public void reset() { + this.iterator = null; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CategoryListTokenizer.java b/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CategoryListTokenizer.java new file mode 100644 index 00000000000..999f379ad94 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CategoryListTokenizer.java @@ -0,0 +1,67 @@ +package org.apache.lucene.facet.index.streaming; + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenStream; + +import org.apache.lucene.facet.index.params.FacetIndexingParams; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A base class for category list tokenizers, which add category list tokens to + * category streams. + * + * @lucene.experimental + */ +public abstract class CategoryListTokenizer extends CategoryTokenizerBase { + + /** + * @see CategoryTokenizerBase#CategoryTokenizerBase(TokenStream, FacetIndexingParams) + */ + public CategoryListTokenizer(TokenStream input, + FacetIndexingParams indexingParams) { + super(input, indexingParams); + } + + /** + * A method invoked once when the input stream begins, for subclass-specific + * processing. Subclass implementations must invoke this one, too! + */ + protected void handleStartOfInput() throws IOException { + // In this class, we do nothing. + } + + /** + * A method invoked once when the input stream ends, for subclass-specific + * processing. + */ + protected void handleEndOfInput() throws IOException { + // In this class, we do nothing. 
+ } + + @Override + public void reset() throws IOException { + super.reset(); + handleStartOfInput(); + } + + @Override + public abstract boolean incrementToken() throws IOException; + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CategoryParentsStream.java b/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CategoryParentsStream.java new file mode 100644 index 00000000000..b5a0cb34d84 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CategoryParentsStream.java @@ -0,0 +1,189 @@ +package org.apache.lucene.facet.index.streaming; + +import java.io.IOException; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.analysis.TokenFilter; + +import org.apache.lucene.facet.index.attributes.CategoryAttribute; +import org.apache.lucene.facet.index.attributes.CategoryProperty; +import org.apache.lucene.facet.index.attributes.OrdinalProperty; +import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy; +import org.apache.lucene.facet.index.categorypolicy.PathPolicy; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This class adds parents to a {@link CategoryAttributesStream}. The parents + * are added according to the {@link PathPolicy} and {@link OrdinalPolicy} from + * the {@link FacetIndexingParams} given in the constructor.
+ * By default, category properties are removed when creating parents of a + * certain category. However, it is possible to retain certain property types + * using {@link #addRetainableProperty(Class)}. + * + * @lucene.experimental + */ +public class CategoryParentsStream extends TokenFilter { + + /** + * A {@link TaxonomyWriter} for adding categories and retrieving their + * ordinals. + */ + protected TaxonomyWriter taxonomyWriter; + + /** An attribute containing all data related to the category */ + protected CategoryAttribute categoryAttribute; + + /** A category property containing the category ordinal */ + protected OrdinalProperty ordinalProperty; + + /** + * A set of property classes that are to be retained when creating a parent + * token. + */ + private Set> retainableProperties; + + /** A {@link PathPolicy} for the category's parents' category paths. */ + private PathPolicy pathPolicy; + + /** An {@link OrdinalPolicy} for the category's parents' ordinals. */ + private OrdinalPolicy ordinalPolicy; + + /** + * Constructor. + * + * @param input + * The input stream to handle, must be derived from + * {@link CategoryAttributesStream}. + * @param taxonomyWriter + * The taxonomy writer to use for adding categories and + * retrieving their ordinals. + * @param indexingParams + * The indexing params used for filtering parents. + */ + public CategoryParentsStream(CategoryAttributesStream input, + TaxonomyWriter taxonomyWriter, FacetIndexingParams indexingParams) { + super(input); + this.categoryAttribute = this.addAttribute(CategoryAttribute.class); + this.taxonomyWriter = taxonomyWriter; + this.pathPolicy = indexingParams.getPathPolicy(); + this.ordinalPolicy = indexingParams.getOrdinalPolicy(); + this.ordinalPolicy.init(taxonomyWriter); + this.ordinalProperty = new OrdinalProperty(); + + } + + @Override + public final boolean incrementToken() throws IOException { + if (this.categoryAttribute.getCategoryPath() != null) { + // try adding the parent of the current category to the stream + clearCategoryProperties(); + boolean added = false; + // set the parent's ordinal, if illegal set -1 + int ordinal = this.ordinalProperty.getOrdinal(); + if (ordinal != -1) { + ordinal = this.taxonomyWriter.getParent(ordinal); + if (this.ordinalPolicy.shouldAdd(ordinal)) { + this.ordinalProperty.setOrdinal(ordinal); + try { + this.categoryAttribute.addProperty(ordinalProperty); + } catch (UnsupportedOperationException e) { + throw new IOException(e.getLocalizedMessage()); + } + added = true; + } else { + this.ordinalProperty.setOrdinal(-1); + } + } + // set the parent's category path, if illegal set null + CategoryPath cp = this.categoryAttribute.getCategoryPath(); + if (cp != null) { + cp.trim(1); + // if ordinal added, must also have category paths + if (added || this.pathPolicy.shouldAdd(cp)) { + this.categoryAttribute.setCategoryPath(cp); + added = true; + } else { + this.categoryAttribute.clear(); + } + } + if (added) { + // a legal parent exists + return true; + } + } + // no more parents - get new category + if (input.incrementToken()) { + int ordinal = taxonomyWriter.addCategory(this.categoryAttribute.getCategoryPath()); + this.ordinalProperty.setOrdinal(ordinal); + try { + this.categoryAttribute.addProperty(this.ordinalProperty); + } catch (UnsupportedOperationException e) { + throw new IOException(e.getLocalizedMessage()); + } + return true; + } + return false; + } + + /** + * Clear the properties of the current {@link CategoryAttribute} attribute + * before setting the parent attributes.
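Continuing the pipeline sketch shown earlier (attributes, taxonomyWriter and params as defined there), retaining a hypothetical property class on the generated parent tokens would look like:

CategoryParentsStream parents =
    new CategoryParentsStream(attributes, taxonomyWriter, params);
// WeightProperty is the hypothetical application property from the earlier sketch;
// it survives on parent tokens instead of being cleared with the other properties.
parents.addRetainableProperty(WeightProperty.class);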
+ * It is possible to retain properties of certain types the parent tokens, + * using {@link #addRetainableProperty(Class)}. + */ + protected void clearCategoryProperties() { + if (this.retainableProperties == null + || this.retainableProperties.isEmpty()) { + this.categoryAttribute.clearProperties(); + } else { + List> propertyClassesToRemove = + new LinkedList>(); + for (Class propertyClass : this.categoryAttribute + .getPropertyClasses()) { + if (!this.retainableProperties.contains(propertyClass)) { + propertyClassesToRemove.add(propertyClass); + } + } + for (Class propertyClass : propertyClassesToRemove) { + this.categoryAttribute.remove(propertyClass); + } + } + } + + /** + * Add a {@link CategoryProperty} class which is retained when creating + * parent tokens. + * + * @param toRetain + * The property class to retain. + */ + public void addRetainableProperty(Class toRetain) { + if (this.retainableProperties == null) { + this.retainableProperties = new HashSet>(); + } + this.retainableProperties.add(toRetain); + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CategoryTokenizer.java b/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CategoryTokenizer.java new file mode 100644 index 00000000000..a2c4db2cea6 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CategoryTokenizer.java @@ -0,0 +1,67 @@ +package org.apache.lucene.facet.index.streaming; + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Basic class for setting the {@link CharTermAttribute}s and + * {@link PayloadAttribute}s of category tokens. 
+ * + * @lucene.experimental + */ +public class CategoryTokenizer extends CategoryTokenizerBase { + + /** + * @see CategoryTokenizerBase#CategoryTokenizerBase(TokenStream, + * FacetIndexingParams) + */ + public CategoryTokenizer(TokenStream input, + FacetIndexingParams indexingParams) { + super(input, indexingParams); + } + + @Override + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (categoryAttribute != null && categoryAttribute.getCategoryPath() != null) { + CategoryPath categoryPath = categoryAttribute.getCategoryPath(); + char[] termBuffer = termAttribute.resizeBuffer(categoryPath.charsNeededForFullPath()); + int nChars = indexingParams.drillDownTermText(categoryPath, termBuffer); + termAttribute.setLength(nChars); + setPayload(); + } + return true; + } + return false; + } + + /** + * Set the payload of the current category token. + */ + protected void setPayload() { + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CategoryTokenizerBase.java b/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CategoryTokenizerBase.java new file mode 100644 index 00000000000..81a242e4f1f --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CategoryTokenizerBase.java @@ -0,0 +1,78 @@ +package org.apache.lucene.facet.index.streaming; + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.index.Payload; + +import org.apache.lucene.facet.index.CategoryDocumentBuilder; +import org.apache.lucene.facet.index.attributes.CategoryAttribute; +import org.apache.lucene.facet.index.params.FacetIndexingParams; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A base class for all token filters which add term and payload attributes to + * tokens and are to be used in {@link CategoryDocumentBuilder}. Contains three + * attributes: {@link CategoryAttribute}, {@link CharTermAttribute} and + * {@link PayloadAttribute}. + * + * @lucene.experimental + */ +public abstract class CategoryTokenizerBase extends TokenFilter { + + /** The stream's category attributes. */ + protected CategoryAttribute categoryAttribute; + + /** The stream's payload attribute. */ + protected PayloadAttribute payloadAttribute; + + /** The stream's term attribute. */ + protected CharTermAttribute termAttribute; + + /** The object used for constructing payloads. */ + protected Payload payload = new Payload(); + + /** Indexing params for creating term text **/ + protected FacetIndexingParams indexingParams; + + /** + * Constructor. 
+ * + * @param input + * The input stream, either {@link CategoryParentsStream} or an + * extension of {@link CategoryTokenizerBase}. + * @param indexingParams + * The indexing params to use. + */ + public CategoryTokenizerBase(TokenStream input, + FacetIndexingParams indexingParams) { + super(input); + this.categoryAttribute = this.addAttribute(CategoryAttribute.class); + this.termAttribute = this.addAttribute(CharTermAttribute.class); + this.payloadAttribute = this.addAttribute(PayloadAttribute.class); + this.indexingParams = indexingParams; + } + + @Override + public abstract boolean incrementToken() throws IOException; + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CountingListTokenizer.java b/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CountingListTokenizer.java new file mode 100644 index 00000000000..84df4c27f1c --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/streaming/CountingListTokenizer.java @@ -0,0 +1,125 @@ +package org.apache.lucene.facet.index.streaming; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map.Entry; + +import org.apache.lucene.analysis.TokenStream; + +import org.apache.lucene.facet.index.CategoryListPayloadStream; +import org.apache.lucene.facet.index.attributes.OrdinalProperty; +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.util.PartitionsUtils; +import org.apache.lucene.util.encoding.IntEncoder; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * {@link CategoryListTokenizer} for facet counting + * + * @lucene.experimental + */ +public class CountingListTokenizer extends CategoryListTokenizer { + + /** A table for retrieving payload streams by category-list name. 
*/ + protected HashMap payloadStreamsByName = + new HashMap(); + + /** An iterator over the payload streams */ + protected Iterator> payloadStreamIterator; + + public CountingListTokenizer(TokenStream input, + FacetIndexingParams indexingParams) { + super(input, indexingParams); + this.payloadStreamsByName = new HashMap(); + } + + @Override + protected void handleStartOfInput() throws IOException { + payloadStreamsByName.clear(); + payloadStreamIterator = null; + } + + @Override + public final boolean incrementToken() throws IOException { + if (input.incrementToken()) { + if (this.categoryAttribute != null) { + OrdinalProperty ordinalProperty = (OrdinalProperty) this.categoryAttribute + .getProperty(OrdinalProperty.class); + if (ordinalProperty != null && legalCategory()) { + CategoryPath categoryPath = this.categoryAttribute + .getCategoryPath(); + int ordinal = ordinalProperty.getOrdinal(); + CategoryListPayloadStream payloadStream = getPayloadStream( + categoryPath, ordinal); + int partitionSize = indexingParams.getPartitionSize(); + payloadStream.appendIntToStream(ordinal % partitionSize); + } + } + return true; + } + if (this.payloadStreamIterator == null) { + this.handleEndOfInput(); + this.payloadStreamIterator = this.payloadStreamsByName.entrySet() + .iterator(); + } + if (this.payloadStreamIterator.hasNext()) { + Entry entry = this.payloadStreamIterator + .next(); + String countingListName = entry.getKey(); + int length = countingListName.length(); + this.termAttribute.resizeBuffer(length); + countingListName.getChars(0, length, termAttribute.buffer(), 0); + this.termAttribute.setLength(length); + CategoryListPayloadStream payloadStream = entry.getValue(); + payload.setData(payloadStream.convertStreamToByteArray()); + this.payloadAttribute.setPayload(payload); + return true; + } + return false; + } + + /** + * A method which allows extending classes to filter the categories going + * into the counting list. + * + * @return By default returns {@code true}, meaning the current category is + * to be part of the counting list. For categories that should be + * filtered, return {@code false}. + */ + protected boolean legalCategory() { + return true; + } + + protected CategoryListPayloadStream getPayloadStream( + CategoryPath categoryPath, int ordinal) throws IOException { + CategoryListParams clParams = this.indexingParams.getCategoryListParams(categoryPath); + String name = PartitionsUtils.partitionNameByOrdinal(indexingParams, clParams, ordinal); + CategoryListPayloadStream fps = payloadStreamsByName.get(name); + if (fps == null) { + IntEncoder encoder = clParams.createEncoder(); + fps = new CategoryListPayloadStream(encoder); + payloadStreamsByName.put(name, fps); + } + return fps; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/index/streaming/package.html b/modules/facet/src/java/org/apache/lucene/facet/index/streaming/package.html new file mode 100644 index 00000000000..8b0fb92e8ea --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/index/streaming/package.html @@ -0,0 +1,19 @@ + + +Expert: attributes streaming definition for indexing facets + + +
+Expert: attributes streaming definition for indexing facets
+<p>
+Streaming of facet attributes is a low-level interface for indexing facets with Lucene.
+There are two types of category-related streams:
+<ul>
+  <li>Category tokenizer stream handles tokenization for a single category,
+  e.g. for creating drill-down tokens.</li>
+  <li>Category list tokenizer stream handles tokenization for multiple categories,
+  e.g. for creating a counting list token, representing all the categories of
+  a certain document.</li>
+</ul>
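+<p>
+As a rough sketch of how these pieces fit together (the exact chaining used by
+CategoryDocumentBuilder may differ; the attributes stream, taxonomy writer and
+indexing params below are assumed to be created elsewhere), the streams in this
+package can be chained as follows:
+<pre>
+  CategoryAttributesStream categories = [a stream over the document's category attributes];
+  CategoryParentsStream parents =
+      new CategoryParentsStream(categories, taxonomyWriter, indexingParams);
+  CategoryListTokenizer countingList = new CountingListTokenizer(parents, indexingParams);
+  CategoryTokenizer tokenizer = new CategoryTokenizer(countingList, indexingParams);
+</pre>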
+ + + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/package.html b/modules/facet/src/java/org/apache/lucene/facet/package.html new file mode 100644 index 00000000000..494f27fe13e --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/package.html @@ -0,0 +1,8 @@ + + + Faceted Indexing and Search + + + Provides faceted indexing and search capabilities. + + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/AdaptiveFacetsAccumulator.java b/modules/facet/src/java/org/apache/lucene/facet/search/AdaptiveFacetsAccumulator.java new file mode 100644 index 00000000000..81c6c4e0f79 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/AdaptiveFacetsAccumulator.java @@ -0,0 +1,116 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.index.IndexReader; + +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.search.sampling.Sampler; +import org.apache.lucene.facet.search.sampling.SamplingAccumulator; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * {@link FacetsAccumulator} whose behavior regarding complements, sampling, + * etc. is not set up front but rather is determined at accumulation time + * according to the statistics of the accumulated set of documents and the + * index. + *
+ * Note: Sampling accumulation (Accumulation over a sampled-set of the results), + * does not guarantee accurate values for + * {@link FacetResult#getNumValidDescendants()} & + * {@link FacetResultNode#getResidue()}. + * + * @lucene.experimental + */ +public final class AdaptiveFacetsAccumulator extends StandardFacetsAccumulator { + + private Sampler sampler = new Sampler(); + + /** + * Create an {@link AdaptiveFacetsAccumulator} + * @see StandardFacetsAccumulator#StandardFacetsAccumulator(FacetSearchParams, IndexReader, TaxonomyReader) + */ + public AdaptiveFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader, + TaxonomyReader taxonomyReader) { + super(searchParams, indexReader, taxonomyReader); + } + + /** + * Create an {@link AdaptiveFacetsAccumulator} + * @see StandardFacetsAccumulator#StandardFacetsAccumulator(FacetSearchParams, IndexReader, TaxonomyReader, + * IntArrayAllocator, FloatArrayAllocator) + */ + public AdaptiveFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader, + TaxonomyReader taxonomyReader, IntArrayAllocator intArrayAllocator, + FloatArrayAllocator floatArrayAllocator) { + super(searchParams, indexReader, taxonomyReader, intArrayAllocator, floatArrayAllocator); + } + + /** + * Set the sampler. + * @param sampler sampler to set + */ + public void setSampler(Sampler sampler) { + this.sampler = sampler; + } + + @Override + public List accumulate(ScoredDocIDs docids) throws IOException { + FacetsAccumulator delegee = appropriateFacetCountingAccumulator(docids); + + if (delegee == this) { + return super.accumulate(docids); + } + + return delegee.accumulate(docids); + } + + /** + * Compute the appropriate facet accumulator to use. + * If no special/clever adaptation is possible/needed return this (self). + */ + private FacetsAccumulator appropriateFacetCountingAccumulator(ScoredDocIDs docids) { + // Verify that searchPareams permit sampling/complement/etc... otherwise do default + if (!mayComplement()) { + return this; + } + + // Now we're sure we can use the sampling methods as we're in a counting only mode + + // Verify that sampling is enabled and required ... otherwise do default + if (sampler == null || !sampler.shouldSample(docids)) { + return this; + } + + SamplingAccumulator samplingAccumulator = new SamplingAccumulator(sampler, searchParams, indexReader, taxonomyReader); + samplingAccumulator.setComplementThreshold(getComplementThreshold()); + return samplingAccumulator; + } + + /** + * @return the sampler in effect + */ + public final Sampler getSampler() { + return sampler; + } +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/CategoryListIterator.java b/modules/facet/src/java/org/apache/lucene/facet/search/CategoryListIterator.java new file mode 100644 index 00000000000..5a134c42a04 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/CategoryListIterator.java @@ -0,0 +1,69 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An interface for iterating over a "category list", i.e., the list of + * categories per document. + *
+ * NOTE:
+ * <ul>
+ * <li>This class operates as a key to a Map. Appropriate implementation of
+ * hashCode() and equals() must be provided.</li>
+ * <li>{@link #init()} must be called before you consume any categories, or call
+ * {@link #skipTo(int)}.</li>
+ * <li>{@link #skipTo(int)} must be called before any calls to
+ * {@link #nextCategory()}.</li>
+ * <li>{@link #nextCategory()} returns values < {@link Integer#MAX_VALUE}, so
+ * you can use it as a stop condition.</li>
+ * </ul>
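+ * <p>
+ * Following the notes above, a minimal consumption loop might look like this
+ * (cli is an implementation of this interface, and the docIds are
+ * assumed to arrive in increasing order):
+ * <pre>
+ * if (cli.init()) {
+ *   while ([there are more matching docIds]) {
+ *     int doc = [next docId];
+ *     if (cli.skipTo(doc)) {
+ *       long category;
+ *       while ((category = cli.nextCategory()) < Integer.MAX_VALUE) {
+ *         // handle category for doc
+ *       }
+ *     }
+ *   }
+ * }
+ * </pre>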
+ * + * @lucene.experimental + */ +public interface CategoryListIterator { + + /** + * Initializes the iterator. This method must be called before any calls to + * {@link #skipTo(int)}, and its return value indicates whether there are + * any relevant documents for this iterator. If it returns false, any call + * to {@link #skipTo(int)} will return false as well.
+ * NOTE: calling this method twice may result in skipping over + * documents for some implementations. Also, calling it again after all + * documents were consumed may yield unexpected behavior. + */ + public boolean init() throws IOException; + + /** + * Skips forward to document docId. Returns true iff this document exists + * and has any categories. This method must be called before calling + * {@link #nextCategory()} for a particular document.
+ * NOTE: Users should call this method with increasing docIds, and + * implementations can assume that this is the case. + */ + public boolean skipTo(int docId) throws IOException; + + /** + * Returns the next category for the current document that is set through + * {@link #skipTo(int)}, or a number higher than {@link Integer#MAX_VALUE}. + * No assumptions can be made on the order of the categories. + */ + public long nextCategory() throws IOException; + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/DrillDown.java b/modules/facet/src/java/org/apache/lucene/facet/search/DrillDown.java new file mode 100644 index 00000000000..823a7292feb --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/DrillDown.java @@ -0,0 +1,110 @@ +package org.apache.lucene.facet.search; + +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.BooleanClause.Occur; + +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Creation of drill down term or query. + * + * @lucene.experimental + */ +public final class DrillDown { + + /** + * @see #term(FacetIndexingParams, CategoryPath) + */ + public static final Term term(FacetSearchParams sParams, CategoryPath path) { + return term(sParams.getFacetIndexingParams(), path); + } + + /** + * Return a term for drilling down into a category. + */ + public static final Term term(FacetIndexingParams iParams, CategoryPath path) { + CategoryListParams clp = iParams.getCategoryListParams(path); + char[] buffer = new char[path.charsNeededForFullPath()]; + iParams.drillDownTermText(path, buffer); + return new Term(clp.getTerm().field(), String.valueOf(buffer)); + } + + /** + * Return a query for drilling down into all given categories (AND). + * @see #term(FacetSearchParams, CategoryPath) + * @see #query(FacetSearchParams, Query, CategoryPath...) + */ + public static final Query query(FacetIndexingParams iParams, CategoryPath... paths) { + if (paths==null || paths.length==0) { + throw new IllegalArgumentException("Empty category path not allowed for drill down query!"); + } + if (paths.length==1) { + return new TermQuery(term(iParams, paths[0])); + } + BooleanQuery res = new BooleanQuery(); + for (CategoryPath cp : paths) { + res.add(new TermQuery(term(iParams, cp)), Occur.MUST); + } + return res; + } + + /** + * Return a query for drilling down into all given categories (AND). 
+ * @see #term(FacetSearchParams, CategoryPath) + * @see #query(FacetSearchParams, Query, CategoryPath...) + */ + public static final Query query(FacetSearchParams sParams, CategoryPath... paths) { + return query(sParams.getFacetIndexingParams(), paths); + } + + /** + * Turn a base query into a drilling-down query for all given category paths (AND). + * @see #query(FacetIndexingParams, CategoryPath...) + */ + public static final Query query(FacetIndexingParams iParams, Query baseQuery, CategoryPath... paths) { + BooleanQuery res = new BooleanQuery(); + res.add(baseQuery, Occur.MUST); + res.add(query(iParams, paths), Occur.MUST); + return res; + } + + /** + * Turn a base query into a drilling-down query for all given category paths (AND). + * @see #query(FacetSearchParams, CategoryPath...) + */ + public static final Query query(FacetSearchParams sParams, Query baseQuery, CategoryPath... paths) { + return query(sParams.getFacetIndexingParams(), baseQuery, paths); + } + + /** + * Turn a base query into a drilling-down query using the default {@link FacetSearchParams} + * @see #query(FacetSearchParams, Query, CategoryPath...) + */ + public static final Query query(Query baseQuery, CategoryPath... paths) { + return query(new FacetSearchParams(), baseQuery, paths); + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/FacetArrays.java b/modules/facet/src/java/org/apache/lucene/facet/search/FacetArrays.java new file mode 100644 index 00000000000..cf954ade935 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/FacetArrays.java @@ -0,0 +1,91 @@ +package org.apache.lucene.facet.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Provider of arrays used for facet operations such as counting. + * + * @lucene.experimental + */ +public class FacetArrays { + + private int[] intArray; + private float[] floatArray; + private IntArrayAllocator intArrayAllocator; + private FloatArrayAllocator floatArrayAllocator; + private int arraysLength; + + /** + * Create a FacetArrays with certain array allocators. + * @param intArrayAllocator allocator for int arrays. + * @param floatArrayAllocator allocator for float arrays. + */ + public FacetArrays(IntArrayAllocator intArrayAllocator, + FloatArrayAllocator floatArrayAllocator) { + this.intArrayAllocator = intArrayAllocator; + this.floatArrayAllocator = floatArrayAllocator; + } + + /** + * Notify allocators that they can free arrays allocated + * on behalf of this FacetArrays object. + */ + public void free() { + if (intArrayAllocator!=null) { + intArrayAllocator.free(intArray); + // Should give up handle to the array now + // that it is freed. 
+ intArray = null; + } + if (floatArrayAllocator!=null) { + floatArrayAllocator.free(floatArray); + // Should give up handle to the array now + // that it is freed. + floatArray = null; + } + arraysLength = 0; + } + + /** + * Obtain an int array, e.g. for facet counting. + */ + public int[] getIntArray() { + if (intArray == null) { + intArray = intArrayAllocator.allocate(); + arraysLength = intArray.length; + } + return intArray; + } + + /** Obtain a float array, e.g. for evaluating facet association values. */ + public float[] getFloatArray() { + if (floatArray == null) { + floatArray = floatArrayAllocator.allocate(); + arraysLength = floatArray.length; + } + return floatArray; + } + + /** + * Return the arrays length + */ + public int getArraysLength() { + return arraysLength; + } + +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/FacetResultsHandler.java b/modules/facet/src/java/org/apache/lucene/facet/search/FacetResultsHandler.java new file mode 100644 index 00000000000..10ea4847009 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/FacetResultsHandler.java @@ -0,0 +1,161 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; + +import org.apache.lucene.facet.search.params.FacetRequest; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.search.results.IntermediateFacetResult; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Handler for facet results. + *
+ * The facet results handler provided by the {@link FacetRequest} to
+ * a {@link FacetsAccumulator}.
+ * <p>
+ * First it is used by {@link FacetsAccumulator} to obtain a temporary
+ * facet result for each partition and to merge results of several partitions.
+ * <p>
+ * Later the accumulator invokes the handler to render the results, creating
+ * {@link FacetResult} objects.
+ * <p>
+ * Last the accumulator invokes the handler to label final results. + * + * @lucene.experimental + */ +public abstract class FacetResultsHandler { + + /** Taxonomy for which facets are handled */ + protected final TaxonomyReader taxonomyReader; + + /** + * Facet request served by this handler. + */ + protected final FacetRequest facetRequest; + + /** + * Create a faceted search handler. + * @param taxonomyReader See {@link #getTaxonomyReader()}. + * @param facetRequest See {@link #getFacetRequest()}. + */ + public FacetResultsHandler(TaxonomyReader taxonomyReader, + FacetRequest facetRequest) { + this.taxonomyReader = taxonomyReader; + this.facetRequest = facetRequest; + } + + /** + * Fetch results of a single partition, given facet arrays for that partition, + * and based on the matching documents and faceted search parameters. + * + * @param arrays + * facet arrays for the certain partition + * @param offset + * offset in input arrays where partition starts + * @return temporary facet result, potentially, to be passed back to + * this result handler for merging, or null in case that + * constructor parameter, facetRequest, requests an + * illegal FacetResult, like, e.g., a root node category path that + * does not exist in constructor parameter taxonomyReader + * . + * @throws IOException + * on error + */ + public abstract IntermediateFacetResult fetchPartitionResult(FacetArrays arrays, int offset) throws IOException; + + /** + * Merge results of several facet partitions. Logic of the merge is undefined + * and open for interpretations. For example, a merge implementation could + * keep top K results. Passed {@link IntermediateFacetResult} must be ones + * that were created by this handler otherwise a {@link ClassCastException} is + * thrown. In addition, all passed {@link IntermediateFacetResult} must have + * the same {@link FacetRequest} otherwise an {@link IllegalArgumentException} + * is thrown. + * + * @param tmpResults one or more temporary results created by this + * handler. + * @return temporary facet result that represents to union, as specified by + * this handler, of the input temporary facet results. + * @throws IOException on error. + * @throws ClassCastException if the temporary result passed was not created + * by this handler + * @throws IllegalArgumentException if passed facetResults do not + * have the same {@link FacetRequest} + * @see IntermediateFacetResult#getFacetRequest() + */ + public abstract IntermediateFacetResult mergeResults(IntermediateFacetResult... tmpResults) + throws IOException, ClassCastException, IllegalArgumentException; + + /** + * Create a facet result from the temporary result. + * @param tmpResult temporary result to be rendered as a {@link FacetResult} + * @throws IOException on error. + */ + public abstract FacetResult renderFacetResult(IntermediateFacetResult tmpResult) throws IOException ; + + /** + * Perform any rearrangement as required on a facet result that has changed after + * it was rendered. + *
+ * Possible use case: a sampling facets accumulator invoked another + * other facets accumulator on a sample set of documents, obtained + * rendered facet results, fixed their counts, and now it is needed + * to sort the results differently according to the fixed counts. + * @param facetResult result to be rearranged. + * @see FacetResultNode#setValue(double) + */ + public abstract FacetResult rearrangeFacetResult(FacetResult facetResult); + + /** + * Label results according to settings in {@link FacetRequest}, + * such as {@link FacetRequest#getNumLabel()}. + * Usually invoked by {@link FacetsAccumulator#accumulate(ScoredDocIDs)} + * @param facetResult facet result to be labeled. + * @throws IOException on error + */ + public abstract void labelResult (FacetResult facetResult) throws IOException; + + /** Return taxonomy reader used for current facets accumulation operation. */ + public final TaxonomyReader getTaxonomyReader() { + return this.taxonomyReader; + } + + /** Return the facet request served by this handler. */ + public final FacetRequest getFacetRequest() { + return this.facetRequest; + } + + /** + * Check if an array contains the partition which contains ordinal + * + * @param ordinal + * checked facet + * @param facetArrays + * facet arrays for the certain partition + * @param offset + * offset in input arrays where partition starts + */ + protected boolean isSelfPartition (int ordinal, FacetArrays facetArrays, int offset) { + int partitionSize = facetArrays.getArraysLength(); + return ordinal / partitionSize == offset / partitionSize; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/FacetsAccumulator.java b/modules/facet/src/java/org/apache/lucene/facet/search/FacetsAccumulator.java new file mode 100644 index 00000000000..b707de6e187 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/FacetsAccumulator.java @@ -0,0 +1,153 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.index.IndexReader; + +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.params.FacetRequest; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Driver for Accumulating facets of faceted search requests over given + * documents. + * + * @lucene.experimental + */ +public abstract class FacetsAccumulator { + + /** + * Default threshold for using the complements optimization. + * If accumulating facets for a document set larger than this ratio of the index size than + * perform the complement optimization. 
+ * @see #setComplementThreshold(double) for more info on the complements optimization. + */ + public static final double DEFAULT_COMPLEMENT_THRESHOLD = 0.6; + + /** + * Passing this to {@link #setComplementThreshold(double)} will disable using complement optimization. + */ + public static final double DISABLE_COMPLEMENT = Double.POSITIVE_INFINITY; // > 1 actually + + /** + * Passing this to {@link #setComplementThreshold(double)} will force using complement optimization. + */ + public static final double FORCE_COMPLEMENT = 0; // <=0 + + private double complementThreshold = DEFAULT_COMPLEMENT_THRESHOLD; + + protected final TaxonomyReader taxonomyReader; + protected final IndexReader indexReader; + protected FacetSearchParams searchParams; + + private boolean allowLabeling = true; + + public FacetsAccumulator(FacetSearchParams searchParams, + IndexReader indexReader, + TaxonomyReader taxonomyReader) { + this.indexReader = indexReader; + this.taxonomyReader = taxonomyReader; + this.searchParams = searchParams; + } + + /** + * Accumulate facets over given documents, according to facet requests in effect. + * @param docids documents (and their scores) for which facets are Accumulated. + * @return Accumulated facets. + * @throws IOException on error. + */ + // internal API note: it was considered to move the docids into the constructor as well, + // but this prevents nice extension capabilities, especially in the way that + // Sampling Accumulator works with the (any) delegated accumulator. + public abstract List accumulate(ScoredDocIDs docids) throws IOException; + + /** + * @return the complement threshold + * @see #setComplementThreshold(double) + */ + public double getComplementThreshold() { + return complementThreshold; + } + + /** + * Set the complement threshold. + * This threshold will dictate whether the complements optimization is applied. + * The optimization is to count for less documents. It is useful when the same + * FacetSearchParams are used for varying sets of documents. The first time + * complements is used the "total counts" are computed - counting for all the + * documents in the collection. Then, only the complementing set of documents + * is considered, and used to decrement from the overall counts, thereby + * walking through less documents, which is faster. + *
+ * Note that this optimization is only available when searching an index
+ * whose {@link IndexReader} implements both
+ * {@link IndexReader#directory()} and {@link IndexReader#getVersion()},
+ * otherwise the optimization is silently disabled regardless of
+ * the complement threshold settings.
+ * <p>
+ * For the default settings see {@link #DEFAULT_COMPLEMENT_THRESHOLD}.
+ * <p>
+ * To force complements in all cases pass {@link #FORCE_COMPLEMENT}.
+ * This is mostly useful for testing purposes, as forcing complements when only
+ * a tiny fraction of the available documents match the query does not make sense and
+ * would incur performance degradation.
+ * <p>
+ * To disable complements pass {@link #DISABLE_COMPLEMENT}. + * @param complementThreshold the complement threshold to set + */ + public void setComplementThreshold(double complementThreshold) { + this.complementThreshold = complementThreshold; + } + + /** + * Check if labeling is allowed for this accumulator. + *
+ * By default labeling is allowed. + * This allows one accumulator to invoke other accumulators for accumulation + * but keep to itself the responsibility of labeling. + * This might br handy since labeling is a costly operation. + * @return true of labeling is allowed for this accumulator + * @see #setAllowLabeling(boolean) + */ + protected boolean isAllowLabeling() { + return allowLabeling; + } + + /** + * Set whether labeling is allowed for this accumulator. + * @param allowLabeling new setting for allow labeling + * @see #isAllowLabeling() + */ + protected void setAllowLabeling(boolean allowLabeling) { + this.allowLabeling = allowLabeling; + } + + /** check if all requests are complementable */ + protected boolean mayComplement() { + for (FacetRequest freq:searchParams.getFacetRequests()) { + if (!freq.supportsComplements()) { + return false; + } + } + return true; + } +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/FacetsCollector.java b/modules/facet/src/java/org/apache/lucene/facet/search/FacetsCollector.java new file mode 100644 index 00000000000..af3a57a62a7 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/FacetsCollector.java @@ -0,0 +1,137 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.Scorer; + +import org.apache.lucene.facet.search.params.FacetRequest; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Collector for facet accumulation. * + * + * @lucene.experimental + */ +public class FacetsCollector extends Collector { + + protected final FacetsAccumulator facetsAccumulator; + private ScoredDocIdCollector scoreDocIdCollector; + private List results; + private Object resultsGuard; + + /** + * Create a collector for accumulating facets while collecting documents + * during search. + * + * @param facetSearchParams + * faceted search parameters defining which facets are required and + * how. + * @param indexReader + * searched index. + * @param taxonomyReader + * taxonomy containing the facets. 
+ */ + public FacetsCollector(FacetSearchParams facetSearchParams, + IndexReader indexReader, TaxonomyReader taxonomyReader) { + facetsAccumulator = initFacetsAccumulator(facetSearchParams, indexReader, taxonomyReader); + scoreDocIdCollector = initScoredDocCollector(facetSearchParams, indexReader, taxonomyReader); + resultsGuard = new Object(); + } + + /** + * Create a {@link ScoredDocIdCollector} to be used as the first phase of + * the facet collection. If all facetRequests are do not require the + * document score, a ScoredDocIdCollector which does not store the document + * scores would be returned. Otherwise a SDIC which does store the documents + * will be returned, having an initial allocated space for 1000 such + * documents' scores. + */ + protected ScoredDocIdCollector initScoredDocCollector( + FacetSearchParams facetSearchParams, IndexReader indexReader, + TaxonomyReader taxonomyReader) { + for (FacetRequest frq : facetSearchParams.getFacetRequests()) { + if (frq.requireDocumentScore()) { + return ScoredDocIdCollector.create(1000, true); + } + } + return ScoredDocIdCollector.create(indexReader.maxDoc(), false); + } + + /** + * Create the {@link FacetsAccumulator} to be used. Default is + * {@link StandardFacetsAccumulator}. Called once at the constructor of the collector. + * + * @param facetSearchParams + * The search params. + * @param indexReader + * A reader to the index to search in. + * @param taxonomyReader + * A reader to the active taxonomy. + * @return The {@link FacetsAccumulator} to use. + */ + protected FacetsAccumulator initFacetsAccumulator(FacetSearchParams facetSearchParams, + IndexReader indexReader, + TaxonomyReader taxonomyReader) { + return new StandardFacetsAccumulator(facetSearchParams, indexReader, taxonomyReader); + } + + /** + * Return accumulated facets results (according to faceted search parameters) + * for collected documents. + * @throws IOException on error + */ + public List getFacetResults() throws IOException { + synchronized (resultsGuard) { // over protection + if (results == null) { + // lazy creation but just once + results = facetsAccumulator.accumulate(scoreDocIdCollector.getScoredDocIDs()); + scoreDocIdCollector = null; + } + return results; + } + } + + @Override + public boolean acceptsDocsOutOfOrder() { + return false; + } + + @Override + public void collect(int doc) throws IOException { + scoreDocIdCollector.collect(doc); + } + + @Override + public void setNextReader(AtomicReaderContext context) throws IOException { + scoreDocIdCollector.setNextReader(context); + } + + @Override + public void setScorer(Scorer scorer) throws IOException { + scoreDocIdCollector.setScorer(scorer); + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/FloatArrayAllocator.java b/modules/facet/src/java/org/apache/lucene/facet/search/FloatArrayAllocator.java new file mode 100644 index 00000000000..78abe88cbfb --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/FloatArrayAllocator.java @@ -0,0 +1,68 @@ +package org.apache.lucene.facet.search; + +import java.util.Arrays; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An FloatArrayAllocator is an object which manages float array objects + * of a certain size. These float arrays are needed temporarily during + * faceted search (see {@link FacetsAccumulator} and can be reused across searches + * instead of being allocated afresh on every search. + *
+ * A FloatArrayAllocator is thread-safe.
+ *
+ * @lucene.experimental
+ */
+public final class FloatArrayAllocator extends TemporaryObjectAllocator<float[]> {
+
+  // A FloatArrayAllocator deals with float arrays of a fixed size, size.
+  private int size;
+
+  /**
+ * Construct an allocator for float arrays of size <code>size</code>,
+ * keeping around a pool of up to <code>maxArrays</code> old arrays.
+ * <p>
+ * Note that the pool size only restricts the number of arrays that hang
+ * around when not needed, but not the maximum number of arrays
+ * that are allocated when actually in use: If a number of concurrent
+ * threads ask for an allocation, all of them will get an array,
+ * even if their number is greater than maxArrays. If an application wants
+ * to limit the number of concurrent threads making allocations, it needs
+ * to do so on its own - for example by blocking new threads until the
+ * existing ones have finished.
+ * <p>
+ * In particular, when maxArrays=0, this object behaves as a trivial + * allocator, always allocating a new array and never reusing an old one. + */ + public FloatArrayAllocator(int size, int maxArrays) { + super(maxArrays); + this.size = size; + } + + @Override + public float[] create() { + return new float[size]; + } + + @Override + public void clear(float[] array) { + Arrays.fill(array, 0); + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/Heap.java b/modules/facet/src/java/org/apache/lucene/facet/search/Heap.java new file mode 100644 index 00000000000..8dc5ccef449 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/Heap.java @@ -0,0 +1,56 @@ +package org.apache.lucene.facet.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Declares an interface for heap (and heap alike) structures, + * handling a given type T + * + * @lucene.experimental + */ +public interface Heap { + /** + * Get and remove the top of the Heap
+ * NOTE: Once {@link #pop()} is called no other {@link #add(Object)} or
+ * {@link #insertWithOverflow(Object)} should be called.
+ */
+  public T pop();
+
+  /** Get (but not remove) the top of the Heap */
+  public T top();
+
+  /**
+ * Insert a new value, returning the overflown object
+ * NOTE: This method should not be called after invoking {@link #pop()} + */ + public T insertWithOverflow(T value); + + /** + * Add a new value to the heap, return the new top().
+ * Some implementations may choose to not implement this functionality. + * In such a case null should be returned.
+ * NOTE: This method should not be called after invoking {@link #pop()} + */ + public T add(T frn); + + /** Clear the heap */ + public void clear(); + + /** Return the amount of objects currently in the heap */ + public int size(); +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/IntArrayAllocator.java b/modules/facet/src/java/org/apache/lucene/facet/search/IntArrayAllocator.java new file mode 100644 index 00000000000..6b03d1cfd02 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/IntArrayAllocator.java @@ -0,0 +1,68 @@ +package org.apache.lucene.facet.search; + +import java.util.Arrays; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An IntArrayAllocator is an object which manages counter array objects + * of a certain length. These counter arrays are needed temporarily during + * faceted search (see {@link FacetsAccumulator} and can be reused across searches + * instead of being allocated afresh on every search. + *
+ * An IntArrayAllocator is thread-safe.
+ *
+ * @lucene.experimental
+ */
+public final class IntArrayAllocator extends TemporaryObjectAllocator<int[]> {
+
+  // An IntArrayAllocator deals with integer arrays of a fixed length.
+  private int length;
+
+  /**
+ * Construct an allocator for counter arrays of length <code>length</code>,
+ * keeping around a pool of up to <code>maxArrays</code> old arrays.
+ * <p>
+ * Note that the pool size only restricts the number of arrays that hang
+ * around when not needed, but not the maximum number of arrays
+ * that are allocated when actually in use: If a number of concurrent
+ * threads ask for an allocation, all of them will get a counter array,
+ * even if their number is greater than maxArrays. If an application wants
+ * to limit the number of concurrent threads making allocations, it needs
+ * to do so on its own - for example by blocking new threads until the
+ * existing ones have finished.
+ * <p>
+ * In particular, when maxArrays=0, this object behaves as a trivial + * allocator, always allocating a new array and never reusing an old one. + */ + public IntArrayAllocator(int length, int maxArrays) { + super(maxArrays); + this.length = length; + } + + @Override + public int[] create() { + return new int[length]; + } + + @Override + public void clear(int[] array) { + Arrays.fill(array, 0); + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/PayloadIntDecodingIterator.java b/modules/facet/src/java/org/apache/lucene/facet/search/PayloadIntDecodingIterator.java new file mode 100644 index 00000000000..87b6e1f2c62 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/PayloadIntDecodingIterator.java @@ -0,0 +1,117 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; + +import org.apache.lucene.util.UnsafeByteArrayInputStream; +import org.apache.lucene.util.encoding.IntDecoder; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A payload deserializer comes with its own working space (buffer). One need to + * define the {@link IndexReader} and {@link Term} in which the payload resides. + * The iterator then consumes the payload information of each document and + * decodes it into categories. A typical use case of this class is: + * + *
+ * <pre>
+ * IndexReader reader = [open your reader];
+ * Term t = new Term("field", "where-payload-exists");
+ * IntDecoder decoder = [an IntDecoder matching the encoder used at indexing time];
+ * CategoryListIterator cli = new PayloadIntDecodingIterator(reader, t, decoder);
+ * if (!cli.init()) {
+ *   // it means there are no payloads / documents associated with that term.
+ *   // Usually a sanity check. However, init() must be called.
+ * }
+ * DocIdSetIterator disi = [you usually iterate on something else, such as a Scorer];
+ * int doc;
+ * while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
+ *   cli.skipTo(doc);
+ *   long category;
+ *   while ((category = cli.nextCategory()) < Integer.MAX_VALUE) {
+ *   }
+ * }
+ * </pre>
+ * + * @lucene.experimental + */ +public class PayloadIntDecodingIterator implements CategoryListIterator { + + private final UnsafeByteArrayInputStream ubais; + private final IntDecoder decoder; + + private final IndexReader indexReader; + private final Term term; + private final PayloadIterator pi; + private final int hashCode; + + public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder) + throws IOException { + this(indexReader, term, decoder, new byte[1024]); + } + + public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder, + byte[] buffer) throws IOException { + pi = new PayloadIterator(indexReader, term, buffer); + ubais = new UnsafeByteArrayInputStream(); + this.decoder = decoder; + hashCode = indexReader.hashCode() ^ term.hashCode(); + this.term = term; + this.indexReader = indexReader; + } + + @Override + public boolean equals(Object other) { + if (!(other instanceof PayloadIntDecodingIterator)) { + return false; + } + PayloadIntDecodingIterator that = (PayloadIntDecodingIterator) other; + if (hashCode != that.hashCode) { + return false; + } + + // Hash codes are the same, check equals() to avoid cases of hash-collisions. + return indexReader.equals(that.indexReader) && term.equals(that.term); + } + + @Override + public int hashCode() { + return hashCode; + } + + public boolean init() throws IOException { + return pi.init(); + } + + public long nextCategory() throws IOException { + return decoder.decode(); + } + + public boolean skipTo(int docId) throws IOException { + if (!pi.setdoc(docId)) { + return false; + } + + // Initializing the decoding mechanism with the new payload data + ubais.reInit(pi.getBuffer(), 0, pi.getPayloadLength()); + decoder.reInit(ubais); + return true; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java b/modules/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java new file mode 100644 index 00000000000..c9aaf3c332d --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java @@ -0,0 +1,138 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; + +import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A utility class for iterating through a posting list of a given term and + * retrieving the payload of the first occurrence in every document. Comes with + * its own working space (buffer). 
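+ * <p>
+ * A minimal usage sketch (the term is assumed to be one whose postings were
+ * written with payloads, e.g. a category list term):
+ * <pre>
+ * PayloadIterator pi = new PayloadIterator(reader, term);
+ * if (pi.init()) {
+ *   if (pi.setdoc(docId)) {
+ *     byte[] data = pi.getBuffer();         // reused working buffer
+ *     int numBytes = pi.getPayloadLength(); // number of valid bytes in the buffer
+ *     // decode the first numBytes bytes of data
+ *   }
+ * }
+ * </pre>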
+ * + * @lucene.experimental + */ +public class PayloadIterator { + + protected byte[] buffer; + protected int payloadLength; + + DocsAndPositionsEnum tp; + + private boolean hasMore; + + public PayloadIterator(IndexReader indexReader, Term term) + throws IOException { + this(indexReader, term, new byte[1024]); + } + + public PayloadIterator(IndexReader indexReader, Term term, byte[] buffer) + throws IOException { + this.buffer = buffer; + // TODO (Facet): avoid Multi*? + Bits deletedDocs = MultiFields.getDeletedDocs(indexReader); + this.tp = MultiFields.getTermPositionsEnum(indexReader, deletedDocs, term.field(), term.bytes()); + } + + /** + * (re)initialize the iterator. Should be done before the first call to + * {@link #setdoc(int)}. Returns false if there is no category list found + * (no setdoc() will never return true). + */ + public boolean init() throws IOException { + hasMore = tp != null && tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS; + return hasMore; + } + + /** + * Skip forward to document docId. Return true if this document exists and + * has any payload. + *
+ * <p>
+ * Users should call this method with increasing docIds, and implementations + * can assume that this is the case. + */ + public boolean setdoc(int docId) throws IOException { + if (!hasMore) { + return false; + } + + if (tp.docID() > docId) { + return false; + } + + // making sure we have the requested document + if (tp.docID() < docId) { + // Skipping to requested document + if (tp.advance(docId) == DocIdSetIterator.NO_MORE_DOCS) { + this.hasMore = false; + return false; + } + + // If document not found (skipped to much) + if (tp.docID() != docId) { + return false; + } + } + + // Prepare for payload extraction + tp.nextPosition(); + + // TODO: fix bug in SepCodec and then remove this check (the null check should be enough) + if (!tp.hasPayload()) { + return false; + } + + BytesRef br = tp.getPayload(); + + if (br == null || br.length == 0) { + return false; + } + + this.payloadLength = br.length; + + if (this.payloadLength > this.buffer.length) { + // Growing if necessary. + this.buffer = new byte[this.payloadLength * 2 + 1]; + } + // Loading the payload + System.arraycopy(br.bytes, br.offset, this.buffer, 0, payloadLength); + return true; + } + + /** + * Get the buffer with the content of the last read payload. + */ + public byte[] getBuffer() { + return buffer; + } + + /** + * Get the length of the last read payload. + */ + public int getPayloadLength() { + return payloadLength; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/SamplingWrapper.java b/modules/facet/src/java/org/apache/lucene/facet/search/SamplingWrapper.java new file mode 100644 index 00000000000..61a09b4d607 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/SamplingWrapper.java @@ -0,0 +1,118 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.search.sampling.Sampler; +import org.apache.lucene.facet.search.sampling.Sampler.SampleResult; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Wrap any Facets Accumulator with sampling. + *
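A hedged usage sketch for the wrapper defined here: any FacetsAccumulator can be decorated so that accumulation runs over a sampled subset of the matching documents, with the results fixed and trimmed afterwards. The sampler, readers, search parameters and doc ID set below are placeholders, not prescribed by the patch.

    // Sketch (code fragment): decorate a standard accumulator with sampling;
    // 'sampler' is any concrete org.apache.lucene.facet.search.sampling.Sampler.
    FacetsAccumulator base =
        new StandardFacetsAccumulator(searchParams, indexReader, taxonomyReader);
    FacetsAccumulator sampled = new SamplingWrapper(base, sampler);
    List<FacetResult> results = sampled.accumulate(scoredDocIds);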
+ * <p>
+ * Note: Sampling accumulation (Accumulation over a sampled-set of the results), + * does not guarantee accurate values for + * {@link FacetResult#getNumValidDescendants()} & + * {@link FacetResultNode#getResidue()}. + * + * @lucene.experimental + */ +public class SamplingWrapper extends FacetsAccumulator { + + private FacetsAccumulator delegee; + private Sampler sampler; + + public SamplingWrapper(FacetsAccumulator delegee, Sampler sampler) { + super(delegee.searchParams, delegee.indexReader, delegee.taxonomyReader); + this.delegee = delegee; + this.sampler = sampler; + } + + @Override + public List accumulate(ScoredDocIDs docids) throws IOException { + // first let delegee accumulate without labeling at all (though + // currently it doesn't matter because we have to label all returned anyhow) + boolean origAllowLabeling = isAllowLabeling(); + setAllowLabeling(false); + + // Replacing the original searchParams with the over-sampled (and without statistics-compute) + FacetSearchParams original = delegee.searchParams; + delegee.searchParams = sampler.overSampledSearchParams(original); + + SampleResult sampleSet = sampler.getSampleSet(docids); + + List sampleRes = delegee.accumulate(sampleSet.docids); + setAllowLabeling(origAllowLabeling); + + List fixedRes = new ArrayList(); + for (FacetResult fres : sampleRes) { + // for sure fres is not null because this is guaranteed by the delegee. + FacetResultsHandler frh = fres.getFacetRequest().createFacetResultsHandler(taxonomyReader); + // fix the result of current request + sampler.getSampleFixer(indexReader, taxonomyReader, searchParams) + .fixResult(docids, fres); + fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any + + // Using the sampler to trim the extra (over-sampled) results + fres = sampler.trimResult(fres); + + // final labeling if allowed (because labeling is a costly operation) + if (isAllowLabeling()) { + frh.labelResult(fres); + } + fixedRes.add(fres); // add to final results + } + + delegee.searchParams = original; // Back to original params + + return fixedRes; + } + + /** + * @see FacetsAccumulator#getComplementThreshold() + */ + @Override + public double getComplementThreshold() { + return delegee.getComplementThreshold(); + } + + /** + * @param complementThreshold + * @see FacetsAccumulator#setComplementThreshold(double) + */ + @Override + public void setComplementThreshold(double complementThreshold) { + delegee.setComplementThreshold(complementThreshold); + } + + @Override + protected boolean isAllowLabeling() { + return delegee.isAllowLabeling(); + } + + @Override + protected void setAllowLabeling(boolean allowLabeling) { + delegee.setAllowLabeling(allowLabeling); + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/ScoredDocIDs.java b/modules/facet/src/java/org/apache/lucene/facet/search/ScoredDocIDs.java new file mode 100644 index 00000000000..e183f9ecae2 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/ScoredDocIDs.java @@ -0,0 +1,42 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; + +import org.apache.lucene.search.DocIdSet; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Document IDs with scores for each, driving facets accumulation. Document + * scores are optionally used in the process of facets scoring. + * + * @see FacetsAccumulator#accumulate(ScoredDocIDs) + * @lucene.experimental + */ +public interface ScoredDocIDs { + + /** Returns an iterator over the document IDs and their scores. */ + public ScoredDocIDsIterator iterator() throws IOException; + + /** Returns the set of doc IDs. */ + public DocIdSet getDocIDs(); + + /** Returns the number of scored documents. */ + public int size(); + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/ScoredDocIDsIterator.java b/modules/facet/src/java/org/apache/lucene/facet/search/ScoredDocIDsIterator.java new file mode 100644 index 00000000000..70f239810ce --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/ScoredDocIDsIterator.java @@ -0,0 +1,43 @@ +package org.apache.lucene.facet.search; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Iterator over document IDs and their scores. Each {@link #next()} retrieves + * the next docID and its score which can be later be retrieved by + * {@link #getDocID()} and {@link #getScore()}. NOTE: you must call + * {@link #next()} before {@link #getDocID()} and/or {@link #getScore()}, or + * otherwise the returned values are unexpected. + * + * @lucene.experimental + */ +public interface ScoredDocIDsIterator { + + /** Default score used in case scoring is disabled. */ + public static final float DEFAULT_SCORE = 1.0f; + + /** Iterate to the next document/score pair. Returns true iff there is such a pair. */ + public abstract boolean next(); + + /** Returns the ID of the current document. */ + public abstract int getDocID(); + + /** Returns the score of the current document. 
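For context, a minimal consumption loop for the iterator contract declared here; scoredDocIds stands for any ScoredDocIDs instance, and next() must be called before reading the current document or score.

    // Sketch (code fragment): walk all collected documents and their scores.
    ScoredDocIDsIterator it = scoredDocIds.iterator();
    while (it.next()) {
      int doc = it.getDocID();
      float score = it.getScore();   // DEFAULT_SCORE (1.0f) when scoring was disabled
      // ... consume doc and score ...
    }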
*/ + public abstract float getScore(); + +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/ScoredDocIdCollector.java b/modules/facet/src/java/org/apache/lucene/facet/search/ScoredDocIdCollector.java new file mode 100644 index 00000000000..43fdbf14926 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/ScoredDocIdCollector.java @@ -0,0 +1,224 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.OpenBitSet; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A {@link Collector} which stores all docIDs and their scores in a + * {@link ScoredDocIDs} instance. If scoring is not enabled, then the default + * score as set in {@link #setDefaultScore(float)} (or + * {@link ScoredDocIDsIterator#DEFAULT_SCORE}) will be set for all documents. + * + * @lucene.experimental + */ +public abstract class ScoredDocIdCollector extends Collector { + + private static final class NonScoringDocIdCollector extends ScoredDocIdCollector { + + float defaultScore = ScoredDocIDsIterator.DEFAULT_SCORE; + + @SuppressWarnings("synthetic-access") + public NonScoringDocIdCollector(int maxDoc) { + super(maxDoc); + } + + @Override + public boolean acceptsDocsOutOfOrder() { return true; } + + @Override + public void collect(int doc) throws IOException { + docIds.fastSet(docBase + doc); + ++numDocIds; + } + + @Override + public float getDefaultScore() { + return defaultScore; + } + + @Override + public ScoredDocIDsIterator scoredDocIdsIterator() throws IOException { + return new ScoredDocIDsIterator() { + + private DocIdSetIterator docIdsIter = docIds.iterator(); + private int nextDoc; + + public int getDocID() { return nextDoc; } + public float getScore() { return defaultScore; } + + public boolean next() { + try { + nextDoc = docIdsIter.nextDoc(); + return nextDoc != DocIdSetIterator.NO_MORE_DOCS; + } catch (IOException e) { + // This should not happen as we're iterating over an OpenBitSet. 
For + // completeness, terminate iteration + nextDoc = DocIdSetIterator.NO_MORE_DOCS; + return false; + } + } + + }; + } + + @Override + public void setDefaultScore(float defaultScore) { + this.defaultScore = defaultScore; + } + + @Override + public void setScorer(Scorer scorer) throws IOException {} + } + + private static final class ScoringDocIdCollector extends ScoredDocIdCollector { + + float[] scores; + private Scorer scorer; + + @SuppressWarnings("synthetic-access") + public ScoringDocIdCollector(int maxDoc) { + super(maxDoc); + scores = new float[maxDoc]; + } + + @Override + public boolean acceptsDocsOutOfOrder() { return false; } + + @Override + public void collect(int doc) throws IOException { + docIds.fastSet(docBase + doc); + + float score = this.scorer.score(); + if (numDocIds >= scores.length) { + float[] newScores = new float[ArrayUtil.oversize(numDocIds + 1, 4)]; + System.arraycopy(scores, 0, newScores, 0, numDocIds); + scores = newScores; + } + scores[numDocIds] = score; + ++numDocIds; + } + + @Override + public ScoredDocIDsIterator scoredDocIdsIterator() throws IOException { + return new ScoredDocIDsIterator() { + + private DocIdSetIterator docIdsIter = docIds.iterator(); + private int nextDoc; + private int scoresIdx = -1; + + public int getDocID() { return nextDoc; } + public float getScore() { return scores[scoresIdx]; } + + public boolean next() { + try { + nextDoc = docIdsIter.nextDoc(); + if (nextDoc == DocIdSetIterator.NO_MORE_DOCS) { + return false; + } + ++scoresIdx; + return true; + } catch (IOException e) { + // This should not happen as we're iterating over an OpenBitSet. For + // completeness, terminate iteration + nextDoc = DocIdSetIterator.NO_MORE_DOCS; + return false; + } + } + + }; + } + + @Override + public float getDefaultScore() { return ScoredDocIDsIterator.DEFAULT_SCORE; } + + @Override + public void setDefaultScore(float defaultScore) {} + + @Override + public void setScorer(Scorer scorer) throws IOException { + this.scorer = scorer; + } + } + + protected int numDocIds; + protected int docBase; + protected final OpenBitSet docIds; + + /** + * Creates a new {@link ScoredDocIdCollector} with the given parameters. + * + * @param maxDoc the number of documents that are expected to be collected. + * Note that if more documents are collected, unexpected exceptions may + * be thrown. Usually you should pass {@link IndexReader#maxDoc()} of + * the same IndexReader with which the search is executed. + * @param enableScoring if scoring is enabled, a score will be computed for + * every matching document, which might be expensive. Therefore if you + * do not require scoring, it is better to set it to false. + */ + public static ScoredDocIdCollector create(int maxDoc, boolean enableScoring) { + return enableScoring ? new ScoringDocIdCollector(maxDoc) + : new NonScoringDocIdCollector(maxDoc); + } + + private ScoredDocIdCollector(int maxDoc) { + numDocIds = 0; + docIds = new OpenBitSet(maxDoc); + } + + /** Returns the default score used when scoring is disabled. */ + public abstract float getDefaultScore(); + + /** Set the default score. Only applicable if scoring is disabled. 
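A hedged end-to-end sketch of how this collector is intended to be used: collect the matching documents (optionally with scores), then hand the resulting ScoredDocIDs to a FacetsAccumulator. The searcher, query and accumulator variables are placeholders for the caller's own objects.

    // Sketch (code fragment): gather matching doc IDs plus scores, then accumulate facets.
    ScoredDocIdCollector collector =
        ScoredDocIdCollector.create(indexReader.maxDoc(), true /* enableScoring */);
    searcher.search(query, collector);
    ScoredDocIDs docIds = collector.getScoredDocIDs();
    List<FacetResult> results = accumulator.accumulate(docIds);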
*/ + public abstract void setDefaultScore(float defaultScore); + + public abstract ScoredDocIDsIterator scoredDocIdsIterator() throws IOException; + + public ScoredDocIDs getScoredDocIDs() { + return new ScoredDocIDs() { + + public ScoredDocIDsIterator iterator() throws IOException { + return scoredDocIdsIterator(); + } + + public DocIdSet getDocIDs() { + return docIds; + } + + public int size() { + return numDocIds; + } + + }; + } + + @Override + public void setNextReader(AtomicReaderContext context) throws IOException { + this.docBase = context.docBase; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java b/modules/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java new file mode 100644 index 00000000000..98728e00961 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java @@ -0,0 +1,338 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map.Entry; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.lucene.index.IndexReader; + +import org.apache.lucene.facet.search.aggregator.Aggregator; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.params.FacetRequest; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.IntermediateFacetResult; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.util.PartitionsUtils; +import org.apache.lucene.facet.util.ScoredDocIdsUtils; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Standard implementation for {@link FacetsAccumulator}, utilizing partitions to save on memory. + *
+ * <p>
+ * Why partitions? Because if there are, say, 100M categories out of which
+ * only the top K are required, we must first compute values for all 100M categories
+ * (going over all documents) and only then can we select the top K.
+ * This is made easier on memory by working in partitions of distinct categories:
+ * once the values for a partition are found, we take the top K for that
+ * partition and work on the next partition, then merge the top K of both,
+ * and so forth, thereby computing the top K with RAM needs of the size of
+ * a single partition rather than the size of all 100M categories.
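As a concrete illustration (the numbers here are made up): with 100M categories and a partition size of 1M, each pass over the matching documents needs a count array of only 1M ints (about 4 MB) instead of 100M ints (about 400 MB); the per-partition top K are then merged across the 100 passes to obtain the overall top K.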
+ * <p>
+ * The decision on partition size is made at indexing time, and the facet information
+ * for each partition is maintained separately.
+ * <p>
+ * Implementation detail: Since facets information of each partition is + * maintained in a separate "category list", we can be more efficient + * at search time, because only the facet info for a single partition + * need to be read while processing that partition. + * + * @lucene.experimental + */ +public class StandardFacetsAccumulator extends FacetsAccumulator { + + private static final Logger logger = Logger.getLogger(StandardFacetsAccumulator.class.getName()); + + protected final IntArrayAllocator intArrayAllocator; + protected final FloatArrayAllocator floatArrayAllocator; + + protected int partitionSize; + protected int maxPartitions; + protected boolean isUsingComplements; + + private TotalFacetCounts totalFacetCounts; + + private Object accumulateGuard; + + public StandardFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader, + TaxonomyReader taxonomyReader, IntArrayAllocator intArrayAllocator, + FloatArrayAllocator floatArrayAllocator) { + + super(searchParams,indexReader,taxonomyReader); + int realPartitionSize = intArrayAllocator == null || floatArrayAllocator == null + ? PartitionsUtils.partitionSize(searchParams, taxonomyReader) : -1; // -1 if not needed. + this.intArrayAllocator = intArrayAllocator != null + ? intArrayAllocator + // create a default one if null was provided + : new IntArrayAllocator(realPartitionSize, 1); + this.floatArrayAllocator = floatArrayAllocator != null + ? floatArrayAllocator + // create a default one if null provided + : new FloatArrayAllocator(realPartitionSize, 1); + // can only be computed later when docids size is known + isUsingComplements = false; + partitionSize = PartitionsUtils.partitionSize(searchParams, taxonomyReader); + maxPartitions = (int) Math.ceil(this.taxonomyReader.getSize() / (double) partitionSize); + accumulateGuard = new Object(); + } + + public StandardFacetsAccumulator(FacetSearchParams searchParams, IndexReader indexReader, + TaxonomyReader taxonomyReader) { + + this(searchParams, indexReader, taxonomyReader, null, null); + } + + @Override + public List accumulate(ScoredDocIDs docids) throws IOException { + + // synchronize to prevent calling two accumulate()'s at the same time. + // We decided not to synchronize the method because that might mislead + // users to feel encouraged to call this method simultaneously. + synchronized (accumulateGuard) { + + // only now we can compute this + isUsingComplements = shouldComplement(docids); + + if (isUsingComplements) { + try { + totalFacetCounts = TotalFacetCountsCache.getSingleton() + .getTotalCounts(indexReader, taxonomyReader, + searchParams.getFacetIndexingParams(), searchParams.getClCache()); + if (totalFacetCounts != null) { + docids = ScoredDocIdsUtils.getComplementSet(docids, indexReader); + } else { + isUsingComplements = false; + } + } catch (UnsupportedOperationException e) { + // TODO (Facet): this exception is thrown from TotalCountsKey if the + // IndexReader used does not support getVersion(). We should re-think + // this: is this tiny detail worth disabling total counts completely + // for such readers? Currently, it's not supported by Parallel and + // MultiReader, which might be problematic for several applications. + // We could, for example, base our "isCurrent" logic on something else + // than the reader's version. Need to think more deeply about it. 
+ if (logger.isLoggable(Level.FINEST)) { + logger.log(Level.FINEST, "IndexReader used does not support completents: ", e); + } + isUsingComplements = false; + } catch (IOException e) { + if (logger.isLoggable(Level.FINEST)) { + logger.log(Level.FINEST, "Failed to load/calculate total counts (complement counting disabled): ", e); + } + // silently fail if for some reason failed to load/save from/to dir + isUsingComplements = false; + } catch (Exception e) { + // give up: this should not happen! + IOException ioEx = new IOException( + "PANIC: Got unexpected exception while trying to get/calculate total counts: " + +e.getMessage()); + ioEx.initCause(e); + throw ioEx; + } + } + + docids = actualDocsToAccumulate(docids); + + FacetArrays facetArrays = new FacetArrays(intArrayAllocator, floatArrayAllocator); + + HashMap fr2tmpRes = new HashMap(); + + try { + for (int part = 0; part < maxPartitions; part++) { + + // fill arrays from category lists + fillArraysForPartition(docids, facetArrays, part); + + int offset = part * partitionSize; + + // for each partition we go over all requests and handle + // each, where + // the request maintains the merged result. + // In this implementation merges happen after each + // partition, + // but other impl could merge only at the end. + for (FacetRequest fr : searchParams.getFacetRequests()) { + FacetResultsHandler frHndlr = fr.createFacetResultsHandler(taxonomyReader); + IntermediateFacetResult res4fr = frHndlr.fetchPartitionResult(facetArrays, offset); + IntermediateFacetResult oldRes = fr2tmpRes.get(fr); + if (oldRes != null) { + res4fr = frHndlr.mergeResults(oldRes, res4fr); + } + fr2tmpRes.put(fr, res4fr); + } + } + } finally { + facetArrays.free(); + } + + // gather results from all requests into a list for returning them + List res = new ArrayList(); + for (FacetRequest fr : searchParams.getFacetRequests()) { + FacetResultsHandler frHndlr = fr.createFacetResultsHandler(taxonomyReader); + IntermediateFacetResult tmpResult = fr2tmpRes.get(fr); + if (tmpResult == null) { + continue; // do not add a null to the list. + } + FacetResult facetRes = frHndlr.renderFacetResult(tmpResult); + // final labeling if allowed (because labeling is a costly operation) + if (isAllowLabeling()) { + frHndlr.labelResult(facetRes); + } + res.add(facetRes); + } + + return res; + } + } + + /** + * Set the actual set of documents over which accumulation should take place. + *
+ * <p>
+ * Allows to override the set of documents to accumulate for. Invoked just + * before actual accumulating starts. From this point that set of documents + * remains unmodified. Default implementation just returns the input + * unchanged. + * + * @param docids + * candidate documents to accumulate for + * @return actual documents to accumulate for + */ + protected ScoredDocIDs actualDocsToAccumulate(ScoredDocIDs docids) throws IOException { + return docids; + } + + /** Check if it is worth to use complements */ + protected boolean shouldComplement(ScoredDocIDs docids) { + return + mayComplement() && + (docids.size() > indexReader.numDocs() * getComplementThreshold()) ; + } + + /** + * Iterate over the documents for this partition and fill the facet arrays with the correct + * count/complement count/value. + * @param internalCollector + * @param facetArrays + * @param part + * @throws IOException + */ + private final void fillArraysForPartition(ScoredDocIDs docids, + FacetArrays facetArrays, int partition) throws IOException { + + if (isUsingComplements) { + initArraysByTotalCounts(facetArrays, partition, docids.size()); + } else { + facetArrays.free(); // to get a cleared array for this partition + } + + HashMap categoryLists = getCategoryListMap( + facetArrays, partition); + + for (Entry entry : categoryLists.entrySet()) { + CategoryListIterator categoryList = entry.getKey(); + if (!categoryList.init()) { + continue; + } + + Aggregator categorator = entry.getValue(); + ScoredDocIDsIterator iterator = docids.iterator(); + while (iterator.next()) { + int docID = iterator.getDocID(); + if (!categoryList.skipTo(docID)) { + continue; + } + categorator.setNextDoc(docID, iterator.getScore()); + long ordinal; + while ((ordinal = categoryList.nextCategory()) <= Integer.MAX_VALUE) { + categorator.aggregate((int) ordinal); + } + } + } + } + + /** + * Init arrays for partition by total counts, optionally applying a factor + */ + private final void initArraysByTotalCounts(FacetArrays facetArrays, int partition, int nAccumulatedDocs) { + int[] intArray = facetArrays.getIntArray(); + totalFacetCounts.fillTotalCountsForPartition(intArray, partition); + double totalCountsFactor = getTotalCountsFactor(); + // fix total counts, but only if the effect of this would be meaningfull. + if (totalCountsFactor < 0.99999) { + int delta = nAccumulatedDocs + 1; + for (int i = 0; i < intArray.length; i++) { + intArray[i] *= totalCountsFactor; + // also translate to prevent loss of non-positive values + // due to complement sampling (ie if sampled docs all decremented a certain category). + intArray[i] += delta; + } + } + } + + /** + * Expert: factor by which counts should be multiplied when initializing + * the count arrays from total counts. + * Default implementation for this returns 1, which is a no op. + * @return a factor by which total counts should be multiplied + */ + protected double getTotalCountsFactor() { + return 1; + } + + /** + * Create an {@link Aggregator} and a {@link CategoryListIterator} for each + * and every {@link FacetRequest}. Generating a map, matching each + * categoryListIterator to its matching aggregator. + *
+ * <p>
+ * If two CategoryListIterators are served by the same aggregator, a single + * aggregator is returned for both. + * + * NOTE: If a given category list iterator is needed with two different + * aggregators (e.g counting and association) - an exception is thrown as this + * functionality is not supported at this time. + */ + protected HashMap getCategoryListMap(FacetArrays facetArrays, + int partition) throws IOException { + + HashMap categoryLists = new HashMap(); + + for (FacetRequest facetRequest : searchParams.getFacetRequests()) { + Aggregator categoryAggregator = facetRequest.createAggregator( + isUsingComplements, facetArrays, indexReader, taxonomyReader); + + CategoryListIterator cli = + facetRequest.createCategoryListIterator(indexReader, taxonomyReader, searchParams, partition); + + // get the aggregator + Aggregator old = categoryLists.put(cli, categoryAggregator); + + if (old != null && !old.equals(categoryAggregator)) { + // TODO (Facet): create a more meaningful RE class, and throw it. + throw new RuntimeException( + "Overriding existing category list with different aggregator. THAT'S A NO NO!"); + } + // if the aggregator is the same we're covered + } + + return categoryLists; + } +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/TemporaryObjectAllocator.java b/modules/facet/src/java/org/apache/lucene/facet/search/TemporaryObjectAllocator.java new file mode 100644 index 00000000000..5b219e493c4 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/TemporaryObjectAllocator.java @@ -0,0 +1,114 @@ +package org.apache.lucene.facet.search; + +import java.util.concurrent.ConcurrentLinkedQueue; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An TemporaryObjectAllocator is an object which manages large, reusable, + * temporary objects needed during multiple concurrent computations. The idea + * is to remember some of the previously allocated temporary objects, and + * reuse them if possible to avoid constant allocation and garbage-collection + * of these objects. + *
+ * <p>
+ * This technique is useful for temporary counter arrays in faceted search + * (see {@link FacetsAccumulator}), which can be reused across searches instead + * of being allocated afresh on every search. + *
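To make the pooling pattern concrete, a minimal sketch of a subclass in the spirit of this module's IntArrayAllocator; it assumes the allocator is generic in the pooled type, as the abstract create()/clear() methods indicate, and is illustrative only.

    // Sketch: pools fixed-length int[] counter arrays. Arrays are zeroed on
    // allocate() (via clear()), not when they are handed back with free().
    final class CountArrayAllocator extends TemporaryObjectAllocator<int[]> {
      private final int length;

      CountArrayAllocator(int length, int maxArrays) {
        super(maxArrays);
        this.length = length;
      }

      @Override
      protected int[] create() {
        return new int[length];
      }

      @Override
      protected void clear(int[] array) {
        java.util.Arrays.fill(array, 0);
      }
    }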
+ * <p>
+ * A TemporaryObjectAllocator is thread-safe. + * + * @lucene.experimental + */ +public abstract class TemporaryObjectAllocator { + + // In the "pool" we hold up to "maxObjects" old objects, and if the pool + // is not empty, we return one of its objects rather than allocating a new + // one. + ConcurrentLinkedQueue pool = new ConcurrentLinkedQueue(); + int maxObjects; + + /** + * Construct an allocator for objects of a certain type, keeping around a + * pool of up to maxObjects old objects. + *
+ * <p>
+ * Note that the pool size only restricts the number of objects that hang
+ * around when not needed, but not the maximum number of objects
+ * that are allocated when actually in use: if a number of concurrent
+ * threads ask for an allocation, all of them will get an object, even if
+ * their number is greater than maxObjects. If an application wants to
+ * limit the number of concurrent threads making allocations, it needs to
+ * do so on its own - for example by blocking new threads until the
+ * existing ones have finished. If more than maxObjects are freed, only
+ * maxObjects of them will be kept in the pool - the rest will not, and
+ * will eventually be garbage-collected by Java.
+ * <p>
+ * In particular, when maxObjects=0, this object behaves as a trivial + * allocator, always allocating a new array and never reusing an old one. + */ + public TemporaryObjectAllocator(int maxObjects) { + this.maxObjects = maxObjects; + } + + /** + * Subclasses must override this method to actually create a new object + * of the desired type. + * + */ + protected abstract T create(); + + /** + * Subclasses must override this method to clear an existing object of + * the desired type, to prepare it for reuse. Note that objects will be + * cleared just before reuse (on allocation), not when freed. + */ + protected abstract void clear(T object); + + /** + * Allocate a new object. If there's a previously allocated object in our + * pool, we return it immediately. Otherwise, a new object is allocated. + *
+ * <p>
+ * Don't forget to call {@link #free(Object)} when you're done with the object, + * to return it to the pool. If you don't, memory is not leaked, + * but the pool will remain empty and a new object will be allocated each + * time (just like the maxArrays=0 case). + */ + public final T allocate() { + T object = pool.poll(); + if (object==null) { + return create(); + } + clear(object); + return object; + } + + /** + * Return a no-longer-needed object back to the pool. If we already have + * enough objects in the pool (maxObjects as specified in the constructor), + * the array will not be saved, and Java will eventually garbage collect + * it. + *
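A short sketch of the allocate/free discipline described here; allocator stands for any TemporaryObjectAllocator subclass, such as the counter-array one sketched above.

    // Sketch (code fragment): reuse a pooled array for the duration of one search.
    int[] counts = allocator.allocate();   // returns a pooled array when one is available
    try {
      // ... fill and read counts while processing the search ...
    } finally {
      allocator.free(counts);              // hand the array back for later reuse
    }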
+ * <p>
+ * In particular, when maxArrays=0, the given array is never saved and + * free does nothing. + */ + public final void free(T object) { + if (pool.size() < maxObjects && object != null) { + pool.add(object); + } + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/TopKFacetResultsHandler.java b/modules/facet/src/java/org/apache/lucene/facet/search/TopKFacetResultsHandler.java new file mode 100644 index 00000000000..43df368c661 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/TopKFacetResultsHandler.java @@ -0,0 +1,292 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; +import java.util.ArrayList; + +import org.apache.lucene.facet.search.params.FacetRequest; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.search.results.MutableFacetResultNode; +import org.apache.lucene.facet.search.results.IntermediateFacetResult; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.TaxonomyReader.ChildrenArrays; +import org.apache.lucene.facet.util.ResultSortUtils; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Generate Top-K results for a particular FacetRequest. + *
+ * <p>
+ * K is global (among all results) and is defined by {@link FacetRequest#getNumResults()}. + *
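For orientation, a hedged sketch of issuing a top-K count request and walking the resulting tree; CountFacetRequest and CategoryPath belong to this module, while the readers, doc IDs and the exact FacetSearchParams setter shown are assumptions of this note rather than guarantees of the patch.

    // Sketch (code fragment): ask for the 10 best children (by count) of the "Author" category.
    FacetSearchParams params = new FacetSearchParams();
    params.addFacetRequest(new CountFacetRequest(new CategoryPath("Author"), 10));
    FacetsAccumulator accumulator =
        new StandardFacetsAccumulator(params, indexReader, taxonomyReader);
    for (FacetResult result : accumulator.accumulate(docIds)) {
      for (FacetResultNode node : result.getFacetResultNode().getSubResults()) {
        // node.getLabel(taxonomyReader) and node.getValue() give the category and its count
      }
    }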
+ * <p>
+ * Note: Values of 0 (Zero) are ignored by this results handler. + * + * @lucene.experimental + */ +public class TopKFacetResultsHandler extends FacetResultsHandler { + + /** + * Construct top-K results handler. + * @param taxonomyReader taxonomy reader + * @param facetRequest facet request being served + */ + public TopKFacetResultsHandler(TaxonomyReader taxonomyReader, + FacetRequest facetRequest) { + super(taxonomyReader, facetRequest); + } + + // fetch top K for specific partition. + @Override + public IntermediateFacetResult fetchPartitionResult(FacetArrays facetArrays, int offset) + throws IOException { + TopKFacetResult res = null; + int ordinal = taxonomyReader.getOrdinal(facetRequest.getCategoryPath()); + if (ordinal != TaxonomyReader.INVALID_ORDINAL) { + double value = 0; + if (isSelfPartition(ordinal, facetArrays, offset)) { + int partitionSize = facetArrays.getArraysLength(); + value = facetRequest.getValueOf(facetArrays, ordinal % partitionSize); + } + + // TODO (Facet): should initial value of "residue" depend on aggregator if not sum? + MutableFacetResultNode parentResultNode = + new MutableFacetResultNode(ordinal, value); + + Heap heap = ResultSortUtils.createSuitableHeap(facetRequest); + int totalFacets = heapDescendants(ordinal, heap, parentResultNode, facetArrays, offset); + res = new TopKFacetResult(facetRequest, parentResultNode, totalFacets); + res.setHeap(heap); + } + return res; + } + + // merge given top K results into current + @Override + public IntermediateFacetResult mergeResults(IntermediateFacetResult... tmpResults) throws IOException { + + int ordinal = taxonomyReader.getOrdinal(facetRequest.getCategoryPath()); + MutableFacetResultNode resNode = new MutableFacetResultNode(ordinal, 0); + + int totalFacets = 0; + Heap heap = null; + + // merge other results in queue + for (IntermediateFacetResult tmpFres : tmpResults) { + // cast should succeed + TopKFacetResult fres = (TopKFacetResult) tmpFres; + totalFacets += fres.getNumValidDescendants(); + // set the value for the result node representing the facet request + resNode.increaseValue(fres.getFacetResultNode().getValue()); + Heap tmpHeap = fres.getHeap(); + if (heap == null) { + heap = tmpHeap; + continue; + } + // bring sub results from heap of tmp res into result heap + for (int i = tmpHeap.size(); i > 0; i--) { + + FacetResultNode a = heap.insertWithOverflow(tmpHeap.pop()); + if (a != null) { + resNode.increaseResidue(a.getResidue()); + } + } + } + + TopKFacetResult res = new TopKFacetResult(facetRequest, resNode, totalFacets); + res.setHeap(heap); + return res; + } + + /** + * Finds the top K descendants of ordinal, which are at most facetRequest.getDepth() + * deeper than facetRequest.getCategoryPath (whose ordinal is input parameter ordinal). + * Candidates are restricted to current "counting list" and current "partition", + * they join the overall priority queue pq of size K. + * @return total number of descendants considered here by pq, excluding ordinal itself. 
+ */ + private int heapDescendants(int ordinal, Heap pq, + MutableFacetResultNode parentResultNode, FacetArrays facetArrays, int offset) { + int partitionSize = facetArrays.getArraysLength(); + int endOffset = offset + partitionSize; + ChildrenArrays childrenArray = taxonomyReader.getChildrenArrays(); + int[] youngestChild = childrenArray.getYoungestChildArray(); + int[] olderSibling = childrenArray.getOlderSiblingArray(); + FacetResultNode reusable = null; + int localDepth = 0; + int depth = facetRequest.getDepth(); + int[] ordinalStack = new int[2+Math.min(Short.MAX_VALUE, depth)]; + int childrenCounter = 0; + + int tosOrdinal; // top of stack element + + int yc = youngestChild[ordinal]; + while (yc >= endOffset) { + yc = olderSibling[yc]; + } + // make use of the fact that TaxonomyReader.INVALID_ORDINAL == -1, < endOffset + // and it, too, can stop the loop. + ordinalStack[++localDepth] = yc; + + /* + * stack holds input parameter ordinal in position 0. + * Other elements are < endoffset. + * Only top of stack can be TaxonomyReader.INVALID_ORDINAL, and this if and only if + * the element below it exhausted all its children: has them all processed. + * + * stack elements are processed (counted and accumulated) only if they + * belong to current partition (between offset and endoffset) and first time + * they are on top of stack + * + * loop as long as stack is not empty of elements other than input ordinal, or for a little while -- it sibling + */ + while (localDepth > 0) { + tosOrdinal = ordinalStack[localDepth]; + if (tosOrdinal == TaxonomyReader.INVALID_ORDINAL) { + // element below tos has all its children, and itself, all processed + // need to proceed to its sibling + localDepth--; + // change element now on top of stack to its sibling. + ordinalStack[localDepth] = olderSibling[ordinalStack[localDepth]]; + continue; + } + // top of stack is not invalid, this is the first time we see it on top of stack. + // collect it, if belongs to current partition, and then push its kids on itself, if applicable + if (tosOrdinal >= offset) { // tosOrdinal resides in current partition + int relativeOrdinal = tosOrdinal % partitionSize; + double value = facetRequest.getValueOf(facetArrays, relativeOrdinal); + if (value != 0 && !Double.isNaN(value)) { + // Count current ordinal -- the TOS + if (reusable == null) { + reusable = new MutableFacetResultNode(tosOrdinal, value); + } else { + // it is safe to cast since reusable was created here. + ((MutableFacetResultNode)reusable).reset(tosOrdinal, value); + } + ++childrenCounter; + reusable = pq.insertWithOverflow(reusable); + if (reusable != null) { + // TODO (Facet): is other logic (not add) needed, per aggregator? 
+ parentResultNode.increaseResidue(reusable.getValue()); + } + } + } + if (localDepth < depth) { + // push kid of current tos + yc = youngestChild[tosOrdinal]; + while (yc >= endOffset) { + yc = olderSibling[yc]; + } + ordinalStack[++localDepth] = yc; + } else { // localDepth == depth; current tos exhausted its possible children, mark this by pushing INVALID_ORDINAL + ordinalStack[++localDepth] = TaxonomyReader.INVALID_ORDINAL; + } + } // endof while stack is not empty + + return childrenCounter; // we're done + } + + @Override + public FacetResult renderFacetResult(IntermediateFacetResult tmpResult) { + TopKFacetResult res = (TopKFacetResult) tmpResult; // cast is safe by contract of this class + if (res != null) { + Heap heap = res.getHeap(); + MutableFacetResultNode resNode = (MutableFacetResultNode)res.getFacetResultNode(); // cast safe too + for (int i = heap.size(); i > 0; i--) { + resNode.insertSubResult(heap.pop()); + } + } + return res; + } + + @Override + public FacetResult rearrangeFacetResult(FacetResult facetResult) { + TopKFacetResult res = (TopKFacetResult) facetResult; // cast is safe by contract of this class + Heap heap = res.getHeap(); + heap.clear(); // just to be safe + MutableFacetResultNode topFrn = (MutableFacetResultNode) res.getFacetResultNode(); // safe cast + for (FacetResultNode frn : topFrn.getSubResults()) { + heap.add(frn); + } + int size = heap.size(); + ArrayList subResults = new ArrayList(size); + for (int i = heap.size(); i > 0; i--) { + subResults.add(0,heap.pop()); + } + topFrn.setSubResults(subResults); + return res; + } + + @Override + // label top K sub results + public void labelResult(FacetResult facetResult) throws IOException { + if (facetResult != null) { // any result to label? + FacetResultNode facetResultNode = facetResult.getFacetResultNode(); + if (facetResultNode != null) { // any result to label? + facetResultNode.getLabel(taxonomyReader); + int num2label = facetRequest.getNumLabel(); + for (FacetResultNode frn : facetResultNode.getSubResults()) { + if (--num2label < 0) { + break; + } + frn.getLabel(taxonomyReader); + } + } + } + } + + //////////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////////// + + /** + * Private Mutable implementation of result of faceted search. + */ + private static class TopKFacetResult extends FacetResult implements IntermediateFacetResult { + + // TODO (Facet): is it worth to override PriorityQueue.getSentinelObject() + // for any of our PQs? + private Heap heap; + + /** + * Create a Facet Result. + * @param facetRequest Request for which this result was obtained. + * @param facetResultNode top result node for this facet result. + * @param totalFacets - number of children of the targetFacet, up till the requested depth. + */ + TopKFacetResult(FacetRequest facetRequest, MutableFacetResultNode facetResultNode, int totalFacets) { + super(facetRequest, facetResultNode, totalFacets); + } + + /** + * @return the heap + */ + public Heap getHeap() { + return heap; + } + + /** + * Set the heap for this result. + * @param heap heap top be set. 
+ */ + public void setHeap(Heap heap) { + this.heap = heap; + } + + } + + ////////////////////////////////////////////////////// +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/TopKInEachNodeHandler.java b/modules/facet/src/java/org/apache/lucene/facet/search/TopKInEachNodeHandler.java new file mode 100644 index 00000000000..2aee7351a9c --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/TopKInEachNodeHandler.java @@ -0,0 +1,797 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.util.PriorityQueue; + +import org.apache.lucene.facet.search.params.FacetRequest; +import org.apache.lucene.facet.search.params.FacetRequest.SortOrder; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.search.results.MutableFacetResultNode; +import org.apache.lucene.facet.search.results.IntermediateFacetResult; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.TaxonomyReader.ChildrenArrays; +import org.apache.lucene.util.collections.IntIterator; +import org.apache.lucene.util.collections.IntToObjectMap; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Generates {@link FacetResult} from the count arrays aggregated for a particular + * {@link FacetRequest}. + * The generated {@link FacetResult} is a subtree of the taxonomy tree. + * Its root node, {@link FacetResult#getFacetResultNode()}, + * is the facet specified by {@link FacetRequest#getCategoryPath()}, + * and the enumerated children, {@link FacetResultNode#getSubResults()}, of each node in that + * {@link FacetResult} are the top K ( = {@link FacetRequest#getNumResults()}) among its children + * in the taxonomy. + * Top in the sense {@link FacetRequest#getSortBy()}, + * which can be by the values aggregated in the count arrays, or by ordinal numbers; + * also specified is the sort order, {@link FacetRequest#getSortOrder()}, + * ascending or descending, of these values or ordinals before their top K are selected. + * The depth (number of levels excluding the root) of the + * {@link FacetResult} tree is specified by {@link FacetRequest#getDepth()}. + *
+ * <p>
+ * Because the number of selected children of each node is restricted,
+ * and not the overall number of nodes in the {@link FacetResult}, facets not selected
+ * into the {@link FacetResult} might have better values, or ordinals (typically
+ * higher counts), than facets that are selected into the {@link FacetResult}.
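For example (illustrative numbers only): with getNumResults() = 2 and getDepth() = 2, every node in the returned tree keeps just its own two best children, so a grandchild on one branch may be dropped even though its count is higher than that of a child kept on another branch; the limit is applied per node, not over the tree as a whole.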
+ * <p>
+ * The generated {@link FacetResult} also provides
+ * {@link FacetResult#getNumValidDescendants()}, which returns the total number of facets
+ * that are descendants of the root node, no deeper than {@link FacetRequest#getDepth()}, and
+ * which have a valid value. The root node itself is not counted here.
+ * What constitutes a valid value is determined by the {@link FacetResultsHandler};
+ * {@link TopKInEachNodeHandler} defines valid as != 0.
+ * <p>
+ * NOTE: this code relies on the assumption that {@link TaxonomyReader#INVALID_ORDINAL} == -1, a smaller + * value than any valid ordinal. + * + * @lucene.experimental + */ +public class TopKInEachNodeHandler extends FacetResultsHandler { + + public TopKInEachNodeHandler(TaxonomyReader taxonomyReader, + FacetRequest facetRequest) { + super(taxonomyReader, facetRequest); + } + + /** + * Recursively explore all facets that can be potentially included in the + * {@link FacetResult} to be generated, and that belong to the given + * partition, so that values can be examined and collected. For each such + * node, gather its top K ({@link FacetRequest#getNumResults()}) children + * among its children that are encountered in the given particular partition + * (aka current counting list). + * + * @return {@link IntermediateFacetResult} consisting of + * {@link IntToObjectMap} that maps potential + * {@link FacetResult} nodes to their top K children encountered in + * the current partition. Note that the mapped potential tree nodes + * need not belong to the given partition, only the top K children + * mapped to. The aim is to identify nodes that are certainly excluded + * from the {@link FacetResult} to be eventually (after going through + * all the partitions) returned by this handler, because they have K + * better siblings, already identified in this partition. For the + * identified excluded nodes, we only count number of their + * descendants in the subtree (to be included in + * {@link FacetResult#getNumValidDescendants()}), but not bother with + * selecting top K in these generations, which, by definition, are, + * too, excluded from the FacetResult tree. + * @param arrays the already filled in count array, potentially only covering + * one partition: the ordinals ranging from + * @param offset to offset + the length of the count arrays + * within arrays (exclusive) + * @throws IOException in case + * {@link TaxonomyReader#getOrdinal(org.apache.lucene.facet.taxonomy.CategoryPath)} + * does. + * @see FacetResultsHandler#fetchPartitionResult(FacetArrays, int) + */ + @Override + public IntermediateFacetResult fetchPartitionResult(FacetArrays arrays, int offset) throws IOException { + + // get the root of the result tree to be returned, and the depth of that result tree + // (depth means number of node levels excluding the root). + int rootNode = this.taxonomyReader.getOrdinal(this.facetRequest.getCategoryPath()); + if (rootNode == TaxonomyReader.INVALID_ORDINAL) { + return null; + } + + int K = Math.min(facetRequest.getNumResults(),taxonomyReader.getSize()); // number of best results in each node + + // this will grow into the returned IntermediateFacetResult + IntToObjectMap AACOsOfOnePartition = new IntToObjectMap(); + + int partitionSize = arrays.getArraysLength(); // all partitions, except, possibly, the last, + // have the same length. Hence modulo is OK. + + int depth = facetRequest.getDepth(); + + if (depth == 0) { + // Need to only have root node. 
+ IntermediateFacetResultWithHash tempFRWH = new IntermediateFacetResultWithHash( + facetRequest, AACOsOfOnePartition); + if (isSelfPartition(rootNode, arrays, offset)) { + tempFRWH.isRootNodeIncluded = true; + tempFRWH.rootNodeValue = this.facetRequest.getValueOf(arrays, rootNode % partitionSize); + } + return tempFRWH; + } + + if (depth > Short.MAX_VALUE - 3) { + depth = Short.MAX_VALUE -3; + } + + int endOffset = offset + partitionSize; // one past the largest ordinal in the partition + ChildrenArrays childrenArray = taxonomyReader.getChildrenArrays(); + int[] youngestChild = childrenArray.getYoungestChildArray(); + int[] olderSibling = childrenArray.getOlderSiblingArray(); + int totalNumOfDescendantsConsidered = 0; // total number of facets with value != 0, + // in the tree. These include those selected as top K in each node, and all the others that + // were not. Not including rootNode + + // the following priority queue will be used again and again for each node recursed into + // to select its best K children among its children encountered in the given partition + PriorityQueue pq = + new AggregatedCategoryHeap(K, this.getSuitableACComparator()); + + // reusables will feed the priority queue in each use + AggregatedCategory [] reusables = new AggregatedCategory[2+K]; + for (int i = 0; i < reusables.length; i++) { + reusables[i] = new AggregatedCategory(1,0); + } + + /* + * The returned map is built by a recursive visit of potential tree nodes. Nodes + * determined to be excluded from the FacetResult are not recursively explored as others, + * they are only recursed in order to count the number of their descendants. + * Also, nodes that they and any of their descendants can not be mapped into facets encountered + * in this partition, are, too, explored no further. These are facets whose ordinal + * numbers are greater than the ordinals of the given partition. (recall that the Taxonomy + * maintains that a parent ordinal is smaller than any of its descendants' ordinals). + * So, when scanning over all children of a potential tree node n: (1) all children with ordinal number + * greater than those in the given partition are skipped over, (2) among the children of n residing + * in this partition, the best K children are selected (using pq) for usual further recursion + * and the rest (those rejected out from the pq) are only recursed for counting total number + * of descendants, and (3) all the children of ordinal numbers smaller than the given partition + * are further explored in the usual way, since these may lead to descendants residing in this partition. + * + * ordinalStack drives the recursive descent. + * Top of stack holds the current node which we recurse from. + * ordinalStack[0] holds the root of the facetRequest, and + * it is always maintained that parent(ordianlStack[i]) = ordinalStack[i-1]. + * localDepth points to the current top of ordinalStack. + * Only top of ordinalStack can be TaxonomyReader.INVALID_ORDINAL, and this if and only if + * the element below it explored all its relevant children. + */ + int[] ordinalStack = new int[depth+2]; // for 0 and for invalid on top + ordinalStack[0] = rootNode; + int localDepth = 0; + + /* + * bestSignlingsStack[i] maintains the best K children of ordinalStack[i-1], namely, + * the best K siblings of ordinalStack[i], best K among those residing in the given partition. + * Note that the residents of ordinalStack need not belong + * to the current partition, only the residents of bestSignlingsStack. 
+ * When exploring the children of ordianlStack[i-1] that reside in the current partition + * (after the top K of them have been determined and stored into bestSignlingsStack[i]), + * siblingExplored[i] points into bestSignlingsStack[i], to the child now explored, hence + * residing in ordinalStack[i], and firstToTheLeftOfPartition[i] holds the largest ordinal of + * a sibling smaller than the ordinals in the partition. + * When siblingExplored[i] == max int, the top K siblings of ordinalStack[i] among those siblings + * that reside in this partition have not been determined yet. + * if siblingExplored[i] < 0, the node in ordinalStack[i] is to the left of partition + * (i.e. of a smaller ordinal than the current partition) + * (step (3) above is executed for the children of ordianlStack[i-1]) + */ + int[][] bestSignlingsStack = new int[depth+2][]; + int[] siblingExplored = new int[depth+2]; + int[] firstToTheLeftOfPartition = new int [depth+2]; + + int tosOrdinal; // top of stack element, the ordinal at the top of stack + + /* + * to start the loop, complete the datastructures for root node: + * push its youngest child to ordinalStack; make a note in siblingExplored[] that the children + * of rootNode, which reside in the current partition have not been read yet to select the top + * K of them. Also, make rootNode as if, related to its parent, rootNode belongs to the children + * of ordinal numbers smaller than those of the current partition (this will ease on end condition -- + * we can continue to the older sibling of rootNode once the localDepth goes down, before we verify that + * it went that down) + */ + ordinalStack[++localDepth] = youngestChild[rootNode]; + siblingExplored[localDepth] = Integer.MAX_VALUE; // we have not verified position wrt current partition + siblingExplored[0] = -1; // as if rootNode resides to the left of current position + + /* + * now the whole recursion: loop as long as stack is not empty of elements descendants of + * facetRequest's root. + */ + + while (localDepth > 0) { + tosOrdinal = ordinalStack[localDepth]; + if (tosOrdinal == TaxonomyReader.INVALID_ORDINAL) { + // the brotherhood that has been occupying the top of stack is all exhausted. + // Hence, element below tos, namely, father of tos, has all its children, + // and itself, all explored. + localDepth--; + // replace this father, now on top of stack, by this father's sibling: + // this parent's ordinal can not be greater than current partition, as otherwise + // its child, now just removed, would not have been pushed on it. 
+ // so the father is either inside the partition, or smaller ordinal + if (siblingExplored[localDepth] < 0 ) { + ordinalStack[localDepth] = olderSibling[ordinalStack[localDepth]]; + continue; + } + // in this point, siblingExplored[localDepth] between 0 and number of bestSiblings + // it can not be max int + siblingExplored[localDepth]--; + if (siblingExplored[localDepth] == -1 ) { + //siblings residing in the partition have been all processed, we now move + // to those of ordinal numbers smaller than the partition + ordinalStack[localDepth] = firstToTheLeftOfPartition[localDepth]; + } else { + // still explore siblings residing in the partition + // just move to the next one + ordinalStack[localDepth] = bestSignlingsStack[localDepth][siblingExplored[localDepth]]; + } + continue; + } // endof tosOrdinal is invalid, and hence removed, and its parent was replaced by this + // parent's sibling + + // now try to push a kid, but first look at tos whether it 'deserves' its kids explored: + // it is not to the right of current partition, and we know whether to only count or to + // select best K siblings. + if (siblingExplored[localDepth] == Integer.MAX_VALUE) { + //tosOrdinal was not examined yet for its position relative to current partition + // and the best K of current partition, among its siblings, have not been determined yet + while (tosOrdinal >= endOffset) { + tosOrdinal = olderSibling[tosOrdinal]; + } + // now it is inside. Run it and all its siblings inside the partition through a heap + // and in doing so, count them, find best K, and sum into residue + double residue = 0f; // the sum of all the siblings from this partition that do not make + // it to top K + pq.clear(); + + //reusables are consumed as from a stack. The stack starts full and returns full. + int tosReuslables = reusables.length -1; + + while (tosOrdinal >= offset) { // while tosOrdinal belongs to the given partition; here, too, we use the fact + // that TaxonomyReader.INVALID_ORDINAL == -1 < offset + double value = facetRequest.getValueOf(arrays, tosOrdinal % partitionSize); + if (value != 0) { // the value of yc is not 0, it is to be considered. + totalNumOfDescendantsConsidered++; + + // consume one reusable, and push to the priority queue + AggregatedCategory ac = reusables[tosReuslables--]; + ac.ordinal = tosOrdinal; + ac.value = value; + ac = pq.insertWithOverflow(ac); + if (null != ac) { + residue += ac.value; + // TODO (Facet): could it be that we need to do something + // else, not add, depending on the aggregator? + + /* when a facet is excluded from top K, because already in this partition it has + * K better siblings, it is only recursed for count only. + */ + // update totalNumOfDescendants by the now excluded node and all its descendants + totalNumOfDescendantsConsidered--; // reduce the 1 earned when the excluded node entered the heap + // and now return it and all its descendants. These will never make it to FacetResult + totalNumOfDescendantsConsidered += countOnly (ac.ordinal, youngestChild, + olderSibling, arrays, partitionSize, offset, endOffset, localDepth, depth); + reusables[++tosReuslables] = ac; + } + } + tosOrdinal = olderSibling[tosOrdinal]; + } + // now pq has best K children of ordinals that belong to the given partition. + // Populate a new AACO with them. 
+ // tosOrdinal is now first sibling smaller than partition, make a note of that + firstToTheLeftOfPartition[localDepth] = tosOrdinal; + int aaci = pq.size(); + int[] ords = new int[aaci]; + double [] vals = new double [aaci]; + while (aaci > 0) { + AggregatedCategory ac = pq.pop(); + ords[--aaci] = ac.ordinal; + vals[aaci] = ac.value; + reusables[++tosReuslables] = ac; + } + // if more than 0 ordinals, add this AACO to the map to be returned, + // and add ords to sibling stack, and make a note in siblingExplored that these are to + // be visited now + if (ords.length > 0) { + AACOsOfOnePartition.put(ordinalStack[localDepth-1], new AACO(ords,vals,residue)); + bestSignlingsStack[localDepth] = ords; + siblingExplored[localDepth] = ords.length-1; + ordinalStack[localDepth] = ords[ords.length-1]; + } else { + // no ordinals siblings of tosOrdinal in current partition, move to the left of it + // tosOrdinal is already there (to the left of partition). + // make a note of it in siblingExplored + ordinalStack[localDepth] = tosOrdinal; + siblingExplored[localDepth] = -1; + } + continue; + } // endof we did not check the position of a valid ordinal wrt partition + + // now tosOrdinal is a valid ordinal, inside partition or to the left of it, we need + // to push its kids on top of it, if not too deep. + // Make a note that we did not check them yet + if (localDepth >= depth) { + // localDepth == depth; current tos exhausted its possible children, mark this by pushing INVALID_ORDINAL + ordinalStack[++localDepth] = TaxonomyReader.INVALID_ORDINAL; + continue; + } + ordinalStack[++localDepth] = youngestChild[tosOrdinal]; + siblingExplored[localDepth] = Integer.MAX_VALUE; + } // endof loop while stack is not empty + + // now generate a TempFacetResult from AACOsOfOnePartition, and consider self. + IntermediateFacetResultWithHash tempFRWH = new IntermediateFacetResultWithHash( + facetRequest, AACOsOfOnePartition); + if (isSelfPartition(rootNode, arrays, offset)) { + tempFRWH.isRootNodeIncluded = true; + tempFRWH.rootNodeValue = this.facetRequest.getValueOf(arrays, rootNode % partitionSize); + } + tempFRWH.totalNumOfFacetsConsidered = totalNumOfDescendantsConsidered; + return tempFRWH; + + } + + /** + * Recursively count ordinal, whose depth is currentDepth, + * and all its descendants down to maxDepth (including), + * descendants whose value in the count arrays, arrays, is != 0. + * The count arrays only includes the current partition, from offset, to (exclusive) + * endOffset. + * It is assumed that ordinal < endOffset, + * otherwise, not ordinal, and none of its descendants, reside in + * the current partition. ordinal < offset is allowed, + * as ordinal's descendants might be >= offeset. + * + * @param ordinal a facet ordinal. + * @param youngestChild mapping a given ordinal to its youngest child in the taxonomy (of largest ordinal number), + * or to -1 if has no children. + * @param olderSibling mapping a given ordinal to its older sibling, or to -1 + * @param arrays values for the ordinals in the given partition + * @param offset the first (smallest) ordinal in the given partition + * @param partitionSize number of ordinals in the given partition + * @param endOffset one larger than the largest ordinal that belong to this partition + * @param currentDepth the depth or ordinal in the TaxonomyTree (relative to rootnode of the facetRequest) + * @param maxDepth maximal depth of descendants to be considered here (measured relative to rootnode of the + * facetRequest). 
+ * + * @return the number of nodes, from ordinal down its descendants, of depth <= maxDepth, + * which reside in the current partition, and whose value != 0 + */ + private int countOnly(int ordinal, int[] youngestChild, int[] olderSibling, + FacetArrays arrays, int partitionSize, int offset, + int endOffset, int currentDepth, int maxDepth) { + int ret = 0; + if (offset <= ordinal) { + // ordinal belongs to the current partition + if (0 != facetRequest.getValueOf(arrays, ordinal % partitionSize)) { + ret++; + } + } + // now consider children of ordinal, if not too deep + if (currentDepth >= maxDepth) { + return ret; + } + + int yc = youngestChild[ordinal]; + while (yc >= endOffset) { + yc = olderSibling[yc]; + } + while (yc > TaxonomyReader.INVALID_ORDINAL) { // assuming this is -1, smaller than any legal ordinal + ret += countOnly (yc, youngestChild, olderSibling, arrays, + partitionSize, offset, endOffset, currentDepth+1, maxDepth); + yc = olderSibling[yc]; + } + return ret; + } + + /** + * Merge several partitions' {@link IntermediateFacetResult}-s into one of the + * same format + * + * @see FacetResultsHandler#mergeResults(IntermediateFacetResult...) + */ + @Override + public IntermediateFacetResult mergeResults(IntermediateFacetResult... tmpResults) + throws ClassCastException, IllegalArgumentException { + + if (tmpResults.length == 0) { + return null; + } + + int i=0; + // skip over null tmpResults + for (; (i < tmpResults.length)&&(tmpResults[i] == null); i++) {} + if (i == tmpResults.length) { + // all inputs are null + return null; + } + + // i points to the first non-null input + int K = this.facetRequest.getNumResults(); // number of best result in each node + IntermediateFacetResultWithHash tmpToReturn = (IntermediateFacetResultWithHash)tmpResults[i++]; + + // now loop over the rest of tmpResults and merge each into tmpToReturn + for ( ; i < tmpResults.length; i++) { + IntermediateFacetResultWithHash tfr = (IntermediateFacetResultWithHash)tmpResults[i]; + tmpToReturn.totalNumOfFacetsConsidered += tfr.totalNumOfFacetsConsidered; + if (tfr.isRootNodeIncluded) { + tmpToReturn.isRootNodeIncluded = true; + tmpToReturn.rootNodeValue = tfr.rootNodeValue; + } + // now merge the HashMap of tfr into this of tmpToReturn + IntToObjectMap tmpToReturnMapToACCOs = tmpToReturn.mapToAACOs; + IntToObjectMap tfrMapToACCOs = tfr.mapToAACOs; + IntIterator tfrIntIterator = tfrMapToACCOs.keyIterator(); + //iterate over all ordinals in tfr that are maps to their children (and the residue over + // non included chilren) + while (tfrIntIterator.hasNext()) { + int tfrkey = tfrIntIterator.next(); + AACO tmpToReturnAACO = null; + if (null == (tmpToReturnAACO = tmpToReturnMapToACCOs.get(tfrkey))) { + // if tmpToReturn does not have any kids of tfrkey, map all the kids + // from tfr to it as one package, along with their redisude + tmpToReturnMapToACCOs.put(tfrkey, tfrMapToACCOs.get(tfrkey)); + } else { + // merge the best K children of tfrkey as appear in tmpToReturn and in tfr + AACO tfrAACO = tfrMapToACCOs.get(tfrkey); + int resLength = tfrAACO.ordinals.length + tmpToReturnAACO.ordinals.length; + if (K < resLength) { + resLength = K; + } + int[] resOrds = new int [resLength]; + double[] resVals = new double [resLength]; + double resResidue = tmpToReturnAACO.residue + tfrAACO.residue; + int indexIntoTmpToReturn = 0; + int indexIntoTFR = 0; + ACComparator merger = getSuitableACComparator(); // by facet Request + for (int indexIntoRes = 0; indexIntoRes < resLength; indexIntoRes++) { + if 
(indexIntoTmpToReturn >= tmpToReturnAACO.ordinals.length) { + //tmpToReturnAACO (former result to return) ran out of indices + // it is all merged into resOrds and resVal + resOrds[indexIntoRes] = tfrAACO.ordinals[indexIntoTFR]; + resVals[indexIntoRes] = tfrAACO.values[indexIntoTFR]; + indexIntoTFR++; + continue; + } + if (indexIntoTFR >= tfrAACO.ordinals.length) { + // tfr ran out of indices + resOrds[indexIntoRes] = tmpToReturnAACO.ordinals[indexIntoTmpToReturn]; + resVals[indexIntoRes] = tmpToReturnAACO.values[indexIntoTmpToReturn]; + indexIntoTmpToReturn++; + continue; + } + // select which goes now to res: next (ord, value) from tmpToReturn or from tfr: + if (merger.leftGoesNow( tmpToReturnAACO.ordinals[indexIntoTmpToReturn], + tmpToReturnAACO.values[indexIntoTmpToReturn], + tfrAACO.ordinals[indexIntoTFR], + tfrAACO.values[indexIntoTFR])) { + resOrds[indexIntoRes] = tmpToReturnAACO.ordinals[indexIntoTmpToReturn]; + resVals[indexIntoRes] = tmpToReturnAACO.values[indexIntoTmpToReturn]; + indexIntoTmpToReturn++; + } else { + resOrds[indexIntoRes] = tfrAACO.ordinals[indexIntoTFR]; + resVals[indexIntoRes] = tfrAACO.values[indexIntoTFR]; + indexIntoTFR++; + } + } // end of merge of best kids of tfrkey that appear in tmpToReturn and its kids that appear in tfr + // altogether yielding no more that best K kids for tfrkey, not to appear in the new shape of + // tmpToReturn + + while (indexIntoTmpToReturn < tmpToReturnAACO.ordinals.length) { + resResidue += tmpToReturnAACO.values[indexIntoTmpToReturn++]; + } + while (indexIntoTFR < tfrAACO.ordinals.length) { + resResidue += tfrAACO.values[indexIntoTFR++]; + } + //update the list of best kids of tfrkey as appear in tmpToReturn + tmpToReturnMapToACCOs.put(tfrkey, new AACO(resOrds, resVals, resResidue)); + } // endof need to merge both AACO -- children and residue for same ordinal + + } // endof loop over all ordinals in tfr + } // endof loop over all temporary facet results to merge + + return tmpToReturn; + } + + private static class AggregatedCategoryHeap extends PriorityQueue { + + private ACComparator merger; + public AggregatedCategoryHeap(int size, ACComparator merger) { + super(size); + this.merger = merger; + } + + @Override + protected boolean lessThan(AggregatedCategory arg1, AggregatedCategory arg2) { + return merger.leftGoesNow(arg2.ordinal, arg2.value, arg1.ordinal, arg1.value); + } + + } + + private static class ResultNodeHeap extends PriorityQueue { + private ACComparator merger; + public ResultNodeHeap(int size, ACComparator merger) { + super(size); + this.merger = merger; + } + + @Override + protected boolean lessThan(FacetResultNode arg1, FacetResultNode arg2) { + return merger.leftGoesNow(arg2.getOrdinal(), arg2.getValue(), arg1.getOrdinal(), arg1.getValue()); + } + + } + + /** + * @return the {@link ACComparator} that reflects the order, + * expressed in the {@link FacetRequest}, of + * facets in the {@link FacetResult}. 
+ */ + + private ACComparator getSuitableACComparator() { + if (facetRequest.getSortOrder() == SortOrder.ASCENDING) { + switch (facetRequest.getSortBy()) { + case VALUE: + return new AscValueACComparator(); + case ORDINAL: + return new AscOrdACComparator(); + } + } else { + switch (facetRequest.getSortBy()) { + case VALUE: + return new DescValueACComparator(); + case ORDINAL: + return new DescOrdACComparator(); + } + } + return null; + } + + /** + * A comparator of two Aggregated Categories according to the order + * (ascending / descending) and item (ordinal or value) specified in the + * FacetRequest for the FacetResult to be generated + */ + + private static abstract class ACComparator { + ACComparator() { } + protected abstract boolean leftGoesNow (int ord1, double val1, int ord2, double val2); + } + + private static final class AscValueACComparator extends ACComparator { + + AscValueACComparator() { } + + @Override + protected boolean leftGoesNow (int ord1, double val1, int ord2, double val2) { + return (val1 < val2); + } + } + + private static final class DescValueACComparator extends ACComparator { + + DescValueACComparator() { } + + @Override + protected boolean leftGoesNow (int ord1, double val1, int ord2, double val2) { + return (val1 > val2); + } + } + + private static final class AscOrdACComparator extends ACComparator { + + AscOrdACComparator() { } + + @Override + protected boolean leftGoesNow (int ord1, double val1, int ord2, double val2) { + return (ord1 < ord2); + } + } + + private static final class DescOrdACComparator extends ACComparator { + + DescOrdACComparator() { } + + @Override + protected boolean leftGoesNow (int ord1, double val1, int ord2, double val2) { + return (ord1 > ord2); + } + } + + /** + * Intermediate result to hold counts from one or more partitions processed + * thus far. Its main field, constructor parameter mapToAACOs, is a map + * from ordinals to AACOs. The AACOs mapped to contain ordinals and values + * encountered in the count arrays of the partitions processed thus far. The + * ordinals mapped from are their parents, and they may be not contained in + * the partitions processed thus far. All nodes belong to the taxonomy subtree + * defined at the facet request, constructor parameter facetReq, by its + * root and depth. + */ + public static class IntermediateFacetResultWithHash implements IntermediateFacetResult { + protected IntToObjectMap mapToAACOs; + FacetRequest facetRequest; + boolean isRootNodeIncluded; // among the ordinals in the partitions + // processed thus far + double rootNodeValue; // the value of it, in case encountered. + int totalNumOfFacetsConsidered; // total number of facets + // which belong to facetRequest subtree and have value != 0, + // and have been encountered thus far in the partitions processed. + // root node of result tree is not included in this count. + + public IntermediateFacetResultWithHash(FacetRequest facetReq, + IntToObjectMap mapToAACOs) { + this.mapToAACOs = mapToAACOs; + this.facetRequest = facetReq; + this.isRootNodeIncluded = false; + this.rootNodeValue = 0.0; + this.totalNumOfFacetsConsidered = 0; + } + + public FacetRequest getFacetRequest() { + return this.facetRequest; + } + } // endof FacetResultWithHash + + /** + * Maintains info of one entry in the filled up count array: + * an ordinal number of a category and the value aggregated for it + * (typically, that value is the count for that ordinal). 
+ */ + private static final class AggregatedCategory { + int ordinal; + double value; + AggregatedCategory(int ord, double val) { + this.ordinal = ord; + this.value = val; + } + } + + /** + * Maintains an array of {@link AggregatedCategory}. For space consideration, this is implemented as + * a pair of arrays, ordinals and values, rather than one array of pairs. + * Enumerated in ordinals are siblings, + * potential nodes of the {@link FacetResult} tree + * (i.e., the descendants of the root node, no deeper than the specified depth). + * No more than K ( = {@link FacetRequest#getNumResults()}) + * siblings are enumerated, and + * residue holds the sum of values of the siblings rejected from the + * enumerated top K. + */ + private static final class AACO { + int [] ordinals; // ordinals of the best K children, sorted from best to least + double [] values; // the respective values for these children + double residue; // sum of values of all other children, that did not get into top K + AACO (int[] ords, double[] vals, double r) { + this.ordinals = ords; + this.values = vals; + this.residue = r; + } + } + + @Override + /** + * Recursively label the first facetRequest.getNumLabel() sub results + * of the root of a given {@link FacetResult}, or of an already labeled node in it. + * I.e., a node is labeled only if it is the root or all its ancestors are labeled. + */ + public void labelResult(FacetResult facetResult) throws IOException { + if (facetResult == null) { + return; // any result to label? + } + FacetResultNode rootNode = facetResult.getFacetResultNode(); + recursivelyLabel(rootNode, facetRequest.getNumLabel()); + } + + private void recursivelyLabel(FacetResultNode node, int numToLabel) throws IOException { + if (node == null) { + return; + } + node.getLabel(this.taxonomyReader); // attach a label -- category path -- to the node + if (null == node.getSubResults()) { + return; // if node has no children -- done + } + + // otherwise, label the first numToLabel of these children, and recursively -- their children. + int numLabeled = 0; + for (FacetResultNode frn : node.getSubResults()) { + // go over the children of node from first to last, no more than numToLable of them + recursivelyLabel(frn, numToLabel); + if (++numLabeled >= numToLabel) { + return; + } + } + } + + @Override + // verifies that the children of each node are sorted by the order + // specified by the facetRequest. + // the values in these nodes may have changed due to a re-count, for example + // following the accumulation by Sampling. + // so now we test and re-order if necessary. 
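+ // Implementation note: each node's children are re-inserted into a heap ordered by
+ // getSuitableACComparator(); popping yields them from worst to best, and each popped
+ // node is added at index 0 of the rebuilt sub-results list, so the final list is
+ // ordered best-first under the requested sort order.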
+ public FacetResult rearrangeFacetResult(FacetResult facetResult) { + PriorityQueue nodesHeap = + new ResultNodeHeap(this.facetRequest.getNumResults(), this.getSuitableACComparator()); + MutableFacetResultNode topFrn = (MutableFacetResultNode) facetResult.getFacetResultNode(); // safe cast + rearrangeChilrenOfNode(topFrn, nodesHeap); + return facetResult; + } + + private void rearrangeChilrenOfNode(FacetResultNode node, + PriorityQueue nodesHeap) { + nodesHeap.clear(); // just to be safe + for (FacetResultNode frn : node.getSubResults()) { + nodesHeap.add(frn); + } + int size = nodesHeap.size(); + ArrayList subResults = new ArrayList(size); + while (nodesHeap.size()>0) { + subResults.add(0,nodesHeap.pop()); + } + ((MutableFacetResultNode)node).setSubResults(subResults); + for (FacetResultNode frn : node.getSubResults()) { + rearrangeChilrenOfNode(frn, nodesHeap); + } + + } + + @Override + public FacetResult renderFacetResult(IntermediateFacetResult tmpResult) throws IOException { + IntermediateFacetResultWithHash tmp = (IntermediateFacetResultWithHash) tmpResult; + int ordinal = this.taxonomyReader.getOrdinal(this.facetRequest.getCategoryPath()); + if ((tmp == null) || (ordinal == TaxonomyReader.INVALID_ORDINAL)) { + return null; + } + double value = Double.NaN; + if (tmp.isRootNodeIncluded) { + value = tmp.rootNodeValue; + } + MutableFacetResultNode root = generateNode (ordinal, value, tmp.mapToAACOs); + return new FacetResult (tmp.facetRequest, root, tmp.totalNumOfFacetsConsidered); + + } + + private MutableFacetResultNode generateNode (int ordinal, double val, IntToObjectMap mapToAACOs) { + MutableFacetResultNode node = new MutableFacetResultNode(ordinal, val); + AACO aaco = mapToAACOs.get(ordinal); + if (null == aaco) { + return node; + } + List list = new ArrayList(); + for (int i = 0; i < aaco.ordinals.length; i++) { + list.add(generateNode(aaco.ordinals[i], aaco.values[i], mapToAACOs)); + } + node.setSubResults(list); + node.setResidue(aaco.residue); + return node; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/TotalFacetCounts.java b/modules/facet/src/java/org/apache/lucene/facet/search/TotalFacetCounts.java new file mode 100644 index 00000000000..4ab9e816bff --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/TotalFacetCounts.java @@ -0,0 +1,188 @@ +package org.apache.lucene.facet.search; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.store.LockObtainFailedException; + +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.search.aggregator.Aggregator; +import org.apache.lucene.facet.search.aggregator.CountingAggregator; +import org.apache.lucene.facet.search.cache.CategoryListCache; +import org.apache.lucene.facet.search.cache.CategoryListData; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.util.PartitionsUtils; +import org.apache.lucene.facet.util.ScoredDocIdsUtils; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Maintain Total Facet Counts per partition, for given parameters: + *

+ * <ul>
+ *   <li>Index reader of an index</li>
+ *   <li>Taxonomy index reader</li>
+ *   <li>Facet indexing params (and particularly the category list params)</li>
+ * </ul>
+ *
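+ * <p>
+ * A minimal usage sketch (illustrative only; reader, taxo and iParams stand for an open
+ * IndexReader, a TaxonomyReader and a FacetIndexingParams instance, and ordinal is some
+ * category ordinal):
+ * <pre>
+ *   TotalFacetCounts tfc = TotalFacetCountsCache.getSingleton()
+ *       .getTotalCounts(reader, taxo, iParams, null);
+ *   int count = tfc.getTotalCount(ordinal);
+ * </pre>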
+ * The total facet counts are maintained as an array of arrays of integers, + * where a separate array is kept for each partition. + * + * @lucene.experimental + */ +public class TotalFacetCounts { + + /** total facet counts per partition: totalCounts[partition][ordinal%partitionLength] */ + private int[][] totalCounts = null; + + private final TaxonomyReader taxonomy; + private final FacetIndexingParams facetIndexingParams; + + private final static AtomicInteger atomicGen4Test = new AtomicInteger(1); + /** Creation type for test purposes */ + enum CreationType { Computed, Loaded } // for testing + final int gen4test; + final CreationType createType4test; + + /** + * Construct by key - from index Directory or by recomputing. + * @param key the key mapping of this total facet counts (index, taxonomy, category lists...) + */ + private TotalFacetCounts (TaxonomyReader taxonomy, FacetIndexingParams facetIndexingParams, + int[][] counts, CreationType createType4Test) throws IOException, LockObtainFailedException { + this.taxonomy = taxonomy; + this.facetIndexingParams = facetIndexingParams; + this.totalCounts = counts; + this.createType4test = createType4Test; + this.gen4test = atomicGen4Test.incrementAndGet(); + } + + /** + * Fill a partition's array with the TotalCountsArray values. + * @param partitionArray array to fill + * @param partition number of required partition + */ + public void fillTotalCountsForPartition(int[] partitionArray, int partition) { + int partitionSize = partitionArray.length; + int[] countArray = totalCounts[partition]; + if (countArray == null) { + countArray = new int[partitionSize]; + totalCounts[partition] = countArray; + } + int length = Math.min(partitionSize, countArray.length); + System.arraycopy(countArray, 0, partitionArray, 0, length); + } + + /** + * Return the total count of an input category + * @param ordinal ordinal of category whose total count is required + */ + public int getTotalCount(int ordinal) { + int partition = PartitionsUtils.partitionNumber(facetIndexingParams,ordinal); + int offset = ordinal % PartitionsUtils.partitionSize(facetIndexingParams, taxonomy); + return totalCounts[partition][offset]; + } + + static TotalFacetCounts loadFromFile(File inputFile, TaxonomyReader taxonomy, + FacetIndexingParams facetIndexingParams) throws IOException { + DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(inputFile))); + try { + int[][] counts = new int[dis.readInt()][]; + for (int i=0; i getCategoryListMap( + FacetArrays facetArrays, int partition) throws IOException { + + Aggregator aggregator = new CountingAggregator(counts[partition]); + HashMap map = new HashMap(); + for (CategoryListParams clp: facetIndexingParams.getAllCategoryListParams()) { + final CategoryListIterator cli = clIteraor(clCache, clp, indexReader, partition); + map.put(cli, aggregator); + } + return map; + } + }; + fe.setComplementThreshold(FacetsAccumulator.DISABLE_COMPLEMENT); + fe.accumulate(ScoredDocIdsUtils.createAllDocsScoredDocIDs(indexReader)); + return new TotalFacetCounts(taxonomy, facetIndexingParams, counts, CreationType.Computed); + } + + static CategoryListIterator clIteraor(CategoryListCache clCache, CategoryListParams clp, + IndexReader indexReader, int partition) throws IOException { + if (clCache != null) { + CategoryListData cld = clCache.get(clp); + if (cld != null) { + return cld.iterator(partition); + } + } + return clp.createCategoryListIterator(indexReader, partition); + } +} \ No newline at end of file diff --git 
a/modules/facet/src/java/org/apache/lucene/facet/search/TotalFacetCountsCache.java b/modules/facet/src/java/org/apache/lucene/facet/search/TotalFacetCountsCache.java new file mode 100644 index 00000000000..23b9562f05c --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/TotalFacetCountsCache.java @@ -0,0 +1,285 @@ +package org.apache.lucene.facet.search; + +import java.io.File; +import java.io.IOException; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentLinkedQueue; + +import org.apache.lucene.index.IndexReader; + +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.search.cache.CategoryListCache; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Manage an LRU cache for {@link TotalFacetCounts} per index, taxonomy, and + * facet indexing params. + * + * @lucene.experimental + */ +public final class TotalFacetCountsCache { + + /** + * Default size of in memory cache for computed total facet counts. + * Set to 2 for the case when an application reopened a reader and + * the original one is still in use (Otherwise there will be + * switching again and again between the two.) + */ + public static final int DEFAULT_CACHE_SIZE = 2; + + private static final TotalFacetCountsCache singleton = new TotalFacetCountsCache(); + + /** + * Get the single instance of this cache + */ + public static TotalFacetCountsCache getSingleton() { + return singleton; + } + + /** + * In-memory cache of TFCs. + *
+ * <ul>
+ *   <li>Its size is kept within limits through {@link #trimCache()}.</li>
+ *   <li>An LRU eviction policy is applied, by maintaining active keys in {@link #lruKeys}.</li>
+ *   <li>After each addition to the cache, trimCache is called, to remove entries least recently used.</li>
+ * </ul>
+ *
+ * @see #markRecentlyUsed(TFCKey) + */ + private ConcurrentHashMap cache = new ConcurrentHashMap(); + + /** + * A queue of active keys for applying LRU policy on eviction from the {@link #cache}. + * @see #markRecentlyUsed(TFCKey) + */ + private ConcurrentLinkedQueue lruKeys = new ConcurrentLinkedQueue(); + + private int maxCacheSize = DEFAULT_CACHE_SIZE; + + /** private constructor for singleton pattern */ + private TotalFacetCountsCache() { + } + + /** + * Get the total facet counts for a reader/taxonomy pair and facet indexing parameters. + * If not in cache, computed here and added to the cache for later use. + * @param indexReader the documents index + * @param taxonomy the taxonomy index + * @param facetIndexingParams facet indexing parameters + * @param clCache category list cache for faster computation, can be null + * @return the total facet counts. + */ + public TotalFacetCounts getTotalCounts(IndexReader indexReader, TaxonomyReader taxonomy, + FacetIndexingParams facetIndexingParams, CategoryListCache clCache) throws IOException { + // create the key + TFCKey key = new TFCKey(indexReader, taxonomy, facetIndexingParams); + // it is important that this call is not synchronized, so that available TFC + // would not wait for one that needs to be computed. + TotalFacetCounts tfc = cache.get(key); + if (tfc != null) { + markRecentlyUsed(key); + return tfc; + } + return computeAndCache(key, clCache); + } + + /** + * Mark key as it as recently used. + *

+ * Implementation notes: Synchronization considerations and the interaction between lruKeys and cache:
+ * <ol>
+ *   <li>A concurrent {@link LinkedHashMap} would have made this class much simpler,
+ *       but unfortunately Java does not provide one. Instead, we combine two concurrent objects:
+ *     <ul>
+ *       <li>{@link ConcurrentHashMap} for the cached TFCs</li>
+ *       <li>{@link ConcurrentLinkedQueue} for active keys</li>
+ *     </ul>
+ *   </li>
+ *   <li>Both {@link #lruKeys} and {@link #cache} are concurrently safe.</li>
+ *   <li>Checks for a cached item through getTotalCounts() are not synchronized.
+ *       Therefore, the case that a needed TFC is in the cache is very fast:
+ *       it does not wait for the computation of other TFCs.</li>
+ *   <li>computeAndCache() is synchronized, and has a (double) check of the required
+ *       TFC, to avoid computing the same TFC twice.</li>
+ *   <li>A race condition in this method (markRecentlyUsed) might result in two copies
+ *       of the same 'key' in lruKeys, but this is handled by the loop in trimCache(),
+ *       where an attempt to remove the same key twice is a no-op.</li>
+ * </ol>
+ */ + private void markRecentlyUsed(TFCKey key) { + lruKeys.remove(key); + lruKeys.add(key); + } + + private synchronized void trimCache() { + // loop until cache is of desired size. + while (cache.size()>maxCacheSize ) { + TFCKey key = lruKeys.poll(); + if (key==null) { //defensive + // it is defensive since lruKeys presumably covers the cache keys + key = cache.keys().nextElement(); + } + // remove this element. Note that an attempt to remove with the same key again is a no-op, + // which gracefully handles the possible race in markRecentlyUsed(). + cache.remove(key); + } + } + + /** + * compute TFC and cache it, after verifying it was not just added - for this + * matter this method is synchronized, which is not too bad, because there is + * lots of work done in the computations. + */ + private synchronized TotalFacetCounts computeAndCache(TFCKey key, CategoryListCache clCache) throws IOException { + TotalFacetCounts tfc = cache.get(key); + if (tfc == null) { + tfc = TotalFacetCounts.compute(key.indexReader, key.taxonomy, key.facetIndexingParams, clCache); + lruKeys.add(key); + cache.put(key,tfc); + trimCache(); + } + return tfc; + } + + /** + * Load {@link TotalFacetCounts} matching input parameters from the provided outputFile + * and add them into the cache for the provided indexReader, taxonomy, and facetIndexingParams. + * If a {@link TotalFacetCounts} for these parameters already exists in the cache, it will be + * replaced by the loaded one. + * @param inputFile file from which to read the data + * @param indexReader the documents index + * @param taxonomy the taxonomy index + * @param facetIndexingParams the facet indexing parameters + * @throws IOException on error + * @see #store(File, IndexReader, TaxonomyReader, FacetIndexingParams, CategoryListCache) + */ + public synchronized void load(File inputFile, IndexReader indexReader, TaxonomyReader taxonomy, + FacetIndexingParams facetIndexingParams) throws IOException { + if (!inputFile.isFile() || !inputFile.exists() || !inputFile.canRead()) { + throw new IllegalArgumentException("Exepecting an existing readable file: "+inputFile); + } + TFCKey key = new TFCKey(indexReader, taxonomy, facetIndexingParams); + TotalFacetCounts tfc = TotalFacetCounts.loadFromFile(inputFile, taxonomy, facetIndexingParams); + cache.put(key,tfc); + trimCache(); + markRecentlyUsed(key); + } + + /** + * Store the {@link TotalFacetCounts} matching input parameters into the provided outputFile, + * making them available for a later call to {@link #load(File, IndexReader, TaxonomyReader, FacetIndexingParams)}. + * If these {@link TotalFacetCounts} are available in the cache, they are used. But if they are + * not in the cache, this call will first compute them (which will also add them to the cache). + * @param outputFile file to store in. 
+ * @param indexReader the documents index + * @param taxonomy the taxonomy index + * @param facetIndexingParams the facet indexing parameters + * @param clCache category list cache for faster computation, can be null + * @throws IOException on error + * @see #load(File, IndexReader, TaxonomyReader, FacetIndexingParams) + * @see #getTotalCounts(IndexReader, TaxonomyReader, FacetIndexingParams, CategoryListCache) + */ + public void store(File outputFile, IndexReader indexReader, TaxonomyReader taxonomy, + FacetIndexingParams facetIndexingParams, CategoryListCache clCache) throws IOException { + File parentFile = outputFile.getParentFile(); + if ( + ( outputFile.exists() && (!outputFile.isFile() || !outputFile.canWrite())) || + (!outputFile.exists() && (!parentFile.isDirectory() || !parentFile.canWrite())) + ) { + throw new IllegalArgumentException("Exepecting a writable file: "+outputFile); + } + TotalFacetCounts tfc = getTotalCounts(indexReader, taxonomy, facetIndexingParams, clCache); + TotalFacetCounts.storeToFile(outputFile, tfc); + } + + private static class TFCKey { + final IndexReader indexReader; + final TaxonomyReader taxonomy; + private final Iterable clps; + private final int hashCode; + private final int nDels; // needed when a reader used for faceted search was just used for deletion. + final FacetIndexingParams facetIndexingParams; + + public TFCKey(IndexReader indexReader, TaxonomyReader taxonomy, + FacetIndexingParams facetIndexingParams) { + this.indexReader = indexReader; + this.taxonomy = taxonomy; + this.facetIndexingParams = facetIndexingParams; + this.clps = facetIndexingParams.getAllCategoryListParams(); + this.nDels = indexReader.numDeletedDocs(); + hashCode = indexReader.hashCode() ^ taxonomy.hashCode(); + } + + @Override + public int hashCode() { + return hashCode; + } + + @Override + public boolean equals(Object other) { + TFCKey o = (TFCKey) other; + if (indexReader != o.indexReader || taxonomy != o.taxonomy || nDels != o.nDels) { + return false; + } + Iterator it1 = clps.iterator(); + Iterator it2 = o.clps.iterator(); + while (it1.hasNext() && it2.hasNext()) { + if (!it1.next().equals(it2.next())) { + return false; + } + } + return it1.hasNext() == it2.hasNext(); + } + } + + /** + * Clear the cache. + */ + public synchronized void clear() { + cache.clear(); + lruKeys.clear(); + } + + /** + * @return the maximal cache size + */ + public int getCacheSize() { + return maxCacheSize; + } + + /** + * Set the number of TotalFacetCounts arrays that will remain in memory cache. + *

+ * If new size is smaller than current size, the cache is appropriately trimmed. + *

+ * Minimal size is 1, so passing zero or negative size would result in size of 1. + * @param size new size to set + */ + public void setCacheSize(int size) { + if (size < 1) size = 1; + int origSize = maxCacheSize; + maxCacheSize = size; + if (maxCacheSize < origSize) { // need to trim only if the cache was reduced + trimCache(); + } + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java new file mode 100644 index 00000000000..45f15ca4a09 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java @@ -0,0 +1,51 @@ +package org.apache.lucene.facet.search.aggregator; + +import java.io.IOException; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An Aggregator is the analogue of Lucene's Collector (see + * {@link org.apache.lucene.search.Collector}), for processing the categories + * belonging to a certain document. The Aggregator is responsible for doing + * whatever it wishes with the categories it is fed, e.g., counting the number + * of times that each category appears, or performing some computation on their + * association values. + *
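+ * <p>
+ * As an illustrative sketch only (not a class of this module), a counting-style
+ * Aggregator could look roughly like this, assuming a counts array sized to the taxonomy:
+ * <pre>
+ *   class SimpleCountingAggregator implements Aggregator {
+ *     private final int[] counts;
+ *     SimpleCountingAggregator(int[] counts) { this.counts = counts; }
+ *     public void setNextDoc(int docid, float score) {} // nothing to record per-doc for plain counting
+ *     public void aggregate(int ordinal) { counts[ordinal]++; }
+ *   }
+ * </pre>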

+ * Much of the function of an Aggregator implementation is not described by this + * interface. This includes the constructor and getter methods to retrieve the + * results of the aggregation. + * + * @lucene.experimental + */ +public interface Aggregator { + + /** + * Specify the document (and its score in the search) that the following + * {@link #aggregate(int)} calls will pertain to. + */ + void setNextDoc(int docid, float score) throws IOException; + + /** + * Collect (and do whatever an implementation deems appropriate) the + * category given by its ordinal. This category belongs to a document + * given earlier by {@link #setNextDoc(int, float)}. + */ + void aggregate(int ordinal); + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/ComplementCountingAggregator.java b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/ComplementCountingAggregator.java new file mode 100644 index 00000000000..eab1eb38cc9 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/ComplementCountingAggregator.java @@ -0,0 +1,37 @@ +package org.apache.lucene.facet.search.aggregator; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A {@link CountingAggregator} used during complement counting. + * + * @lucene.experimental + */ +public class ComplementCountingAggregator extends CountingAggregator { + + public ComplementCountingAggregator(int[] counterArray) { + super(counterArray); + } + + @Override + public void aggregate(int ordinal) { + assert counterArray[ordinal]!=0:"complement aggregation: count is about to become negative for ordinal "+ordinal; + --counterArray[ordinal]; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java new file mode 100644 index 00000000000..d3569a42556 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java @@ -0,0 +1,59 @@ +package org.apache.lucene.facet.search.aggregator; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A CountingAggregator updates a counter array with the size of the whole + * taxonomy, counting the number of times each category appears in the given set + * of documents. + * + * @lucene.experimental + */ +public class CountingAggregator implements Aggregator { + + protected int[] counterArray; + + public void aggregate(int ordinal) { + ++counterArray[ordinal]; + } + + public void setNextDoc(int docid, float score) { + // There's nothing for us to do here since we only increment the count by 1 + // in this aggregator. + } + + public CountingAggregator(int[] counterArray) { + this.counterArray = counterArray; + } + + @Override + public boolean equals(Object obj) { + if (obj == null || obj.getClass() != this.getClass()) { + return false; + } + CountingAggregator that = (CountingAggregator) obj; + return that.counterArray == this.counterArray; + } + + @Override + public int hashCode() { + int hashCode = counterArray == null ? 0 : counterArray.hashCode(); + + return hashCode; + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java new file mode 100644 index 00000000000..6b1843c22fb --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java @@ -0,0 +1,58 @@ +package org.apache.lucene.facet.search.aggregator; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An {@link Aggregator} which updates the weight of a category according to the + * scores of the documents it was found in. + * + * @lucene.experimental + */ +public class ScoringAggregator implements Aggregator { + + private final float[] scoreArray; + private float score; + private final int hashCode; + + public ScoringAggregator(float[] counterArray) { + this.scoreArray = counterArray; + this.hashCode = scoreArray == null ? 
0 : scoreArray.hashCode(); + } + + public void aggregate(int ordinal) { + scoreArray[ordinal] += score; + } + + @Override + public boolean equals(Object obj) { + if (obj == null || obj.getClass() != this.getClass()) { + return false; + } + ScoringAggregator that = (ScoringAggregator) obj; + return that.scoreArray == this.scoreArray; + } + + @Override + public int hashCode() { + return hashCode; + } + + public void setNextDoc(int docid, float score) { + this.score = score; + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/association/AssociationFloatSumAggregator.java b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/association/AssociationFloatSumAggregator.java new file mode 100644 index 00000000000..ab20ffbf9c9 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/association/AssociationFloatSumAggregator.java @@ -0,0 +1,74 @@ +package org.apache.lucene.facet.search.aggregator.association; + +import java.io.IOException; + +import org.apache.lucene.facet.enhancements.association.AssociationsPayloadIterator; +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.search.aggregator.Aggregator; +import org.apache.lucene.index.IndexReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An {@link Aggregator} which updates the weight of a category by summing the + * weights of the float association it finds for every document. 
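+ * <p>
+ * Illustrative construction only (reader and sums are placeholders; the sums array is
+ * assumed to be sized to the taxonomy):
+ * <pre>
+ *   Aggregator agg = new AssociationFloatSumAggregator(reader, sums);
+ * </pre>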
+ * + * @lucene.experimental + */ +public class AssociationFloatSumAggregator implements Aggregator { + + protected final String field; + protected final float[] sumArray; + protected final AssociationsPayloadIterator associationsPayloadIterator; + + public AssociationFloatSumAggregator(IndexReader reader, float[] sumArray) throws IOException { + this(CategoryListParams.DEFAULT_TERM.field(), reader, sumArray); + } + + public AssociationFloatSumAggregator(String field, IndexReader reader, float[] sumArray) throws IOException { + this.field = field; + associationsPayloadIterator = new AssociationsPayloadIterator(reader, field); + this.sumArray = sumArray; + } + + public void aggregate(int ordinal) { + long association = associationsPayloadIterator.getAssociation(ordinal); + if (association != AssociationsPayloadIterator.NO_ASSOCIATION) { + sumArray[ordinal] += Float.intBitsToFloat((int) association); + } + } + + @Override + public boolean equals(Object obj) { + if (obj == null || obj.getClass() != this.getClass()) { + return false; + } + AssociationFloatSumAggregator that = (AssociationFloatSumAggregator) obj; + return that.field.equals(field) && that.sumArray == sumArray; + } + + @Override + public int hashCode() { + return field.hashCode(); + } + + public void setNextDoc(int docid, float score) throws IOException { + associationsPayloadIterator.setNextDoc(docid); + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/association/AssociationIntSumAggregator.java b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/association/AssociationIntSumAggregator.java new file mode 100644 index 00000000000..7452aabf430 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/association/AssociationIntSumAggregator.java @@ -0,0 +1,74 @@ +package org.apache.lucene.facet.search.aggregator.association; + +import java.io.IOException; + +import org.apache.lucene.facet.enhancements.association.AssociationsPayloadIterator; +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.search.aggregator.Aggregator; +import org.apache.lucene.index.IndexReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An {@link Aggregator} which updates the weight of a category by summing the + * weights of the integer association it finds for every document. 
+ * + * @lucene.experimental + */ +public class AssociationIntSumAggregator implements Aggregator { + + protected final String field; + protected final int[] sumArray; + protected final AssociationsPayloadIterator associationsPayloadIterator; + + public AssociationIntSumAggregator(IndexReader reader, int[] sumArray) throws IOException { + this(CategoryListParams.DEFAULT_TERM.field(), reader, sumArray); + } + + public AssociationIntSumAggregator(String field, IndexReader reader, int[] sumArray) throws IOException { + this.field = field; + associationsPayloadIterator = new AssociationsPayloadIterator(reader, field); + this.sumArray = sumArray; + } + + public void aggregate(int ordinal) { + long association = associationsPayloadIterator.getAssociation(ordinal); + if (association != AssociationsPayloadIterator.NO_ASSOCIATION) { + sumArray[ordinal] += association; + } + } + + @Override + public boolean equals(Object obj) { + if (obj == null || obj.getClass() != this.getClass()) { + return false; + } + AssociationIntSumAggregator that = (AssociationIntSumAggregator) obj; + return that.field.equals(field) && that.sumArray == sumArray; + } + + @Override + public int hashCode() { + return field.hashCode(); + } + + public void setNextDoc(int docid, float score) throws IOException { + associationsPayloadIterator.setNextDoc(docid); + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/package.html b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/package.html new file mode 100644 index 00000000000..baa8f958b98 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/aggregator/package.html @@ -0,0 +1,12 @@ + + + Aggregating Facets during Faceted Search + + +

Aggregating Facets during Faceted Search

+ + A facets aggregator is the parallel of Lucene's Collector. + While Collector collected matching documents, + an aggregator aggregates facets of a matching document. + + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListCache.java b/modules/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListCache.java new file mode 100644 index 00000000000..2acc218307f --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListCache.java @@ -0,0 +1,61 @@ +package org.apache.lucene.facet.search.cache; + +import java.io.IOException; +import java.util.HashMap; + +import org.apache.lucene.index.IndexReader; + +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Cache for {@link CategoryListData}, per {@link CategoryListParams}. + * + * @lucene.experimental + */ +public class CategoryListCache { + + private HashMap + cldMap = new HashMap(); + + /** + * Fetch the cached {@link CategoryListData} for a given {@link CategoryListParams}. + */ + public CategoryListData get(CategoryListParams clp) { + return cldMap.get(clp); + } + + /** + * Register a pre-computed {@link CategoryListData}. + */ + public void register(CategoryListParams clp, CategoryListData clData) { + cldMap.put(clp,clData); + } + + /** + * Load and register {@link CategoryListData}. + */ + public void loadAndRegister(CategoryListParams clp, + IndexReader reader, TaxonomyReader taxo, FacetIndexingParams iparams) throws IOException { + CategoryListData clData = new CategoryListData(reader, taxo, iparams, clp); + register(clp,clData); + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java b/modules/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java new file mode 100644 index 00000000000..a661077ad7e --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java @@ -0,0 +1,135 @@ +package org.apache.lucene.facet.search.cache; + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; + +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.search.CategoryListIterator; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.util.collections.IntArray; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Category list data maintained in RAM. + *

+ * Speeds up facets accumulation when more RAM is available. + *

+ * Note that this will consume more memory: one int (4 bytes) for each category + * of each document. + *

+ * Note: at the moment this class is insensitive to updates of the index, and, + * in particular, does not make use of Lucene's ability to refresh a single + * segment. + *

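 + * An illustrative way to warm and plug in such cached data (a sketch only; it assumes an open
 + * IndexReader and TaxonomyReader, a FacetSearchParams instance, the FacetIndexingParams used at
 + * indexing time, and the relevant CategoryListParams clp):
 + *   CategoryListCache clCache = new CategoryListCache();
 + *   clCache.loadAndRegister(clp, indexReader, taxoReader, indexingParams); // builds CategoryListData
 + *   facetSearchParams.setClCache(clCache); // faceted search will now iterate the cached lists
 + *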
+ * See {@link CategoryListCache#register(CategoryListParams, CategoryListData)} + * and + * {@link CategoryListCache#loadAndRegister(CategoryListParams, IndexReader, TaxonomyReader, FacetIndexingParams)}. + * + * @lucene.experimental + */ +public class CategoryListData { + + // TODO (Facet): experiment with different orders - p-d-c vs. current d-p-c. + private transient volatile int[][][] docPartitionCategories; + + /** + * Empty constructor for extensions with modified computation of the data. + */ + protected CategoryListData() { + } + + /** + * Compute category list data for caching for faster iteration. + */ + CategoryListData(IndexReader reader, TaxonomyReader taxo, + FacetIndexingParams iparams, CategoryListParams clp) throws IOException { + + final int maxDoc = reader.maxDoc(); + int[][][]dpf = new int[maxDoc][][]; + int numPartitions = (int)Math.ceil(taxo.getSize()/(double)iparams.getPartitionSize()); + IntArray docCategories = new IntArray(); + for (int part=0; partpart; + } + + public long nextCategory() throws IOException { + if (nextCategoryIndex >= dpc[currDoc][part].length) { + return 1L+Integer.MAX_VALUE; + } + return dpc[currDoc][part][nextCategoryIndex++]; + } + + public boolean skipTo(int docId) throws IOException { + final boolean res = dpc.length>docId && dpc[docId]!=null && dpc[docId][part]!=null; + if (res) { + currDoc = docId; + nextCategoryIndex = 0; + } + return res; + } + } +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/package.html b/modules/facet/src/java/org/apache/lucene/facet/search/package.html new file mode 100644 index 00000000000..74e85fb4455 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/package.html @@ -0,0 +1,42 @@ + + + Faceted Search API + + +

Faceted Search API

+ + + The faceted search API provides several interfaces: simple, top-level ones that are adequate for most users, + and advanced, more elaborate ones for advanced use cases. + +

+ + We now describe the simpler interfaces. + There are mainly 3 interfaces for faceted search + (a short usage sketch follows the list below): +

  1. {@link org.apache.lucene.facet.search.params.FacetRequest Facets Request}
     defines requirements:
     • which facets are required, e.g. depth
     • what is computed for each facet - e.g. count, score.
  2. {@link org.apache.lucene.facet.search.FacetsAccumulator Facets Extractor}
     Controls how facets are extracted, with variations of:
     • default (partitioned, like all extractors).
     • sampled - inspects only a fraction of the documents.
  3. {@link org.apache.lucene.facet.search.FacetResultsHandler Facet Results Handler}
     Controls how results are further processed and merged (also between partitions):
     • Top K.
     • Tree.
     • Tree with top K at each level.
     • ...
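  A minimal usage sketch of these interfaces (illustrative only: it assumes a CategoryPath
  constructor that takes path components and an already-open TaxonomyReader, and it omits the
  accumulation step that actually matches documents):

    // 1. Facets Request - what to compute, for which facets
    CountFacetRequest authorReq = new CountFacetRequest(new CategoryPath("Author"), 10);
    authorReq.setDepth(2);                                             // also count one level deeper
    authorReq.setResultMode(FacetRequest.ResultMode.PER_NODE_IN_TREE); // tree-shaped result
    // 3. Facet Results Handler - created to match the request settings and the taxonomy
    FacetResultsHandler handler = authorReq.createFacetResultsHandler(taxonomyReader);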
+ + + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java b/modules/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java new file mode 100644 index 00000000000..099ed021562 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java @@ -0,0 +1,75 @@ +package org.apache.lucene.facet.search.params; + +import org.apache.lucene.index.IndexReader; + +import org.apache.lucene.facet.search.FacetArrays; +import org.apache.lucene.facet.search.aggregator.Aggregator; +import org.apache.lucene.facet.search.aggregator.ComplementCountingAggregator; +import org.apache.lucene.facet.search.aggregator.CountingAggregator; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Facet request for counting facets. + * + * @lucene.experimental + */ +public class CountFacetRequest extends FacetRequest { + + /** + * Create a count facet request for a given node in the taxonomy. + * + * @param path category path of the category of interest. + * @param num number of child categories for which count info is requeted. + * reqiested. Default implementation will find top categories, - + * this behavior can be overridden by overriding + * {@link #createFacetResultsHandler(TaxonomyReader)}. + */ + public CountFacetRequest(CategoryPath path, int num) { + super(path, num); + } + + @Override + public Aggregator createAggregator(boolean useComplements, + FacetArrays arrays, IndexReader reader, + TaxonomyReader taxonomy) { + // we rely on that, if needed, result is cleared by arrays! 
+ int[] a = arrays.getIntArray(); + if (useComplements) { + return new ComplementCountingAggregator(a); + } + return new CountingAggregator(a); + } + + @Override + public double getValueOf(FacetArrays arrays, int ordinal) { + return arrays.getIntArray()[ordinal]; + } + + @Override + public boolean supportsComplements() { + return true; + } + + @Override + public boolean requireDocumentScore() { + return false; + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java b/modules/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java new file mode 100644 index 00000000000..7366c5c0a95 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/params/FacetRequest.java @@ -0,0 +1,377 @@ +package org.apache.lucene.facet.search.params; + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; + +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.search.CategoryListIterator; +import org.apache.lucene.facet.search.FacetArrays; +import org.apache.lucene.facet.search.FacetResultsHandler; +import org.apache.lucene.facet.search.TopKFacetResultsHandler; +import org.apache.lucene.facet.search.TopKInEachNodeHandler; +import org.apache.lucene.facet.search.aggregator.Aggregator; +import org.apache.lucene.facet.search.cache.CategoryListData; +import org.apache.lucene.facet.search.cache.CategoryListCache; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Request to accumulate facet information for a specified facet and possibly + * also some of its descendants, upto a specified depth. + *

+ * The facet request additionally defines what information should + * be computed within the facet results, whether and how the results should + * be ordered, etc. + *

+ * An example facet request is to look at all sub-categories of "Author", and + * return the 10 with the highest counts (sorted by decreasing count). + * + * @lucene.experimental + */ +public abstract class FacetRequest implements Cloneable { + + /** + * Default depth for facets accumulation. + * @see #getDepth() + */ + public static final int DEFAULT_DEPTH = 1; + + /** + * Default sort mode. + * @see #getSortBy() + */ + public static final SortBy DEFAULT_SORT_BY = SortBy.VALUE; + + /** + * Default result mode + * @see #getResultMode() + */ + public static final ResultMode DEFAULT_RESULT_MODE = ResultMode.GLOBAL_FLAT; + + private final CategoryPath categoryPath; + private final int numResults; + private int numLabel; + private int depth; + private SortOrder sortOrder; + private SortBy sortBy; + + /** + * Computed at construction, this hashCode is based on two final members + * {@link CategoryPath} and numResults + */ + private final int hashCode; + + private ResultMode resultMode = DEFAULT_RESULT_MODE; + + /** + * Initialize the request with a given path, and a requested number of facets + * results. By default, all returned results would be labeled - to alter this + * default see {@link #setNumLabel(int)}. + *

+ * NOTE: if numResults is given as + * Integer.MAX_VALUE then all the facet results are + * returned, without any limit. + *

+ * NOTE: it is assumed that the given {@link CategoryPath} is not + * modified after construction of this object. Otherwise, some things may not + * function properly, e.g. {@link #hashCode()}. + * + * @throws IllegalArgumentException if numResults is ≤ 0 + */ + public FacetRequest(CategoryPath path, int numResults) { + if (numResults <= 0) { + throw new IllegalArgumentException("num results must be a positive (>0) number: " + numResults); + } + if (path == null) { + throw new IllegalArgumentException("category path cannot be null!"); + } + categoryPath = path; + this.numResults = numResults; + numLabel = numResults; + depth = DEFAULT_DEPTH; + sortBy = DEFAULT_SORT_BY; + sortOrder = SortOrder.DESCENDING; + + hashCode = categoryPath.hashCode() ^ this.numResults; + } + + @Override + public Object clone() throws CloneNotSupportedException { + // Overridden to make it public + return super.clone(); + } + + public void setNumLabel(int numLabel) { + this.numLabel = numLabel; + } + + public void setDepth(int depth) { + this.depth = depth; + } + + public void setSortOrder(SortOrder sortOrder) { + this.sortOrder = sortOrder; + } + + public void setSortBy(SortBy sortBy) { + this.sortBy = sortBy; + } + + /** + * The root category of this facet request. The categories that are returned + * as a result of this request will all be descendants of this root. + *

+ * NOTE: you should not modify the returned {@link CategoryPath}, or + * otherwise some methods may not work properly, e.g. {@link #hashCode()}. + */ + public final CategoryPath getCategoryPath() { + return categoryPath; + } + + /** + * How deeply to look under the given category. If the depth is 0, + * only the category itself is counted. If the depth is 1, its immediate + * children are also counted, and so on. If the depth is Integer.MAX_VALUE, + * all the category's descendants are counted.
+ * TODO (Facet): add AUTO_EXPAND option + */ + public final int getDepth() { + return depth; + } + + /** + * The number of top results to label: if getNumLabel() is smaller than getNumResults(), + * only that many of the top results will have their category paths computed.
+ * If Integer.MAX_VALUE is specified, all + * results are labeled. + *
+ * The purpose of this parameter is to avoid having to run the whole + * faceted search again when the user asks for more values for the facet; + * The application can ask (getNumResults()) for more values than it needs + * to show, but keep getNumLabel() only the number it wants to immediately + * show. The slow-down caused by finding more values is negligible, because + * the slowest part - finding the categories' paths, is avoided. + *

+ * Depending on the {@link #getResultMode() LimitsMode}, + * this limit is applied globally or per results node. + * In the global mode, if this limit is 3, + * only 3 top results would be labeled. + * In the per-node mode, if this limit is 3, + * 3 top children of {@link #getCategoryPath() the target category} would be labeled, + * as well as 3 top children of each of them, and so forth, until the depth defined + * by {@link #getDepth()}. + * @see #getResultMode() + */ + public final int getNumLabel() { + return numLabel; + } + + /** + * The number of sub-categories to return (at most). + * If the sub-categories are returned. + *

+ * If Integer.MAX_VALUE is specified, all + * sub-categories are returned. + *

+ * Depending on the {@link #getResultMode() LimitsMode}, + * this limit is applied globally or per results node. + * In the global mode, if this limit is 3, + * only 3 top results would be computed. + * In the per-node mode, if this limit is 3, + * 3 top children of {@link #getCategoryPath() the target category} would be returned, + * as well as 3 top children of each of them, and so forth, until the depth defined + * by {@link #getDepth()}. + * @see #getResultMode() + */ + public final int getNumResults() { + return numResults; + } + + /** + * Sort options for facet results. + */ + public enum SortBy { + /** sort by category ordinal with the taxonomy */ + ORDINAL, + + /** sort by computed category value */ + VALUE + } + + /** Specify how should results be sorted. */ + public final SortBy getSortBy() { + return sortBy; + } + + /** Requested sort order for the results. */ + public enum SortOrder { ASCENDING, DESCENDING } + + /** Return the requested order of results. */ + public final SortOrder getSortOrder() { + return sortOrder; + } + + @Override + public String toString() { + return categoryPath.toString()+" nRes="+numResults+" nLbl="+numLabel; + } + + /** + * Creates a new {@link FacetResultsHandler} that matches the request logic + * and current settings, such as {@link #getDepth() depth}, + * {@link #getResultMode() limits-mode}, etc, as well as the passed in + * {@link TaxonomyReader}. + * + * @param taxonomyReader taxonomy reader is needed e.g. for knowing the + * taxonomy size. + */ + public FacetResultsHandler createFacetResultsHandler(TaxonomyReader taxonomyReader) { + try { + if (resultMode == ResultMode.PER_NODE_IN_TREE) { + return new TopKInEachNodeHandler(taxonomyReader, (FacetRequest) clone()); + } + return new TopKFacetResultsHandler(taxonomyReader, (FacetRequest) clone()); + } catch (CloneNotSupportedException e) { + // Shouldn't happen since we implement Cloneable. If it does happen, it is + // probably because the class was changed to not implement Cloneable + // anymore. + throw new RuntimeException(e); + } + } + + /** + * Result structure manner of applying request's limits such as + * {@link #getNumLabel()} and + * {@link #getNumResults()}. + */ + public enum ResultMode { + /** Limits are applied per node, and the result has a full tree structure. */ + PER_NODE_IN_TREE, + + /** Limits are applied globally, on total number of results, and the result has a flat structure. */ + GLOBAL_FLAT + } + + /** Return the requested result mode. */ + public final ResultMode getResultMode() { + return resultMode; + } + + /** + * @param resultMode the resultMode to set + * @see #getResultMode() + */ + public void setResultMode(ResultMode resultMode) { + this.resultMode = resultMode; + } + + @Override + public int hashCode() { + return hashCode; + } + + @Override + public boolean equals(Object o) { + if (o instanceof FacetRequest) { + FacetRequest that = (FacetRequest)o; + return that.hashCode == this.hashCode && + that.categoryPath.equals(this.categoryPath) && + that.numResults == this.numResults && + that.depth == this.depth && + that.resultMode == this.resultMode && + that.numLabel == this.numLabel; + } + return false; + } + + /** + * Create an aggregator for this facet request. Aggregator action depends on + * request definition. For a count request, it will usually increment the + * count for that facet. + * + * @param useComplements + * whether the complements optimization is being used for current + * computation. 
+ * @param arrays + * provider for facet arrays in use for current computation. + * @param indexReader + * index reader in effect. + * @param taxonomy + * reader of taxonomy in effect. + * @throws IOException + */ + public abstract Aggregator createAggregator(boolean useComplements, + FacetArrays arrays, IndexReader indexReader, + TaxonomyReader taxonomy) throws IOException; + + /** + * Create the category list iterator for the specified partition. + * If a non null cache is provided which contains the required data, + * use it for the iteration. + */ + public CategoryListIterator createCategoryListIterator(IndexReader reader, + TaxonomyReader taxo, FacetSearchParams sParams, int partition) + throws IOException { + CategoryListCache clCache = sParams.getClCache(); + CategoryListParams clParams = sParams.getFacetIndexingParams().getCategoryListParams(categoryPath); + if (clCache!=null) { + CategoryListData clData = clCache.get(clParams); + if (clData!=null) { + return clData.iterator(partition); + } + } + return clParams.createCategoryListIterator(reader, partition); + } + + /** + * Return the value of a category used for facets computations for this + * request. For a count request this would be the count for that facet, i.e. + * an integer number. but for other requests this can be the result of a more + * complex operation, and the result can be any double precision number. + * Having this method with a general name value which is double + * precision allows to have more compact API and code for handling counts and + * perhaps other requests (such as for associations) very similarly, and by + * the same code and API, avoiding code duplication. + * + * @param arrays + * provider for facet arrays in use for current computation. + * @param idx + * an index into the count arrays now in effect in + * arrays. E.g., for ordinal number n, with + * partition, of size partitionSize, now covering n, + * getValueOf would be invoked with idx + * being n % partitionSize. + */ + public abstract double getValueOf(FacetArrays arrays, int idx); + + /** + * Indicates whether this facet request is eligible for applying the complements optimization. + */ + public boolean supportsComplements() { + return false; // by default: no + } + + /** Indicates whether the results of this request depends on each result document's score */ + public abstract boolean requireDocumentScore(); + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/params/FacetSearchParams.java b/modules/facet/src/java/org/apache/lucene/facet/search/params/FacetSearchParams.java new file mode 100644 index 00000000000..99a6bd4dc28 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/params/FacetSearchParams.java @@ -0,0 +1,130 @@ +package org.apache.lucene.facet.search.params; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.search.cache.CategoryListCache; +import org.apache.lucene.facet.search.results.FacetResult; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Faceted search parameters indicate for which facets should info be gathered. + *

+ * The contained facet requests define for which facets info should be gathered. + *

+ * Contained faceted indexing parameters provide required info on how + * to read and interpret the underlying faceted information in the search index. + * + * @lucene.experimental + */ +public class FacetSearchParams { + + protected final FacetIndexingParams indexingParams; + protected final List facetRequests; + private CategoryListCache clCache = null; + + /** + * Construct with specific faceted indexing parameters. + * It is important to know the indexing parameters so as to e.g. + * read facets data correctly from the index. + * {@link #addFacetRequest(FacetRequest)} must be called at least once + * for this faceted search to find any faceted result. + * @param indexingParams Indexing faceted parameters which were used at indexing time. + * @see #addFacetRequest(FacetRequest) + */ + public FacetSearchParams(FacetIndexingParams indexingParams) { + this.indexingParams = indexingParams; + facetRequests = new ArrayList(); + } + + /** + * Construct with default faceted indexing parameters. + * Usage of this constructor is valid only if also during indexing the + * default faceted indexing parameters were used. + * {@link #addFacetRequest(FacetRequest)} must be called at least once + * for this faceted search to find any faceted result. + * @see #addFacetRequest(FacetRequest) + */ + public FacetSearchParams() { + this(new DefaultFacetIndexingParams()); + } + + /** + * A list of {@link FacetRequest} objects, determining what to count. + * If the returned collection is empty, the faceted search will return no facet results! + */ + public final FacetIndexingParams getFacetIndexingParams() { + return indexingParams; + } + + /** + * Parameters which controlled the indexing of facets, and which are also + * needed during search. + */ + public final List getFacetRequests() { + return facetRequests; + } + + /** + * Add a facet request to apply for this faceted search. + * This method must be called at least once for faceted search + * to find any faceted result.
+ * NOTE: The order of addition implies the order of the {@link FacetResult}s + * @param facetRequest facet request to be added. + */ + public void addFacetRequest(FacetRequest facetRequest) { + if (facetRequest == null) { + throw new IllegalArgumentException("Provided facetRequest must not be null"); + } + facetRequests.add(facetRequest); + } + + @Override + public String toString() { + final char TAB = '\t'; + final char NEWLINE = '\n'; + + StringBuilder sb = new StringBuilder("IndexingParams: "); + sb.append(NEWLINE).append(TAB).append(getFacetIndexingParams()); + + sb.append(NEWLINE).append("FacetRequests:"); + for (FacetRequest facetRequest : getFacetRequests()) { + sb.append(NEWLINE).append(TAB).append(facetRequest); + } + + return sb.toString(); + } + + /** + * @return the cldCache in effect + */ + public CategoryListCache getClCache() { + return clCache; + } + + /** + * Set Cached Category Lists data to be used in Faceted search. + * @param clCache the cldCache to set + */ + public void setClCache(CategoryListCache clCache) { + this.clCache = clCache; + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/params/ScoreFacetRequest.java b/modules/facet/src/java/org/apache/lucene/facet/search/params/ScoreFacetRequest.java new file mode 100644 index 00000000000..dcb723a1a4a --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/params/ScoreFacetRequest.java @@ -0,0 +1,63 @@ +package org.apache.lucene.facet.search.params; + +import org.apache.lucene.index.IndexReader; + +import org.apache.lucene.facet.search.FacetArrays; +import org.apache.lucene.facet.search.aggregator.Aggregator; +import org.apache.lucene.facet.search.aggregator.ScoringAggregator; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Facet request for weighting facets according to document scores. + * + * @lucene.experimental + */ +public class ScoreFacetRequest extends FacetRequest { + + /** Create a score facet request for a given node in the taxonomy. 
*/ + public ScoreFacetRequest(CategoryPath path, int num) { + super(path, num); + } + + @Override + public Aggregator createAggregator(boolean useComplements, + FacetArrays arrays, IndexReader reader, + TaxonomyReader taxonomy) { + assert !useComplements : "complements are not supported by this FacetRequest"; + return new ScoringAggregator(arrays.getFloatArray()); + } + + @Override + public double getValueOf(FacetArrays arrays, int ordinal) { + return arrays.getFloatArray()[ordinal]; + } + + @Override + public boolean supportsComplements() { + return false; + } + + @Override + public boolean requireDocumentScore() { + return true; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/params/association/AssociationFloatSumFacetRequest.java b/modules/facet/src/java/org/apache/lucene/facet/search/params/association/AssociationFloatSumFacetRequest.java new file mode 100644 index 00000000000..ce7a1c93bab --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/params/association/AssociationFloatSumFacetRequest.java @@ -0,0 +1,70 @@ +package org.apache.lucene.facet.search.params.association; + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; + +import org.apache.lucene.facet.search.FacetArrays; +import org.apache.lucene.facet.search.aggregator.Aggregator; +import org.apache.lucene.facet.search.aggregator.association.AssociationFloatSumAggregator; +import org.apache.lucene.facet.search.params.FacetRequest; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Facet request for weighting facets according to their float association by + * summing the association values. + * + * @lucene.experimental + */ +public class AssociationFloatSumFacetRequest extends FacetRequest { + + /** + * Create a float association facet request for a given node in the + * taxonomy. 
+ */ + public AssociationFloatSumFacetRequest(CategoryPath path, int num) { + super(path, num); + } + + @Override + public Aggregator createAggregator(boolean useComplements, + FacetArrays arrays, IndexReader reader, + TaxonomyReader taxonomy) throws IOException { + assert !useComplements : "complements are not supported by this FacetRequest"; + return new AssociationFloatSumAggregator(reader, arrays.getFloatArray()); + } + + @Override + public double getValueOf(FacetArrays arrays, int ordinal) { + return arrays.getFloatArray()[ordinal]; + } + + @Override + public boolean supportsComplements() { + return false; + } + + @Override + public boolean requireDocumentScore() { + return false; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/params/association/AssociationIntSumFacetRequest.java b/modules/facet/src/java/org/apache/lucene/facet/search/params/association/AssociationIntSumFacetRequest.java new file mode 100644 index 00000000000..32ee7881e3d --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/params/association/AssociationIntSumFacetRequest.java @@ -0,0 +1,70 @@ +package org.apache.lucene.facet.search.params.association; + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; + +import org.apache.lucene.facet.search.FacetArrays; +import org.apache.lucene.facet.search.aggregator.Aggregator; +import org.apache.lucene.facet.search.aggregator.association.AssociationIntSumAggregator; +import org.apache.lucene.facet.search.params.FacetRequest; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Facet request for weighting facets according to their integer association by + * summing the association values. + * + * @lucene.experimental + */ +public class AssociationIntSumFacetRequest extends FacetRequest { + + /** + * Create an integer association facet request for a given node in the + * taxonomy. 
+ */ + public AssociationIntSumFacetRequest(CategoryPath path, int num) { + super(path, num); + } + + @Override + public Aggregator createAggregator(boolean useComplements, + FacetArrays arrays, IndexReader reader, + TaxonomyReader taxonomy) throws IOException { + assert !useComplements : "complements are not supported by this FacetRequest"; + return new AssociationIntSumAggregator(reader, arrays.getIntArray()); + } + + @Override + public double getValueOf(FacetArrays arrays, int ordinal) { + return arrays.getIntArray()[ordinal]; + } + + @Override + public boolean supportsComplements() { + return false; + } + + @Override + public boolean requireDocumentScore() { + return false; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/params/package.html b/modules/facet/src/java/org/apache/lucene/facet/search/params/package.html new file mode 100644 index 00000000000..7957d9b9ecf --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/params/package.html @@ -0,0 +1,8 @@ + + + Parameters for Faceted Search + + +

Parameters for Faceted Search

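  A short construction sketch for these parameters (illustrative only; it assumes that the default
  indexing parameters were also used at indexing time, and a CategoryPath constructor that takes
  path components):

    FacetSearchParams searchParams = new FacetSearchParams(); // default FacetIndexingParams
    searchParams.addFacetRequest(new CountFacetRequest(new CategoryPath("Author"), 10));
    searchParams.addFacetRequest(new ScoreFacetRequest(new CategoryPath("Price"), 5));
    // The order of addition determines the order of the returned FacetResults.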
+ + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/results/FacetResult.java b/modules/facet/src/java/org/apache/lucene/facet/search/results/FacetResult.java new file mode 100644 index 00000000000..af0d32cd2a8 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/results/FacetResult.java @@ -0,0 +1,103 @@ +package org.apache.lucene.facet.search.results; + +import org.apache.lucene.facet.search.params.FacetRequest; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Result of faceted search. + * + * @lucene.experimental + */ +public class FacetResult { + + private final FacetRequest facetRequest; + private final FacetResultNode rootNode; + private final int numValidDescendants; + + public FacetResult(FacetRequest facetRequest, FacetResultNode rootNode, int numValidDescendants) { + this.facetRequest = facetRequest; + this.rootNode = rootNode; + this.numValidDescendants = numValidDescendants; + } + + /** + * Facet result node matching the root of the {@link #getFacetRequest() facet request}. + * @see #getFacetRequest() + * @see FacetRequest#getCategoryPath() + */ + public final FacetResultNode getFacetResultNode() { + return this.rootNode; + } + + /** + * Number of descendants of {@link #getFacetResultNode() root facet result node}, + * up till the requested depth, which are valid by the + * {@link FacetRequest#createFacetResultsHandler(org.apache.lucene.facet.taxonomy.TaxonomyReader) + * results handler in effect}. Typically -- have value != 0. + * This number does not include the root node. + * @see #getFacetRequest() + * @see FacetRequest#getDepth() + */ + public final int getNumValidDescendants() { + return this.numValidDescendants; + } + + /** + * Request for which this result was obtained. + */ + public final FacetRequest getFacetRequest() { + return this.facetRequest; + } + + /** + * String representation of this facet result. + * Use with caution: might return a very long string. 
+ * @param prefix prefix for each result line + * @see #toString() + */ + public String toString(String prefix) { + StringBuilder sb = new StringBuilder(); + String nl = ""; + + // request + if (this.facetRequest != null) { + sb.append(nl).append(prefix).append("Request: ").append( + this.facetRequest.toString()); + nl = "\n"; + } + + // total facets + sb.append(nl).append(prefix).append("Num valid Descendants (up to specified depth): ").append( + this.numValidDescendants); + nl = "\n"; + + // result node + if (this.rootNode != null) { + sb.append(nl).append(this.rootNode.toString(prefix + "\t")); + } + + return sb.toString(); + } + + @Override + public String toString() { + return toString(""); + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/results/FacetResultNode.java b/modules/facet/src/java/org/apache/lucene/facet/search/results/FacetResultNode.java new file mode 100644 index 00000000000..eff9f2b4a45 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/results/FacetResultNode.java @@ -0,0 +1,110 @@ +package org.apache.lucene.facet.search.results; + +import java.io.IOException; + +import org.apache.lucene.facet.search.FacetResultsHandler; +import org.apache.lucene.facet.search.params.FacetRequest; +import org.apache.lucene.facet.search.sampling.SampleFixer; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Result of faceted search for a certain taxonomy node. + * + * @lucene.experimental + */ +public interface FacetResultNode { + + /** + * String representation of this facet result node. + * Use with caution: might return a very long string. + * @param prefix prefix for each result line + */ + public String toString(String prefix); + + /** + * Ordinal of the category of this result. + */ + public int getOrdinal(); + + /** + * Category path of the category of this result, or null if not computed, + * because the application did not request to compute it. + * To force computing the label in case not yet computed use + * {@link #getLabel(TaxonomyReader)}. + * @see FacetRequest#getNumLabel() + * @see #getLabel(TaxonomyReader) + */ + public CategoryPath getLabel(); + + /** + * Category path of the category of this result. + * If not already computed, will be computed now. + *

+ * Use with caution: loading a label for a result is costly, performance-wise. + * Therefore force label loading only when really needed. + * @param taxonomyReader taxonomy reader for forcing (lazy) labeling of this result. + * @throws IOException on error + * @see FacetRequest#getNumLabel() + */ + public CategoryPath getLabel(TaxonomyReader taxonomyReader) throws IOException; + + /** + * Value of this result - usually either a count or a value derived from some + * computation over its association. + */ + public double getValue(); + + /** + * Value of screened out sub results. + *

+ * If only part of valid results are returned, e.g. because top K were requested, + * provide info on "what else is there under this result node". + */ + public double getResidue(); + + /** + * Contained sub results. + * These are either child facets, if a tree result was requested, or simply descendants, in case + * tree result was not requested. In the first case, all returned are both descendants of + * this node in the taxonomy and siblings of each other in the taxonomy. + * In the latter case they are only guaranteed to be descendants of + * this node in the taxonomy. + */ + public Iterable getSubResults(); + + /** + * Number of sub results + */ + public int getNumSubResults(); + + /** + * Expert: Set a new value for this result node. + *

+ * Allows to modify the value of this facet node. + * Used for example to tune a sampled value, e.g. by + * {@link SampleFixer#fixResult(org.apache.lucene.facet.search.ScoredDocIDs, FacetResult)} + * @param value the new value to set + * @see #getValue() + * @see FacetResultsHandler#rearrangeFacetResult(FacetResult) + */ + public void setValue(double value); + +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/results/IntermediateFacetResult.java b/modules/facet/src/java/org/apache/lucene/facet/search/results/IntermediateFacetResult.java new file mode 100644 index 00000000000..100256b3e24 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/results/IntermediateFacetResult.java @@ -0,0 +1,41 @@ +package org.apache.lucene.facet.search.results; + +import org.apache.lucene.facet.search.FacetResultsHandler; +import org.apache.lucene.facet.search.params.FacetRequest; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Intermediate {@link FacetResult} of faceted search. + *

+ * This is an empty interface on purpose. + *

+ * It allows {@link FacetResultsHandler} to return intermediate result objects + * that only it knows how to interpret, and so the handler has maximal freedom + * in defining what an intermediate result is, depending on its specific logic. + * + * @lucene.experimental + */ +public interface IntermediateFacetResult { + + /** + * Facet request for which this temporary result was created. + */ + FacetRequest getFacetRequest(); + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/results/MutableFacetResultNode.java b/modules/facet/src/java/org/apache/lucene/facet/search/results/MutableFacetResultNode.java new file mode 100644 index 00000000000..92dcecb3f9e --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/results/MutableFacetResultNode.java @@ -0,0 +1,344 @@ +package org.apache.lucene.facet.search.results; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Mutable implementation for Result of faceted search for a certain taxonomy node. + * + * @lucene.experimental + */ +public class MutableFacetResultNode implements FacetResultNode { + + /** + * Empty sub results to be returned when there are no results. + * We never return null, so that code using this can remain simpler. + */ + private static final ArrayList EMPTY_SUB_RESULTS = new ArrayList(); + + private int ordinal; + private CategoryPath label = null; + private double value; + private double residue; + private List subResults; + + /** + * Create a Facet Result Node. + * + * @param ordinal + * ordinal in the taxonomy of the category of this result. + * @param value + * value this result. + */ + public MutableFacetResultNode(int ordinal, double value) { + this(ordinal, value, 0, null, null); + } + + /** + * Reset a facet Result Node. + *

+ * Used at the population of facet results, not intended for regular use by + * applications. + * + * @param ordinal + * ordinal in the taxonomy of the category of this result. + * @param value + * value of this result. + */ + public void reset(int ordinal, double value) { + this.ordinal = ordinal; + this.value = value; + if (subResults != null) { + subResults.clear(); + } + label = null; + residue = 0; + } + + /** + * Create a Facet Result Node. + * + * @param ordinal + * ordinal in the taxonomy of the category of this result. + * @param value + * value of this result. + * @param residue + * Value of screened out sub results. + * @param label + * label of the category path of this result. + * @param subResults + * - sub results, usually descendants, sometimes child results, of + * this result - depending on the request. + */ + public MutableFacetResultNode(int ordinal, double value, double residue, + CategoryPath label, List subResults) { + this.ordinal = ordinal; + this.value = value; + this.residue = residue; + this.label = label; + this.subResults = subResults; + } + + /** + * Create a mutable facet result node from another result node + * @param other other result node to copy from + * @param takeSubResults set to true to take also sub results of other node + */ + public MutableFacetResultNode(FacetResultNode other, boolean takeSubResults) { + this(other.getOrdinal(), other.getValue(), other.getResidue(), other + .getLabel(), takeSubResults ? resultsToList(other.getSubResults()) + : null); + } + + private static List resultsToList( + Iterable subResults) { + if (subResults == null) { + return null; + } + ArrayList res = new ArrayList(); + for (FacetResultNode r : subResults) { + res.add(r); + } + return res; + } + + @Override + public String toString() { + return toString(""); + } + + /** + * Number of sub results. + */ + private int numSubResults() { + if (subResults == null) { + return 0; + } + return subResults.size(); + } + + /* + * (non-Javadoc) + * + * @see + * org.apache.lucene.facet.search.results2.FacetResultNode#toString(java.lang. + * String) + */ + public String toString(String prefix) { + StringBuilder sb = new StringBuilder(prefix); + + sb.append("Facet Result Node with ").append(numSubResults()).append( + " sub result nodes.\n"); + + // label + sb.append(prefix).append("Name: ").append(getLabel()).append("\n"); + + // value + sb.append(prefix).append("Value: ").append(value).append("\n"); + + // residue + sb.append(prefix).append("Residue: ").append(residue).append("\n"); + + if (subResults != null) { + int i = 0; + for (FacetResultNode subRes : subResults) { + sb.append("\n").append(prefix).append("Subresult #").append(i++) + .append("\n").append(subRes.toString(prefix + "\t")); + } + } + + return sb.toString(); + } + + public final int getOrdinal() { + return ordinal; + } + + public final CategoryPath getLabel() { + return label; + } + + /** + * Set the label of the category of this result. + * @param label the label to set. + * @see #getLabel() + */ + public void setLabel(CategoryPath label) { + this.label = label; + } + + public final double getValue() { + return value; + } + + /** + * Set the value of this result. + * + * @param value + * the value to set + * @see #getValue() + */ + public void setValue(double value) { + this.value = value; + } + + /** + * increase the value for this result. 
+ * @param addedValue the value to add + * @see #getValue() + */ + public void increaseValue(double addedValue) { + this.value += addedValue; + } + + public final double getResidue() { + return residue; + } + + /** + * Set the residue. + * @param residue the residue to set + * @see #getResidue() + */ + public void setResidue(double residue) { + this.residue = residue; + } + + /** + * increase the residue for this result. + * @param addedResidue the residue to add + * @see #getResidue() + */ + public void increaseResidue(double addedResidue) { + this.residue += addedResidue; + } + + public final Iterable getSubResults() { + return subResults != null ? subResults : EMPTY_SUB_RESULTS; + } + + /** + * Trim sub results to a given size. + *

+ * Note: Although the {@link #getResidue()} is not guaranteed to be + * accurate, it is worth fixing it, as possible, by taking under account the + * trimmed sub-nodes. + */ + public void trimSubResults(int size) { + if (subResults == null || subResults.size() == 0) { + return; + } + + ArrayList trimmed = new ArrayList(size); + for (int i = 0; i < subResults.size() && i < size; i++) { + MutableFacetResultNode trimmedNode = toImpl(subResults.get(i)); + trimmedNode.trimSubResults(size); + trimmed.add(trimmedNode); + } + + /* + * If we are trimming, it means Sampling is in effect and the extra + * (over-sampled) results are being trimmed. Although the residue is not + * guaranteed to be accurate for Sampling, we try our best to fix it. + * The node's residue now will take under account the sub-nodes we're + * trimming. + */ + for (int i = size; i < subResults.size(); i++) { + increaseResidue(subResults.get(i).getValue()); + } + + subResults = trimmed; + } + + /** + * Set the sub results. + * @param subResults the sub-results to set + */ + public void setSubResults(List subResults) { + this.subResults = subResults; + } + + /** + * Append a sub result (as last). + * @param subRes sub-result to be appended + */ + public void appendSubResult(FacetResultNode subRes) { + if (subResults == null) { + subResults = new ArrayList(); + } + subResults.add(subRes); + } + + /** + * Insert sub result (as first). + * @param subRes sub-result to be inserted + */ + public void insertSubResult(FacetResultNode subRes) { + if (subResults == null) { + subResults = new ArrayList(); + } + subResults.add(0, subRes); + } + + /* + * (non-Javadoc) + * + * @see + * org.apache.lucene.facet.search.results.FacetResultNode#getLabel(org.apache.lucene + * .facet.taxonomy.TaxonomyReader) + */ + public final CategoryPath getLabel(TaxonomyReader taxonomyReader) + throws IOException { + if (label == null) { + label = taxonomyReader.getPath(ordinal); + } + return label; + } + + /* + * (non-Javadoc) + * + * @see org.apache.lucene.facet.search.results.FacetResultNode#getNumSubResults() + */ + public final int getNumSubResults() { + return subResults == null ? 0 : subResults.size(); + } + + /** + * Internal utility: turn a result node into an implementation class + * with richer API that allows modifying it. + *

+ * In case that input result node is already of an implementation + * class only casting is done, but in any case we pay the price + * of checking "instance of". + * @param frn facet result node to be turned into an implementation class object + */ + public static MutableFacetResultNode toImpl(FacetResultNode frn) { + if (frn instanceof MutableFacetResultNode) { + return (MutableFacetResultNode) frn; + } + return new MutableFacetResultNode(frn, true); + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/results/package.html b/modules/facet/src/java/org/apache/lucene/facet/search/results/package.html new file mode 100644 index 00000000000..2006bf13571 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/results/package.html @@ -0,0 +1,18 @@ + + + Results of Faceted Search + + +

Results of Faceted Search

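  A small traversal sketch over these result objects (illustrative only; it assumes a FacetResult
  obtained from a facets accumulation, and the TaxonomyReader that was used for it):

    void printResults(FacetResultNode node, TaxonomyReader taxoReader, String indent)
        throws IOException {
      // getLabel(TaxonomyReader) lazily resolves the category path if it was not labeled
      System.out.println(indent + node.getLabel(taxoReader) + " = " + node.getValue());
      for (FacetResultNode child : node.getSubResults()) {
        printResults(child, taxoReader, indent + "  ");
      }
    }
    // usage: printResults(facetResult.getFacetResultNode(), taxoReader, "");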
+ + + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/sampling/SampleFixer.java b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/SampleFixer.java new file mode 100644 index 00000000000..dc6a3a2eff1 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/SampleFixer.java @@ -0,0 +1,44 @@ +package org.apache.lucene.facet.search.sampling; + +import java.io.IOException; + +import org.apache.lucene.facet.search.ScoredDocIDs; +import org.apache.lucene.facet.search.results.FacetResult; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Fixer of sample facet accumulation results + * + * @lucene.experimental + */ +public interface SampleFixer { + + /** + * Alter the input result, fixing it to account for the sampling. This + * implementation can compute accurate or estimated counts for the sampled facets. + * For example, a faster correction could just multiply by a compensating factor. + * + * @param origDocIds + * full set of matching documents. + * @param fres + * sample result to be fixed. + * @throws IOException + */ + public void fixResult(ScoredDocIDs origDocIds, FacetResult fres) throws IOException; +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/sampling/Sampler.java b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/Sampler.java new file mode 100644 index 00000000000..debebeafd58 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/Sampler.java @@ -0,0 +1,238 @@ +package org.apache.lucene.facet.search.sampling; + +import java.io.IOException; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.lucene.index.IndexReader; + +import org.apache.lucene.facet.search.FacetArrays; +import org.apache.lucene.facet.search.ScoredDocIDs; +import org.apache.lucene.facet.search.aggregator.Aggregator; +import org.apache.lucene.facet.search.params.FacetRequest; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.search.results.MutableFacetResultNode; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.util.RandomSample; +import org.apache.lucene.facet.util.ScoredDocIdsUtils; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Sampling definition for facets accumulation + *

+ * The Sampler uses TAKMI style counting to provide a 'best guess' top-K result + * set of the facets accumulated. + *

+ * Note: Sampling accumulation (Accumulation over a sampled-set of the results), + * does not guarantee accurate values for + * {@link FacetResult#getNumValidDescendants()} & + * {@link FacetResultNode#getResidue()}. + * + * @lucene.experimental + */ +public class Sampler { + + private static final Logger logger = Logger.getLogger(Sampler.class.getName()); + + private final SamplingParams samplingParams; + + /** + * Construct with {@link SamplingParams} + */ + public Sampler() { + this(new SamplingParams()); + } + + /** + * Construct with certain {@link SamplingParams} + * @param params sampling params in effect + * @throws IllegalArgumentException if the provided SamplingParams are not valid + */ + public Sampler(SamplingParams params) throws IllegalArgumentException { + if (!params.validate()) { + throw new IllegalArgumentException("The provided SamplingParams are not valid!!"); + } + this.samplingParams = params; + } + + /** + * Check if this sampler would complement for the input docIds + */ + public boolean shouldSample(ScoredDocIDs docIds) { + return docIds.size() > samplingParams.getSamplingThreshold(); + } + + /** + * Compute a sample set out of the input set, based on the {@link SamplingParams#getSampleRatio()} + * in effect. Sub classes can override to alter how the sample set is + * computed. + *

+ * If the input set is of size smaller than {@link SamplingParams#getMinSampleSize()}, + * the input set is returned (no sampling takes place). + *

+ * Other than that, the returned set size will not be larger than {@link SamplingParams#getMaxSampleSize()} + * nor smaller than {@link SamplingParams#getMinSampleSize()}. + * @param docids + * full set of matching documents out of which a sample is needed. + */ + public SampleResult getSampleSet(ScoredDocIDs docids) throws IOException { + if (!shouldSample(docids)) { + return new SampleResult(docids, 1d); + } + + int actualSize = docids.size(); + int sampleSetSize = (int) (actualSize * samplingParams.getSampleRatio()); + sampleSetSize = Math.max(sampleSetSize, samplingParams.getMinSampleSize()); + sampleSetSize = Math.min(sampleSetSize, samplingParams.getMaxSampleSize()); + + int[] sampleSet = null; + try { + sampleSet = RandomSample.repeatableSample(docids, actualSize, + sampleSetSize); + } catch (IOException e) { + if (logger.isLoggable(Level.WARNING)) { + logger.log(Level.WARNING, "sampling failed: "+e.getMessage()+" - falling back to no sampling!", e); + } + return new SampleResult(docids, 1d); + } + + ScoredDocIDs sampled = ScoredDocIdsUtils.createScoredDocIDsSubset(docids, + sampleSet); + if (logger.isLoggable(Level.FINEST)) { + logger.finest("******************** " + sampled.size()); + } + return new SampleResult(sampled, sampled.size()/(double)docids.size()); + } + + /** + * Get a fixer of sample facet accumulation results. Default implementation + * returns a TakmiSampleFixer which is adequate only for + * counting. For any other accumulator, provide a different fixer. + */ + public SampleFixer getSampleFixer( + IndexReader indexReader, TaxonomyReader taxonomyReader, + FacetSearchParams searchParams) { + return new TakmiSampleFixer(indexReader, taxonomyReader, searchParams); + } + + /** + * Result of sample computation + */ + public final static class SampleResult { + public final ScoredDocIDs docids; + public final double actualSampleRatio; + protected SampleResult(ScoredDocIDs docids, double actualSampleRatio) { + this.docids = docids; + this.actualSampleRatio = actualSampleRatio; + } + } + + /** + * Return the sampling params in effect + */ + public final SamplingParams getSamplingParams() { + return samplingParams; + } + + /** + * Trim the input facet result.
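Putting trimResult() together with overSampledSearchParams() and getSampleFixer() below, the intended flow appears to be roughly as follows (a hedged sketch; originalParams, indexReader, taxonomyReader, matchingDocs and sampledResult are assumed variables, and the oversample factor is assumed to be above 1):

    FacetSearchParams overSampled = sampler.overSampledSearchParams(originalParams);
    // ... accumulate facets using overSampled instead of originalParams ...
    SampleFixer fixer = sampler.getSampleFixer(indexReader, taxonomyReader, overSampled);
    fixer.fixResult(matchingDocs, sampledResult);             // correct the sampled counts
    FacetResult trimmed = sampler.trimResult(sampledResult);  // trim back to the originally requested top-K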
+ * Note: It is only valid to call this method with result obtained for a + * facet request created through {@link #overSampledSearchParams(FacetSearchParams)}. + * + * @throws IllegalArgumentException + * if called with results not obtained for requests created + * through {@link #overSampledSearchParams(FacetSearchParams)} + */ + public FacetResult trimResult(FacetResult facetResult) throws IllegalArgumentException { + double overSampleFactor = getSamplingParams().getOversampleFactor(); + if (overSampleFactor <= 1) { // no factoring done? + return facetResult; + } + + OverSampledFacetRequest sampledFreq = null; + + try { + sampledFreq = (OverSampledFacetRequest)facetResult.getFacetRequest(); + } catch (ClassCastException e) { + throw new IllegalArgumentException( + "It is only valid to call this method with result obtained for a" + + "facet request created through sampler.overSamlpingSearchParams()", + e); + } + + FacetRequest origFrq = sampledFreq.orig; + + MutableFacetResultNode trimmedRootNode = MutableFacetResultNode.toImpl(facetResult.getFacetResultNode()); + trimmedRootNode.trimSubResults(origFrq.getNumResults()); + + return new FacetResult(origFrq, trimmedRootNode, facetResult.getNumValidDescendants()); + } + + /** + * Over-sampled search params, wrapping each request with an over-sampled one. + */ + public FacetSearchParams overSampledSearchParams(FacetSearchParams original) { + FacetSearchParams res = original; + // So now we can sample -> altering the searchParams to accommodate for the statistical error for the sampling + double overSampleFactor = getSamplingParams().getOversampleFactor(); + if (overSampleFactor > 1) { // any factoring to do? + res = new FacetSearchParams(original.getFacetIndexingParams()); + for (FacetRequest frq: original.getFacetRequests()) { + int overSampledNumResults = (int) Math.ceil(frq.getNumResults() * overSampleFactor); + res.addFacetRequest(new OverSampledFacetRequest(frq, overSampledNumResults)); + } + } + return res; + } + + /** + * Wrapping a facet request for over sampling. + * Implementation detail: even if the original request is a count request, no + * statistics will be computed for it as the wrapping is not a count request. + * This is ok, as the sampling accumulator is later computing the statistics + * over the original requests. 
+ */ + private static class OverSampledFacetRequest extends FacetRequest { + final FacetRequest orig; + public OverSampledFacetRequest(FacetRequest orig, int num) { + super(orig.getCategoryPath(), num); + this.orig = orig; + } + + @Override + public Aggregator createAggregator(boolean useComplements, + FacetArrays arrays, IndexReader indexReader, + TaxonomyReader taxonomy) throws IOException { + return orig.createAggregator(useComplements, arrays, indexReader, + taxonomy); + } + + @Override + public double getValueOf(FacetArrays arrays, int idx) { + return orig.getValueOf(arrays, idx); + } + + @Override + public boolean requireDocumentScore() { + return orig.requireDocumentScore(); + } + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/sampling/SamplingAccumulator.java b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/SamplingAccumulator.java new file mode 100644 index 00000000000..fa48c684479 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/SamplingAccumulator.java @@ -0,0 +1,143 @@ +package org.apache.lucene.facet.search.sampling; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.index.IndexReader; + +import org.apache.lucene.facet.search.FacetResultsHandler; +import org.apache.lucene.facet.search.FacetsAccumulator; +import org.apache.lucene.facet.search.FloatArrayAllocator; +import org.apache.lucene.facet.search.IntArrayAllocator; +import org.apache.lucene.facet.search.SamplingWrapper; +import org.apache.lucene.facet.search.ScoredDocIDs; +import org.apache.lucene.facet.search.StandardFacetsAccumulator; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.search.sampling.Sampler.SampleResult; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Facets accumulation with sampling.
+ *

+ * Note two major differences between this class and {@link SamplingWrapper}:
+ * 1. The latter can wrap any other {@link FacetsAccumulator}, while this class
+ *    directly extends {@link StandardFacetsAccumulator}.
+ * 2. This class can effectively apply sampling on the complement set of
+ *    matching documents, thereby working efficiently with the complement
+ *    optimization - see {@link FacetsAccumulator#getComplementThreshold()}.
+ *
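A minimal construction sketch (searchParams, indexReader, taxonomyReader and matchingDocs are assumed to already exist; this is illustrative, not part of the patch):

    Sampler sampler = new Sampler(new SamplingParams());
    SamplingAccumulator accumulator = new SamplingAccumulator(
        sampler, searchParams, indexReader, taxonomyReader);
    List results = accumulator.accumulate(matchingDocs);  // each element is a FacetResult, already fixed and trimmed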

+ * Note: Sampling accumulation (Accumulation over a sampled-set of the results), + * does not guarantee accurate values for + * {@link FacetResult#getNumValidDescendants()} & + * {@link FacetResultNode#getResidue()}. + * + * @see Sampler + * @lucene.experimental + */ +public class SamplingAccumulator extends StandardFacetsAccumulator { + + private double samplingRatio = -1d; + private final Sampler sampler; + + /** + * Constructor... + */ + public SamplingAccumulator( + Sampler sampler, + FacetSearchParams searchParams, + IndexReader indexReader, TaxonomyReader taxonomyReader, + IntArrayAllocator intArrayAllocator, + FloatArrayAllocator floatArrayAllocator) { + super(searchParams, indexReader, taxonomyReader, intArrayAllocator, + floatArrayAllocator); + this.sampler = sampler; + } + + /** + * Constructor... + */ + public SamplingAccumulator( + Sampler sampler, + FacetSearchParams searchParams, + IndexReader indexReader, TaxonomyReader taxonomyReader) { + super(searchParams, indexReader, taxonomyReader); + this.sampler = sampler; + } + + @Override + public List accumulate(ScoredDocIDs docids) throws IOException { + // first let delegee accumulate without labeling at all (though + // currently it doesn't matter because we have to label all returned anyhow) + boolean origAllowLabeling = isAllowLabeling(); + setAllowLabeling(false); + + // Replacing the original searchParams with the over-sampled + FacetSearchParams original = searchParams; + searchParams = sampler.overSampledSearchParams(original); + + List sampleRes = super.accumulate(docids); + setAllowLabeling(origAllowLabeling); + + List fixedRes = new ArrayList(); + for (FacetResult fres : sampleRes) { + // for sure fres is not null because this is guaranteed by the delegee. + FacetResultsHandler frh = fres.getFacetRequest().createFacetResultsHandler( + taxonomyReader); + // fix the result of current request + sampler.getSampleFixer(indexReader, taxonomyReader, searchParams) + .fixResult(docids, fres); + + fres = frh.rearrangeFacetResult(fres); // let delegee's handler do any + + // Using the sampler to trim the extra (over-sampled) results + fres = sampler.trimResult(fres); + // arranging it needs to + // final labeling if allowed (because labeling is a costly operation) + if (isAllowLabeling()) { + frh.labelResult(fres); + } + fixedRes.add(fres); // add to final results + } + + searchParams = original; // Back to original params + + return fixedRes; + } + + @Override + protected ScoredDocIDs actualDocsToAccumulate(ScoredDocIDs docids) throws IOException { + SampleResult sampleRes = sampler.getSampleSet(docids); + samplingRatio = sampleRes.actualSampleRatio; + return sampleRes.docids; + } + + @Override + protected double getTotalCountsFactor() { + if (samplingRatio<0) { + throw new IllegalStateException("Total counts ratio unavailable because actualDocsToAccumulate() was not invoked"); + } + return samplingRatio; + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/sampling/SamplingParams.java b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/SamplingParams.java new file mode 100644 index 00000000000..014fb7cb1d0 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/SamplingParams.java @@ -0,0 +1,169 @@ +package org.apache.lucene.facet.search.sampling; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Parameters for sampling, dictating whether sampling is to take place and how. + * + * @lucene.experimental + */ +public class SamplingParams { + + /** + * Default factor by which more results are requested over the sample set. + * @see SamplingParams#getOversampleFactor() + */ + public static final double DEFAULT_OVERSAMPLE_FACTOR = 2d; + + /** + * Default ratio between size of sample to original size of document set. + * @see Sampler#getSampleSet(org.apache.lucene.facet.search.ScoredDocIDs) + */ + public static final double DEFAULT_SAMPLE_RATIO = 0.01; + + /** + * Default maximum size of sample. + * @see Sampler#getSampleSet(org.apache.lucene.facet.search.ScoredDocIDs) + */ + public static final int DEFAULT_MAX_SAMPLE_SIZE = 10000; + + /** + * Default minimum size of sample. + * @see Sampler#getSampleSet(org.apache.lucene.facet.search.ScoredDocIDs) + */ + public static final int DEFAULT_MIN_SAMPLE_SIZE = 100; + + /** + * Default sampling threshold, if number of results is less than this number - no sampling will take place + * @see SamplingParams#getSampleRatio() + */ + public static final int DEFAULT_SAMPLING_THRESHOLD = 75000; + + private int maxSampleSize = DEFAULT_MAX_SAMPLE_SIZE; + private int minSampleSize = DEFAULT_MIN_SAMPLE_SIZE; + private double sampleRatio = DEFAULT_SAMPLE_RATIO; + private int samplingThreshold = DEFAULT_SAMPLING_THRESHOLD; + private double oversampleFactor = DEFAULT_OVERSAMPLE_FACTOR; + + /** + * Return the maxSampleSize. + * In no case should the resulting sample size exceed this value. + * @see Sampler#getSampleSet(org.apache.lucene.facet.search.ScoredDocIDs) + */ + public final int getMaxSampleSize() { + return maxSampleSize; + } + + /** + * Return the minSampleSize. + * In no case should the resulting sample size be smaller than this value. + * @see Sampler#getSampleSet(org.apache.lucene.facet.search.ScoredDocIDs) + */ + public final int getMinSampleSize() { + return minSampleSize; + } + + /** + * @return the sampleRatio + * @see Sampler#getSampleSet(org.apache.lucene.facet.search.ScoredDocIDs) + */ + public final double getSampleRatio() { + return sampleRatio; + } + + /** + * Return the samplingThreshold. + * Sampling would be performed only for document sets larger than this. 
+ */ + public final int getSamplingThreshold() { + return samplingThreshold; + } + + /** + * @param maxSampleSize + * the maxSampleSize to set + * @see #getMaxSampleSize() + */ + public void setMaxSampleSize(int maxSampleSize) { + this.maxSampleSize = maxSampleSize; + } + + /** + * @param minSampleSize + * the minSampleSize to set + * @see #getMinSampleSize() + */ + public void setMinSampleSize(int minSampleSize) { + this.minSampleSize = minSampleSize; + } + + /** + * @param sampleRatio + * the sampleRatio to set + * @see #getSampleRatio() + */ + public void setSampleRatio(double sampleRatio) { + this.sampleRatio = sampleRatio; + } + + /** + * Set a sampling-threshold + * @see #getSamplingThreshold() + */ + public void setSampingThreshold(int sampingThreshold) { + this.samplingThreshold = sampingThreshold; + } + + /** + * Check validity of sampling settings, making sure that + *

    + *
+ * minSampleSize <= maxSampleSize <= samplingThreshold
+ * 0 < sampleRatio < 1
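For example, a ratio outside this range is rejected as soon as the params are handed to a Sampler:

    SamplingParams p = new SamplingParams();
    p.setSampleRatio(1.5);  // violates sampleRatio < 1, so validate() returns false
    new Sampler(p);         // and this constructor throws IllegalArgumentException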
+ * + * @return true if valid, false otherwise + */ + public boolean validate() { + return + samplingThreshold >= maxSampleSize && + maxSampleSize >= minSampleSize && + sampleRatio > 0 && + sampleRatio < 1; + } + + /** + * Return the oversampleFactor. When sampling, we would collect that much more + * results, so that later, when selecting top out of these, chances are higher + * to get actual best results. Note that having this value larger than 1 only + * makes sense when using a SampleFixer which finds accurate results, such as + * TakmiSampleFixer. When this value is smaller than 1, it is + * ignored and no oversampling takes place. + */ + public final double getOversampleFactor() { + return oversampleFactor; + } + + /** + * @param oversampleFactor the oversampleFactor to set + * @see #getOversampleFactor() + */ + public void setOversampleFactor(double oversampleFactor) { + this.oversampleFactor = oversampleFactor; + } + +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/sampling/TakmiSampleFixer.java b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/TakmiSampleFixer.java new file mode 100644 index 00000000000..300721a870d --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/TakmiSampleFixer.java @@ -0,0 +1,180 @@ +package org.apache.lucene.facet.search.sampling; + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.Bits; + +import org.apache.lucene.facet.search.DrillDown; +import org.apache.lucene.facet.search.ScoredDocIDs; +import org.apache.lucene.facet.search.ScoredDocIDsIterator; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Fix sampling results by counting the intersection between two lists: a + * TermDocs (list of documents in a certain category) and a DocIdSetIterator + * (list of documents matching the query). + * + * + * @lucene.experimental + */ +// TODO (Facet): implement also an estimated fixing by ratio (taking into +// account "translation" of counts!) 
+class TakmiSampleFixer implements SampleFixer { + + private TaxonomyReader taxonomyReader; + private IndexReader indexReader; + private FacetSearchParams searchParams; + + public TakmiSampleFixer(IndexReader indexReader, + TaxonomyReader taxonomyReader, FacetSearchParams searchParams) { + this.indexReader = indexReader; + this.taxonomyReader = taxonomyReader; + this.searchParams = searchParams; + } + + public void fixResult(ScoredDocIDs origDocIds, FacetResult fres) + throws IOException { + FacetResultNode topRes = fres.getFacetResultNode(); + fixResultNode(topRes, origDocIds); + } + + /** + * Fix result node count, and, recursively, fix all its children + * + * @param facetResNode + * result node to be fixed + * @param docIds + * docids in effect + * @throws IOException + */ + private void fixResultNode(FacetResultNode facetResNode, ScoredDocIDs docIds) + throws IOException { + recount(facetResNode, docIds); + for (FacetResultNode frn : facetResNode.getSubResults()) { + fixResultNode(frn, docIds); + } + } + + /** + * Internal utility: recount for a facet result node + * + * @param fresNode + * result node to be recounted + * @param docIds + * full set of matching documents. + * @throws IOException + */ + private void recount(FacetResultNode fresNode, ScoredDocIDs docIds) + throws IOException { + // TODO (Facet): change from void to return the new, smaller docSet, and use + // that for the children, as this will make their intersection ops faster. + // can do this only when the new set is "sufficiently" smaller. + + /* We need the category's path name in order to do its recounting. + * If it is missing, because the option to label only part of the + * facet results was exercise, we need to calculate them anyway, so + * in essence sampling with recounting spends some extra cycles for + * labeling results for which labels are not required. */ + CategoryPath catPath = fresNode.getLabel(taxonomyReader); // force labeling + + Term drillDownTerm = DrillDown.term(searchParams, catPath); + // TODO (Facet): avoid Multi*? + Bits deletedDocs = MultiFields.getDeletedDocs(indexReader); + int updatedCount = countIntersection(MultiFields.getTermDocsEnum(indexReader, deletedDocs, drillDownTerm.field(), drillDownTerm.bytes()), + docIds.iterator()); + + fresNode.setValue(updatedCount); + } + + /** + * Count the size of the intersection between two lists: a TermDocs (list of + * documents in a certain category) and a DocIdSetIterator (list of documents + * matching a query). + */ + private static int countIntersection(DocsEnum p1, ScoredDocIDsIterator p2) + throws IOException { + // The documentation of of both TermDocs and DocIdSetIterator claim + // that we must do next() before doc(). 
So we do, and if one of the + // lists is empty, obviously return 0; + if (p1 == null || p1.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) { + return 0; + } + if (!p2.next()) { + return 0; + } + + int d1 = p1.docID(); + int d2 = p2.getDocID(); + + int count = 0; + for (;;) { + if (d1 == d2) { + ++count; + if (p1.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) { + break; // end of list 1, nothing more in intersection + } + d1 = p1.docID(); + if (!advance(p2, d1)) { + break; // end of list 2, nothing more in intersection + } + d2 = p2.getDocID(); + } else if (d1 < d2) { + if (p1.advance(d2) == DocIdSetIterator.NO_MORE_DOCS) { + break; // end of list 1, nothing more in intersection + } + d1 = p1.docID(); + } else /* d1>d2 */ { + if (!advance(p2, d1)) { + break; // end of list 2, nothing more in intersection + } + d2 = p2.getDocID(); + } + } + return count; + } + + /** + * utility: advance the iterator until finding (or exceeding) specific + * document + * + * @param iterator + * iterator being advanced + * @param targetDoc + * target of advancing + * @return false if iterator exhausted, true otherwise. + */ + private static boolean advance(ScoredDocIDsIterator iterator, int targetDoc) { + while (iterator.next()) { + if (iterator.getDocID() >= targetDoc) { + return true; // target reached + } + } + return false; // exhausted + } +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/search/sampling/package.html b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/package.html new file mode 100644 index 00000000000..9ea9f97e1b4 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/search/sampling/package.html @@ -0,0 +1,8 @@ + + + Sampling for facets accumulation + + +

Sampling for facets accumulation

+ + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java new file mode 100644 index 00000000000..389cd1f6e30 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/CategoryPath.java @@ -0,0 +1,1053 @@ +package org.apache.lucene.facet.taxonomy; + +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.Serializable; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A CategoryPath holds a sequence of string components, specifying the + * hierarchical name of a category. + *
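A small illustrative sketch (the category names are made up):

    CategoryPath a = new CategoryPath("Author", "Mark Twain");    // two components
    CategoryPath b = new CategoryPath("Author/Mark Twain", '/');  // same path, parsed from a delimited string
    String s = b.toString('/');  // "Author/Mark Twain"
    int n = b.length();          // 2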

+ * CategoryPath is designed to reduce the number of object allocations, in two + * ways: First, it keeps the components internally in two arrays, rather than + * keeping individual strings. Second, it allows reusing the same CategoryPath + * object (which can be clear()ed and new components add()ed again) and of + * add()'s parameter (which can be a reusable object, not just a string). + * + * @lucene.experimental + */ +public class CategoryPath implements Serializable, Cloneable, Comparable { + + // A category path is a sequence of string components. It is kept + // internally as one large character array "chars" with all the string + // concatenated (without separators), and an array of integers "ends" + // pointing to the/ end of each component. Both arrays may be larger + // than actually in use. An additional integer, "ncomponents" specifies + // how many components are actually set. + // We use shorts instead of ints for "ends" to save a bit of space. This + // means that our path lengths are limited to 32767 characters - which + // should not be a problem in any realistic scenario. + protected char[] chars; + protected short[] ends; + protected short ncomponents; + + /** + * Return the number of components in the facet path. Note that this is + * not the number of characters, but the number of components. + */ + public short length() { + return ncomponents; + } + + /** + * Trim the last components from the path. + * + * @param nTrim + * Number of components to trim. If larger than the number of + * components this path has, the entire path will be cleared. + */ + public void trim(int nTrim) { + if (nTrim >= this.ncomponents) { + clear(); + } else if (nTrim > 0) { + this.ncomponents -= nTrim; + } + } + + /** + * Returns the current character capacity of the CategoryPath. The character + * capacity is the size of the internal buffer used to hold the characters + * of all the path's components. When a component is added and the capacity + * is not big enough, the buffer is automatically grown, and capacityChars() + * increases. + */ + public int capacityChars() { + return chars.length; + } + + /** + * Returns the current component capacity of the CategoryPath. The component + * capacity is the maximum number of components that the internal buffer can + * currently hold. When a component is added beyond this capacity, the + * buffer is automatically grown, and capacityComponents() increases. + */ + public int capacityComponents() { + return ends.length; + } + + /** + * Construct a new empty CategoryPath object. CategoryPath objects are meant + * to be reused, by add()ing components, and later clear()ing, and add()ing + * components again. The CategoryPath object is created with a buffer + * pre-allocated for a given number of characters and components, but the + * buffer will grow as necessary (see {@link #capacityChars()} and + * {@link #capacityComponents()}). + */ + public CategoryPath(int capacityChars, int capacityComponents) { + ncomponents = 0; + chars = new char[capacityChars]; + ends = new short[capacityComponents]; + } + + /** + * Create an empty CategoryPath object. Equivalent to the constructor + * {@link #CategoryPath(int, int)} with the two initial-capacity arguments + * set to zero. + */ + public CategoryPath() { + this(0, 0); + } + + /** + * Add the given component to the end of the path. + *
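For instance, the reuse pattern described above might look like this (illustrative only):

    CategoryPath reusable = new CategoryPath(64, 4);  // room for 64 chars / 4 components up front
    reusable.add("Year");
    reusable.add("2011");
    // ... use the path ...
    reusable.clear();            // capacity is kept, so the object can be refilled cheaply
    reusable.add("Author");
    reusable.add("Mark Twain");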

+ * Note that when a String object is passed to this method, a reference to + * it is not saved (rather, its content is copied), which will lead to that + * String object being gc'ed. To reduce the number of garbage objects, you + * can pass a mutable CharBuffer instead of an immutable String to this + * method. + */ + public void add(CharSequence component) { + // Set the new end, increasing the "ends" array sizes if necessary: + if (ncomponents >= ends.length) { + short[] newends = new short[(ends.length + 1) * 2]; + System.arraycopy(ends, 0, newends, 0, ends.length); + ends = newends; + } + short prevend = (ncomponents == 0) ? 0 : ends[ncomponents - 1]; + int cmplen = component.length(); + ends[ncomponents] = (short) (prevend + cmplen); + + // Copy the new component's characters, increasing the "chars" array + // sizes if necessary: + if (ends[ncomponents] > chars.length) { + char[] newchars = new char[ends[ncomponents] * 2]; + System.arraycopy(chars, 0, newchars, 0, chars.length); + chars = newchars; + } + for (int i = 0; i < cmplen; i++) { + chars[prevend++] = component.charAt(i); + } + + ncomponents++; + } + + /** + * Empty the CategoryPath object, so that it has zero components. The + * capacity of the object (see {@link #capacityChars()} and + * {@link #capacityComponents()}) is not reduced, so that the object can be + * reused without frequent reallocations. + */ + public void clear() { + ncomponents = 0; + } + + /** + * Build a string representation of the path, with its components separated + * by the given delimiter character. The resulting string is appended to a + * given Appendable, e.g., a StringBuilder, CharBuffer or Writer. + *
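For example (cp is an assumed CategoryPath; StringBuilder's append() never actually throws the declared IOException):

    StringBuilder sb = new StringBuilder();
    cp.appendTo(sb, '/');  // builds e.g. "Author/Mark Twain" without allocating an intermediate String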

+ * Note that the two cases of zero components and one component with zero + * length produce indistinguishable results (both of them append nothing). + * This is normally not a problem, because components should not normally + * have zero lengths. + *

+ * An IOException can be thrown if the given Appendable's append() throws + * this exception. + */ + public void appendTo(Appendable out, char delimiter) throws IOException { + if (ncomponents == 0) { + return; // just append nothing... + } + for (int i = 0; i < ends[0]; i++) { + out.append(chars[i]); + } + for (int j = 1; j < ncomponents; j++) { + out.append(delimiter); + for (int i = ends[j - 1]; i < ends[j]; i++) { + out.append(chars[i]); + } + } + } + + /** + * like {@link #appendTo(Appendable, char)}, but takes only a prefix of the + * path, rather than the whole path. + *

+ * If the given prefix length is negative or bigger than the path's actual + * length, the whole path is taken. + */ + public void appendTo(Appendable out, char delimiter, int prefixLen) + throws IOException { + if (prefixLen < 0 || prefixLen > ncomponents) { + prefixLen = ncomponents; + } + if (prefixLen == 0) { + return; // just append nothing... + } + for (int i = 0; i < ends[0]; i++) { + out.append(chars[i]); + } + for (int j = 1; j < prefixLen; j++) { + out.append(delimiter); + for (int i = ends[j - 1]; i < ends[j]; i++) { + out.append(chars[i]); + } + } + } + + /** + * like {@link #appendTo(Appendable, char)}, but takes only a part of the + * path, rather than the whole path. + *

+ * start specifies the first component in the subpath, and + * end is one past the last component. If start is + * negative, 0 is assumed, and if end is negative or past the + * end of the path, the path is taken until the end. Otherwise, if + * end<=start, nothing is appended. Nothing is appended also in + * the case that the path is empty. + */ + public void appendTo(Appendable out, char delimiter, int start, int end) + throws IOException { + if (start < 0) { + start = 0; + } + if (end < 0 || end > ncomponents) { + end = ncomponents; + } + if (end <= start) { + return; // just append nothing... + } + for (int i = (start == 0 ? 0 : ends[start - 1]); i < ends[start]; i++) { + out.append(chars[i]); + } + for (int j = start + 1; j < end; j++) { + out.append(delimiter); + for (int i = ends[j - 1]; i < ends[j]; i++) { + out.append(chars[i]); + } + } + } + + /** + * Build a string representation of the path, with its components separated + * by the given delimiter character. The resulting string is returned as a + * new String object. To avoid this temporary object creation, consider + * using {@link #appendTo(Appendable, char)} instead. + *

+ * Note that the two cases of zero components and one component with zero + * length produce indistinguishable results (both of them return an empty + * string). This is normally not a problem, because components should not + * normally have zero lengths. + */ + public String toString(char delimiter) { + if (ncomponents == 0) { + return ""; + } + StringBuilder sb = new StringBuilder(ends[ncomponents - 1] + + (ncomponents - 1)); + try { + this.appendTo(sb, delimiter); + } catch (IOException e) { + // can't happen, because StringBuilder.append() never actually + // throws an exception! + } + return sb.toString(); + } + + /** + * This method, an implementation of the {@link Object#toString()} + * interface, is to allow simple printing of a CategoryPath, for debugging + * purposes. When possible, it recommended to avoid using it it, and rather, + * if you want to output the path with its components separated by a + * delimiter character, specify the delimiter explicitly, with + * {@link #toString(char)}. + */ + @Override + public String toString() { + return toString('/'); + } + + /** + * like {@link #toString(char)}, but takes only a prefix with a given number + * of components, rather than the whole path. + *

+ * If the given length is negative or bigger than the path's actual length, + * the whole path is taken. + */ + public String toString(char delimiter, int prefixLen) { + if (prefixLen < 0 || prefixLen > ncomponents) { + prefixLen = ncomponents; + } + if (prefixLen == 0) { + return ""; + } + StringBuilder sb = new StringBuilder(ends[prefixLen - 1] + + (prefixLen - 1)); + try { + this.appendTo(sb, delimiter, prefixLen); + } catch (IOException e) { + // can't happen, because sb.append() never actually throws an + // exception + } + return sb.toString(); + } + + /** + * like {@link #toString(char)}, but takes only a part of the path, rather + * than the whole path. + *

+ * start specifies the first component in the subpath, and + * end is one past the last component. If start is + * negative, 0 is assumed, and if end is negative or past the + * end of the path, the path is taken until the end. Otherwise, if + * end<=start, an empty string is returned. An emptry string is + * returned also in the case that the path is empty. + */ + public String toString(char delimiter, int start, int end) { + if (start < 0) { + start = 0; + } + if (end < 0 || end > ncomponents) { + end = ncomponents; + } + if (end <= start) { + return ""; + } + int startchar = (start == 0) ? 0 : ends[start - 1]; + StringBuilder sb = new StringBuilder(ends[end - 1] - startchar + + (end - start) - 1); + try { + this.appendTo(sb, delimiter, start, end); + } catch (IOException e) { + // can't happen, because sb.append() never actually throws an + // exception + } + return sb.toString(); + } + + /** + * Return the i'th component of the path, in a new String object. If there + * is no i'th component, a null is returned. + */ + public String getComponent(int i) { + if (i < 0 || i >= ncomponents) { + return null; + } + if (i == 0) { + return new String(chars, 0, ends[0]); + } + return new String(chars, ends[i - 1], ends[i] - ends[i - 1]); + } + + /** + * Return the last component of the path, in a new String object. If the + * path is empty, a null is returned. + */ + public String lastComponent() { + if (ncomponents == 0) { + return null; + } + if (ncomponents == 1) { + return new String(chars, 0, ends[0]); + } + return new String(chars, ends[ncomponents - 2], ends[ncomponents - 1] + - ends[ncomponents - 2]); + } + + /** + * Copies the specified number of components from this category path to the + * specified character array, with the components separated by a given + * delimiter character. The array must be large enough to hold the + * components and separators - the amount of needed space can be calculated + * with {@link #charsNeededForFullPath()}. + *
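For example (cp is an assumed CategoryPath):

    char[] buf = new char[cp.charsNeededForFullPath()];
    int written = cp.copyToCharArray(buf, 0, -1, '/');  // a negative count means "copy all components"
    String full = new String(buf, 0, written);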

+ * This method returns the number of characters written to the array. + * + * @param outputBuffer + * The destination character array. + * @param outputBufferStart + * The first location to write in the output array. + * @param numberOfComponentsToCopy + * The number of path components to write to the destination + * buffer. + * @param separatorChar + * The separator inserted between every pair of path components + * in the output buffer. + * @see #charsNeededForFullPath() + */ + public int copyToCharArray(char[] outputBuffer, int outputBufferStart, + int numberOfComponentsToCopy, char separatorChar) { + if (numberOfComponentsToCopy == 0) { + return 0; + } + if (numberOfComponentsToCopy < 0 + || numberOfComponentsToCopy > ncomponents) { + numberOfComponentsToCopy = ncomponents; + } + int outputBufferInitialStart = outputBufferStart; // for calculating + // chars copied. + int sourceStart = 0; + int sourceLength = ends[0]; + for (int component = 0; component < numberOfComponentsToCopy; component++) { + if (component > 0) { + sourceStart = ends[component - 1]; + sourceLength = ends[component] - sourceStart; + outputBuffer[outputBufferStart++] = separatorChar; + } + System.arraycopy(chars, sourceStart, outputBuffer, + outputBufferStart, sourceLength); + outputBufferStart += sourceLength; + } + return outputBufferStart - outputBufferInitialStart; + } + + /** + * Returns the number of characters required to represent this entire + * category path, if written using + * {@link #copyToCharArray(char[], int, int, char)} or + * {@link #appendTo(Appendable, char)}. This includes the number of + * characters in all the components, plus the number of separators between + * them (each one character in the aforementioned methods). + */ + public int charsNeededForFullPath() { + if (ncomponents == 0) { + return 0; + } + return ends[ncomponents - 1] + ncomponents - 1; + } + + /** + * Construct a new CategoryPath object, given a single string with + * components separated by a given delimiter character. + *

+ * The initial capacity of the constructed object will be exactly what is + * needed to hold the given path. This fact is convenient when creating a + * temporary object that will not be reused later. + */ + public CategoryPath(String pathString, char delimiter) { + if (pathString.length() == 0) { + ncomponents = 0; + chars = new char[0]; + ends = new short[0]; + return; + } + + // This constructor is often used for creating a temporary object + // (one which will not be reused to hold multiple paths), so we want + // to do our best to allocate exactly the needed size - not less (to + // avoid reallocation) and not more (so as not to waste space). + // To do this, we unfortunately need to make an additional pass on the + // given string: + int nparts = 1; + for (int i = pathString.indexOf(delimiter); i >= 0; i = pathString + .indexOf(delimiter, i + 1)) { + nparts++; + } + + ends = new short[nparts]; + chars = new char[pathString.length() - nparts + 1]; + ncomponents = 0; + + add(pathString, delimiter); + } + + /** + * Add the given components to the end of the path. The components are given + * in a single string, separated by a given delimiter character. If the + * given string is empty, it is assumed to refer to the root (empty) + * category, and nothing is added to the path (rather than adding a single + * empty component). + *

+ * Note that when a String object is passed to this method, a reference to + * it is not saved (rather, its content is copied), which will lead to that + * String object being gc'ed. To reduce the number of garbage objects, you + * can pass a mutable CharBuffer instead of an immutable String to this + * method. + */ + public void add(CharSequence pathString, char delimiter) { + int len = pathString.length(); + if (len == 0) { + return; // assume root category meant, so add nothing. + } + short pos = (ncomponents == 0) ? 0 : ends[ncomponents - 1]; + for (int i = 0; i < len; i++) { + char c = pathString.charAt(i); + if (c == delimiter) { + if (ncomponents >= ends.length) { + short[] newends = new short[(ends.length + 1) * 2]; + System.arraycopy(ends, 0, newends, 0, ends.length); + ends = newends; + } + ends[ncomponents++] = pos; + } else { + if (pos >= chars.length) { + char[] newchars = new char[(chars.length + 1) * 2]; + System.arraycopy(chars, 0, newchars, 0, chars.length); + chars = newchars; + } + chars[pos++] = c; + } + } + + // Don't forget to count the last component! + if (ncomponents >= ends.length) { + short[] newends = new short[(ends.length + 1) * 2]; + System.arraycopy(ends, 0, newends, 0, ends.length); + ends = newends; + } + ends[ncomponents++] = pos; + } + + /** + * Construct a new CategoryPath object, copying an existing path given as an + * array of strings. + *

+ * The new object occupies exactly the space it needs, without any spare + * capacity. This is the expected behavior in the typical use case, where + * this constructor is used to create a temporary object which is never + * reused. + */ + public CategoryPath(CharSequence... components) { + this.ncomponents = (short) components.length; + this.ends = new short[ncomponents]; + if (ncomponents > 0) { + this.ends[0] = (short) components[0].length(); + for (int i = 1; i < ncomponents; i++) { + this.ends[i] = (short) (this.ends[i - 1] + components[i] + .length()); + } + this.chars = new char[this.ends[ncomponents - 1]]; + CharSequence cs = components[0]; + if (cs instanceof String) { + ((String) cs).getChars(0, cs.length(), this.chars, 0); + } else { + for (int j = 0, k = cs.length(); j < k; j++) { + this.chars[j] = cs.charAt(j); + } + } + for (int i = 1; i < ncomponents; i++) { + cs = components[i]; + int offset = this.ends[i - 1]; + if (cs instanceof String) { + ((String) cs).getChars(0, cs.length(), this.chars, offset); + } else { + for (int j = 0, k = cs.length(); j < k; j++) { + this.chars[j + offset] = cs.charAt(j); + } + } + } + } else { + this.chars = new char[0]; + } + } + + /** + * Construct a new CategoryPath object, copying the path given in an + * existing CategoryPath object. + *

+ * This copy-constructor is handy when you need to save a reference to a + * CategoryPath (e.g., when it serves as a key to a hash-table), but cannot + * save a reference to the original object because its contents can be + * changed later by the user. Copying the contents into a new object is a + * solution. + *
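For example, when caching ordinals keyed by path (reusable and ordinal are assumed variables; java.util imports omitted):

    Map<CategoryPath, Integer> ordinalCache = new HashMap<CategoryPath, Integer>();
    ordinalCache.put(new CategoryPath(reusable), ordinal);  // defensive copy: 'reusable' may later be clear()ed and refilled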

+ * This constructor does not copy the capacity (spare buffer size) + * of the existing CategoryPath. Rather, the new object occupies exactly the + * space it needs, without any spare. This is the expected behavior in the + * typical use case outlined in the previous paragraph. + */ + public CategoryPath(CategoryPath existing) { + ncomponents = existing.ncomponents; + if (ncomponents == 0) { + chars = new char[0]; + ends = new short[0]; + return; + } + + chars = new char[existing.ends[ncomponents - 1]]; + System.arraycopy(existing.chars, 0, chars, 0, chars.length); + ends = new short[ncomponents]; + System.arraycopy(existing.ends, 0, ends, 0, ends.length); + } + + /** + * Construct a new CategoryPath object, copying a prefix with the given + * number of components of the path given in an existing CategoryPath + * object. + *

+ * If the given length is negative or bigger than the given path's actual + * length, the full path is taken. + *

+ * This constructor is often convenient for creating a temporary object with + * a path's prefix, but this practice is wasteful, and therefore + * inadvisable. Rather, the application should be written in a way that + * allows considering only a prefix of a given path, without needing to make + * a copy of that path. + */ + public CategoryPath(CategoryPath existing, int prefixLen) { + if (prefixLen < 0 || prefixLen > existing.ncomponents) { + ncomponents = existing.ncomponents; + } else { + ncomponents = (short) prefixLen; + } + if (ncomponents == 0) { + chars = new char[0]; + ends = new short[0]; + return; + } + + chars = new char[existing.ends[ncomponents - 1]]; + System.arraycopy(existing.chars, 0, chars, 0, chars.length); + ends = new short[ncomponents]; + System.arraycopy(existing.ends, 0, ends, 0, ends.length); + } + + @Override + public Object clone() { + return new CategoryPath(this); + } + + /** + * Compare the given CategoryPath to another one. For two category paths to + * be considered equal, only the path they contain needs to be identical The + * unused capacity of the objects is not considered in the comparison. + */ + @Override + public boolean equals(Object obj) { + if (obj instanceof CategoryPath) { + CategoryPath other = (CategoryPath) obj; + if (other.ncomponents != this.ncomponents) { + return false; + } + // Unfortunately, Arrays.equal() can only compare entire arrays, + // and in our case we potentially have unused parts of the arrays + // that must not be compared... I wish that some future version + // of Java has a offset and length parameter to Arrays.equal + // (sort of like System.arraycopy()). + if (ncomponents == 0) { + return true; // nothing to compare... + } + for (int i = 0; i < ncomponents; i++) { + if (this.ends[i] != other.ends[i]) { + return false; + } + } + int len = ends[ncomponents - 1]; + for (int i = 0; i < len; i++) { + if (this.chars[i] != other.chars[i]) { + return false; + } + } + return true; + } + return false; + } + + /** + * Test whether this object is a descendant of another CategoryPath. This is + * true if the other CategoryPath is the prefix of this. + */ + public boolean isDescendantOf(CategoryPath other) { + if (this.ncomponents < other.ncomponents) { + return false; + } + int j = 0; + for (int i = 0; i < other.ncomponents; i++) { + if (ends[i] != other.ends[i]) { + return false; + } + for (; j < ends[i]; j++) { + if (this.chars[j] != other.chars[j]) { + return false; + } + } + } + return true; + } + + /** + * Calculate a hashCode for this path, used when a CategoryPath serves as a + * hash-table key. If two objects are equal(), their hashCodes need to be + * equal, so like in equal(), hashCode does not consider unused portions of + * the internal buffers in its calculation. + *

+ * The hash function used is modeled after Java's String.hashCode() - a + * simple multiplicative hash function with the multiplier 31. The same hash + * function also appeared in Kernighan & Ritchie's second edition of + * "The C Programming Language" (1988). + */ + @Override + public int hashCode() { + if (ncomponents == 0) { + return 0; + } + int hash = ncomponents; + // Unfortunately, Arrays.hashCode() can only calculate a hash code + // for an entire arrays, and in our case we potentially have unused + // parts of the arrays that must be ignored, so must use our own loop + // over the characters. I wish that some future version of Java will + // add offset and length parameters to Arrays.hashCode (sort of like + // System.arraycopy()'s parameters). + for (int i = 0; i < ncomponents; i++) { + hash = hash * 31 + ends[i]; + } + int len = ends[ncomponents - 1]; + for (int i = 0; i < len; i++) { + hash = hash * 31 + chars[i]; + } + return hash; + } + + /** + * Like {@link #hashCode()}, but find the hash function of a prefix with the + * given number of components, rather than of the entire path. + */ + public int hashCode(int prefixLen) { + if (prefixLen < 0 || prefixLen > ncomponents) { + prefixLen = ncomponents; + } + if (prefixLen == 0) { + return 0; + } + int hash = prefixLen; + for (int i = 0; i < prefixLen; i++) { + hash = hash * 31 + ends[i]; + } + int len = ends[prefixLen - 1]; + for (int i = 0; i < len; i++) { + hash = hash * 31 + chars[i]; + } + return hash; + } + + /** + * Calculate a 64-bit hash function for this path. Unlike + * {@link #hashCode()}, this method is not part of the Java standard, and is + * only used if explicitly called by the user. + *

+ * If two objects are equal(), their hash codes need to be equal, so like in + * {@link #equals(Object)}, longHashCode does not consider unused portions + * of the internal buffers in its calculation. + *

+ * The hash function used is a simple multiplicative hash function, with the + * multiplier 65599. While Java's standard multiplier 31 (used in + * {@link #hashCode()}) gives a good distribution for ASCII strings, it + * turns out that for foreign-language strings (with 16-bit characters) it + * gives too many collisions, and a bigger multiplier produces fewer + * collisions in this case. + */ + public long longHashCode() { + if (ncomponents == 0) { + return 0; + } + long hash = ncomponents; + for (int i = 0; i < ncomponents; i++) { + hash = hash * 65599 + ends[i]; + } + int len = ends[ncomponents - 1]; + for (int i = 0; i < len; i++) { + hash = hash * 65599 + chars[i]; + } + return hash; + } + + /** + * Like {@link #longHashCode()}, but find the hash function of a prefix with + * the given number of components, rather than of the entire path. + */ + public long longHashCode(int prefixLen) { + if (prefixLen < 0 || prefixLen > ncomponents) { + prefixLen = ncomponents; + } + if (prefixLen == 0) { + return 0; + } + long hash = prefixLen; + for (int i = 0; i < prefixLen; i++) { + hash = hash * 65599 + ends[i]; + } + int len = ends[prefixLen - 1]; + for (int i = 0; i < len; i++) { + hash = hash * 65599 + chars[i]; + } + return hash; + } + + /** + * Write out a serialized (as a character sequence) representation of the + * path to a given Appendable (e.g., a StringBuilder, CharBuffer, Writer, or + * something similar. + *

+ * This method may throw a IOException if the given Appendable threw this + * exception while appending. + */ + public void serializeAppendTo(Appendable out) throws IOException { + // Note that we use the fact that ncomponents and ends[] are shorts, + // so we can write them as chars: + out.append((char) ncomponents); + if (ncomponents == 0) { + return; + } + for (int i = 0; i < ncomponents; i++) { + out.append((char) ends[i]); + } + int usedchars = ends[ncomponents - 1]; + for (int i = 0; i < usedchars; i++) { + out.append(chars[i]); + } + } + + /** + * Just like {@link #serializeAppendTo(Appendable)}, but writes only a + * prefix of the CategoryPath. + */ + public void serializeAppendTo(int prefixLen, Appendable out) + throws IOException { + if (prefixLen < 0 || prefixLen > ncomponents) { + prefixLen = ncomponents; + } + // Note that we use the fact that ncomponents and ends[] are shorts, + // so we can write them as chars: + out.append((char) prefixLen); + if (prefixLen == 0) { + return; + } + for (int i = 0; i < prefixLen; i++) { + out.append((char) ends[i]); + } + int usedchars = ends[prefixLen - 1]; + for (int i = 0; i < usedchars; i++) { + out.append(chars[i]); + } + } + + /** + * Set a CategoryPath from a character-sequence representation written by + * {@link #serializeAppendTo(Appendable)}. + *
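A round-trip sketch (cp is an assumed CategoryPath; the IOException declared by serializeAppendTo() is omitted here):

    StringBuilder buf = new StringBuilder();
    cp.serializeAppendTo(buf);                       // compact, char-based form (not human readable)
    CategoryPath restored = new CategoryPath();
    int next = restored.setFromSerialized(buf, 0);   // returns the offset just past the serialized path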

+ * Reading starts at the given offset into the given character sequence, and + * the offset right after the end of this path is returned. + */ + public int setFromSerialized(CharSequence buffer, int offset) { + ncomponents = (short) buffer.charAt(offset++); + if (ncomponents == 0) { + return offset; + } + + if (ncomponents >= ends.length) { + ends = new short[Math.max(ends.length * 2, ncomponents)]; + } + for (int i = 0; i < ncomponents; i++) { + ends[i] = (short) buffer.charAt(offset++); + } + + int usedchars = ends[ncomponents - 1]; + if (usedchars > chars.length) { + chars = new char[Math.max(chars.length * 2, usedchars)]; + } + for (int i = 0; i < usedchars; i++) { + chars[i] = buffer.charAt(offset++); + } + + return offset; + } + + /** + * Check whether the current path is identical to the one serialized (with + * {@link #serializeAppendTo(Appendable)}) in the given buffer, at the given + * offset. + */ + public boolean equalsToSerialized(CharSequence buffer, int offset) { + int n = (short) buffer.charAt(offset++); + if (ncomponents != n) { + return false; + } + if (ncomponents == 0) { + return true; + } + for (int i = 0; i < ncomponents; i++) { + if (ends[i] != (short) buffer.charAt(offset++)) { + return false; + } + } + int usedchars = ends[ncomponents - 1]; + for (int i = 0; i < usedchars; i++) { + if (chars[i] != buffer.charAt(offset++)) { + return false; + } + } + return true; + } + + /** + * Just like {@link #equalsToSerialized(CharSequence, int)}, but compare to + * a prefix of the CategoryPath, instead of the whole CategoryPath. + */ + public boolean equalsToSerialized(int prefixLen, CharSequence buffer, + int offset) { + if (prefixLen < 0 || prefixLen > ncomponents) { + prefixLen = ncomponents; + } + int n = (short) buffer.charAt(offset++); + if (prefixLen != n) { + return false; + } + if (prefixLen == 0) { + return true; + } + for (int i = 0; i < prefixLen; i++) { + if (ends[i] != (short) buffer.charAt(offset++)) { + return false; + } + } + int usedchars = ends[prefixLen - 1]; + for (int i = 0; i < usedchars; i++) { + if (chars[i] != buffer.charAt(offset++)) { + return false; + } + } + return true; + } + + /** + * This method calculates a hash function of a path that has been written to + * (using {@link #serializeAppendTo(Appendable)}) a character buffer. It is + * guaranteed that the value returned is identical to that which + * {@link #hashCode()} would have produced for the original object before it + * was serialized. + */ + public static int hashCodeOfSerialized(CharSequence buffer, int offset) { + // Note: the algorithm here must be identical to that of hashCode(), + // in order that they produce identical results! + int ncomponents = (short) buffer.charAt(offset++); + if (ncomponents == 0) { + return 0; + } + int hash = ncomponents; + for (int i = 0; i < ncomponents; i++) { + hash = hash * 31 + buffer.charAt(offset++); + } + int len = buffer.charAt(offset - 1); + for (int i = 0; i < len; i++) { + hash = hash * 31 + buffer.charAt(offset++); + } + return hash; + } + + /** + * Serializes the content of this CategoryPath to a byte stream, using UTF-8 + * encoding to convert characters to bytes, and treating the ends as 16-bit + * characters. + * + * @param osw + * The output byte stream. + * @throws IOException + * If there are encoding errors. 
+ */ + // TODO (Facet): consolidate all de/serialize method names to + // serialize() and unserialize() + public void serializeToStreamWriter(OutputStreamWriter osw) + throws IOException { + osw.write(this.ncomponents); + if (this.ncomponents <= 0) { + return; + } + for (int j = 0; j < this.ncomponents; j++) { + osw.write(this.ends[j]); + } + osw.write(this.chars, 0, this.ends[this.ncomponents - 1]); + } + + /** + * Serializes the content of this CategoryPath to a byte stream, using UTF-8 + * encoding to convert characters to bytes, and treating the ends as 16-bit + * characters. + * + * @param isr + * The input stream. + * @throws IOException + * If there are encoding errors. + */ + public void deserializeFromStreamReader(InputStreamReader isr) + throws IOException { + this.ncomponents = (short) isr.read(); + if (this.ncomponents <= 0) { + return; + } + if (this.ends == null || this.ends.length < this.ncomponents) { + this.ends = new short[this.ncomponents]; + } + for (int j = 0; j < this.ncomponents; j++) { + this.ends[j] = (short) isr.read(); + } + if (this.chars == null + || this.ends[this.ncomponents - 1] > chars.length) { + this.chars = new char[this.ends[this.ncomponents - 1]]; + } + isr.read(this.chars, 0, this.ends[this.ncomponents - 1]); + } + + private void writeObject(java.io.ObjectOutputStream out) + throws IOException { + OutputStreamWriter osw = new OutputStreamWriter(out, "UTF-8"); + this.serializeToStreamWriter(osw); + osw.flush(); + } + + private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { + InputStreamReader isr = new InputStreamReader(in, "UTF-8"); + this.deserializeFromStreamReader(isr); + } + + /** + * Compares this CategoryPath with the other CategoryPath for lexicographic + * order. + * Returns a negative integer, zero, or a positive integer as this + * CategoryPath lexicographically precedes, equals to, or lexicographically follows + * the other CategoryPath. + */ + public int compareTo(CategoryPath other) { + int minlength = (this.length() < other.length()) ? this.length() : other.length(); + int ch = 0; + for (int co = 0 ; co < minlength; co++) { + if (this.ends[co] <= other.ends[co]) { + for ( ; ch < this.ends[co] ; ch++) { + if (this.chars[ch] != other.chars[ch]) { + return this.chars[ch] - other.chars[ch]; + } + } + if (this.ends[co] < other.ends[co]) { + return -1; + } + } else /* this.ends[co] > other.ends[co] */ { + for ( ; ch < other.ends[co] ; ch++) { + if (this.chars[ch] != other.chars[ch]) { + return this.chars[ch] - other.chars[ch]; + } + } + return +1; + } + } + // one is a prefix of the other + return this.length() - other.length(); + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyReader.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyReader.java new file mode 100644 index 00000000000..2d9649f399f --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyReader.java @@ -0,0 +1,274 @@ +package org.apache.lucene.facet.taxonomy; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Map; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * TaxonomyReader is the read-only interface with which the faceted-search + * library uses the taxonomy during search time. + *

+ * A TaxonomyReader holds a list of categories. Each category has a serial + * number which we call an "ordinal", and a hierarchical "path" name: + *

+ * <ul>
+ * <li> The ordinal is an integer that starts at 0 for the first category (which is + * always the root category), and grows contiguously as more categories are + * added; Note that once a category is added, it can never be deleted.
+ * <li> The path is a CategoryPath object specifying the category's position in the + * hierarchy. + * </ul>
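A tiny illustration of these two notions, as a sketch only: the CategoryPath varargs constructor is assumed from the facet API added by this patch, and the example category is made up.

import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;

final class OrdinalAndPathSketch {
  public static void main(String[] args) {
    CategoryPath root = new CategoryPath();                      // the empty path = the root category
    CategoryPath cp = new CategoryPath("author", "Mark Twain");  // two components: "author" / "Mark Twain"
    System.out.println(root.length());               // 0 components
    System.out.println(cp.length());                 // 2 components
    System.out.println(TaxonomyReader.ROOT_ORDINAL); // the root's ordinal is always 0
  }
}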
+ * Notes about concurrent access to the taxonomy: + *

+ * An implementation must allow multiple readers to be active concurrently + * with a single writer. Readers follow so-called "point in time" semantics, + * i.e., a TaxonomyReader object will only see taxonomy entries which were + * available at the time it was created. What the writer writes is only + * available to (new) readers after the writer's commit() is called. + *
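A minimal sketch of these "point in time" semantics, assuming the LuceneTaxonomyWriter and LuceneTaxonomyReader implementations added later in this patch; the RAMDirectory, the example category, and the CategoryPath varargs constructor are illustrative assumptions, not part of this change.

import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class PointInTimeSketch {
  public static void main(String[] args) throws Exception {
    Directory taxoDir = new RAMDirectory();
    TaxonomyWriter writer = new LuceneTaxonomyWriter(taxoDir);
    writer.commit();                                  // make the (still almost empty) taxonomy readable
    TaxonomyReader reader = new LuceneTaxonomyReader(taxoDir);

    writer.addCategory(new CategoryPath("author", "Mark Twain"));
    // The reader still sees only the snapshot it was opened on:
    // getOrdinal() would return INVALID_ORDINAL at this point.

    writer.commit();   // publish the new category
    reader.refresh();  // and pick it up in the reader
    int ordinal = reader.getOrdinal(new CategoryPath("author", "Mark Twain"));
    System.out.println("ordinal = " + ordinal);

    reader.close();
    writer.close();
  }
}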

+ * In faceted search, two separate indices are used: the main Lucene index, + * and the taxonomy. Because the main index refers to the categories listed + * in the taxonomy, it is important to open the taxonomy *after* opening the + * main index, and it is also necessary to reopen() the taxonomy after + * reopen()ing the main index. + *

+ * This order is important, otherwise it would be possible for the main index + * to refer to a category which is not yet visible in the old snapshot of + * the taxonomy. Note that it is indeed fine for the taxonomy to be opened + * after the main index - even a long time after. The reason is that once + * a category is added to the taxonomy, it can never be changed or deleted, + * so there is no danger of a "too new" taxonomy being inconsistent with + * an older index. + * + * @lucene.experimental + */ +public interface TaxonomyReader extends Closeable { + + /** + * The root category (the category with the empty path) always has the + * ordinal 0, to which we give a name ROOT_ORDINAL. + * getOrdinal() of an empty path will always return ROOT_ORDINAL, and + * getCategory(ROOT_ORDINAL) will return the empty path. + */ + public final static int ROOT_ORDINAL = 0; + + /** + * Ordinals are always non-negative, so a negative ordinal can be used to + * signify an error. Methods here return INVALID_ORDINAL (-1) in this case. + */ + public final static int INVALID_ORDINAL = -1; + + /** + * getOrdinal() returns the ordinal of the category given as a path. + * The ordinal is the category's serial number, an integer which starts + * with 0 and grows as more categories are added (note that once a category + * is added, it can never be deleted). + *

+ * If the given category wasn't found in the taxonomy, INVALID_ORDINAL is + * returned. + */ + public int getOrdinal(CategoryPath categoryPath) throws IOException; + + /** + * getPath() returns the path name of the category with the given + * ordinal. The path is returned as a new CategoryPath object - to + * reuse an existing object, use {@link #getPath(int, CategoryPath)}. + *

+ * A null is returned if a category with the given ordinal does not exist. + */ + public CategoryPath getPath(int ordinal) throws IOException; + + /** + * getPath() returns the path name of the category with the given + * ordinal. The path is written to the given CategoryPath object (which + * is cleared first). + *
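A small usage sketch of this reuse-oriented variant; the helper name is made up, and CategoryPath.toString(char) is assumed from its use elsewhere in this patch.

import java.io.IOException;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;

final class PrintAllPathsSketch {
  // Resolves every ordinal to its path while reusing a single CategoryPath.
  static void printAll(TaxonomyReader reader) throws IOException {
    CategoryPath reusable = new CategoryPath();
    for (int ord = 0; ord < reader.getSize(); ord++) {
      if (reader.getPath(ord, reusable)) {       // fills 'reusable'; false if ord is unknown
        System.out.println(reusable.toString('/'));
      }
    }
  }
}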

+ * If a category with the given ordinal does not exist, the given + * CategoryPath object is not modified, and the method returns + * false. Otherwise, the method returns true. + */ + public boolean getPath(int ordinal, CategoryPath result) throws IOException; + + /** + * refresh() re-reads the taxonomy information if there were any changes to + * the taxonomy since this instance was opened or last refreshed. Calling + * refresh() is more efficient than close()ing the old instance and opening a + * new one. + *

+ * If there were no changes since this instance was opened or last refreshed, + * then this call does nothing. Note, however, that this is still a relatively + * slow method (as it needs to verify whether there have been any changes on + * disk to the taxonomy), so it should not be called too often needlessly. In + * faceted search, the taxonomy reader's refresh() should be called only after + * a reopen() of the main index. + *
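A sketch of that ordering; the helper is illustrative, and IndexReader.reopen() is used here as it is elsewhere in this patch.

import java.io.IOException;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.IndexReader;

final class RefreshSketch {
  // Reopen the main index first, then refresh the taxonomy reader, so the
  // taxonomy snapshot is at least as new as the main index it is used with.
  static IndexReader refreshBoth(IndexReader indexReader, TaxonomyReader taxoReader)
      throws IOException {
    IndexReader newReader = indexReader.reopen(); // may return the same instance if nothing changed
    taxoReader.refresh();
    return newReader;
  }
}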

+ * It should be noted that refresh() is similar in purpose to + * IndexReader.reopen(), but the two methods behave differently. refresh() + * refreshes the existing TaxonomyReader object, rather than opening a new one + * in addition to the old one as reopen() does. The reason is that in a + * taxonomy, one can only add new categories and cannot modify or delete + * existing categories; Therefore, there is no reason to keep an old snapshot + * of the taxonomy open - refreshing the taxonomy to the newest data and using + * this new snapshot in all threads (whether new or old) is fine. This saves + * us needing to keep multiple copies of the taxonomy open in memory. + */ + public void refresh() throws IOException; + + /** + * getParent() returns the ordinal of the parent category of the category + * with the given ordinal. + *

+ * When a category is specified as a path name, finding the path of its + * parent is as trivial as dropping the last component of the path. + * getParent() is functionally equivalent to calling getPath() on the + * given ordinal, dropping the last component of the path, and then calling + * getOrdinal() to get an ordinal back. However, implementations are + * expected to provide a much more efficient implementation: + *

+ * getParent() should be a very quick method, as it is used during the + * facet aggregation process in faceted search. Implementations will most + * likely want to serve replies to this method from a pre-filled cache. + *
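For illustration, a caller might walk from a category up to the root with repeated getParent() calls; the helper name is hypothetical.

import java.io.IOException;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;

final class PrintAncestorsSketch {
  // Prints the path of the given category and of each of its ancestors,
  // walking up with getParent() until the root is reached.
  static void printAncestors(TaxonomyReader reader, int ordinal) throws IOException {
    while (ordinal != TaxonomyReader.ROOT_ORDINAL
        && ordinal != TaxonomyReader.INVALID_ORDINAL) {
      System.out.println(reader.getPath(ordinal));
      ordinal = reader.getParent(ordinal);
    }
  }
}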

+ * If the given ordinal is the ROOT_ORDINAL, an INVALID_ORDINAL is returned. + * If the given ordinal is a top-level category, the ROOT_ORDINAL is returned. + * If an invalid ordinal is given (negative or beyond the last available + * ordinal), an ArrayIndexOutOfBoundsException is thrown. However, it is + * expected that getParent will only be called for ordinals which are + * already known to be in the taxonomy. + */ + public int getParent(int ordinal) throws IOException; + + /** + * getParentArray() returns an int array of size getSize() listing the + * ordinal of the parent category of each category in the taxonomy. + *

+ * The caller can hold on to the array it got indefinitely - it is + * guaranteed that no-one else will modify it. The other side of the + * same coin is that the caller must treat the array it got as read-only + * and not modify it, because other callers might have gotten the + * same array too (and getParent() calls might be answered from the + * same array). + *

+ * If you use getParentArray() instead of getParent(), remember that + * the array you got is (naturally) not modified after a refresh(), + * so you should always call getParentArray() again after a refresh(). + *

+ * This method's function is similar to allocating an array of size + * getSize() and filling it with getParent() calls, but implementations + * are encouraged to implement it much more efficiently, with O(1) + * complexity. This can be done, for example, by the implementation + * already keeping the parents in an array, and just returning this + * array (without any allocation or copying) when requested. + */ + public int[] getParentArray() throws IOException; + + /** + * Equivalent representations of the taxonomy's parent info, + * used internally for efficient computation of facet results: + * "youngest child" and "oldest sibling" + */ + public static interface ChildrenArrays { + /** + * getYoungestChildArray() returns an int array of size getSize() + * listing the ordinal of the youngest (highest numbered) child + * category of each category in the taxonomy. The value for a leaf + * category (a category without children) is + * INVALID_ORDINAL. + */ + public int[] getYoungestChildArray(); + /** + * getOlderSiblingArray() returns an int array of size getSize() + * listing for each category the ordinal of its immediate older + * sibling (the sibling in the taxonomy tree with the highest ordinal + * below that of the given ordinal). The value for a category with no + * older sibling is INVALID_ORDINAL. + */ + public int[] getOlderSiblingArray(); + } + + /** + * getChildrenArrays() returns a {@link ChildrenArrays} object which can + * be used together to efficiently enumerate the children of any category. + *

+ * The caller can hold on to the object it got indefinitely - it is + * guaranteed that no-one else will modify it. The other side of the + * same coin is that the caller must treat the object which it got (and + * the arrays it contains) as read-only and not modify it, because + * other callers might have gotten the same object too. + *

+ * Implementations should have O(getSize()) time for the first call or + * after a refresh(), but O(1) time for further calls. In neither case + * should there be a need to read new data from disk. These guarantees + * are most likely achieved by calculating this object (based on the + * getParentArray()) when first needed, and later (if the taxonomy was not + * refreshed) returning the same object (without any allocation or copying) + * when requested. + *
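For illustration, enumerating the direct children of a category with these arrays looks like this (the helper name is hypothetical): start from the youngest child and follow the older-sibling chain until INVALID_ORDINAL.

import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyReader.ChildrenArrays;

final class PrintChildrenSketch {
  static void printChildren(TaxonomyReader reader, int parentOrdinal) {
    ChildrenArrays ca = reader.getChildrenArrays();
    int child = ca.getYoungestChildArray()[parentOrdinal];
    int[] olderSibling = ca.getOlderSiblingArray();
    while (child != TaxonomyReader.INVALID_ORDINAL) {
      System.out.println(child);
      child = olderSibling[child];
    }
  }
}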

+ * The reason we have one method returning one object, rather than two + * methods returning two arrays, is to avoid race conditions in a multi- + * threaded application: We want to avoid the possibility of returning one + * new array and one old array, as those could not be used together. + */ + public ChildrenArrays getChildrenArrays(); + + /** + * Retrieve user committed data. + * @see TaxonomyWriter#commit(Map) + */ + public Map getCommitUserData(); + + /** + * Expert: increments the refCount of this TaxonomyReader instance. + * RefCounts can be used to determine when a taxonomy reader can be closed + * safely, i.e. as soon as there are no more references. + * Be sure to always call a corresponding decRef(), in a finally clause; + * otherwise the reader may never be closed. + */ + public void incRef(); + + /** + * Expert: decreases the refCount of this TaxonomyReader instance. + * If the refCount drops to 0, then pending changes (if any) can be + * committed to the taxonomy index and this reader can be closed. + * @throws IOException + */ + public void decRef() throws IOException; + + /** + * Expert: returns the current refCount for this taxonomy reader + */ + public int getRefCount(); + + /** + * getSize() returns the number of categories in the taxonomy. + *

+ * Because categories are numbered consecutively starting with 0, it + * means the taxonomy contains ordinals 0 through getSize()-1. + *

+ * Note that the number returned by getSize() is often slightly higher + * than the number of categories inserted into the taxonomy; This is + * because when a category is added to the taxonomy, its ancestors + * are also added automatically (including the root, which always get + * ordinal 0). + */ + public int getSize(); + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyWriter.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyWriter.java new file mode 100644 index 00000000000..6cc9ca862dd --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyWriter.java @@ -0,0 +1,134 @@ +package org.apache.lucene.facet.taxonomy; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Map; + +import org.apache.lucene.index.IndexWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * TaxonomyWriter is the interface which the faceted-search library uses + * to dynamically build the taxonomy at indexing time. + *

+ * Notes about concurrent access to the taxonomy: + *

+ * An implementation must allow multiple readers and a single writer to be + * active concurrently. Readers follow so-called "point in time" semantics, + * i.e., a reader object will only see taxonomy entries which were available + * at the time it was created. What the writer writes is only available to + * (new) readers after the writer's commit() is called. + *

+ * Faceted search keeps two indices - namely Lucene's main index, and this + * taxonomy index. When one or more readers are active concurrently with the + * writer, care must be taken to avoid an inconsistency between the state of + * these two indices: When writing to the indices, the taxonomy must always + * be committed to disk *before* the main index, because the main index + * refers to categories listed in the taxonomy. + * Such control can best be achieved by turning off the main index's + * "autocommit" feature, and explicitly calling commit() for both indices + * (first for the taxonomy, then for the main index). + * In old versions of Lucene (2.2 or earlier), when autocommit could not be + * turned off, a more complicated solution needs to be used. E.g., use + * some sort of (possibly inter-process) locking to ensure that a reader + * is being opened only right after both indices have been flushed (and + * before anything else is written to them). + * + * @lucene.experimental + */ +public interface TaxonomyWriter extends Closeable { + + /** + * addCategory() adds a category with a given path name to the taxonomy, + * and returns its ordinal. If the category was already present in + * the taxonomy, its existing ordinal is returned. + *
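A sketch combining addCategory() with the taxonomy-first commit ordering described a few paragraphs above; how the ordinal is actually encoded into the document is left out, and the helper name is made up.

import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.index.IndexWriter;

final class IndexingSketch {
  static void addAndCommit(IndexWriter indexWriter, TaxonomyWriter taxoWriter,
      Document doc, CategoryPath category) throws IOException {
    int ordinal = taxoWriter.addCategory(category); // missing ancestors are added too
    // ... the facet indexing code would encode 'ordinal' into 'doc' here ...
    indexWriter.addDocument(doc);
    taxoWriter.commit();  // commit the taxonomy first,
    indexWriter.commit(); // then the main index that refers to it
  }
}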

+ * Before adding a category, addCategory() makes sure that all its + * ancestor categories exist in the taxonomy as well. As a result, the + * ordinal of a category is guaranteed to be smaller than the ordinal of + * any of its descendants. + */ + public int addCategory(CategoryPath categoryPath) throws IOException; + + /** + * Calling commit() ensures that all the categories written so far are + * visible to a reader that is opened (or reopened) after that call. + * When the index is closed(), commit() is also implicitly done. + */ + public void commit() throws IOException; + + /** + * Like commit(), but also store properties with the index. These properties + * are retrievable by {@link TaxonomyReader#getCommitUserData}. + * See {@link IndexWriter#commit(Map)}. + */ + public void commit(Map commitUserData) throws IOException; + + /** + * prepare most of the work needed for a two-phase commit. + * See {@link IndexWriter#prepareCommit}. + */ + public void prepareCommit() throws IOException; + + /** + * Like above, and also prepares to store user data with the index. + * See {@link IndexWriter#prepareCommit(Map)} + */ + public void prepareCommit(Map commitUserData) throws IOException; + + /** + * getParent() returns the ordinal of the parent category of the category + * with the given ordinal. + *

+ * When a category is specified as a path name, finding the path of its + * parent is as trivial as dropping the last component of the path. + * getParent() is functionally equivalent to calling getPath() on the + * given ordinal, dropping the last component of the path, and then calling + * getOrdinal() to get an ordinal back. + *

+ * If the given ordinal is the ROOT_ORDINAL, an INVALID_ORDINAL is returned. + * If the given ordinal is a top-level category, the ROOT_ORDINAL is returned. + * If an invalid ordinal is given (negative or beyond the last available + * ordinal), an ArrayIndexOutOfBoundsException is thrown. However, it is + * expected that getParent will only be called for ordinals which are + * already known to be in the taxonomy. + *

+ * TODO (Facet): instead of a getParent(ordinal) method, consider having a + * getCategory(categorypath, prefixlen) which is similar to addCategory + * except it doesn't add new categories; This method can be used to get + * the ordinals of all prefixes of the given category, and it can use + * exactly the same code and cache used by addCategory() so it means less code. + */ + public int getParent(int ordinal) throws IOException; + + /** + * getSize() returns the number of categories in the taxonomy. + *

+ * Because categories are numbered consecutively starting with 0, it + * means the taxonomy contains ordinals 0 through getSize()-1. + *

+ * Note that the number returned by getSize() is often slightly higher + * than the number of categories inserted into the taxonomy; This is + * because when a category is added to the taxonomy, its ancestors + * are also added automatically (including the root, which always get + * ordinal 0). + */ + public int getSize(); + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/lucene/Consts.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/lucene/Consts.java new file mode 100644 index 00000000000..9c53c48f6c7 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/lucene/Consts.java @@ -0,0 +1,58 @@ +package org.apache.lucene.facet.taxonomy.lucene; + +import org.apache.lucene.document.FieldSelector; +import org.apache.lucene.document.FieldSelectorResult; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * @lucene.experimental + */ +abstract class Consts { + + static final String FULL = "$full_path$"; + static final String FIELD_PAYLOADS = "$payloads$"; + static final String PAYLOAD_PARENT = "p"; + static final char[] PAYLOAD_PARENT_CHARS = PAYLOAD_PARENT.toCharArray(); + + /** + * The following is a "field selector", an object which tells Lucene to + * extract only a single field rather than a whole document. + */ + public static final FieldSelector fullPathSelector = new FieldSelector() { + public FieldSelectorResult accept(String fieldName) { + if (fieldName.equals(FULL)) { + return FieldSelectorResult.LOAD_AND_BREAK; + } + return FieldSelectorResult.NO_LOAD; + } + }; + + /** + * Delimiter used for creating the full path of a category from the list of + * its labels from root. It is forbidden for labels to contain this + * character. + *

+ * Originally, we used \uFFFE, officially a "unicode noncharacter" (invalid + * unicode character) for this purpose. Recently, we switched to the + * "private-use" character \uF749. + */ + //static final char DEFAULT_DELIMITER = '\uFFFE'; + static final char DEFAULT_DELIMITER = '\uF749'; + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/lucene/LuceneTaxonomyReader.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/lucene/LuceneTaxonomyReader.java new file mode 100644 index 00000000000..9bcae6e6a17 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/lucene/LuceneTaxonomyReader.java @@ -0,0 +1,569 @@ +package org.apache.lucene.facet.taxonomy.lucene; + +import java.io.File; +import java.io.IOException; +import java.util.Iterator; +import java.util.Map; +import java.util.Map.Entry; +import java.util.concurrent.locks.ReadWriteLock; +import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; + +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.collections.LRUHashMap; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * LuceneTaxonomyReader is a {@link TaxonomyReader} which retrieves stored + * taxonomy information from a separate Lucene index. By using a Lucene index, + * rather than some specialized file format, we get for "free" its correctness + * (especially regarding concurrency), and the ability to save it on any + * implementation of Directory (and not just the file system). + *

+ * Reading from the on-disk index on every method call is too slow, so this + * implementation employs caching: Some methods cache recent requests and + * their results, while other methods prefetch all the data into memory + * and then provide answers directly from in-memory tables. See the + * documentation of individual methods for comments on their performance. + * + * @lucene.experimental + */ +public class LuceneTaxonomyReader implements TaxonomyReader { + + private static final Logger logger = Logger.getLogger(LuceneTaxonomyReader.class.getName()); + + private IndexReader indexReader; + + // The following lock is used to allow multiple threads to read from the + // index concurrently, while having them block during the very short + // critical moment of refresh() (see comments below). Note, however, that + // we only read from the index when we don't have the entry in our cache, + // and the caches are locked separately. + private ReadWriteLock indexReaderLock = new ReentrantReadWriteLock(); + + // The following are the limited-size LRU caches used to cache the latest + // results from getOrdinal() and getCategoryCache(). + // Because LRUHashMap is not thread-safe, we need to synchronize on this + // object when using it. Unfortunately, this is not optimal under heavy + // contention because it means that while one thread is using the cache + // (reading or modifying) others are blocked from using it - or even + // starting to do benign things like calculating the hash function. A more + // efficient approach would be to use a non-locking (as much as possible) + // concurrent solution, along the lines of java.util.concurrent.ConcurrentHashMap + // but with LRU semantics. + // However, even in the current sub-optimal implementation we do not make + // the mistake of locking out readers while waiting for disk in a cache + // miss - below, we do not hold cache lock while reading missing data from + // disk. + private final LRUHashMap getOrdinalCache; + private final LRUHashMap getCategoryCache; + + // getParent() needs to be extremely efficient, to the point that we need + // to fetch all the data in advance into memory, and answer these calls + // from memory. Currently we use a large integer array, which is + // initialized when the taxonomy is opened, and potentially enlarged + // when it is refresh()ed. + // These arrays are not syncrhonized. Rather, the reference to the array + // is volatile, and the only writing operation (refreshPrefetchArrays) + // simply creates a new array and replaces the reference. The volatility + // of the reference ensures the correct atomic replacement and its + // visibility properties (the content of the array is visible when the + // new reference is visible). + private ParentArray parentArray; + + private char delimiter = Consts.DEFAULT_DELIMITER; + + /** + * Open for reading a taxonomy stored in a given {@link Directory}. + * @param directory + * The {@link Directory} in which to the taxonomy lives. Note that + * the taxonomy is read directly to that directory (not from a + * subdirectory of it). + * @throws CorruptIndexException if the Taxonomy is corrupted. + * @throws IOException if another error occurred. 
+ */ + public LuceneTaxonomyReader(Directory directory) + throws CorruptIndexException, IOException { + this.indexReader = openIndexReader(directory); + + // These are the default cache sizes; they can be configured after + // construction with the cache's setMaxSize() method + getOrdinalCache = new LRUHashMap(4000); + getCategoryCache = new LRUHashMap(4000); + + // TODO (Facet): consider lazily create parent array it when asked, not in the constructor + parentArray = new ParentArray(); + parentArray.refresh(indexReader); + } + + protected IndexReader openIndexReader(Directory directory) throws CorruptIndexException, IOException { + return IndexReader.open(directory); + } + + // convenience constructors... deprecated because they cause confusion + // because they use parent directory instead of the actual directory. + private Directory ourDirectory = null; // remember directory to close later, but only if we opened it here + /** + * Open for reading a taxonomy stored in a subdirectory of a given + * directory on the file system. + * @param parentDir The parent directory of the taxonomy's directory + * (usually this would be the directory holding the index). + * @param name The name of the taxonomy, and the subdirectory holding it. + * @throws CorruptIndexException if the Taxonomy is corrupted. + * @throws IOException if another error occurred. + */ + @Deprecated + public LuceneTaxonomyReader(File parentDir, String name) + throws CorruptIndexException, IOException { + this(FSDirectory.open(new File(parentDir, name))); + ourDirectory = indexReader.directory(); // remember to close the directory we opened + } + + /** + * Open for reading a taxonomy stored in a subdirectory of a given + * directory on the file system. + * @param parentDir The parent directory of the taxonomy's directory. + * @param name The name of the taxonomy, and the subdirectory holding it. + * @throws CorruptIndexException if the Taxonomy is corrupted. + * @throws IOException if another error occurred. + */ + @Deprecated + public LuceneTaxonomyReader(String parentDir, String name) + throws CorruptIndexException, IOException { + this(FSDirectory.open(new File(parentDir, name))); + ourDirectory = indexReader.directory(); // rememebr to close the directory we opened + } + + /** + * setCacheSize controls the maximum allowed size of each of the caches + * used by {@link #getPath(int)} and {@link #getOrdinal(CategoryPath)}. + *

+ * Currently, if the given size is smaller than the current size of + * a cache, it will not shrink, but rather will remain limited to its current + * size. + * @param size the new maximum cache size, in number of entries. + */ + public void setCacheSize(int size) { + synchronized(getCategoryCache) { + getCategoryCache.setMaxSize(size); + } + synchronized(getOrdinalCache) { + getOrdinalCache.setMaxSize(size); + } + } + + /** + * setDelimiter changes the character that the taxonomy uses in its + * internal storage as a delimiter between category components. Do not + * use this method unless you really know what you are doing. + *

+ * If you do use this method, make sure you call it before any other + * methods that actually queries the taxonomy. Moreover, make sure you + * always pass the same delimiter for all LuceneTaxonomyWriter and + * LuceneTaxonomyReader objects you create. + */ + public void setDelimiter(char delimiter) { + this.delimiter = delimiter; + } + + public int getOrdinal(CategoryPath categoryPath) throws IOException { + if (categoryPath.length()==0) { + return ROOT_ORDINAL; + } + String path = categoryPath.toString(delimiter); + + // First try to find the answer in the LRU cache: + synchronized(getOrdinalCache) { + Integer res = getOrdinalCache.get(path); + if (res!=null) { + return res.intValue(); + } + } + + // If we're still here, we have a cache miss. We need to fetch the + // value from disk, and then also put it in the cache: + int ret = TaxonomyReader.INVALID_ORDINAL; + try { + indexReaderLock.readLock().lock(); + // TODO (Facet): avoid Multi*? + Bits deletedDocs = MultiFields.getDeletedDocs(indexReader); + DocsEnum docs = MultiFields.getTermDocsEnum(indexReader, deletedDocs, Consts.FULL, new BytesRef(path)); + if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + ret = docs.docID(); + } + } finally { + indexReaderLock.readLock().unlock(); + } + + // Put the new value in the cache. Note that it is possible that while + // we were doing the above fetching (without the cache locked), some + // other thread already added the same category to the cache. We do + // not care about this possibilty, as LRUCache replaces previous values + // of the same keys (it doesn't store duplicates). + synchronized(getOrdinalCache) { + // GB: new Integer(int); creates a new object each and every time. + // Integer.valueOf(int) might not (See JavaDoc). + getOrdinalCache.put(path, Integer.valueOf(ret)); + } + + return ret; + } + + public CategoryPath getPath(int ordinal) throws CorruptIndexException, IOException { + // TODO (Facet): Currently, the LRU cache we use (getCategoryCache) holds + // strings with delimiters, not CategoryPath objects, so even if + // we have a cache hit, we need to process the string and build a new + // CategoryPath object every time. What is preventing us from putting + // the actual CategoryPath object in the cache is the fact that these + // objects are mutable. So we should create an immutable (read-only) + // interface that CategoryPath implements, and this method should + // return this interface, not the writable CategoryPath. + String label = getLabel(ordinal); + if (label==null) { + return null; + } + return new CategoryPath(label, delimiter); + } + + public boolean getPath(int ordinal, CategoryPath result) throws CorruptIndexException, IOException { + String label = getLabel(ordinal); + if (label==null) { + return false; + } + result.clear(); + result.add(label, delimiter); + return true; + } + + private String getLabel(int catID) throws CorruptIndexException, IOException { + // First try to find the answer in the LRU cache. It is very + // unfortunate that we need to allocate an Integer object here - + // it would have been better if we used a hash table specifically + // designed for int keys... + // GB: new Integer(int); creates a new object each and every time. + // Integer.valueOf(int) might not (See JavaDoc). + Integer catIDInteger = Integer.valueOf(catID); + + synchronized(getCategoryCache) { + String res = getCategoryCache.get(catIDInteger); + if (res!=null) { + return res; + } + } + + // If we're still here, we have a cache miss. 
We need to fetch the + // value from disk, and then also put it in the cache: + String ret; + try { + indexReaderLock.readLock().lock(); + // The taxonomy API dictates that if we get an invalid category + // ID, we should return null, If we don't check this here, we + // can some sort of an exception from the document() call below. + // NOTE: Currently, we *do not* cache this return value; There + // isn't much point to do so, because checking the validity of + // the docid doesn't require disk access - just comparing with + // the number indexReader.maxDoc(). + if (catID<0 || catID>=indexReader.maxDoc()) { + return null; + } + ret = indexReader.document(catID, Consts.fullPathSelector) + .get(Consts.FULL); + } finally { + indexReaderLock.readLock().unlock(); + } + // Put the new value in the cache. Note that it is possible that while + // we were doing the above fetching (without the cache locked), some + // other thread already added the same category to the cache. We do + // not care about this possibility, as LRUCache replaces previous + // values of the same keys (it doesn't store duplicates). + synchronized (getCategoryCache) { + getCategoryCache.put(catIDInteger, ret); + } + + return ret; + } + + public int getParent(int ordinal) { + // Note how we don't need to hold the read lock to do the following, + // because the array reference is volatile, ensuring the correct + // visibility and ordering: if we get the new reference, the new + // data is also visible to this thread. + return getParentArray()[ordinal]; + } + + /** + * getParentArray() returns an int array of size getSize() listing the + * ordinal of the parent category of each category in the taxonomy. + *

+ * The caller can hold on to the array it got indefinitely - it is + * guaranteed that no-one else will modify it. The other side of the + * same coin is that the caller must treat the array it got as read-only + * and not modify it, because other callers might have gotten the + * same array too, and getParent() calls are also answered from the + * same array. + *

+ * The getParentArray() call is extremely efficient, merely returning + * a reference to an array that already exists. For a caller that plans + * to call getParent() for many categories, using getParentArray() and + * the array it returns is a somewhat faster approach because it avoids + * the overhead of method calls and volatile dereferencing. + *
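For illustration, a bulk computation over the parent array; the helper is hypothetical and relies on the guarantee, stated for addCategory(), that a parent's ordinal is always smaller than its children's.

import java.io.IOException;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;

final class DepthSketch {
  // Computes the depth of every category in a single pass over the parent array.
  static int[] depths(TaxonomyReader reader) throws IOException {
    int[] parents = reader.getParentArray();
    int[] depth = new int[parents.length];    // depth of the root (ordinal 0) stays 0
    for (int ord = 1; ord < parents.length; ord++) {
      depth[ord] = depth[parents[ord]] + 1;   // a parent's ordinal is always smaller than ord
    }
    return depth;
  }
}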

+ * If you use getParentArray() instead of getParent(), remember that + * the array you got is (naturally) not modified after a refresh(), + * so you should always call getParentArray() again after a refresh(). + */ + + public int[] getParentArray() { + // Note how we don't need to hold the read lock to do the following, + // because the array reference is volatile, ensuring the correct + // visibility and ordering: if we get the new reference, the new + // data is also visible to this thread. + return parentArray.getArray(); + } + + // Note that refresh() is synchronized (it is the only synchronized + // method in this class) to ensure that it never gets called concurrently + // with itself. + public synchronized void refresh() throws IOException { + /* + * Since refresh() can be a lengthy operation, it is very important that we + * avoid locking out all readers for its duration. This is why we don't hold + * the indexReaderLock write lock for the entire duration of this method. In + * fact, it is enough to hold it only during a single assignment! Other + * comments in this method will explain this. + */ + + // note that the lengthy operation indexReader.reopen() does not + // modify the reader, so we can do it without holding a lock. We can + // safely read indexReader without holding the write lock, because + // no other thread can be writing at this time (this method is the + // only possible writer, and it is "synchronized" to avoid this case). + IndexReader r2 = indexReader.reopen(); + if (indexReader != r2) { + IndexReader oldreader = indexReader; + // we can close the old searcher, but need to synchronize this + // so that we don't close it in the middle that another routine + // is reading from it. + indexReaderLock.writeLock().lock(); + indexReader = r2; + indexReaderLock.writeLock().unlock(); + // We can close the old reader, but need to be certain that we + // don't close it while another method is reading from it. + // Luckily, we can be certain of that even without putting the + // oldreader.close() in the locked section. The reason is that + // after lock() succeeded above, we know that all existing readers + // had finished (this is what a read-write lock ensures). New + // readers, starting after the unlock() we just did, already got + // the new indexReader we set above. So nobody can be possibly + // using the old indexReader, and we can close it: + oldreader.close(); + + // We prefetch some of the arrays to make requests much faster. + // Let's refresh these prefetched arrays; This refresh is much + // is made more efficient by assuming that it is enough to read + // the values for new categories (old categories could not have been + // changed or deleted) + // Note that this this done without the write lock being held, + // which means that it is possible that during a refresh(), a + // reader will have some methods (like getOrdinal and getCategory) + // return fresh information, while getParent() + // (only to be prefetched now) still return older information. + // We consider this to be acceptable. The important thing, + // however, is that refreshPrefetchArrays() itself writes to + // the arrays in a correct manner (see discussion there) + parentArray.refresh(indexReader); + + // Remove any INVALID_ORDINAL values from the ordinal cache, + // because it is possible those are now answered by the new data! 
+ Iterator> i = getOrdinalCache.entrySet().iterator(); + while (i.hasNext()) { + Entry e = i.next(); + if (e.getValue().intValue() == INVALID_ORDINAL) { + i.remove(); + } + } + } + } + + public void close() throws IOException { + indexReader.close(); + if (ourDirectory!=null) { + ourDirectory.close(); + } + } + + public int getSize() { + indexReaderLock.readLock().lock(); + try { + return indexReader.numDocs(); + } finally { + indexReaderLock.readLock().unlock(); + } + } + + public Map getCommitUserData() { + return indexReader.getCommitUserData(); + } + + private ChildrenArrays childrenArrays; + Object childrenArraysRebuild = new Object(); + + public ChildrenArrays getChildrenArrays() { + // Check if the taxonomy grew since we built the array, and if it + // did, create new (and larger) arrays and fill them as required. + // We do all this under a lock, two prevent to concurrent calls to + // needlessly do the same array building at the same time. + synchronized(childrenArraysRebuild) { + int num = getSize(); + int first; + if (childrenArrays==null) { + first = 0; + } else { + first = childrenArrays.getYoungestChildArray().length; + } + // If the taxonomy hasn't grown, we can return the existing object + // immediately + if (first == num) { + return childrenArrays; + } + // Otherwise, build new arrays for a new ChildrenArray object. + // These arrays start with an enlarged copy of the previous arrays, + // and then are modified to take into account the new categories: + int[] newYoungestChildArray = new int[num]; + int[] newOlderSiblingArray = new int[num]; + // In Java 6, we could just do Arrays.copyOf()... + if (childrenArrays!=null) { + System.arraycopy(childrenArrays.getYoungestChildArray(), 0, + newYoungestChildArray, 0, childrenArrays.getYoungestChildArray().length); + System.arraycopy(childrenArrays.getOlderSiblingArray(), 0, + newOlderSiblingArray, 0, childrenArrays.getOlderSiblingArray().length); + } + int[] parents = getParentArray(); + for (int i=first; i + * By using a Lucene index to store the information on disk, rather than some + * specialized file format, we get for "free" Lucene's correctness (especially + * regarding multi-process concurrency), and the ability to write to any + * implementation of Directory (and not just the file system). + *

+ * In addition to the permanently-stored Lucene index, efficiency dictates that + * we also keep an in-memory cache of recently seen categories (or even all + * of them), so that we do not need to go back to disk for every category + * addition to see which ordinal this category already has, if any. A + * {@link TaxonomyWriterCache} object determines the specific caching algorithm + * used. + *
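For example, a caller might pick an LRU cache instead of the default; LruTaxonomyWriterCache is referenced by the constructor documented below, but its package path and int-size constructor are assumptions here, since that class is not shown in this excerpt.

import java.io.IOException;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter;
import org.apache.lucene.facet.taxonomy.writercache.lru.LruTaxonomyWriterCache;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;

final class WriterCacheSketch {
  static LuceneTaxonomyWriter open(Directory taxoDir) throws IOException {
    // Keep only the ~10,000 most recently used categories in memory.
    return new LuceneTaxonomyWriter(taxoDir, OpenMode.CREATE_OR_APPEND,
        new LruTaxonomyWriterCache(10000));
  }
}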

+ * This class offers some hooks for extending classes to control the + * {@link IndexWriter} instance that is used. See {@link #openLuceneIndex} and + * {@link #closeLuceneIndex()} . + * + * @lucene.experimental + */ +public class LuceneTaxonomyWriter implements TaxonomyWriter { + + protected IndexWriter indexWriter; + private int nextID; + private char delimiter = Consts.DEFAULT_DELIMITER; + private SinglePositionTokenStream parentStream = new SinglePositionTokenStream(Consts.PAYLOAD_PARENT); + private Field parentStreamField; + private Field fullPathField; + + private TaxonomyWriterCache cache; + /** + * We call the cache "complete" if we know that every category in our + * taxonomy is in the cache. When the cache is not complete, and + * we can't find a category in the cache, we still need to look for it + * in the on-disk index; Therefore when the cache is not complete, we + * need to open a "reader" to the taxonomy index. + * The cache becomes incomplete if it was never filled with the existing + * categories, or if a put() to the cache ever returned true (meaning + * that some of the cached data was cleared). + */ + private boolean cacheIsComplete; + private IndexReader reader; + private int cacheMisses; + + /** + * setDelimiter changes the character that the taxonomy uses in its internal + * storage as a delimiter between category components. Do not use this + * method unless you really know what you are doing. It has nothing to do + * with whatever character the application may be using to represent + * categories for its own use. + *

+ * If you do use this method, make sure you call it before any other methods + * that actually query the taxonomy. Moreover, make sure you always pass + * the same delimiter for all LuceneTaxonomyWriter and LuceneTaxonomyReader + * objects you create for the same directory. + */ + public void setDelimiter(char delimiter) { + this.delimiter = delimiter; + } + + /** + * Forcibly unlocks the taxonomy in the named directory. + *

+ * Caution: this should only be used by failure recovery code, when it is + * known that no other process or thread is in fact currently accessing + * this taxonomy. + *

+ * This method is unnecessary if your {@link Directory} uses a + * {@link NativeFSLockFactory} instead of the default + * {@link SimpleFSLockFactory}. When the "native" lock is used, a lock + * does not stay behind forever when the process using it dies. + */ + public static void unlock(Directory directory) throws IOException { + IndexWriter.unlock(directory); + } + + /** + * Construct a Taxonomy writer. + * + * @param directory + * The {@link Directory} in which to store the taxonomy. Note that + * the taxonomy is written directly to that directory (not to a + * subdirectory of it). + * @param openMode + * Specifies how to open a taxonomy for writing: APPEND + * means open an existing index for append (failing if the index does + * not yet exist). CREATE means create a new index (first + * deleting the old one if it already existed). + * APPEND_OR_CREATE appends to an existing index if there + * is one, otherwise it creates a new index. + * @param cache + * A {@link TaxonomyWriterCache} implementation which determines + * the in-memory caching policy. See for example + * {@link LruTaxonomyWriterCache} and {@link Cl2oTaxonomyWriterCache}. + * If null or missing, {@link #defaultTaxonomyWriterCache()} is used. + * @throws CorruptIndexException + * if the taxonomy is corrupted. + * @throws LockObtainFailedException + * if the taxonomy is locked by another writer. If it is known + * that no other concurrent writer is active, the lock might + * have been left around by an old dead process, and should be + * removed using {@link #unlock(Directory)}. + * @throws IOException + * if another error occurred. + */ + public LuceneTaxonomyWriter(Directory directory, OpenMode openMode, + TaxonomyWriterCache cache) + throws CorruptIndexException, LockObtainFailedException, + IOException { + + openLuceneIndex(directory, openMode); + reader = null; + + parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream); + parentStreamField.setOmitNorms(true); + fullPathField = new Field(Consts.FULL, "", Store.YES, Index.NOT_ANALYZED_NO_NORMS); + fullPathField.setOmitTermFreqAndPositions(true); + + this.nextID = indexWriter.maxDoc(); + + if (cache==null) { + cache = defaultTaxonomyWriterCache(); + } + this.cache = cache; + + if (nextID == 0) { + cacheIsComplete = true; + // Make sure that the taxonomy always contain the root category + // with category id 0. + addCategory(new CategoryPath()); + refreshReader(); + } else { + // There are some categories on the disk, which we have not yet + // read into the cache, and therefore the cache is incomplete. + // We chose not to read all the categories into the cache now, + // to avoid terrible performance when a taxonomy index is opened + // to add just a single category. We will do it later, after we + // notice a few cache misses. + cacheIsComplete = false; + } + cacheMisses = 0; + } + + /** + * A hook for extensions of this class to provide their own + * {@link IndexWriter} implementation or instance. Extending classes can + * instantiate and configure the {@link IndexWriter} as they see fit, + * including setting a {@link org.apache.lucene.index.MergeScheduler}, or + * {@link org.apache.lucene.index.IndexDeletionPolicy}, different RAM size + * etc.
+ * NOTE: the instance this method returns will be closed upon calling + * to {@link #close()}. If you wish to do something different, you should + * override {@link #closeLuceneIndex()}. + * + * @param directory the {@link Directory} on top of wich an + * {@link IndexWriter} should be opened. + * @param openMode see {@link OpenMode} + */ + protected void openLuceneIndex (Directory directory, OpenMode openMode) + throws CorruptIndexException, LockObtainFailedException, IOException { + // Make sure we use a MergePolicy which merges segments in-order and thus + // keeps the doc IDs ordered as well (this is crucial for the taxonomy + // index). + IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_30, + new KeywordAnalyzer()).setOpenMode(openMode).setMergePolicy( + new LogByteSizeMergePolicy()); + indexWriter = new IndexWriter(directory, config); + } + + // Currently overridden by a unit test that verifies that every index we open + // is close()ed. + /** + * Open an {@link IndexReader} from the {@link #indexWriter} member, by + * calling {@link IndexWriter#getReader()}. Extending classes can override + * this method to return their own {@link IndexReader}. + */ + protected IndexReader openReader() throws IOException { + return IndexReader.open(indexWriter, true); + } + + /** + * Creates a new instance with a default cached as defined by + * {@link #defaultTaxonomyWriterCache()}. + */ + public LuceneTaxonomyWriter(Directory directory, OpenMode openMode) + throws CorruptIndexException, LockObtainFailedException, IOException { + this(directory, openMode, defaultTaxonomyWriterCache()); + } + + /** + * Defines the default {@link TaxonomyWriterCache} to use in constructors + * which do not specify one. + *

+ * The current default is {@link Cl2oTaxonomyWriterCache} constructed + * with the parameters (1024, 0.15f, 3), i.e., the entire taxonomy is + * cached in memory while building it. + */ + public static TaxonomyWriterCache defaultTaxonomyWriterCache() { + return new Cl2oTaxonomyWriterCache(1024, 0.15f, 3); + } + + // convenience constructors: + + public LuceneTaxonomyWriter(Directory d) + throws CorruptIndexException, LockObtainFailedException, + IOException { + this(d, OpenMode.CREATE_OR_APPEND); + } + + /** + * Frees used resources as well as closes the underlying {@link IndexWriter}, + * which commits whatever changes made to it to the underlying + * {@link Directory}. + */ + public synchronized void close() throws CorruptIndexException, IOException { + closeLuceneIndex(); + closeResources(); + } + + /** + * Returns the number of memory bytes used by the cache. + * @return Number of cache bytes in memory, for CL2O only; zero otherwise. + */ + public int getCacheMemoryUsage() { + if (this.cache == null || !(this.cache instanceof Cl2oTaxonomyWriterCache)) { + return 0; + } + return ((Cl2oTaxonomyWriterCache)this.cache).getMemoryUsage(); + } + + /** + * A hook for extending classes to close additional resources that were used. + * The default implementation closes the {@link IndexReader} as well as the + * {@link TaxonomyWriterCache} instances that were used.
+ * NOTE: if you override this method, you should include a + * super.closeResources() call in your implementation. + */ + protected synchronized void closeResources() throws IOException { + if (reader != null) { + reader.close(); + reader = null; + } + if (cache != null) { + cache.close(); + cache = null; + } + } + + /** + * A hook for extending classes to control closing the {@link IndexWriter} + * returned by {@link #openLuceneIndex}. + */ + protected void closeLuceneIndex() throws CorruptIndexException, IOException { + if (indexWriter != null) { + indexWriter.close(); + indexWriter = null; + } + } + + /** + * Look up the given category in the cache and/or the on-disk storage, + * returning the category's ordinal, or a negative number in case the + * category does not yet exist in the taxonomy. + */ + protected int findCategory(CategoryPath categoryPath) throws IOException { + // If we can find the category in our cache, we can return the + // response directly from it: + int res = cache.get(categoryPath); + if (res >= 0) { + return res; + } + // If we know that the cache is complete, i.e., contains every category + // which exists, we can return -1 immediately. However, if the cache is + // not complete, we need to check the disk. + if (cacheIsComplete) { + return -1; + } + cacheMisses++; + // After a few cache misses, it makes sense to read all the categories + // from disk and into the cache. The reason not to do this on the first + // cache miss (or even when opening the writer) is that it will + // significantly slow down the case when a taxonomy is opened just to + // add one category. The idea only spending a long time on reading + // after enough time was spent on cache misses is known as a "online + // algorithm". + if (perhapsFillCache()) { + return cache.get(categoryPath); + } + + // We need to get an answer from the on-disk index. If a reader + // is not yet open, do it now: + if (reader == null) { + reader = openReader(); + } + + // TODO (Facet): avoid Multi*? + Bits deletedDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docs = MultiFields.getTermDocsEnum(reader, deletedDocs, Consts.FULL, + new BytesRef(categoryPath.toString(delimiter))); + if (docs == null || docs.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) { + return -1; // category does not exist in taxonomy + } + // Note: we do NOT add to the cache the fact that the category + // does not exist. The reason is that our only use for this + // method is just before we actually add this category. If + // in the future this usage changes, we should consider caching + // the fact that the category is not in the taxonomy. + addToCache(categoryPath, docs.docID()); + return docs.docID(); + } + + /** + * Look up the given prefix of the given category in the cache and/or the + * on-disk storage, returning that prefix's ordinal, or a negative number in + * case the category does not yet exist in the taxonomy. 
+ */ + private int findCategory(CategoryPath categoryPath, int prefixLen) + throws IOException { + int res = cache.get(categoryPath, prefixLen); + if (res >= 0) { + return res; + } + if (cacheIsComplete) { + return -1; + } + cacheMisses++; + if (perhapsFillCache()) { + return cache.get(categoryPath, prefixLen); + } + if (reader == null) { + reader = openReader(); + } + Bits deletedDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docs = MultiFields.getTermDocsEnum(reader, deletedDocs, Consts.FULL, + new BytesRef(categoryPath.toString(delimiter, prefixLen))); + if (docs == null || docs.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) { + return -1; // category does not exist in taxonomy + } + addToCache(categoryPath, prefixLen, docs.docID()); + return docs.docID(); + } + + // TODO (Facet): addCategory() is synchronized. This means that if indexing is + // multi-threaded, a new category that needs to be written to disk (and + // potentially even trigger a lengthy merge) locks out other addCategory() + // calls - even those which could immediately return a cached value. + // We definitely need to fix this situation! + public synchronized int addCategory(CategoryPath categoryPath) + throws IOException { + // If the category is already in the cache and/or the taxonomy, we + // should return its existing ordinal: + int res = findCategory(categoryPath); + if (res < 0) { + // This is a new category, and we need to insert it into the index + // (and the cache). Actually, we might also need to add some of + // the category's ancestors before we can add the category itself + // (while keeping the invariant that a parent is always added to + // the taxonomy before its child). internalAddCategory() does all + // this recursively: + res = internalAddCategory(categoryPath, categoryPath.length()); + } + return res; + + } + + /** + * Add a new category into the index (and the cache), and return its new + * ordinal. + *

+ * Actually, we might also need to add some of the category's ancestors + * before we can add the category itself (while keeping the invariant that a + * parent is always added to the taxonomy before its child). We do this by + * recursion. + */ + private int internalAddCategory(CategoryPath categoryPath, int length) + throws CorruptIndexException, IOException { + + // Find our parent's ordinal (recursively adding the parent category + // to the taxonomy if it's not already there). Then add the parent + // ordinal as payloads (rather than a stored field; payloads can be + // more efficiently read into memory in bulk by LuceneTaxonomyReader) + int parent; + if (length > 1) { + parent = findCategory(categoryPath, length - 1); + if (parent < 0) { + parent = internalAddCategory(categoryPath, length - 1); + } + } else if (length == 1) { + parent = TaxonomyReader.ROOT_ORDINAL; + } else { + parent = TaxonomyReader.INVALID_ORDINAL; + } + int id = addCategoryDocument(categoryPath, length, parent); + + return id; + } + + // Note that the methods calling addCategoryDocument() are synchornized, + // so this method is effectively synchronized as well, but we'll add + // synchronized to be on the safe side, and we can reuse class-local objects + // instead of allocating them every time + protected synchronized int addCategoryDocument(CategoryPath categoryPath, + int length, int parent) + throws CorruptIndexException, IOException { + // Before Lucene 2.9, position increments >=0 were supported, so we + // added 1 to parent to allow the parent -1 (the parent of the root). + // Unfortunately, starting with Lucene 2.9, after LUCENE-1542, this is + // no longer enough, since 0 is not encoded consistently either (see + // comment in SinglePositionTokenStream). But because we must be + // backward-compatible with existing indexes, we can't just fix what + // we write here (e.g., to write parent+2), and need to do a workaround + // in the reader (which knows that anyway only category 0 has a parent + // -1). + parentStream.set(parent+1); + Document d = new Document(); + d.add(parentStreamField); + + fullPathField.setValue(categoryPath.toString(delimiter, length)); + d.add(fullPathField); + + // Note that we do no pass an Analyzer here because the fields that are + // added to the Document are untokenized or contains their own TokenStream. + // Therefore the IndexWriter's Analyzer has no effect. + indexWriter.addDocument(d); + int id = nextID++; + + addToCache(categoryPath, length, id); + + // also add to the parent array + getParentArray().add(id, parent); + + return id; + } + + private static class SinglePositionTokenStream extends TokenStream { + private CharTermAttribute termAtt; + private PositionIncrementAttribute posIncrAtt; + private boolean returned; + public SinglePositionTokenStream(String word) { + termAtt = addAttribute(CharTermAttribute.class); + posIncrAtt = addAttribute(PositionIncrementAttribute.class); + termAtt.setEmpty().append(word); + returned = true; + } + /** + * Set the value we want to keep, as the position increment. + * Note that when TermPositions.nextPosition() is later used to + * retrieve this value, val-1 will be returned, not val. + *

+ * IMPORTANT NOTE: Before Lucene 2.9, val>=0 were safe (for val==0, + * the retrieved position would be -1). But starting with Lucene 2.9, + * this unfortunately changed, and only val>0 are safe. val=0 can + * still be used, but don't count on the value you retrieve later + * (it could be 0 or -1, depending on circumstances or versions). + * This change is described in Lucene's JIRA: LUCENE-1542. + */ + public void set(int val) { + posIncrAtt.setPositionIncrement(val); + returned = false; + } + @Override + public boolean incrementToken() throws IOException { + if (returned) { + return false; + } + returned = true; + return true; + } + } + + private void addToCache(CategoryPath categoryPath, int id) + throws CorruptIndexException, IOException { + if (cache.put(categoryPath, id)) { + // If cache.put() returned true, it means the cache was limited in + // size, became full, so parts of it had to be cleared. + // Unfortunately we don't know which part was cleared - it is + // possible that a relatively-new category that hasn't yet been + // committed to disk (and therefore isn't yet visible in our + // "reader") was deleted from the cache, and therefore we must + // now refresh the reader. + // Because this is a slow operation, cache implementations are + // expected not to delete entries one-by-one but rather in bulk + // (LruTaxonomyWriterCache removes the 2/3rd oldest entries). + refreshReader(); + cacheIsComplete = false; + } + } + + private void addToCache(CategoryPath categoryPath, int prefixLen, int id) + throws CorruptIndexException, IOException { + if (cache.put(categoryPath, prefixLen, id)) { + refreshReader(); + cacheIsComplete = false; + } + } + + private synchronized void refreshReader() throws IOException { + if (reader != null) { + IndexReader r2 = reader.reopen(); + if (reader != r2) { + reader.close(); + reader = r2; + } + } + } + + /** + * Calling commit() ensures that all the categories written so far are + * visible to a reader that is opened (or reopened) after that call. + * When the index is closed(), commit() is also implicitly done. + * See {@link TaxonomyWriter#commit()} + */ + public synchronized void commit() throws CorruptIndexException, IOException { + indexWriter.commit(); + refreshReader(); + } + + /** + * Like commit(), but also store properties with the index. These properties + * are retrievable by {@link LuceneTaxonomyReader#getCommitUserData}. + * See {@link TaxonomyWriter#commit(Map)}. + */ + public synchronized void commit(Map commitUserData) throws CorruptIndexException, IOException { + indexWriter.commit(commitUserData); + refreshReader(); + } + + /** + * prepare most of the work needed for a two-phase commit. + * See {@link IndexWriter#prepareCommit}. + */ + public synchronized void prepareCommit() throws CorruptIndexException, IOException { + indexWriter.prepareCommit(); + } + + /** + * Like above, and also prepares to store user data with the index. + * See {@link IndexWriter#prepareCommit(Map)} + */ + public synchronized void prepareCommit(Map commitUserData) throws CorruptIndexException, IOException { + indexWriter.prepareCommit(commitUserData); + } + + /** + * getSize() returns the number of categories in the taxonomy. + *

+ * Because categories are numbered consecutively starting with 0, it means + * the taxonomy contains ordinals 0 through getSize()-1. + *

+ * Note that the number returned by getSize() is often slightly higher than
+ * the number of categories inserted into the taxonomy; this is because when
+ * a category is added to the taxonomy, its ancestors are also added
+ * automatically (including the root, which always gets ordinal 0).
+ */
+ synchronized public int getSize() {
+ return indexWriter.maxDoc();
+ }
+
+ private boolean alreadyCalledFillCache = false;
+
+ /**
+ * Set the number of cache misses before an attempt is made to read the
+ * entire taxonomy into the in-memory cache.
+ *

+ * LuceneTaxonomyWriter holds an in-memory cache of recently seen + * categories to speed up operation. On each cache-miss, the on-disk index + * needs to be consulted. When an existing taxonomy is opened, a lot of + * slow disk reads like that are needed until the cache is filled, so it + * is more efficient to read the entire taxonomy into memory at once. + * We do this complete read after a certain number (defined by this method) + * of cache misses. + *

+ * If the number is set to 0, the entire taxonomy is read + * into the cache on first use, without fetching individual categories + * first. + *
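+ * <P>
+ * For example, a hypothetical usage sketch (writer is assumed to be a
+ * LuceneTaxonomyWriter opened over a large existing taxonomy):
+ * <pre>
+ *   // read the entire taxonomy into the cache on the first cache miss
+ *   writer.setCacheMissesUntilFill(0);
+ * </pre>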

+ * Note that if the memory cache of choice is limited in size, and cannot + * hold the entire content of the on-disk taxonomy, then it is never + * read in its entirety into the cache, regardless of the setting of this + * method. + */ + public void setCacheMissesUntilFill(int i) { + cacheMissesUntilFill = i; + } + private int cacheMissesUntilFill = 11; + + private boolean perhapsFillCache() throws IOException { + // Note: we assume that we're only called when cacheIsComplete==false. + // TODO (Facet): parametrize this criterion: + if (cacheMisses < cacheMissesUntilFill) { + return false; + } + // If the cache was already filled (or we decided not to fill it because + // there was no room), there is no sense in trying it again. + if (alreadyCalledFillCache) { + return false; + } + alreadyCalledFillCache = true; + // TODO (Facet): we should probably completely clear the cache before starting + // to read it? + if (reader == null) { + reader = openReader(); + } + + if (!cache.hasRoom(reader.numDocs())) { + return false; + } + + CategoryPath cp = new CategoryPath(); + Terms terms = MultiFields.getTerms(reader, Consts.FULL); + // The check is done here to avoid checking it on every iteration of the + // below loop. A null term wlil be returned if there are no terms in the + // lexicon, or after the Consts.FULL term. However while the loop is + // executed we're safe, because we only iterate as long as there are next() + // terms. + if (terms != null) { + TermsEnum termsEnum = terms.iterator(); + Bits deletedDocs = MultiFields.getDeletedDocs(reader); + DocsEnum docsEnum = null; + while (termsEnum.next() != null) { + BytesRef t = termsEnum.term(); + // Since we guarantee uniqueness of categories, each term has exactly + // one document. Also, since we do not allow removing categories (and + // hence documents), there are no deletions in the index. Therefore, it + // is sufficient to call next(), and then doc(), exactly once with no + // 'validation' checks. + docsEnum = termsEnum.docs(deletedDocs, docsEnum); + docsEnum.nextDoc(); + cp.clear(); + // TODO (Facet): avoid String creation/use bytes? + cp.add(t.utf8ToString(), delimiter); + cache.put(cp, docsEnum.docID()); + } + } + + cacheIsComplete = true; + // No sense to keep the reader open - we will not need to read from it + // if everything is in the cache. + reader.close(); + reader = null; + return true; + } + + // TODO (Facet): synchronization of some sort? + private ParentArray parentArray; + private ParentArray getParentArray() throws IOException { + if (parentArray==null) { + if (reader == null) { + reader = openReader(); + } + parentArray = new ParentArray(); + parentArray.refresh(reader); + } + return parentArray; + } + public int getParent(int ordinal) throws IOException { + // Note: the following if() just enforces that a user can never ask + // for the parent of a nonexistant category - even if the parent array + // was allocated bigger than it really needs to be. + if (ordinal >= getSize()) { + throw new ArrayIndexOutOfBoundsException(); + } + return getParentArray().getArray()[ordinal]; + } + + /** + * Take all the categories of one or more given taxonomies, and add them to + * the main taxonomy (this), if they are not already there. + *

+ * Additionally, fill a mapping for each of the added taxonomies,
+ * mapping its ordinals to the ordinals in the enlarged main taxonomy.
+ * These mappings are saved into an array of OrdinalMap objects given by the
+ * user, one for each of the given taxonomies (not including "this", the main
+ * taxonomy). Often the first of these will be a MemoryOrdinalMap and the
+ * others will be a DiskOrdinalMap - see discussion in {@link OrdinalMap}.
+ *
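+ * <P>
+ * A hypothetical sketch of such a call (mainWriter, taxoDir1, taxoDir2 and
+ * the temporary file name are illustrative only):
+ * <pre>
+ *   OrdinalMap[] maps = new OrdinalMap[] {
+ *       new MemoryOrdinalMap(), new DiskOrdinalMap(new File("/tmp/map2")) };
+ *   mainWriter.addTaxonomies(new Directory[] { taxoDir1, taxoDir2 }, maps);
+ * </pre>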

+ * Note that the taxonomies to be added are given as Directory objects, + * not opened TaxonomyReader/TaxonomyWriter objects, so if any of them are + * currently managed by an open TaxonomyWriter, make sure to commit() (or + * close()) it first. The main taxonomy (this) is an open TaxonomyWriter, + * and does not need to be commit()ed before this call. + */ + public void addTaxonomies(Directory[] taxonomies, OrdinalMap[] ordinalMaps) throws IOException { + // To prevent us stepping on the rest of this class's decisions on when + // to open a reader, and when not, we'll be opening a new reader instead + // of using the existing "reader" object: + IndexReader mainreader = openReader(); + // TODO (Facet): can this then go segment-by-segment and avoid MultiDocsEnum etc? + Terms terms = MultiFields.getTerms(mainreader, Consts.FULL); + assert terms != null; // TODO (Facet): explicit check / throw exception? + TermsEnum mainte = terms.iterator(); + DocsEnum mainde = null; + + IndexReader[] otherreaders = new IndexReader[taxonomies.length]; + TermsEnum[] othertes = new TermsEnum[taxonomies.length]; + DocsEnum[] otherdocsEnum = new DocsEnum[taxonomies.length]; // just for reuse + for (int i=0; i0) { + String first=null; + for (int i=0; i0) { + first = currentOthers[i]; + } + } + int comp = 0; + if (currentMain==null || (comp = currentMain.compareTo(first))>0) { + // If 'first' is before currentMain, or currentMain is null, + // then 'first' is a new category and we need to add it to the + // main taxonomy. Then for all taxonomies with this 'first' + // category, we need to add the new category number to their + // map, and move to the next category in all of them. + cp.clear(); + cp.add(first, delimiter); + // We can call internalAddCategory() instead of addCategory() + // because we know the category hasn't been seen yet. + int newordinal = internalAddCategory(cp, cp.length()); + // TODO (Facet): we already had this term in our hands before, in nextTE... + // // TODO (Facet): no need to make this term? + Term t = new Term(Consts.FULL, first); + for (int i=0; i 0 */ { + // The currentMain doesn't appear in any of the other taxonomies - + // we don't need to do anything, just continue to the next one + currentMain = nextTE(mainte); + } + } + + // Close all the readers we've opened, and also tell the ordinal maps + // we're done adding to them + mainreader.close(); + for (int i=0; i + * addToTaxonomies() merges one or more taxonomies into the given taxonomy + * (this). An OrdinalMap is filled for each of the added taxonomies, + * containing the new ordinal (in the merged taxonomy) of each of the + * categories in the old taxonomy. + *

+ * There exist two implementations of OrdinalMap: MemoryOrdinalMap and + * DiskOrdinalMap. As their names suggest, the former keeps the map in + * memory and the latter in a temporary disk file. Because these maps will + * later be needed one by one (to remap the counting lists), not all at the + * same time, it is recommended to put the first taxonomy's map in memory, + * and all the rest on disk (later to be automatically read into memory one + * by one, when needed). + */ + public static interface OrdinalMap { + /** + * Set the size of the map. This MUST be called before addMapping(). + * It is assumed (but not verified) that addMapping() will then be + * called exactly 'size' times, with different origOrdinals between 0 + * and size-1. + */ + public void setSize(int size) throws IOException; + public void addMapping(int origOrdinal, int newOrdinal) throws IOException; + /** + * Call addDone() to say that all addMapping() have been done. + * In some implementations this might free some resources. + */ + public void addDone() throws IOException; + /** + * Return the map from the taxonomy's original (consecutive) ordinals + * to the new taxonomy's ordinals. If the map has to be read from disk + * and ordered appropriately, it is done when getMap() is called. + * getMap() should only be called once, and only when the map is actually + * needed. Calling it will also free all resources that the map might + * be holding (such as temporary disk space), other than the returned int[]. + */ + public int[] getMap() throws IOException; + } + + /** + * {@link OrdinalMap} maintained in memory + */ + public static final class MemoryOrdinalMap implements OrdinalMap { + int[] map; + public void setSize(int taxonomySize) { + map = new int[taxonomySize]; + } + public void addMapping(int origOrdinal, int newOrdinal) { + map[origOrdinal] = newOrdinal; + } + public void addDone() { /* nothing to do */ } + public int[] getMap() { + return map; + } + } + + /** + * {@link OrdinalMap} maintained on file system + */ + public static final class DiskOrdinalMap implements OrdinalMap { + File tmpfile; + DataOutputStream out; + + public DiskOrdinalMap(File tmpfile) throws FileNotFoundException { + this.tmpfile = tmpfile; + out = new DataOutputStream(new BufferedOutputStream( + new FileOutputStream(tmpfile))); + } + + public void addMapping(int origOrdinal, int newOrdinal) throws IOException { + out.writeInt(origOrdinal); + out.writeInt(newOrdinal); + } + + public void setSize(int taxonomySize) throws IOException { + out.writeInt(taxonomySize); + } + + public void addDone() throws IOException { + if (out!=null) { + out.close(); + out = null; + } + } + + int[] map = null; + + public int[] getMap() throws IOException { + if (map!=null) { + return map; + } + addDone(); // in case this wasn't previously called + DataInputStream in = new DataInputStream(new BufferedInputStream( + new FileInputStream(tmpfile))); + map = new int[in.readInt()]; + // NOTE: The current code assumes here that the map is complete, + // i.e., every ordinal gets one and exactly one value. Otherwise, + // we may run into an EOF here, or vice versa, not read everything. + for (int i=0; i0) { + prefetchParentOrdinal[0] = TaxonomyReader.INVALID_ORDINAL; + } + first = 1; + } else { + first = prefetchParentOrdinal.length; + if (first==num) { + return; // nothing to do - no category was added + } + // In Java 6, we could just do Arrays.copyOf()... 
+ int[] newarray = new int[num]; + System.arraycopy(prefetchParentOrdinal, 0, newarray, 0, + prefetchParentOrdinal.length); + prefetchParentOrdinal = newarray; + } + + // Read the new part of the parents array from the positions: + // TODO (Facet): avoid Multi*? + Bits deletedDocs = MultiFields.getDeletedDocs(indexReader); + DocsAndPositionsEnum positions = MultiFields.getTermPositionsEnum(indexReader, deletedDocs, + Consts.FIELD_PAYLOADS, new BytesRef(Consts.PAYLOAD_PARENT)); + if ((positions == null || positions.advance(first) == DocsAndPositionsEnum.NO_MORE_DOCS) && first < num) { + throw new CorruptIndexException("Missing parent data for category " + first); + } + for (int i=first; i= i (this is an + // invariant kept throughout this loop) + if (positions.docID()==i) { + if (positions.freq() == 0) { // shouldn't happen + throw new CorruptIndexException( + "Missing parent data for category "+i); + } + + // TODO (Facet): keep a local (non-volatile) copy of the prefetchParentOrdinal + // reference, because access to volatile reference is slower (?). + // Note: The positions we get here are one less than the position + // increment we added originally, so we get here the right numbers: + prefetchParentOrdinal[i] = positions.nextPosition(); + + if (positions.nextDoc() == DocsAndPositionsEnum.NO_MORE_DOCS) { + if ( i+1 < num ) { + throw new CorruptIndexException( + "Missing parent data for category "+(i+1)); + } + break; + } + } else { // this shouldn't happen + throw new CorruptIndexException( + "Missing parent data for category "+i); + } + } + } + + /** + * add() is used in LuceneTaxonomyWriter, not in LuceneTaxonomyReader. + * It is only called from a synchronized method, so it is not reentrant, + * and also doesn't need to worry about reads happening at the same time. + * + * NOTE: add() and refresh() CANNOT be used together. If you call add(), + * this changes the arrays and refresh() can no longer be used. + */ + void add(int ordinal, int parentOrdinal) throws IOException { + if (ordinal >= prefetchParentOrdinal.length) { + // grow the array, if necessary. + // In Java 6, we could just do Arrays.copyOf()... + int[] newarray = new int[ordinal*2+1]; + System.arraycopy(prefetchParentOrdinal, 0, newarray, 0, + prefetchParentOrdinal.length); + prefetchParentOrdinal = newarray; + } + prefetchParentOrdinal[ordinal] = parentOrdinal; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/lucene/package.html b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/lucene/package.html new file mode 100644 index 00000000000..e1ee3308179 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/lucene/package.html @@ -0,0 +1,9 @@ + + +Taxonomy implemented using a Lucene-Index + + +

Taxonomy implemented using a Lucene-Index

+ + + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/package.html b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/package.html new file mode 100644 index 00000000000..ab92496bd8c --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/package.html @@ -0,0 +1,32 @@ + + +Taxonomy of Categories + + +

Taxonomy of Categories

+ + Facets are defined using a hierarchy of categories, known as a + Taxonomy + +
+ For example, in a book store application, a Taxonomy could have the + following hierarchy: +

+ <ul>
+   <li>Author
+     <ul>
+       <li>Mark Twain</li>
+       <li>J. K. Rowling</li>
+     </ul>
+   </li>
+   <li>Date
+     <ul>
+       <li>2010</li>
+       <li>2009</li>
+     </ul>
+   </li>
+ </ul>
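+ <p>
+ The following sketch is illustrative only; it uses the taxonomy classes
+ added in this module and assumes an existing Directory named
+ <code>taxoDir</code>:
+ <pre>
+   TaxonomyWriter taxo = new LuceneTaxonomyWriter(taxoDir);
+   int ordinal = taxo.addCategory(new CategoryPath("Author", "Mark Twain"));
+   taxo.commit(); // make the new categories visible to readers
+   taxo.close();
+ </pre>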
+ + The Taxonomy translates category-paths into category-ordinal and vice versa. + + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/TaxonomyWriterCache.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/TaxonomyWriterCache.java new file mode 100644 index 00000000000..8d80ce1f91b --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/TaxonomyWriterCache.java @@ -0,0 +1,115 @@ +package org.apache.lucene.facet.taxonomy.writercache; + +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * TaxonomyWriterCache is a relatively simple interface for a cache of + * category->ordinal mappings, used in TaxonomyWriter implementations + * (such as {@link LuceneTaxonomyWriter}). + *

+ * It basically has put() methods for adding a mapping, and get() for looking
+ * a mapping up in the cache. The cache is not guaranteed to hold
+ * everything that has been put into it, and might in fact selectively
+ * delete some of the mappings (e.g., the ones least recently used).
+ * This means that if get() returns a negative response, it does not
+ * necessarily mean that the category doesn't exist - just that it is not
+ * in the cache. The caller can only infer that the category doesn't exist
+ * if it knows the cache to be complete (because all the categories were
+ * loaded into the cache, and since then no put() returned true).
+ *
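+ * <P>
+ * For example, a writer can combine these two pieces of information roughly
+ * like this (a sketch mirroring LuceneTaxonomyWriter.findCategory() in this
+ * patch):
+ * <pre>
+ *   int res = cache.get(categoryPath);
+ *   if (res >= 0) return res;        // found in the cache
+ *   if (cacheIsComplete) return -1;  // certainly not in the taxonomy
+ *   // otherwise, consult the on-disk index
+ * </pre>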

+ * However, if the cache does delete mappings, it should clear out large
+ * parts of the cache at once, because the user will typically need to work
+ * hard to recover from every cache cleanup (see
+ * {@link #put(CategoryPath, int)}'s return value).
+ *
+ * @lucene.experimental
+ */
+public interface TaxonomyWriterCache {
+
+ /**
+ * Let go of whatever resources the cache is holding. After a close(),
+ * this object can no longer be used.
+ */
+ public void close();
+
+ /**
+ * Lookup a category in the cache, returning its ordinal, or a negative
+ * number if the category is not in the cache.
+ *

+ * It is up to the caller to remember what a negative response means: + * If the caller knows the cache is complete (it was initially + * fed with all the categories, and since then put() never returned true) + * it means the category does not exist. Otherwise, the category might + * still exist, but just be missing from the cache. + */ + public int get(CategoryPath categoryPath); + + /** + * Like {@link #get(CategoryPath)}, but for a given prefix of the + * category path. + *

+ * If the given length is negative or bigger than the path's actual + * length, the full path is taken. + */ + public int get(CategoryPath categoryPath, int length); + + /** + * Add a category to the cache, with the given ordinal as the value. + *

+ * If the implementation keeps only a partial cache (e.g., an LRU cache) + * and finds that its cache is full, it should clear up part of the cache + * and return true. Otherwise, it should return + * false. + *

+ * The reason why the caller needs to know if part of the cache was + * cleared is that in that case it will have to commit its on-disk index + * (so that all the latest category additions can be searched on disk, if + * we can't rely on the cache to contain them). + *
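+ * <P>
+ * For example, a caller may react roughly as LuceneTaxonomyWriter.addToCache()
+ * in this patch does:
+ * <pre>
+ *   if (cache.put(categoryPath, ordinal)) {
+ *     refreshReader();          // entries may have been evicted
+ *     cacheIsComplete = false;  // the cache can no longer be trusted as complete
+ *   }
+ * </pre>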

+ * Ordinals should be non-negative. Currently there is no defined way to + * specify that a cache should remember a category does NOT exist. + * It doesn't really matter, because normally the next thing we do after + * finding that a category does not exist is to add it. + */ + public boolean put(CategoryPath categoryPath, int ordinal); + + /** + * Like {@link #put(CategoryPath, int)}, but for a given prefix of the + * category path. + *

+ * If the given length is negative or bigger than the path's actual
+ * length, the full path is taken.
+ */
+ public boolean put(CategoryPath categoryPath, int prefixLen, int ordinal);
+
+ /**
+ * Sometimes the cache is either unlimited in size, or limited to a very
+ * large size, and in that case when we add a lot of categories it might
+ * make sense to pre-load the cache with all the existing categories.
+ * However, this pre-load does not make sense when the allowed cache
+ * size is small. The hasRoom() method makes it possible to differentiate
+ * between these cases.
+ *
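+ * <P>
+ * A typical check before bulk-loading the cache (a sketch based on
+ * LuceneTaxonomyWriter.perhapsFillCache() in this patch):
+ * <pre>
+ *   if (cache.hasRoom(reader.numDocs())) {
+ *     // iterate over all categories on disk and put() them into the cache
+ *   }
+ * </pre>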

+ * After hasRoom(n) returned true, the following n put() + * should return false (meaning that the cache was not cleared). + */ + public boolean hasRoom(int numberOfEntries); + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CharBlockArray.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CharBlockArray.java new file mode 100644 index 00000000000..13e0112c3de --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CharBlockArray.java @@ -0,0 +1,195 @@ +package org.apache.lucene.facet.taxonomy.writercache.cl2o; + +import java.io.IOException; +import java.io.InputStream; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.OutputStream; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Similar to {@link StringBuilder}, but with a more efficient growing strategy. + * This class uses char array blocks to grow. 
+ * + * @lucene.experimental + */ +class CharBlockArray implements Appendable, Serializable, CharSequence { + + private static final long serialVersionUID = 1L; + + private final static int DefaultBlockSize = 32 * 1024; // 32 KB default size + + final static class Block implements Serializable, Cloneable { + private static final long serialVersionUID = 1L; + + char[] chars; + int length; + + Block(int size) { + this.chars = new char[size]; + this.length = 0; + } + } + + List blocks; + Block current; + int blockSize; + int length; + + CharBlockArray() { + this(DefaultBlockSize); + } + + CharBlockArray(int blockSize) { + this.blocks = new ArrayList(); + this.blockSize = blockSize; + addBlock(); + } + + private void addBlock() { + this.current = new Block(this.blockSize); + this.blocks.add(this.current); + } + + int blockIndex(int index) { + return index / blockSize; + } + + int indexInBlock(int index) { + return index % blockSize; + } + + public CharBlockArray append(CharSequence chars) { + return append(chars, 0, chars.length()); + } + + public CharBlockArray append(char c) { + if (this.current.length == this.blockSize) { + addBlock(); + } + this.current.chars[this.current.length++] = c; + this.length++; + + return this; + } + + public CharBlockArray append(CharSequence chars, int start, int length) { + int end = start + length; + for (int i = start; i < end; i++) { + append(chars.charAt(i)); + } + return this; + } + + public CharBlockArray append(char[] chars, int start, int length) { + int offset = start; + int remain = length; + while (remain > 0) { + if (this.current.length == this.blockSize) { + addBlock(); + } + int toCopy = remain; + int remainingInBlock = this.blockSize - this.current.length; + if (remainingInBlock < toCopy) { + toCopy = remainingInBlock; + } + System.arraycopy(chars, offset, this.current.chars, this.current.length, toCopy); + offset += toCopy; + remain -= toCopy; + this.current.length += toCopy; + } + + this.length += length; + return this; + } + + public CharBlockArray append(String s) { + int remain = s.length(); + int offset = 0; + while (remain > 0) { + if (this.current.length == this.blockSize) { + addBlock(); + } + int toCopy = remain; + int remainingInBlock = this.blockSize - this.current.length; + if (remainingInBlock < toCopy) { + toCopy = remainingInBlock; + } + s.getChars(offset, offset + toCopy, this.current.chars, this.current.length); + offset += toCopy; + remain -= toCopy; + this.current.length += toCopy; + } + + this.length += s.length(); + return this; + } + + public char charAt(int index) { + Block b = this.blocks.get(blockIndex(index)); + return b.chars[indexInBlock(index)]; + } + + public int length() { + return this.length; + } + + public CharSequence subSequence(int start, int end) { + throw new UnsupportedOperationException("subsequence not implemented yet"); + } + + @Override + public String toString() { + StringBuilder b = new StringBuilder(blockSize * this.blocks.size()); + for (int i = 0; i < this.blocks.size(); i++) { + b.append(this.blocks.get(i).chars); + } + return b.toString(); + } + + void flush(OutputStream out) throws IOException { + ObjectOutputStream oos = null; + try { + oos = new ObjectOutputStream(out); + oos.writeObject(this); + oos.flush(); + } finally { + if (oos != null) { + oos.close(); + } + } + } + + public static CharBlockArray open(InputStream in) throws IOException, ClassNotFoundException { + ObjectInputStream ois = null; + try { + ois = new ObjectInputStream(in); + CharBlockArray a = (CharBlockArray) 
ois.readObject(); + return a; + } finally { + if (ois != null) { + ois.close(); + } + } + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/Cl2oTaxonomyWriterCache.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/Cl2oTaxonomyWriterCache.java new file mode 100644 index 00000000000..a9822f945ee --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/Cl2oTaxonomyWriterCache.java @@ -0,0 +1,82 @@ +package org.apache.lucene.facet.taxonomy.writercache.cl2o; + +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * {@link TaxonomyWriterCache} using {@link CompactLabelToOrdinal}. Although + * called cache, it maintains in memory all the mappings from category to + * ordinal, relying on that {@link CompactLabelToOrdinal} is an efficient + * mapping for this purpose. + * + * @lucene.experimental + */ +public class Cl2oTaxonomyWriterCache implements TaxonomyWriterCache { + + private CompactLabelToOrdinal cache; + + public Cl2oTaxonomyWriterCache(int initialCapcity, float loadFactor, int numHashArrays) { + this.cache = new CompactLabelToOrdinal(initialCapcity, loadFactor, numHashArrays); + } + + public void close() { + cache=null; + } + + public boolean hasRoom(int n) { + // This cache is unlimited, so we always have room for remembering more: + return true; + } + + public int get(CategoryPath categoryPath) { + return cache.getOrdinal(categoryPath); + } + + public int get(CategoryPath categoryPath, int length) { + if (length<0 || length>categoryPath.length()) { + length = categoryPath.length(); + } + return cache.getOrdinal(categoryPath, length); + } + + public boolean put(CategoryPath categoryPath, int ordinal) { + cache.addLabel(categoryPath, ordinal); + // Tell the caller we didn't clear part of the cache, so it doesn't + // have to flush its on-disk index now + return false; + } + + public boolean put(CategoryPath categoryPath, int prefixLen, int ordinal) { + cache.addLabel(categoryPath, prefixLen, ordinal); + // Tell the caller we didn't clear part of the cache, so it doesn't + // have to flush its on-disk index now + return false; + } + + /** + * Returns the number of bytes in memory used by this object. + * @return Number of bytes in memory used by this object. + */ + public int getMemoryUsage() { + int memoryUsage = (this.cache == null) ? 
0 : this.cache.getMemoryUsage(); + return memoryUsage; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CollisionMap.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CollisionMap.java new file mode 100644 index 00000000000..247de7259d4 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CollisionMap.java @@ -0,0 +1,267 @@ +package org.apache.lucene.facet.taxonomy.writercache.cl2o; + +import java.io.IOException; +import java.util.Iterator; +import java.util.NoSuchElementException; + +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * HashMap to store colliding labels. See {@link CompactLabelToOrdinal} for + * details. + * + * @lucene.experimental + */ +public class CollisionMap { + + private int capacity; + private float loadFactor; + private int size; + private int threshold; + + static class Entry { + int offset; + int cid; + Entry next; + int hash; + + Entry(int offset, int cid, int h, Entry e) { + this.offset = offset; + this.cid = cid; + this.next = e; + this.hash = h; + } + } + + private CharBlockArray labelRepository; + + private Entry[] entries; + + public CollisionMap(CharBlockArray labelRepository) { + this(16 * 1024, 0.75f, labelRepository); + } + + public CollisionMap(int initialCapacity, CharBlockArray labelRepository) { + this(initialCapacity, 0.75f, labelRepository); + } + + private CollisionMap(int initialCapacity, float loadFactor, CharBlockArray labelRepository) { + this.labelRepository = labelRepository; + this.loadFactor = loadFactor; + this.capacity = CompactLabelToOrdinal.determineCapacity(2, initialCapacity); + + this.entries = new Entry[this.capacity]; + this.threshold = (int) (this.capacity * this.loadFactor); + } + + public int size() { + return this.size; + } + + public int capacity() { + return this.capacity; + } + + private void grow() { + int newCapacity = this.capacity * 2; + Entry[] newEntries = new Entry[newCapacity]; + Entry[] src = this.entries; + + for (int j = 0; j < src.length; j++) { + Entry e = src[j]; + if (e != null) { + src[j] = null; + do { + Entry next = e.next; + int hash = e.hash; + int i = indexFor(hash, newCapacity); + e.next = newEntries[i]; + newEntries[i] = e; + e = next; + } while (e != null); + } + } + + this.capacity = newCapacity; + this.entries = newEntries; + this.threshold = (int) (this.capacity * this.loadFactor); + } + + public int get(CategoryPath label, int hash) { + int bucketIndex = indexFor(hash, this.capacity); + Entry e = this.entries[bucketIndex]; + + while (e != null && !(hash == e.hash && label.equalsToSerialized(this.labelRepository, e.offset))) { + e = e.next; + } + if (e == null) { + 
return LabelToOrdinal.InvalidOrdinal; + } + + return e.cid; + } + + public int get(CategoryPath label, int prefixLen, int hash) { + int bucketIndex = indexFor(hash, this.capacity); + Entry e = this.entries[bucketIndex]; + + while (e != null && !(hash == e.hash && label.equalsToSerialized(prefixLen, this.labelRepository, e.offset))) { + e = e.next; + } + if (e == null) { + return LabelToOrdinal.InvalidOrdinal; + } + + return e.cid; + } + + public int addLabel(CategoryPath label, int hash, int cid) { + int bucketIndex = indexFor(hash, this.capacity); + for (Entry e = this.entries[bucketIndex]; e != null; e = e.next) { + if (e.hash == hash && label.equalsToSerialized(this.labelRepository, e.offset)) { + return e.cid; + } + } + + // new string; add to label repository + int offset = this.labelRepository.length(); + try { + label.serializeAppendTo(labelRepository); + } catch (IOException e) { + // can't happen, because labelRepository.append() doesn't throw an exception + } + + addEntry(offset, cid, hash, bucketIndex); + return cid; + } + + public int addLabel(CategoryPath label, int prefixLen, int hash, int cid) { + int bucketIndex = indexFor(hash, this.capacity); + for (Entry e = this.entries[bucketIndex]; e != null; e = e.next) { + if (e.hash == hash && label.equalsToSerialized(prefixLen, this.labelRepository, e.offset)) { + return e.cid; + } + } + + // new string; add to label repository + int offset = this.labelRepository.length(); + try { + label.serializeAppendTo(prefixLen, labelRepository); + } catch (IOException e) { + // can't happen, because labelRepository.append() doesn't throw an exception + } + + addEntry(offset, cid, hash, bucketIndex); + return cid; + } + + /** + * This method does not check if the same value is already + * in the map because we pass in an char-array offset, so + * so we now that we're in resize-mode here. + */ + public void addLabelOffset(int hash, int offset, int cid) { + int bucketIndex = indexFor(hash, this.capacity); + addEntry(offset, cid, hash, bucketIndex); + } + + private void addEntry(int offset, int cid, int hash, int bucketIndex) { + Entry e = this.entries[bucketIndex]; + this.entries[bucketIndex] = new Entry(offset, cid, hash, e); + if (this.size++ >= this.threshold) { + grow(); + } + } + + Iterator entryIterator() { + return new EntryIterator(entries, size); + } + + /** + * Returns index for hash code h. + */ + static int indexFor(int h, int length) { + return h & (length - 1); + } + + /** + * Returns an estimate of the memory usage of this CollisionMap. + * @return The approximate number of bytes used by this structure. 
+ */ + int getMemoryUsage() { + int memoryUsage = 0; + if (this.entries != null) { + for (Entry e : this.entries) { + if (e != null) { + memoryUsage += (4 * 4); + for (Entry ee = e.next; ee != null; ee = ee.next) { + memoryUsage += (4 * 4); + } + } + } + } + return memoryUsage; + } + + private class EntryIterator implements Iterator { + Entry next; // next entry to return + int index; // current slot + Entry[] ents; + + EntryIterator(Entry[] entries, int size) { + this.ents = entries; + Entry[] t = entries; + int i = t.length; + Entry n = null; + if (size != 0) { // advance to first entry + while (i > 0 && (n = t[--i]) == null) { + // advance + } + } + this.next = n; + this.index = i; + } + + public boolean hasNext() { + return this.next != null; + } + + public Entry next() { + Entry e = this.next; + if (e == null) throw new NoSuchElementException(); + + Entry n = e.next; + Entry[] t = ents; + int i = this.index; + while (n == null && i > 0) { + n = t[--i]; + } + this.index = i; + this.next = n; + return e; + } + + public void remove() { + throw new UnsupportedOperationException(); + } + + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CompactLabelToOrdinal.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CompactLabelToOrdinal.java new file mode 100644 index 00000000000..a33002114a2 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/CompactLabelToOrdinal.java @@ -0,0 +1,572 @@ +package org.apache.lucene.facet.taxonomy.writercache.cl2o; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.Iterator; + +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This is a very efficient LabelToOrdinal implementation that uses a + * CharBlockArray to store all labels and a configurable number of HashArrays to + * reference the labels. + *

+ * Since the HashArrays don't handle collisions, a {@link CollisionMap} is used + * to store the colliding labels. + *

+ * This data structure grows by adding a new HashArray whenever the number of + * collisions in the {@link CollisionMap} exceeds {@code loadFactor} * + * {@link #getMaxOrdinal()}. Growing also includes reinserting all colliding + * labels into the HashArrays to possibly reduce the number of collisions. + * + * For setting the {@code loadFactor} see + * {@link #CompactLabelToOrdinal(int, float, int)}. + * + *
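+ * <P>
+ * A hypothetical usage sketch (the capacity and number of hash arrays are
+ * illustrative values only):
+ * <pre>
+ *   LabelToOrdinal map = new CompactLabelToOrdinal(1024,
+ *       CompactLabelToOrdinal.DefaultLoadFactor, 3);
+ *   map.addLabel(new CategoryPath("Author", "Mark Twain"), map.getNextOrdinal());
+ *   int ord = map.getOrdinal(new CategoryPath("Author", "Mark Twain"));
+ * </pre>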

+ * This data structure has a much lower memory footprint (~30%) compared to a + * Java HashMap. It also only uses a small fraction of objects + * a HashMap would use, thus limiting the GC overhead. Ingestion speed was also + * ~50% faster compared to a HashMap for 3M unique labels. + * + * @lucene.experimental + */ +public class CompactLabelToOrdinal extends LabelToOrdinal { + + public static final float DefaultLoadFactor = 0.15f; + + static final char TerminatorChar = 0xffff; + private static final int Collision = -5; + + private HashArray[] hashArrays; + private CollisionMap collisionMap; + private CharBlockArray labelRepository; + + private int capacity; + private int threshold; + private float loadFactor; + + public int sizeOfMap() { + return this.collisionMap.size(); + } + + private CompactLabelToOrdinal() { + } + + public CompactLabelToOrdinal(int initialCapacity, float loadFactor, + int numHashArrays) { + + this.hashArrays = new HashArray[numHashArrays]; + + this.capacity = determineCapacity((int) Math.pow(2, numHashArrays), + initialCapacity); + init(); + this.collisionMap = new CollisionMap(this.labelRepository); + + this.counter = 0; + this.loadFactor = loadFactor; + + this.threshold = (int) (this.loadFactor * this.capacity); + } + + static int determineCapacity(int minCapacity, int initialCapacity) { + int capacity = minCapacity; + while (capacity < initialCapacity) { + capacity <<= 1; + } + return capacity; + } + + private void init() { + labelRepository = new CharBlockArray(); + try { + new CategoryPath().serializeAppendTo(labelRepository); + } catch (IOException e) { } //can't happen + + int c = this.capacity; + for (int i = 0; i < this.hashArrays.length; i++) { + this.hashArrays[i] = new HashArray(c); + c /= 2; + } + } + + @Override + public void addLabel(CategoryPath label, int ordinal) { + if (this.collisionMap.size() > this.threshold) { + grow(); + } + + int hash = CompactLabelToOrdinal.stringHashCode(label); + for (int i = 0; i < this.hashArrays.length; i++) { + if (addLabel(this.hashArrays[i], label, hash, ordinal)) { + return; + } + } + + int prevVal = this.collisionMap.addLabel(label, hash, ordinal); + if (prevVal != ordinal) { + throw new IllegalArgumentException("Label already exists: " + + label.toString('/') + " prev ordinal " + prevVal); + } + } + + @Override + public void addLabel(CategoryPath label, int prefixLen, int ordinal) { + if (this.collisionMap.size() > this.threshold) { + grow(); + } + + int hash = CompactLabelToOrdinal.stringHashCode(label, prefixLen); + for (int i = 0; i < this.hashArrays.length; i++) { + if (addLabel(this.hashArrays[i], label, prefixLen, hash, ordinal)) { + return; + } + } + + int prevVal = this.collisionMap.addLabel(label, prefixLen, hash, ordinal); + if (prevVal != ordinal) { + throw new IllegalArgumentException("Label already exists: " + + label.toString('/', prefixLen) + " prev ordinal " + prevVal); + } + } + + @Override + public int getOrdinal(CategoryPath label) { + if (label == null) { + return LabelToOrdinal.InvalidOrdinal; + } + + int hash = CompactLabelToOrdinal.stringHashCode(label); + for (int i = 0; i < this.hashArrays.length; i++) { + int ord = getOrdinal(this.hashArrays[i], label, hash); + if (ord != Collision) { + return ord; + } + } + + return this.collisionMap.get(label, hash); + } + + @Override + public int getOrdinal(CategoryPath label, int prefixLen) { + if (label == null) { + return LabelToOrdinal.InvalidOrdinal; + } + + int hash = CompactLabelToOrdinal.stringHashCode(label, prefixLen); + for (int i = 0; i < 
this.hashArrays.length; i++) { + int ord = getOrdinal(this.hashArrays[i], label, prefixLen, hash); + if (ord != Collision) { + return ord; + } + } + + return this.collisionMap.get(label, prefixLen, hash); + } + + private void grow() { + HashArray temp = this.hashArrays[this.hashArrays.length - 1]; + + for (int i = this.hashArrays.length - 1; i > 0; i--) { + this.hashArrays[i] = this.hashArrays[i - 1]; + } + + this.capacity *= 2; + this.hashArrays[0] = new HashArray(this.capacity); + + for (int i = 1; i < this.hashArrays.length; i++) { + int[] sourceOffsetArray = this.hashArrays[i].offsets; + int[] sourceCidsArray = this.hashArrays[i].cids; + + for (int k = 0; k < sourceOffsetArray.length; k++) { + + for (int j = 0; j < i && sourceOffsetArray[k] != 0; j++) { + int[] targetOffsetArray = this.hashArrays[j].offsets; + int[] targetCidsArray = this.hashArrays[j].cids; + + int newIndex = indexFor(stringHashCode( + this.labelRepository, sourceOffsetArray[k]), + targetOffsetArray.length); + if (targetOffsetArray[newIndex] == 0) { + targetOffsetArray[newIndex] = sourceOffsetArray[k]; + targetCidsArray[newIndex] = sourceCidsArray[k]; + sourceOffsetArray[k] = 0; + } + } + } + } + + for (int i = 0; i < temp.offsets.length; i++) { + int offset = temp.offsets[i]; + if (offset > 0) { + int hash = stringHashCode(this.labelRepository, offset); + addLabelOffset(hash, temp.cids[i], offset); + } + } + + CollisionMap oldCollisionMap = this.collisionMap; + this.collisionMap = new CollisionMap(oldCollisionMap.capacity(), + this.labelRepository); + this.threshold = (int) (this.capacity * this.loadFactor); + + Iterator it = oldCollisionMap.entryIterator(); + while (it.hasNext()) { + CollisionMap.Entry e = it.next(); + addLabelOffset(stringHashCode(this.labelRepository, e.offset), + e.cid, e.offset); + } + } + + private boolean addLabel(HashArray a, CategoryPath label, int hash, + int ordinal) { + int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length); + int offset = a.offsets[index]; + + if (offset == 0) { + a.offsets[index] = this.labelRepository.length(); + try { + label.serializeAppendTo(this.labelRepository); + } catch (IOException e) { + // can't happen - LabelRepository.append() never throws an + // exception + } + a.cids[index] = ordinal; + return true; + } + + return false; + } + + private boolean addLabel(HashArray a, CategoryPath label, int prefixLen, + int hash, int ordinal) { + int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length); + int offset = a.offsets[index]; + + if (offset == 0) { + a.offsets[index] = this.labelRepository.length(); + try { + label.serializeAppendTo(prefixLen, this.labelRepository); + } catch (IOException e) { + // can't happen - LabelRepository.append() never throws an + // exception + } + a.cids[index] = ordinal; + return true; + } + + return false; + } + + private void addLabelOffset(int hash, int cid, int knownOffset) { + for (int i = 0; i < this.hashArrays.length; i++) { + if (addLabelOffsetToHashArray(this.hashArrays[i], hash, cid, + knownOffset)) { + return; + } + } + + this.collisionMap.addLabelOffset(hash, knownOffset, cid); + + if (this.collisionMap.size() > this.threshold) { + grow(); + } + } + + private boolean addLabelOffsetToHashArray(HashArray a, int hash, int ordinal, + int knownOffset) { + + int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length); + int offset = a.offsets[index]; + + if (offset == 0) { + a.offsets[index] = knownOffset; + a.cids[index] = ordinal; + return true; + } + + return false; + } + + private int 
getOrdinal(HashArray a, CategoryPath label, int hash) { + if (label == null) { + return LabelToOrdinal.InvalidOrdinal; + } + + int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length); + int offset = a.offsets[index]; + if (offset == 0) { + return LabelToOrdinal.InvalidOrdinal; + } + + if (label.equalsToSerialized(labelRepository, offset)) { + return a.cids[index]; + } + + return Collision; + } + + private int getOrdinal(HashArray a, CategoryPath label, int prefixLen, int hash) { + if (label == null) { + return LabelToOrdinal.InvalidOrdinal; + } + + int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length); + int offset = a.offsets[index]; + if (offset == 0) { + return LabelToOrdinal.InvalidOrdinal; + } + + if (label.equalsToSerialized(prefixLen, labelRepository, offset)) { + return a.cids[index]; + } + + return Collision; + } + + /** + * Returns index for hash code h. + */ + static int indexFor(int h, int length) { + return h & (length - 1); + } + + // static int stringHashCode(String label) { + // int len = label.length(); + // int hash = 0; + // int i; + // for (i = 0; i < len; ++i) + // hash = 33 * hash + label.charAt(i); + // + // hash = hash ^ ((hash >>> 20) ^ (hash >>> 12)); + // hash = hash ^ (hash >>> 7) ^ (hash >>> 4); + // + // return hash; + // + // } + + static int stringHashCode(CategoryPath label) { + int hash = label.hashCode(); + + hash = hash ^ ((hash >>> 20) ^ (hash >>> 12)); + hash = hash ^ (hash >>> 7) ^ (hash >>> 4); + + return hash; + + } + + static int stringHashCode(CategoryPath label, int prefixLen) { + int hash = label.hashCode(prefixLen); + + hash = hash ^ ((hash >>> 20) ^ (hash >>> 12)); + hash = hash ^ (hash >>> 7) ^ (hash >>> 4); + + return hash; + + } + + static int stringHashCode(CharBlockArray labelRepository, int offset) { + int hash = CategoryPath.hashCodeOfSerialized(labelRepository, offset); + + hash = hash ^ ((hash >>> 20) ^ (hash >>> 12)); + hash = hash ^ (hash >>> 7) ^ (hash >>> 4); + + return hash; + } + + // public static boolean equals(CharSequence label, CharBlockArray array, + // int offset) { + // // CONTINUE HERE + // int len = label.length(); + // int bi = array.blockIndex(offset); + // CharBlockArray.Block b = array.blocks.get(bi); + // int index = array.indexInBlock(offset); + // + // for (int i = 0; i < len; i++) { + // if (label.charAt(i) != b.chars[index]) { + // return false; + // } + // index++; + // if (index == b.length) { + // b = array.blocks.get(++bi); + // index = 0; + // } + // } + // + // return b.chars[index] == TerminatorChar; + // } + + /** + * Returns an estimate of the amount of memory used by this table. Called only in + * this package. Memory is consumed mainly by three structures: the hash arrays, + * label repository and collision map. + */ + int getMemoryUsage() { + int memoryUsage = 0; + if (this.hashArrays != null) { + // HashArray capacity is instance-specific. + for (HashArray ha : this.hashArrays) { + // Each has 2 capacity-length arrays of ints. + memoryUsage += ( ha.capacity * 2 * 4 ) + 4; + } + } + if (this.labelRepository != null) { + // All blocks are the same size. + int blockSize = this.labelRepository.blockSize; + // Each block has room for blockSize UTF-16 chars. + int actualBlockSize = ( blockSize * 2 ) + 4; + memoryUsage += this.labelRepository.blocks.size() * actualBlockSize; + memoryUsage += 8; // Two int values for array as a whole. 
+ } + if (this.collisionMap != null) { + memoryUsage += this.collisionMap.getMemoryUsage(); + } + return memoryUsage; + } + + /** + * Opens the file and reloads the CompactLabelToOrdinal. The file it expects + * is generated from the {@link #flush()} command. + */ + static CompactLabelToOrdinal open(File file, float loadFactor, + int numHashArrays) throws IOException { + /** + * Part of the file is the labelRepository, which needs to be rehashed + * and label offsets re-added to the object. I am unsure as to why we + * can't just store these off in the file as well, but in keeping with + * the spirit of the original code, I did it this way. (ssuppe) + */ + CompactLabelToOrdinal l2o = new CompactLabelToOrdinal(); + l2o.loadFactor = loadFactor; + l2o.hashArrays = new HashArray[numHashArrays]; + + DataInputStream dis = null; + try { + dis = new DataInputStream(new BufferedInputStream( + new FileInputStream(file))); + + // TaxiReader needs to load the "counter" or occupancy (L2O) to know + // the next unique facet. we used to load the delimiter too, but + // never used it. + l2o.counter = dis.readInt(); + + l2o.capacity = determineCapacity((int) Math.pow(2, + l2o.hashArrays.length), l2o.counter); + l2o.init(); + + // now read the chars + l2o.labelRepository = CharBlockArray.open(dis); + + l2o.collisionMap = new CollisionMap(l2o.labelRepository); + + // Calculate hash on the fly based on how CategoryPath hashes + // itself. Maybe in the future we can call some static based methods + // in CategoryPath so that this doesn't break again? I don't like + // having code in two different places... + int cid = 0; + // Skip the initial offset, it's the CategoryPath(0,0), which isn't + // a hashed value. + int offset = 1; + int lastStartOffset = offset; + // This loop really relies on a well-formed input (assumes pretty blindly + // that array offsets will work). Since the initial file is machine + // generated, I think this should be OK. + while (offset < l2o.labelRepository.length()) { + // First component is numcomponents, so we initialize the hash + // to this + int ncomponents = l2o.labelRepository.charAt(offset++); + int hash = ncomponents; + // If ncomponents is 0, then we are done? + if (ncomponents != 0) { + + // usedchars is always the last member of the 'ends' array + // in serialization. Rather than rebuild the entire array, + // assign usedchars to the last value we read in. This will + // be slightly more memory efficient. + int usedchars = 0; + for (int i = 0; i < ncomponents; i++) { + usedchars = l2o.labelRepository.charAt(offset++); + hash = hash * 31 + usedchars; + } + // Hash the usedchars for this label + for (int i = 0; i < usedchars; i++) { + hash = hash * 31 + l2o.labelRepository.charAt(offset++); + } + } + // Now that we've hashed the components of the label, do the + // final part of the hash algorithm. + hash = hash ^ ((hash >>> 20) ^ (hash >>> 12)); + hash = hash ^ (hash >>> 7) ^ (hash >>> 4); + // Add the label, and let's keep going + l2o.addLabelOffset(hash, cid, lastStartOffset); + cid++; + lastStartOffset = offset; + } + + } catch (ClassNotFoundException cnfe) { + throw new IOException("Invalid file format. 
Cannot deserialize."); + } finally { + if (dis != null) { + dis.close(); + } + } + + l2o.threshold = (int) (l2o.loadFactor * l2o.capacity); + return l2o; + + } + + void flush(File file) throws IOException { + FileOutputStream fos = new FileOutputStream(file); + + try { + BufferedOutputStream os = new BufferedOutputStream(fos); + + DataOutputStream dos = new DataOutputStream(os); + dos.writeInt(this.counter); + + // write the labelRepository + this.labelRepository.flush(dos); + + // Closes the data output stream + dos.close(); + + } finally { + fos.close(); + } + } + + private static final class HashArray { + int[] offsets; + int[] cids; + + int capacity; + + HashArray(int c) { + this.capacity = c; + this.offsets = new int[this.capacity]; + this.cids = new int[this.capacity]; + } + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/LabelToOrdinal.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/LabelToOrdinal.java new file mode 100644 index 00000000000..fc7fc790d5a --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/LabelToOrdinal.java @@ -0,0 +1,73 @@ +package org.apache.lucene.facet.taxonomy.writercache.cl2o; + +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Abstract class for storing Label->Ordinal mappings in a taxonomy. + * + * @lucene.experimental + */ +public abstract class LabelToOrdinal { + + protected int counter; + public static final int InvalidOrdinal = -2; + + /** + * return the maximal Ordinal assigned so far + */ + public int getMaxOrdinal() { + return this.counter; + } + + /** + * Returns the next unassigned ordinal. The default behavior of this method + * is to simply increment a counter. + */ + public int getNextOrdinal() { + return this.counter++; + } + + /** + * Adds a new label if its not yet in the table. + * Throws an {@link IllegalArgumentException} if the same label with + * a different ordinal was previoulsy added to this table. + */ + public abstract void addLabel(CategoryPath label, int ordinal); + + /** + * Adds a new label if its not yet in the table. + * Throws an {@link IllegalArgumentException} if the same label with + * a different ordinal was previoulsy added to this table. + */ + public abstract void addLabel(CategoryPath label, int prefixLen, int ordinal); + + /** + * @return the ordinal assigned to the given label, + * or {@link #InvalidOrdinal} if the label cannot be found in this table. + */ + public abstract int getOrdinal(CategoryPath label); + + /** + * @return the ordinal assigned to the given label, + * or {@link #InvalidOrdinal} if the label cannot be found in this table. 
+ */ + public abstract int getOrdinal(CategoryPath label, int prefixLen); + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/package.html b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/package.html new file mode 100644 index 00000000000..1b1a3a6d8b8 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/cl2o/package.html @@ -0,0 +1,11 @@ + + +Category->Ordinal caching implementation using an optimized data-structures + + +

Category->Ordinal caching implementation using optimized data structures
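As an orientation aid only (an editor-added sketch, not part of the patch), the lookup-or-assign cycle supported by a LabelToOrdinal implementation such as CompactLabelToOrdinal above looks roughly like this, using only the methods declared earlier in this patch:

    import org.apache.lucene.facet.taxonomy.CategoryPath;
    import org.apache.lucene.facet.taxonomy.writercache.cl2o.LabelToOrdinal;

    class Cl2oSketch {
      // Resolve a label's ordinal, assigning the next free one on a miss.
      static int resolveOrdinal(LabelToOrdinal map, CategoryPath label) {
        int ordinal = map.getOrdinal(label);
        if (ordinal == LabelToOrdinal.InvalidOrdinal) { // -2: label not mapped yet
          ordinal = map.getNextOrdinal();               // by default just increments a counter
          map.addLabel(label, ordinal);                 // throws if the label already has a different ordinal
        }
        return ordinal;
      }
    }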

+ + The internal map data structure consumes less memory (~30%) and is faster (~50%) compared to a + Java HashMap<String, Integer>. + + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/lru/LruTaxonomyWriterCache.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/lru/LruTaxonomyWriterCache.java new file mode 100644 index 00000000000..ecd05555432 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/lru/LruTaxonomyWriterCache.java @@ -0,0 +1,123 @@ +package org.apache.lucene.facet.taxonomy.writercache.lru; + +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * LRU {@link TaxonomyWriterCache} - good choice for huge taxonomies. + * + * @lucene.experimental + */ +public class LruTaxonomyWriterCache implements TaxonomyWriterCache { + + public enum LRUType { LRU_HASHED, LRU_STRING } + + private NameIntCacheLRU cache; + + public LruTaxonomyWriterCache(int cacheSize) { + // TODO (Facet): choose between NameHashIntCacheLRU and NameIntCacheLRU. + // For guaranteed correctness - not relying on no-collisions in the hash + // function, NameIntCacheLRU should be used: + // On the other hand, NameHashIntCacheLRU takes less RAM but if there + // are collisions (which we never found) two different paths would be + // mapped to the same ordinal... + this(cacheSize, LRUType.LRU_HASHED); + } + + public LruTaxonomyWriterCache(int cacheSize, LRUType lruType) { + // TODO (Facet): choose between NameHashIntCacheLRU and NameIntCacheLRU. + // For guaranteed correctness - not relying on no-collisions in the hash + // function, NameIntCacheLRU should be used: + // On the other hand, NameHashIntCacheLRU takes less RAM but if there + // are collisions (which we never found) two different paths would be + // mapped to the same ordinal... + if (lruType == LRUType.LRU_HASHED) { + this.cache = new NameHashIntCacheLRU(cacheSize); + } else { + this.cache = new NameIntCacheLRU(cacheSize); + } + } + + public boolean hasRoom(int n) { + return n<=(cache.getMaxSize()-cache.getSize()); + } + + public void close() { + cache.clear(); + cache=null; + } + + public int get(CategoryPath categoryPath) { + Integer res = cache.get(categoryPath); + if (res == null) { + return -1; + } + + return res.intValue(); + } + + public int get(CategoryPath categoryPath, int length) { + if (length<0 || length>categoryPath.length()) { + length = categoryPath.length(); + } + // TODO (Facet): unfortunately, we make a copy here! we can avoid part of + // the copy by creating a wrapper object (but this still creates a new + // object). 
A better implementation of the cache would not use Java's + // hash table, but rather some other hash table we can control, and + // pass the length parameter into it... + Integer res = cache.get(new CategoryPath(categoryPath, length)); + if (res==null) { + return -1; + } + return res.intValue(); + } + + public boolean put(CategoryPath categoryPath, int ordinal) { + boolean ret = cache.put(categoryPath, new Integer(ordinal)); + // If the cache is full, we need to clear one or more old entries + // from the cache. However, if we delete from the cache a recent + // addition that isn't yet in our reader, for this entry to be + // visible to us we need to make sure that the changes have been + // committed and we reopen the reader. Because this is a slow + // operation, we don't delete entries one-by-one but rather in bulk + // (put() removes the 2/3rd oldest entries). + if (ret) { + cache.makeRoomLRU(); + } + return ret; + } + + public boolean put(CategoryPath categoryPath, int prefixLen, int ordinal) { + boolean ret = cache.put(categoryPath, prefixLen, new Integer(ordinal)); + // If the cache is full, we need to clear one or more old entries + // from the cache. However, if we delete from the cache a recent + // addition that isn't yet in our reader, for this entry to be + // visible to us we need to make sure that the changes have been + // committed and we reopen the reader. Because this is a slow + // operation, we don't delete entries one-by-one but rather in bulk + // (put() removes the 2/3rd oldest entries). + if (ret) { + cache.makeRoomLRU(); + } + return ret; + } + +} + diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/lru/NameHashIntCacheLRU.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/lru/NameHashIntCacheLRU.java new file mode 100644 index 00000000000..f07643588f0 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/lru/NameHashIntCacheLRU.java @@ -0,0 +1,46 @@ +package org.apache.lucene.facet.taxonomy.writercache.lru; + +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An an LRU cache of mapping from name to int. + * Used to cache Ordinals of category paths. + * It uses as key, hash of the path instead of the path. + * This way the cahce takes less RAM, but correctness depends on + * assuming no collisions. 
+ * + * @lucene.experimental + */ +public class NameHashIntCacheLRU extends NameIntCacheLRU { + + NameHashIntCacheLRU(int maxCacheSize) { + super(maxCacheSize); + } + + @Override + Object key(CategoryPath name) { + return new Long(name.longHashCode()); + } + + @Override + Object key(CategoryPath name, int prefixLen) { + return new Long(name.longHashCode(prefixLen)); + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/lru/NameIntCacheLRU.java b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/lru/NameIntCacheLRU.java new file mode 100644 index 00000000000..baecdbb3847 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/lru/NameIntCacheLRU.java @@ -0,0 +1,144 @@ +package org.apache.lucene.facet.taxonomy.writercache.lru; + +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedHashMap; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An an LRU cache of mapping from name to int. + * Used to cache Ordinals of category paths. + * + * @lucene.experimental + */ +// Note: Nothing in this class is synchronized. The caller is assumed to be +// synchronized so that no two methods of this class are called concurrently. +class NameIntCacheLRU { + + private HashMap cache; + long nMisses = 0; // for debug + long nHits = 0; // for debug + private int maxCacheSize; + + NameIntCacheLRU(int maxCacheSize) { + this.maxCacheSize = maxCacheSize; + createCache(maxCacheSize); + } + + public int getMaxSize() { + return maxCacheSize; + } + + public int getSize() { + return cache.size(); + } + + private void createCache (int maxSize) { + if (maxSize(1000,(float)0.7,true); //for LRU + } else { + cache = new HashMap(1000,(float)0.7); //no need for LRU + } + } + + Integer get (CategoryPath name) { + Integer res = cache.get(key(name)); + if (res==null) { + nMisses ++; + } else { + nHits ++; + } + return res; + } + + /** + * Subclasses can override this to provide caching by e.g. hash of the string. + * @param name + * @return + */ + Object key(CategoryPath name) { + // Note that a copy constructor (cloning) here is necessary, because a + // CategoryPath object is mutable, so we cannot save a reference to an + // existing CategoryPath. Subclasses which override this method can + // avoid this cloning by, e.g., hashing the name. + return new CategoryPath(name); + } + + Object key(CategoryPath name, int prefixLen) { + // Note that a copy constructor (cloning) here is necessary, because a + // CategoryPath object is mutable, so we cannot save a reference to an + // existing CategoryPath. Subclasses which override this method can + // avoid this cloning by, e.g., hashing the name. 
+ return new CategoryPath(name, prefixLen); + } + + /** + * Add a new value to cache. + * Return true if cache became full and some room need to be made. + */ + boolean put (CategoryPath name, Integer val) { + cache.put(key(name), val); + return isCacheFull(); + } + + boolean put (CategoryPath name, int prefixLen, Integer val) { + cache.put(key(name, prefixLen), val); + return isCacheFull(); + } + + private boolean isCacheFull() { + return (cache.size()>maxCacheSize); + } + + void clear() { + cache.clear(); + } + + String stats() { + return "#miss="+nMisses+" #hit="+nHits; + } + + /** + * If cache is full remove least recently used entries from cache. + * Return true if anything was removed, false otherwise. + * + * See comment in {@link LuceneTaxonomyWriter#addToCache(String, Integer)} + * for an explanation why we clean 2/3rds of the cache, and not just one + * entry. + */ + boolean makeRoomLRU() { + if (!isCacheFull()) { + return false; + } + int n = cache.size() - (2*maxCacheSize)/3; + if (n<=0) { + return false; + } + Iterator it = cache.keySet().iterator(); + int i = 0; + while (i + +An LRU cache implementation for the CategoryPath to Ordinal map + + +

An LRU cache implementation for the CategoryPath to Ordinal map
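By way of illustration (an editor-added sketch, not from the patch; the CategoryPath String-varargs constructor is assumed), the cache in this package is typically constructed and consulted like this:

    import org.apache.lucene.facet.taxonomy.CategoryPath;
    import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
    import org.apache.lucene.facet.taxonomy.writercache.lru.LruTaxonomyWriterCache;
    import org.apache.lucene.facet.taxonomy.writercache.lru.LruTaxonomyWriterCache.LRUType;

    class LruCacheSketch {
      static void demo() {
        // LRU_STRING keys entries by the full path; LRU_HASHED keys them by a
        // 64-bit hash (less RAM, but correctness assumes no collisions).
        TaxonomyWriterCache cache = new LruTaxonomyWriterCache(4096, LRUType.LRU_STRING);
        CategoryPath cp = new CategoryPath("Author", "Mark Twain"); // varargs ctor assumed
        int ordinal = cache.get(cp);   // -1 when the path is not cached
        if (ordinal == -1) {
          // resolve the ordinal against the taxonomy, then remember it; a 'true'
          // return means old entries were evicted in bulk, so (per the class's
          // own comments) the writer should commit and reopen its reader before
          // relying on the cache again
          cache.put(cp, 17);
        }
        cache.close();
      }
    }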

+ + + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/package.html b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/package.html new file mode 100644 index 00000000000..92d2fe71843 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/package.html @@ -0,0 +1,9 @@ + + +Improves indexing time by caching a map of CategoryPath to their Ordinal + + +

Improves indexing time by caching a map of CategoryPaths to their Ordinals
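A hypothetical sketch (editor-added; findOrdinalInTaxonomy is a made-up placeholder, not an API from the patch) of how an indexing component might consult a cache from this package, using only the get/hasRoom/put methods shown above on LruTaxonomyWriterCache:

    import org.apache.lucene.facet.taxonomy.CategoryPath;
    import org.apache.lucene.facet.taxonomy.writercache.lru.LruTaxonomyWriterCache;

    class WriterCacheSketch {
      static int ordinalFor(LruTaxonomyWriterCache cache, CategoryPath cp) {
        int ordinal = cache.get(cp);          // cache hit: done
        if (ordinal != -1) {
          return ordinal;
        }
        ordinal = findOrdinalInTaxonomy(cp);  // miss: resolve (or create) the ordinal
        if (cache.hasRoom(1)) {
          cache.put(cp, ordinal);             // remember it for subsequent documents
        }
        return ordinal;
      }

      // Placeholder for the writer's real lookup against the taxonomy index.
      private static int findOrdinalInTaxonomy(CategoryPath cp) { return 0; }
    }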

+ + + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/util/MultiCategoryListIterator.java b/modules/facet/src/java/org/apache/lucene/facet/util/MultiCategoryListIterator.java new file mode 100644 index 00000000000..3a6e5099631 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/util/MultiCategoryListIterator.java @@ -0,0 +1,83 @@ +package org.apache.lucene.facet.util; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.facet.search.CategoryListIterator; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Iterates over multiple {@link CategoryListIterator}s, consuming the provided + * iterators in order. + * + * @lucene.experimental + */ +public class MultiCategoryListIterator implements CategoryListIterator { + + private final CategoryListIterator[] iterators; + private final List validIterators; + private final List perDocValidIterators; + + /** Receives the iterators to iterate on */ + public MultiCategoryListIterator(CategoryListIterator... 
iterators) { + this.iterators = iterators; + this.validIterators = new ArrayList(); + this.perDocValidIterators = new ArrayList(); + } + + /** Fails if all given iterators fail to init */ + public boolean init() throws IOException { + for (CategoryListIterator cli : iterators) { + if (cli.init()) { + validIterators.add(cli); + } + } + return !validIterators.isEmpty(); + } + + /** + * Return a value larger than {@link Integer#MAX_VALUE} only if all + * iterators are exhausted + */ + public long nextCategory() throws IOException { + while (!perDocValidIterators.isEmpty()) { + long value = perDocValidIterators.get(0).nextCategory(); + if (value <= Integer.MAX_VALUE) { + return value; + } + perDocValidIterators.remove(0); + } + return 0x100000000L; + } + + /** + * Fails only if skipTo on all the provided iterators returned {@code false} + */ + public boolean skipTo(int docId) throws IOException { + perDocValidIterators.clear(); + for (CategoryListIterator cli : validIterators) { + if (cli.skipTo(docId)) { + perDocValidIterators.add(cli); + } + } + return !perDocValidIterators.isEmpty(); + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/util/PartitionsUtils.java b/modules/facet/src/java/org/apache/lucene/facet/util/PartitionsUtils.java new file mode 100644 index 00000000000..b287de68abf --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/util/PartitionsUtils.java @@ -0,0 +1,104 @@ +package org.apache.lucene.facet.util; + +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Utilities for partitions - sizes and such + * + * @lucene.experimental + */ +public final class PartitionsUtils { + + /** + * Get the offset for a given partition. That is, what is the minimum number an + * ordinal could be for a particular partition. + */ + public final static int partitionOffset ( FacetIndexingParams iParams, + int partitionNumber, + final TaxonomyReader taxonomyReader) { + return partitionNumber * partitionSize(iParams, taxonomyReader); + } + + /** + * @see #partitionOffset(FacetIndexingParams, int, TaxonomyReader) + */ + public final static int partitionOffset ( FacetSearchParams sParams, + int partitionNumber, + final TaxonomyReader taxonomyReader) { + return partitionOffset(sParams.getFacetIndexingParams(), partitionNumber, taxonomyReader); + } + + /** + * Get the partition size in this parameter, or return the size of the taxonomy, which + * is smaller. (Guarantees usage of as little memory as possible at search time). 
+ */ + public final static int partitionSize(FacetIndexingParams indexingParams, final TaxonomyReader taxonomyReader) { + return Math.min(indexingParams.getPartitionSize(), taxonomyReader.getSize()); + } + + /** + * @see #partitionSize(FacetIndexingParams, TaxonomyReader) + */ + public final static int partitionSize(FacetSearchParams sParams, final TaxonomyReader taxonomyReader) { + return partitionSize(sParams.getFacetIndexingParams(), taxonomyReader); + } + + /** + * Partition number of an ordinal. + *

+ * This allows to locate the partition containing a certain (facet) ordinal. + * @see FacetIndexingParams#getPartitionSize() + */ + public final static int partitionNumber(FacetIndexingParams iParams, int ordinal) { + return ordinal / iParams.getPartitionSize(); + } + + /** + * @see #partitionNumber(FacetIndexingParams, int) + */ + public final static int partitionNumber(FacetSearchParams sParams, int ordinal) { + return partitionNumber(sParams.getFacetIndexingParams(), ordinal); + } + + /** + * Partition name by category ordinal + */ + public final static String partitionNameByOrdinal( FacetIndexingParams iParams, + CategoryListParams clParams, + int ordinal) { + int partition = partitionNumber(iParams, ordinal); + return partitionName(clParams, partition); + } + + /** + * Partition name by its number + */ + public final static String partitionName(CategoryListParams clParams, int partition) { + String term = clParams.getTerm().text(); + if (partition == 0) { + return term; // for backwards compatibility we do not add a partition number in this case + } + return term + partition; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/util/RandomSample.java b/modules/facet/src/java/org/apache/lucene/facet/util/RandomSample.java new file mode 100644 index 00000000000..63c419aadc8 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/util/RandomSample.java @@ -0,0 +1,638 @@ +package org.apache.lucene.facet.util; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.lucene.analysis.core.KeywordAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.LockObtainFailedException; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.PriorityQueue; +import org.apache.lucene.util.Version; + +import org.apache.lucene.facet.search.ScoredDocIDs; +import org.apache.lucene.facet.search.ScoredDocIDsIterator; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Take random samples of large collections. + * + * @lucene.experimental + */ +public class RandomSample { + + private static final Logger logger = Logger.getLogger(RandomSample.class.getName()); + + /** + * Returns sampleSize values from the first collectionSize + * locations of collection, chosen using + * the TRAVERSAL algorithm. The sample values are not sorted. + * @param collection The values from which a sample is wanted. 
+ * @param collectionSize The number of values (from the first) from which to draw the sample. + * @param sampleSize The number of values to return. + * @return An array of values chosen from the collection. + * @see Algorithm#TRAVERSAL + */ + public static int[] repeatableSample(ScoredDocIDs collection, + int collectionSize, int sampleSize) + throws IOException { + return RandomSample.repeatableSample(collection, collectionSize, + sampleSize, Algorithm.HASHING, Sorted.NO); + } + + /** + * Returns sampleSize values from the first collectionSize + * locations of collection, chosen using algorithm. + * @param collection The values from which a sample is wanted. + * @param collectionSize The number of values (from the first) from which to draw the sample. + * @param sampleSize The number of values to return. + * @param algorithm Which algorithm to use. + * @param sorted Sorted.YES to sort the sample values in ascending order before returning; + * Sorted.NO to return them in essentially random order. + * @return An array of values chosen from the collection. + */ + public static int[] repeatableSample(ScoredDocIDs collection, + int collectionSize, int sampleSize, + Algorithm algorithm, Sorted sorted) + throws IOException { + if (collection == null) { + throw new IOException("docIdSet is null"); + } + if (sampleSize < 1) { + throw new IOException("sampleSize < 1 (" + sampleSize + ")"); + } + if (collectionSize < sampleSize) { + throw new IOException("collectionSize (" + collectionSize + ") less than sampleSize (" + sampleSize + ")"); + } + int[] sample = new int[sampleSize]; + long[] times = new long[4]; + if (algorithm == Algorithm.TRAVERSAL) { + RandomSample.sample1(collection, collectionSize, sample, times); + } else if (algorithm == Algorithm.HASHING) { + RandomSample.sample2(collection, collectionSize, sample, times); + } else { + throw new IllegalArgumentException("Invalid algorithm selection"); + } + if (sorted == Sorted.YES) { + Arrays.sort(sample); + } + if (RandomSample.returnTimings) { + times[3] = System.currentTimeMillis(); + if (logger.isLoggable(Level.FINEST)) { + logger.finest("Times: " + (times[1] - times[0]) + "ms, " + + (times[2] - times[1]) + "ms, " + (times[3] - times[2])+"ms"); + } + } + return sample; + } + + /** + * Returns sample.length values chosen from the first collectionSize + * locations of collection, using the TRAVERSAL algorithm. The sample is + * pseudorandom: no subset of the original collection + * is in principle more likely to occur than any other, but for a given collection + * and sample size, the same sample will always be returned. This algorithm walks the + * original collection in a methodical way that is guaranteed not to visit any location + * more than once, which makes sampling without replacement faster because removals don't + * have to be tracked, and the number of operations is proportional to the sample size, + * not the collection size. + * Times for performance measurement + * are returned in times, which must be an array of at least three longs, containing + * nanosecond event times. The first + * is set when the algorithm starts; the second, when the step size has been calculated; + * and the third when the sample has been taken. + * @param collection The set to be sampled. + * @param collectionSize The number of values to use (starting from first). + * @param sample The array in which to return the sample. + * @param times The times of three events, for measuring performance. 
+ */ + private static void sample1(ScoredDocIDs collection, int collectionSize, int[] sample, long[] times) + throws IOException { + ScoredDocIDsIterator it = collection.iterator(); + if (RandomSample.returnTimings) { + times[0] = System.currentTimeMillis(); + } + int sampleSize = sample.length; + int prime = RandomSample.findGoodStepSize(collectionSize, sampleSize); + int mod = prime % collectionSize; + if (RandomSample.returnTimings) { + times[1] = System.currentTimeMillis(); + } + int sampleCount = 0; + int index = 0; + for (; sampleCount < sampleSize;) { + if (index + mod < collectionSize) { + for (int i = 0; i < mod; i++, index++) { + it.next(); + } + } else { + index = index + mod - collectionSize; + it = collection.iterator(); + for (int i = 0; i < index; i++) { + it.next(); + } + } + sample[sampleCount++] = it.getDocID(); + } + if (RandomSample.returnTimings) { + times[2] = System.currentTimeMillis(); + } + } // end RandomSample.sample1() + + /** + * Returns a value which will allow the caller to walk + * a collection of collectionSize values, without repeating or missing + * any, and spanning the collection from beginning to end at least once with + * sampleSize visited locations. Choosing a value + * that is relatively prime to the collection size ensures that stepping by that size (modulo + * the collection size) will hit all locations without repeating, eliminating the need to + * track previously visited locations for a "without replacement" sample. Starting with the + * square root of the collection size ensures that either the first or second prime tried will + * work (they can't both divide the collection size). It also has the property that N steps of + * size N will span a collection of N**2 elements once. If the sample is bigger than N, it will + * wrap multiple times (without repeating). If the sample is smaller, a step size is chosen + * that will result in at least one spanning of the collection. + * + * @param collectionSize The number of values in the collection to be sampled. + * @param sampleSize The number of values wanted in the sample. + * @return A good increment value for walking the collection. + */ + private static int findGoodStepSize(int collectionSize, int sampleSize) { + int i = (int) Math.sqrt(collectionSize); + if (sampleSize < i) { + i = collectionSize / sampleSize; + } + do { + i = RandomSample.findNextPrimeAfter(i); + } while (collectionSize % i == 0); + return i; + } // end RandomSample.findGoodStepSize() + + /** + * Returns the first prime number that is larger than n. + * @param n A number less than the prime to be returned. + * @return The smallest prime larger than n. + */ + private static int findNextPrimeAfter(int n) { + n += (n % 2 == 0) ? 1 : 2; // next odd + foundFactor: for (;; n += 2) { + int sri = (int) (Math.sqrt(n)); + for (int primeIndex = 0; primeIndex < RandomSample.N_PRIMES; primeIndex++) { + int p = RandomSample.primes[primeIndex]; + if (p > sri) { + return n; + } + if (n % p == 0) { + continue foundFactor; + } + } + for (int p = RandomSample.primes[RandomSample.N_PRIMES - 1] + 2;; p += 2) { + if (p > sri) { + return n; + } + if (n % p == 0) { + continue foundFactor; + } + } + } + } // end RandomSample.findNextPrimeAfter() + + /** + * Divides the values in collection into numSubranges + * subranges from minValue to maxValue and returns the + * number of values in each subrange. (For testing the flatness of distribution of + * a sample.) + * @param collection The collection of values to be counted. 
+ * @param range The number of possible values. + * @param numSubranges How many intervals to divide the value range into. + */ + private static int[] countsBySubrange(int[] collection, int range, int numSubranges) { + int[] counts = new int[numSubranges]; + Arrays.fill(counts, 0); + int numInSubrange = range / numSubranges; + for (int j = 0; j < collection.length; j++) { + counts[collection[j] / numInSubrange]++; + } + return counts; + } // end RandomSample.countsBySubrange() + + /** + * Factors value into primes. + */ + public static int[] factor(long value) { + ArrayList list = new ArrayList(); + while (value > 1 && value % 2 == 0) { + list.add(2); + value /= 2; + } + long sqrt = Math.round(Math.sqrt(value)); + for (int pIndex = 0, lim = RandomSample.primes.length; pIndex < lim; pIndex++) { + int p = RandomSample.primes[pIndex]; + if (p >= sqrt) { + break; + } + while (value % p == 0) { + list.add(p); + value /= p; + sqrt = Math.round(Math.sqrt(value)); + } + } + if (list.size() == 0 || value > Integer.MAX_VALUE) { + throw new RuntimeException("Prime or too large to factor: "+value); + } + if ((int)value > 1) { + list.add((int)value); + } + int[] factors = new int[list.size()]; + for (int j = 0; j < factors.length; j++) { + factors[j] = list.get(j).intValue(); + } + return factors; + } // end RandomSample.factor() + + /** + * The first N_PRIMES primes, after 2. + */ + private static final int N_PRIMES = 4000; + private static int[] primes = new int[RandomSample.N_PRIMES]; + static { + RandomSample.primes[0] = 3; + for (int count = 1; count < RandomSample.N_PRIMES; count++) { + primes[count] = RandomSample.findNextPrimeAfter(primes[count - 1]); + } + } + + /** + * Returns sample.length values chosen from the first collectionSize + * locations of collection, using the HASHING algorithm. Performance measurements + * are returned in times, which must be an array of at least three longs. The first + * will be set when the algorithm starts; the second, when a hash key has been calculated and + * inserted into the priority queue for every element in the collection; and the third when the + * original elements associated with the keys remaining in the PQ have been stored in the sample + * array for return. + *

+ * This algorithm slows as the sample size becomes a significant fraction of the collection + * size, because the PQ is as large as the sample set, and will not do early rejection of values + * below the minimum until it fills up, and a larger PQ contains more small values to be purged, + * resulting in less early rejection and more logN insertions. + * + * @param collection The set to be sampled. + * @param collectionSize The number of values to use (starting from first). + * @param sample The array in which to return the sample. + * @param times The times of three events, for measuring performance. + */ + private static void sample2(ScoredDocIDs collection, int collectionSize, int[] sample, long[] times) + throws IOException { + if (RandomSample.returnTimings) { + times[0] = System.currentTimeMillis(); + } + int sampleSize = sample.length; + IntPriorityQueue pq = new IntPriorityQueue(sampleSize); + /* + * Convert every value in the collection to a hashed "weight" value, and insert + * into a bounded PQ (retains only sampleSize highest weights). + */ + ScoredDocIDsIterator it = collection.iterator(); + while (it.next()) { + pq.insertWithReuse((int)(it.getDocID() * PHI_32) & 0x7FFFFFFF); + } + if (RandomSample.returnTimings) { + times[1] = System.currentTimeMillis(); + } + /* + * Extract heap, convert weights back to original values, and return as integers. + */ + Object[] heap = pq.getHeap(); + for (int si = 0; si < sampleSize; si++) { + sample[si] = (int)(((IntPriorityQueue.MI)(heap[si+1])).value * PHI_32I) & 0x7FFFFFFF; + } + if (RandomSample.returnTimings) { + times[2] = System.currentTimeMillis(); + } + } // end RandomSample.sample2() + + /** + * A bounded priority queue for Integers, to retain a specified number of + * the highest-weighted values for return as a random sample. + */ + private static class IntPriorityQueue extends PriorityQueue { + + /** + * Creates a bounded PQ of size size. + * @param size The number of elements to retain. + */ + public IntPriorityQueue(int size) { + super(size); + } + + /** + * Inserts an integer with overflow and object reuse. + */ + public void insertWithReuse(int intval) { + if (this.mi == null) { + this.mi = new MI(); + } + this.mi.value = intval; + this.mi = (MI)this.insertWithOverflow(this.mi); + } // end IntPriorityQueue.insertWithReuse() + + /** + * Returns the underlying data structure for faster access. Extracting elements + * one at a time would require N logN time, and since we want the elements sorted + * in ascending order by value (not weight), the array is useful as-is. + * @return The underlying heap array. + */ + public Object[] getHeap() { + return getHeapArray(); + } + + /** + * Returns true if o1's weight is less than that of o2, for + * ordering in the PQ. + * @return True if o1 weighs less than o2. + */ + @Override + public boolean lessThan(Object o1, Object o2) { + return ((MI)o1).value < ((MI)o2).value; + } + + /** + * A mutable integer that lets queue objects be reused once they start overflowing. + */ + private static class MI { + MI() { } + public int value; + } // end class RandomSample.IntPriorityQueue.MI + + /** + * The mutable integer instance for reuse after first overflow. + */ + private MI mi; + + } // end class RandomSample.IntPriorityQueue + + /** + * For specifying which sampling algorithm to use. + */ + public static class Algorithm { + + /** + * Specifies a methodical traversal algorithm, which is guaranteed to span the collection + * at least once, and never to return duplicates. 
Faster than the hashing algorithm and + * uses much less space, but the randomness of the sample may be affected by systematic + * variations in the collection. Requires only an array for the sample, and visits only + * the number of elements in the sample set, not the full set. + */ + // TODO (Facet): This one produces a bimodal distribution (very flat around + // each peak!) for collection size 10M and sample sizes 10k and 10544. + // Figure out why. + public static final Algorithm TRAVERSAL = new Algorithm("Traversal"); + + /** + * Specifies a Fibonacci-style hash algorithm (see Knuth, S&S), which generates a less + * systematically distributed subset of the sampled collection than the traversal method, + * but requires a bounded priority queue the size of the sample, and creates an object + * containing a sampled value and its hash, for every element in the full set. + */ + public static final Algorithm HASHING = new Algorithm("Hashing"); + + /** + * Constructs an instance of an algorithm. + * @param name An ID for printing. + */ + private Algorithm(String name) { + this.name = name; + } + + /** + * Prints this algorithm's name. + */ + @Override + public String toString() { + return this.name; + } + + /** + * The name of this algorithm, for printing. + */ + private String name; + + } // end class RandomSample.Algorithm + + /** + * For specifying whether to sort the sample. + */ + public static class Sorted { + + /** + * Specifies sorting the resulting sample before returning. + */ + public static final Sorted YES = new Sorted("sorted"); + + /** + * Specifies not sorting the resulting sample. + */ + public static final Sorted NO = new Sorted("unsorted"); + + /** + * Constructs an instance of a "sorted" selector. + * @param name An ID for printing. + */ + private Sorted(String name) { + this.name = name; + } + + /** + * Prints this selector's name. + */ + @Override + public String toString() { + return this.name; + } + + /** + * The name of this selector, for printing. + */ + private String name; + + } // end class RandomSample.Sorted + + /** + * Magic number 1: prime closest to phi, in 32 bits. + */ + private static final long PHI_32 = 2654435769L; + + /** + * Magic number 2: multiplicative inverse of PHI_32, modulo 2**32. + */ + private static final long PHI_32I = 340573321L; + + /** + * Switch to cause methods to return timings. + */ + private static boolean returnTimings = false; + + /** + * Self-test. + */ + public static void main(String[] args) throws Exception { + RandomSample.returnTimings = true; + /* + * Create an array of sequential integers, from which samples will be taken. + */ + final int COLLECTION_SIZE = 10 * 1000 * 1000; + ScoredDocIDs collection = createAllScoredDocs(COLLECTION_SIZE); + + /* + * Factor PHI. + * + int[] factors = RandomSample.factor(PHI_32); + System.out.print("Factors of PHI_32: "); + for (int k : factors) { + System.out.print(k+", "); + } + System.out.println(""); + + * Verify inverse relationship of PHI & phi. + * + boolean inverseValid = true; + for (int j = 0; j < Integer.MAX_VALUE; j++) { + int k = (int)(j * PHI_32) & 0x7FFFFFFF; + int m = (int)(k * PHI_32I) & 0X7FFFFFFF; + if (j != m) { + System.out.println("Inverse not valid for "+j); + inverseValid = false; + } + } + System.out.println("Inverse valid? "+inverseValid); + */ + /* + * Take samples of various sizes from the full set, verify no duplicates, + * check flatness. 
+ */ + int[] sampleSizes = { + 10, 57, 100, 333, 1000, 2154, 10000 + }; + Algorithm[] algorithms = { Algorithm.HASHING, Algorithm.TRAVERSAL }; + for (int sampleSize : sampleSizes) { + for (Algorithm algorithm : algorithms) { + System.out.println("Sample size " + sampleSize + + ", algorithm " + algorithm + "..."); + /* + * Take the sample. + */ + int[] sample = RandomSample.repeatableSample( + collection, COLLECTION_SIZE, sampleSize, algorithm, Sorted.YES); + /* + * Check for duplicates. + */ + boolean noDups = true; + for (int j = 0; j < sampleSize - 1; j++) { + if (sample[j] == sample[j + 1]) { + System.out.println("Duplicate value " + + sample[j] + " at " + j + ", " + + (j + 1)); + noDups = false; + break; + } + } + if (noDups) { + System.out.println("No duplicates."); + } + if (algorithm == Algorithm.HASHING) { + System.out.print("Hashed sample, up to 100 of "+sampleSize+": "); + int lim = Math.min(100, sampleSize); + for (int k = 0; k < lim; k++) { + System.out.print(sample[k]+", "); + } + System.out.println(""); + } + /* + * Check flatness of distribution in sample. + */ + final int N_INTERVALS = 100; + int[] counts = RandomSample.countsBySubrange(sample, COLLECTION_SIZE, N_INTERVALS); + int minCount = Integer.MAX_VALUE; + int maxCount = Integer.MIN_VALUE; + int avgCount = 0; + for (int j = 0; j < N_INTERVALS; j++) { + int count = counts[j]; + if (count < minCount) { + minCount = count; + } + if (count > maxCount) { + maxCount = count; + } + avgCount += count; + } + avgCount /= N_INTERVALS; + System.out.println("Min, max, avg: "+minCount+", "+maxCount+", "+avgCount); + + if (((double)minCount - avgCount)/avgCount < -0.05 && (minCount - avgCount) < -5) { + System.out.println("Not flat enough."); + } else if (((double)maxCount - avgCount)/avgCount > 0.05 && (maxCount - avgCount) > 5) { + System.out.println("Not flat enough."); + } else { + System.out.println("Flat enough."); + } + if (sampleSize == 10544 && algorithm == Algorithm.TRAVERSAL) { + System.out.print("Counts of interest: "); + for (int j = 0; j < N_INTERVALS; j++) { + System.out.print(counts[j]+", "); + } + System.out.println(""); + } + } + } + System.out.println("Last prime is " + + RandomSample.primes[RandomSample.N_PRIMES - 1]); + } + + private static ScoredDocIDs createAllScoredDocs(final int COLLECTION_SIZE) + throws CorruptIndexException, LockObtainFailedException, IOException { + ScoredDocIDs collection; + + IndexReader reader = null; + Directory ramDir = new RAMDirectory(); + try { + IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_30, new KeywordAnalyzer())); + for (int i = 0; i < COLLECTION_SIZE; i++) { + writer.addDocument(new Document()); + } + writer.commit(); + writer.close(); + reader = IndexReader.open(ramDir); + collection = ScoredDocIdsUtils.createAllDocsScoredDocIDs(reader); + } finally { + if (reader != null) { + reader.close(); + } + ramDir.close(); + } + return collection; + } +} // end class RandomSample diff --git a/modules/facet/src/java/org/apache/lucene/facet/util/ResultSortUtils.java b/modules/facet/src/java/org/apache/lucene/facet/util/ResultSortUtils.java new file mode 100644 index 00000000000..3eb42d60396 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/util/ResultSortUtils.java @@ -0,0 +1,194 @@ +package org.apache.lucene.facet.util; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; + +import org.apache.lucene.util.PriorityQueue; + +import org.apache.lucene.facet.search.Heap; +import 
org.apache.lucene.facet.search.params.FacetRequest; +import org.apache.lucene.facet.search.params.FacetRequest.SortOrder; +import org.apache.lucene.facet.search.results.FacetResultNode; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Utilities for generating facet results sorted as required + * + * @lucene.experimental + */ +public class ResultSortUtils { + + /** + * Create a suitable heap according to facet request being served. + * @return heap for maintaining results for specified request. + * @throws IllegalArgumentException is provided facet request is not supported + */ + public static Heap createSuitableHeap(FacetRequest facetRequest) { + int nresults = facetRequest.getNumResults(); + boolean accending = (facetRequest.getSortOrder() == SortOrder.ASCENDING); + + if (nresults == Integer.MAX_VALUE) { + return new AllValueHeap(accending); + } + + if (accending) { + switch (facetRequest.getSortBy()) { + case VALUE: + return new MaxValueHeap(nresults); + case ORDINAL: + return new MaxOrdinalHeap(nresults); + } + } else { + switch (facetRequest.getSortBy()) { + case VALUE: + return new MinValueHeap(nresults); + case ORDINAL: + return new MinOrdinalHeap(nresults); + } + } + throw new IllegalArgumentException("none supported facet request: "+facetRequest); + } + + private static class MinValueHeap extends PriorityQueue implements Heap { + public MinValueHeap(int size) { + super(size); + } + + @Override + protected boolean lessThan(FacetResultNode arg0, FacetResultNode arg1) { + double value0 = arg0.getValue(); + double value1 = arg1.getValue(); + + int valueCompare = Double.compare(value0, value1); + if (valueCompare == 0) { + return arg0.getOrdinal() < arg1.getOrdinal(); + } + + return valueCompare < 0; + } + + } + + private static class MaxValueHeap extends PriorityQueue implements Heap { + public MaxValueHeap(int size) { + super(size); + } + + @Override + protected boolean lessThan(FacetResultNode arg0, FacetResultNode arg1) { + double value0 = arg0.getValue(); + double value1 = arg1.getValue(); + + int valueCompare = Double.compare(value0, value1); + if (valueCompare == 0) { + return arg0.getOrdinal() > arg1.getOrdinal(); + } + + return valueCompare > 0; + } + } + + private static class MinOrdinalHeap extends + PriorityQueue implements Heap { + public MinOrdinalHeap(int size) { + super(size); + } + + @Override + protected boolean lessThan(FacetResultNode arg0, FacetResultNode arg1) { + return arg0.getOrdinal() < arg1.getOrdinal(); + } + + } + + private static class MaxOrdinalHeap extends + PriorityQueue implements Heap { + public MaxOrdinalHeap(int size) { + super(size); + } + + @Override + protected boolean lessThan(FacetResultNode arg0, FacetResultNode arg1) { + return arg0.getOrdinal() > arg1.getOrdinal(); + } + + } + + 
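  // Editor's illustration (not part of the original patch): callers obtain a heap
  // matching the request, roughly
  //   Heap heap = ResultSortUtils.createSuitableHeap(facetRequest);
  // For a descending sort by value this yields MinValueHeap (a bounded
  // PriorityQueue keeping the top getNumResults() nodes); when numResults is
  // Integer.MAX_VALUE the unbounded AllValueHeap below is used instead, which
  // collects everything and only sorts lazily on the first pop().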
/** + * Create a Heap-Look-Alike, which implements {@link Heap}, but uses a + * regular ArrayList for holding ALL the objects given, + * only sorting upon the first call to {@link #pop()}. + */ + private static class AllValueHeap implements Heap { + private ArrayList resultNodes = new ArrayList(); + final boolean accending; + private boolean isReady = false; + public AllValueHeap(boolean accending) { + this.accending = accending; + } + + public FacetResultNode insertWithOverflow(FacetResultNode node) { + resultNodes.add(node); + return null; + } + + public FacetResultNode pop() { + if (!isReady) { + Collections.sort(resultNodes, new Comparator() { + public int compare(FacetResultNode o1, FacetResultNode o2) { + int value = Double.compare(o1.getValue(), o2 + .getValue()); + if (value == 0) { + value = o1.getOrdinal() - o2.getOrdinal(); + } + if (accending) { + value = -value; + } + return value; + } + }); + isReady = true; + } + + return resultNodes.remove(0); + } + + public int size() { + return resultNodes.size(); + } + + public FacetResultNode top() { + if (resultNodes.size() > 0) { + return resultNodes.get(0); + } + + return null; + } + + public FacetResultNode add(FacetResultNode frn) { + resultNodes.add(frn); + return null; + } + + public void clear() { + resultNodes.clear(); + } + } +} diff --git a/modules/facet/src/java/org/apache/lucene/facet/util/ScoredDocIdsUtils.java b/modules/facet/src/java/org/apache/lucene/facet/util/ScoredDocIdsUtils.java new file mode 100644 index 00000000000..26709b71c5a --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/util/ScoredDocIdsUtils.java @@ -0,0 +1,399 @@ +package org.apache.lucene.facet.util; + +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.OpenBitSetDISI; + +import org.apache.lucene.facet.search.ScoredDocIDs; +import org.apache.lucene.facet.search.ScoredDocIDsIterator; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Utility methods for Scored Doc IDs. + * + * @lucene.experimental + */ +public class ScoredDocIdsUtils { + + /** + * Create a complement of the input set. The returned {@link ScoredDocIDs} + * does not contain any scores, which makes sense given that the complementing + * documents were not scored. + * + * Note: the complement set does NOT contain doc ids which are noted as deleted by the given reader + * + * @param docids to be complemented. + * @param reader holding the number of documents & information about deletions. 
+ */ + public final static ScoredDocIDs getComplementSet(final ScoredDocIDs docids, final IndexReader reader) + throws IOException { + final int maxDoc = reader.maxDoc(); + + DocIdSet docIdSet = docids.getDocIDs(); + final OpenBitSet complement; + if (docIdSet instanceof OpenBitSet) { + // That is the most common case, if ScoredDocIdsCollector was used. + complement = (OpenBitSet) ((OpenBitSet) docIdSet).clone(); + } else { + complement = new OpenBitSetDISI(docIdSet.iterator(), maxDoc); + } + + complement.flip(0, maxDoc); + + // Remove all Deletions from the complement set + clearDeleted(reader, complement); + + return createScoredDocIds(complement, maxDoc); + } + + /** + * Clear all deleted documents from a given open-bit-set according to a given reader + */ + private static void clearDeleted(final IndexReader reader, + final OpenBitSet set) throws IOException { + + // If there are no deleted docs + if (!reader.hasDeletions()) { + return; // return immediately + } + + Bits bits = MultiFields.getDeletedDocs(reader); + + DocIdSetIterator it = set.iterator(); + int doc = DocIdSetIterator.NO_MORE_DOCS; + while ((doc = it.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + if (bits.get(doc)) { + set.fastClear(doc); + } + } + } + + /** + * Create a subset of an existing ScoredDocIDs object. + * + * @param allDocIds orginal set + * @param sampleSet Doc Ids of the subset. + */ + public static final ScoredDocIDs createScoredDocIDsSubset(final ScoredDocIDs allDocIds, + final int[] sampleSet) throws IOException { + + // sort so that we can scan docs in order + final int[] docids = sampleSet; + Arrays.sort(docids); + final float[] scores = new float[docids.length]; + // fetch scores and compute size + ScoredDocIDsIterator it = allDocIds.iterator(); + int n = 0; + while (it.next() && n < docids.length) { + int doc = it.getDocID(); + if (doc == docids[n]) { + scores[n] = it.getScore(); + ++n; + } + } + final int size = n; + + return new ScoredDocIDs() { + + public DocIdSet getDocIDs() { + return new DocIdSet() { + + @Override + public boolean isCacheable() { return true; } + + @Override + public DocIdSetIterator iterator() throws IOException { + return new DocIdSetIterator() { + + private int next = -1; + + @Override + public int advance(int target) throws IOException { + while (next < size && docids[next++] < target) { + } + return next == size ? NO_MORE_DOCS : docids[next]; + } + + @Override + public int docID() { + return docids[next]; + } + + @Override + public int nextDoc() throws IOException { + if (++next >= size) { + return NO_MORE_DOCS; + } + return docids[next]; + } + + }; + } + }; + } + + public ScoredDocIDsIterator iterator() throws IOException { + return new ScoredDocIDsIterator() { + + int next = -1; + + public boolean next() { return ++next < size; } + + public float getScore() { return scores[next]; } + + public int getDocID() { return docids[next]; } + }; + } + + public int size() { return size; } + + }; + } + + /** + * Creates a {@link ScoredDocIDs} which returns document IDs all non-deleted doc ids + * according to the given reader. + * The returned set contains the range of [0 .. 
reader.maxDoc ) doc ids + */ + public static final ScoredDocIDs createAllDocsScoredDocIDs (final IndexReader reader) { + if (reader.hasDeletions()) { + return new AllLiveDocsScoredDocIDs(reader); + } + return new AllDocsScoredDocIDs(reader); + } + + /** + * Create a ScoredDocIDs out of a given docIdSet and the total number of documents in an index + */ + public static final ScoredDocIDs createScoredDocIds(final DocIdSet docIdSet, final int maxDoc) { + return new ScoredDocIDs() { + private int size = -1; + public DocIdSet getDocIDs() { return docIdSet; } + + public ScoredDocIDsIterator iterator() throws IOException { + final DocIdSetIterator docIterator = docIdSet.iterator(); + return new ScoredDocIDsIterator() { + public boolean next() { + try { + return docIterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public float getScore() { return DEFAULT_SCORE; } + + public int getDocID() { return docIterator.docID(); } + }; + } + + public int size() { + // lazy size computation + if (size < 0) { + OpenBitSetDISI openBitSetDISI; + try { + openBitSetDISI = new OpenBitSetDISI(docIdSet.iterator(), maxDoc); + } catch (IOException e) { + throw new RuntimeException(e); + } + size = (int) openBitSetDISI.cardinality(); + } + return size; + } + }; + } + + /** + * All docs ScoredDocsIDs - this one is simply an 'all 1' bitset. Used when + * there are no deletions in the index and we wish to go through each and + * every document + */ + private static class AllDocsScoredDocIDs implements ScoredDocIDs { + final int maxDoc; + + public AllDocsScoredDocIDs(IndexReader reader) { + this.maxDoc = reader.maxDoc(); + } + + public int size() { + return maxDoc; + } + + public DocIdSet getDocIDs() { + return new DocIdSet() { + + @Override + public boolean isCacheable() { + return true; + } + + @Override + public DocIdSetIterator iterator() throws IOException { + return new DocIdSetIterator() { + private int next = -1; + + @Override + public int advance(int target) throws IOException { + if (target <= next) { + target = next + 1; + } + return next = target >= maxDoc ? NO_MORE_DOCS + : target; + } + + @Override + public int docID() { + return next; + } + + @Override + public int nextDoc() throws IOException { + return ++next < maxDoc ? next : NO_MORE_DOCS; + } + + }; + } + }; + } + + public ScoredDocIDsIterator iterator() { + try { + final DocIdSetIterator iter = getDocIDs().iterator(); + return new ScoredDocIDsIterator() { + public boolean next() { + try { + return iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS; + } catch (IOException e) { + // cannot happen + return false; + } + } + + public float getScore() { + return DEFAULT_SCORE; + } + + public int getDocID() { + return iter.docID(); + } + }; + } catch (IOException e) { + // cannot happen + throw new RuntimeException(e); + } + } + } + + /** + * An All-docs bitset which has '0' for deleted documents and '1' for the + * rest. Useful for iterating over all 'live' documents in a given index. + *

+ * NOTE: this class would work for indexes with no deletions at all, + * although it is recommended to use {@link AllDocsScoredDocIDs} to ease + * the performance cost of validating isDeleted() on each and every docId + */ + private static final class AllLiveDocsScoredDocIDs implements ScoredDocIDs { + final int maxDoc; + final IndexReader reader; + + AllLiveDocsScoredDocIDs(IndexReader reader) { + this.maxDoc = reader.maxDoc(); + this.reader = reader; + } + + public int size() { + return reader.numDocs(); + } + + public DocIdSet getDocIDs() { + return new DocIdSet() { + + @Override + public boolean isCacheable() { + return true; + } + + @Override + public DocIdSetIterator iterator() throws IOException { + return new DocIdSetIterator() { + final Bits deletedDocs = MultiFields.getDeletedDocs(reader); + private int next = -1; + + @Override + public int advance(int target) throws IOException { + if (target > next) { + next = target - 1; + } + return nextDoc(); + } + + @Override + public int docID() { + return next; + } + + @Override + public int nextDoc() throws IOException { + do { + ++next; + } while (next < maxDoc && deletedDocs != null && deletedDocs.get(next)); + + return next < maxDoc ? next : NO_MORE_DOCS; + } + + }; + } + }; + } + + public ScoredDocIDsIterator iterator() { + try { + final DocIdSetIterator iter = getDocIDs().iterator(); + return new ScoredDocIDsIterator() { + public boolean next() { + try { + return iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS; + } catch (IOException e) { + // cannot happen + return false; + } + } + + public float getScore() { + return DEFAULT_SCORE; + } + + public int getDocID() { + return iter.docID(); + } + }; + } catch (IOException e) { + // cannot happen + throw new RuntimeException(e); + } + } + } +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/facet/util/package.html b/modules/facet/src/java/org/apache/lucene/facet/util/package.html new file mode 100644 index 00000000000..8eaabef4061 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/facet/util/package.html @@ -0,0 +1,8 @@ + + + Various utilities for faceted search + + +
Various utilities for faceted search
+ + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/util/UnsafeByteArrayInputStream.java b/modules/facet/src/java/org/apache/lucene/util/UnsafeByteArrayInputStream.java new file mode 100644 index 00000000000..1642b6292ab --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/UnsafeByteArrayInputStream.java @@ -0,0 +1,147 @@ +package org.apache.lucene.util; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This class, much like {@link ByteArrayInputStream} uses a given buffer as a + * source of an InputStream. Unlike ByteArrayInputStream, this class does not + * "waste" memory by creating a local copy of the given buffer, but rather uses + * the given buffer as is. Hence the name Unsafe. While using this class one + * should remember that the byte[] buffer memory is shared and might be changed + * from outside. + * + * For reuse-ability, a call for {@link #reInit(byte[])} can be called, and + * initialize the stream with a new buffer. + * + * @lucene.experimental + */ +public class UnsafeByteArrayInputStream extends InputStream { + + private byte[] buffer; + private int markIndex; + private int upperLimit; + private int index; + + /** + * Creates a new instance by not using any byte[] up front. If you use this + * constructor, you MUST call either of the {@link #reInit(byte[]) reInit} + * methods before you consume any byte from this instance.
+ * This constructor is for convenience purposes only, so that if one does not + * have the byte[] at the moment of creation, one is not forced to pass a + * new byte[0] or something. Obviously in that case, one will + * call either {@link #reInit(byte[]) reInit} methods before using the class. + */ + public UnsafeByteArrayInputStream() { + markIndex = upperLimit = index = 0; + } + + /** + * Creates an UnsafeByteArrayInputStream which uses a given byte array as + * the source of the stream. Default range is [0 , buffer.length) + * + * @param buffer + * byte array used as the source of this stream + */ + public UnsafeByteArrayInputStream(byte[] buffer) { + reInit(buffer, 0, buffer.length); + } + + /** + * Creates an UnsafeByteArrayInputStream which uses a given byte array as + * the source of the stream, at the specific range: [startPos, endPos) + * + * @param buffer + * byte array used as the source of this stream + * @param startPos + * first index (inclusive) to the data lying in the given buffer + * @param endPos + * an index (exclusive) where the data ends. data @ + * buffer[endPos] will never be read + */ + public UnsafeByteArrayInputStream(byte[] buffer, int startPos, int endPos) { + reInit(buffer, startPos, endPos); + } + + @Override + public void mark(int readlimit) { + markIndex = index; + } + + @Override + public boolean markSupported() { + return true; + } + + /** + * Initialize the stream with a given buffer, using the default limits of + * [0, buffer.length) + * + * @param buffer + * byte array used as the source of this stream + */ + public void reInit(byte[] buffer) { + reInit(buffer, 0, buffer.length); + } + + /** + * Initialize the stream with a given byte array as the source of the + * stream, at the specific range: [startPos, endPos) + * + * @param buffer + * byte array used as the source of this stream + * @param startPos + * first index (inclusive) to the data lying in the given buffer + * @param endPos + * an index (exclusive) where the data ends. data @ + * buffer[endPos] will never be read + */ + public void reInit(byte[] buffer, int startPos, int endPos) { + this.buffer = buffer; + markIndex = startPos; + upperLimit = endPos; + index = markIndex; + } + + @Override + public int available() throws IOException { + return upperLimit - index; + } + + /** + * Read a byte. Data returned as an integer [0,255] If end of stream + * reached, returns -1 + */ + @Override + public int read() throws IOException { + return index < upperLimit ? buffer[index++] & 0xff : -1; + } + + /** + * Resets the stream back to its original state. Basically - moving the + * index back to start position. + */ + @Override + public void reset() throws IOException { + index = markIndex; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/UnsafeByteArrayOutputStream.java b/modules/facet/src/java/org/apache/lucene/util/UnsafeByteArrayOutputStream.java new file mode 100644 index 00000000000..d5e7f87c4ff --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/UnsafeByteArrayOutputStream.java @@ -0,0 +1,184 @@ +package org.apache.lucene.util; + +import java.io.IOException; +import java.io.OutputStream; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * This class is used as a wrapper to a byte array, extending + * {@link OutputStream}. Data is written in the given byte[] buffer, until its + * length is insufficient. Than the buffer size is doubled and the data is + * written. + * + * This class is Unsafe as it is using a buffer which potentially can be changed + * from the outside. Moreover, when {@link #toByteArray()} is called, the buffer + * itself is returned, and not a copy. + * + * @lucene.experimental + */ +public class UnsafeByteArrayOutputStream extends OutputStream { + + private byte[] buffer; + private int index; + private int startIndex; + + /** + * Constructs a new output stream, with a default allocated buffer which can + * later be obtained via {@link #toByteArray()}. + */ + public UnsafeByteArrayOutputStream() { + reInit(new byte[32], 0); + } + + /** + * Constructs a new output stream, with a given buffer. Writing will start + * at index 0 as a default. + * + * @param buffer + * some space to which writing will be made + */ + public UnsafeByteArrayOutputStream(byte[] buffer) { + reInit(buffer, 0); + } + + /** + * Constructs a new output stream, with a given buffer. Writing will start + * at a given index. + * + * @param buffer + * some space to which writing will be made. + * @param startPos + * an index (inclusive) from white data will be written. + */ + public UnsafeByteArrayOutputStream(byte[] buffer, int startPos) { + reInit(buffer, startPos); + } + + private void grow(int newLength) { + // It actually should be: (Java 1.6) + // buffer = Arrays.copyOf(buffer, newLength); + byte[] newBuffer = new byte[newLength]; + System.arraycopy(buffer, 0, newBuffer, 0, buffer.length); + buffer = newBuffer; + } + + /** + * For reuse-ability, this stream object can be re-initialized with another + * given buffer and starting position. + * + * @param buffer some space to which writing will be made. + * @param startPos an index (inclusive) from white data will be written. + */ + public void reInit(byte[] buffer, int startPos) { + if (buffer.length == 0) { + throw new IllegalArgumentException("initial buffer length must be greater than 0."); + } + this.buffer = buffer; + startIndex = startPos; + index = startIndex; + } + + /** + * For reuse-ability, this stream object can be re-initialized with another + * given buffer, using 0 as default starting position. + * + * @param buffer some space to which writing will be made. + */ + public void reInit(byte[] buffer) { + reInit(buffer, 0); + } + + /** + * writes a given byte(at the form of an int) to the buffer. If the buffer's + * empty space is insufficient, the buffer is doubled. + * + * @param value byte value to be written + */ + @Override + public void write(int value) throws IOException { + if (index >= buffer.length) { + grow(buffer.length << 1); + } + buffer[index++] = (byte) value; + } + + /** + * writes a given byte[], with offset and length to the buffer. 
If the + * buffer's empty space is insufficient, the buffer is doubled until it + * could contain all the data. + * + * @param b + * byte buffer, containing the source data to be written + * @param off + * index from which data from the buffer b should be written + * @param len + * number of bytes that should be written + */ + @Override + public void write(byte[] b, int off, int len) throws IOException { + // If there's not enough space for the data + int targetLength = index + len; + if (targetLength >= buffer.length) { + // Calculating the new required length of the array, keeping the array + // size a power of 2 if it was initialized like that. + int newlen = buffer.length; + while ((newlen <<= 1) < targetLength) {} + grow(newlen); + } + + // Now that we have enough spare space, we could copy the rest of the + // data + System.arraycopy(b, off, buffer, index, len); + + // Updating the index to next available index. + index += len; + } + + /** + * Returns the byte array saved within the buffer AS IS. + * + * @return the actual inner buffer - not a copy of it. + */ + public byte[] toByteArray() { + return buffer; + } + + /** + * Returns the number of relevant bytes. This objects makes sure the buffer + * is at least the size of it's data. But it can also be twice as big. The + * user would want to process the relevant bytes only. For that he would + * need the count. + * + * @return number of relevant bytes + */ + public int length() { + return index; + } + + /** + * Returns the start position data was written to. This is useful in case you + * used {@link #reInit(byte[], int)} or + * {@link #UnsafeByteArrayOutputStream(byte[], int)} and passed a start + * position which is not 0. + */ + public int getStartPos() { + return startIndex; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/Vint8.java b/modules/facet/src/java/org/apache/lucene/util/Vint8.java new file mode 100644 index 00000000000..5311d53b468 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/Vint8.java @@ -0,0 +1,229 @@ +package org.apache.lucene.util; + +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Variable-length encoding of 32-bit integers, into 8-bit bytes. A number is encoded as follows: + *
    + *
  • If it is less than 128 and non-negative (i.e., if the number uses only 7 bits), it is encoded as + * a single byte: 0bbbbbbb. + *
  • If its highest nonzero bit is greater than bit 6 (0x40), it is represented as a series of + * bytes, each byte's + * 7 LSB containing bits from the original value, with the MSB set for all but the last + * byte. The first encoded byte contains the highest nonzero bits from the + * original; the second byte contains the next 7 MSB; and so on, with the last byte + * containing the 7 LSB of the original. + *
+ * Examples: + *
    + *
  1. n = 117 = 1110101: This has fewer than 8 significant bits, and so is encoded as + * 01110101 = 0x75. + *
  2. n = 100000 = (binary) 11000011010100000. This has 17 significant bits, and so needs + * three Vint8 bytes. Left-zero-pad it to a multiple of 7 bits, then split it into chunks of 7 + * and add an MSB, 0 for the last byte, 1 for the others: 1|0000110 1|0001101 0|0100000 + * = 0x86 0x8D 0x20. + *
+ * This encoder/decoder will correctly handle any 32-bit integer, but for negative numbers, + * and positive numbers with more than 28 significant bits, encoding requires 5 bytes; this + * is not an efficient encoding scheme for large + * positive numbers or any negative number. + *
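 + * A short worked sketch of how these pieces fit together (it relies only on the
 + * {@code encode}, {@code decode}, {@code Position} and {@code MAXIMUM_BYTES_NEEDED}
 + * members declared in this class):
 + * <pre>
 + *   byte[] buf = new byte[Vint8.MAXIMUM_BYTES_NEEDED];
 + *   int written = Vint8.encode(100000, buf, 0);    // written == 3, buf holds 0x86 0x8D 0x20
 + *   Vint8.Position pos = new Vint8.Position(0);
 + *   int decoded = Vint8.decode(buf, pos);          // decoded == 100000, pos.pos == 3
 + * </pre>
 + *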
+ * Compatibility:
+ * This class has been used in products that have shipped to customers, and is needed to + * decode legacy data. Do not modify this class in ways that will break compatibility. + * + * @lucene.experimental + */ +public class Vint8 { + + /** + * Because Java lacks call-by-reference, this class boxes the decoding position, which + * is initially set by the caller, and returned after decoding, incremented by the number + * of bytes processed. + */ + public static class Position { + /** + * Creates a position value set to zero. + */ + public Position() { + // The initial position is zero by default. + } + /** + * Creates a position set to {@code initialPosition}. + * @param initialPosition The starting decoding position in the source buffer. + */ + public Position(int initialPosition) { + this.pos = initialPosition; + } + /** + * The value passed by reference. + */ + public int pos; + } + + /** + * Returns the number of bytes needed to encode {@code number}. + * @param number The number whose encoded length is needed. + * @return The number of bytes needed to encode {@code number}. + */ + public static int bytesNeeded(int number) { + if ((number & ~0x7F) == 0) { + return 1; + } else if ((number & ~0x3FFF) == 0) { + return 2; + } else if ((number & ~0x1FFFFF) == 0) { + return 3; + } else if ((number & ~0xFFFFFFF) == 0) { + return 4; + } else { + return 5; + } + } + + /** + * The maximum number of bytes needed to encode a number using {@code Vint8}. + */ + public static final int MAXIMUM_BYTES_NEEDED = 5; + + /** + * Encodes {@code number} to {@code out}. + * @param number The value to be written in encoded form, to {@code out}. + * @param out The output stream receiving the encoded bytes. + * @exception IOException If there is a problem writing to {@code out}. + */ + public static void encode(int number, OutputStream out) throws IOException { + if ((number & ~0x7F) == 0) { + out.write(number); + } else if ((number & ~0x3FFF) == 0) { + out.write(0x80 | (number >> 7)); + out.write(0x7F & number); + } else if ((number & ~0x1FFFFF) == 0) { + out.write(0x80 | (number >> 14)); + out.write(0x80 | (number >> 7)); + out.write(0x7F & number); + } else if ((number & ~0xFFFFFFF) == 0) { + out.write(0x80 | (number >> 21)); + out.write(0x80 | (number >> 14)); + out.write(0x80 | (number >> 7)); + out.write(0x7F & number); + } else { + out.write(0x80 | (number >> 28)); + out.write(0x80 | (number >> 21)); + out.write(0x80 | (number >> 14)); + out.write(0x80 | (number >> 7)); + out.write(0x7F & number); + } + } + + /** + * Encodes {@code number} into {@code dest}, starting at offset {@code start} from + * the beginning of the array. This method assumes {@code dest} is large enough to + * hold the required number of bytes. + * @param number The number to be encoded. + * @param dest The destination array. + * @param start The starting offset in the array. + * @return The number of bytes used in the array. 
+ */ + public static int encode(int number, byte[] dest, int start) { + if ((number & ~0x7F) == 0) { + dest[start] = (byte) number; + return 1; + } else if ((number & ~0x3FFF) == 0) { + dest[start] = (byte) (0x80 | ((number & 0x3F80) >> 7)); + dest[start + 1] = (byte) (number & 0x7F); + return 2; + } else if ((number & ~0x1FFFFF) == 0) { + dest[start] = (byte) (0x80 | ((number & 0x1FC000) >> 14)); + dest[start + 1] = (byte) (0x80 | ((number & 0x3F80) >> 7)); + dest[start + 2] = (byte) (number & 0x7F); + return 3; + } else if ((number & ~0xFFFFFFF) == 0) { + dest[start] = (byte) (0x80 | ((number & 0xFE00000) >> 21)); + dest[start + 1] = (byte) (0x80 | ((number & 0x1FC000) >> 14)); + dest[start + 2] = (byte) (0x80 | ((number & 0x3F80) >> 7)); + dest[start + 3] = (byte) (number & 0x7F); + return 4; + } else { + dest[start] = (byte) (0x80 | ((number & 0xF0000000) >> 28)); + dest[start + 1] = (byte) (0x80 | ((number & 0xFE00000) >> 21)); + dest[start + 2] = (byte) (0x80 | ((number & 0x1FC000) >> 14)); + dest[start + 3] = (byte) (0x80 | ((number & 0x3F80) >> 7)); + dest[start + 4] = (byte) (number & 0x7F); + return 5; + } + } + + /** + * Decodes a 32-bit integer from {@code bytes}, beginning at offset {@code pos.pos}. + * The decoded value is returned, and {@code pos.pos} is incremented by the number of + * bytes processed. + * @param bytes The byte array containing an encoded value. + * @param pos On entry, the starting position in the array; on return, one greater + * than the position of the last byte decoded in the call. + * @return The decoded value. + */ + public static int decode(byte[] bytes, Position pos) { + int value = 0; + while (true) { + byte first = bytes[pos.pos]; + ++pos.pos; + value |= first & 0x7F; + if ((first & 0x80) == 0) { + return value; + } + value <<= 7; + } + } + + /** + * Decodes a 32-bit integer from bytes read from {@code in}. Bytes are read, + * one at a time, from {@code in}, and it is assumed they represent a 32-bit + * integer encoded using this class's encoding scheme. The decoded value is + * returned. + * @param in The input stream containing the encoded bytes. + * @return The decoded value. + * @exception EOFException If the stream ends before a value has been decoded. + */ + public static int decode(InputStream in) throws IOException { + int value = 0; + while (true) { + int first = in.read(); + if (first < 0) { + throw new EOFException(); + } + value |= first & 0x7F; + if ((first & 0x80) == 0) { + return value; + } + value <<= 7; + } + } + + /** + * The default ctor is made private because all methods of this class are static. + */ + private Vint8() { + // Just making it impossible to instantiate. + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/collections/ArrayHashMap.java b/modules/facet/src/java/org/apache/lucene/util/collections/ArrayHashMap.java new file mode 100644 index 00000000000..b2ff56797e8 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/collections/ArrayHashMap.java @@ -0,0 +1,546 @@ +package org.apache.lucene.util.collections; + +import java.util.Arrays; +import java.util.Iterator; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An Array-based hashtable which maps, similar to Java's HashMap, only + * performance tests showed it performs better. + *
+ * The hashtable is constructed with a given capacity, or 16 as a default. In + * case there's not enough room for new pairs, the hashtable grows. Capacity is + * adjusted to a power of 2, and there are 2 * capacity entries for the hash. + * The pre allocated arrays (for keys, values) are at length of capacity + 1, + * where index 0 is used as 'Ground' or 'NULL'. + *
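 + * A minimal usage sketch (the String/Integer types below are only one possible
 + * choice of key and value types):
 + * <pre>
 + *   ArrayHashMap<String, Integer> map = new ArrayHashMap<String, Integer>();
 + *   map.put("year", 2011);        // returns null - there was no previous mapping
 + *   int year = map.get("year");   // 2011
 + *   map.remove("year");           // returns 2011 and frees the entry for reuse
 + * </pre>
 + *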
+ * The arrays are allocated ahead of hash operations, and form an 'empty space' + * list, to which the <key,value> pair is allocated. + * + * @lucene.experimental + */ +public class ArrayHashMap implements Iterable { + + /** Implements an IntIterator which iterates over all the allocated indexes. */ + private final class IndexIterator implements IntIterator { + /** + * The last used baseHashIndex. Needed for "jumping" from one hash entry + * to another. + */ + private int baseHashIndex = 0; + + /** The next not-yet-visited index. */ + private int index = 0; + + /** Index of the last visited pair. Used in {@link #remove()}. */ + private int lastIndex = 0; + + /** + * Create the Iterator, make index point to the "first" + * index which is not empty. If such does not exist (eg. the map is + * empty) it would be zero. + */ + public IndexIterator() { + for (baseHashIndex = 0; baseHashIndex < baseHash.length; ++baseHashIndex) { + index = baseHash[baseHashIndex]; + if (index != 0) { + break; + } + } + } + + public boolean hasNext() { + return index != 0; + } + + public int next() { + // Save the last index visited + lastIndex = index; + + // next the index + index = next[index]; + + // if the next index points to the 'Ground' it means we're done with + // the current hash entry and we need to jump to the next one. This + // is done until all the hash entries had been visited. + while (index == 0 && ++baseHashIndex < baseHash.length) { + index = baseHash[baseHashIndex]; + } + + return lastIndex; + } + + @SuppressWarnings("unchecked") + public void remove() { + ArrayHashMap.this.remove((K) keys[lastIndex]); + } + + } + + /** Implements an Iterator, used for iteration over the map's keys. */ + private final class KeyIterator implements Iterator { + private IntIterator iterator = new IndexIterator(); + + KeyIterator() { } + + public boolean hasNext() { + return iterator.hasNext(); + } + + @SuppressWarnings("unchecked") + public K next() { + return (K) keys[iterator.next()]; + } + + public void remove() { + iterator.remove(); + } + } + + /** Implements an Iterator, used for iteration over the map's values. */ + private final class ValueIterator implements Iterator { + private IntIterator iterator = new IndexIterator(); + + ValueIterator() { } + + public boolean hasNext() { + return iterator.hasNext(); + } + + @SuppressWarnings("unchecked") + public V next() { + return (V) values[iterator.next()]; + } + + public void remove() { + iterator.remove(); + } + } + + /** Default capacity - in case no capacity was specified in the constructor */ + private static final int DEFAULT_CAPACITY = 16; + + /** + * Holds the base hash entries. if the capacity is 2^N, than the base hash + * holds 2^(N+1). + */ + int[] baseHash; + + /** + * The current capacity of the map. Always 2^N and never less than 16. We + * never use the zero index. It is needed to improve performance and is also + * used as "ground". + */ + private int capacity; + + /** + * All objects are being allocated at map creation. Those objects are "free" + * or empty. Whenever a new pair comes along, a pair is being "allocated" or + * taken from the free-linked list. as this is just a free list. + */ + private int firstEmpty; + + /** hashFactor is always (2^(N+1)) - 1. Used for faster hashing. */ + private int hashFactor; + + /** Holds the unique keys. */ + Object[] keys; + + /** + * In case of collisions, we implement a double linked list of the colliding + * hash's with the following next[] and prev[]. 
Those are also used to store + * the "empty" list. + */ + int[] next; + + private int prev; + + /** Number of currently stored objects in the map. */ + private int size; + + /** Holds the values. */ + Object[] values; + + /** Constructs a map with default capacity. */ + public ArrayHashMap() { + this(DEFAULT_CAPACITY); + } + + /** + * Constructs a map with given capacity. Capacity is adjusted to a native + * power of 2, with minimum of 16. + * + * @param capacity minimum capacity for the map. + */ + public ArrayHashMap(int capacity) { + this.capacity = 16; + while (this.capacity < capacity) { + // Multiply by 2 as long as we're still under the requested capacity + this.capacity <<= 1; + } + + // As mentioned, we use the first index (0) as 'Ground', so we need the + // length of the arrays to be one more than the capacity + int arrayLength = this.capacity + 1; + + values = new Object[arrayLength]; + keys = new Object[arrayLength]; + next = new int[arrayLength]; + + // Hash entries are twice as big as the capacity. + int baseHashSize = this.capacity << 1; + + baseHash = new int[baseHashSize]; + + // The has factor is 2^M - 1 which is used as an "AND" hashing operator. + // {@link #calcBaseHash()} + hashFactor = baseHashSize - 1; + + size = 0; + + clear(); + } + + /** + * Adds a pair to the map. Takes the first empty position from the + * empty-linked-list's head - {@link firstEmpty}. New pairs are always + * inserted to baseHash, and are followed by the old colliding pair. + */ + private void prvt_put(K key, V value) { + // Hash entry to which the new pair would be inserted + int hashIndex = calcBaseHashIndex(key); + + // 'Allocating' a pair from the "Empty" list. + int objectIndex = firstEmpty; + + // Setting data + firstEmpty = next[firstEmpty]; + values[objectIndex] = value; + keys[objectIndex] = key; + + // Inserting the new pair as the first node in the specific hash entry + next[objectIndex] = baseHash[hashIndex]; + baseHash[hashIndex] = objectIndex; + + // Announcing a new pair was added! + ++size; + } + + /** Calculating the baseHash index using the internal internal hashFactor. */ + protected int calcBaseHashIndex(K key) { + return key.hashCode() & hashFactor; + } + + /** Empties the map. Generates the "Empty" space list for later allocation. */ + public void clear() { + // Clears the hash entries + Arrays.fill(baseHash, 0); + + // Set size to zero + size = 0; + + // Mark all array entries as empty. This is done with + // firstEmpty pointing to the first valid index (1 as 0 is + // used as 'Ground'). + firstEmpty = 1; + + // And setting all the next[i] to point at + // i+1. + for (int i = 1; i < capacity;) { + next[i] = ++i; + } + + // Surly, the last one should point to the 'Ground'. + next[capacity] = 0; + } + + /** Returns true iff the key exists in the map. */ + public boolean containsKey(K key) { + return find(key) != 0; + } + + /** Returns true iff the object exists in the map. */ + public boolean containsValue(Object o) { + for (Iterator iterator = iterator(); iterator.hasNext();) { + V object = iterator.next(); + if (object.equals(o)) { + return true; + } + } + return false; + } + + /** Returns the index of the given key, or zero if the key wasn't found. */ + protected int find(K key) { + // Calculate the hash entry. + int baseHashIndex = calcBaseHashIndex(key); + + // Start from the hash entry. 
+ int localIndex = baseHash[baseHashIndex]; + + // while the index does not point to the 'Ground' + while (localIndex != 0) { + // returns the index found in case of of a matching key. + if (keys[localIndex].equals(key)) { + return localIndex; + } + + // next the local index + localIndex = next[localIndex]; + } + + // If we got this far, it could only mean we did not find the key we + // were asked for. return 'Ground' index. + return 0; + } + + /** + * Finds the actual index of a given key with it's baseHashIndex. Some methods + * use the baseHashIndex. If those call {@link #find()} there's no need to + * re-calculate that hash. + * + * @return the index of the given key, or 0 if the key wasn't found. + */ + private int findForRemove(K key, int baseHashIndex) { + // Start from the hash entry. + prev = 0; + int index = baseHash[baseHashIndex]; + + // while the index does not point to the 'Ground' + while (index != 0) { + // returns the index found in case of of a matching key. + if (keys[index].equals(key)) { + return index; + } + + // next the local index + prev = index; + index = next[index]; + } + + // If we got thus far, it could only mean we did not find the key we + // were asked for. return 'Ground' index. + return prev = 0; + } + + /** Returns the object mapped with the given key, or null if the key wasn't found. */ + @SuppressWarnings("unchecked") + public V get(K key) { + return (V) values[find(key)]; + } + + /** + * Allocates a new map of double the capacity, and fast-insert the old + * key-value pairs. + */ + @SuppressWarnings("unchecked") + protected void grow() { + ArrayHashMap newmap = new ArrayHashMap(capacity * 2); + + // Iterates fast over the collection. Any valid pair is put into the new + // map without checking for duplicates or if there's enough space for + // it. + for (IndexIterator iterator = new IndexIterator(); iterator.hasNext();) { + int index = iterator.next(); + newmap.prvt_put((K) keys[index], (V) values[index]); + } + + // Copy that's data into this. + capacity = newmap.capacity; + size = newmap.size; + firstEmpty = newmap.firstEmpty; + values = newmap.values; + keys = newmap.keys; + next = newmap.next; + baseHash = newmap.baseHash; + hashFactor = newmap.hashFactor; + } + + /** Returns true iff the map is empty. */ + public boolean isEmpty() { + return size == 0; + } + + /** Returns an iterator on the mapped objects. */ + public Iterator iterator() { + return new ValueIterator(); + } + + /** Returns an iterator on the map keys. */ + public Iterator keyIterator() { + return new KeyIterator(); + } + + /** Prints the baseHash array, used for debugging purposes. */ + @SuppressWarnings("unused") + private void printBaseHash() { + for (int i : baseHash) { + System.out.println(i + ".\t" + i); + } + } + + /** + * Inserts the <key,value> pair into the map. If the key already exists, + * this method updates the mapped value to the given one, returning the old + * mapped value. + * + * @return the old mapped value, or null if the key didn't exist. + */ + @SuppressWarnings("unchecked") + public V put(K key, V e) { + // Does key exists? + int index = find(key); + + // Yes! + if (index != 0) { + // Set new data and exit. + V old = (V) values[index]; + values[index] = e; + return old; + } + + // Is there enough room for a new pair? + if (size == capacity) { + // No? Than grow up! + grow(); + } + + // Now that everything is set, the pair can be just put inside with no + // worries. 
+ prvt_put(key, e); + + return null; + } + + /** + * Removes a <key,value> pair from the map and returns the mapped value, + * or null if the none existed. + * + * @param key used to find the value to remove + * @return the removed value or null if none existed. + */ + @SuppressWarnings("unchecked") + public V remove(K key) { + int baseHashIndex = calcBaseHashIndex(key); + int index = findForRemove(key, baseHashIndex); + if (index != 0) { + // If it is the first in the collision list, we should promote its + // next colliding element. + if (prev == 0) { + baseHash[baseHashIndex] = next[index]; + } + + next[prev] = next[index]; + next[index] = firstEmpty; + firstEmpty = index; + --size; + return (V) values[index]; + } + + return null; + } + + /** Returns number of pairs currently in the map. */ + public int size() { + return this.size; + } + + /** + * Translates the mapped pairs' values into an array of Objects + * + * @return an object array of all the values currently in the map. + */ + public Object[] toArray() { + int j = -1; + Object[] array = new Object[size]; + + // Iterates over the values, adding them to the array. + for (Iterator iterator = iterator(); iterator.hasNext();) { + array[++j] = iterator.next(); + } + return array; + } + + /** + * Translates the mapped pairs' values into an array of V + * + * @param a the array into which the elements of the list are to be stored, if + * it is big enough; otherwise, use as much space as it can. + * @return an array containing the elements of the list + */ + public V[] toArray(V[] a) { + int j = 0; + // Iterates over the values, adding them to the array. + for (Iterator iterator = iterator(); j < a.length + && iterator.hasNext(); ++j) { + a[j] = iterator.next(); + } + if (j < a.length) { + a[j] = null; + } + + return a; + } + + @Override + public String toString() { + StringBuffer sb = new StringBuffer(); + sb.append('{'); + Iterator keyIterator = keyIterator(); + while (keyIterator.hasNext()) { + K key = keyIterator.next(); + sb.append(key); + sb.append('='); + sb.append(get(key)); + if (keyIterator.hasNext()) { + sb.append(','); + sb.append(' '); + } + } + sb.append('}'); + return sb.toString(); + } + + @Override + public int hashCode() { + return getClass().hashCode() ^ size(); + } + + @SuppressWarnings("unchecked") + @Override + public boolean equals(Object o) { + ArrayHashMap that = (ArrayHashMap)o; + if (that.size() != this.size()) { + return false; + } + + Iterator it = keyIterator(); + while (it.hasNext()) { + K key = it.next(); + V v1 = this.get(key); + V v2 = that.get(key); + if ((v1 == null && v2 != null) || + (v1 != null && v2 == null) || + (!v1.equals(v2))) { + return false; + } + } + return true; + } +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/util/collections/DoubleIterator.java b/modules/facet/src/java/org/apache/lucene/util/collections/DoubleIterator.java new file mode 100644 index 00000000000..e301308880d --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/collections/DoubleIterator.java @@ -0,0 +1,31 @@ +package org.apache.lucene.util.collections; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Iterator interface for primitive double iteration. * + * + * @lucene.experimental + */ +public interface DoubleIterator { + + boolean hasNext(); + double next(); + void remove(); + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/collections/FloatIterator.java b/modules/facet/src/java/org/apache/lucene/util/collections/FloatIterator.java new file mode 100644 index 00000000000..9d8894b19dc --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/collections/FloatIterator.java @@ -0,0 +1,31 @@ +package org.apache.lucene.util.collections; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Iterator interface for primitive int iteration. * + * + * @lucene.experimental + */ +public interface FloatIterator { + + boolean hasNext(); + float next(); + void remove(); + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/collections/FloatToObjectMap.java b/modules/facet/src/java/org/apache/lucene/util/collections/FloatToObjectMap.java new file mode 100644 index 00000000000..ba66d8c38dc --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/collections/FloatToObjectMap.java @@ -0,0 +1,630 @@ +package org.apache.lucene.util.collections; + +import java.util.Arrays; +import java.util.Iterator; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + + * An Array-based hashtable which maps primitive float to Objects of generic type + * T.
 + * The hashtable is constructed with a given capacity, or 16 as a default. In + * case there's not enough room for new pairs, the hashtable grows.
 + * Capacity is adjusted to a power of 2, and there are 2 * capacity entries for + * the hash. + * + * The pre allocated arrays (for keys, values) are at length of capacity + 1, + * where index 0 is used as 'Ground' or 'NULL'.
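 + * A brief usage sketch (the String value type is just an example; keys are primitive
 + * floats, so no boxing is involved):
 + * <pre>
 + *   FloatToObjectMap<String> map = new FloatToObjectMap<String>();
 + *   map.put(0.5f, "half");
 + *   map.put(0.25f, "quarter");
 + *   FloatIterator keys = map.keyIterator();
 + *   while (keys.hasNext()) {
 + *     String value = map.get(keys.next());
 + *   }
 + * </pre>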
+ * + * The arrays are allocated ahead of hash operations, and form an 'empty space' + * list, to which the key,value pair is allocated. + * + * @lucene.experimental + */ +public class FloatToObjectMap implements Iterable { + + /** + * Implements an IntIterator which iterates over all the allocated indexes. + */ + private final class IndexIterator implements IntIterator { + /** + * The last used baseHashIndex. Needed for "jumping" from one hash entry + * to another. + */ + private int baseHashIndex = 0; + + /** + * The next not-yet-visited index. + */ + private int index = 0; + + /** + * Index of the last visited pair. Used in {@link #remove()}. + */ + private int lastIndex = 0; + + /** + * Create the Iterator, make index point to the "first" + * index which is not empty. If such does not exist (eg. the map is + * empty) it would be zero. + */ + public IndexIterator() { + for (baseHashIndex = 0; baseHashIndex < baseHash.length; ++baseHashIndex) { + index = baseHash[baseHashIndex]; + if (index != 0) { + break; + } + } + } + + public boolean hasNext() { + return (index != 0); + } + + public int next() { + // Save the last index visited + lastIndex = index; + + // next the index + index = next[index]; + + // if the next index points to the 'Ground' it means we're done with + // the current hash entry and we need to jump to the next one. This + // is done until all the hash entries had been visited. + while (index == 0 && ++baseHashIndex < baseHash.length) { + index = baseHash[baseHashIndex]; + } + + return lastIndex; + } + + public void remove() { + FloatToObjectMap.this.remove(keys[lastIndex]); + } + + } + + /** + * Implements an IntIterator, used for iteration over the map's keys. + */ + private final class KeyIterator implements FloatIterator { + private IntIterator iterator = new IndexIterator(); + + KeyIterator() { } + + public boolean hasNext() { + return iterator.hasNext(); + } + + public float next() { + return keys[iterator.next()]; + } + + public void remove() { + iterator.remove(); + } + } + + /** + * Implements an Iterator of a generic type T used for iteration over the + * map's values. + */ + private final class ValueIterator implements Iterator { + private IntIterator iterator = new IndexIterator(); + + ValueIterator() { } + + public boolean hasNext() { + return iterator.hasNext(); + } + + @SuppressWarnings("unchecked") + public T next() { + return (T) values[iterator.next()]; + } + + public void remove() { + iterator.remove(); + } + } + + /** + * Default capacity - in case no capacity was specified in the constructor + */ + private static int defaultCapacity = 16; + + /** + * Holds the base hash entries. if the capacity is 2^N, than the base hash + * holds 2^(N+1). It can hold + */ + int[] baseHash; + + /** + * The current capacity of the map. Always 2^N and never less than 16. We + * never use the zero index. It is needed to improve performance and is also + * used as "ground". + */ + private int capacity; + /** + * All objects are being allocated at map creation. Those objects are "free" + * or empty. Whenever a new pair comes along, a pair is being "allocated" or + * taken from the free-linked list. as this is just a free list. + */ + private int firstEmpty; + + /** + * hashFactor is always (2^(N+1)) - 1. Used for faster hashing. + */ + private int hashFactor; + + /** + * This array holds the unique keys + */ + float[] keys; + + /** + * In case of collisions, we implement a double linked list of the colliding + * hash's with the following next[] and prev[]. 
Those are also used to store + * the "empty" list. + */ + int[] next; + + private int prev; + + /** + * Number of currently objects in the map. + */ + private int size; + + /** + * This array holds the values + */ + Object[] values; + + /** + * Constructs a map with default capacity. + */ + public FloatToObjectMap() { + this(defaultCapacity); + } + + /** + * Constructs a map with given capacity. Capacity is adjusted to a native + * power of 2, with minimum of 16. + * + * @param capacity + * minimum capacity for the map. + */ + public FloatToObjectMap(int capacity) { + this.capacity = 16; + // Minimum capacity is 16.. + while (this.capacity < capacity) { + // Multiply by 2 as long as we're still under the requested capacity + this.capacity <<= 1; + } + + // As mentioned, we use the first index (0) as 'Ground', so we need the + // length of the arrays to be one more than the capacity + int arrayLength = this.capacity + 1; + + this.values = new Object[arrayLength]; + this.keys = new float[arrayLength]; + this.next = new int[arrayLength]; + + // Hash entries are twice as big as the capacity. + int baseHashSize = this.capacity << 1; + + this.baseHash = new int[baseHashSize]; + + // The has factor is 2^M - 1 which is used as an "AND" hashing operator. + // {@link #calcBaseHash()} + this.hashFactor = baseHashSize - 1; + + this.size = 0; + + clear(); + } + + /** + * Adds a pair to the map. Takes the first empty position from the + * empty-linked-list's head - {@link firstEmpty}. + * + * New pairs are always inserted to baseHash, and are followed by the old + * colliding pair. + * + * @param key + * integer which maps the given Object + * @param e + * element which is being mapped using the given key + */ + private void prvt_put(float key, T e) { + // Hash entry to which the new pair would be inserted + int hashIndex = calcBaseHashIndex(key); + + // 'Allocating' a pair from the "Empty" list. + int objectIndex = firstEmpty; + + // Setting data + firstEmpty = next[firstEmpty]; + values[objectIndex] = e; + keys[objectIndex] = key; + + // Inserting the new pair as the first node in the specific hash entry + next[objectIndex] = baseHash[hashIndex]; + baseHash[hashIndex] = objectIndex; + + // Announcing a new pair was added! + ++size; + } + + /** + * Calculating the baseHash index using the internal hashFactor. + * @param key + */ + protected int calcBaseHashIndex(float key) { + return Float.floatToIntBits(key) & hashFactor; + } + + /** + * Empties the map. Generates the "Empty" space list for later allocation. + */ + public void clear() { + // Clears the hash entries + Arrays.fill(this.baseHash, 0); + + // Set size to zero + size = 0; + + // Mark all array entries as empty. This is done with + // firstEmpty pointing to the first valid index (1 as 0 is + // used as 'Ground'). + firstEmpty = 1; + + // And setting all the next[i] to point at + // i+1. + for (int i = 1; i < this.capacity;) { + next[i] = ++i; + } + + // Surly, the last one should point to the 'Ground'. + next[this.capacity] = 0; + } + + /** + * Checks if a given key exists in the map. + * + * @param key + * that is checked against the map data. + * @return true if the key exists in the map. false otherwise. + */ + public boolean containsKey(float key) { + return find(key) != 0; + } + + /** + * Checks if the given object exists in the map.
+ * This method iterates over the collection, trying to find an equal object. + * + * @param o + * object that is checked against the map data. + * @return true if the object exists in the map (in .equals() meaning). + * false otherwise. + */ + public boolean containsValue(Object o) { + for (Iterator iterator = iterator(); iterator.hasNext();) { + T object = iterator.next(); + if (object.equals(o)) { + return true; + } + } + return false; + } + + /** + * Find the actual index of a given key. + * + * @param key + * @return index of the key. zero if the key wasn't found. + */ + protected int find(float key) { + // Calculate the hash entry. + int baseHashIndex = calcBaseHashIndex(key); + + // Start from the hash entry. + int localIndex = baseHash[baseHashIndex]; + + // while the index does not point to the 'Ground' + while (localIndex != 0) { + // returns the index found in case of of a matching key. + if (keys[localIndex] == key) { + return localIndex; + } + + // next the local index + localIndex = next[localIndex]; + } + + // If we got this far, it could only mean we did not find the key we + // were asked for. return 'Ground' index. + return 0; + } + + /** + * Find the actual index of a given key with it's baseHashIndex.
+ * Some methods use the baseHashIndex. If those call {@link #find()} there's + * no need to re-calculate that hash. + * + * @param key + * @param baseHashIndex + * @return the index of the given key, or 0 as 'Ground' if the key wasn't + * found. + */ + private int findForRemove(float key, int baseHashIndex) { + // Start from the hash entry. + this.prev = 0; + int index = baseHash[baseHashIndex]; + + // while the index does not point to the 'Ground' + while (index != 0) { + // returns the index found in case of of a matching key. + if (keys[index] == key) { + return index; + } + + // next the local index + prev = index; + index = next[index]; + } + + // If we got this far, it could only mean we did not find the key we + // were asked for. return 'Ground' index. + this.prev = 0; + return 0; + } + + /** + * Returns the object mapped with the given key. + * + * @param key + * int who's mapped object we're interested in. + * @return an object mapped by the given key. null if the key wasn't found. + */ + @SuppressWarnings("unchecked") + public T get(float key) { + return (T) values[find(key)]; + } + + /** + * Grows the map. Allocates a new map of double the capacity, and + * fast-insert the old key-value pairs. + */ + @SuppressWarnings("unchecked") + protected void grow() { + FloatToObjectMap that = new FloatToObjectMap( + this.capacity * 2); + + // Iterates fast over the collection. Any valid pair is put into the new + // map without checking for duplicates or if there's enough space for + // it. + for (IndexIterator iterator = new IndexIterator(); iterator.hasNext();) { + int index = iterator.next(); + that.prvt_put(this.keys[index], (T) this.values[index]); + } + + // Copy that's data into this. + this.capacity = that.capacity; + this.size = that.size; + this.firstEmpty = that.firstEmpty; + this.values = that.values; + this.keys = that.keys; + this.next = that.next; + this.baseHash = that.baseHash; + this.hashFactor = that.hashFactor; + } + + /** + * + * @return true if the map is empty. false otherwise. + */ + public boolean isEmpty() { + return size == 0; + } + + /** + * Returns a new iterator for the mapped objects. + */ + public Iterator iterator() { + return new ValueIterator(); + } + + /** Returns an iterator on the map keys. */ + public FloatIterator keyIterator() { + return new KeyIterator(); + } + + /** + * Prints the baseHash array, used for DEBUG purposes. + */ + @SuppressWarnings("unused") + private void printBaseHash() { + for (int i = 0; i < this.baseHash.length; i++) { + System.out.println(i + ".\t" + baseHash[i]); + } + } + + /** + * Inserts the <key,value> pair into the map. If the key already exists, + * this method updates the mapped value to the given one, returning the old + * mapped value. + * + * @return the old mapped value, or null if the key didn't exist. + */ + @SuppressWarnings("unchecked") + public T put(float key, T e) { + // Does key exists? + int index = find(key); + + // Yes! + if (index != 0) { + // Set new data and exit. + T old = (T) values[index]; + values[index] = e; + return old; + } + + // Is there enough room for a new pair? + if (size == capacity) { + // No? Than grow up! + grow(); + } + + // Now that everything is set, the pair can be just put inside with no + // worries. + prvt_put(key, e); + + return null; + } + + /** + * Removes a <key,value> pair from the map and returns the mapped value, + * or null if the none existed. + * + * @param key used to find the value to remove + * @return the removed value or null if none existed. 
+ */ + @SuppressWarnings("unchecked") + public T remove(float key) { + int baseHashIndex = calcBaseHashIndex(key); + int index = findForRemove(key, baseHashIndex); + if (index != 0) { + // If it is the first in the collision list, we should promote its + // next colliding element. + if (prev == 0) { + baseHash[baseHashIndex] = next[index]; + } + + next[prev] = next[index]; + next[index] = firstEmpty; + firstEmpty = index; + --size; + return (T) values[index]; + } + + return null; + } + + /** + * @return number of pairs currently in the map + */ + public int size() { + return this.size; + } + + /** + * Translates the mapped pairs' values into an array of Objects + * + * @return an object array of all the values currently in the map. + */ + public Object[] toArray() { + int j = -1; + Object[] array = new Object[size]; + + // Iterates over the values, adding them to the array. + for (Iterator iterator = iterator(); iterator.hasNext();) { + array[++j] = iterator.next(); + } + return array; + } + + /** + * Translates the mapped pairs' values into an array of T + * + * @param a + * the array into which the elements of the list are to be + * stored, if it is big enough; otherwise, use whatever space we + * have, setting the one after the true data as null. + * + * @return an array containing the elements of the list + * + */ + public T[] toArray(T[] a) { + int j = 0; + // Iterates over the values, adding them to the array. + for (Iterator iterator = iterator(); j < a.length + && iterator.hasNext(); ++j) { + a[j] = iterator.next(); + } + + if (j < a.length) { + a[j] = null; + } + + return a; + } + + @Override + public String toString() { + StringBuffer sb = new StringBuffer(); + sb.append('{'); + FloatIterator keyIterator = keyIterator(); + while (keyIterator.hasNext()) { + float key = keyIterator.next(); + sb.append(key); + sb.append('='); + sb.append(get(key)); + if (keyIterator.hasNext()) { + sb.append(','); + sb.append(' '); + } + } + sb.append('}'); + return sb.toString(); + } + + @Override + public int hashCode() { + return getClass().hashCode() ^ size(); + } + + @SuppressWarnings("unchecked") + @Override + public boolean equals(Object o) { + FloatToObjectMap that = (FloatToObjectMap)o; + if (that.size() != this.size()) { + return false; + } + + FloatIterator it = keyIterator(); + while (it.hasNext()) { + float key = it.next(); + if (!that.containsKey(key)) { + return false; + } + + T v1 = this.get(key); + T v2 = that.get(key); + if ((v1 == null && v2 != null) || + (v1 != null && v2 == null) || + (!v1.equals(v2))) { + return false; + } + } + return true; + } +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/util/collections/IntArray.java b/modules/facet/src/java/org/apache/lucene/util/collections/IntArray.java new file mode 100644 index 00000000000..136955cdc89 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/collections/IntArray.java @@ -0,0 +1,252 @@ +package org.apache.lucene.util.collections; + +import java.util.Arrays; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A Class wrapper for a grow-able int[] which can be sorted and intersect with + * other IntArrays. + * + * @lucene.experimental + */ +public class IntArray { + + /** + * The int[] which holds the data + */ + private int[] data; + + /** + * Holds the number of items in the array. + */ + private int size; + + /** + * A flag which indicates whether a sort should occur of the array is + * already sorted. + */ + private boolean shouldSort; + + /** + * Construct a default IntArray, size 0 and surly a sort should not occur. + */ + public IntArray() { + init(true); + } + + private void init(boolean realloc) { + size = 0; + if (realloc) { + data = new int[0]; + } + shouldSort = false; + } + + /** + * Intersects the data with a given {@link IntHashSet}. + * + * @param set + * A given ArrayHashSetInt which holds the data to be intersected + * against + */ + public void intersect(IntHashSet set) { + int newSize = 0; + for (int i = 0; i < size; ++i) { + if (set.contains(data[i])) { + data[newSize] = data[i]; + ++newSize; + } + } + this.size = newSize; + } + + /** + * Intersects the data with a given IntArray + * + * @param other + * A given IntArray which holds the data to be intersected agains + */ + public void intersect(IntArray other) { + sort(); + other.sort(); + + int myIndex = 0; + int otherIndex = 0; + int newSize = 0; + if (this.size > other.size) { + while (otherIndex < other.size && myIndex < size) { + while (otherIndex < other.size + && other.data[otherIndex] < data[myIndex]) { + ++otherIndex; + } + if (otherIndex == other.size) { + break; + } + while (myIndex < size && other.data[otherIndex] > data[myIndex]) { + ++myIndex; + } + if (other.data[otherIndex] == data[myIndex]) { + data[newSize++] = data[myIndex]; + ++otherIndex; + ++myIndex; + } + } + } else { + while (otherIndex < other.size && myIndex < size) { + while (myIndex < size && other.data[otherIndex] > data[myIndex]) { + ++myIndex; + } + if (myIndex == size) { + break; + } + while (otherIndex < other.size + && other.data[otherIndex] < data[myIndex]) { + ++otherIndex; + } + if (other.data[otherIndex] == data[myIndex]) { + data[newSize++] = data[myIndex]; + ++otherIndex; + ++myIndex; + } + } + } + this.size = newSize; + } + + /** + * Return the size of the Array. Not the allocated size, but the number of + * values actually set. + * + * @return the (filled) size of the array + */ + public int size() { + return size; + } + + /** + * Adds a value to the array. + * + * @param value + * value to be added + */ + public void addToArray(int value) { + if (size == data.length) { + int[] newArray = new int[2 * size + 1]; + System.arraycopy(data, 0, newArray, 0, size); + data = newArray; + } + data[size] = value; + ++size; + shouldSort = true; + } + + /** + * Equals method. Checking the sizes, than the values from the last index to + * the first (Statistically for random should be the same but for our + * specific use would find differences faster). 
+ */ + @Override + public boolean equals(Object o) { + if (!(o instanceof IntArray)) { + return false; + } + + IntArray array = (IntArray) o; + if (array.size != size) { + return false; + } + + sort(); + array.sort(); + + boolean equal = true; + + for (int i = size; i > 0 && equal;) { + --i; + equal = (array.data[i] == this.data[i]); + } + + return equal; + } + + /** + * Sorts the data. If it is needed. + */ + public void sort() { + if (shouldSort) { + shouldSort = false; + Arrays.sort(data, 0, size); + } + } + + /** + * Calculates a hash-code for HashTables + */ + @Override + public int hashCode() { + int hash = 0; + for (int i = 0; i < size; ++i) { + hash = data[i] ^ (hash * 31); + } + return hash; + } + + /** + * Get an element from a specific index. + * + * @param i + * index of which element should be retrieved. + */ + public int get(int i) { + if (i >= size) { + throw new ArrayIndexOutOfBoundsException(i); + } + return this.data[i]; + } + + public void set(int idx, int value) { + if (idx >= size) { + throw new ArrayIndexOutOfBoundsException(idx); + } + this.data[idx] = value; + } + + /** + * toString or not toString. That is the question! + */ + @Override + public String toString() { + String s = "(" + size + ") "; + for (int i = 0; i < size; ++i) { + s += "" + data[i] + ", "; + } + return s; + } + + /** + * Clear the IntArray (set all elements to zero). + * @param resize - if resize is true, then clear actually allocates + * a new array of size 0, essentially 'clearing' the array and freeing + * memory. + */ + public void clear(boolean resize) { + init(resize); + } + +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/util/collections/IntHashSet.java b/modules/facet/src/java/org/apache/lucene/util/collections/IntHashSet.java new file mode 100644 index 00000000000..9a059f7e229 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/collections/IntHashSet.java @@ -0,0 +1,552 @@ +package org.apache.lucene.util.collections; + +import java.util.Arrays; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A Set or primitive int. Implemented as a HashMap of int->int. * + * + * @lucene.experimental + */ +public class IntHashSet { + + // TODO (Facet): This is wasteful as the "values" are actually the "keys" and + // we could spare this amount of space (capacity * sizeof(int)). Perhaps even + // though it is not OOP, we should re-implement the hash for just that cause. + + /** + * Implements an IntIterator which iterates over all the allocated indexes. + */ + private final class IndexIterator implements IntIterator { + /** + * The last used baseHashIndex. Needed for "jumping" from one hash entry + * to another. + */ + private int baseHashIndex = 0; + + /** + * The next not-yet-visited index. 
+ */ + private int index = 0; + + /** + * Index of the last visited pair. Used in {@link #remove()}. + */ + private int lastIndex = 0; + + /** + * Create the Iterator, make index point to the "first" + * index which is not empty. If such does not exist (eg. the map is + * empty) it would be zero. + */ + public IndexIterator() { + for (baseHashIndex = 0; baseHashIndex < baseHash.length; ++baseHashIndex) { + index = baseHash[baseHashIndex]; + if (index != 0) { + break; + } + } + } + + public boolean hasNext() { + return (index != 0); + } + + public int next() { + // Save the last index visited + lastIndex = index; + + // next the index + index = next[index]; + + // if the next index points to the 'Ground' it means we're done with + // the current hash entry and we need to jump to the next one. This + // is done until all the hash entries had been visited. + while (index == 0 && ++baseHashIndex < baseHash.length) { + index = baseHash[baseHashIndex]; + } + + return lastIndex; + } + + public void remove() { + IntHashSet.this.remove(keys[lastIndex]); + } + + } + + /** + * Implements an IntIterator, used for iteration over the map's keys. + */ + private final class KeyIterator implements IntIterator { + private IntIterator iterator = new IndexIterator(); + + KeyIterator() { } + + public boolean hasNext() { + return iterator.hasNext(); + } + + public int next() { + return keys[iterator.next()]; + } + + public void remove() { + iterator.remove(); + } + } + + /** + * Default capacity - in case no capacity was specified in the constructor + */ + private static int defaultCapacity = 16; + + /** + * Holds the base hash entries. if the capacity is 2^N, than the base hash + * holds 2^(N+1). It can hold + */ + int[] baseHash; + + /** + * The current capacity of the map. Always 2^N and never less than 16. We + * never use the zero index. It is needed to improve performance and is also + * used as "ground". + */ + private int capacity; + + /** + * All objects are being allocated at map creation. Those objects are "free" + * or empty. Whenever a new pair comes along, a pair is being "allocated" or + * taken from the free-linked list. as this is just a free list. + */ + private int firstEmpty; + + /** + * hashFactor is always (2^(N+1)) - 1. Used for faster hashing. + */ + private int hashFactor; + + /** + * This array holds the unique keys + */ + int[] keys; + + /** + * In case of collisions, we implement a double linked list of the colliding + * hash's with the following next[] and prev[]. Those are also used to store + * the "empty" list. + */ + int[] next; + + private int prev; + + /** + * Number of currently objects in the map. + */ + private int size; + + /** + * Constructs a map with default capacity. + */ + public IntHashSet() { + this(defaultCapacity); + } + + /** + * Constructs a map with given capacity. Capacity is adjusted to a native + * power of 2, with minimum of 16. + * + * @param capacity + * minimum capacity for the map. + */ + public IntHashSet(int capacity) { + this.capacity = 16; + // Minimum capacity is 16.. + while (this.capacity < capacity) { + // Multiply by 2 as long as we're still under the requested capacity + this.capacity <<= 1; + } + + // As mentioned, we use the first index (0) as 'Ground', so we need the + // length of the arrays to be one more than the capacity + int arrayLength = this.capacity + 1; + + this.keys = new int[arrayLength]; + this.next = new int[arrayLength]; + + // Hash entries are twice as big as the capacity. 
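+ // (Twice the capacity keeps the load factor at or below 0.5, so collision chains stay short.)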
+ int baseHashSize = this.capacity << 1; + + this.baseHash = new int[baseHashSize]; + + // The has factor is 2^M - 1 which is used as an "AND" hashing operator. + // {@link #calcBaseHash()} + this.hashFactor = baseHashSize - 1; + + this.size = 0; + + clear(); + } + + /** + * Adds a pair to the map. Takes the first empty position from the + * empty-linked-list's head - {@link firstEmpty}. + * + * New pairs are always inserted to baseHash, and are followed by the old + * colliding pair. + * + * @param key + * integer which maps the given value + * @param e + * value which is being mapped using the given key + */ + private void prvt_add(int key) { + // Hash entry to which the new pair would be inserted + int hashIndex = calcBaseHashIndex(key); + + // 'Allocating' a pair from the "Empty" list. + int objectIndex = firstEmpty; + + // Setting data + firstEmpty = next[firstEmpty]; + keys[objectIndex] = key; + + // Inserting the new pair as the first node in the specific hash entry + next[objectIndex] = baseHash[hashIndex]; + baseHash[hashIndex] = objectIndex; + + // Announcing a new pair was added! + ++size; + } + + /** + * Calculating the baseHash index using the internal hashFactor + * . + * + * @param key + */ + protected int calcBaseHashIndex(int key) { + return key & hashFactor; + } + + /** + * Empties the map. Generates the "Empty" space list for later allocation. + */ + public void clear() { + // Clears the hash entries + Arrays.fill(this.baseHash, 0); + + // Set size to zero + size = 0; + + // Mark all array entries as empty. This is done with + // firstEmpty pointing to the first valid index (1 as 0 is + // used as 'Ground'). + firstEmpty = 1; + + // And setting all the next[i] to point at + // i+1. + for (int i = 1; i < this.capacity;) { + next[i] = ++i; + } + + // Surly, the last one should point to the 'Ground'. + next[this.capacity] = 0; + } + + /** + * Checks if a given key exists in the map. + * + * @param value + * that is checked against the map data. + * @return true if the key exists in the map. false otherwise. + */ + public boolean contains(int value) { + return find(value) != 0; + } + + /** + * Find the actual index of a given key. + * + * @param key + * @return index of the key. zero if the key wasn't found. + */ + protected int find(int key) { + // Calculate the hash entry. + int baseHashIndex = calcBaseHashIndex(key); + + // Start from the hash entry. + int localIndex = baseHash[baseHashIndex]; + + // while the index does not point to the 'Ground' + while (localIndex != 0) { + // returns the index found in case of of a matching key. + if (keys[localIndex] == key) { + return localIndex; + } + + // next the local index + localIndex = next[localIndex]; + } + + // If we got this far, it could only mean we did not find the key we + // were asked for. return 'Ground' index. + return 0; + } + + /** + * Find the actual index of a given key with it's baseHashIndex.
+ * Some methods use the baseHashIndex. If those call {@link #find()} there's + * no need to re-calculate that hash. + * + * @param key + * @param baseHashIndex + * @return the index of the given key, or 0 as 'Ground' if the key wasn't + * found. + */ + private int findForRemove(int key, int baseHashIndex) { + // Start from the hash entry. + this.prev = 0; + int index = baseHash[baseHashIndex]; + + // while the index does not point to the 'Ground' + while (index != 0) { + // returns the index found in case of of a matching key. + if (keys[index] == key) { + return index; + } + + // next the local index + prev = index; + index = next[index]; + } + + // If we got this far, it could only mean we did not find the key we + // were asked for. return 'Ground' index. + this.prev = 0; + return 0; + } + + /** + * Grows the map. Allocates a new map of double the capacity, and + * fast-insert the old key-value pairs. + */ + protected void grow() { + IntHashSet that = new IntHashSet(this.capacity * 2); + + // Iterates fast over the collection. Any valid pair is put into the new + // map without checking for duplicates or if there's enough space for + // it. + for (IndexIterator iterator = new IndexIterator(); iterator.hasNext();) { + int index = iterator.next(); + that.prvt_add(this.keys[index]); + } + // for (int i = capacity; i > 0; --i) { + // + // that._add(this.keys[i]); + // + // } + + // Copy that's data into this. + this.capacity = that.capacity; + this.size = that.size; + this.firstEmpty = that.firstEmpty; + this.keys = that.keys; + this.next = that.next; + this.baseHash = that.baseHash; + this.hashFactor = that.hashFactor; + } + + /** + * + * @return true if the map is empty. false otherwise. + */ + public boolean isEmpty() { + return size == 0; + } + + /** + * Returns a new iterator for the mapped objects. + */ + public IntIterator iterator() { + return new KeyIterator(); + } + + /** + * Prints the baseHash array, used for debug purposes. + */ + public void printBaseHash() { + for (int i = 0; i < this.baseHash.length; i++) { + if (baseHash[i] != 0) { + System.out.println(i + ".\t" + baseHash[i]); + } + } + } + + /** + * Add a mapping int key -> int value. + *

+ * If the value is already in the set, the call leaves the set unchanged. + *

+ * Otherwise if the map is full, first {@link #grow()} the map. + * + * @param value + * integer which maps the given value + * @return true always. + */ + public boolean add(int value) { + // Does key exists? + int index = find(value); + + // Yes! + if (index != 0) { + return true; + } + + // Is there enough room for a new pair? + if (size == capacity) { + // No? Than grow up! + grow(); + } + + // Now that everything is set, the pair can be just put inside with no + // worries. + prvt_add(value); + + return true; + } + + /** + * Remove a pair from the map, specified by it's key. + * + * @param value + * specify the value to be removed + * + * @return true if the map was changed (the key was found and removed). + * false otherwise. + */ + public boolean remove(int value) { + int baseHashIndex = calcBaseHashIndex(value); + int index = findForRemove(value, baseHashIndex); + if (index != 0) { + // If it is the first in the collision list, we should promote its + // next colliding element. + if (prev == 0) { + baseHash[baseHashIndex] = next[index]; + } + + next[prev] = next[index]; + next[index] = firstEmpty; + firstEmpty = index; + --size; + return true; + } + + return false; + } + + /** + * @return number of pairs currently in the map + */ + public int size() { + return this.size; + } + + /** + * Translates the mapped pairs' values into an array of Objects + * + * @return an object array of all the values currently in the map. + */ + public int[] toArray() { + int j = -1; + int[] array = new int[size]; + + // Iterates over the values, adding them to the array. + for (IntIterator iterator = iterator(); iterator.hasNext();) { + array[++j] = iterator.next(); + } + return array; + } + + /** + * Translates the mapped pairs' values into an array of ints + * + * @param a + * the array into which the elements of the map are to be stored, + * if it is big enough; otherwise, a new array of the same + * runtime type is allocated for this purpose. + * + * @return an array containing the values stored in the map + * + */ + public int[] toArray(int[] a) { + int j = 0; + if (a.length < size) { + a = new int[size]; + } + // Iterates over the values, adding them to the array. + for (IntIterator iterator = iterator(); j < a.length + && iterator.hasNext(); ++j) { + a[j] = iterator.next(); + } + return a; + } + + /** + * I have no idea why would anyone call it - but for debug purposes.
+ * Prints the entire map, including the index, key, object, next and prev. + */ + @Override + public String toString() { + StringBuffer sb = new StringBuffer(); + sb.append('{'); + IntIterator iterator = iterator(); + while (iterator.hasNext()) { + sb.append(iterator.next()); + if (iterator.hasNext()) { + sb.append(','); + sb.append(' '); + } + } + sb.append('}'); + return sb.toString(); + } + + public String toHashString() { + String string = "\n"; + StringBuffer sb = new StringBuffer(); + + for (int i = 0; i < this.baseHash.length; i++) { + StringBuffer sb2 = new StringBuffer(); + boolean shouldAppend = false; + sb2.append(i + ".\t"); + for (int index = baseHash[i]; index != 0; index = next[index]) { + sb2.append(" -> " + keys[index] + "@" + index); + shouldAppend = true; + } + if (shouldAppend) { + sb.append(sb2); + sb.append(string); + } + } + + return sb.toString(); + } +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/util/collections/IntIterator.java b/modules/facet/src/java/org/apache/lucene/util/collections/IntIterator.java new file mode 100644 index 00000000000..740b1331489 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/collections/IntIterator.java @@ -0,0 +1,31 @@ +package org.apache.lucene.util.collections; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Iterator interface for primitive int iteration. * + * + * @lucene.experimental + */ +public interface IntIterator { + + boolean hasNext(); + int next(); + void remove(); + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/collections/IntToDoubleMap.java b/modules/facet/src/java/org/apache/lucene/util/collections/IntToDoubleMap.java new file mode 100644 index 00000000000..33b61d1dbc4 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/collections/IntToDoubleMap.java @@ -0,0 +1,629 @@ +package org.apache.lucene.util.collections; + +import java.util.Arrays; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * An Array-based hashtable which maps primitive int to a primitive double.
+ * The hashtable is constructed with a given capacity, or 16 by default. When + * there is not enough room for new pairs, the hashtable grows.
+ * Capacity is adjusted to a power of 2, and there are 2 * capacity entries for + * the hash. + * + * The pre-allocated arrays (for keys, values) have a length of capacity + 1, + * where index 0 is used as 'Ground' or 'NULL'.
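+ * <p>
+ * A minimal usage sketch (for illustration only; the variable names below are
+ * arbitrary examples, not part of this class):
+ * <pre>
+ * IntToDoubleMap weights = new IntToDoubleMap();
+ * weights.put(5, 0.5);
+ * weights.put(5, 2.0);                // replaces the previous 0.5, which put() returns
+ * double w = weights.get(5);          // 2.0
+ * double missing = weights.get(9);    // Double.NaN for keys that are not in the map
+ * </pre>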
+ * + * The arrays are allocated ahead of hash operations, and form an 'empty space' + * list, to which the key,value pair is allocated. + * + * @lucene.experimental + */ +public class IntToDoubleMap { + + public static final double GROUND = Double.NaN; + + /** + * Implements an IntIterator which iterates over all the allocated indexes. + */ + private final class IndexIterator implements IntIterator { + /** + * The last used baseHashIndex. Needed for "jumping" from one hash entry + * to another. + */ + private int baseHashIndex = 0; + + /** + * The next not-yet-visited index. + */ + private int index = 0; + + /** + * Index of the last visited pair. Used in {@link #remove()}. + */ + private int lastIndex = 0; + + /** + * Create the Iterator, make index point to the "first" + * index which is not empty. If such does not exist (eg. the map is + * empty) it would be zero. + */ + public IndexIterator() { + for (baseHashIndex = 0; baseHashIndex < baseHash.length; ++baseHashIndex) { + index = baseHash[baseHashIndex]; + if (index != 0) { + break; + } + } + } + + public boolean hasNext() { + return (index != 0); + } + + public int next() { + // Save the last index visited + lastIndex = index; + + // next the index + index = next[index]; + + // if the next index points to the 'Ground' it means we're done with + // the current hash entry and we need to jump to the next one. This + // is done until all the hash entries had been visited. + while (index == 0 && ++baseHashIndex < baseHash.length) { + index = baseHash[baseHashIndex]; + } + + return lastIndex; + } + + public void remove() { + IntToDoubleMap.this.remove(keys[lastIndex]); + } + + } + + /** + * Implements an IntIterator, used for iteration over the map's keys. + */ + private final class KeyIterator implements IntIterator { + private IntIterator iterator = new IndexIterator(); + + KeyIterator() { } + + public boolean hasNext() { + return iterator.hasNext(); + } + + public int next() { + return keys[iterator.next()]; + } + + public void remove() { + iterator.remove(); + } + } + + /** + * Implements an Iterator of a generic type T used for iteration over the + * map's values. + */ + private final class ValueIterator implements DoubleIterator { + private IntIterator iterator = new IndexIterator(); + + ValueIterator() { } + + public boolean hasNext() { + return iterator.hasNext(); + } + + public double next() { + return values[iterator.next()]; + } + + public void remove() { + iterator.remove(); + } + } + + /** + * Default capacity - in case no capacity was specified in the constructor + */ + private static int defaultCapacity = 16; + + /** + * Holds the base hash entries. if the capacity is 2^N, than the base hash + * holds 2^(N+1). It can hold + */ + int[] baseHash; + + /** + * The current capacity of the map. Always 2^N and never less than 16. We + * never use the zero index. It is needed to improve performance and is also + * used as "ground". + */ + private int capacity; + /** + * All objects are being allocated at map creation. Those objects are "free" + * or empty. Whenever a new pair comes along, a pair is being "allocated" or + * taken from the free-linked list. as this is just a free list. + */ + private int firstEmpty; + + /** + * hashFactor is always (2^(N+1)) - 1. Used for faster hashing. + */ + private int hashFactor; + + /** + * This array holds the unique keys + */ + int[] keys; + + /** + * In case of collisions, we implement a double linked list of the colliding + * hash's with the following next[] and prev[]. 
Those are also used to store + * the "empty" list. + */ + int[] next; + + private int prev; + + /** + * Number of currently objects in the map. + */ + private int size; + + /** + * This array holds the values + */ + double[] values; + + /** + * Constructs a map with default capacity. + */ + public IntToDoubleMap() { + this(defaultCapacity); + } + + /** + * Constructs a map with given capacity. Capacity is adjusted to a native + * power of 2, with minimum of 16. + * + * @param capacity + * minimum capacity for the map. + */ + public IntToDoubleMap(int capacity) { + this.capacity = 16; + // Minimum capacity is 16.. + while (this.capacity < capacity) { + // Multiply by 2 as long as we're still under the requested capacity + this.capacity <<= 1; + } + + // As mentioned, we use the first index (0) as 'Ground', so we need the + // length of the arrays to be one more than the capacity + int arrayLength = this.capacity + 1; + + this.values = new double[arrayLength]; + this.keys = new int[arrayLength]; + this.next = new int[arrayLength]; + + // Hash entries are twice as big as the capacity. + int baseHashSize = this.capacity << 1; + + this.baseHash = new int[baseHashSize]; + + this.values[0] = GROUND; + + // The has factor is 2^M - 1 which is used as an "AND" hashing operator. + // {@link #calcBaseHash()} + this.hashFactor = baseHashSize - 1; + + this.size = 0; + + clear(); + } + + /** + * Adds a pair to the map. Takes the first empty position from the + * empty-linked-list's head - {@link firstEmpty}. + * + * New pairs are always inserted to baseHash, and are followed by the old + * colliding pair. + * + * @param key + * integer which maps the given Object + * @param v + * double value which is being mapped using the given key + */ + private void prvt_put(int key, double v) { + // Hash entry to which the new pair would be inserted + int hashIndex = calcBaseHashIndex(key); + + // 'Allocating' a pair from the "Empty" list. + int objectIndex = firstEmpty; + + // Setting data + firstEmpty = next[firstEmpty]; + values[objectIndex] = v; + keys[objectIndex] = key; + + // Inserting the new pair as the first node in the specific hash entry + next[objectIndex] = baseHash[hashIndex]; + baseHash[hashIndex] = objectIndex; + + // Announcing a new pair was added! + ++size; + } + + /** + * Calculating the baseHash index using the internal hashFactor + * . + * + * @param key + */ + protected int calcBaseHashIndex(int key) { + return key & hashFactor; + } + + /** + * Empties the map. Generates the "Empty" space list for later allocation. + */ + public void clear() { + // Clears the hash entries + Arrays.fill(this.baseHash, 0); + + // Set size to zero + size = 0; + + // Mark all array entries as empty. This is done with + // firstEmpty pointing to the first valid index (1 as 0 is + // used as 'Ground'). + firstEmpty = 1; + + // And setting all the next[i] to point at + // i+1. + for (int i = 1; i < this.capacity;) { + next[i] = ++i; + } + + // Surly, the last one should point to the 'Ground'. + next[this.capacity] = 0; + } + + /** + * Checks if a given key exists in the map. + * + * @param key + * that is checked against the map data. + * @return true if the key exists in the map. false otherwise. + */ + public boolean containsKey(int key) { + return find(key) != 0; + } + + /** + * Checks if the given value exists in the map.
+ * This method iterates over the collection, trying to find an equal object. + * + * @param value + * double value that is checked against the map data. + * @return true if the value exists in the map, false otherwise. + */ + public boolean containsValue(double value) { + for (DoubleIterator iterator = iterator(); iterator.hasNext();) { + double d = iterator.next(); + if (d == value) { + return true; + } + } + return false; + } + + /** + * Find the actual index of a given key. + * + * @param key + * @return index of the key. zero if the key wasn't found. + */ + protected int find(int key) { + // Calculate the hash entry. + int baseHashIndex = calcBaseHashIndex(key); + + // Start from the hash entry. + int localIndex = baseHash[baseHashIndex]; + + // while the index does not point to the 'Ground' + while (localIndex != 0) { + // returns the index found in case of of a matching key. + if (keys[localIndex] == key) { + return localIndex; + } + + // next the local index + localIndex = next[localIndex]; + } + + // If we got this far, it could only mean we did not find the key we + // were asked for. return 'Ground' index. + return 0; + } + + /** + * Find the actual index of a given key with it's baseHashIndex.
+ * Some methods use the baseHashIndex. If those call {@link #find()} there's + * no need to re-calculate that hash. + * + * @param key + * @param baseHashIndex + * @return the index of the given key, or 0 as 'Ground' if the key wasn't + * found. + */ + private int findForRemove(int key, int baseHashIndex) { + // Start from the hash entry. + this.prev = 0; + int index = baseHash[baseHashIndex]; + + // while the index does not point to the 'Ground' + while (index != 0) { + // returns the index found in case of of a matching key. + if (keys[index] == key) { + return index; + } + + // next the local index + prev = index; + index = next[index]; + } + + // If we got this far, it could only mean we did not find the key we + // were asked for. return 'Ground' index. + this.prev = 0; + return 0; + } + + /** + * Returns the value mapped with the given key. + * + * @param key + * int who's mapped object we're interested in. + * @return a double value mapped by the given key. Double.NaN if the key wasn't found. + */ + public double get(int key) { + return values[find(key)]; + } + + /** + * Grows the map. Allocates a new map of double the capacity, and + * fast-insert the old key-value pairs. + */ + protected void grow() { + IntToDoubleMap that = new IntToDoubleMap( + this.capacity * 2); + + // Iterates fast over the collection. Any valid pair is put into the new + // map without checking for duplicates or if there's enough space for + // it. + for (IndexIterator iterator = new IndexIterator(); iterator.hasNext();) { + int index = iterator.next(); + that.prvt_put(this.keys[index], this.values[index]); + } + + // Copy that's data into this. + this.capacity = that.capacity; + this.size = that.size; + this.firstEmpty = that.firstEmpty; + this.values = that.values; + this.keys = that.keys; + this.next = that.next; + this.baseHash = that.baseHash; + this.hashFactor = that.hashFactor; + } + + /** + * + * @return true if the map is empty. false otherwise. + */ + public boolean isEmpty() { + return size == 0; + } + + /** + * Returns a new iterator for the mapped double values. + */ + public DoubleIterator iterator() { + return new ValueIterator(); + } + + /** Returns an iterator on the map keys. */ + public IntIterator keyIterator() { + return new KeyIterator(); + } + + /** + * Prints the baseHash array, used for debug purposes. + */ + @SuppressWarnings("unused") + private void printBaseHash() { + for (int i = 0; i < this.baseHash.length; i++) { + System.out.println(i + ".\t" + baseHash[i]); + } + } + + /** + * Inserts the <key,value> pair into the map. If the key already exists, + * this method updates the mapped value to the given one, returning the old + * mapped value. + * + * @return the old mapped value, or {@link Double#NaN} if the key didn't exist. + */ + public double put(int key, double v) { + // Does key exists? + int index = find(key); + + // Yes! + if (index != 0) { + // Set new data and exit. + double old = values[index]; + values[index] = v; + return old; + } + + // Is there enough room for a new pair? + if (size == capacity) { + // No? Than grow up! + grow(); + } + + // Now that everything is set, the pair can be just put inside with no + // worries. + prvt_put(key, v); + + return Double.NaN; + } + + /** + * Removes a <key,value> pair from the map and returns the mapped value, + * or {@link Double#NaN} if the none existed. + * + * @param key used to find the value to remove + * @return the removed value or {@link Double#NaN} if none existed. 
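+ * Note that {@link Double#NaN} is also returned when the key was actually mapped + * to NaN; check {@link #containsKey(int)} first when that distinction matters.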
+ */ + public double remove(int key) { + int baseHashIndex = calcBaseHashIndex(key); + int index = findForRemove(key, baseHashIndex); + if (index != 0) { + // If it is the first in the collision list, we should promote its + // next colliding element. + if (prev == 0) { + baseHash[baseHashIndex] = next[index]; + } + + next[prev] = next[index]; + next[index] = firstEmpty; + firstEmpty = index; + --size; + return values[index]; + } + + return Double.NaN; + } + + /** + * @return number of pairs currently in the map + */ + public int size() { + return this.size; + } + + /** + * Translates the mapped pairs' values into an array of Objects + * + * @return a double array of all the values currently in the map. + */ + public double[] toArray() { + int j = -1; + double[] array = new double[size]; + + // Iterates over the values, adding them to the array. + for (DoubleIterator iterator = iterator(); iterator.hasNext();) { + array[++j] = iterator.next(); + } + return array; + } + + /** + * Translates the mapped pairs' values into an array of T + * + * @param a + * the array into which the elements of the list are to be + * stored. If it is big enough use whatever space we need, + * setting the one after the true data as {@link Double#NaN}. + * + * @return an array containing the elements of the list, using the given + * parameter if big enough, otherwise allocate an appropriate array + * and return it. + * + */ + public double[] toArray(double[] a) { + int j = 0; + if (a.length < this.size()) { + a = new double[this.size()]; + } + + // Iterates over the values, adding them to the array. + for (DoubleIterator iterator = iterator(); iterator.hasNext(); ++j) { + a[j] = iterator.next(); + } + + if (j < a.length) { + a[j] = Double.NaN; + } + + return a; + } + + @Override + public String toString() { + StringBuffer sb = new StringBuffer(); + sb.append('{'); + IntIterator keyIterator = keyIterator(); + while (keyIterator.hasNext()) { + int key = keyIterator.next(); + sb.append(key); + sb.append('='); + sb.append(get(key)); + if (keyIterator.hasNext()) { + sb.append(','); + sb.append(' '); + } + } + sb.append('}'); + return sb.toString(); + } + + @Override + public int hashCode() { + return getClass().hashCode() ^ size(); + } + + @Override + public boolean equals(Object o) { + IntToDoubleMap that = (IntToDoubleMap)o; + if (that.size() != this.size()) { + return false; + } + + IntIterator it = keyIterator(); + while (it.hasNext()) { + int key = it.next(); + if (!that.containsKey(key)) { + return false; + } + + double v1 = this.get(key); + double v2 = that.get(key); + if (Double.compare(v1, v2) != 0) { + return false; + } + } + return true; + } +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/util/collections/IntToIntMap.java b/modules/facet/src/java/org/apache/lucene/util/collections/IntToIntMap.java new file mode 100644 index 00000000000..e7726c87a97 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/collections/IntToIntMap.java @@ -0,0 +1,620 @@ +package org.apache.lucene.util.collections; + +import java.util.Arrays; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An Array-based hashtable which maps primitive int to primitive int.
+ * The hashtable is constructed with a given capacity, or 16 by default. When + * there is not enough room for new pairs, the hashtable grows.
+ * Capacity is adjusted to a power of 2, and there are 2 * capacity entries for + * the hash. + * + * The pre-allocated arrays (for keys, values) have a length of capacity + 1, + * where index 0 is used as 'Ground' or 'NULL'.
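+ * <p>
+ * A minimal usage sketch (for illustration only; the variable names below are
+ * arbitrary examples, not part of this class):
+ * <pre>
+ * IntToIntMap counts = new IntToIntMap();
+ * counts.put(7, 1);
+ * counts.put(7, counts.get(7) + 1);   // 7 is now mapped to 2
+ * int missing = counts.get(13);       // -1, the 'Ground' value, for keys not in the map
+ * </pre>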
+ * + * The arrays are allocated ahead of hash operations, and form an 'empty space' + * list, to which the key,value pair is allocated. + * + * @lucene.experimental + */ +public class IntToIntMap { + + public static final int GROUD = -1; + /** + * Implements an IntIterator which iterates over all the allocated indexes. + */ + private final class IndexIterator implements IntIterator { + /** + * The last used baseHashIndex. Needed for "jumping" from one hash entry + * to another. + */ + private int baseHashIndex = 0; + + /** + * The next not-yet-visited index. + */ + private int index = 0; + + /** + * Index of the last visited pair. Used in {@link #remove()}. + */ + private int lastIndex = 0; + + /** + * Create the Iterator, make index point to the "first" + * index which is not empty. If such does not exist (eg. the map is + * empty) it would be zero. + */ + public IndexIterator() { + for (baseHashIndex = 0; baseHashIndex < baseHash.length; ++baseHashIndex) { + index = baseHash[baseHashIndex]; + if (index != 0) { + break; + } + } + } + + public boolean hasNext() { + return (index != 0); + } + + public int next() { + // Save the last index visited + lastIndex = index; + + // next the index + index = next[index]; + + // if the next index points to the 'Ground' it means we're done with + // the current hash entry and we need to jump to the next one. This + // is done until all the hash entries had been visited. + while (index == 0 && ++baseHashIndex < baseHash.length) { + index = baseHash[baseHashIndex]; + } + + return lastIndex; + } + + public void remove() { + IntToIntMap.this.remove(keys[lastIndex]); + } + + } + + /** + * Implements an IntIterator, used for iteration over the map's keys. + */ + private final class KeyIterator implements IntIterator { + private IntIterator iterator = new IndexIterator(); + + KeyIterator() { } + + public boolean hasNext() { + return iterator.hasNext(); + } + + public int next() { + return keys[iterator.next()]; + } + + public void remove() { + iterator.remove(); + } + } + + /** + * Implements an IntIterator used for iteration over the map's values. + */ + private final class ValueIterator implements IntIterator { + private IntIterator iterator = new IndexIterator(); + + ValueIterator() { } + + public boolean hasNext() { + return iterator.hasNext(); + } + + public int next() { + return values[iterator.next()]; + } + + public void remove() { + iterator.remove(); + } + } + + /** + * Default capacity - in case no capacity was specified in the constructor + */ + private static int defaultCapacity = 16; + + /** + * Holds the base hash entries. if the capacity is 2^N, than the base hash + * holds 2^(N+1). It can hold + */ + int[] baseHash; + + /** + * The current capacity of the map. Always 2^N and never less than 16. We + * never use the zero index. It is needed to improve performance and is also + * used as "ground". + */ + private int capacity; + /** + * All objects are being allocated at map creation. Those objects are "free" + * or empty. Whenever a new pair comes along, a pair is being "allocated" or + * taken from the free-linked list. as this is just a free list. + */ + private int firstEmpty; + + /** + * hashFactor is always (2^(N+1)) - 1. Used for faster hashing. + */ + private int hashFactor; + + /** + * This array holds the unique keys + */ + int[] keys; + + /** + * In case of collisions, we implement a double linked list of the colliding + * hash's with the following next[] and prev[]. Those are also used to store + * the "empty" list. 
+ */ + int[] next; + + private int prev; + + /** + * Number of currently objects in the map. + */ + private int size; + + /** + * This array holds the values + */ + int[] values; + + /** + * Constructs a map with default capacity. + */ + public IntToIntMap() { + this(defaultCapacity); + } + + /** + * Constructs a map with given capacity. Capacity is adjusted to a native + * power of 2, with minimum of 16. + * + * @param capacity + * minimum capacity for the map. + */ + public IntToIntMap(int capacity) { + this.capacity = 16; + // Minimum capacity is 16.. + while (this.capacity < capacity) { + // Multiply by 2 as long as we're still under the requested capacity + this.capacity <<= 1; + } + + // As mentioned, we use the first index (0) as 'Ground', so we need the + // length of the arrays to be one more than the capacity + int arrayLength = this.capacity + 1; + + this.values = new int[arrayLength]; + this.keys = new int[arrayLength]; + this.next = new int[arrayLength]; + + this.values[0] = GROUD; + + // Hash entries are twice as big as the capacity. + int baseHashSize = this.capacity << 1; + + this.baseHash = new int[baseHashSize]; + + // The has factor is 2^M - 1 which is used as an "AND" hashing operator. + // {@link #calcBaseHash()} + this.hashFactor = baseHashSize - 1; + + this.size = 0; + + clear(); + } + + /** + * Adds a pair to the map. Takes the first empty position from the + * empty-linked-list's head - {@link firstEmpty}. + * + * New pairs are always inserted to baseHash, and are followed by the old + * colliding pair. + * + * @param key + * integer which maps the given value + * @param e + * value which is being mapped using the given key + */ + private void prvt_put(int key, int e) { + // Hash entry to which the new pair would be inserted + int hashIndex = calcBaseHashIndex(key); + + // 'Allocating' a pair from the "Empty" list. + int objectIndex = firstEmpty; + + // Setting data + firstEmpty = next[firstEmpty]; + values[objectIndex] = e; + keys[objectIndex] = key; + + // Inserting the new pair as the first node in the specific hash entry + next[objectIndex] = baseHash[hashIndex]; + baseHash[hashIndex] = objectIndex; + + // Announcing a new pair was added! + ++size; + } + + /** + * Calculating the baseHash index using the internal hashFactor. + * + * @param key + */ + protected int calcBaseHashIndex(int key) { + return key & hashFactor; + } + + /** + * Empties the map. Generates the "Empty" space list for later allocation. + */ + public void clear() { + // Clears the hash entries + Arrays.fill(this.baseHash, 0); + + // Set size to zero + size = 0; + + // Mark all array entries as empty. This is done with + // firstEmpty pointing to the first valid index (1 as 0 is + // used as 'Ground'). + firstEmpty = 1; + + // And setting all the next[i] to point at + // i+1. + for (int i = 1; i < this.capacity;) { + next[i] = ++i; + } + + // Surly, the last one should point to the 'Ground'. + next[this.capacity] = 0; + } + + /** + * Checks if a given key exists in the map. + * + * @param key + * that is checked against the map data. + * @return true if the key exists in the map. false otherwise. + */ + public boolean containsKey(int key) { + return find(key) != 0; + } + + /** + * Checks if the given object exists in the map.
+ * This method iterates over the collection, trying to find an equal object. + * + * @param v + * value that is checked against the map data. + * @return true if the value exists in the map (in .equals() meaning). + * false otherwise. + */ + public boolean containsValue(int v) { + for (IntIterator iterator = iterator(); iterator.hasNext();) { + if (v == iterator.next()) { + return true; + } + } + return false; + } + + /** + * Find the actual index of a given key. + * + * @param key + * @return index of the key. zero if the key wasn't found. + */ + protected int find(int key) { + // Calculate the hash entry. + int baseHashIndex = calcBaseHashIndex(key); + + // Start from the hash entry. + int localIndex = baseHash[baseHashIndex]; + + // while the index does not point to the 'Ground' + while (localIndex != 0) { + // returns the index found in case of of a matching key. + if (keys[localIndex] == key) { + return localIndex; + } + + // next the local index + localIndex = next[localIndex]; + } + + // If we got this far, it could only mean we did not find the key we + // were asked for. return 'Ground' index. + return 0; + } + + /** + * Find the actual index of a given key with it's baseHashIndex.
+ * Some methods use the baseHashIndex. If those call {@link #find()} there's + * no need to re-calculate that hash. + * + * @param key + * @param baseHashIndex + * @return the index of the given key, or 0 as 'Ground' if the key wasn't + * found. + */ + private int findForRemove(int key, int baseHashIndex) { + // Start from the hash entry. + this.prev = 0; + int index = baseHash[baseHashIndex]; + + // while the index does not point to the 'Ground' + while (index != 0) { + // returns the index found in case of of a matching key. + if (keys[index] == key) { + return index; + } + + // next the local index + prev = index; + index = next[index]; + } + + // If we got this far, it could only mean we did not find the key we + // were asked for. return 'Ground' index. + this.prev = 0; + return 0; + } + + /** + * Returns the object mapped with the given key. + * + * @param key + * int who's mapped object we're interested in. + * @return an object mapped by the given key. null if the key wasn't found. + */ + public int get(int key) { + return values[find(key)]; + } + + /** + * Grows the map. Allocates a new map of double the capacity, and + * fast-insert the old key-value pairs. + */ + protected void grow() { + IntToIntMap that = new IntToIntMap( + this.capacity * 2); + + // Iterates fast over the collection. Any valid pair is put into the new + // map without checking for duplicates or if there's enough space for + // it. + for (IndexIterator iterator = new IndexIterator(); iterator.hasNext();) { + int index = iterator.next(); + that.prvt_put(this.keys[index], this.values[index]); + } + + // Copy that's data into this. + this.capacity = that.capacity; + this.size = that.size; + this.firstEmpty = that.firstEmpty; + this.values = that.values; + this.keys = that.keys; + this.next = that.next; + this.baseHash = that.baseHash; + this.hashFactor = that.hashFactor; + } + + /** + * + * @return true if the map is empty. false otherwise. + */ + public boolean isEmpty() { + return size == 0; + } + + /** + * Returns a new iterator for the mapped objects. + */ + public IntIterator iterator() { + return new ValueIterator(); + } + + /** Returns an iterator on the map keys. */ + public IntIterator keyIterator() { + return new KeyIterator(); + } + + /** + * Prints the baseHash array, used for debug purposes. + */ + @SuppressWarnings("unused") + private void printBaseHash() { + for (int i = 0; i < this.baseHash.length; i++) { + System.out.println(i + ".\t" + baseHash[i]); + } + } + + /** + * Inserts the <key,value> pair into the map. If the key already exists, + * this method updates the mapped value to the given one, returning the old + * mapped value. + * + * @return the old mapped value, or 0 if the key didn't exist. + */ + public int put(int key, int e) { + // Does key exists? + int index = find(key); + + // Yes! + if (index != 0) { + // Set new data and exit. + int old = values[index]; + values[index] = e; + return old; + } + + // Is there enough room for a new pair? + if (size == capacity) { + // No? Than grow up! + grow(); + } + + // Now that everything is set, the pair can be just put inside with no + // worries. + prvt_put(key, e); + + return 0; + } + + /** + * Removes a <key,value> pair from the map and returns the mapped value, + * or 0 if the none existed. + * + * @param key used to find the value to remove + * @return the removed value or 0 if none existed. 
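+ * Note that 0 is also returned when the key was actually mapped to 0; check + * {@link #containsKey(int)} first when that distinction matters.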
+ */ + public int remove(int key) { + int baseHashIndex = calcBaseHashIndex(key); + int index = findForRemove(key, baseHashIndex); + if (index != 0) { + // If it is the first in the collision list, we should promote its + // next colliding element. + if (prev == 0) { + baseHash[baseHashIndex] = next[index]; + } + + next[prev] = next[index]; + next[index] = firstEmpty; + firstEmpty = index; + --size; + return values[index]; + } + + return 0; + } + + /** + * @return number of pairs currently in the map + */ + public int size() { + return this.size; + } + + /** + * Translates the mapped pairs' values into an array of Objects + * + * @return an object array of all the values currently in the map. + */ + public int[] toArray() { + int j = -1; + int[] array = new int[size]; + + // Iterates over the values, adding them to the array. + for (IntIterator iterator = iterator(); iterator.hasNext();) { + array[++j] = iterator.next(); + } + return array; + } + + /** + * Translates the mapped pairs' values into an array of ints + * + * @param a + * the array into which the elements of the map are to be + * stored, if it is big enough; otherwise, a new array of the + * same runtime type is allocated for this purpose. + * + * @return an array containing the values stored in the map + * + */ + public int[] toArray(int[] a) { + int j = 0; + if (a.length < size) { + a = new int[size]; + } + // Iterates over the values, adding them to the array. + for (IntIterator iterator = iterator(); j < a.length + && iterator.hasNext(); ++j) { + a[j] = iterator.next(); + } + return a; + } + + @Override + public String toString() { + StringBuffer sb = new StringBuffer(); + sb.append('{'); + IntIterator keyIterator = keyIterator(); + while (keyIterator.hasNext()) { + int key = keyIterator.next(); + sb.append(key); + sb.append('='); + sb.append(get(key)); + if (keyIterator.hasNext()) { + sb.append(','); + sb.append(' '); + } + } + sb.append('}'); + return sb.toString(); + } + + @Override + public int hashCode() { + return getClass().hashCode() ^ size(); + } + + @Override + public boolean equals(Object o) { + IntToIntMap that = (IntToIntMap)o; + if (that.size() != this.size()) { + return false; + } + + IntIterator it = keyIterator(); + while (it.hasNext()) { + int key = it.next(); + + if (!that.containsKey(key)) { + return false; + } + + int v1 = this.get(key); + int v2 = that.get(key); + if (v1 != v2) { + return false; + } + } + return true; + } +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/util/collections/IntToObjectMap.java b/modules/facet/src/java/org/apache/lucene/util/collections/IntToObjectMap.java new file mode 100644 index 00000000000..27e4654948b --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/collections/IntToObjectMap.java @@ -0,0 +1,630 @@ +package org.apache.lucene.util.collections; + +import java.util.Arrays; +import java.util.Iterator; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An Array-based hashtable which maps primitive int to Objects of generic type + * T.
+ * The hashtable is constructed with a given capacity, or 16 by default. When + * there is not enough room for new pairs, the hashtable grows.
+ * Capacity is adjusted to a power of 2, and there are 2 * capacity entries for + * the hash. + * + * The pre-allocated arrays (for keys, values) have a length of capacity + 1, + * where index 0 is used as 'Ground' or 'NULL'.
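+ * <p>
+ * A minimal usage sketch (for illustration only; the variable names below are
+ * arbitrary examples, not part of this class):
+ * <pre>
+ * IntToObjectMap<String> labels = new IntToObjectMap<String>();
+ * labels.put(1, "one");
+ * String old = labels.put(1, "uno");  // returns the previously mapped "one"
+ * String none = labels.get(2);        // null for keys that are not in the map
+ * </pre>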
+ * + * The arrays are allocated ahead of hash operations, and form an 'empty space' + * list, to which the key,value pair is allocated. + * + * @lucene.experimental + */ +public class IntToObjectMap implements Iterable { + + /** + * Implements an IntIterator which iterates over all the allocated indexes. + */ + private final class IndexIterator implements IntIterator { + /** + * The last used baseHashIndex. Needed for "jumping" from one hash entry + * to another. + */ + private int baseHashIndex = 0; + + /** + * The next not-yet-visited index. + */ + private int index = 0; + + /** + * Index of the last visited pair. Used in {@link #remove()}. + */ + private int lastIndex = 0; + + /** + * Create the Iterator, make index point to the "first" + * index which is not empty. If such does not exist (eg. the map is + * empty) it would be zero. + */ + public IndexIterator() { + for (baseHashIndex = 0; baseHashIndex < baseHash.length; ++baseHashIndex) { + index = baseHash[baseHashIndex]; + if (index != 0) { + break; + } + } + } + + public boolean hasNext() { + return (index != 0); + } + + public int next() { + // Save the last index visited + lastIndex = index; + + // next the index + index = next[index]; + + // if the next index points to the 'Ground' it means we're done with + // the current hash entry and we need to jump to the next one. This + // is done until all the hash entries had been visited. + while (index == 0 && ++baseHashIndex < baseHash.length) { + index = baseHash[baseHashIndex]; + } + + return lastIndex; + } + + public void remove() { + IntToObjectMap.this.remove(keys[lastIndex]); + } + + } + + /** + * Implements an IntIterator, used for iteration over the map's keys. + */ + private final class KeyIterator implements IntIterator { + private IntIterator iterator = new IndexIterator(); + + KeyIterator() { } + + public boolean hasNext() { + return iterator.hasNext(); + } + + public int next() { + return keys[iterator.next()]; + } + + public void remove() { + iterator.remove(); + } + } + + /** + * Implements an Iterator of a generic type T used for iteration over the + * map's values. + */ + private final class ValueIterator implements Iterator { + private IntIterator iterator = new IndexIterator(); + + ValueIterator() { } + + public boolean hasNext() { + return iterator.hasNext(); + } + + @SuppressWarnings("unchecked") + public T next() { + return (T) values[iterator.next()]; + } + + public void remove() { + iterator.remove(); + } + } + + /** + * Default capacity - in case no capacity was specified in the constructor + */ + private static int defaultCapacity = 16; + + /** + * Holds the base hash entries. if the capacity is 2^N, than the base hash + * holds 2^(N+1). It can hold + */ + int[] baseHash; + + /** + * The current capacity of the map. Always 2^N and never less than 16. We + * never use the zero index. It is needed to improve performance and is also + * used as "ground". + */ + private int capacity; + /** + * All objects are being allocated at map creation. Those objects are "free" + * or empty. Whenever a new pair comes along, a pair is being "allocated" or + * taken from the free-linked list. as this is just a free list. + */ + private int firstEmpty; + + /** + * hashFactor is always (2^(N+1)) - 1. Used for faster hashing. + */ + private int hashFactor; + + /** + * This array holds the unique keys + */ + int[] keys; + + /** + * In case of collisions, we implement a double linked list of the colliding + * hash's with the following next[] and prev[]. 
Those are also used to store + * the "empty" list. + */ + int[] next; + + private int prev; + + /** + * Number of currently objects in the map. + */ + private int size; + + /** + * This array holds the values + */ + Object[] values; + + /** + * Constructs a map with default capacity. + */ + public IntToObjectMap() { + this(defaultCapacity); + } + + /** + * Constructs a map with given capacity. Capacity is adjusted to a native + * power of 2, with minimum of 16. + * + * @param capacity + * minimum capacity for the map. + */ + public IntToObjectMap(int capacity) { + this.capacity = 16; + // Minimum capacity is 16.. + while (this.capacity < capacity) { + // Multiply by 2 as long as we're still under the requested capacity + this.capacity <<= 1; + } + + // As mentioned, we use the first index (0) as 'Ground', so we need the + // length of the arrays to be one more than the capacity + int arrayLength = this.capacity + 1; + + this.values = new Object[arrayLength]; + this.keys = new int[arrayLength]; + this.next = new int[arrayLength]; + + // Hash entries are twice as big as the capacity. + int baseHashSize = this.capacity << 1; + + this.baseHash = new int[baseHashSize]; + + // The has factor is 2^M - 1 which is used as an "AND" hashing operator. + // {@link #calcBaseHash()} + this.hashFactor = baseHashSize - 1; + + this.size = 0; + + clear(); + } + + /** + * Adds a pair to the map. Takes the first empty position from the + * empty-linked-list's head - {@link firstEmpty}. + * + * New pairs are always inserted to baseHash, and are followed by the old + * colliding pair. + * + * @param key + * integer which maps the given Object + * @param e + * element which is being mapped using the given key + */ + private void prvt_put(int key, T e) { + // Hash entry to which the new pair would be inserted + int hashIndex = calcBaseHashIndex(key); + + // 'Allocating' a pair from the "Empty" list. + int objectIndex = firstEmpty; + + // Setting data + firstEmpty = next[firstEmpty]; + values[objectIndex] = e; + keys[objectIndex] = key; + + // Inserting the new pair as the first node in the specific hash entry + next[objectIndex] = baseHash[hashIndex]; + baseHash[hashIndex] = objectIndex; + + // Announcing a new pair was added! + ++size; + } + + /** + * Calculating the baseHash index using the internal hashFactor. + * + * @param key + */ + protected int calcBaseHashIndex(int key) { + return key & hashFactor; + } + + /** + * Empties the map. Generates the "Empty" space list for later allocation. + */ + public void clear() { + // Clears the hash entries + Arrays.fill(this.baseHash, 0); + + // Set size to zero + size = 0; + + // Mark all array entries as empty. This is done with + // firstEmpty pointing to the first valid index (1 as 0 is + // used as 'Ground'). + firstEmpty = 1; + + // And setting all the next[i] to point at + // i+1. + for (int i = 1; i < this.capacity;) { + next[i] = ++i; + } + + // Surly, the last one should point to the 'Ground'. + next[this.capacity] = 0; + } + + /** + * Checks if a given key exists in the map. + * + * @param key + * that is checked against the map data. + * @return true if the key exists in the map. false otherwise. + */ + public boolean containsKey(int key) { + return find(key) != 0; + } + + /** + * Checks if the given object exists in the map.
+ * This method iterates over the collection, trying to find an equal object. + * + * @param o + * object that is checked against the map data. + * @return true if the object exists in the map (in .equals() meaning). + * false otherwise. + */ + public boolean containsValue(Object o) { + for (Iterator iterator = iterator(); iterator.hasNext();) { + T object = iterator.next(); + if (object.equals(o)) { + return true; + } + } + return false; + } + + /** + * Find the actual index of a given key. + * + * @param key + * @return index of the key. zero if the key wasn't found. + */ + protected int find(int key) { + // Calculate the hash entry. + int baseHashIndex = calcBaseHashIndex(key); + + // Start from the hash entry. + int localIndex = baseHash[baseHashIndex]; + + // while the index does not point to the 'Ground' + while (localIndex != 0) { + // returns the index found in case of of a matching key. + if (keys[localIndex] == key) { + return localIndex; + } + + // next the local index + localIndex = next[localIndex]; + } + + // If we got this far, it could only mean we did not find the key we + // were asked for. return 'Ground' index. + return 0; + } + + /** + * Find the actual index of a given key with it's baseHashIndex.
+ * Some methods use the baseHashIndex. If those call {@link #find()} there's + * no need to re-calculate that hash. + * + * @param key + * @param baseHashIndex + * @return the index of the given key, or 0 as 'Ground' if the key wasn't + * found. + */ + private int findForRemove(int key, int baseHashIndex) { + // Start from the hash entry. + this.prev = 0; + int index = baseHash[baseHashIndex]; + + // while the index does not point to the 'Ground' + while (index != 0) { + // returns the index found in case of of a matching key. + if (keys[index] == key) { + return index; + } + + // next the local index + prev = index; + index = next[index]; + } + + // If we got this far, it could only mean we did not find the key we + // were asked for. return 'Ground' index. + this.prev = 0; + return 0; + } + + /** + * Returns the object mapped with the given key. + * + * @param key + * int who's mapped object we're interested in. + * @return an object mapped by the given key. null if the key wasn't found. + */ + @SuppressWarnings("unchecked") + public T get(int key) { + return (T) values[find(key)]; + } + + /** + * Grows the map. Allocates a new map of double the capacity, and + * fast-insert the old key-value pairs. + */ + @SuppressWarnings("unchecked") + protected void grow() { + IntToObjectMap that = new IntToObjectMap( + this.capacity * 2); + + // Iterates fast over the collection. Any valid pair is put into the new + // map without checking for duplicates or if there's enough space for + // it. + for (IndexIterator iterator = new IndexIterator(); iterator.hasNext();) { + int index = iterator.next(); + that.prvt_put(this.keys[index], (T) this.values[index]); + } + + // Copy that's data into this. + this.capacity = that.capacity; + this.size = that.size; + this.firstEmpty = that.firstEmpty; + this.values = that.values; + this.keys = that.keys; + this.next = that.next; + this.baseHash = that.baseHash; + this.hashFactor = that.hashFactor; + } + + /** + * + * @return true if the map is empty. false otherwise. + */ + public boolean isEmpty() { + return size == 0; + } + + /** + * Returns a new iterator for the mapped objects. + */ + public Iterator iterator() { + return new ValueIterator(); + } + + /** Returns an iterator on the map keys. */ + public IntIterator keyIterator() { + return new KeyIterator(); + } + + /** + * Prints the baseHash array, used for debug purposes. + */ + @SuppressWarnings("unused") + private void printBaseHash() { + for (int i = 0; i < this.baseHash.length; i++) { + System.out.println(i + ".\t" + baseHash[i]); + } + } + + /** + * Inserts the <key,value> pair into the map. If the key already exists, + * this method updates the mapped value to the given one, returning the old + * mapped value. + * + * @return the old mapped value, or null if the key didn't exist. + */ + @SuppressWarnings("unchecked") + public T put(int key, T e) { + // Does key exists? + int index = find(key); + + // Yes! + if (index != 0) { + // Set new data and exit. + T old = (T) values[index]; + values[index] = e; + return old; + } + + // Is there enough room for a new pair? + if (size == capacity) { + // No? Than grow up! + grow(); + } + + // Now that everything is set, the pair can be just put inside with no + // worries. + prvt_put(key, e); + + return null; + } + + /** + * Removes a <key,value> pair from the map and returns the mapped value, + * or null if the none existed. + * + * @param key used to find the value to remove + * @return the removed value or null if none existed. 
+ */ + @SuppressWarnings("unchecked") + public T remove(int key) { + int baseHashIndex = calcBaseHashIndex(key); + int index = findForRemove(key, baseHashIndex); + if (index != 0) { + // If it is the first in the collision list, we should promote its + // next colliding element. + if (prev == 0) { + baseHash[baseHashIndex] = next[index]; + } + + next[prev] = next[index]; + next[index] = firstEmpty; + firstEmpty = index; + --size; + return (T) values[index]; + } + + return null; + } + + /** + * @return number of pairs currently in the map + */ + public int size() { + return this.size; + } + + /** + * Translates the mapped pairs' values into an array of Objects + * + * @return an object array of all the values currently in the map. + */ + public Object[] toArray() { + int j = -1; + Object[] array = new Object[size]; + + // Iterates over the values, adding them to the array. + for (Iterator iterator = iterator(); iterator.hasNext();) { + array[++j] = iterator.next(); + } + return array; + } + + /** + * Translates the mapped pairs' values into an array of T + * + * @param a + * the array into which the elements of the list are to be + * stored, if it is big enough; otherwise, use whatever space we + * have, setting the one after the true data as null. + * + * @return an array containing the elements of the list + * + */ + public T[] toArray(T[] a) { + int j = 0; + // Iterates over the values, adding them to the array. + for (Iterator iterator = iterator(); j < a.length + && iterator.hasNext(); ++j) { + a[j] = iterator.next(); + } + + if (j < a.length) { + a[j] = null; + } + + return a; + } + + @Override + public String toString() { + StringBuffer sb = new StringBuffer(); + sb.append('{'); + IntIterator keyIterator = keyIterator(); + while (keyIterator.hasNext()) { + int key = keyIterator.next(); + sb.append(key); + sb.append('='); + sb.append(get(key)); + if (keyIterator.hasNext()) { + sb.append(','); + sb.append(' '); + } + } + sb.append('}'); + return sb.toString(); + } + + @Override + public int hashCode() { + return getClass().hashCode() ^ size(); + } + + @SuppressWarnings("unchecked") + @Override + public boolean equals(Object o) { + IntToObjectMap that = (IntToObjectMap)o; + if (that.size() != this.size()) { + return false; + } + + IntIterator it = keyIterator(); + while (it.hasNext()) { + int key = it.next(); + if (!that.containsKey(key)) { + return false; + } + + T v1 = this.get(key); + T v2 = that.get(key); + if ((v1 == null && v2 != null) || + (v1 != null && v2 == null) || + (!v1.equals(v2))) { + return false; + } + } + return true; + } +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/util/collections/LRUHashMap.java b/modules/facet/src/java/org/apache/lucene/util/collections/LRUHashMap.java new file mode 100644 index 00000000000..28a22588f1e --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/collections/LRUHashMap.java @@ -0,0 +1,105 @@ +package org.apache.lucene.util.collections; + +import java.util.LinkedHashMap; +import java.util.Map; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * LRUHashMap is an extension of Java's HashMap, which has a bounded size(); + * When it reaches that size, each time a new element is added, the least + * recently used (LRU) entry is removed. + *
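+ * A minimal usage sketch (illustrative only; the generic parameters and the
+ * access pattern are assumed for the example):
+ *
+ *   LRUHashMap&lt;String, Integer&gt; cache = new LRUHashMap&lt;String, Integer&gt;(2);
+ *   cache.put("a", 1);
+ *   cache.put("b", 2);
+ *   cache.get("a");      // "a" becomes most recently used, "b" is now eldest
+ *   cache.put("c", 3);   // size would exceed maxSize=2, so the LRU entry "b" is evicted
+ *   // the map now holds "a" and "c"
+ *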

+ * Java makes it very easy to implement LRUHashMap - all its functionality is + * already available from {@link java.util.LinkedHashMap}, and we just need to + * configure that properly. + *

+ * Note that like HashMap, LRUHashMap is unsynchronized, and the user MUST + * synchronize the access to it if used from several threads. Moreover, while + * with HashMap this is only a concern if one of the threads modifies the + * map, with LRUHashMap every read is a modification (because the LRU order + * needs to be remembered), so proper synchronization is always necessary. + *
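+ * For example, a simple (if coarse-grained) way to share one instance between
+ * threads is to guard every call with the same lock; the types and the 'key'
+ * variable below are only for illustration:
+ *
+ *   final LRUHashMap&lt;String, Integer&gt; cache = new LRUHashMap&lt;String, Integer&gt;(1024);
+ *   ...
+ *   synchronized (cache) {
+ *     Integer cached = cache.get(key);  // even get() reorders the LRU list,
+ *                                       // so reads must hold the lock too
+ *   }
+ *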

+ * With the usual synchronization mechanisms available to the user, this + * unfortunately means that LRUHashMap will probably perform sub-optimally under + * heavy contention: while one thread uses the hash table (reads or writes), any + * other thread will be blocked from using it - or even just starting to use it + * (e.g., calculating the hash function). A more efficient approach would be not + * to use LinkedHashMap at all, but rather to use a non-locking (as much as + * possible) thread-safe solution, something along the lines of + * java.util.concurrent.ConcurrentHashMap (though that particular class does not + * support the additional LRU semantics, which will need to be added separately + * using a concurrent linked list or additional storage of timestamps (in an + * array or inside the entry objects), or whatever). + * + * @lucene.experimental + */ +public class LRUHashMap extends LinkedHashMap { + + private int maxSize; + + /** + * Create a new hash map with a bounded size and with least recently + * used entries removed. + * @param maxSize + * the maximum size (in number of entries) to which the map can grow + * before the least recently used entries start being removed.
+ * Setting maxSize to a very large value, like + * {@link Integer#MAX_VALUE} is allowed, but is less efficient than + * using {@link java.util.HashMap} because our class needs + * to keep track of the use order (via an additional doubly-linked + * list) which is not used when the map's size is always below the + * maximum size. + */ + public LRUHashMap(int maxSize) { + super(16, 0.75f, true); + this.maxSize = maxSize; + } + + /** + * Return the max size + */ + public int getMaxSize() { + return maxSize; + } + + /** + * setMaxSize() allows changing the map's maximal number of elements + * which was defined at construction time. + *

+ * Note that if the map is already larger than maxSize, the current + * implementation does not shrink it (by removing the oldest elements); + * Rather, the map remains in its current size as new elements are + * added, and will only start shrinking (until settling again on the + * give maxSize) if existing elements are explicitly deleted. + */ + public void setMaxSize(int maxSize) { + this.maxSize = maxSize; + } + + // We override LinkedHashMap's removeEldestEntry() method. This method + // is called every time a new entry is added, and if we return true + // here, the eldest element will be deleted automatically. In our case, + // we return true if the size of the map grew beyond our limit - ignoring + // what is that eldest element that we'll be deleting. + @Override + protected boolean removeEldestEntry(Map.Entry eldest) { + return size() > maxSize; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/collections/ObjectToFloatMap.java b/modules/facet/src/java/org/apache/lucene/util/collections/ObjectToFloatMap.java new file mode 100644 index 00000000000..d4ee8112a7c --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/collections/ObjectToFloatMap.java @@ -0,0 +1,621 @@ +package org.apache.lucene.util.collections; + +import java.util.Arrays; +import java.util.Iterator; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An Array-based hashtable which maps Objects of generic type + * T to primitive float values.
+ * The hashtable is constructed with a given capacity, or 16 as a default. In + * case there's not enough room for new pairs, the hashtable grows.
+ * Capacity is adjusted to a power of 2, and there are 2 * capacity entries for + * the hash. + * + * The pre-allocated arrays (for keys, values) are of length capacity + 1, + * where index 0 is used as 'Ground' or 'NULL'.
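+ *
+ * A minimal usage sketch of the put/get API described above (illustrative only;
+ * the generic key type is assumed):
+ *
+ *   ObjectToFloatMap&lt;String&gt; weights = new ObjectToFloatMap&lt;String&gt;();
+ *   weights.put("lucene", 0.75f);
+ *   float w = weights.get("lucene");       // 0.75f
+ *   float missing = weights.get("facet");  // Float.NaN - the key is not mapped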
+ * + * The arrays are allocated ahead of hash operations, and form an 'empty space' + * list, to which the key,value pair is allocated. + * + * @lucene.experimental + */ +public class ObjectToFloatMap { + + /** + * Implements an IntIterator which iterates over all the allocated indexes. + */ + private final class IndexIterator implements IntIterator { + /** + * The last used baseHashIndex. Needed for "jumping" from one hash entry + * to another. + */ + private int baseHashIndex = 0; + + /** + * The next not-yet-visited index. + */ + private int index = 0; + + /** + * Index of the last visited pair. Used in {@link #remove()}. + */ + private int lastIndex = 0; + + /** + * Create the Iterator, make index point to the "first" + * index which is not empty. If such does not exist (eg. the map is + * empty) it would be zero. + */ + public IndexIterator() { + for (baseHashIndex = 0; baseHashIndex < baseHash.length; ++baseHashIndex) { + index = baseHash[baseHashIndex]; + if (index != 0) { + break; + } + } + } + + public boolean hasNext() { + return (index != 0); + } + + public int next() { + // Save the last index visited + lastIndex = index; + + // next the index + index = next[index]; + + // if the next index points to the 'Ground' it means we're done with + // the current hash entry and we need to jump to the next one. This + // is done until all the hash entries had been visited. + while (index == 0 && ++baseHashIndex < baseHash.length) { + index = baseHash[baseHashIndex]; + } + + return lastIndex; + } + + @SuppressWarnings("unchecked") + public void remove() { + ObjectToFloatMap.this.remove((K) keys[lastIndex]); + } + + } + + /** + * Implements an IntIterator, used for iteration over the map's keys. + */ + private final class KeyIterator implements Iterator { + private IntIterator iterator = new IndexIterator(); + + KeyIterator() { } + + public boolean hasNext() { + return iterator.hasNext(); + } + + @SuppressWarnings("unchecked") + public K next() { + return (K) keys[iterator.next()]; + } + + public void remove() { + iterator.remove(); + } + } + + /** + * Implements an Iterator of a generic type T used for iteration over the + * map's values. + */ + private final class ValueIterator implements FloatIterator { + private IntIterator iterator = new IndexIterator(); + + ValueIterator() { } + + public boolean hasNext() { + return iterator.hasNext(); + } + + public float next() { + return values[iterator.next()]; + } + + public void remove() { + iterator.remove(); + } + } + + /** + * Default capacity - in case no capacity was specified in the constructor + */ + private static int defaultCapacity = 16; + + /** + * Holds the base hash entries. if the capacity is 2^N, than the base hash + * holds 2^(N+1). It can hold + */ + int[] baseHash; + + /** + * The current capacity of the map. Always 2^N and never less than 16. We + * never use the zero index. It is needed to improve performance and is also + * used as "ground". + */ + private int capacity; + /** + * All objects are being allocated at map creation. Those objects are "free" + * or empty. Whenever a new pair comes along, a pair is being "allocated" or + * taken from the free-linked list. as this is just a free list. + */ + private int firstEmpty; + + /** + * hashFactor is always (2^(N+1)) - 1. Used for faster hashing. 
+ */ + private int hashFactor; + + /** + * This array holds the unique keys + */ + Object[] keys; + + /** + * In case of collisions, we implement a double linked list of the colliding + * hash's with the following next[] and prev[]. Those are also used to store + * the "empty" list. + */ + int[] next; + + private int prev; + + /** + * Number of currently objects in the map. + */ + private int size; + + /** + * This array holds the values + */ + float[] values; + + /** + * Constructs a map with default capacity. + */ + public ObjectToFloatMap() { + this(defaultCapacity); + } + + /** + * Constructs a map with given capacity. Capacity is adjusted to a native + * power of 2, with minimum of 16. + * + * @param capacity + * minimum capacity for the map. + */ + public ObjectToFloatMap(int capacity) { + this.capacity = 16; + // Minimum capacity is 16.. + while (this.capacity < capacity) { + // Multiply by 2 as long as we're still under the requested capacity + this.capacity <<= 1; + } + + // As mentioned, we use the first index (0) as 'Ground', so we need the + // length of the arrays to be one more than the capacity + int arrayLength = this.capacity + 1; + + this.values = new float[arrayLength]; + this.keys = new Object[arrayLength]; + this.next = new int[arrayLength]; + + // Hash entries are twice as big as the capacity. + int baseHashSize = this.capacity << 1; + + this.baseHash = new int[baseHashSize]; + + // The has factor is 2^M - 1 which is used as an "AND" hashing operator. + // {@link #calcBaseHash()} + this.hashFactor = baseHashSize - 1; + + this.size = 0; + + clear(); + } + + /** + * Adds a pair to the map. Takes the first empty position from the + * empty-linked-list's head - {@link firstEmpty}. + * + * New pairs are always inserted to baseHash, and are followed by the old + * colliding pair. + * + * @param key + * integer which maps the given Object + * @param e + * element which is being mapped using the given key + */ + private void prvt_put(K key, float e) { + // Hash entry to which the new pair would be inserted + int hashIndex = calcBaseHashIndex(key); + + // 'Allocating' a pair from the "Empty" list. + int objectIndex = firstEmpty; + + // Setting data + firstEmpty = next[firstEmpty]; + values[objectIndex] = e; + keys[objectIndex] = key; + + // Inserting the new pair as the first node in the specific hash entry + next[objectIndex] = baseHash[hashIndex]; + baseHash[hashIndex] = objectIndex; + + // Announcing a new pair was added! + ++size; + } + + /** + * Calculating the baseHash index using the internal hashFactor. + * + * @param key + */ + protected int calcBaseHashIndex(K key) { + return key.hashCode() & hashFactor; + } + + /** + * Empties the map. Generates the "Empty" space list for later allocation. + */ + public void clear() { + // Clears the hash entries + Arrays.fill(this.baseHash, 0); + + // Set size to zero + size = 0; + + values[0] = Float.NaN; + + // Mark all array entries as empty. This is done with + // firstEmpty pointing to the first valid index (1 as 0 is + // used as 'Ground'). + firstEmpty = 1; + + // And setting all the next[i] to point at + // i+1. + for (int i = 1; i < this.capacity;) { + next[i] = ++i; + } + + // Surly, the last one should point to the 'Ground'. + next[this.capacity] = 0; + } + + /** + * Checks if a given key exists in the map. + * + * @param key + * that is checked against the map data. + * @return true if the key exists in the map. false otherwise. 
+ */ + public boolean containsKey(K key) { + return find(key) != 0; + } + + /** + * Checks if the given object exists in the map.
+ * This method iterates over the collection, trying to find an equal object. + * + * @param o + * object that is checked against the map data. + * @return true if the object exists in the map (in .equals() meaning). + * false otherwise. + */ + public boolean containsValue(float o) { + for (FloatIterator iterator = iterator(); iterator.hasNext();) { + if (o == iterator.next()) { + return true; + } + } + return false; + } + + /** + * Find the actual index of a given key. + * + * @param key + * @return index of the key. zero if the key wasn't found. + */ + protected int find(K key) { + // Calculate the hash entry. + int baseHashIndex = calcBaseHashIndex(key); + + // Start from the hash entry. + int localIndex = baseHash[baseHashIndex]; + + // while the index does not point to the 'Ground' + while (localIndex != 0) { + // returns the index found in case of of a matching key. + if (keys[localIndex].equals(key)) { + return localIndex; + } + + // next the local index + localIndex = next[localIndex]; + } + + // If we got this far, it could only mean we did not find the key we + // were asked for. return 'Ground' index. + return 0; + } + + /** + * Find the actual index of a given key with it's baseHashIndex.
+ * Some methods use the baseHashIndex. If those call {@link #find()} there's + * no need to re-calculate that hash. + * + * @param key + * @param baseHashIndex + * @return the index of the given key, or 0 as 'Ground' if the key wasn't + * found. + */ + private int findForRemove(K key, int baseHashIndex) { + // Start from the hash entry. + this.prev = 0; + int index = baseHash[baseHashIndex]; + + // while the index does not point to the 'Ground' + while (index != 0) { + // returns the index found in case of of a matching key. + if (keys[index].equals(key)) { + return index; + } + + // next the local index + prev = index; + index = next[index]; + } + + // If we got this far, it could only mean we did not find the key we + // were asked for. return 'Ground' index. + this.prev = 0; + return 0; + } + + /** + * Returns the float mapped with the given key. + * + * @param key + * object who's mapped float we're interested in. + * @return a float mapped by the given key. Float.NaN if the key wasn't found. + */ + public float get(K key) { + return values[find(key)]; + } + + /** + * Grows the map. Allocates a new map of double the capacity, and + * fast-insert the old key-value pairs. + */ + @SuppressWarnings("unchecked") + protected void grow() { + ObjectToFloatMap that = new ObjectToFloatMap( + this.capacity * 2); + + // Iterates fast over the collection. Any valid pair is put into the new + // map without checking for duplicates or if there's enough space for + // it. + for (IndexIterator iterator = new IndexIterator(); iterator.hasNext();) { + int index = iterator.next(); + that.prvt_put((K) this.keys[index], this.values[index]); + } + + // Copy that's data into this. + this.capacity = that.capacity; + this.size = that.size; + this.firstEmpty = that.firstEmpty; + this.values = that.values; + this.keys = that.keys; + this.next = that.next; + this.baseHash = that.baseHash; + this.hashFactor = that.hashFactor; + } + + /** + * + * @return true if the map is empty. false otherwise. + */ + public boolean isEmpty() { + return size == 0; + } + + /** + * Returns a new iterator for the mapped floats. + */ + public FloatIterator iterator() { + return new ValueIterator(); + } + + /** Returns an iterator on the map keys. */ + public Iterator keyIterator() { + return new KeyIterator(); + } + + /** + * Prints the baseHash array, used for debug purposes. + */ + @SuppressWarnings("unused") + private void printBaseHash() { + for (int i = 0; i < this.baseHash.length; i++) { + System.out.println(i + ".\t" + baseHash[i]); + } + } + + /** + * Inserts the <key,value> pair into the map. If the key already exists, + * this method updates the mapped value to the given one, returning the old + * mapped value. + * + * @return the old mapped value, or {@link Float#NaN} if the key didn't exist. + */ + public float put(K key, float e) { + // Does key exists? + int index = find(key); + + // Yes! + if (index != 0) { + // Set new data and exit. + float old = values[index]; + values[index] = e; + return old; + } + + // Is there enough room for a new pair? + if (size == capacity) { + // No? Than grow up! + grow(); + } + + // Now that everything is set, the pair can be just put inside with no + // worries. + prvt_put(key, e); + + return Float.NaN; + } + + /** + * Removes a <key,value> pair from the map and returns the mapped value, + * or {@link Float#NaN} if the none existed. + * + * @param key used to find the value to remove + * @return the removed value or {@link Float#NaN} if none existed. 
+ */ + public float remove(K key) { + int baseHashIndex = calcBaseHashIndex(key); + int index = findForRemove(key, baseHashIndex); + if (index != 0) { + // If it is the first in the collision list, we should promote its + // next colliding element. + if (prev == 0) { + baseHash[baseHashIndex] = next[index]; + } + + next[prev] = next[index]; + next[index] = firstEmpty; + firstEmpty = index; + --size; + return values[index]; + } + + return Float.NaN; + } + + /** + * @return number of pairs currently in the map + */ + public int size() { + return this.size; + } + + /** + * Translates the mapped pairs' values into an array of Objects + * + * @return an object array of all the values currently in the map. + */ + public float[] toArray() { + int j = -1; + float[] array = new float[size]; + + // Iterates over the values, adding them to the array. + for (FloatIterator iterator = iterator(); iterator.hasNext();) { + array[++j] = iterator.next(); + } + return array; + } + + /** + * Translates the mapped pairs' values into an array of T + * + * @param a + * the array into which the elements of the list are to be + * stored, if it is big enough; otherwise, use as much space as it can. + * + * @return an array containing the elements of the list + * + */ + public float[] toArray(float[] a) { + int j = 0; + // Iterates over the values, adding them to the array. + for (FloatIterator iterator = iterator(); j < a.length + && iterator.hasNext(); ++j) { + a[j] = iterator.next(); + } + if (j < a.length) { + a[j] = Float.NaN; + } + + return a; + } + + @Override + public String toString() { + StringBuffer sb = new StringBuffer(); + sb.append('{'); + Iterator keyIterator = keyIterator(); + while (keyIterator.hasNext()) { + K key = keyIterator.next(); + sb.append(key); + sb.append('='); + sb.append(get(key)); + if (keyIterator.hasNext()) { + sb.append(','); + sb.append(' '); + } + } + sb.append('}'); + return sb.toString(); + } + + @Override + public int hashCode() { + return getClass().hashCode() ^ size(); + } + + @SuppressWarnings("unchecked") + @Override + public boolean equals(Object o) { + ObjectToFloatMap that = (ObjectToFloatMap)o; + if (that.size() != this.size()) { + return false; + } + + Iterator it = keyIterator(); + while (it.hasNext()) { + K key = it.next(); + float v1 = this.get(key); + float v2 = that.get(key); + if (Float.compare(v1, v2) != 0) { + return false; + } + } + return true; + } +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/util/collections/ObjectToIntMap.java b/modules/facet/src/java/org/apache/lucene/util/collections/ObjectToIntMap.java new file mode 100644 index 00000000000..e6330c7ed32 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/collections/ObjectToIntMap.java @@ -0,0 +1,620 @@ +package org.apache.lucene.util.collections; + +import java.util.Arrays; +import java.util.Iterator; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An Array-based hashtable which maps Objects of generic type + * T to primitive int values.
+ * The hashtable is constructed with a given capacity, or 16 as a default. In + * case there's not enough room for new pairs, the hashtable grows.
+ * Capacity is adjusted to a power of 2, and there are 2 * capacity entries for + * the hash. + * + * The pre-allocated arrays (for keys, values) are of length capacity + 1, + * where index 0 is used as 'Ground' or 'NULL'.
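+ *
+ * A minimal usage sketch of the put/get/containsKey API (illustrative only; the
+ * generic key type and the 'tokens' array are assumed):
+ *
+ *   ObjectToIntMap&lt;String&gt; counts = new ObjectToIntMap&lt;String&gt;();
+ *   for (String token : tokens) {
+ *     if (counts.containsKey(token)) {
+ *       counts.put(token, counts.get(token) + 1);
+ *     } else {
+ *       counts.put(token, 1);
+ *     }
+ *   }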
+ * + * The arrays are allocated ahead of hash operations, and form an 'empty space' + * list, to which the key,value pair is allocated. + * + * @lucene.experimental + */ +public class ObjectToIntMap { + + /** + * Implements an IntIterator which iterates over all the allocated indexes. + */ + private final class IndexIterator implements IntIterator { + /** + * The last used baseHashIndex. Needed for "jumping" from one hash entry + * to another. + */ + private int baseHashIndex = 0; + + /** + * The next not-yet-visited index. + */ + private int index = 0; + + /** + * Index of the last visited pair. Used in {@link #remove()}. + */ + private int lastIndex = 0; + + /** + * Create the Iterator, make index point to the "first" + * index which is not empty. If such does not exist (eg. the map is + * empty) it would be zero. + */ + public IndexIterator() { + for (baseHashIndex = 0; baseHashIndex < baseHash.length; ++baseHashIndex) { + index = baseHash[baseHashIndex]; + if (index != 0) { + break; + } + } + } + + public boolean hasNext() { + return (index != 0); + } + + public int next() { + // Save the last index visited + lastIndex = index; + + // next the index + index = next[index]; + + // if the next index points to the 'Ground' it means we're done with + // the current hash entry and we need to jump to the next one. This + // is done until all the hash entries had been visited. + while (index == 0 && ++baseHashIndex < baseHash.length) { + index = baseHash[baseHashIndex]; + } + + return lastIndex; + } + + @SuppressWarnings("unchecked") + public void remove() { + ObjectToIntMap.this.remove((K) keys[lastIndex]); + } + + } + + /** + * Implements an IntIterator, used for iteration over the map's keys. + */ + private final class KeyIterator implements Iterator { + private IntIterator iterator = new IndexIterator(); + + KeyIterator() { } + + public boolean hasNext() { + return iterator.hasNext(); + } + + @SuppressWarnings("unchecked") + public K next() { + return (K) keys[iterator.next()]; + } + + public void remove() { + iterator.remove(); + } + } + + /** + * Implements an Iterator of a generic type T used for iteration over the + * map's values. + */ + private final class ValueIterator implements IntIterator { + private IntIterator iterator = new IndexIterator(); + + ValueIterator() {} + + public boolean hasNext() { + return iterator.hasNext(); + } + + public int next() { + return values[iterator.next()]; + } + + public void remove() { + iterator.remove(); + } + } + + /** + * Default capacity - in case no capacity was specified in the constructor + */ + private static int defaultCapacity = 16; + + /** + * Holds the base hash entries. if the capacity is 2^N, than the base hash + * holds 2^(N+1). It can hold + */ + int[] baseHash; + + /** + * The current capacity of the map. Always 2^N and never less than 16. We + * never use the zero index. It is needed to improve performance and is also + * used as "ground". + */ + private int capacity; + /** + * All objects are being allocated at map creation. Those objects are "free" + * or empty. Whenever a new pair comes along, a pair is being "allocated" or + * taken from the free-linked list. as this is just a free list. + */ + private int firstEmpty; + + /** + * hashFactor is always (2^(N+1)) - 1. Used for faster hashing. + */ + private int hashFactor; + + /** + * This array holds the unique keys + */ + Object[] keys; + + /** + * In case of collisions, we implement a double linked list of the colliding + * hash's with the following next[] and prev[]. 
Those are also used to store + * the "empty" list. + */ + int[] next; + + private int prev; + + /** + * Number of currently objects in the map. + */ + private int size; + + /** + * This array holds the values + */ + int[] values; + + /** + * Constructs a map with default capacity. + */ + public ObjectToIntMap() { + this(defaultCapacity); + } + + /** + * Constructs a map with given capacity. Capacity is adjusted to a native + * power of 2, with minimum of 16. + * + * @param capacity + * minimum capacity for the map. + */ + public ObjectToIntMap(int capacity) { + this.capacity = 16; + // Minimum capacity is 16.. + while (this.capacity < capacity) { + // Multiply by 2 as long as we're still under the requested capacity + this.capacity <<= 1; + } + + // As mentioned, we use the first index (0) as 'Ground', so we need the + // length of the arrays to be one more than the capacity + int arrayLength = this.capacity + 1; + + this.values = new int[arrayLength]; + this.keys = new Object[arrayLength]; + this.next = new int[arrayLength]; + + // Hash entries are twice as big as the capacity. + int baseHashSize = this.capacity << 1; + + this.baseHash = new int[baseHashSize]; + + // The has factor is 2^M - 1 which is used as an "AND" hashing operator. + // {@link #calcBaseHash()} + this.hashFactor = baseHashSize - 1; + + this.size = 0; + + clear(); + } + + /** + * Adds a pair to the map. Takes the first empty position from the + * empty-linked-list's head - {@link firstEmpty}. + * + * New pairs are always inserted to baseHash, and are followed by the old + * colliding pair. + * + * @param key + * integer which maps the given Object + * @param e + * element which is being mapped using the given key + */ + private void prvt_put(K key, int e) { + // Hash entry to which the new pair would be inserted + int hashIndex = calcBaseHashIndex(key); + + // 'Allocating' a pair from the "Empty" list. + int objectIndex = firstEmpty; + + // Setting data + firstEmpty = next[firstEmpty]; + values[objectIndex] = e; + keys[objectIndex] = key; + + // Inserting the new pair as the first node in the specific hash entry + next[objectIndex] = baseHash[hashIndex]; + baseHash[hashIndex] = objectIndex; + + // Announcing a new pair was added! + ++size; + } + + /** + * Calculating the baseHash index using the internal hashFactor. + * + * @param key + */ + protected int calcBaseHashIndex(K key) { + return key.hashCode() & hashFactor; + } + + /** + * Empties the map. Generates the "Empty" space list for later allocation. + */ + public void clear() { + // Clears the hash entries + Arrays.fill(this.baseHash, 0); + + // Set size to zero + size = 0; + + values[0] = Integer.MAX_VALUE; + + // Mark all array entries as empty. This is done with + // firstEmpty pointing to the first valid index (1 as 0 is + // used as 'Ground'). + firstEmpty = 1; + + // And setting all the next[i] to point at + // i+1. + for (int i = 1; i < this.capacity;) { + next[i] = ++i; + } + + // Surly, the last one should point to the 'Ground'. + next[this.capacity] = 0; + } + + /** + * Checks if a given key exists in the map. + * + * @param key + * that is checked against the map data. + * @return true if the key exists in the map. false otherwise. + */ + public boolean containsKey(K key) { + return find(key) != 0; + } + + /** + * Checks if the given object exists in the map.
+ * This method iterates over the collection, trying to find an equal object. + * + * @param o + * object that is checked against the map data. + * @return true if the object exists in the map (in .equals() meaning). + * false otherwise. + */ + public boolean containsValue(int o) { + for (IntIterator iterator = iterator(); iterator.hasNext();) { + if (o == iterator.next()) { + return true; + } + } + return false; + } + + /** + * Find the actual index of a given key. + * + * @param key + * @return index of the key. zero if the key wasn't found. + */ + protected int find(K key) { + // Calculate the hash entry. + int baseHashIndex = calcBaseHashIndex(key); + + // Start from the hash entry. + int localIndex = baseHash[baseHashIndex]; + + // while the index does not point to the 'Ground' + while (localIndex != 0) { + // returns the index found in case of of a matching key. + if (keys[localIndex].equals(key)) { + return localIndex; + } + + // next the local index + localIndex = next[localIndex]; + } + + // If we got this far, it could only mean we did not find the key we + // were asked for. return 'Ground' index. + return 0; + } + + /** + * Find the actual index of a given key with it's baseHashIndex.
+ * Some methods use the baseHashIndex. If those call {@link #find()} there's + * no need to re-calculate that hash. + * + * @param key + * @param baseHashIndex + * @return the index of the given key, or 0 as 'Ground' if the key wasn't + * found. + */ + private int findForRemove(K key, int baseHashIndex) { + // Start from the hash entry. + this.prev = 0; + int index = baseHash[baseHashIndex]; + + // while the index does not point to the 'Ground' + while (index != 0) { + // returns the index found in case of of a matching key. + if (keys[index].equals(key)) { + return index; + } + + // next the local index + prev = index; + index = next[index]; + } + + // If we got this far, it could only mean we did not find the key we + // were asked for. return 'Ground' index. + this.prev = 0; + return 0; + } + + /** + * Returns the int mapped with the given key. + * + * @param key + * int who's mapped object we're interested in. + * @return an object mapped by the given key. null if the key wasn't found. + */ + public int get(K key) { + return values[find(key)]; + } + + /** + * Grows the map. Allocates a new map of double the capacity, and + * fast-insert the old key-value pairs. + */ + @SuppressWarnings("unchecked") + protected void grow() { + ObjectToIntMap that = new ObjectToIntMap( + this.capacity * 2); + + // Iterates fast over the collection. Any valid pair is put into the new + // map without checking for duplicates or if there's enough space for + // it. + for (IndexIterator iterator = new IndexIterator(); iterator.hasNext();) { + int index = iterator.next(); + that.prvt_put((K) this.keys[index], this.values[index]); + } + + // Copy that's data into this. + this.capacity = that.capacity; + this.size = that.size; + this.firstEmpty = that.firstEmpty; + this.values = that.values; + this.keys = that.keys; + this.next = that.next; + this.baseHash = that.baseHash; + this.hashFactor = that.hashFactor; + } + + /** + * + * @return true if the map is empty. false otherwise. + */ + public boolean isEmpty() { + return size == 0; + } + + /** + * Returns a new iterator for the mapped objects. + */ + public IntIterator iterator() { + return new ValueIterator(); + } + + public Iterator keyIterator() { + return new KeyIterator(); + } + + /** + * Prints the baseHash array, used for debug purposes. + */ + @SuppressWarnings("unused") + private void printBaseHash() { + for (int i = 0; i < this.baseHash.length; i++) { + System.out.println(i + ".\t" + baseHash[i]); + } + } + + /** + * Inserts the <key,value> pair into the map. If the key already exists, + * this method updates the mapped value to the given one, returning the old + * mapped value. + * + * @return the old mapped value, or 0 if the key didn't exist. + */ + public int put(K key, int e) { + // Does key exists? + int index = find(key); + + // Yes! + if (index != 0) { + // Set new data and exit. + int old = values[index]; + values[index] = e; + return old; + } + + // Is there enough room for a new pair? + if (size == capacity) { + // No? Than grow up! + grow(); + } + + // Now that everything is set, the pair can be just put inside with no + // worries. + prvt_put(key, e); + + return 0; + } + + /** + * Removes a <key,value> pair from the map and returns the mapped value, + * or 0 if the none existed. + * + * @param key used to find the value to remove + * @return the removed value or 0 if none existed. 
+ */ + public int remove(K key) { + int baseHashIndex = calcBaseHashIndex(key); + int index = findForRemove(key, baseHashIndex); + if (index != 0) { + // If it is the first in the collision list, we should promote its + // next colliding element. + if (prev == 0) { + baseHash[baseHashIndex] = next[index]; + } + + next[prev] = next[index]; + next[index] = firstEmpty; + firstEmpty = index; + --size; + return values[index]; + } + + return 0; + } + + /** + * @return number of pairs currently in the map + */ + public int size() { + return this.size; + } + + /** + * Translates the mapped pairs' values into an array of Objects + * + * @return an object array of all the values currently in the map. + */ + public int[] toArray() { + int j = -1; + int[] array = new int[size]; + + // Iterates over the values, adding them to the array. + for (IntIterator iterator = iterator(); iterator.hasNext();) { + array[++j] = iterator.next(); + } + return array; + } + + /** + * Translates the mapped pairs' values into an array of T + * + * @param a + * the array into which the elements of the list are to be + * stored, if it is big enough; otherwise, use as much space as it can. + * + * @return an array containing the elements of the list + * + */ + public int[] toArray(int[] a) { + int j = 0; + // Iterates over the values, adding them to the array. + for (IntIterator iterator = iterator(); j < a.length + && iterator.hasNext(); ++j) { + a[j] = iterator.next(); + } + if (j < a.length) { + a[j] = Integer.MAX_VALUE; + } + + return a; + } + + @Override + public String toString() { + StringBuffer sb = new StringBuffer(); + sb.append('{'); + Iterator keyIterator = keyIterator(); + while (keyIterator.hasNext()) { + K key = keyIterator.next(); + sb.append(key); + sb.append('='); + sb.append(get(key)); + if (keyIterator.hasNext()) { + sb.append(','); + sb.append(' '); + } + } + sb.append('}'); + return sb.toString(); + } + + @Override + public int hashCode() { + return getClass().hashCode() ^ size(); + } + + @SuppressWarnings("unchecked") + @Override + public boolean equals(Object o) { + ObjectToIntMap that = (ObjectToIntMap)o; + if (that.size() != this.size()) { + return false; + } + + Iterator it = keyIterator(); + while (it.hasNext()) { + K key = it.next(); + int v1 = this.get(key); + int v2 = that.get(key); + if (Float.compare(v1, v2) != 0) { + return false; + } + } + return true; + } +} \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/util/collections/package.html b/modules/facet/src/java/org/apache/lucene/util/collections/package.html new file mode 100644 index 00000000000..6a45426bd81 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/collections/package.html @@ -0,0 +1,8 @@ + + +Collections + + + Various optimized Collections implementations. + + \ No newline at end of file diff --git a/modules/facet/src/java/org/apache/lucene/util/encoding/ChunksIntEncoder.java b/modules/facet/src/java/org/apache/lucene/util/encoding/ChunksIntEncoder.java new file mode 100644 index 00000000000..8e2550054dd --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/encoding/ChunksIntEncoder.java @@ -0,0 +1,105 @@ +package org.apache.lucene.util.encoding; + +import java.io.IOException; +import java.io.OutputStream; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An {@link IntEncoder} which encodes values in chunks. Implementations of this + * class assume the data which needs encoding consists of small, consecutive + * values, and therefore the encoder is able to compress them better. You can + * read more on the two implementations {@link FourFlagsIntEncoder} and + * {@link EightFlagsIntEncoder}. + *

+ * Extensions of this class need to implement {@link #encode(int)} in order to + * build the proper indicator (flags). When enough values have been accumulated + * (typically the chunk size), extensions can call {@link #encodeChunk()} to + * flush the indicator and the rest of the values. + *

+ * NOTE: flags encoders do not accept values ≤ 0 (zero) in their + * {@link #encode(int)}. For performance reasons they do not check that + * condition; however, if such a value is passed, the resulting stream may be + * corrupted or an exception may be thrown. Also, these encoders perform best when + * there are many consecutive small values (depending on the encoder + * implementation). If that is not the case, the encoder will occupy 1 more byte + * for every chunk of integers, over whatever + * {@link VInt8IntEncoder} would have occupied. Therefore make sure to check + * whether your data fits the conditions of the specific encoder. + *

+ * For the reasons mentioned above, these encoders are usually chained with + * {@link UniqueValuesIntEncoder} and {@link DGapIntEncoder} in the following + * manner:

+ * IntEncoder fourFlags = 
+ *         new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new FourFlagsIntEncoder())));
+ * 
+ * + * @lucene.experimental + */ +public abstract class ChunksIntEncoder extends IntEncoder { + + /** Holds the values which must be encoded, outside the indicator. */ + protected final int[] encodeQueue; + protected int encodeQueueSize = 0; + + /** Encoder used to encode values outside the indicator. */ + protected final IntEncoder encoder = new VInt8IntEncoder(); + + /** Represents bits flag byte. */ + protected int indicator = 0; + + /** Counts the current ordinal of the encoded value. */ + protected byte ordinal = 0; + + protected ChunksIntEncoder(int chunkSize) { + encodeQueue = new int[chunkSize]; + } + + /** + * Encodes the values of the current chunk. First it writes the indicator, and + * then it encodes the values outside the indicator. + */ + protected void encodeChunk() throws IOException { + out.write(indicator); + for (int i = 0; i < encodeQueueSize; ++i) { + encoder.encode(encodeQueue[i]); + } + encodeQueueSize = 0; + ordinal = 0; + indicator = 0; + } + + @Override + public void close() throws IOException { + if (ordinal != 0) { + encodeChunk(); + } + encoder.close(); + super.close(); + } + + @Override + public void reInit(OutputStream out) { + encoder.reInit(out); + super.reInit(out); + ordinal = 0; + indicator = 0; + encodeQueueSize = 0; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/encoding/DGapIntDecoder.java b/modules/facet/src/java/org/apache/lucene/util/encoding/DGapIntDecoder.java new file mode 100644 index 00000000000..75c6f752b71 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/encoding/DGapIntDecoder.java @@ -0,0 +1,62 @@ +package org.apache.lucene.util.encoding; + +import java.io.IOException; +import java.io.InputStream; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An {@link IntDecoder} which wraps another {@link IntDecoder} and reverts the + * d-gap that was encoded by {@link DGapIntEncoder}. The wrapped decoder + * performs the actual decoding, while this class simply adds the decoded value + * to the previous value. 
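+ *
+ * A minimal decoding sketch (illustrative only; 'in' is assumed to be an
+ * InputStream positioned over data written by the matching encoder, and EOS is
+ * the end-of-stream marker returned by {@link #decode()}):
+ *
+ *   IntDecoder decoder = new DGapIntDecoder(new VInt8IntDecoder());
+ *   decoder.reInit(in);               // stream holds the encoded gaps 5, 9, 3, 5
+ *   long value;
+ *   while ((value = decoder.decode()) != EOS) {
+ *     // yields the original values 5, 14, 17, 22
+ *   }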
+ * + * @lucene.experimental + */ +public class DGapIntDecoder extends IntDecoder { + + private final IntDecoder decoder; + + private int prev = 0; + + public DGapIntDecoder(IntDecoder decoder) { + this.decoder = decoder; + } + + @Override + public long decode() throws IOException { + long decode = decoder.decode(); + if (decode == EOS) { + return EOS; + } + + return prev += decode; + } + + @Override + public void reInit(InputStream in) { + decoder.reInit(in); + prev = 0; + } + + @Override + public String toString() { + return "DGap (" + decoder.toString() + ")"; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/encoding/DGapIntEncoder.java b/modules/facet/src/java/org/apache/lucene/util/encoding/DGapIntEncoder.java new file mode 100644 index 00000000000..43bcb4e998c --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/encoding/DGapIntEncoder.java @@ -0,0 +1,69 @@ +package org.apache.lucene.util.encoding; + +import java.io.IOException; +import java.io.OutputStream; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An {@link IntEncoderFilter} which encodes the gap between the given values, + * rather than the values themselves. This encoder usually yields better + * encoding performance space-wise (i.e., the final encoded values consume less + * space) if the values are 'close' to each other. + *
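+ * A minimal encoding sketch (illustrative only; it simply chains the constructor
+ * with the reInit/encode/close calls shown by the encoders in this package):
+ *
+ *   OutputStream out = new ByteArrayOutputStream();
+ *   IntEncoder encoder = new DGapIntEncoder(new VInt8IntEncoder());
+ *   encoder.reInit(out);
+ *   encoder.encode(5);    // writes 5
+ *   encoder.encode(14);   // writes 9  (14 - 5)
+ *   encoder.encode(17);   // writes 3  (17 - 14)
+ *   encoder.encode(22);   // writes 5  (22 - 17)
+ *   encoder.close();
+ *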

+ * NOTE: this encoder assumes the values are given to + * {@link #encode(int)} in an ascending sorted manner, which ensures only + * positive values are encoded and thus yields better performance. If you are + * not sure whether the values are sorted or not, it is possible to chain this + * encoder with {@link SortingIntEncoder} to ensure the values will be + * sorted before encoding. + * + * @lucene.experimental + */ +public class DGapIntEncoder extends IntEncoderFilter { + + private int prev = 0; + + /** Initializes with the given encoder. */ + public DGapIntEncoder(IntEncoder encoder) { + super(encoder); + } + + @Override + public void encode(int value) throws IOException { + encoder.encode(value - prev); + prev = value; + } + + @Override + public IntDecoder createMatchingDecoder() { + return new DGapIntDecoder(encoder.createMatchingDecoder()); + } + + @Override + public void reInit(OutputStream out) { + super.reInit(out); + prev = 0; + } + + @Override + public String toString() { + return "DGap (" + encoder.toString() + ")"; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntDecoder.java b/modules/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntDecoder.java new file mode 100644 index 00000000000..5f8e22b63e9 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntDecoder.java @@ -0,0 +1,91 @@ +package org.apache.lucene.util.encoding; + +import java.io.IOException; +import java.io.InputStream; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Decodes data which was encoded by {@link EightFlagsIntEncoder}. Scans + * the indicator, one flag (1-bits) at a time, and decodes extra + * data using {@link VInt8IntDecoder}. + * + * @see EightFlagsIntEncoder + * @lucene.experimental + */ +public class EightFlagsIntDecoder extends IntDecoder { + + /** + * Holds all combinations of indicator for fast decoding (saves time + * on real-time bit manipulation) + */ + private static final byte[][] decodeTable = new byte[256][8]; + + /** Generating all combinations of indicator into separate flags. */ + static { + for (int i = 256; i != 0;) { + --i; + for (int j = 8; j != 0;) { + --j; + decodeTable[i][j] = (byte) ((i >>> j) & 0x1); + } + } + } + + private final IntDecoder decoder = new VInt8IntDecoder(); + + /** The indicator for decoding a chunk of 8 integers. */ + private int indicator; + + /** Used as an ordinal of 0 - 7, as the decoder decodes chunks of 8 integers. */ + private int ordinal = 0; + + @Override + public long decode() throws IOException { + // If we've decoded 8 integers, read the next indicator. 
+ if ((ordinal & 0x7) == 0) { + indicator = in.read(); + if (indicator < 0) { + return EOS; + } + ordinal = 0; + } + + if (decodeTable[indicator][ordinal++] == 0) { + // decode the value from the stream. + long decode = decoder.decode(); + return decode == EOS ? EOS : decode + 2; + } + + return 1; + } + + @Override + public void reInit(InputStream in) { + super.reInit(in); + decoder.reInit(in); + ordinal = 0; + indicator = 0; + } + + @Override + public String toString() { + return "EightFlags (VInt8)"; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntEncoder.java b/modules/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntEncoder.java new file mode 100644 index 00000000000..e3e7d755891 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntEncoder.java @@ -0,0 +1,84 @@ +package org.apache.lucene.util.encoding; + +import java.io.IOException; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A {@link ChunksIntEncoder} which encodes data in chunks of 8. Every group starts with a single + * byte (called indicator) which represents 8 - 1 bit flags, where the value: + *

    + *
  • 1 means the encoded value is '1' + *
  • 0 means the value is encoded using {@link VInt8IntEncoder}, and the + * encoded bytes follow the indicator.
    + * Since value 0 is illegal, and 1 is encoded in the indicator, the actual + * value that is encoded is value-2, which saves some more bits. + *
+ * Encoding example: + *
    + *
  • Original values: 6, 16, 5, 9, 7, 1 + *
  • After sorting: 1, 5, 6, 7, 9, 16 + *
  • D-Gap computing: 1, 4, 1, 1, 2, 7 (so far - done by + * {@link DGapIntEncoder}) + *
  • Encoding: 1,0,1,1,0,0,0,0 as the indicator, followed by 2 (4-2), 0 (2-2), 5 (7-2). + *
  • Binary encode: 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 00000010 00000000 + * 00000101 (indicator is underlined).
    + * NOTE: the order of the values in the indicator is lsb ⇒ msb, + * which allows for more efficient decoding. + *
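+ *
+ * For illustration, a minimal usage sketch that reproduces the pipeline above (a sketch only,
+ * assuming an already-opened OutputStream named out):
+ *
+ *   IntEncoder encoder = new SortingIntEncoder(new DGapIntEncoder(new EightFlagsIntEncoder()));
+ *   encoder.reInit(out);
+ *   for (int value : new int[] { 6, 16, 5, 9, 7, 1 }) {
+ *     encoder.encode(value);
+ *   }
+ *   encoder.close(); // sorts the values, computes the d-gaps and flushes the final (partial) chunk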
+ * + * @lucene.experimental + */ +public class EightFlagsIntEncoder extends ChunksIntEncoder { + + /** + * Holds all combinations of indicator flags for fast encoding (saves + * time on bit manipulation at encode time) + */ + private static byte[] encodeTable = new byte[] { 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, (byte) 0x80 }; + + public EightFlagsIntEncoder() { + super(8); + } + + @Override + public void encode(int data) throws IOException { + if (data == 1) { + indicator |= encodeTable[ordinal]; + } else { + encodeQueue[encodeQueueSize++] = data - 2; + } + ++ordinal; + + // If 8 values were encoded thus far, 'flush' them including the indicator. + if ((ordinal & 0x7) == 0) { + encodeChunk(); + } + } + + @Override + public IntDecoder createMatchingDecoder() { + return new EightFlagsIntDecoder(); + } + + @Override + public String toString() { + return "EightFlags (" + encoder.toString() + ")"; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntDecoder.java b/modules/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntDecoder.java new file mode 100644 index 00000000000..efafaca7818 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntDecoder.java @@ -0,0 +1,92 @@ +package org.apache.lucene.util.encoding; + +import java.io.IOException; +import java.io.InputStream; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Decodes data which was encoded by {@link FourFlagsIntEncoder}. Scans + * the indicator, one flag (1-bits) at a time, and decodes extra + * data using {@link VInt8IntDecoder}. + * + * @see FourFlagsIntEncoder + * @lucene.experimental + */ +public class FourFlagsIntDecoder extends IntDecoder { + + /** + * Holds all combinations of indicator for fast decoding (saves time + * on real-time bit manipulation) + */ + private final static byte[][] decodeTable = new byte[256][4]; + + /** Generating all combinations of indicator into separate flags. */ + static { + for (int i = 256; i != 0;) { + --i; + for (int j = 4; j != 0;) { + --j; + decodeTable[i][j] = (byte) ((i >>> (j << 1)) & 0x3); + } + } + } + + private final IntDecoder decoder = new VInt8IntDecoder(); + + /** The indicator for decoding a chunk of 4 integers. */ + private int indicator; + + /** Used as an ordinal of 0 - 3, as the decoder decodes chunks of 4 integers. */ + private int ordinal = 0; + + @Override + public long decode() throws IOException { + // If we've decoded 8 integers, read the next indicator. + if ((ordinal & 0x3) == 0) { + indicator = in.read(); + if (indicator < 0) { + return EOS; + } + ordinal = 0; + } + + byte decodeVal = decodeTable[indicator][ordinal++]; + if (decodeVal == 0) { + // decode the value from the stream. 
+ long decode = decoder.decode(); + return decode == EOS ? EOS : decode + 4; + } + + return decodeVal; + } + + @Override + public void reInit(InputStream in) { + super.reInit(in); + decoder.reInit(in); + ordinal = 0; + indicator = 0; + } + + @Override + public String toString() { + return "FourFlags (VInt8)"; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntEncoder.java b/modules/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntEncoder.java new file mode 100644 index 00000000000..b2945ba1ee5 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntEncoder.java @@ -0,0 +1,99 @@ +package org.apache.lucene.util.encoding; + +import java.io.IOException; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A {@link ChunksIntEncoder} which encodes values in chunks of 4. Every group + * starts with a single byte (called indicator) which represents 4 - 2 bit + * flags, where the values: + *
    + *
  • 1, 2 or 3 mean the encoded value is '1', '2' or '3' respectively. + *
  • 0 means the value is encoded using {@link VInt8IntEncoder}, and the + * encoded bytes follow the indicator.
    + * Since value 0 is illegal, and 1-3 are encoded in the indicator, the actual + * value that is encoded is value-4, which saves some more bits. + *
+ * Encoding example: + *
    + *
  • Original values: 6, 16, 5, 9, 7, 1, 11 + *
  • After sorting: 1, 5, 6, 7, 9, 11, 16 + *
  • D-Gap computing: 1, 4, 1, 1, 2, 2, 5 (so far - done by + * {@link DGapIntEncoder}) + *
  • Encoding: 1,0,1,1 as the first indicator, followed by 0 (4-4), then + * 2,2,0,0 as the second indicator, followed by 1 (5-4) encoded with {@link VInt8IntEncoder}. + *
  • Binary encode: 01 | 01 | 00 | 01 00000000 00 | 00 | 10 | 10 + * 00000001 (indicators are underlined).
    + * NOTE: the order of the values in the indicator is lsb ⇒ msb, + * which allows for more efficient decoding. + *
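+ *
+ * A decode-side sketch, for illustration only (assuming the encoded bytes are available through an
+ * InputStream named in):
+ *
+ *   IntDecoder decoder = new FourFlagsIntEncoder().createMatchingDecoder();
+ *   decoder.reInit(in);
+ *   long value;
+ *   while ((value = decoder.decode()) != IntDecoder.EOS) {
+ *     System.out.println(value); // prints the d-gap values from the example above
+ *   }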
+ * + * @lucene.experimental + */ +public class FourFlagsIntEncoder extends ChunksIntEncoder { + + /** + * Holds all combinations of indicator flags for fast encoding (saves + * time on bit manipulation at encode time) + */ + private static byte[][] encodeTable = new byte[][] { + new byte[] { 0x00, 0x00, 0x00, 0x00 }, + new byte[] { 0x01, 0x04, 0x10, 0x40 }, + new byte[] { 0x02, 0x08, 0x20, (byte) 0x80 }, + new byte[] { 0x03, 0x0C, 0x30, (byte) 0xC0 }, + }; + + public FourFlagsIntEncoder() { + super(4); + } + + /** + * Small values (<=3) are stored in the indicator while larger + * values are saved for later encoding in the {@link #encodeQueue}. Since + * Vint8 will only encode values greater than or equal to 4, the values saved for + * encoding are transformed to (value - 4).
+ * When a chunk is ready (got 4 values), the {@link #encodeChunk()} + * takes control. + */ + @Override + public void encode(int data) throws IOException { + if (data <= 3) { + indicator |= encodeTable[data][ordinal]; + } else { + encodeQueue[encodeQueueSize++] = data - 4; + } + ++ordinal; + + // If 4 values were encoded thus far, 'flush' them including the indicator. + if ((ordinal & 0x3) == 0) { + encodeChunk(); + } + } + + @Override + public IntDecoder createMatchingDecoder() { + return new FourFlagsIntDecoder(); + } + + @Override + public String toString() { + return "FourFlags (" + encoder.toString() + ")"; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/encoding/IntDecoder.java b/modules/facet/src/java/org/apache/lucene/util/encoding/IntDecoder.java new file mode 100644 index 00000000000..ebc516ff3d0 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/encoding/IntDecoder.java @@ -0,0 +1,53 @@ +package org.apache.lucene.util.encoding; + +import java.io.IOException; +import java.io.InputStream; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Decodes integers from a set {@link InputStream}. For re-usability, the + * decoder's input stream can be set by ({@link #reInit(InputStream)}). + * By design, Decoders are NOT thread-safe. + * + * @lucene.experimental + */ +public abstract class IntDecoder { + + /** A special long value which is used to indicate end-of-stream has reached. */ + public static final long EOS = 0x100000000L; + + /** Input stream from which the encoded bytes are read */ + protected InputStream in; + + /** Sets the input stream from which the encoded data is read. */ + public void reInit(InputStream in) { + this.in = in; + } + + /** + * Decodes data received from the input stream, and returns one decoded + * integer. If end of stream is reached, {@link #EOS} is returned. + * + * @return one decoded integer as long or {@link #EOS} if end-of-stream + * reached. + * @throws IOException if an I/O error occurs + */ + public abstract long decode() throws IOException; + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/encoding/IntEncoder.java b/modules/facet/src/java/org/apache/lucene/util/encoding/IntEncoder.java new file mode 100644 index 00000000000..bef6a342f36 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/encoding/IntEncoder.java @@ -0,0 +1,113 @@ +package org.apache.lucene.util.encoding; + +import java.io.IOException; +import java.io.OutputStream; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Encodes integers to a set {@link OutputStream}. Extending classes need to + * override {@link #encode(int)} to encode the value using their encoding + * algorithm. The default implementation of {@link #close()} closes the set + * {@link OutputStream}. + *

+ * The default {@link #IntEncoder() constructor} is provided for convenience + * only. One must call {@link #reInit(OutputStream)} before calling + * {@link #encode(int)} or {@link #close()}. + *

+ * For convenience, each encoder implements {@link #createMatchingDecoder()} for + * easy access to the matching decoder. + *

+ * NOTE: some implementations may buffer the encoded values in memory + * (such as {@link IntEncoderFilter} implementations) and encoding will happen + * only upon calling {@link #close()}. Therefore it is important to always call + * {@link #close()} on the encoder at hand. + *
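+ *
+ * For example, a minimal sketch of a safe call pattern (assuming an encoder, an int[] named
+ * values and an already-opened OutputStream named out):
+ *
+ *   encoder.reInit(out);
+ *   try {
+ *     for (int value : values) {
+ *       encoder.encode(value);
+ *     }
+ *   } finally {
+ *     encoder.close(); // buffering implementations only write their data here
+ *   }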

+ * NOTE: encoders are usually not thread safe, unless specifically + * documented otherwise by an implementation. + * + * @lucene.experimental + */ +public abstract class IntEncoder { + + protected OutputStream out = null; + + /** + * Default constructor, provided here for robustness: if in the future a + * constructor with parameters will be added, this might break custom + * implementations of this class which call this implicit constructor. So we + * make it explicit to avoid any such issue in the future. + */ + public IntEncoder() { + } + + /** + * Instructs the encoder to finish the encoding process. This method closes + * the output stream which was specified by {@link #reInit(OutputStream) + * reInit}. An implementation may do here additional cleanup required to + * complete the encoding, such as flushing internal buffers, etc.
+ * Once this method has been called, no further calls to {@link #encode(int) + * encode} should be made before first calling {@link #reInit(OutputStream) + * reInit}. + *

+ * NOTE: overriding classes should make sure they either call + * super.close() or close the output stream themselves. + */ + public void close() throws IOException { + if (out != null) { + out.close(); + } + } + + /** + * Encodes an integer to the output stream given in + * {@link #reInit(OutputStream) reInit} + */ + public abstract void encode(int value) throws IOException; + + /** + * Returns an {@link IntDecoder} which matches this encoder. Every encoder + * must return an {@link IntDecoder} and null is not a valid + * value. If an encoder is just a filter, it should at least return its + * wrapped encoder's matching decoder. + *

+ * NOTE: this method should create a new instance of the matching + * decoder and leave the instance sharing to the caller. Returning the same + * instance over and over is risky because encoders and decoders are not + * thread safe. + */ + public abstract IntDecoder createMatchingDecoder(); + + /** + * Reinitializes the encoder with the given {@link OutputStream}. For + * re-usability it can be changed without the need to reconstruct a new + * object. + *

+ * NOTE: after calling {@link #close()}, one must call + * this method even if the output stream itself hasn't changed. An example + * case is that the output stream wraps a byte[], and the output stream itself + * is reset, but its instance hasn't changed. Some implementations of + * {@link IntEncoder} may write some metadata about themselves to the output + * stream, and therefore it is imperative that one calls this method before + * encoding any data. + */ + public void reInit(OutputStream out) { + this.out = out; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/encoding/IntEncoderFilter.java b/modules/facet/src/java/org/apache/lucene/util/encoding/IntEncoderFilter.java new file mode 100644 index 00000000000..ac49bbd052c --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/encoding/IntEncoderFilter.java @@ -0,0 +1,58 @@ +package org.apache.lucene.util.encoding; + +import java.io.IOException; +import java.io.OutputStream; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An abstract implementation of {@link IntEncoder} which is served as a filter + * on the values to encode. An encoder filter wraps another {@link IntEncoder} + * which does the actual encoding. This allows for chaining filters and + * encoders, such as:

+ * new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder()));
+ * i.e. {@link UniqueValuesIntEncoder} followed by {@link DGapIntEncoder},
+ * with {@link VInt8IntEncoder} doing the actual encoding
+  
+ *

+ * The default implementation implements {@link #close()} by closing the wrapped + * encoder and {@link #reInit(OutputStream)} by re-initializing the wrapped + * encoder. + * + * @lucene.experimental + */ +public abstract class IntEncoderFilter extends IntEncoder { + + protected final IntEncoder encoder; + + protected IntEncoderFilter(IntEncoder encoder) { + this.encoder = encoder; + } + + @Override + public void close() throws IOException { + // There is no need to call super.close(), since we don't pass the output + // stream to super. + encoder.close(); + } + + @Override + public void reInit(OutputStream out) { + encoder.reInit(out); + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/encoding/NOnesIntDecoder.java b/modules/facet/src/java/org/apache/lucene/util/encoding/NOnesIntDecoder.java new file mode 100644 index 00000000000..2b93d0eb607 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/encoding/NOnesIntDecoder.java @@ -0,0 +1,79 @@ +package org.apache.lucene.util.encoding; + +import java.io.IOException; +import java.io.InputStream; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Decodes data which was encoded by {@link NOnesIntEncoder}. Uses a + * {@link FourFlagsIntDecoder} to perform the actual encoding and translates the + * values back as described in {@link NOnesIntEncoder}. + * + * @see NOnesIntEncoder + * @lucene.experimental + */ +public class NOnesIntDecoder extends FourFlagsIntDecoder { + + /** Number of consecutive '1's to generate upon decoding a '2'. */ + private int n; + + private int onesCounter; + + /** + * Constructs a decoder with a given N (Number of consecutive '1's which are + * translated into a single target value '2'. + */ + public NOnesIntDecoder(int n) { + this.n = n; + } + + @Override + public long decode() throws IOException { + // If we read '2', we should return n '1's. + if (onesCounter > 0) { + --onesCounter; + return 1; + } + + long decode = super.decode(); + if (decode == 1) { + return 1; + } + if (decode == 2) { + onesCounter = n - 1; + return 1; + } + if (decode == 3) { + return 2; + } + return decode == EOS ? 
EOS : decode - 1; + } + + @Override + public void reInit(InputStream in) { + super.reInit(in); + onesCounter = 0; + } + + @Override + public String toString() { + return "NOnes (" + n + ") (" + super.toString() + ")"; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/encoding/NOnesIntEncoder.java b/modules/facet/src/java/org/apache/lucene/util/encoding/NOnesIntEncoder.java new file mode 100644 index 00000000000..f8e50153c55 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/encoding/NOnesIntEncoder.java @@ -0,0 +1,115 @@ +package org.apache.lucene.util.encoding; + +import java.io.IOException; +import java.io.OutputStream; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A variation of {@link FourFlagsIntEncoder} which translates the data as + * follows: + *

    + *
  • Values ≥ 2 are translated to value+1 (2 ⇒ 3, 3 + * ⇒ 4 and so forth).
  • Any N occurrences of 1 are encoded as a single 2. + *
  • Otherwise, each 1 is encoded as 1. + *
+ *

+ * Encoding examples: + *

    + *
  • N = 4: the data 1,1,1,1,1 is translated to: 2, 1 + *
  • N = 3: the data 1,2,3,4,1,1,1,1,5 is translated to 1,3,4,5,2,1,6 + *
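+ *
+ * For illustration, a minimal sketch that produces the N = 3 example above (assuming an
+ * already-opened OutputStream named out):
+ *
+ *   IntEncoder encoder = new NOnesIntEncoder(3);
+ *   encoder.reInit(out);
+ *   for (int value : new int[] { 1, 2, 3, 4, 1, 1, 1, 1, 5 }) {
+ *     encoder.encode(value);
+ *   }
+ *   encoder.close(); // the underlying FourFlagsIntEncoder receives 1,3,4,5,2,1,6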
+ * NOTE: this encoder does not support values ≤ 0 or + * {@link Integer#MAX_VALUE}. 0 is not supported because it is not supported by + * {@link FourFlagsIntEncoder}, and {@link Integer#MAX_VALUE} is not supported because this + * encoder translates N to N+1, which would overflow so that + * {@link Integer#MAX_VALUE} becomes a negative number, which is not + * supported either.
+ * This does not mean you cannot encode {@link Integer#MAX_VALUE}. If it is not + * the first value to encode, and you wrap this encoder with + * {@link DGapIntEncoder}, then the value that will be sent to this encoder will + * be MAX_VAL - prev. + * + * @lucene.experimental + */ +public class NOnesIntEncoder extends FourFlagsIntEncoder { + + /** Number of consecutive '1's to be translated into single target value '2'. */ + private int n; + + /** Counts the number of consecutive ones seen. */ + private int onesCounter = 0; + + /** + * Constructs an encoder with a given value of N (N: Number of consecutive + * '1's to be translated into single target value '2'). + */ + public NOnesIntEncoder(int n) { + this.n = n; + } + + @Override + public void close() throws IOException { + // We might have ones in our buffer, encode them as neccesary. + while (onesCounter-- > 0) { + super.encode(1); + } + + super.close(); + } + + @Override + public void encode(int value) throws IOException { + if (value == 1) { + // Increment the number of consecutive ones seen so far + if (++onesCounter == n) { + super.encode(2); + onesCounter = 0; + } + return; + } + + // If it's not one - there might have been ones we had to encode prior to + // this value + while (onesCounter > 0) { + --onesCounter; + super.encode(1); + } + + // encode value + 1 --> the translation. + super.encode(value + 1); + } + + @Override + public IntDecoder createMatchingDecoder() { + return new NOnesIntDecoder(n); + } + + @Override + public void reInit(OutputStream out) { + super.reInit(out); + onesCounter = 0; + } + + @Override + public String toString() { + return "NOnes (" + n + ") (" + super.toString() + ")"; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/encoding/SimpleIntDecoder.java b/modules/facet/src/java/org/apache/lucene/util/encoding/SimpleIntDecoder.java new file mode 100644 index 00000000000..fb7ea44d967 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/encoding/SimpleIntDecoder.java @@ -0,0 +1,66 @@ +package org.apache.lucene.util.encoding; + +import java.io.IOException; +import java.io.StreamCorruptedException; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A simple stream decoder which can decode values encoded with + * {@link SimpleIntEncoder}. 
+ * + * @lucene.experimental + */ +public class SimpleIntDecoder extends IntDecoder { + + /** + * reusable buffer - allocated only once as this is not a thread-safe object + */ + private byte[] buffer = new byte[4]; + + @Override + public long decode() throws IOException { + + // we need exactly 4 bytes to decode an int in this decoder impl, otherwise, throw an exception + int offset = 0; + while (offset < 4) { + int nRead = in.read(buffer, offset, 4 - offset); + if (nRead == -1) { + if (offset > 0) { + throw new StreamCorruptedException( + "Need 4 bytes for decoding an int, got only " + offset); + } + return EOS; + } + offset += nRead; + } + + int v = buffer[3] & 0xff; + v |= (buffer[2] << 8) & 0xff00; + v |= (buffer[1] << 16) & 0xff0000; + v |= (buffer[0] << 24) & 0xff000000; + + return v; + } + + @Override + public String toString() { + return "Simple"; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/encoding/SimpleIntEncoder.java b/modules/facet/src/java/org/apache/lucene/util/encoding/SimpleIntEncoder.java new file mode 100644 index 00000000000..1124bd7e791 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/encoding/SimpleIntEncoder.java @@ -0,0 +1,57 @@ +package org.apache.lucene.util.encoding; + +import java.io.IOException; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A simple {@link IntEncoder}, writing an integer as 4 raw bytes. * + * + * @lucene.experimental + */ +public class SimpleIntEncoder extends IntEncoder { + + /** + * This method makes sure the value wasn't previously encoded by checking + * against the Set. If the value wasn't encoded, it's added to the Set, and + * encoded with {#link Vint8#encode} + * + * @param value + * an integer to be encoded + * @throws IOException + * possibly thrown by the OutputStream + */ + @Override + public void encode(int value) throws IOException { + out.write(value >>> 24); + out.write((value >> 16) & 0xFF); + out.write((value >> 8) & 0xFF); + out.write(value & 0xFF); + } + + @Override + public IntDecoder createMatchingDecoder() { + return new SimpleIntDecoder(); + } + + @Override + public String toString() { + return "Simple"; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/encoding/SortingIntEncoder.java b/modules/facet/src/java/org/apache/lucene/util/encoding/SortingIntEncoder.java new file mode 100644 index 00000000000..107d67ceffa --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/encoding/SortingIntEncoder.java @@ -0,0 +1,85 @@ +package org.apache.lucene.util.encoding; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.Arrays; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An {@link IntEncoderFilter} which sorts the values to encode in ascending + * order before encoding them. Encoding therefore happens upon calling + * {@link #close()}. Since this encoder is usually chained with another encoder + * that relies on sorted values, it does not offer a default constructor. + * + * @lucene.experimental + */ +public class SortingIntEncoder extends IntEncoderFilter { + + private float grow = 2.0f; + private int index = 0; + private int[] set = new int[1024]; + + /** Initializes with the given encoder. */ + public SortingIntEncoder(IntEncoder encoder) { + super(encoder); + } + + @Override + public void close() throws IOException { + if (index == 0) { + return; + } + + Arrays.sort(set, 0, index); + for (int i = 0; i < index; i++) { + encoder.encode(set[i]); + } + encoder.close(); + index = 0; + + super.close(); + } + + @Override + public void encode(int value) throws IOException { + if (index == set.length) { + int[] newSet = new int[(int) (set.length * grow)]; + System.arraycopy(set, 0, newSet, 0, set.length); + set = newSet; + } + set[index++] = value; + } + + @Override + public IntDecoder createMatchingDecoder() { + return encoder.createMatchingDecoder(); + } + + @Override + public void reInit(OutputStream out) { + super.reInit(out); + index = 0; + } + + @Override + public String toString() { + return "Sorting (" + encoder.toString() + ")"; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/encoding/UniqueValuesIntEncoder.java b/modules/facet/src/java/org/apache/lucene/util/encoding/UniqueValuesIntEncoder.java new file mode 100644 index 00000000000..197962dcb32 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/encoding/UniqueValuesIntEncoder.java @@ -0,0 +1,71 @@ +package org.apache.lucene.util.encoding; + +import java.io.IOException; +import java.io.OutputStream; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An {@link IntEncoderFilter} which ensures only unique values are encoded. The + * implementation assumes the values given to {@link #encode(int)} are sorted. 
+ * If this is not the case, you can chain this encoder with + * {@link SortingIntEncoder}. + * + * @lucene.experimental + */ +public final class UniqueValuesIntEncoder extends IntEncoderFilter { + + /** + * Denotes an illegal value which we can use to init 'prev' to. Since all + * encoded values are integers, this value is init to MAX_INT+1 and is of type + * long. Therefore we are guaranteed not to get this value in encode. + */ + private static final long ILLEGAL_VALUE = Integer.MAX_VALUE + 1; + + private long prev = ILLEGAL_VALUE; + + /** Constructs a new instance with the given encoder. */ + public UniqueValuesIntEncoder(IntEncoder encoder) { + super(encoder); + } + + @Override + public void encode(int value) throws IOException { + if (prev != value) { + encoder.encode(value); + prev = value; + } + } + + @Override + public IntDecoder createMatchingDecoder() { + return encoder.createMatchingDecoder(); + } + + @Override + public void reInit(OutputStream out) { + super.reInit(out); + prev = ILLEGAL_VALUE; + } + + @Override + public String toString() { + return "Unique (" + encoder.toString() + ")"; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/encoding/VInt8IntDecoder.java b/modules/facet/src/java/org/apache/lucene/util/encoding/VInt8IntDecoder.java new file mode 100644 index 00000000000..04cda3770cb --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/encoding/VInt8IntDecoder.java @@ -0,0 +1,58 @@ +package org.apache.lucene.util.encoding; + +import java.io.IOException; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An {@link IntDecoder} which can decode values encoded by + * {@link VInt8IntEncoder}. + * + * @lucene.experimental + */ +public class VInt8IntDecoder extends IntDecoder { + + private boolean legalEOS = true; + + @Override + public long decode() throws IOException { + int value = 0; + while (true) { + int first = in.read(); + if (first < 0) { + if (!legalEOS) { + throw new IOException("Unexpected End-Of-Stream"); + } + return EOS; + } + value |= first & 0x7F; + if ((first & 0x80) == 0) { + legalEOS = true; + return value; + } + legalEOS = false; + value <<= 7; + } + } + + @Override + public String toString() { + return "VInt8"; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/encoding/VInt8IntEncoder.java b/modules/facet/src/java/org/apache/lucene/util/encoding/VInt8IntEncoder.java new file mode 100644 index 00000000000..228951d6bf3 --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/encoding/VInt8IntEncoder.java @@ -0,0 +1,86 @@ +package org.apache.lucene.util.encoding; + +import java.io.IOException; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An {@link IntEncoder} which implements variable length encoding. A number is + * encoded as follows: + *
    + *
  • If it is at most 127 and non-negative, i.e. it uses only 7 bits, it is + * encoded as a single byte: 0bbbbbbb.
  • If it occupies more than 7 bits, it is represented as a series of bytes, + * each byte carrying 7 bits. All but the last byte have the MSB set, the last + * one has it unset. + *
+ * Example: + *
    + *
  1. n = 117 = 01110101: This has less than 8 significant bits, therefore is + * encoded as 01110101 = 0x75. + *
  2. n = 100000 = (binary) 11000011010100000. This has 17 significant bits, + * thus needs three Vint8 bytes. Pad it to a multiple of 7 bits, then split it + * into chunks of 7 and add an MSB, 0 for the last byte, 1 for the others: + * 1|0000110 1|0001101 0|0100000 = 0x86 0x8D 0x20. + *
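  3. Another worked case, added for illustration: n = 240 = 11110000. This has 8 significant bits, + * thus needs two Vint8 bytes. Padded to 14 bits and split into chunks of 7: + * 1|0000001 0|1110000 = 0x81 0x70.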
+ * NOTE: although this encoder is not limited to values ≥ 0, it is not + * recommended for use with negative values, as their encoding will result in 5 + * bytes written to the output stream, rather than 4. For such values, either + * use {@link SimpleIntEncoder} or write your own version of variable length + * encoding, which can better handle negative values. + * + * @lucene.experimental + */ +public class VInt8IntEncoder extends IntEncoder { + + @Override + public void encode(int value) throws IOException { + if ((value & ~0x7F) == 0) { + out.write(value); + } else if ((value & ~0x3FFF) == 0) { + out.write(0x80 | (value >> 7)); + out.write(0x7F & value); + } else if ((value & ~0x1FFFFF) == 0) { + out.write(0x80 | (value >> 14)); + out.write(0x80 | (value >> 7)); + out.write(0x7F & value); + } else if ((value & ~0xFFFFFFF) == 0) { + out.write(0x80 | (value >> 21)); + out.write(0x80 | (value >> 14)); + out.write(0x80 | (value >> 7)); + out.write(0x7F & value); + } else { + out.write(0x80 | (value >> 28)); + out.write(0x80 | (value >> 21)); + out.write(0x80 | (value >> 14)); + out.write(0x80 | (value >> 7)); + out.write(0x7F & value); + } + } + + @Override + public IntDecoder createMatchingDecoder() { + return new VInt8IntDecoder(); + } + + @Override + public String toString() { + return "VInt8"; + } + +} diff --git a/modules/facet/src/java/org/apache/lucene/util/encoding/package.html b/modules/facet/src/java/org/apache/lucene/util/encoding/package.html new file mode 100644 index 00000000000..941aefa778c --- /dev/null +++ b/modules/facet/src/java/org/apache/lucene/util/encoding/package.html @@ -0,0 +1,150 @@ + + +Encoding + + +Offers various encoders and decoders for integers, as well as the +mechanisms to create new ones. The super class for all encoders is +{@link org.apache.lucene.util.encoding.IntEncoder} and for most of the +encoders there is a matching {@link +org.apache.lucene.util.encoding.IntDecoder} implementation (not all +encoders need a decoder). +

An encoder encodes the integers that are passed to {@link
+org.apache.lucene.util.encoding.IntEncoder#encode(int) encode} into a
+set output stream (see {@link
+org.apache.lucene.util.encoding.IntEncoder#reInit(OutputStream)
+reInit}). One should always call {@link
+org.apache.lucene.util.encoding.IntEncoder#close() close} once all
+integers have been encoded, so the encoder can finish the encoding properly. Some
+encoders buffer values in memory and encode in batches in order to
+optimize the encoding, and not closing them may result in loss of
+information or a corrupt stream.

A proper and typical usage of an encoder looks like this: +


+int[] data = <the values to encode>
+IntEncoder encoder = new VInt8IntEncoder();
+ByteArrayOutputStream out = new ByteArrayOutputStream();
+encoder.reInit(out);
+for (int val : data) {
+  encoder.encode(val);
+}
+encoder.close();
+
+// Print the bytes in binary
+byte[] bytes = out.toByteArray();
+for (byte b : bytes) {
+  System.out.println(Integer.toBinaryString(b));
+}
+
+Each encoder also implements {@link
+org.apache.lucene.util.encoding.IntEncoder#createMatchingDecoder()
+createMatchingDecoder} which returns the matching decoder for this encoder.
+As mentioned above, not every encoder has a dedicated decoder of its own (some
+encoder filters, which are explained next, return the decoder of the encoder
+they wrap), but every encoder must return a valid decoder from that method.
+To complete the example above, one can easily iterate over the decoded values
+like this:

+IntDecoder d = encoder.createMatchingDecoder();
+d.reInit(new ByteArrayInputStream(bytes));
+long val;
+while ((val = d.decode()) != IntDecoder.EOS) {
+  System.out.println(val);
+}
+
+

Some encoders do not perform any encoding of their own. Those are called {@link
+org.apache.lucene.util.encoding.IntEncoderFilter}s. A filter is an
+encoder which delegates the actual encoding to a wrapped encoder, but
+performs additional logic before the values are sent on for encoding. An
+example is {@link org.apache.lucene.util.encoding.DGapIntEncoder}
+which encodes the gaps between values rather than the values themselves.
+Another example is {@link
+org.apache.lucene.util.encoding.SortingIntEncoder} which sorts all the
+values in ascending order before they are sent for encoding. That
+encoder aggregates the values in its {@link
+org.apache.lucene.util.encoding.IntEncoder#encode(int) encode} implementation,
+and the actual encoding only happens upon calling {@link
+org.apache.lucene.util.encoding.IntEncoder#close() close}. A chained
+combination of such filters is shown in the sketch below.
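+For illustration, a sketch of chaining these filters (assuming an OutputStream
+named out, as in the example above; the encoder class names are the ones in
+this package):
+
+IntEncoder encoder =
+    new SortingIntEncoder(
+        new UniqueValuesIntEncoder(
+            new DGapIntEncoder(
+                new EightFlagsIntEncoder())));
+encoder.reInit(out);
+encoder.encode(9);
+encoder.encode(3);
+encoder.encode(3);
+encoder.encode(14);
+// close() sorts to 3,3,9,14, drops the duplicate 3, computes the gaps 3,6,5 and encodes them
+encoder.close();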

Extending IntEncoder

+Extending {@link org.apache.lucene.util.encoding.IntEncoder} is
+straightforward. One only needs to implement {@link
+org.apache.lucene.util.encoding.IntEncoder#encode(int) encode} and
+{@link org.apache.lucene.util.encoding.IntEncoder#createMatchingDecoder()
+createMatchingDecoder}, as the base implementation takes care of
+re-initializing the output stream and closing it. The following example
+illustrates how one can write an encoder (and a matching decoder) which
+'tags' the stream with the type/ID of the encoder. Such tagging is useful
+in scenarios where an application uses different encoders for different
+streams and wants to maintain a mapping from an encoder ID to an
+IntEncoder/Decoder implementation, so that a proper decoder can be
+initialized on the fly:

+public class TaggingIntEncoder extends IntEncoderFilter {
+  
+  public TaggingIntEncoder(IntEncoder encoder) {
+    super(encoder);
+  }
+  
+  @Override
+  public void encode(int value) throws IOException {
+    encoder.encode(value);
+  }
+
+  @Override
+  public IntDecoder createMatchingDecoder() {
+    return new TaggingIntDecoder();
+  }
+	
+  @Override
+  public void reInit(OutputStream out) {
+    super.reInit(out);
+    this.out = out; // keep a reference so this filter can write the tag itself
+    // Assumes the application has a static EncodersMap class which is able to 
+    // return a unique ID for a given encoder.
+    int encoderID = EncodersMap.getID(encoder);
+    try {
+      this.out.write(encoderID);
+    } catch (IOException e) {
+      throw new RuntimeException(e); // error handling kept minimal, as in the rest of the sample
+    }
+  }
+
+  @Override
+  public String toString() {
+    return "Tagging (" + encoder.toString() + ")";
+  }
+
+}
+
+And the matching decoder: +

+public class TaggingIntDecoder extends IntDecoder {
+  
+  // Will be initialized upon calling reInit.
+  private IntDecoder decoder;
+  
+  @Override
+  public void reInit(InputStream in) {
+    super.reInit(in);
+    
+    // Read the ID of the encoder that tagged this stream.
+    int encoderID = in.read();
+    
+    // Assumes EncodersMap can return the proper IntEncoder given the ID.
+    decoder = EncodersMap.getEncoder(encoderID).createMatchingDecoder();
+  }
+	
+  @Override
+  public long decode() throws IOException {
+    return decoder.decode();
+  }
+
+  @Override
+  public String toString() {
+    return "Tagging (" + decoder == null ? "none" : decoder.toString() + ")";
+  }
+
+}
+
+The example implements TaggingIntEncoder as a filter over another +encoder. Even though it does not do any filtering on the actual values, it feels +right to present it as a filter. Anyway, this is just an example code and one +can choose to implement it however it makes sense to the application. For +simplicity, error checking was omitted from the sample code. + + \ No newline at end of file diff --git a/modules/facet/src/java/overview.html b/modules/facet/src/java/overview.html new file mode 100644 index 00000000000..60965e1c480 --- /dev/null +++ b/modules/facet/src/java/overview.html @@ -0,0 +1,26 @@ + + + + + facet + + + + facet + + \ No newline at end of file diff --git a/modules/facet/src/test/org/apache/lucene/facet/FacetTestBase.java b/modules/facet/src/test/org/apache/lucene/facet/FacetTestBase.java new file mode 100644 index 00000000000..ebb6880baf2 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/FacetTestBase.java @@ -0,0 +1,323 @@ +package org.apache.lucene.facet; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.DocumentBuilder.DocumentBuilderException; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.Field.TermVector; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.RAMDirectory; + +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.index.CategoryDocumentBuilder; +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.search.params.FacetRequest; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Base faceted search test. */ +public abstract class FacetTestBase extends LuceneTestCase { + + /** Documents text field. */ + protected static final String CONTENT_FIELD = "content"; + + /** Directory for the index */ + protected Directory indexDir; + + /** Directory for the taxonomy */ + protected Directory taxoDir; + + /** taxonomy Reader for the test. */ + protected TaxonomyReader taxoReader; + + /** Index Reader for the test. */ + protected IndexReader indexReader; + + /** Searcher for the test. */ + protected IndexSearcher searcher; + + /** documents text (for the text field). */ + private static final String[] DEFAULT_CONTENT = { + "the white car is the one I want.", + "the white dog does not belong to anyone.", + }; + + /** Facets: facets[D][F] == category-path no. F for document no. D. */ + private static final CategoryPath[][] DEFAULT_CATEGORIES = { + { new CategoryPath("root","a","f1"), new CategoryPath("root","a","f2") }, + { new CategoryPath("root","a","f1"), new CategoryPath("root","a","f3") }, + }; + + /** categories to be added to specified doc */ + protected List getCategories(int doc) { + return Arrays.asList(DEFAULT_CATEGORIES[doc]); + } + + /** Number of documents to index */ + protected int numDocsToIndex() { + return DEFAULT_CONTENT.length; + } + + /** content to be added to specified doc */ + protected String getContent(int doc) { + return DEFAULT_CONTENT[doc]; + } + + /** Prepare index (in RAM) with single partition */ + protected final void initIndex() throws Exception { + initIndex(Integer.MAX_VALUE); + } + + /** Prepare index (in RAM) with some documents and some facets */ + protected final void initIndex(int partitionSize) throws Exception { + initIndex(partitionSize, false); + } + + /** Prepare index (in RAM/Disk) with some documents and some facets */ + protected final void initIndex(int partitionSize, boolean onDisk) throws Exception { + if (VERBOSE) { + System.out.println("Partition Size: " + partitionSize+" onDisk: "+onDisk); + } + + if (onDisk) { + File indexFile = new File(TEMP_DIR,"index"); + indexDir = FSDirectory.open(indexFile); + taxoDir = FSDirectory.open(new File(indexFile,"facets")); + } else { + indexDir = new RAMDirectory(); + taxoDir = new RAMDirectory(); + } + + IndexWriter iw = new IndexWriter(indexDir, new IndexWriterConfig(TEST_VERSION_CURRENT, getAnalyzer())); + TaxonomyWriter taxo = new LuceneTaxonomyWriter(taxoDir, OpenMode.CREATE); + + populateIndex(iw, taxo, getFacetIndexingParams(partitionSize)); + + // commit changes (taxonomy prior to search index for consistency) + taxo.commit(); + iw.commit(); + taxo.close(); + iw.close(); + + // prepare for searching + taxoReader = new LuceneTaxonomyReader(taxoDir); + indexReader = IndexReader.open(indexDir); + searcher = new IndexSearcher(indexReader); + } + + /** Returns a default facet indexing params */ + protected FacetIndexingParams getFacetIndexingParams(final int partSize) { + return new 
DefaultFacetIndexingParams() { + @Override + protected int fixedPartitionSize() { + return partSize; + } + }; + } + + /** + * Faceted Search Params for the test. + * Sub classes should override in order to test with different faceted search params. + */ + protected FacetSearchParams getFacetedSearchParams() { + return getFacetedSearchParams(Integer.MAX_VALUE); + } + + /** + * Faceted Search Params with specified partition size. + * @see #getFacetedSearchParams() + */ + protected FacetSearchParams getFacetedSearchParams(int partitionSize) { + FacetSearchParams res = new FacetSearchParams(getFacetIndexingParams(partitionSize)); + return res; + } + + /** + * Populate the test index+taxonomy for this test. + *

Subclasses can override this to test different scenarios + */ + protected void populateIndex(IndexWriter iw, TaxonomyWriter taxo, FacetIndexingParams iParams) + throws IOException, DocumentBuilderException, CorruptIndexException { + // add test documents + int numDocsToIndex = numDocsToIndex(); + for (int doc=0; doc a = new ArrayList(); + for (FacetResultNode frn : parentRes.getSubResults()) { + a.add(frn); + } + return a.toArray(new FacetResultNode[0]); + } + + /** utility Create a dummy document with specified categories and content */ + protected final void indexDoc(FacetIndexingParams iParams, IndexWriter iw, + TaxonomyWriter tw, String content, List categories) throws IOException, + CorruptIndexException { + Document d = new Document(); + CategoryDocumentBuilder builder = new CategoryDocumentBuilder(tw, iParams); + builder.setCategoryPaths(categories); + builder.build(d); + d.add(new Field("content", content, Store.YES, Index.ANALYZED, TermVector.NO)); + iw.addDocument(d); + } + + /** Build the "truth" with ALL the facets enumerating indexes content. */ + protected Map facetCountsTruth() throws IOException { + FacetIndexingParams iParams = getFacetIndexingParams(Integer.MAX_VALUE); + String delim = String.valueOf(iParams.getFacetDelimChar()); + Map res = new HashMap(); + HashSet handledTerms = new HashSet(); + for (CategoryListParams clp : iParams.getAllCategoryListParams()) { + Term baseTerm = new Term(clp.getTerm().field()); + if (!handledTerms.add(baseTerm)) { + continue; // already handled this term (for another list) + } + Terms terms = MultiFields.getTerms(indexReader, baseTerm.field()); + if (terms == null) { + continue; + } + Bits deletedDocs = MultiFields.getDeletedDocs(indexReader); + TermsEnum te = terms.iterator(); + DocsEnum de = null; + while (te.next() != null) { + de = te.docs(deletedDocs, de); + int cnt = 0; + while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + cnt++; + } + res.put(new CategoryPath(te.term().utf8ToString().split(delim)), cnt); + } + } + return res; + } + + /** Validate counts for returned facets, and that there are not too many results */ + protected static void assertCountsAndCardinality(Map facetCountsTruth, List facetResults) throws Exception { + for (FacetResult fr : facetResults) { + FacetResultNode topResNode = fr.getFacetResultNode(); + FacetRequest freq = fr.getFacetRequest(); + if (VERBOSE) { + System.out.println(freq.getCategoryPath().toString()+ "\t\t" + topResNode); + } + assertCountsAndCardinality(facetCountsTruth, topResNode, freq.getNumResults()); + } + } + + /** Validate counts for returned facets, and that there are not too many results */ + private static void assertCountsAndCardinality(Map facetCountsTruth, FacetResultNode resNode, int reqNumResults) throws Exception { + int actualNumResults = resNode.getNumSubResults(); + if (VERBOSE) { + System.out.println("NumResults: " + actualNumResults); + } + assertTrue("Too many results!", actualNumResults <= reqNumResults); + for (FacetResultNode subRes : resNode.getSubResults()) { + assertEquals("wrong count for: "+subRes, facetCountsTruth.get(subRes.getLabel()).intValue(), (int)subRes.getValue()); + assertCountsAndCardinality(facetCountsTruth, subRes, reqNumResults); // recurse into child results + } + } + + /** Validate results equality */ + protected static void assertSameResults(List expected, + List actual) { + String expectedResults = resStringValueOnly(expected); + String actualResults = resStringValueOnly(actual); + if (!expectedResults.equals(actualResults)) { + 
System.err.println("Results are not the same!"); + System.err.println("Expected:\n" + expectedResults); + System.err.println("Actual" + actualResults); + fail("Results are not the same!"); + } + } + + /** exclude the residue and numDecendants because it is incorrect in sampling */ + private static final String resStringValueOnly(List results) { + StringBuilder sb = new StringBuilder(); + for (FacetResult facetRes : results) { + sb.append(facetRes.toString()).append('\n'); + } + return sb.toString().replaceAll("Residue:.*.0", "").replaceAll("Num valid Descendants.*", ""); + } +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/FacetTestUtils.java b/modules/facet/src/test/org/apache/lucene/facet/FacetTestUtils.java new file mode 100644 index 00000000000..94b6bcd873c --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/FacetTestUtils.java @@ -0,0 +1,166 @@ +package org.apache.lucene.facet; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; + +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.Field.TermVector; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; + +import org.apache.lucene.search.MultiCollector; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.index.CategoryDocumentBuilder; +import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.search.FacetsCollector; +import org.apache.lucene.facet.search.params.CountFacetRequest; +import org.apache.lucene.facet.search.params.FacetRequest; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class FacetTestUtils { + + public static Directory[][] createIndexTaxonomyDirs(int number) { + Directory[][] dirs = new Directory[number][2]; + for (int i = 0; i < number; i++) { + dirs[i][0] = new RAMDirectory(); + dirs[i][1] = new RAMDirectory(); + } + return dirs; + } + + public static IndexTaxonomyReaderPair[] createIndexTaxonomyReaderPair( + Directory[][] dirs) throws IOException { + IndexTaxonomyReaderPair[] pairs = new IndexTaxonomyReaderPair[dirs.length]; + for (int i = 0; i < dirs.length; i++) { + IndexTaxonomyReaderPair pair = new IndexTaxonomyReaderPair(); + pair.indexReader = IndexReader.open(dirs[i][0]); + pair.indexSearcher = new IndexSearcher(pair.indexReader); + pair.taxReader = new LuceneTaxonomyReader(dirs[i][1]); + pairs[i] = pair; + } + return pairs; + } + + public static IndexTaxonomyWriterPair[] createIndexTaxonomyWriterPair( + Directory[][] dirs) throws IOException { + IndexTaxonomyWriterPair[] pairs = new IndexTaxonomyWriterPair[dirs.length]; + for (int i = 0; i < dirs.length; i++) { + IndexTaxonomyWriterPair pair = new IndexTaxonomyWriterPair(); + pair.indexWriter = new IndexWriter(dirs[i][0], new IndexWriterConfig( + LuceneTestCase.TEST_VERSION_CURRENT, new StandardAnalyzer( + LuceneTestCase.TEST_VERSION_CURRENT))); + pair.taxWriter = new LuceneTaxonomyWriter(dirs[i][1]); + pair.indexWriter.commit(); + pair.taxWriter.commit(); + pairs[i] = pair; + } + return pairs; + } + + public static Collector[] search(IndexSearcher searcher, + TaxonomyReader taxonomyReader, DefaultFacetIndexingParams iParams, + int k, String... facetNames) throws IOException, + IllegalAccessException, InstantiationException { + + Collector[] collectors = new Collector[2]; + + FacetSearchParams facetSearchParams = new FacetSearchParams(iParams); + Collection fRequests = new ArrayList(); + for (String facetName : facetNames) { + CategoryPath cp = new CategoryPath(facetName); + FacetRequest fq = new CountFacetRequest(cp, k); + facetSearchParams.addFacetRequest(fq); + fRequests.add(fq); + } + + TopScoreDocCollector topDocsCollector = TopScoreDocCollector.create( + searcher.getIndexReader().maxDoc(), true); + FacetsCollector facetsCollector = new FacetsCollector( + facetSearchParams, searcher.getIndexReader(), taxonomyReader); + Collector mColl = MultiCollector.wrap(topDocsCollector, facetsCollector); + + collectors[0] = topDocsCollector; + collectors[1] = facetsCollector; + + searcher.search(new MatchAllDocsQuery(), mColl); + return collectors; + } + + public static void add(FacetIndexingParams iParams, IndexWriter iw, + TaxonomyWriter tw, String... 
strings) throws IOException, + CorruptIndexException { + ArrayList cps = new ArrayList(); + CategoryPath cp = new CategoryPath(strings); + cps.add(cp); + Document d = new Document(); + new CategoryDocumentBuilder(tw, iParams).setCategoryPaths(cps).build(d); + d.add(new Field("content", "alpha", Store.YES, Index.ANALYZED, + TermVector.NO)); + iw.addDocument(d); + } + + public static class IndexTaxonomyReaderPair { + public IndexReader indexReader; + public TaxonomyReader taxReader; + public IndexSearcher indexSearcher; + + public void close() throws IOException { + indexSearcher.close(); + indexReader.close(); + taxReader.close(); + } + + } + + public static class IndexTaxonomyWriterPair { + public IndexWriter indexWriter; + public TaxonomyWriter taxWriter; + + public void close() throws IOException { + indexWriter.close(); + taxWriter.close(); + } + + public void commit() throws IOException { + indexWriter.commit(); + taxWriter.commit(); + } + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/enhancements/CategoryEnhancementDummy1.java b/modules/facet/src/test/org/apache/lucene/facet/enhancements/CategoryEnhancementDummy1.java new file mode 100644 index 00000000000..bd49b96d7a3 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/enhancements/CategoryEnhancementDummy1.java @@ -0,0 +1,70 @@ +package org.apache.lucene.facet.enhancements; + +import org.apache.lucene.analysis.TokenStream; + +import org.apache.lucene.facet.enhancements.CategoryEnhancement; +import org.apache.lucene.facet.enhancements.params.EnhancementsIndexingParams; +import org.apache.lucene.facet.index.attributes.CategoryAttribute; +import org.apache.lucene.facet.index.attributes.CategoryProperty; +import org.apache.lucene.facet.index.streaming.CategoryListTokenizer; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class CategoryEnhancementDummy1 implements CategoryEnhancement { + + public boolean generatesCategoryList() { + return false; + } + + public String getCategoryListTermText() { + return null; + } + + public CategoryListTokenizer getCategoryListTokenizer( + TokenStream tokenizer, EnhancementsIndexingParams indexingParams, + TaxonomyWriter taxonomyWriter) { + return null; + } + + public byte[] getCategoryTokenBytes(CategoryAttribute categoryAttribute) { + return null; + } + + public Object extractCategoryTokenData(byte[] buffer, int offset, int length) { + return null; + } + + public Class getRetainableProperty() { + return null; + } + + @Override + public boolean equals(Object o) { + if (o instanceof CategoryEnhancementDummy1) { + return true; + } + return false; + } + + @Override + public int hashCode() { + return super.hashCode(); + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/enhancements/CategoryEnhancementDummy2.java b/modules/facet/src/test/org/apache/lucene/facet/enhancements/CategoryEnhancementDummy2.java new file mode 100644 index 00000000000..a23fe5e95e6 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/enhancements/CategoryEnhancementDummy2.java @@ -0,0 +1,79 @@ +package org.apache.lucene.facet.enhancements; + +import org.apache.lucene.analysis.TokenStream; + +import org.apache.lucene.facet.enhancements.CategoryEnhancement; +import org.apache.lucene.facet.enhancements.params.EnhancementsIndexingParams; +import org.apache.lucene.facet.index.DummyProperty; +import org.apache.lucene.facet.index.attributes.CategoryAttribute; +import org.apache.lucene.facet.index.attributes.CategoryProperty; +import org.apache.lucene.facet.index.streaming.CategoryListTokenizer; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class CategoryEnhancementDummy2 implements CategoryEnhancement { + + public static byte[] CATEGORY_TOKEN_BYTES = new byte[] { 3, 0, 7 }; + + public boolean generatesCategoryList() { + return false; + } + + public String getCategoryListTermText() { + return null; + } + + public CategoryListTokenizer getCategoryListTokenizer( + TokenStream tokenizer, EnhancementsIndexingParams indexingParams, + TaxonomyWriter taxonomyWriter) { + return null; + } + + public byte[] getCategoryTokenBytes(CategoryAttribute categoryAttribute) { + return CATEGORY_TOKEN_BYTES; + } + + public Object extractCategoryTokenData(byte[] buffer, int offset, int length) { + if (length != CATEGORY_TOKEN_BYTES.length) { + throw new IllegalArgumentException("unexpected data length " + + length); + } + byte[] ret = new byte[length]; + System.arraycopy(buffer, offset, ret, 0, length); + return ret; + } + + public Class getRetainableProperty() { + return DummyProperty.class; + } + + @Override + public boolean equals(Object o) { + if (o instanceof CategoryEnhancementDummy2) { + return true; + } + return false; + } + + @Override + public int hashCode() { + return super.hashCode(); + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/enhancements/CategoryEnhancementDummy3.java b/modules/facet/src/test/org/apache/lucene/facet/enhancements/CategoryEnhancementDummy3.java new file mode 100644 index 00000000000..e1cae808693 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/enhancements/CategoryEnhancementDummy3.java @@ -0,0 +1,78 @@ +package org.apache.lucene.facet.enhancements; + +import org.apache.lucene.analysis.TokenStream; + +import org.apache.lucene.facet.enhancements.CategoryEnhancement; +import org.apache.lucene.facet.enhancements.params.EnhancementsIndexingParams; +import org.apache.lucene.facet.index.attributes.CategoryAttribute; +import org.apache.lucene.facet.index.attributes.CategoryProperty; +import org.apache.lucene.facet.index.streaming.CategoryListTokenizer; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class CategoryEnhancementDummy3 implements CategoryEnhancement { + + public static byte[] CATEGORY_TOKEN_BYTES = new byte[] { 5, -1, 33, 8 }; + + public boolean generatesCategoryList() { + return false; + } + + public String getCategoryListTermText() { + return null; + } + + public CategoryListTokenizer getCategoryListTokenizer( + TokenStream tokenizer, EnhancementsIndexingParams indexingParams, + TaxonomyWriter taxonomyWriter) { + return null; + } + + public byte[] getCategoryTokenBytes(CategoryAttribute categoryAttribute) { + return CATEGORY_TOKEN_BYTES; + } + + public Object extractCategoryTokenData(byte[] buffer, int offset, int length) { + if (length != CATEGORY_TOKEN_BYTES.length) { + throw new IllegalArgumentException("unexpected data length " + + length); + } + byte[] ret = new byte[length]; + System.arraycopy(buffer, offset, ret, 0, length); + return ret; + } + + public Class getRetainableProperty() { + return null; + } + + @Override + public boolean equals(Object o) { + if (o instanceof CategoryEnhancementDummy3) { + return true; + } + return false; + } + + @Override + public int hashCode() { + return super.hashCode(); + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/enhancements/EnhancementsPayloadIteratorTest.java b/modules/facet/src/test/org/apache/lucene/facet/enhancements/EnhancementsPayloadIteratorTest.java new file mode 100644 index 00000000000..5b4d2da6e9b --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/enhancements/EnhancementsPayloadIteratorTest.java @@ -0,0 +1,108 @@ +package org.apache.lucene.facet.enhancements; + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.enhancements.EnhancementsPayloadIterator; +import org.apache.lucene.facet.enhancements.association.AssociationEnhancement; +import org.apache.lucene.facet.enhancements.params.EnhancementsIndexingParams; +import org.apache.lucene.facet.example.association.AssociationIndexer; +import org.apache.lucene.facet.example.association.AssociationUtils; +import org.apache.lucene.facet.search.DrillDown; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class EnhancementsPayloadIteratorTest extends LuceneTestCase { + + private static Directory indexDir; + private static Directory taxoDir; + private static EnhancementsIndexingParams indexingParams; + private static AssociationEnhancement associationEnhancement; + + @BeforeClass + public static void buildAssociationIndex() throws Exception { + // create Directories for the search index and for the taxonomy index + indexDir = new RAMDirectory(); + taxoDir = new RAMDirectory(); + + // index the sample documents + if (VERBOSE) { + System.out.println("index the sample documents..."); + } + AssociationIndexer.index(indexDir, taxoDir); + + indexingParams = AssociationUtils.assocIndexingParams; + associationEnhancement = (AssociationEnhancement) indexingParams + .getCategoryEnhancements().get(0); + } + + @Test + public void testFullIterator() throws IOException { + IndexReader indexReader = IndexReader.open(indexDir); + Term term = DrillDown.term(indexingParams, new CategoryPath("tags", "lucene")); + EnhancementsPayloadIterator iterator = new EnhancementsPayloadIterator( + indexingParams.getCategoryEnhancements(), indexReader, term); + assertTrue("Unexpected failure of init()", iterator.init()); + assertTrue("Missing instance of tags/lucene in doc 0", iterator.setdoc(0)); + int assoc = (Integer) iterator.getCategoryData(associationEnhancement); + assertEquals("Unexpected association value for tags/lucene in doc 0", 3, assoc, 1E-5); + assertTrue("Missing instance of tags/lucene in doc 1", iterator.setdoc(1)); + assoc = (Integer) iterator.getCategoryData(associationEnhancement); + assertEquals("Unexpected association value for tags/lucene in doc 1", 1, assoc, 1E-5); + } + + @Test + public void testEmptyIterator() throws IOException { + IndexReader indexReader = IndexReader.open(indexDir); + Term term = DrillDown.term(indexingParams, new CategoryPath("root","a", "f2")); + EnhancementsPayloadIterator iterator = new EnhancementsPayloadIterator( + indexingParams.getCategoryEnhancements(), indexReader, term); + assertTrue("Unexpected failure of init()", iterator.init()); + assertFalse("Unexpected payload for root/a/f2 in doc 0", iterator.setdoc(0)); + assertFalse("Unexpected instance of root/a/f2 in doc 1", iterator.setdoc(1)); + } + + @Test + public void testPartialIterator() throws IOException { + IndexReader indexReader = IndexReader.open(indexDir); + Term term = DrillDown.term(indexingParams, new CategoryPath("genre","software")); + EnhancementsPayloadIterator iterator = new EnhancementsPayloadIterator( + indexingParams.getCategoryEnhancements(), indexReader, term); + assertTrue("Unexpected failure of init()", iterator.init()); + assertFalse("Unexpected payload for genre/computing in doc 0", iterator.setdoc(0)); + assertTrue("Missing instance of genre/computing in doc 1", iterator.setdoc(1)); + float assoc = Float.intBitsToFloat((Integer) iterator + .getCategoryData(associationEnhancement)); + assertEquals("Unexpected association value for genre/computing in doc 1", 0.34f, assoc, 0.001); + } + + @AfterClass + public static void closeDirectories() throws IOException { + indexDir.close(); + taxoDir.close(); + } +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/enhancements/TwoEnhancementsTest.java b/modules/facet/src/test/org/apache/lucene/facet/enhancements/TwoEnhancementsTest.java new file mode 100644 index 00000000000..b1d79e5c846 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/enhancements/TwoEnhancementsTest.java @@ -0,0 +1,129 @@ +package 
org.apache.lucene.facet.enhancements; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.enhancements.EnhancementsDocumentBuilder; +import org.apache.lucene.facet.enhancements.EnhancementsPayloadIterator; +import org.apache.lucene.facet.enhancements.params.DefaultEnhancementsIndexingParams; +import org.apache.lucene.facet.enhancements.params.EnhancementsIndexingParams; +import org.apache.lucene.facet.search.DrillDown; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class TwoEnhancementsTest extends LuceneTestCase { + + @Test + public void testTwoEmptyAndNonEmptyByteArrays() throws Exception { + Directory indexDir = new RAMDirectory(); + Directory taxoDir = new RAMDirectory(); + + EnhancementsIndexingParams indexingParams = + new DefaultEnhancementsIndexingParams( + new CategoryEnhancementDummy1(), + new CategoryEnhancementDummy3()); + + // add document with a category containing data for both enhancements + List categoryPaths = new ArrayList(); + categoryPaths.add(new CategoryPath("a", "b")); + + IndexWriter indexWriter = new IndexWriter(indexDir, new IndexWriterConfig( + TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT))); + TaxonomyWriter taxo = new LuceneTaxonomyWriter(taxoDir); + + // a category document builder will add the categories to a document + // once build() is called + Document doc = new Document(); + indexWriter.addDocument(new EnhancementsDocumentBuilder(taxo, + indexingParams).setCategoryPaths(categoryPaths).build(doc)); + + indexWriter.close(); + + IndexReader indexReader = IndexReader.open(indexDir); + Term term = DrillDown.term(indexingParams, new CategoryPath("a","b")); + EnhancementsPayloadIterator iterator = new EnhancementsPayloadIterator( + indexingParams.getCategoryEnhancements(), indexReader, term); + + assertTrue("EnhancementsPayloadIterator failure", iterator.init()); + assertTrue("Missing document 0", iterator.setdoc(0)); + assertNull("Unexpected data for CategoryEnhancementDummy1", iterator + .getCategoryData(new CategoryEnhancementDummy1())); + byte[] dummy3 = (byte[]) iterator + .getCategoryData(new CategoryEnhancementDummy3()); + assertTrue("Bad array returned for CategoryEnhancementDummy3", Arrays + .equals(dummy3, CategoryEnhancementDummy3.CATEGORY_TOKEN_BYTES)); + } + + @Test + public void testTwoNonEmptyByteArrays() throws Exception { + // add document with a category containing data for both enhancements + Directory indexDir = new RAMDirectory(); + Directory taxoDir = new RAMDirectory(); + + EnhancementsIndexingParams indexingParams = + new DefaultEnhancementsIndexingParams( + new CategoryEnhancementDummy2(), + new CategoryEnhancementDummy3()); + + List categoryPaths = new ArrayList(); + categoryPaths.add(new CategoryPath("a", "b")); + + IndexWriter indexWriter = new IndexWriter(indexDir, new IndexWriterConfig( + TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT))); + TaxonomyWriter taxo = new LuceneTaxonomyWriter(taxoDir); + + // a category document builder will add the categories to a document + // once build() is called + Document doc = new Document(); + indexWriter.addDocument(new EnhancementsDocumentBuilder(taxo, + indexingParams).setCategoryPaths(categoryPaths).build(doc)); + + indexWriter.close(); + + IndexReader indexReader = IndexReader.open(indexDir); + Term term = DrillDown.term(indexingParams, new CategoryPath("a","b")); + EnhancementsPayloadIterator iterator = new EnhancementsPayloadIterator( + indexingParams.getCategoryEnhancements(), indexReader, term); + + assertTrue("EnhancementsPayloadIterator failure", iterator.init()); + assertTrue("Missing document 0", iterator.setdoc(0)); + byte[] dummy2 = (byte[]) iterator + .getCategoryData(new CategoryEnhancementDummy2()); + assertTrue("Bad array returned for CategoryEnhancementDummy2", Arrays + .equals(dummy2, CategoryEnhancementDummy2.CATEGORY_TOKEN_BYTES)); + byte[] dummy3 = (byte[]) iterator + .getCategoryData(new CategoryEnhancementDummy3()); + assertTrue("Bad array returned for
CategoryEnhancementDummy3", Arrays + .equals(dummy3, CategoryEnhancementDummy3.CATEGORY_TOKEN_BYTES)); + } +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/enhancements/association/AssociationPropertyTest.java b/modules/facet/src/test/org/apache/lucene/facet/enhancements/association/AssociationPropertyTest.java new file mode 100644 index 00000000000..e6f2f5ab3ce --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/enhancements/association/AssociationPropertyTest.java @@ -0,0 +1,65 @@ +package org.apache.lucene.facet.enhancements.association; + +import org.junit.Test; + +import org.apache.lucene.facet.FacetException; +import org.apache.lucene.facet.enhancements.association.AssociationFloatProperty; +import org.apache.lucene.facet.enhancements.association.AssociationIntProperty; +import org.apache.lucene.facet.enhancements.association.AssociationProperty; +import org.apache.lucene.util.LuceneTestCase; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Test {@link AssociationProperty}-ies. 
*/ +public class AssociationPropertyTest extends LuceneTestCase { + + @Test + public void testAssociationCountProperty() throws FacetException { + AssociationProperty aa1 = new AssociationIntProperty(5); + AssociationProperty aa2 = new AssociationIntProperty(3); + assertEquals("Wrong association for property", 5, aa1.getAssociation()); + assertEquals("Wrong association for property", 3, aa2.getAssociation()); + aa1.merge(aa2); + assertEquals("Wrong association for property", 8, aa1.getAssociation()); + } + + @Test + public void testAssociationFloatProperty() throws FacetException { + AssociationFloatProperty aa1 = new AssociationFloatProperty(5); + AssociationFloatProperty aa2 = new AssociationFloatProperty(3); + assertEquals("Wrong association for property", 5.0, aa1.getFloatAssociation(), 0.00001); + assertEquals("Wrong association for property", 3.0, aa2.getFloatAssociation(), 0.00001); + aa1.merge(aa2); + assertEquals("Wrong association for property", 8.0, aa1.getFloatAssociation(), 0.00001); + } + + @Test + public void testEquals() { + AssociationProperty aa1 = new AssociationIntProperty(5); + AssociationProperty aa2 = new AssociationIntProperty(5); + AssociationProperty aa3 = new AssociationFloatProperty(5); + AssociationProperty aa4 = new AssociationFloatProperty(5); + + assertTrue("Should be equal", aa1.equals(aa1)); + assertTrue("Should be equal", aa1.equals(aa2)); + assertFalse("Should not be equal", aa1.equals(aa3)); + assertTrue("Should be equal", aa3.equals(aa3)); + assertTrue("Should be equal", aa3.equals(aa4)); + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/enhancements/association/CustomAssociationPropertyTest.java b/modules/facet/src/test/org/apache/lucene/facet/enhancements/association/CustomAssociationPropertyTest.java new file mode 100644 index 00000000000..8d435965070 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/enhancements/association/CustomAssociationPropertyTest.java @@ -0,0 +1,97 @@ +package org.apache.lucene.facet.enhancements.association; + +import org.apache.lucene.analysis.core.KeywordAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.enhancements.EnhancementsDocumentBuilder; +import org.apache.lucene.facet.enhancements.params.DefaultEnhancementsIndexingParams; +import org.apache.lucene.facet.enhancements.params.EnhancementsIndexingParams; +import org.apache.lucene.facet.index.CategoryContainer; +import org.apache.lucene.facet.index.attributes.CategoryAttributeImpl; +import org.apache.lucene.facet.index.attributes.CategoryProperty; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class CustomAssociationPropertyTest extends LuceneTestCase { + + @Test + public void testCustomProperty() throws Exception { + class CustomProperty extends AssociationIntProperty { + public CustomProperty(int value) { + super(value); + } + @Override + public void merge(CategoryProperty other) { + throw new UnsupportedOperationException(); + } + } + + final int NUM_CATEGORIES = 10; + EnhancementsIndexingParams iParams = new DefaultEnhancementsIndexingParams( + new AssociationEnhancement()); + + Directory iDir = new RAMDirectory(); + Directory tDir = new RAMDirectory(); + + IndexWriter w = new IndexWriter(iDir, new IndexWriterConfig(TEST_VERSION_CURRENT, new KeywordAnalyzer())); + LuceneTaxonomyWriter taxoW = new LuceneTaxonomyWriter(tDir); + + CategoryContainer cc = new CategoryContainer(); + EnhancementsDocumentBuilder builder = new EnhancementsDocumentBuilder(taxoW, iParams); + for (int i = 1; i <= NUM_CATEGORIES; i++) { + CategoryAttributeImpl ca = new CategoryAttributeImpl(new CategoryPath(Integer.toString(i))); + ca.addProperty(new CustomProperty(i)); + + cc.addCategory(ca); + } + builder.setCategories(cc); + w.addDocument(builder.build(new Document())); + taxoW.close(); + w.close(); + + IndexReader reader = IndexReader.open(iDir); + LuceneTaxonomyReader taxo = new LuceneTaxonomyReader(tDir); + String field = iParams.getCategoryListParams(new CategoryPath("0")).getTerm().field(); + AssociationsPayloadIterator api = new AssociationsPayloadIterator(reader, field); + + api.setNextDoc(0); + + boolean flag = false; + for (int i = 1; i <= NUM_CATEGORIES; i++) { + int ordinal = taxo.getOrdinal(new CategoryPath(Integer.toString(i))); + flag = true; + long association = api.getAssociation(ordinal); + assertTrue("Association expected for ordinal "+ordinal+" but none was found", + association <= Integer.MAX_VALUE); + + assertEquals("Wrong association value for category '"+ i+"'", i, (int)association); + } + + assertTrue("No categories found for doc #0", flag); + } +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/enhancements/params/DefaultEnhancementsIndexingParamsTest.java b/modules/facet/src/test/org/apache/lucene/facet/enhancements/params/DefaultEnhancementsIndexingParamsTest.java new file mode 100644 index 00000000000..c151b545b27 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/enhancements/params/DefaultEnhancementsIndexingParamsTest.java @@ -0,0 +1,67 @@ +package org.apache.lucene.facet.enhancements.params; + +import java.util.List; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.enhancements.CategoryEnhancement; +import org.apache.lucene.facet.enhancements.CategoryEnhancementDummy1; +import org.apache.lucene.facet.enhancements.CategoryEnhancementDummy2; +import org.apache.lucene.facet.enhancements.params.DefaultEnhancementsIndexingParams; +import org.apache.lucene.facet.enhancements.params.EnhancementsIndexingParams; +import org.apache.lucene.facet.index.DummyProperty; +import org.apache.lucene.facet.index.attributes.CategoryProperty; + +/** + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class DefaultEnhancementsIndexingParamsTest extends LuceneTestCase { + + @Test + public void testCategoryEnhancements() { + EnhancementsIndexingParams params = + new DefaultEnhancementsIndexingParams( + new CategoryEnhancementDummy1()); + + // check retainable properties + List> retainableProps = params + .getRetainableProperties(); + assertNull("Unexpected content in retainable list", retainableProps); + + params.addCategoryEnhancements(new CategoryEnhancementDummy2()); + + List enhancements = params + .getCategoryEnhancements(); + + assertEquals("Wrong number of enhancements", 2, enhancements.size()); + + assertTrue("Wrong first enhancement", + enhancements.get(0) instanceof CategoryEnhancementDummy1); + assertTrue("Wrong second enhancement", + enhancements.get(1) instanceof CategoryEnhancementDummy2); + + // re-check retainable properties + retainableProps = params.getRetainableProperties(); + assertNotNull("Unexpected empty retainable list", retainableProps); + assertEquals("Unexpected size of retainable list", 1, retainableProps + .size()); + assertEquals("Wrong property in retainable list", DummyProperty.class, + retainableProps.get(0)); + + } +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/example/TestAdaptiveExample.java b/modules/facet/src/test/org/apache/lucene/facet/example/TestAdaptiveExample.java new file mode 100644 index 00000000000..d9144e8e276 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/example/TestAdaptiveExample.java @@ -0,0 +1,40 @@ +package org.apache.lucene.facet.example; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.example.ExampleResult; +import org.apache.lucene.facet.example.adaptive.AdaptiveMain; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Test that the adaptive example works as expected. This test helps to verify + * that examples code is alive! 
+ */ +public class TestAdaptiveExample extends LuceneTestCase { + + @Test + public void testAdaptive () throws Exception { + ExampleResult res = new AdaptiveMain().runSample(); + assertNotNull("Null result!", res); + assertNotNull("Null facet result!", res.getFacetResults()); + assertEquals("Wrong number of results!",1, res.getFacetResults().size()); + assertEquals("Wrong number of facets!",3, res.getFacetResults().get(0).getNumValidDescendants()); + } +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/example/TestAssociationExample.java b/modules/facet/src/test/org/apache/lucene/facet/example/TestAssociationExample.java new file mode 100644 index 00000000000..bbc0caa66c0 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/example/TestAssociationExample.java @@ -0,0 +1,55 @@ +package org.apache.lucene.facet.example; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.example.ExampleResult; +import org.apache.lucene.facet.example.association.AssociationMain; +import org.apache.lucene.facet.search.results.FacetResultNode; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Test that the association example works as expected. This test helps to + * verify that examples code is alive! 
+ */ +public class TestAssociationExample extends LuceneTestCase { + + private static final double[] EXPECTED_INT_SUM_RESULTS = { 4, 2}; + private static final double[] EXPECTED_FLOAT_SUM_RESULTS = { 1.62, 0.34}; + + @Test + public void testAssociationExamples() throws Exception { + assertExampleResult(new AssociationMain().runSumIntAssociationSample(), EXPECTED_INT_SUM_RESULTS); + assertExampleResult(new AssociationMain().runSumFloatAssociationSample(), EXPECTED_FLOAT_SUM_RESULTS); + } + + private void assertExampleResult(ExampleResult res, double[] expectedResults) { + assertNotNull("Null result!", res); + assertNotNull("Null facet result!", res.getFacetResults()); + assertEquals("Wrong number of results!", 1, res.getFacetResults().size()); + assertEquals("Wrong number of facets!", 2, res.getFacetResults().get(0).getNumValidDescendants()); + + Iterable it = res.getFacetResults().get(0).getFacetResultNode().getSubResults(); + int i = 0; + for (FacetResultNode fResNode : it) { + assertEquals("Wrong result for facet "+fResNode.getLabel(), expectedResults[i++], fResNode.getValue(), 1E-5); + } + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/example/TestMultiCLExample.java b/modules/facet/src/test/org/apache/lucene/facet/example/TestMultiCLExample.java new file mode 100644 index 00000000000..481f5c666ff --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/example/TestMultiCLExample.java @@ -0,0 +1,90 @@ +package org.apache.lucene.facet.example; + +import java.util.Iterator; +import java.util.List; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.example.ExampleResult; +import org.apache.lucene.facet.example.multiCL.MultiCLMain; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Test that the multi-category list example works as expected. This test helps + * to verify that examples code is alive! 
+ */ +public class TestMultiCLExample extends LuceneTestCase { + + @Test + public void testMulti() throws Exception { + ExampleResult res = new MultiCLMain().runSample(); + assertCorrectMultiResults(res); + } + + public static void assertCorrectMultiResults(ExampleResult exampleResults) + throws Exception { + List results = exampleResults.getFacetResults(); + FacetResult result = results.get(0); + assertNotNull("Result should not be null", result); + assertEquals("Invalid label", "5", result.getFacetResultNode() + .getLabel().toString()); + assertEquals("Invalid value", 2.0, result.getFacetResultNode() + .getValue(), 0.0); + assertEquals("Invalid # of subresults", 3, result.getFacetResultNode() + .getNumSubResults()); + + Iterator subResults = result + .getFacetResultNode().getSubResults().iterator(); + FacetResultNode sub = subResults.next(); + assertEquals("Invalid subresult value", 1.0, sub.getValue(), 0.0); + assertEquals("Invalid subresult label", "5/2", sub.getLabel() + .toString()); + sub = subResults.next(); + assertEquals("Invalid subresult value", 1.0, sub.getValue(), 0.0); + assertEquals("Invalid subresult label", "5/7", sub.getLabel() + .toString()); + sub = subResults.next(); + assertEquals("Invalid subresult value", 1.0, sub.getValue(), 0.0); + assertEquals("Invalid subresult label", "5/5", sub.getLabel() + .toString()); + + result = results.get(1); + assertNotNull("Result should not be null", result); + assertEquals("Invalid label", "5/5", result.getFacetResultNode() + .getLabel().toString()); + assertEquals("Invalid value", 1, + result.getFacetResultNode().getValue(), 0.0); + assertEquals("Invalid number of subresults", 0, result + .getFacetResultNode().getNumSubResults()); + + result = results.get(2); + assertNotNull("Result should not be null", result); + assertEquals("Invalid label", "6/2", result.getFacetResultNode() + .getLabel().toString()); + assertEquals("Invalid value", 1, + result.getFacetResultNode().getValue(), 0.0); + assertEquals("Invalid number of subresults", 0, result + .getFacetResultNode().getNumSubResults()); + + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/example/TestSimpleExample.java b/modules/facet/src/test/org/apache/lucene/facet/example/TestSimpleExample.java new file mode 100644 index 00000000000..de4fdea1ddc --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/example/TestSimpleExample.java @@ -0,0 +1,67 @@ +package org.apache.lucene.facet.example; + +import java.util.Iterator; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.example.ExampleResult; +import org.apache.lucene.facet.example.simple.SimpleMain; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Test that the simple example works as expected. This test helps to verify + * that examples code is alive! + */ +public class TestSimpleExample extends LuceneTestCase { + + @Test + public void testSimple () throws Exception { + ExampleResult res = new SimpleMain().runSimple(); + assertNotNull("Null result!", res); + assertNotNull("Null facet result!", res.getFacetResults()); + assertEquals("Wrong number of results!",1, res.getFacetResults().size()); + assertEquals("Wrong number of facets!",3, res.getFacetResults().get(0).getNumValidDescendants()); + } + + /** + * In the drill-down test we drill down to a facet that appears in a single document. + * As a result, facets that got a count of 2 without drill-down will now get a count of 1. + */ + @Test + public void testDrillDown () throws Exception { + ExampleResult res = new SimpleMain().runDrillDown(); + assertNotNull("Null result!", res); + assertNotNull("Null facet result!", res.getFacetResults()); + assertEquals("Wrong number of results!",1, res.getFacetResults().size()); + + // drill down facet appears in only 1 doc, and that doc has only 2 facets + FacetResult facetResult = res.getFacetResults().get(0); + assertEquals("Wrong number of facets!",2, facetResult.getNumValidDescendants()); + + Iterator resIterator = facetResult.getFacetResultNode().getSubResults().iterator(); + assertTrue("Too few results", resIterator.hasNext()); + assertEquals("wrong count for first result out of 2", 1, (int)resIterator.next().getValue()); + assertTrue("Too few results", resIterator.hasNext()); + assertEquals("wrong count for second result out of 2", 1, (int)resIterator.next().getValue()); + assertFalse("Too many results!", resIterator.hasNext()); + } +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/index/CategoryContainerTest.java b/modules/facet/src/test/org/apache/lucene/facet/index/CategoryContainerTest.java new file mode 100644 index 00000000000..20ef6334d1f --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/index/CategoryContainerTest.java @@ -0,0 +1,244 @@ +package org.apache.lucene.facet.index; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.util.Iterator; + +import org.junit.Test; + +import org.apache.lucene.facet.FacetException; +import org.apache.lucene.facet.enhancements.association.AssociationIntProperty; +import org.apache.lucene.facet.enhancements.association.AssociationProperty; +import org.apache.lucene.facet.index.CategoryContainer; +import org.apache.lucene.facet.index.attributes.CategoryAttribute; +import org.apache.lucene.facet.index.attributes.CategoryAttributeImpl; +import org.apache.lucene.facet.index.streaming.CategoryAttributesStream; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class CategoryContainerTest extends CategoryContainerTestBase { + + @Test + public void basicTest() { + assertEquals("Wrong number of categories in the container", 3, + categoryContainer.size()); + + categoryContainer.clear(); + assertEquals("Container should not contain categories after clear", 0, + categoryContainer.size()); + } + + @Test + public void testIterator() throws FacetException { + Iterator iterator = categoryContainer.iterator(); + + // count the number of tokens + int nCategories; + for (nCategories = 0; iterator.hasNext(); nCategories++) { + iterator.next(); + } + assertEquals("Wrong number of tokens", 3, nCategories); + } + + @Test + public void testExistingNewCategoryWithProperty() throws FacetException { + categoryContainer.addCategory(new CategoryPath("five", "six"), + new DummyProperty()); + Iterator iterator = categoryContainer.iterator(); + + // count the number of tokens, and check there is one DummyAttribute + int nCategories; + int nProperties = 0; + for (nCategories = 0; iterator.hasNext(); nCategories++) { + CategoryAttribute attribute = iterator.next(); + if (attribute.getProperty(DummyProperty.class) != null) { + nProperties++; + } + } + assertEquals("Wrong number of tokens", 3, nCategories); + assertEquals("Wrong number of tokens with properties", 1, nProperties); + } + + @Test + public void testMultipleCategoriesWithProperties() throws FacetException { + AssociationProperty associationProperty = new AssociationIntProperty( + 49); + categoryContainer.addCategory(new CategoryPath("five", "six"), + new DummyProperty(), associationProperty); + categoryContainer.addCategory(new CategoryPath("seven", "eight"), + new DummyProperty()); + associationProperty = new AssociationIntProperty(123); + categoryContainer.addCategory(new CategoryPath("nine"), + associationProperty, new DummyProperty()); + Iterator iterator = categoryContainer.iterator(); + + // count the number of tokens, and check there is one DummyAttribute + int nCategories; + int nDummyAttributes = 0; + int nAssocAttributes = 0; + for (nCategories = 0; iterator.hasNext(); nCategories++) { + CategoryAttribute attribute = iterator.next(); + if (attribute.getProperty(DummyProperty.class) != null) { + nDummyAttributes++; + } + if (attribute.getProperty(AssociationIntProperty.class) != null) { + nAssocAttributes++; + } + } + assertEquals("Wrong number of tokens", 5, nCategories); + assertEquals("Wrong number of tokens with dummy properties", 3, + nDummyAttributes); + assertEquals("Wrong number of tokens with association properties", 2, + nAssocAttributes); + } + + @Test + public void testAddNewCategoryWithProperty() throws FacetException { + categoryContainer.addCategory(new CategoryPath("seven", "eight"), + new DummyProperty()); + Iterator iterator = categoryContainer.iterator(); + + // count the number of tokens, and check there is one DummyAttribute + int nCategories; + int nProperties = 0; + for (nCategories = 0; iterator.hasNext(); nCategories++) { + CategoryAttribute attribute = iterator.next(); + if (attribute.getProperty(DummyProperty.class) != null) { + nProperties++; + } + 
} + assertEquals("Wrong number of tokens", 4, nCategories); + assertEquals("Wrong number of tokens with properties", 1, nProperties); + } + + /** + * Test addition of {@link CategoryAttribute} object without properties to a + * {@link CategoryContainer}. + * + * @throws FacetException + */ + @Test + public void testAddCategoryAttributeWithoutProperties() + throws FacetException { + CategoryAttribute newCA = new CategoryAttributeImpl(new CategoryPath( + "seven", "eight")); + categoryContainer.addCategory(newCA); + } + + /** + * Test addition of {@link CategoryAttribute} object with property to a + * {@link CategoryContainer}. + * + * @throws FacetException + */ + @Test + public void testAddCategoryAttributeWithProperty() throws FacetException { + CategoryAttribute newCA = new CategoryAttributeImpl(new CategoryPath( + "seven", "eight")); + newCA.addProperty(new DummyProperty()); + categoryContainer.addCategory(newCA); + Iterator iterator = categoryContainer.iterator(); + + // count the number of tokens, and check there is one DummyAttribute + int nCategories; + int nProperties = 0; + for (nCategories = 0; iterator.hasNext(); nCategories++) { + CategoryAttribute attribute = iterator.next(); + if (attribute.getProperty(DummyProperty.class) != null) { + nProperties++; + } + } + assertEquals("Wrong number of tokens", 4, nCategories); + assertEquals("Wrong number of tokens with properties", 1, nProperties); + } + + /** + * Verifies that a {@link CategoryAttributesStream} can be constructed from + * {@link CategoryContainer} and produce the correct number of tokens. + * + * @throws IOException + */ + @Test + public void testCategoryAttributesStream() throws IOException { + CategoryAttributesStream stream = new CategoryAttributesStream( + categoryContainer); + // count the number of tokens + int nTokens; + for (nTokens = 0; stream.incrementToken(); nTokens++) { + } + assertEquals("Wrong number of tokens", 3, nTokens); + } + + /** + * Test that {@link CategoryContainer} merges properties. 
+ * + * @throws FacetException + */ + @Test + public void testCategoryAttributeMerge() throws FacetException { + categoryContainer.addCategory(initialCatgeories[0], + new AssociationIntProperty(2)); + categoryContainer.addCategory(initialCatgeories[0], + new AssociationIntProperty(15)); + + Iterator iterator = categoryContainer.iterator(); + + int nCategories; + int nAssociations = 0; + for (nCategories = 0; iterator.hasNext(); nCategories++) { + CategoryAttribute ca = iterator.next(); + AssociationProperty aa = (AssociationProperty) ca + .getProperty(AssociationIntProperty.class); + if (aa != null) { + assertEquals("Wrong association value", 17, aa.getAssociation()); + nAssociations++; + } + } + assertEquals("Wrong number of tokens", 3, nCategories); + assertEquals("Wrong number of tokens with associations", 1, + nAssociations); + } + + @Test + public void testSerialization() throws Exception { + AssociationProperty associationProperty = new AssociationIntProperty( + 49); + categoryContainer.addCategory(new CategoryPath("five", "six"), + new DummyProperty(), associationProperty); + categoryContainer.addCategory(new CategoryPath("seven", "eight"), + new DummyProperty()); + associationProperty = new AssociationIntProperty(123); + categoryContainer.addCategory(new CategoryPath("nine"), + associationProperty, new DummyProperty()); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(1024); + ObjectOutputStream out = new ObjectOutputStream(baos); + out.writeObject(categoryContainer); + out.close(); + + ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); + ObjectInputStream in = new ObjectInputStream(bais); + assertEquals( + "Original and deserialized CategoryContainer are different", + categoryContainer, in.readObject()); + } +} \ No newline at end of file diff --git a/modules/facet/src/test/org/apache/lucene/facet/index/CategoryContainerTestBase.java b/modules/facet/src/test/org/apache/lucene/facet/index/CategoryContainerTestBase.java new file mode 100644 index 00000000000..e85ea2e115f --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/index/CategoryContainerTestBase.java @@ -0,0 +1,45 @@ +package org.apache.lucene.facet.index; + +import org.junit.Before; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.index.CategoryContainer; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public abstract class CategoryContainerTestBase extends LuceneTestCase { + + protected CategoryContainer categoryContainer; + protected CategoryPath[] initialCatgeories; + + @Before + public void setCategoryContainer() { + initialCatgeories = new CategoryPath[3]; + initialCatgeories[0] = new CategoryPath("one", "two", "three"); + initialCatgeories[1] = new CategoryPath("four"); + initialCatgeories[2] = new CategoryPath("five", "six"); + + categoryContainer = new CategoryContainer(); + + for (int i = 0; i < initialCatgeories.length; i++) { + categoryContainer.addCategory(initialCatgeories[i]); + } + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/index/CategoryListPayloadStreamTest.java b/modules/facet/src/test/org/apache/lucene/facet/index/CategoryListPayloadStreamTest.java new file mode 100644 index 00000000000..218c1eb8c9e --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/index/CategoryListPayloadStreamTest.java @@ -0,0 +1,80 @@ +package org.apache.lucene.facet.index; + +import java.io.ByteArrayInputStream; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.index.CategoryListPayloadStream; +import org.apache.lucene.util.encoding.DGapIntDecoder; +import org.apache.lucene.util.encoding.DGapIntEncoder; +import org.apache.lucene.util.encoding.IntDecoder; +import org.apache.lucene.util.encoding.NOnesIntDecoder; +import org.apache.lucene.util.encoding.NOnesIntEncoder; +import org.apache.lucene.util.encoding.UniqueValuesIntEncoder; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class CategoryListPayloadStreamTest extends LuceneTestCase { + + /** + * Verifies that a CategoryListPayloadStream can properly encode values into + * a byte stream for later constructing a Payload. 
+ */ + @Test + public void testStream() throws Exception { + + CategoryListPayloadStream clps = new CategoryListPayloadStream( + new UniqueValuesIntEncoder(new DGapIntEncoder( + new NOnesIntEncoder(3)))); + + clps.appendIntToStream(1); + clps.appendIntToStream(10); + clps.appendIntToStream(100); + clps.appendIntToStream(1000); + clps.appendIntToStream(10000); + clps.appendIntToStream(100000); + clps.appendIntToStream(1000000); + clps.appendIntToStream(10000000); + clps.appendIntToStream(100000000); + clps.appendIntToStream(1000000000); + clps.appendIntToStream(Integer.MAX_VALUE); + + ByteArrayInputStream bais = new ByteArrayInputStream(clps + .convertStreamToByteArray()); + IntDecoder decoder = new DGapIntDecoder(new NOnesIntDecoder(3)); + decoder.reInit(bais); + assertEquals("Wrong value in byte stream", 1, decoder.decode()); + assertEquals("Wrong value in byte stream", 10, decoder.decode()); + assertEquals("Wrong value in byte stream", 100, decoder.decode()); + assertEquals("Wrong value in byte stream", 1000, decoder.decode()); + assertEquals("Wrong value in byte stream", 10000, decoder.decode()); + assertEquals("Wrong value in byte stream", 100000, decoder.decode()); + assertEquals("Wrong value in byte stream", 1000000, decoder.decode()); + assertEquals("Wrong value in byte stream", 10000000, decoder.decode()); + assertEquals("Wrong value in byte stream", 100000000, decoder.decode()); + assertEquals("Wrong value in byte stream", 1000000000, decoder.decode()); + assertEquals("Wrong value in byte stream", Integer.MAX_VALUE, decoder.decode()); + assertEquals("End of stream not reached", IntDecoder.EOS, decoder.decode()); + + clps.reset(); + decoder.reInit(bais); + assertEquals("End of stream not reached", IntDecoder.EOS, decoder.decode()); + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/index/DummyProperty.java b/modules/facet/src/test/org/apache/lucene/facet/index/DummyProperty.java new file mode 100644 index 00000000000..81e91d5fadd --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/index/DummyProperty.java @@ -0,0 +1,50 @@ +package org.apache.lucene.facet.index; + +import org.apache.lucene.facet.index.attributes.CategoryProperty; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * An empty attribute for testing. 
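Aside, not part of this patch: the CategoryListPayloadStream test above depends on the encoder chain and the decoder chain mirroring each other. Below is a minimal round-trip sketch under that assumption, using only the encoding classes and the createMatchingDecoder() call that appear elsewhere in this patch; the class name and sample values are invented for illustration.

// Illustrative sketch only; not part of this patch.
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;

import org.apache.lucene.util.encoding.DGapIntEncoder;
import org.apache.lucene.util.encoding.IntDecoder;
import org.apache.lucene.util.encoding.IntEncoder;
import org.apache.lucene.util.encoding.NOnesIntEncoder;
import org.apache.lucene.util.encoding.UniqueValuesIntEncoder;

public class EncoderRoundTripSketch {
  public static void main(String[] args) throws Exception {
    // Same encoder chain the test above feeds into CategoryListPayloadStream.
    IntEncoder encoder = new UniqueValuesIntEncoder(
        new DGapIntEncoder(new NOnesIntEncoder(3)));
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    encoder.reInit(out);
    encoder.encode(1);
    encoder.encode(10);
    encoder.encode(100);
    encoder.close();

    // The decoder must mirror the encoder; createMatchingDecoder() builds it.
    IntDecoder decoder = encoder.createMatchingDecoder();
    decoder.reInit(new ByteArrayInputStream(out.toByteArray()));
    long value;
    while ((value = decoder.decode()) != IntDecoder.EOS) {
      System.out.println(value); // 1, 10, 100
    }
  }
}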
+ */ +public class DummyProperty implements CategoryProperty { + + @Override + public boolean equals(Object o) { + if (o instanceof DummyProperty) { + return true; + } + return false; + } + + @Override + public int hashCode() { + return super.hashCode(); + } + + public void merge(CategoryProperty other) { + throw new UnsupportedOperationException( + "Merging dummy attribute is prohibited"); + } + + @Override + public String toString() { + return "I am dummy property"; + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/index/FacetsPayloadProcessorProviderTest.java b/modules/facet/src/test/org/apache/lucene/facet/index/FacetsPayloadProcessorProviderTest.java new file mode 100644 index 00000000000..75d5e6405f3 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/index/FacetsPayloadProcessorProviderTest.java @@ -0,0 +1,107 @@ +package org.apache.lucene.facet.index; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.example.merge.TaxonomyMergeUtils; +import org.apache.lucene.facet.search.FacetsCollector; +import org.apache.lucene.facet.search.params.CountFacetRequest; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
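Aside, not part of this patch: DummyProperty above deliberately refuses to merge. For contrast, a hypothetical property that does support merging could look like the sketch below; the summing behaviour mirrors what testCategoryAttributeMerge earlier implies for AssociationIntProperty (2 merged with 15 yields 17). The class name and field are invented.

// Hypothetical example, not part of this patch: a property that sums its
// value when the same category is added to a CategoryContainer twice.
import org.apache.lucene.facet.index.attributes.CategoryProperty;

public class SumIntProperty implements CategoryProperty {

  private int value;

  public SumIntProperty(int value) {
    this.value = value;
  }

  public int getValue() {
    return value;
  }

  // Invoked when a duplicate category arrives with another SumIntProperty;
  // the container keeps one CategoryAttribute and merges the properties.
  public void merge(CategoryProperty other) {
    value += ((SumIntProperty) other).value;
  }

  @Override
  public boolean equals(Object o) {
    return o instanceof SumIntProperty && ((SumIntProperty) o).value == value;
  }

  @Override
  public int hashCode() {
    return value;
  }
}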
+ */ + +public class FacetsPayloadProcessorProviderTest extends LuceneTestCase { + + private static final int NUM_DOCS = 100; + + @Test + public void testTaxonomyMergeUtils() throws Exception { + Directory dir = new RAMDirectory(); + Directory taxDir = new RAMDirectory(); + buildIndexWithFacets(dir, taxDir, true); + + Directory dir1 = new RAMDirectory(); + Directory taxDir1 = new RAMDirectory(); + buildIndexWithFacets(dir1, taxDir1, false); + + TaxonomyMergeUtils.merge(dir, taxDir, dir1, taxDir1); + + verifyResults(dir1, taxDir1); + } + + private void verifyResults(Directory dir, Directory taxDir) throws IOException { + IndexReader reader1 = IndexReader.open(dir); + LuceneTaxonomyReader taxReader = new LuceneTaxonomyReader(taxDir); + IndexSearcher searcher = new IndexSearcher(reader1); + FacetSearchParams fsp = new FacetSearchParams(); + fsp.addFacetRequest(new CountFacetRequest(new CategoryPath("tag"), NUM_DOCS)); + FacetsCollector collector = new FacetsCollector(fsp, reader1, taxReader); + searcher.search(new MatchAllDocsQuery(), collector); + FacetResult result = collector.getFacetResults().get(0); + FacetResultNode node = result.getFacetResultNode(); + for (FacetResultNode facet: node.getSubResults()) { + int weight = (int)facet.getValue(); + int label = Integer.parseInt(facet.getLabel().getComponent(1)); + //System.out.println(label + ": " + weight); + if (VERBOSE) { + System.out.println(label + ": " + weight); + } + assertEquals(NUM_DOCS ,weight); + } + } + + private void buildIndexWithFacets(Directory dir, Directory taxDir, boolean asc) throws IOException { + IndexWriterConfig config = new IndexWriterConfig(TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT)); + IndexWriter writer = new IndexWriter(dir, config); + + LuceneTaxonomyWriter taxonomyWriter = new LuceneTaxonomyWriter(taxDir); + for (int i = 1; i <= NUM_DOCS; i++) { + Document doc = new Document(); + List categoryPaths = new ArrayList(i + 1); + for (int j = i; j <= NUM_DOCS; j++) { + int facetValue = asc? j: NUM_DOCS - j; + categoryPaths.add(new CategoryPath("tag", Integer.toString(facetValue))); + } + CategoryDocumentBuilder catBuilder = new CategoryDocumentBuilder(taxonomyWriter); + catBuilder.setCategoryPaths(categoryPaths); + catBuilder.build(doc); + writer.addDocument(doc); + } + taxonomyWriter.close(); + writer.close(); + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/index/attributes/CategoryAttributeImplTest.java b/modules/facet/src/test/org/apache/lucene/facet/index/attributes/CategoryAttributeImplTest.java new file mode 100644 index 00000000000..fb1b27a5357 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/index/attributes/CategoryAttributeImplTest.java @@ -0,0 +1,125 @@ +package org.apache.lucene.facet.index.attributes; + +import java.util.ArrayList; +import java.util.List; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.FacetException; +import org.apache.lucene.facet.index.DummyProperty; +import org.apache.lucene.facet.index.attributes.CategoryAttribute; +import org.apache.lucene.facet.index.attributes.CategoryAttributeImpl; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
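Aside, not part of this patch: FacetsPayloadProcessorProviderTest above walks through the full faceted indexing and counting flow. Below is a condensed sketch of that flow, restricted to APIs that appear in this patch; the helper class, method name, and sample category values are invented for illustration.

// Illustrative sketch only; mirrors buildIndexWithFacets/verifyResults above.
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

import org.apache.lucene.facet.index.CategoryDocumentBuilder;
import org.apache.lucene.facet.search.FacetsCollector;
import org.apache.lucene.facet.search.params.CountFacetRequest;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.search.results.FacetResultNode;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter;

public class FacetRoundTripSketch {

  /** Index one document with two categories, then count the "tag" dimension. */
  public static void indexAndCount(Directory indexDir, Directory taxoDir,
      Version matchVersion) throws Exception {
    // --- indexing side ---
    IndexWriter writer = new IndexWriter(indexDir,
        new IndexWriterConfig(matchVersion, new WhitespaceAnalyzer(matchVersion)));
    LuceneTaxonomyWriter taxoWriter = new LuceneTaxonomyWriter(taxoDir);

    Document doc = new Document();
    List<CategoryPath> paths = new ArrayList<CategoryPath>();
    paths.add(new CategoryPath("tag", "lucene"));
    paths.add(new CategoryPath("tag", "facet"));
    CategoryDocumentBuilder builder = new CategoryDocumentBuilder(taxoWriter);
    builder.setCategoryPaths(paths);
    builder.build(doc);            // adds the category fields/payloads to doc
    writer.addDocument(doc);

    taxoWriter.close();
    writer.close();

    // --- search side ---
    IndexReader reader = IndexReader.open(indexDir);
    LuceneTaxonomyReader taxoReader = new LuceneTaxonomyReader(taxoDir);
    IndexSearcher searcher = new IndexSearcher(reader);

    FacetSearchParams fsp = new FacetSearchParams();
    fsp.addFacetRequest(new CountFacetRequest(new CategoryPath("tag"), 10));
    FacetsCollector collector = new FacetsCollector(fsp, reader, taxoReader);
    searcher.search(new MatchAllDocsQuery(), collector);

    FacetResult result = collector.getFacetResults().get(0);
    for (FacetResultNode node : result.getFacetResultNode().getSubResults()) {
      System.out.println(node.getLabel().toString('/') + " : " + node.getValue());
    }

    taxoReader.close();
    reader.close();
  }
}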
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class CategoryAttributeImplTest extends LuceneTestCase { + + @Test + public void testCategoryPath() { + CategoryAttribute ca = new CategoryAttributeImpl(); + + assertNull("Category Path should be null", ca.getCategoryPath()); + + CategoryPath cp = new CategoryPath("a", "b"); + ca.setCategoryPath(cp); + + assertEquals("Wrong Category Path", cp, ca.getCategoryPath()); + + ca.setCategoryPath(null); + assertNull("Category Path should be null", ca.getCategoryPath()); + + ca = new CategoryAttributeImpl(cp); + assertEquals("Wrong Category Path", cp, ca.getCategoryPath()); + } + + @Test + public void testProperties() throws FacetException { + CategoryAttribute ca = new CategoryAttributeImpl(); + + assertNull("Attribute should be null", ca + .getProperty(DummyProperty.class)); + assertNull("Attribute classes should be null", ca.getPropertyClasses()); + + ca.addProperty(new DummyProperty()); + assertEquals("DummyProperty should be in properties", + new DummyProperty(), ca.getProperty(DummyProperty.class)); + assertEquals("Attribute classes should contain 1 element", 1, ca + .getPropertyClasses().size()); + + boolean failed = false; + try { + ca.addProperty(new DummyProperty()); + } catch (UnsupportedOperationException e) { + failed = true; + } + + if (!failed) { + fail("Two DummyAttributes added to the same CategoryAttribute"); + } + + ca.clearProperties(); + assertNull("Attribute classes should be null", ca.getPropertyClasses()); + + ca.addProperty(new DummyProperty()); + assertEquals("DummyProperty should be in properties", + new DummyProperty(), ca.getProperty(DummyProperty.class)); + ca.remove(DummyProperty.class); + assertEquals("DummyProperty should not be in properties", null, ca + .getProperty(DummyProperty.class)); + assertNull("Attribute classes should be null", ca.getPropertyClasses()); + + ca.addProperty(new DummyProperty()); + List> propertyClasses = new ArrayList>(); + assertEquals("No property expected when no classes given", null, ca + .getProperty(propertyClasses)); + propertyClasses.add(DummyProperty.class); + assertEquals("DummyProperty should be in properties", + new DummyProperty(), ca.getProperty(propertyClasses)); + propertyClasses.add(OrdinalProperty.class); + assertEquals("DummyProperty should be in properties", + new DummyProperty(), ca.getProperty(propertyClasses)); + propertyClasses.clear(); + propertyClasses.add(OrdinalProperty.class); + assertEquals("No ordinal property expected", null, ca + .getProperty(propertyClasses)); + } + + @Test + public void testCloneCopyToAndSet() throws FacetException { + CategoryAttributeImpl ca1 = new CategoryAttributeImpl(); + + CategoryPath cp = new CategoryPath("a", "b"); + ca1.setCategoryPath(cp); + ca1.addProperty(new DummyProperty()); + + CategoryAttribute ca2 = ca1.clone(); + assertEquals("Error in cloning", ca1, ca2); + + CategoryAttributeImpl ca3 = new CategoryAttributeImpl(); + assertNotSame("Should not be the same", ca1, ca3); + 
ca1.copyTo(ca3); + assertEquals("Error in cloning", ca1, ca3); + + ca2.setCategoryPath(null); + assertNotSame("Should not be the same", ca1, ca2); + ca2.set(ca3); + assertEquals("Error in cloning", ca1, ca2); + } +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/index/attributes/CategoryAttributesIterableTest.java b/modules/facet/src/test/org/apache/lucene/facet/index/attributes/CategoryAttributesIterableTest.java new file mode 100644 index 00000000000..6301b0fded3 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/index/attributes/CategoryAttributesIterableTest.java @@ -0,0 +1,53 @@ +package org.apache.lucene.facet.index.attributes; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.junit.Test; + +import org.apache.lucene.facet.index.CategoryContainerTestBase; +import org.apache.lucene.facet.index.attributes.CategoryAttribute; +import org.apache.lucene.facet.index.attributes.CategoryAttributesIterable; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class CategoryAttributesIterableTest extends CategoryContainerTestBase { + + @Test + public void testIterator() throws IOException { + List categoryList = new ArrayList(); + for (int i = 0; i < initialCatgeories.length; i++) { + categoryList.add(initialCatgeories[i]); + } + + CategoryAttributesIterable iterable = new CategoryAttributesIterable( + categoryList); + Iterator iterator = iterable.iterator(); + + // count the number of tokens + int nCategories; + for (nCategories = 0; iterator.hasNext(); nCategories++) { + iterator.next(); + } + assertEquals("Wrong number of tokens", 3, nCategories); + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/index/categorypolicy/OrdinalPolicyTest.java b/modules/facet/src/test/org/apache/lucene/facet/index/categorypolicy/OrdinalPolicyTest.java new file mode 100644 index 00000000000..43cfb4602fa --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/index/categorypolicy/OrdinalPolicyTest.java @@ -0,0 +1,90 @@ +package org.apache.lucene.facet.index.categorypolicy; + +import org.apache.lucene.store.RAMDirectory; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.index.categorypolicy.DefaultOrdinalPolicy; +import org.apache.lucene.facet.index.categorypolicy.NonTopLevelOrdinalPolicy; +import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class OrdinalPolicyTest extends LuceneTestCase { + + @Test + public void testDefaultOrdinalPolicy() { + // check ordinal policy + OrdinalPolicy ordinalPolicy = new DefaultOrdinalPolicy(); + assertFalse("default ordinal policy should not match root", ordinalPolicy + .shouldAdd(TaxonomyReader.ROOT_ORDINAL)); + for (int i = 0; i < 300; i++) { + int ordinal = 1 + random.nextInt(Integer.MAX_VALUE - 1); + assertTrue("default ordinal policy should match " + ordinal, + ordinalPolicy.shouldAdd(ordinal)); + } + } + + @Test + public void testNonTopLevelOrdinalPolicy() throws Exception { + TaxonomyWriter taxonomy = null; + taxonomy = new LuceneTaxonomyWriter(new RAMDirectory()); + + int[] topLevelOrdinals = new int[10]; + String[] topLevelStrings = new String[10]; + for (int i = 0; i < 10; i++) { + topLevelStrings[i] = Integer.valueOf(random.nextInt(30)).toString(); + topLevelOrdinals[i] = taxonomy.addCategory(new CategoryPath( + topLevelStrings[i])); + } + int[] nonTopLevelOrdinals = new int[300]; + for (int i = 0; i < 300; i++) { + int nComponents = 2 + random.nextInt(10); + String[] components = new String[nComponents]; + components[0] = topLevelStrings[i % 10]; + for (int j = 1; j < components.length; j++) { + components[j] = (Integer.valueOf(random.nextInt(30))).toString(); + } + nonTopLevelOrdinals[i] = taxonomy.addCategory(new CategoryPath( + components)); + } + // check ordinal policy + OrdinalPolicy ordinalPolicy = new NonTopLevelOrdinalPolicy(); + ordinalPolicy.init(taxonomy); + assertFalse("top level ordinal policy should not match root", ordinalPolicy + .shouldAdd(TaxonomyReader.ROOT_ORDINAL)); + for (int i = 0; i < 10; i++) { + assertFalse("top level ordinal policy should not match " + + topLevelOrdinals[i], + ordinalPolicy.shouldAdd(topLevelOrdinals[i])); + } + for (int i = 0; i < 300; i++) { + assertTrue("top level ordinal policy should match " + + nonTopLevelOrdinals[i], + ordinalPolicy.shouldAdd(nonTopLevelOrdinals[i])); + } + + // check illegal ordinal + assertFalse("Should not add illegal ordinal", ordinalPolicy.shouldAdd(100000)); + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/index/categorypolicy/PathPolicyTest.java b/modules/facet/src/test/org/apache/lucene/facet/index/categorypolicy/PathPolicyTest.java new file mode 100644 index 00000000000..6b6c1e099f0 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/index/categorypolicy/PathPolicyTest.java @@ -0,0 +1,92 @@ +package org.apache.lucene.facet.index.categorypolicy; + +import org.apache.lucene.store.RAMDirectory; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.index.categorypolicy.DefaultPathPolicy; +import org.apache.lucene.facet.index.categorypolicy.NonTopLevelPathPolicy; +import org.apache.lucene.facet.index.categorypolicy.PathPolicy; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
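Aside, not part of this patch: OrdinalPolicyTest above exercises NonTopLevelOrdinalPolicy in isolation. The way such a policy is plugged into indexing is by overriding the factory methods of DefaultFacetIndexingParams, the same pattern CategoryParentsStreamTest uses later in this patch; a minimal sketch follows (class and method names invented).

// Sketch only; mirrors the anonymous subclass used in
// CategoryParentsStreamTest.testStreamNonTopLevelParams below.
import org.apache.lucene.facet.index.categorypolicy.NonTopLevelOrdinalPolicy;
import org.apache.lucene.facet.index.categorypolicy.NonTopLevelPathPolicy;
import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy;
import org.apache.lucene.facet.index.categorypolicy.PathPolicy;
import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams;
import org.apache.lucene.facet.index.params.FacetIndexingParams;

public class NonTopLevelIndexingParamsSketch {

  // Indexing params whose policies exclude the root and top-level categories
  // from the parents that get added at indexing time.
  public static FacetIndexingParams create() {
    return new DefaultFacetIndexingParams() {
      @Override
      protected OrdinalPolicy fixedOrdinalPolicy() {
        return new NonTopLevelOrdinalPolicy();
      }
      @Override
      protected PathPolicy fixedPathPolicy() {
        return new NonTopLevelPathPolicy();
      }
    };
  }
}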
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class PathPolicyTest extends LuceneTestCase { + + @Test + public void testDefaultPathPolicy() { + // check path policy + CategoryPath cp = new CategoryPath(); + PathPolicy pathPolicy = new DefaultPathPolicy(); + assertFalse("default path policy should not accept root", + pathPolicy.shouldAdd(cp)); + for (int i = 0; i < 300; i++) { + int nComponents = 1 + random.nextInt(10); + String[] components = new String[nComponents]; + for (int j = 0; j < components.length; j++) { + components[j] = (Integer.valueOf(random.nextInt(30))).toString(); + } + cp = new CategoryPath(components); + assertTrue("default path policy should accept " + + cp.toString('/'), + pathPolicy.shouldAdd(cp)); + } + } + + @Test + public void testNonTopLevelPathPolicy() throws Exception { + TaxonomyWriter taxonomy = null; + taxonomy = new LuceneTaxonomyWriter(new RAMDirectory()); + + CategoryPath[] topLevelPaths = new CategoryPath[10]; + String[] topLevelStrings = new String[10]; + for (int i = 0; i < 10; i++) { + topLevelStrings[i] = Integer.valueOf(random.nextInt(30)).toString(); + + topLevelPaths[i] = new CategoryPath(topLevelStrings[i]); + taxonomy.addCategory(topLevelPaths[i]); + } + CategoryPath[] nonTopLevelPaths = new CategoryPath[300]; + for (int i = 0; i < 300; i++) { + int nComponents = 2 + random.nextInt(10); + String[] components = new String[nComponents]; + components[0] = topLevelStrings[i % 10]; + for (int j = 1; j < components.length; j++) { + components[j] = (Integer.valueOf(random.nextInt(30))).toString(); + } + nonTopLevelPaths[i] = new CategoryPath(components); + taxonomy.addCategory(nonTopLevelPaths[i]); + } + // check ordinal policy + PathPolicy pathPolicy = new NonTopLevelPathPolicy(); + assertFalse("top level path policy should not match root", + pathPolicy.shouldAdd(new CategoryPath())); + for (int i = 0; i < 10; i++) { + assertFalse("top level path policy should not match " + + topLevelPaths[i], + pathPolicy.shouldAdd(topLevelPaths[i])); + } + for (int i = 0; i < 300; i++) { + assertTrue("top level path policy should match " + + nonTopLevelPaths[i], + pathPolicy.shouldAdd(nonTopLevelPaths[i])); + } + } +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/index/params/CategoryListParamsTest.java b/modules/facet/src/test/org/apache/lucene/facet/index/params/CategoryListParamsTest.java new file mode 100644 index 00000000000..37b8f26aad0 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/index/params/CategoryListParamsTest.java @@ -0,0 +1,86 @@ +package org.apache.lucene.facet.index.params; + +import org.apache.lucene.index.Term; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.index.params.CategoryListParams; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class CategoryListParamsTest extends LuceneTestCase { + + @Test + public void testDefaultSettings() { + CategoryListParams clp = new CategoryListParams(); + assertEquals("wrong default term", new Term("$facets", "$fulltree$"), clp.getTerm()); + assertEquals("unexpected default encoder", "Sorting (Unique (DGap (VInt8)))", clp.createEncoder().toString()); + assertEquals("unexpected default decoder", "DGap (VInt8)", clp.createEncoder().createMatchingDecoder().toString()); + } + + /** + * Test that the {@link CategoryListParams#hashCode()} and + * {@link CategoryListParams#equals(Object)} are consistent. + */ + @Test + public void testIdentity() { + CategoryListParams clParams1 = new CategoryListParams(); + // Assert identity is correct - a CategoryListParams equals itself. + assertEquals("A CategoryListParams object does not equal itself.", + clParams1, clParams1); + // For completeness, the object's hashcode equals itself + assertEquals("A CategoryListParams object's hashCode does not equal itself.", + clParams1.hashCode(), clParams1.hashCode()); + } + + /** + * Test that CategoryListParams behave correctly when compared against each + * other. + */ + @Test + public void testIdentityConsistency() { + // Test 2 CategoryListParams with the default parameter + CategoryListParams clParams1 = new CategoryListParams(); + CategoryListParams clParams2 = new CategoryListParams(); + assertEquals( + "2 CategoryListParams with the same default term should equal each other.", + clParams1, clParams2); + assertEquals("2 CategoryListParams with the same default term should have the same hashcode", + clParams1.hashCode(), clParams2.hashCode()); + + // Test 2 CategoryListParams with the same specified Term + clParams1 = new CategoryListParams(new Term("test")); + clParams2 = new CategoryListParams(new Term("test")); + assertEquals( + "2 CategoryListParams with the same term should equal each other.", + clParams1, clParams2); + assertEquals("2 CategoryListParams with the same term should have the same hashcode", + clParams1.hashCode(), clParams2.hashCode()); + + // Test 2 CategoryListParams with DIFFERENT terms + clParams1 = new CategoryListParams(new Term("test1")); + clParams2 = new CategoryListParams(new Term("test2")); + assertFalse( + "2 CategoryListParams with the different terms should NOT equal each other.", + clParams1.equals(clParams2)); + assertFalse( + "2 CategoryListParams with the different terms should NOT have the same hashcode.", + clParams1.hashCode() == clParams2.hashCode()); + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/index/params/DefaultFacetIndexingParamsTest.java b/modules/facet/src/test/org/apache/lucene/facet/index/params/DefaultFacetIndexingParamsTest.java new file mode 100644 index 00000000000..86d4e2acc7e --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/index/params/DefaultFacetIndexingParamsTest.java @@ -0,0 +1,120 @@ +package org.apache.lucene.facet.index.params; + +import org.apache.lucene.index.Term; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import 
org.apache.lucene.facet.index.categorypolicy.DefaultOrdinalPolicy; +import org.apache.lucene.facet.index.categorypolicy.DefaultPathPolicy; +import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy; +import org.apache.lucene.facet.index.categorypolicy.PathPolicy; +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.search.DrillDown; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.util.PartitionsUtils; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class DefaultFacetIndexingParamsTest extends LuceneTestCase { + + @Test + public void testDefaultSettings() { + FacetIndexingParams dfip = new DefaultFacetIndexingParams(); + assertNotNull("Missing default category list", dfip + .getAllCategoryListParams()); + assertEquals( + "all categories have the same CategoryListParams by default", + dfip.getCategoryListParams(null), dfip + .getCategoryListParams(new CategoryPath("a"))); + assertEquals( + "Expected default category list term is $facets:$fulltree$", + new Term("$facets", "$fulltree$"), dfip.getCategoryListParams( + null).getTerm()); + String expectedDDText = "a" + + dfip.getFacetDelimChar() + "b"; + CategoryPath cp = new CategoryPath("a", "b"); + assertEquals("wrong drill-down term", new Term("$facets", + expectedDDText), DrillDown.term(dfip,cp)); + char[] buf = new char[20]; + int numchars = dfip.drillDownTermText(cp, buf); + assertEquals("3 characters should be written", 3, numchars); + assertEquals("wrong drill-down term text", expectedDDText, new String( + buf, 0, numchars)); + CategoryListParams clParams = dfip.getCategoryListParams(null); + assertEquals("partition for all ordinals is the first", "$fulltree$", + PartitionsUtils.partitionNameByOrdinal(dfip, clParams , 250)); + assertEquals("for partition 0, the same name should be returned", + "$fulltree$", PartitionsUtils.partitionName(clParams, 0)); + assertEquals( + "for any other, it's the concatenation of name + partition", + "$fulltree$1", PartitionsUtils.partitionName(clParams, 1)); + assertEquals("default partition number is always 0", 0, + PartitionsUtils.partitionNumber(dfip,100)); + assertEquals("default partition size is unbounded", Integer.MAX_VALUE, + dfip.getPartitionSize()); + } + + @Test + public void testCategoryListParamsWithDefaultIndexingParams() { + CategoryListParams clp = new CategoryListParams( + new Term("clp", "value")); + FacetIndexingParams dfip = new DefaultFacetIndexingParams(clp); + assertEquals("Expected default category list term is " + clp.getTerm(), + 
clp.getTerm(), dfip.getCategoryListParams(null).getTerm()); + } + + @Test + public void testCategoryPolicies() { + FacetIndexingParams dfip = new DefaultFacetIndexingParams(); + long seed = System.currentTimeMillis(); + // check path policy + CategoryPath cp = new CategoryPath(); + PathPolicy pathPolicy = new DefaultPathPolicy(); + assertEquals("path policy does not match default for root" + "(seed " + + seed + ")", pathPolicy.shouldAdd(cp), dfip.getPathPolicy() + .shouldAdd(cp)); + for (int i = 0; i < 30; i++) { + int nComponents = random.nextInt(10); + String[] components = new String[nComponents]; + for (int j = 0; j < components.length; j++) { + components[j] = (Integer.valueOf(random.nextInt(30))).toString(); + } + cp = new CategoryPath(components); + assertEquals("path policy does not match default for " + + cp.toString('/') + "(seed " + seed + ")", pathPolicy + .shouldAdd(cp), dfip.getPathPolicy().shouldAdd(cp)); + } + + // check ordinal policy + OrdinalPolicy ordinalPolicy = new DefaultOrdinalPolicy(); + assertEquals("ordinal policy does not match default for root" + + "(seed " + seed + ")", ordinalPolicy + .shouldAdd(TaxonomyReader.ROOT_ORDINAL), dfip + .getOrdinalPolicy().shouldAdd(TaxonomyReader.ROOT_ORDINAL)); + for (int i = 0; i < 30; i++) { + int ordinal = random.nextInt(); + assertEquals("ordinal policy does not match default for " + ordinal + + "(seed " + seed + ")", ordinalPolicy.shouldAdd(ordinal), + dfip.getOrdinalPolicy().shouldAdd(ordinal)); + } + } + +} \ No newline at end of file diff --git a/modules/facet/src/test/org/apache/lucene/facet/index/params/PerDimensionIndexingParamsTest.java b/modules/facet/src/test/org/apache/lucene/facet/index/params/PerDimensionIndexingParamsTest.java new file mode 100644 index 00000000000..30462864d2e --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/index/params/PerDimensionIndexingParamsTest.java @@ -0,0 +1,81 @@ +package org.apache.lucene.facet.index.params; + +import org.apache.lucene.index.Term; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.index.params.PerDimensionIndexingParams; +import org.apache.lucene.facet.search.DrillDown; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.util.PartitionsUtils; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class PerDimensionIndexingParamsTest extends LuceneTestCase { + + @Test + public void testTopLevelSettings() { + FacetIndexingParams ifip = new PerDimensionIndexingParams(); + assertNotNull("Missing default category list", ifip + .getAllCategoryListParams()); + assertEquals( + "Expected default category list term is $facets:$fulltree$", + new Term("$facets", "$fulltree$"), ifip.getCategoryListParams( + null).getTerm()); + String expectedDDText = "a" + + ifip.getFacetDelimChar() + "b"; + CategoryPath cp = new CategoryPath("a", "b"); + assertEquals("wrong drill-down term", new Term("$facets", + expectedDDText), DrillDown.term(ifip,cp)); + char[] buf = new char[20]; + int numchars = ifip.drillDownTermText(cp, buf); + assertEquals("3 characters should be written", 3, numchars); + assertEquals("wrong drill-down term text", expectedDDText, new String( + buf, 0, numchars)); + + CategoryListParams clParams = ifip.getCategoryListParams(null); + assertEquals("partition for all ordinals is the first", "$fulltree$", + PartitionsUtils.partitionNameByOrdinal(ifip, clParams , 250)); + assertEquals("for partition 0, the same name should be returned", + "$fulltree$", PartitionsUtils.partitionName(clParams, 0)); + assertEquals( + "for any other, it's the concatenation of name + partition", + "$fulltree$1", PartitionsUtils.partitionName(clParams, 1)); + assertEquals("default partition number is always 0", 0, + PartitionsUtils.partitionNumber(ifip,100)); + + assertEquals("default partition size is unbounded", Integer.MAX_VALUE, + ifip.getPartitionSize()); + } + + @Test + public void testCategoryListParamsAddition() { + PerDimensionIndexingParams tlfip = new PerDimensionIndexingParams(); + CategoryListParams clp = new CategoryListParams( + new Term("clp", "value")); + tlfip.addCategoryListParams(new CategoryPath("a"), clp); + assertEquals("Expected category list term is " + clp.getTerm(), clp + .getTerm(), tlfip.getCategoryListParams(new CategoryPath("a")) + .getTerm()); + assertNotSame("Unexpected default category list " + clp.getTerm(), clp, + tlfip.getCategoryListParams(null)); + } + +} \ No newline at end of file diff --git a/modules/facet/src/test/org/apache/lucene/facet/index/streaming/CategoryAttributesStreamTest.java b/modules/facet/src/test/org/apache/lucene/facet/index/streaming/CategoryAttributesStreamTest.java new file mode 100644 index 00000000000..ace014b530c --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/index/streaming/CategoryAttributesStreamTest.java @@ -0,0 +1,82 @@ +package org.apache.lucene.facet.index.streaming; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Set; + +import org.junit.Test; + +import org.apache.lucene.facet.index.CategoryContainerTestBase; +import org.apache.lucene.facet.index.attributes.CategoryAttribute; +import org.apache.lucene.facet.index.attributes.CategoryAttributeImpl; +import org.apache.lucene.facet.index.streaming.CategoryAttributesStream; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class CategoryAttributesStreamTest extends CategoryContainerTestBase { + + /** + * Verifies that a {@link CategoryAttributesStream} accepts + * {@link CategoryAttribute} and passes them on as tokens. + * + * @throws IOException + */ + @Test + public void testStream() throws IOException { + ArrayList attributesList = new ArrayList(); + for (int i = 0; i < initialCatgeories.length; i++) { + attributesList.add(new CategoryAttributeImpl(initialCatgeories[i])); + } + + // test number of tokens + CategoryAttributesStream stream = new CategoryAttributesStream( + attributesList); + int nTokens = 0; + while (stream.incrementToken()) { + nTokens++; + } + assertEquals("Wrong number of tokens", 3, nTokens); + + // test reset + stream.reset(); + nTokens = 0; + while (stream.incrementToken()) { + nTokens++; + } + assertEquals("Wrong number of tokens", 3, nTokens); + + // test reset and contents + Set pathsSet = new HashSet(); + for (int i = 0; i < initialCatgeories.length; i++) { + pathsSet.add(initialCatgeories[i]); + } + stream.reset(); + while (stream.incrementToken()) { + CategoryAttribute fromStream = stream + .getAttribute(CategoryAttribute.class); + if (!pathsSet.remove(fromStream.getCategoryPath())) { + fail("Unexpected category path: " + + fromStream.getCategoryPath().toString(':')); + } + } + assertTrue("all category paths should have been found", pathsSet + .isEmpty()); + } +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/index/streaming/CategoryParentsStreamTest.java b/modules/facet/src/test/org/apache/lucene/facet/index/streaming/CategoryParentsStreamTest.java new file mode 100644 index 00000000000..842e5e98b20 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/index/streaming/CategoryParentsStreamTest.java @@ -0,0 +1,205 @@ +package org.apache.lucene.facet.index.streaming; + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.store.RAMDirectory; +import org.junit.Test; + +import org.apache.lucene.facet.FacetException; +import org.apache.lucene.facet.index.CategoryContainerTestBase; +import org.apache.lucene.facet.index.DummyProperty; +import org.apache.lucene.facet.index.categorypolicy.NonTopLevelOrdinalPolicy; +import org.apache.lucene.facet.index.categorypolicy.NonTopLevelPathPolicy; +import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy; +import org.apache.lucene.facet.index.categorypolicy.PathPolicy; +import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.index.streaming.CategoryAttributesStream; +import org.apache.lucene.facet.index.streaming.CategoryListTokenizer; +import org.apache.lucene.facet.index.streaming.CategoryParentsStream; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
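Aside, not part of this patch: the streaming tests (CategoryAttributesStreamTest above, CategoryParentsStreamTest and CategoryTokenizerTest below) each cover one stage of the category token pipeline. The sketch below chains those stages together; each constructor call appears in one of the tests, but chaining them in exactly this order is an assumption, and the full indexing chain also inserts further stages such as a CategoryListTokenizer. Class name and sample categories are invented.

// Illustrative sketch only; not part of this patch.
import org.apache.lucene.store.RAMDirectory;

import org.apache.lucene.facet.index.CategoryContainer;
import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams;
import org.apache.lucene.facet.index.streaming.CategoryAttributesStream;
import org.apache.lucene.facet.index.streaming.CategoryParentsStream;
import org.apache.lucene.facet.index.streaming.CategoryTokenizer;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter;

public class CategoryStreamPipelineSketch {
  public static void main(String[] args) throws Exception {
    CategoryContainer container = new CategoryContainer();
    container.addCategory(new CategoryPath("one", "two", "three"));
    container.addCategory(new CategoryPath("four"));

    TaxonomyWriter taxoWriter = new LuceneTaxonomyWriter(new RAMDirectory());
    DefaultFacetIndexingParams params = new DefaultFacetIndexingParams();

    // Stage 1: one token per CategoryAttribute in the container.
    CategoryAttributesStream attributes = new CategoryAttributesStream(container);
    // Stage 2: add a token for each parent category, as the policies allow.
    CategoryParentsStream parents =
        new CategoryParentsStream(attributes, taxoWriter, params);
    // Stage 3: render every category as a drill-down term text.
    CategoryTokenizer tokenizer = new CategoryTokenizer(parents, params);

    int tokens = 0;
    while (tokenizer.incrementToken()) {
      tokens++;
    }
    System.out.println("tokens emitted: " + tokens);

    taxoWriter.close();
  }
}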
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class CategoryParentsStreamTest extends CategoryContainerTestBase { + + /** + * Verifies that a {@link CategoryParentsStream} can be constructed from + * {@link CategoryAttributesStream} and produces the correct number of + * tokens with default facet indexing params. + * + * @throws IOException + */ + @Test + public void testStreamDefaultParams() throws IOException { + TaxonomyWriter taxonomyWriter = new LuceneTaxonomyWriter( + new RAMDirectory()); + CategoryParentsStream stream = new CategoryParentsStream( + new CategoryAttributesStream(categoryContainer), + taxonomyWriter, new DefaultFacetIndexingParams()); + + // count the number of tokens + int nTokens; + for (nTokens = 0; stream.incrementToken(); nTokens++) { + } + // should be 6 - all categories and parents + assertEquals("Wrong number of tokens", 6, nTokens); + + taxonomyWriter.close(); + } + + /** + * Verifies that a {@link CategoryParentsStream} can be constructed from + * {@link CategoryAttributesStream} and produces the correct number of + * tokens with non top level facet indexing params. + * + * @throws IOException + */ + @Test + public void testStreamNonTopLevelParams() throws IOException { + final TaxonomyWriter taxonomyWriter = new LuceneTaxonomyWriter( + new RAMDirectory()); + FacetIndexingParams indexingParams = new DefaultFacetIndexingParams() { + @Override + protected OrdinalPolicy fixedOrdinalPolicy() { + return new NonTopLevelOrdinalPolicy(); + } + @Override + protected PathPolicy fixedPathPolicy() { + return new NonTopLevelPathPolicy(); + } + }; + + CategoryParentsStream stream = new CategoryParentsStream( + new CategoryAttributesStream(categoryContainer), + taxonomyWriter, indexingParams); + + // count the number of tokens + int nTokens; + for (nTokens = 0; stream.incrementToken(); nTokens++) { + } + /* + * should be 4: 3 non top level ("two", "three" and "six"), and one + * explicit top level ("four") + */ + assertEquals("Wrong number of tokens", 4, nTokens); + + taxonomyWriter.close(); + } + + /** + * Verifies the correctness when no attributes in parents are retained in + * {@link CategoryParentsStream}. 
+ * + * @throws IOException + * @throws FacetException + */ + @Test + public void testNoRetainableAttributes() throws IOException, FacetException { + TaxonomyWriter taxonomyWriter = new LuceneTaxonomyWriter(new RAMDirectory()); + + new CategoryParentsStream(new CategoryAttributesStream(categoryContainer), + taxonomyWriter, new DefaultFacetIndexingParams()); + + // add DummyAttribute, but do not retain, only one expected + categoryContainer.addCategory(initialCatgeories[0], new DummyProperty()); + + CategoryParentsStream stream = new CategoryParentsStream(new CategoryAttributesStream( + categoryContainer), taxonomyWriter, + new DefaultFacetIndexingParams()); + + int nAttributes = 0; + while (stream.incrementToken()) { + if (stream.categoryAttribute.getProperty(DummyProperty.class) != null) { + nAttributes++; + } + } + assertEquals("Wrong number of tokens with attributes", 1, nAttributes); + + } + + /** + * Verifies the correctness when attributes in parents are retained in + * {@link CategoryParentsStream}. + * + * @throws IOException + * @throws FacetException + */ + @Test + public void testRetainableAttributes() throws IOException, FacetException { + TaxonomyWriter taxonomyWriter = new LuceneTaxonomyWriter( + new RAMDirectory()); + + FacetIndexingParams indexingParams = new DefaultFacetIndexingParams(); + new CategoryParentsStream(new CategoryAttributesStream( + categoryContainer), taxonomyWriter, indexingParams); + + // add DummyAttribute and retain it, three expected + categoryContainer.clear(); + categoryContainer + .addCategory(initialCatgeories[0], new DummyProperty()); + CategoryParentsStream stream = new CategoryParentsStream( + new CategoryAttributesStream(categoryContainer), + taxonomyWriter, new DefaultFacetIndexingParams()); + stream.addRetainableProperty(DummyProperty.class); + + MyCategoryListTokenizer tokenizer = new MyCategoryListTokenizer(stream, + indexingParams); + + int nAttributes = 0; + try { + while (tokenizer.incrementToken()) { + if (stream.categoryAttribute.getProperty(DummyProperty.class) != null) { + nAttributes++; + } + } + } catch (IOException e) { + fail("Properties retained after stream closed"); + } + assertEquals("Wrong number of tokens with attributes", 3, nAttributes); + + taxonomyWriter.close(); + } + + private final class MyCategoryListTokenizer extends CategoryListTokenizer { + + public MyCategoryListTokenizer(TokenStream input, + FacetIndexingParams indexingParams) { + super(input, indexingParams); + } + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + return true; + } + if (categoryAttribute != null) { + if (categoryAttribute.getCategoryPath() == null) { + if (categoryAttribute.getProperty(DummyProperty.class) != null) { + throw new IOException( + "Properties not cleared properly from parents stream"); + } + } + } + return false; + } + + } +} \ No newline at end of file diff --git a/modules/facet/src/test/org/apache/lucene/facet/index/streaming/CategoryTokenizerTest.java b/modules/facet/src/test/org/apache/lucene/facet/index/streaming/CategoryTokenizerTest.java new file mode 100644 index 00000000000..ecef1af7548 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/index/streaming/CategoryTokenizerTest.java @@ -0,0 +1,111 @@ +package org.apache.lucene.facet.index.streaming; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import 
org.apache.lucene.store.RAMDirectory; +import org.junit.Test; + +import org.apache.lucene.facet.index.CategoryContainerTestBase; +import org.apache.lucene.facet.index.attributes.CategoryAttributesIterable; +import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams; +import org.apache.lucene.facet.index.streaming.CategoryAttributesStream; +import org.apache.lucene.facet.index.streaming.CategoryTokenizer; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class CategoryTokenizerTest extends CategoryContainerTestBase { + + /** + * Verifies that a {@link CategoryTokenizer} adds the correct + * {@link CharTermAttribute}s to a {@link CategoryAttributesStream}. + * + * @throws IOException + */ + @Test + public void testTokensDefaultParams() throws IOException { + TaxonomyWriter taxonomyWriter = new LuceneTaxonomyWriter( + new RAMDirectory()); + DefaultFacetIndexingParams indexingParams = new DefaultFacetIndexingParams(); + CategoryTokenizer tokenizer = new CategoryTokenizer( + new CategoryAttributesStream(categoryContainer), + indexingParams); + + // count the number of tokens + Set categoryTerms = new HashSet(); + for (int i = 0; i < initialCatgeories.length; i++) { + categoryTerms.add(initialCatgeories[i] + .toString(indexingParams.getFacetDelimChar())); + } + + int nTokens; + for (nTokens = 0; tokenizer.incrementToken(); nTokens++) { + if (!categoryTerms.remove(tokenizer.termAttribute.toString())) { + fail("Unexpected term: " + tokenizer.termAttribute.toString()); + } + } + assertTrue("all category terms should have been found", categoryTerms + .isEmpty()); + + // should be 6 - all categories and parents + assertEquals("Wrong number of tokens", 3, nTokens); + + taxonomyWriter.close(); + } + + /** + * Verifies that {@link CategoryTokenizer} elongates the buffer in + * {@link CharTermAttribute} for long categories. 
+ * + * @throws IOException + */ + @Test + public void testLongCategoryPath() throws IOException { + TaxonomyWriter taxonomyWriter = new LuceneTaxonomyWriter( + new RAMDirectory()); + + List longCategory = new ArrayList(); + longCategory.add(new CategoryPath("one", "two", "three", "four", + "five", "six", "seven")); + + DefaultFacetIndexingParams indexingParams = new DefaultFacetIndexingParams(); + CategoryTokenizer tokenizer = new CategoryTokenizer( + new CategoryAttributesStream(new CategoryAttributesIterable( + longCategory)), indexingParams); + + // count the number of tokens + String categoryTerm = longCategory.get(0).toString( + indexingParams.getFacetDelimChar()); + + assertTrue("Missing token", tokenizer.incrementToken()); + if (!categoryTerm.equals(tokenizer.termAttribute.toString())) { + fail("Unexpected term: " + tokenizer.termAttribute.toString()); + } + + assertFalse("Unexpected token", tokenizer.incrementToken()); + + taxonomyWriter.close(); + } +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/AdaptiveAccumulatorTest.java b/modules/facet/src/test/org/apache/lucene/facet/search/AdaptiveAccumulatorTest.java new file mode 100644 index 00000000000..498a4dd2dee --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/search/AdaptiveAccumulatorTest.java @@ -0,0 +1,38 @@ +package org.apache.lucene.facet.search; + +import org.apache.lucene.index.IndexReader; + +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.sampling.BaseSampleTestTopK; +import org.apache.lucene.facet.search.sampling.Sampler; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class AdaptiveAccumulatorTest extends BaseSampleTestTopK { + + @Override + protected FacetsAccumulator getSamplingAccumulator(Sampler sampler, + TaxonomyReader taxoReader, IndexReader indexReader, + FacetSearchParams searchParams) { + AdaptiveFacetsAccumulator res = new AdaptiveFacetsAccumulator(searchParams, + indexReader, taxoReader); + res.setSampler(sampler); + return res; + } +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/BaseTestTopK.java b/modules/facet/src/test/org/apache/lucene/facet/search/BaseTestTopK.java new file mode 100644 index 00000000000..bbf465333ba --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/search/BaseTestTopK.java @@ -0,0 +1,108 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import org.apache.lucene.DocumentBuilder.DocumentBuilderException; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexWriter; + +import org.apache.lucene.facet.FacetTestBase; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.search.params.CountFacetRequest; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public abstract class BaseTestTopK extends FacetTestBase { + + protected static final String ALPHA = "alpha"; + protected static final String BETA = "beta"; + + /** partition sizes on which the tests are run */ + protected static int[] partitionSizes = new int[] { 2, 3, 100, Integer.MAX_VALUE }; + + /** Categories are generated from range [0,maxCategory) */ + protected static int maxCategory = 5000; + private static final int categoriesPow2 = maxCategory * maxCategory; + + private int currDoc; + private int nextInt; + + @Override + protected void populateIndex(IndexWriter iw, TaxonomyWriter taxo, + FacetIndexingParams iParams) throws IOException, + DocumentBuilderException, CorruptIndexException { + currDoc = -1; + super.populateIndex(iw, taxo, iParams); + } + + /** prepare the next random int */ + private void nextInt(int doc) { + if (currDoc == doc ) { + return; + } + currDoc = doc; + nextInt = random.nextInt(categoriesPow2); + nextInt = (int)Math.sqrt(nextInt); + } + + @Override + protected String getContent(int doc) { + nextInt(doc); + if (random.nextDouble() > 0.1) { + return ALPHA + ' ' + BETA; + } + return ALPHA; + } + + @Override + protected List getCategories(int doc) { + nextInt(doc); + CategoryPath cp = new CategoryPath( + "a", + Integer.toString(nextInt / 1000), + Integer.toString(nextInt / 100), + Integer.toString(nextInt / 10)); + if (VERBOSE) { + System.out.println("Adding CP: " + cp.toString()); + } + return Arrays.asList(new CategoryPath[] { cp }); + } + + protected FacetSearchParams searchParamsWithRequests(int numResults) { + return searchParamsWithRequests(numResults, Integer.MAX_VALUE); + } + + protected FacetSearchParams searchParamsWithRequests(int numResults, int partitionSize) { + FacetSearchParams res = getFacetedSearchParams(partitionSize); + res.addFacetRequest(new CountFacetRequest(new CategoryPath("a"), numResults)); + res.addFacetRequest(new CountFacetRequest(new CategoryPath("a", "1"), numResults)); + res.addFacetRequest(new CountFacetRequest(new CategoryPath("a", "1", "10"), numResults)); + res.addFacetRequest(new CountFacetRequest(new CategoryPath("a", "2", "26", "267"), numResults)); + return res; + } + + @Override + protected int numDocsToIndex() { + return 20000; + } +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/CategoryListIteratorTest.java b/modules/facet/src/test/org/apache/lucene/facet/search/CategoryListIteratorTest.java new file mode 100644 index 00000000000..4e81f063dca --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/search/CategoryListIteratorTest.java @@ -0,0 +1,207 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + +import org.apache.lucene.analysis.core.KeywordAnalyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Payload; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.search.CategoryListIterator; +import 
org.apache.lucene.facet.search.PayloadIntDecodingIterator; +import org.apache.lucene.util.UnsafeByteArrayOutputStream; +import org.apache.lucene.util.encoding.DGapIntEncoder; +import org.apache.lucene.util.encoding.IntEncoder; +import org.apache.lucene.util.encoding.SortingIntEncoder; +import org.apache.lucene.util.encoding.UniqueValuesIntEncoder; +import org.apache.lucene.util.encoding.VInt8IntEncoder; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class CategoryListIteratorTest extends LuceneTestCase { + + private static final class DataTokenStream extends TokenStream { + + private int idx; + private PayloadAttribute payload = addAttribute(PayloadAttribute.class); + private byte[] buf = new byte[20]; + UnsafeByteArrayOutputStream ubaos = new UnsafeByteArrayOutputStream(buf); + IntEncoder encoder; + private boolean exhausted = false; + private CharTermAttribute term = addAttribute(CharTermAttribute.class); + + public DataTokenStream(String text, IntEncoder encoder) throws IOException { + this.encoder = encoder; + term.setEmpty().append(text); + } + + public void setIdx(int idx) { + this.idx = idx; + exhausted = false; + } + + @Override + public boolean incrementToken() throws IOException { + if (exhausted) { + return false; + } + + int[] values = data[idx]; + ubaos.reInit(buf); + encoder.reInit(ubaos); + for (int val : values) { + encoder.encode(val); + } + encoder.close(); + payload.setPayload(new Payload(buf, 0, ubaos.length())); + + exhausted = true; + return true; + } + + } + + static final int[][] data = new int[][] { + new int[] { 1, 2 }, new int[] { 3, 4 }, new int[] { 1, 3 }, new int[] { 1, 2, 3, 4 }, + }; + + @Test + public void testPayloadIntDecodingIterator() throws Exception { + Directory dir = new RAMDirectory(); + DataTokenStream dts = new DataTokenStream("1",new SortingIntEncoder( + new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder())))); + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new KeywordAnalyzer())); + for (int i = 0; i < data.length; i++) { + dts.setIdx(i); + Document doc = new Document(); + doc.add(new Field("f", dts)); + writer.addDocument(doc); + } + writer.commit(); + writer.close(); + + IndexReader reader = IndexReader.open(dir, true); + CategoryListIterator cli = new PayloadIntDecodingIterator(reader, new Term( + "f","1"), dts.encoder.createMatchingDecoder()); + cli.init(); + int totalCategories = 0; + for (int i = 0; i < data.length; i++) { + Set values = new HashSet(); + for (int j = 0; j < data[i].length; j++) { + values.add(data[i][j]); + } + cli.skipTo(i); + long cat; + while ((cat = cli.nextCategory()) < Integer.MAX_VALUE) { + assertTrue("expected category not found: " + cat, values.contains((int) cat)); + totalCategories ++; + } + } 
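+ // the data rows encode 2 + 2 + 2 + 4 = 10 values in total, which is the expected category count below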
+ assertEquals("Missing categories!",10,totalCategories); + reader.close(); + } + + /** + * Test that a document with no payloads does not confuse the payload decoder. + * Test was added for tracker 143670. + * At the time of writing the test it exposes the bug fixed in tracker 143670. + * However NOTE that this exposure depends on Lucene internal implementation and + * as such in the future it may stop to expose that specific bug. + * The test should always pass, though :) + */ + @Test + public void testPayloadIteratorWithInvalidDoc() throws Exception { + Directory dir = new RAMDirectory(); + DataTokenStream dts = new DataTokenStream("1",new SortingIntEncoder( + new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder())))); + DataTokenStream dts2 = new DataTokenStream("2",new SortingIntEncoder( + new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder())))); + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new KeywordAnalyzer())); + for (int i = 0; i < data.length; i++) { + dts.setIdx(i); + Document doc = new Document(); + if (i==0 || i == 2) { + doc.add(new Field("f", dts)); // only docs 0 & 2 have payloads! + } + dts2.setIdx(i); + doc.add(new Field("f", dts2)); + writer.addDocument(doc); + writer.commit(); + } + + // add more documents to expose the bug. + // for some reason, this bug is not exposed unless these additional documents are added. + for (int i = 0; i < 10; ++i) { + Document d = new Document(); + dts.setIdx(2); + d.add(new Field("f", dts2)); + writer.addDocument(d); + if (i %10 == 0) { + writer.commit(); + } + + } + + writer.commit(); + writer.close(); + + IndexReader reader = IndexReader.open(dir, true); + CategoryListIterator cli = new PayloadIntDecodingIterator(reader, new Term( + "f","1"), dts.encoder.createMatchingDecoder()); + cli.init(); + int totalCats = 0; + for (int i = 0; i < data.length; i++) { + // doc no. i + Set values = new HashSet(); + for (int j = 0; j < data[i].length; j++) { + values.add(data[i][j]); + } + boolean hasDoc = cli.skipTo(i); + if (hasDoc) { + assertTrue("Document "+i+" must not have a payload!", i==0 || i==2 ); + long cat; + while ((cat = cli.nextCategory()) < Integer.MAX_VALUE) { + assertTrue("expected category not found: " + cat, values.contains((int) cat)); + ++totalCats; + } + } else { + assertFalse("Document "+i+" must have a payload!", i==0 || i==2 ); + } + + } + assertEquals("Wrong number of total categories!", 4, totalCats); + + // Ok.. 
went through the first 4 docs, now lets try the 6th doc (docid 5) + assertFalse("Doc #6 (docid=5) should not have a payload!",cli.skipTo(5)); + reader.close(); + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/DrillDownTest.java b/modules/facet/src/test/org/apache/lucene/facet/search/DrillDownTest.java new file mode 100644 index 00000000000..2df28dfb21b --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/search/DrillDownTest.java @@ -0,0 +1,188 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; +import java.util.ArrayList; + +import org.apache.lucene.analysis.core.KeywordAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.LockObtainFailedException; +import org.apache.lucene.store.RAMDirectory; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.index.CategoryDocumentBuilder; +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.PerDimensionIndexingParams; +import org.apache.lucene.facet.search.DrillDown; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class DrillDownTest extends LuceneTestCase { + + private FacetSearchParams defaultParams = new FacetSearchParams(); + private FacetSearchParams nonDefaultParams; + private static IndexReader reader; + private static LuceneTaxonomyReader taxo; + + public DrillDownTest() throws IOException { + PerDimensionIndexingParams iParams = new PerDimensionIndexingParams(); + CategoryListParams aClParams = new CategoryListParams(new Term("testing_facets_a", "a")); + CategoryListParams bClParams = new CategoryListParams(new Term("testing_facets_b", "b")); + + iParams.addCategoryListParams(new CategoryPath("a"), aClParams); + iParams.addCategoryListParams(new CategoryPath("b"), bClParams); + + nonDefaultParams = new FacetSearchParams(iParams); + } + @BeforeClass + public static void createIndexes() throws CorruptIndexException, LockObtainFailedException, IOException { + Directory dir = new RAMDirectory(); + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new KeywordAnalyzer())); + + Directory taxoDir = new RAMDirectory(); + TaxonomyWriter taxoWriter = new LuceneTaxonomyWriter(taxoDir); + + for (int i = 0; i < 100; i++) { + ArrayList paths = new ArrayList(); + Document doc = new Document(); + if (i % 2 == 0) { // 50 + doc.add(new Field("content", "foo", Store.NO, Index.ANALYZED)); + } + if (i % 3 == 0) { // 33 + doc.add(new Field("content", "bar", Store.NO, Index.ANALYZED)); + } + if (i % 4 == 0) { // 25 + paths.add(new CategoryPath("a")); + } + if (i % 5 == 0) { // 20 + paths.add(new CategoryPath("b")); + } + CategoryDocumentBuilder builder = new CategoryDocumentBuilder(taxoWriter); + builder.setCategoryPaths(paths).build(doc); + writer.addDocument(doc); + } + + taxoWriter.close(); + writer.commit(); + writer.close(); + + reader = IndexReader.open(dir, true); + taxo = new LuceneTaxonomyReader(taxoDir); + } + + @Test + public void testTermNonDefault() { + Term termA = DrillDown.term(nonDefaultParams, new CategoryPath("a")); + assertEquals(new Term("testing_facets_a", "a"), termA); + + Term termB = DrillDown.term(nonDefaultParams, new CategoryPath("b")); + assertEquals(new Term("testing_facets_b", "b"), termB); + } + + @Test + public void testTermDefault() { + String defaultField = CategoryListParams.DEFAULT_TERM.field(); + + Term termA = DrillDown.term(defaultParams, new CategoryPath("a")); + assertEquals(new Term(defaultField, "a"), termA); + + Term termB = DrillDown.term(defaultParams, new CategoryPath("b")); + assertEquals(new Term(defaultField, "b"), termB); + } + + @Test + public void testQuery() throws IOException { + IndexSearcher searcher = new IndexSearcher(reader); + + // Making sure the query yields 25 documents with the facet "a" + Query q = DrillDown.query(defaultParams, new CategoryPath("a")); + TopDocs docs = searcher.search(q, 100); + assertEquals(25, docs.totalHits); + + // Making sure the query yields 5 documents with the facet "b" and the + // previous (facet "a") query as a base query + Query q2 = DrillDown.query(defaultParams, q, new CategoryPath("b")); + docs = searcher.search(q2, 100); + assertEquals(5, docs.totalHits); + + // Making sure that a query of both facet "a" and facet "b" yields 5 results + Query q3 = DrillDown.query(defaultParams, new CategoryPath("a"), new CategoryPath("b")); + docs = searcher.search(q3, 100); + assertEquals(5, docs.totalHits); + + // Check that content:foo (which yields 50% results) and facet/b (which yields 20%) + // would gather together 10 results (10%..) 
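+ // (every 10th document satisfies both i % 2 == 0 and i % 5 == 0, so 10 of the 100 indexed documents are expected)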
+ Query fooQuery = new TermQuery(new Term("content", "foo")); + Query q4 = DrillDown.query(defaultParams, fooQuery, new CategoryPath("b")); + docs = searcher.search(q4, 100); + assertEquals(10, docs.totalHits); + } + + @Test + public void testQueryImplicitDefaultParams() throws IOException { + IndexSearcher searcher = new IndexSearcher(reader); + + // Create the base query to start with + Query q = DrillDown.query(defaultParams, new CategoryPath("a")); + + // Making sure the query yields 5 documents with the facet "b" and the + // previous (facet "a") query as a base query + Query q2 = DrillDown.query(q, new CategoryPath("b")); + TopDocs docs = searcher.search(q2, 100); + assertEquals(5, docs.totalHits); + + // Check that content:foo (which yields 50% results) and facet/b (which yields 20%) + // would gather together 10 results (10%..) + Query fooQuery = new TermQuery(new Term("content", "foo")); + Query q4 = DrillDown.query(fooQuery, new CategoryPath("b")); + docs = searcher.search(q4, 100); + assertEquals(10, docs.totalHits); + } + + @AfterClass + public static void closeIndexes() throws IOException { + if (reader != null) { + reader.close(); + } + + if (taxo != null) { + taxo.close(); + } + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/SamplingWrapperTest.java b/modules/facet/src/test/org/apache/lucene/facet/search/SamplingWrapperTest.java new file mode 100644 index 00000000000..e70930afef4 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/search/SamplingWrapperTest.java @@ -0,0 +1,37 @@ +package org.apache.lucene.facet.search; + +import org.apache.lucene.index.IndexReader; + +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.sampling.BaseSampleTestTopK; +import org.apache.lucene.facet.search.sampling.Sampler; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class SamplingWrapperTest extends BaseSampleTestTopK { + + @Override + protected FacetsAccumulator getSamplingAccumulator(Sampler sampler, + TaxonomyReader taxoReader, IndexReader indexReader, + FacetSearchParams searchParams) { + FacetsAccumulator fExtrctr = new StandardFacetsAccumulator(searchParams, + indexReader, taxoReader); + return new SamplingWrapper(fExtrctr, sampler); + } +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/TestCategoryListCache.java b/modules/facet/src/test/org/apache/lucene/facet/search/TestCategoryListCache.java new file mode 100644 index 00000000000..f77b70963f4 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/search/TestCategoryListCache.java @@ -0,0 +1,132 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.search.MatchAllDocsQuery; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import org.apache.lucene.facet.FacetTestBase; +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.search.cache.CategoryListCache; +import org.apache.lucene.facet.search.cache.CategoryListData; +import org.apache.lucene.facet.search.params.CountFacetRequest; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class TestCategoryListCache extends FacetTestBase { + + public TestCategoryListCache() { + super(); + } + + @Before + @Override + public void setUp() throws Exception { + super.setUp(); + initIndex(); + } + + @After + @Override + public void tearDown() throws Exception { + closeAll(); + super.tearDown(); + } + + @Test + public void testNoClCache() throws Exception { + doTest(false,false); + } + + @Test + public void testCorrectClCache() throws Exception { + doTest(true,false); + } + + @Test + public void testWrongClCache() throws Exception { + doTest(true,true); + } + + private void doTest(boolean withCache, boolean plantWrongData) throws IOException, Exception { + Map truth = facetCountsTruth(); + CategoryPath cp = (CategoryPath) truth.keySet().toArray()[0]; // any category path will do for this test + CountFacetRequest frq = new CountFacetRequest(cp, 10); + FacetSearchParams sParams = getFacetedSearchParams(); + sParams.addFacetRequest(frq); + if (withCache) { + //let's use a cached cl data + FacetIndexingParams iparams = sParams.getFacetIndexingParams(); + CategoryListParams clp = new CategoryListParams(); // default term ok as only single list + CategoryListCache clCache = new CategoryListCache(); + clCache.loadAndRegister(clp, indexReader, taxoReader, iparams); + if (plantWrongData) { + // let's mess up the cached data and then expect a wrong result... + messCachedData(clCache, clp); + } + sParams.setClCache(clCache); + } + FacetsCollector fc = new FacetsCollector(sParams, indexReader, taxoReader); + searcher.search(new MatchAllDocsQuery(), fc); + List res = fc.getFacetResults(); + try { + assertCountsAndCardinality(truth, res); + assertFalse("Correct results not expected when wrong data was cached", plantWrongData); + } catch (Throwable e) { + assertTrue("Wrong results not expected unless wrong data was cached", withCache); + assertTrue("Wrong results not expected unless wrong data was cached", plantWrongData); + } + } + + /** Mess the cached data for this {@link CategoryListParams} */ + private void messCachedData(CategoryListCache clCache, CategoryListParams clp) { + final CategoryListData cld = clCache.get(clp); + CategoryListData badCld = new CategoryListData() { + @Override + public CategoryListIterator iterator(int partition) throws IOException { + final CategoryListIterator it = cld.iterator(partition); + return new CategoryListIterator() { + public boolean skipTo(int docId) throws IOException { + return it.skipTo(docId); + } + public long nextCategory() throws IOException { + long res = it.nextCategory(); + if (res>Integer.MAX_VALUE) { + return res; + } + return res>1 ? res-1 : res+1; + } + public boolean init() throws IOException { + return it.init(); + } + }; + } + }; + clCache.register(clp, badCld); + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/TestFacetArrays.java b/modules/facet/src/test/org/apache/lucene/facet/search/TestFacetArrays.java new file mode 100644 index 00000000000..557a7f6a226 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/search/TestFacetArrays.java @@ -0,0 +1,52 @@ +package org.apache.lucene.facet.search; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.search.FacetArrays; +import org.apache.lucene.facet.search.FloatArrayAllocator; +import org.apache.lucene.facet.search.IntArrayAllocator; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class TestFacetArrays extends LuceneTestCase { + + @Test + public void testSimple() { + FacetArrays arrays = new FacetArrays(new IntArrayAllocator(1, 1), new FloatArrayAllocator(1, 1)); + + int[] intArray = arrays.getIntArray(); + // Set the element, then free + intArray[0] = 1; + arrays.free(); + + // We should expect a cleared array back + intArray = arrays.getIntArray(); + assertEquals("Expected a cleared array back, but the array is still filled", 0, intArray[0]); + + float[] floatArray = arrays.getFloatArray(); + // Set the element, then free + floatArray[0] = 1.0f; + arrays.free(); + + // We should expect a cleared array back + floatArray = arrays.getFloatArray(); + assertEquals("Expected a cleared array back, but the array is still filled", 0.0f, floatArray[0], 0.0); + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/TestFacetsAccumulatorWithComplement.java b/modules/facet/src/test/org/apache/lucene/facet/search/TestFacetsAccumulatorWithComplement.java new file mode 100644 index 00000000000..f88269be0a2 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/search/TestFacetsAccumulatorWithComplement.java @@ -0,0 +1,161 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiReader; +import org.apache.lucene.index.ParallelReader; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Query; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import org.apache.lucene.facet.FacetTestBase; +import org.apache.lucene.facet.search.FacetsAccumulator; +import org.apache.lucene.facet.search.ScoredDocIDs; +import org.apache.lucene.facet.search.ScoredDocIdCollector; +import org.apache.lucene.facet.search.StandardFacetsAccumulator; +import org.apache.lucene.facet.search.params.CountFacetRequest; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Test that complements works as expected. + * We place this test under *.facet.search rather than *.search + * because the test actually does faceted search. + */ +public class TestFacetsAccumulatorWithComplement extends FacetTestBase { + + @Override + @Before + public void setUp() throws Exception { + super.setUp(); + initIndex(); + } + + @Override + @After + public void tearDown() throws Exception { + closeAll(); + super.tearDown(); + } + + /** + * Test that complements does not cause a failure when using a parallel reader + */ + @Test + public void testComplementsWithParallelReader() throws Exception { + IndexReader origReader = indexReader; + ParallelReader pr = new ParallelReader(true); + pr.add(origReader); + indexReader = pr; + try { + doTestComplements(); + } finally { + indexReader = origReader; + } + } + + /** + * Test that complements works with MultiReader + */ + @Test + public void testComplementsWithMultiReader() throws Exception { + final IndexReader origReader = indexReader; + indexReader = new MultiReader(origReader); + try { + doTestComplements(); + } finally { + indexReader = origReader; + } + } + + /** + * Test that complements work with the plain (unwrapped) index reader + */ + @Test + public void testComplements() throws Exception { + doTestComplements(); + } + + private void doTestComplements() throws Exception { + assertTrue("Would like to test this with deletions!",indexReader.hasDeletions()); + assertTrue("Would like to test this with deletions!",indexReader.numDeletedDocs()>0); + Query q = new MatchAllDocsQuery(); //new TermQuery(new Term(TEXT,"white")); + if (VERBOSE) { + System.out.println("Query: "+q); + } + ScoredDocIdCollector dCollector = + ScoredDocIdCollector.create(indexReader.maxDoc(),false); // scoring is disabled + searcher.search(q, dCollector); + + // verify by facet values + List countResWithComplement = findFacets(dCollector.getScoredDocIDs(), true); + List countResNoComplement = findFacets(dCollector.getScoredDocIDs(), false); + + assertEquals("Wrong number of facet count results with complement!",1,countResWithComplement.size()); + assertEquals("Wrong number of facet count results no complement!",1,countResNoComplement.size()); + + FacetResultNode parentResWithComp = countResWithComplement.get(0).getFacetResultNode(); + FacetResultNode parentResNoComp = countResNoComplement.get(0).getFacetResultNode(); + + assertEquals("Wrong number of top count aggregated categories with complement!",3,parentResWithComp.getNumSubResults()); + assertEquals("Wrong number of top count aggregated categories no complement!",3,parentResNoComp.getNumSubResults()); + + } + + @Override + protected FacetSearchParams getFacetedSearchParams() { + FacetSearchParams res = super.getFacetedSearchParams(); + res.addFacetRequest(new CountFacetRequest(new CategoryPath("root","a"), 10)); + return res; + } + + /** compute facets with certain facet requests and docs */ + private List findFacets(ScoredDocIDs sDocids, boolean withComplement) throws IOException { + + FacetsAccumulator fAccumulator = + new StandardFacetsAccumulator(getFacetedSearchParams(), 
indexReader, taxoReader); + + fAccumulator.setComplementThreshold( + withComplement ? + FacetsAccumulator.FORCE_COMPLEMENT: + FacetsAccumulator.DISABLE_COMPLEMENT); + + List res = fAccumulator.accumulate(sDocids); + + // Results are ready, printing them... + int i = 0; + for (FacetResult facetResult : res) { + if (VERBOSE) { + System.out.println("Res "+(i++)+": "+facetResult); + } + } + + assertEquals(withComplement, ((StandardFacetsAccumulator) fAccumulator).isUsingComplements); + + return res; + } + +} \ No newline at end of file diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/TestMultipleCategoryLists.java b/modules/facet/src/test/org/apache/lucene/facet/search/TestMultipleCategoryLists.java new file mode 100644 index 00000000000..5f2725b8936 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/search/TestMultipleCategoryLists.java @@ -0,0 +1,384 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; + +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TopScoreDocCollector; +import org.apache.lucene.store.Directory; +import org.junit.Test; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.search.MultiCollector; +import org.apache.lucene.facet.FacetTestUtils; +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.index.params.PerDimensionIndexingParams; +import org.apache.lucene.facet.search.FacetsCollector; +import org.apache.lucene.facet.search.params.CountFacetRequest; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class TestMultipleCategoryLists extends LuceneTestCase { + + @Test + public void testDefault() throws Exception { + Directory[][] dirs = getDirs(); + // create and open an index writer + IndexWriter iw = new IndexWriter(dirs[0][0], new IndexWriterConfig( + TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT))); + // create and open a taxonomy writer + TaxonomyWriter tw = new LuceneTaxonomyWriter(dirs[0][1], OpenMode.CREATE); + + /** + * Configure with no custom counting lists + */ + PerDimensionIndexingParams iParams = new PerDimensionIndexingParams(); + + seedIndex(iw, tw, iParams); + + iw.commit(); + tw.commit(); + + // prepare index reader and taxonomy. + TaxonomyReader tr = new LuceneTaxonomyReader(dirs[0][1]); + IndexReader ir = IndexReader.open(dirs[0][0]); + + // prepare searcher to search against + IndexSearcher searcher = new IndexSearcher(ir); + + FacetsCollector facetsCollector = performSearch(iParams, tr, ir, + searcher); + + // Obtain facets results and hand-test them + assertCorrectResults(facetsCollector); + + DocsEnum td = MultiFields.getTermDocsEnum(ir, MultiFields.getDeletedDocs(ir), "$facets", new BytesRef("$fulltree$")); + assertTrue(td.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); + + tr.close(); + ir.close(); + searcher.close(); + iw.close(); + tw.close(); + } + + @Test + public void testCustom() throws Exception { + Directory[][] dirs = getDirs(); + // create and open an index writer + IndexWriter iw = new IndexWriter(dirs[0][0], new IndexWriterConfig( + TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT))); + // create and open a taxonomy writer + TaxonomyWriter tw = new LuceneTaxonomyWriter(dirs[0][1], + OpenMode.CREATE); + + PerDimensionIndexingParams iParams = new PerDimensionIndexingParams(); + iParams.addCategoryListParams(new CategoryPath("Author"), + new CategoryListParams(new Term("$author", "Authors"))); + seedIndex(iw, tw, iParams); + + iw.commit(); + tw.commit(); + + // prepare index reader and taxonomy. + TaxonomyReader tr = new LuceneTaxonomyReader(dirs[0][1]); + IndexReader ir = IndexReader.open(dirs[0][0]); + + // prepare searcher to search against + IndexSearcher searcher = new IndexSearcher(ir); + + FacetsCollector facetsCollector = performSearch(iParams, tr, ir, + searcher); + + // Obtain facets results and hand-test them + assertCorrectResults(facetsCollector); + + assertPostingListExists("$facets", "$fulltree$", ir); + assertPostingListExists("$author", "Authors", ir); + + tr.close(); + ir.close(); + searcher.close(); + iw.close(); + tw.close(); + } + + @Test + public void testTwoCustomsSameField() throws Exception { + Directory[][] dirs = getDirs(); + // create and open an index writer + IndexWriter iw = new IndexWriter(dirs[0][0], new IndexWriterConfig( + TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT))); + // create and open a taxonomy writer + TaxonomyWriter tw = new LuceneTaxonomyWriter(dirs[0][1], + OpenMode.CREATE); + + PerDimensionIndexingParams iParams = new PerDimensionIndexingParams(); + iParams.addCategoryListParams(new CategoryPath("Band"), + new CategoryListParams(new Term("$music", "Bands"))); + iParams.addCategoryListParams(new CategoryPath("Composer"), + new CategoryListParams(new Term("$music", "Composers"))); + seedIndex(iw, tw, iParams); + + iw.commit(); + tw.commit(); + + // prepare index reader and taxonomy. 
+ TaxonomyReader tr = new LuceneTaxonomyReader(dirs[0][1]); + IndexReader ir = IndexReader.open(dirs[0][0]); + + // prepare searcher to search against + IndexSearcher searcher = new IndexSearcher(ir); + + FacetsCollector facetsCollector = performSearch(iParams, tr, ir, + searcher); + + // Obtain facets results and hand-test them + assertCorrectResults(facetsCollector); + + assertPostingListExists("$facets", "$fulltree$", ir); + assertPostingListExists("$music", "Bands", ir); + assertPostingListExists("$music", "Composers", ir); + + tr.close(); + ir.close(); + searcher.close(); + iw.close(); + tw.close(); + } + + private void assertPostingListExists(String field, String text, IndexReader ir) throws IOException { + DocsEnum de = MultiFields.getTermDocsEnum(ir, null, field, new BytesRef(text)); + assertTrue(de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); + } + + @Test + public void testDifferentFieldsAndText() throws Exception { + Directory[][] dirs = getDirs(); + // create and open an index writer + IndexWriter iw = new IndexWriter(dirs[0][0], new IndexWriterConfig( + TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT))); + // create and open a taxonomy writer + TaxonomyWriter tw = new LuceneTaxonomyWriter(dirs[0][1], OpenMode.CREATE); + + PerDimensionIndexingParams iParams = new PerDimensionIndexingParams(); + iParams.addCategoryListParams(new CategoryPath("Band"), + new CategoryListParams(new Term("$bands", "Bands"))); + iParams.addCategoryListParams(new CategoryPath("Composer"), + new CategoryListParams(new Term("$composers", "Composers"))); + seedIndex(iw, tw, iParams); + + iw.commit(); + tw.commit(); + + // prepare index reader and taxonomy. + TaxonomyReader tr = new LuceneTaxonomyReader(dirs[0][1]); + IndexReader ir = IndexReader.open(dirs[0][0]); + + // prepare searcher to search against + IndexSearcher searcher = new IndexSearcher(ir); + + FacetsCollector facetsCollector = performSearch(iParams, tr, ir, + searcher); + + // Obtain facets results and hand-test them + assertCorrectResults(facetsCollector); + assertPostingListExists("$facets", "$fulltree$", ir); + assertPostingListExists("$bands", "Bands", ir); + assertPostingListExists("$composers", "Composers", ir); + tr.close(); + ir.close(); + searcher.close(); + iw.close(); + tw.close(); + } + + @Test + public void testSomeSameSomeDifferent() throws Exception { + Directory[][] dirs = getDirs(); + // create and open an index writer + IndexWriter iw = new IndexWriter(dirs[0][0], new IndexWriterConfig( + TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT))); + // create and open a taxonomy writer + TaxonomyWriter tw = new LuceneTaxonomyWriter(dirs[0][1], + OpenMode.CREATE); + + PerDimensionIndexingParams iParams = new PerDimensionIndexingParams(); + iParams.addCategoryListParams(new CategoryPath("Band"), + new CategoryListParams(new Term("$music", "music"))); + iParams.addCategoryListParams(new CategoryPath("Composer"), + new CategoryListParams(new Term("$music", "music"))); + iParams.addCategoryListParams(new CategoryPath("Author"), + new CategoryListParams(new Term("$literature", "Authors"))); + + seedIndex(iw, tw, iParams); + + iw.commit(); + tw.commit(); + + // prepare index reader and taxonomy. 
+ TaxonomyReader tr = new LuceneTaxonomyReader(dirs[0][1]); + IndexReader ir = IndexReader.open(dirs[0][0]); + + // prepare searcher to search against + IndexSearcher searcher = new IndexSearcher(ir); + + FacetsCollector facetsCollector = performSearch(iParams, tr, ir, + searcher); + + // Obtain facets results and hand-test them + assertCorrectResults(facetsCollector); + assertPostingListExists("$music", "music", ir); + assertPostingListExists("$literature", "Authors", ir); + + tr.close(); + ir.close(); + searcher.close(); + iw.close(); + tw.close(); + } + + private Directory[][] getDirs() throws IOException { + return FacetTestUtils.createIndexTaxonomyDirs(1); + } + + private void assertCorrectResults(FacetsCollector facetsCollector) + throws IOException, IllegalAccessException, InstantiationException { + List res = facetsCollector.getFacetResults(); + + FacetResult results = res.get(0); + FacetResultNode resNode = results.getFacetResultNode(); + Iterable subResults = resNode + .getSubResults(); + Iterator subIter = subResults.iterator(); + + checkResult(resNode, "Band", 5.0); + checkResult(subIter.next(), "Band/Rock & Pop", 4.0); + checkResult(subIter.next(), "Band/Punk", 1.0); + + results = res.get(1); + resNode = results.getFacetResultNode(); + subResults = resNode.getSubResults(); + subIter = subResults.iterator(); + + checkResult(resNode, "Band", 5.0); + checkResult(subIter.next(), "Band/Rock & Pop", 4.0); + checkResult(subIter.next(), "Band/Rock & Pop/Dave Matthews Band", 1.0); + checkResult(subIter.next(), "Band/Rock & Pop/REM", 1.0); + checkResult(subIter.next(), "Band/Rock & Pop/U2", 1.0); + checkResult(subIter.next(), "Band/Punk/The Ramones", 1.0); + checkResult(subIter.next(), "Band/Punk", 1.0); + checkResult(subIter.next(), "Band/Rock & Pop/The Beatles", 1.0); + + results = res.get(2); + resNode = results.getFacetResultNode(); + subResults = resNode.getSubResults(); + subIter = subResults.iterator(); + + checkResult(resNode, "Author", 3.0); + checkResult(subIter.next(), "Author/Kurt Vonnegut", 1.0); + checkResult(subIter.next(), "Author/Stephen King", 1.0); + checkResult(subIter.next(), "Author/Mark Twain", 1.0); + + results = res.get(3); + resNode = results.getFacetResultNode(); + subResults = resNode.getSubResults(); + subIter = subResults.iterator(); + + checkResult(resNode, "Band/Rock & Pop", 4.0); + checkResult(subIter.next(), "Band/Rock & Pop/Dave Matthews Band", 1.0); + checkResult(subIter.next(), "Band/Rock & Pop/REM", 1.0); + checkResult(subIter.next(), "Band/Rock & Pop/U2", 1.0); + checkResult(subIter.next(), "Band/Rock & Pop/The Beatles", 1.0); + } + + private FacetsCollector performSearch(FacetIndexingParams iParams, + TaxonomyReader tr, IndexReader ir, + IndexSearcher searcher) throws IOException { + // step 1: collect matching documents into a collector + Query q = new MatchAllDocsQuery(); + TopScoreDocCollector topDocsCollector = TopScoreDocCollector.create(10, + true); + + // Faceted search parameters indicate which facets are we interested in + FacetSearchParams facetSearchParams = new FacetSearchParams(iParams); + + facetSearchParams.addFacetRequest(new CountFacetRequest( + new CategoryPath("Band"), 10)); + CountFacetRequest bandDepth = new CountFacetRequest(new CategoryPath( + "Band"), 10); + bandDepth.setDepth(2); + facetSearchParams.addFacetRequest(bandDepth); + facetSearchParams.addFacetRequest(new CountFacetRequest( + new CategoryPath("Author"), 10)); + facetSearchParams.addFacetRequest(new CountFacetRequest( + new CategoryPath("Band", "Rock & Pop"), 
10)); + + // perform documents search and facets accumulation + FacetsCollector facetsCollector = new FacetsCollector(facetSearchParams, ir, tr); + searcher.search(q, MultiCollector.wrap(topDocsCollector, facetsCollector)); + return facetsCollector; + } + + private void seedIndex(IndexWriter iw, TaxonomyWriter tw, + FacetIndexingParams iParams) throws IOException, CorruptIndexException { + FacetTestUtils.add(iParams, iw, tw, "Author", "Mark Twain"); + FacetTestUtils.add(iParams, iw, tw, "Author", "Stephen King"); + FacetTestUtils.add(iParams, iw, tw, "Author", "Kurt Vonnegut"); + FacetTestUtils.add(iParams, iw, tw, "Band", "Rock & Pop", + "The Beatles"); + FacetTestUtils.add(iParams, iw, tw, "Band", "Punk", "The Ramones"); + FacetTestUtils.add(iParams, iw, tw, "Band", "Rock & Pop", "U2"); + FacetTestUtils.add(iParams, iw, tw, "Band", "Rock & Pop", "REM"); + FacetTestUtils.add(iParams, iw, tw, "Band", "Rock & Pop", + "Dave Matthews Band"); + FacetTestUtils.add(iParams, iw, tw, "Composer", "Bach"); + } + + private static void checkResult(FacetResultNode sub, String label, double value) { + assertEquals("Label of subresult " + sub.getLabel() + " was incorrect", + label, sub.getLabel().toString()); + assertEquals( + "Value for " + sub.getLabel() + " subresult was incorrect", + value, sub.getValue(), 0.0); + } + +} \ No newline at end of file diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/TestScoredDocIdCollector.java b/modules/facet/src/test/org/apache/lucene/facet/search/TestScoredDocIdCollector.java new file mode 100644 index 00000000000..9ec18001614 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/search/TestScoredDocIdCollector.java @@ -0,0 +1,177 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.junit.Before; +import org.junit.Test; + +import org.apache.lucene.facet.FacetTestBase; +import org.apache.lucene.facet.search.FacetsAccumulator; +import org.apache.lucene.facet.search.ScoredDocIDs; +import org.apache.lucene.facet.search.ScoredDocIDsIterator; +import org.apache.lucene.facet.search.ScoredDocIdCollector; +import org.apache.lucene.facet.search.StandardFacetsAccumulator; +import org.apache.lucene.facet.search.params.CountFacetRequest; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.params.ScoreFacetRequest; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** Test ScoredDocIdCollector. */ +public class TestScoredDocIdCollector extends FacetTestBase { + + @Override + @Before + public void setUp() throws Exception { + super.setUp(); + initIndex(); + } + + @Override + public void tearDown() throws Exception { + closeAll(); + super.tearDown(); + } + + @Test + public void testConstantScore() throws Exception { + // test that constant score works well + assertTrue("Would like to test this with deletions!",indexReader.hasDeletions()); + assertTrue("Would like to test this with deletions!",indexReader.numDeletedDocs()>0); + + Query q = new TermQuery(new Term(CONTENT_FIELD, "white")); + if (VERBOSE) { + System.out.println("Query: " + q); + } + float constScore = 17.0f; + ScoredDocIdCollector dCollector = ScoredDocIdCollector.create(indexReader + .maxDoc(), false); // scoring is disabled + dCollector.setDefaultScore(constScore); + searcher.search(q, dCollector); + + // verify by doc scores at the level of doc-id-iterator + ScoredDocIDs scoredDocIDs = dCollector.getScoredDocIDs(); + assertEquals("Wrong number of matching documents!", 2, scoredDocIDs.size()); + ScoredDocIDsIterator docItr = scoredDocIDs.iterator(); + while (docItr.next()) { + assertEquals("Wrong score for doc " + docItr.getDocID(), constScore, + docItr.getScore(), Double.MIN_VALUE); + } + + // verify by facet values + List countRes = findFacets(scoredDocIDs, getFacetedSearchParams()); + List scoreRes = findFacets(scoredDocIDs, sumScoreSearchParams()); + + assertEquals("Wrong number of facet count results!", 1, countRes.size()); + assertEquals("Wrong number of facet score results!", 1, scoreRes.size()); + + FacetResultNode parentCountRes = countRes.get(0).getFacetResultNode(); + FacetResultNode parentScoreRes = scoreRes.get(0).getFacetResultNode(); + + assertEquals("Wrong number of top count aggregated categories!", 3, + parentCountRes.getNumSubResults()); + assertEquals("Wrong number of top score aggregated categories!", 3, + parentScoreRes.getNumSubResults()); + + // rely on that facet value is computed as doc-score, and + // accordingly compare values of the two top-category results. + + FacetResultNode[] countResNodes = resultNodesAsArray(parentCountRes); + FacetResultNode[] scoreResNodes = resultNodesAsArray(parentScoreRes); + + for (int i = 0; i < scoreResNodes.length; i++) { + assertEquals("Ordinals differ!", + countResNodes[i].getOrdinal(), scoreResNodes[i].getOrdinal()); + assertEquals("Wrong scores!", + constScore * countResNodes[i].getValue(), + scoreResNodes[i].getValue(), + Double.MIN_VALUE); + } + } + + // compute facets with certain facet requests and docs + private List findFacets(ScoredDocIDs sDocids, + FacetSearchParams facetSearchParams) throws IOException { + FacetsAccumulator fAccumulator = new StandardFacetsAccumulator( + facetSearchParams, indexReader, taxoReader); + List res = fAccumulator.accumulate(sDocids); + + // Results are ready, printing them... + int i = 0; + for (FacetResult facetResult : res) { + if (VERBOSE) { + System.out.println("Res " + (i++) + ": " + facetResult); + } + } + + return res; + } + + @Test + public void testOutOfOrderCollectionScoringEnabled() throws Exception { + assertFalse( + "when scoring enabled, out-of-order collection should not be supported", + ScoredDocIdCollector.create(1, true).acceptsDocsOutOfOrder()); + } + + @Test + public void testOutOfOrderCollectionScoringDisabled() throws Exception { + // This used to fail, because ScoredDocIdCollector.acceptDocsOutOfOrder + // returned true, even when scoring was enabled. 
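+ // Here we collect a few doc IDs out of order and verify that the collector accepts them and reports all of them back.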
+ final int[] docs = new int[] { 1, 0, 2 }; // out of order on purpose + + ScoredDocIdCollector sdic = ScoredDocIdCollector.create(docs.length, false); + assertTrue( + "when scoring disabled, out-of-order collection should be supported", + sdic.acceptsDocsOutOfOrder()); + for (int i = 0; i < docs.length; i++) { + sdic.collect(docs[i]); + } + + assertEquals("expected 3 documents but got " + sdic.getScoredDocIDs().size(), 3, sdic.getScoredDocIDs().size()); + ScoredDocIDsIterator iter = sdic.getScoredDocIDs().iterator(); + Arrays.sort(docs); + for (int i = 0; iter.next(); i++) { + assertEquals("expected doc " + docs[i], docs[i], iter.getDocID()); + } + } + + /* use a scoring aggregator */ + private FacetSearchParams sumScoreSearchParams() { + // this will use default faceted indexing params, not altering anything about indexing + FacetSearchParams res = super.getFacetedSearchParams(); + res.addFacetRequest(new ScoreFacetRequest(new CategoryPath("root", "a"), 10)); + return res; + } + + @Override + protected FacetSearchParams getFacetedSearchParams() { + FacetSearchParams res = super.getFacetedSearchParams(); + res.addFacetRequest(new CountFacetRequest(new CategoryPath("root","a"), 10)); + return res; + } + +} \ No newline at end of file diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/TestTopKInEachNodeResultHandler.java b/modules/facet/src/test/org/apache/lucene/facet/search/TestTopKInEachNodeResultHandler.java new file mode 100644 index 00000000000..e7ec4eadde7 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/search/TestTopKInEachNodeResultHandler.java @@ -0,0 +1,339 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.Field.TermVector; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.index.CategoryDocumentBuilder; +import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams; +import org.apache.lucene.facet.search.FacetsAccumulator; +import org.apache.lucene.facet.search.FloatArrayAllocator; +import org.apache.lucene.facet.search.IntArrayAllocator; +import org.apache.lucene.facet.search.ScoredDocIdCollector; +import org.apache.lucene.facet.search.StandardFacetsAccumulator; +import org.apache.lucene.facet.search.params.CountFacetRequest; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.params.FacetRequest.ResultMode; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader; +import 
org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; +import org.apache.lucene.facet.util.PartitionsUtils; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class TestTopKInEachNodeResultHandler extends LuceneTestCase { + + //TODO (Facet): Move to extend BaseTestTopK and separate to several smaller test cases (methods) - see TestTopKResultsHandler + + @Test + public void testSimple() throws Exception { + + int[] partitionSizes = new int[] { + 2,3,4, 5, 6, 7, 10, 1000, + Integer.MAX_VALUE }; + + for (int partitionSize : partitionSizes) { + Directory iDir = new RAMDirectory(); + Directory tDir = new RAMDirectory(); + + if (VERBOSE) { + System.out.println("Partition Size: " + partitionSize); + } + + final int pSize = partitionSize; + DefaultFacetIndexingParams iParams = new DefaultFacetIndexingParams() { + @Override + protected int fixedPartitionSize() { + return pSize; + } + }; + + IndexWriter iw = new IndexWriter(iDir, + new IndexWriterConfig(TEST_VERSION_CURRENT, + new StandardAnalyzer(TEST_VERSION_CURRENT)).setOpenMode(OpenMode.CREATE)); + TaxonomyWriter tw = new LuceneTaxonomyWriter(tDir); + prvt_add(iParams, iw, tw, "a", "b"); + prvt_add(iParams, iw, tw, "a", "b", "1"); + prvt_add(iParams, iw, tw, "a", "b", "1"); + prvt_add(iParams, iw, tw, "a", "b", "2"); + prvt_add(iParams, iw, tw, "a", "b", "2"); + prvt_add(iParams, iw, tw, "a", "b", "2"); + prvt_add(iParams, iw, tw, "a", "b", "3"); + prvt_add(iParams, iw, tw, "a", "b", "4"); + prvt_add(iParams, iw, tw, "a", "c"); + prvt_add(iParams, iw, tw, "a", "c"); + prvt_add(iParams, iw, tw, "a", "c"); + prvt_add(iParams, iw, tw, "a", "c"); + prvt_add(iParams, iw, tw, "a", "c"); + prvt_add(iParams, iw, tw, "a", "c", "1"); + prvt_add(iParams, iw, tw, "a", "d"); + prvt_add(iParams, iw, tw, "a", "e"); + + iw.commit(); + iw.close(); + tw.commit(); + tw.close(); + + IndexSearcher is = new IndexSearcher(iDir); + LuceneTaxonomyReader tr = new LuceneTaxonomyReader(tDir); + + // Get all of the documents and run the query, then do different + // facet counts and compare to control + Query q = new TermQuery(new Term("content", "alpha")); + ScoredDocIdCollector scoredDoc = ScoredDocIdCollector.create(is.maxDoc(), true); + + // Collector collector = new MultiCollector(scoredDoc); + is.search(q, scoredDoc); + + CountFacetRequest cfra23 = new CountFacetRequest( + new CategoryPath("a"), 2); + cfra23.setDepth(3); + cfra23.setResultMode(ResultMode.PER_NODE_IN_TREE); + + CountFacetRequest cfra22 = new CountFacetRequest( + new CategoryPath("a"), 2); + cfra22.setDepth(2); + cfra22.setResultMode(ResultMode.PER_NODE_IN_TREE); + + CountFacetRequest cfra21 = new CountFacetRequest( + new CategoryPath("a"), 2); + cfra21.setDepth(1); + cfra21.setResultMode(ResultMode.PER_NODE_IN_TREE); + + 
CountFacetRequest cfrb22 = new CountFacetRequest( + new CategoryPath("a", "b"), 2); + cfrb22.setDepth(2); + cfrb22.setResultMode(ResultMode.PER_NODE_IN_TREE); + + CountFacetRequest cfrb23 = new CountFacetRequest( + new CategoryPath("a", "b"), 2); + cfrb23.setDepth(3); + cfrb23.setResultMode(ResultMode.PER_NODE_IN_TREE); + + CountFacetRequest cfrb21 = new CountFacetRequest( + new CategoryPath("a", "b"), 2); + cfrb21.setDepth(1); + cfrb21.setResultMode(ResultMode.PER_NODE_IN_TREE); + + CountFacetRequest doctor = new CountFacetRequest( + new CategoryPath("Doctor"), 2); + doctor.setDepth(1); + doctor.setResultMode(ResultMode.PER_NODE_IN_TREE); + + CountFacetRequest cfrb20 = new CountFacetRequest( + new CategoryPath("a", "b"), 2); + cfrb20.setDepth(0); + cfrb20.setResultMode(ResultMode.PER_NODE_IN_TREE); + + FacetSearchParams facetSearchParams = new FacetSearchParams(iParams); + facetSearchParams.addFacetRequest(cfra23); + facetSearchParams.addFacetRequest(cfra22); + facetSearchParams.addFacetRequest(cfra21); + facetSearchParams.addFacetRequest(cfrb23); + facetSearchParams.addFacetRequest(cfrb22); + facetSearchParams.addFacetRequest(cfrb21); + facetSearchParams.addFacetRequest(doctor); + facetSearchParams.addFacetRequest(cfrb20); + + IntArrayAllocator iaa = new IntArrayAllocator(PartitionsUtils.partitionSize(facetSearchParams,tr), 1); + FloatArrayAllocator faa = new FloatArrayAllocator(PartitionsUtils.partitionSize(facetSearchParams,tr), 1); + FacetsAccumulator fctExtrctr = new StandardFacetsAccumulator(facetSearchParams, is.getIndexReader(), tr, iaa, faa); + fctExtrctr.setComplementThreshold(FacetsAccumulator.DISABLE_COMPLEMENT); + long start = System.currentTimeMillis(); + + List facetResults = fctExtrctr.accumulate(scoredDoc.getScoredDocIDs()); + + long end = System.currentTimeMillis(); + if (VERBOSE) { + System.out.println("Time: " + (end - start)); + } + + FacetResult fr = facetResults.get(0); // a, depth=3, K=2 + boolean hasDoctor = "Doctor".equals(fr.getFacetRequest().getCategoryPath().getComponent(0)); + assertEquals(9, fr.getNumValidDescendants()); + FacetResultNode parentRes = fr.getFacetResultNode(); + assertEquals(16.0, parentRes.getValue(), Double.MIN_VALUE); + assertEquals(2.0, parentRes.getResidue(), Double.MIN_VALUE); + assertEquals(2, parentRes.getNumSubResults()); + // two nodes sorted by descending values: a/b with 8 and a/c with 6 + // a/b has residue 2 and two children a/b/2 with value 3, and a/b/1 with value 2. + // a/c has residue 0, and one child a/c/1 with value 1. + double [] expectedValues0 = { 8.0, 2.0, 3.0, 0.0, 2.0, 0.0, 6.0, 0.0, 1.0, 0.0 }; + int i = 0; + for (FacetResultNode node : parentRes.getSubResults()) { + assertEquals(expectedValues0[i++], node.getValue(), Double.MIN_VALUE); + assertEquals(expectedValues0[i++], node.getResidue(), Double.MIN_VALUE); + for (FacetResultNode node2 : node.getSubResults()) { + assertEquals(expectedValues0[i++], node2.getValue(), Double.MIN_VALUE); + assertEquals(expectedValues0[i++], node2.getResidue(), Double.MIN_VALUE); + } + } + + // now just change the value of the first child of the root to 5, and then rearrange + // expected are: first a/c of value 6 and residue 0, and one child a/c/1 with value 1 + // then a/b with value 5 and residue 2, and both children: a/b/2 with value 3, and a/b/1 with value 2. 
+ for (FacetResultNode node : parentRes.getSubResults()) { + node.setValue(5.0); + break; + } + // now rearrange + double [] expectedValues00 = { 6.0, 0.0, 1.0, 0.0, 5.0, 2.0, 3.0, 0.0, 2.0, 0.0 }; + fr = cfra23.createFacetResultsHandler(tr).rearrangeFacetResult(fr); + i = 0; + for (FacetResultNode node : parentRes.getSubResults()) { + assertEquals(expectedValues00[i++], node.getValue(), Double.MIN_VALUE); + assertEquals(expectedValues00[i++], node.getResidue(), Double.MIN_VALUE); + for (FacetResultNode node2 : node.getSubResults()) { + assertEquals(expectedValues00[i++], node2.getValue(), Double.MIN_VALUE); + assertEquals(expectedValues00[i++], node2.getResidue(), Double.MIN_VALUE); + } + } + + fr = facetResults.get(1); // a, depth=2, K=2. same result as before + hasDoctor |= "Doctor".equals(fr.getFacetRequest().getCategoryPath().getComponent(0)); + assertEquals(9, fr.getNumValidDescendants()); + parentRes = fr.getFacetResultNode(); + assertEquals(16.0, parentRes.getValue(), Double.MIN_VALUE); + assertEquals(2.0, parentRes.getResidue(), Double.MIN_VALUE); + assertEquals(2, parentRes.getNumSubResults()); + // two nodes sorted by descending values: a/b with 8 and a/c with 6 + // a/b has residue 2 and two children a/b/2 with value 3, and a/b/1 with value 2. + // a/c has residue 0, and one child a/c/1 with value 1. + i = 0; + for (FacetResultNode node : parentRes.getSubResults()) { + assertEquals(expectedValues0[i++], node.getValue(), Double.MIN_VALUE); + assertEquals(expectedValues0[i++], node.getResidue(), Double.MIN_VALUE); + for (FacetResultNode node2 : node.getSubResults()) { + assertEquals(expectedValues0[i++], node2.getValue(), Double.MIN_VALUE); + assertEquals(expectedValues0[i++], node2.getResidue(), Double.MIN_VALUE); + } + } + + fr = facetResults.get(2); // a, depth=1, K=2 + hasDoctor |= "Doctor".equals(fr.getFacetRequest().getCategoryPath().getComponent(0)); + assertEquals(4, fr.getNumValidDescendants(), 4); + parentRes = fr.getFacetResultNode(); + assertEquals(16.0, parentRes.getValue(), Double.MIN_VALUE); + assertEquals(2.0, parentRes.getResidue(), Double.MIN_VALUE); + assertEquals(2, parentRes.getNumSubResults()); + // two nodes sorted by descending values: + // a/b with value 8 and residue 0 (because no children considered), + // and a/c with value 6 and residue 0 (because no children considered) + double [] expectedValues2 = { 8.0, 0.0, 6.0, 0.0 }; + i = 0; + for (FacetResultNode node : parentRes.getSubResults()) { + assertEquals(expectedValues2[i++], node.getValue(), Double.MIN_VALUE); + assertEquals(expectedValues2[i++], node.getResidue(), Double.MIN_VALUE); + assertEquals(node.getNumSubResults(), 0); + } + + fr = facetResults.get(3); // a/b, depth=3, K=2 + hasDoctor |= "Doctor".equals(fr.getFacetRequest().getCategoryPath().getComponent(0)); + assertEquals(4, fr.getNumValidDescendants()); + parentRes = fr.getFacetResultNode(); + assertEquals(8.0, parentRes.getValue(), Double.MIN_VALUE); + assertEquals(2.0, parentRes.getResidue(), Double.MIN_VALUE); + assertEquals(2, parentRes.getNumSubResults()); + double [] expectedValues3 = { 3.0, 2.0 }; + i = 0; + for (FacetResultNode node : parentRes.getSubResults()) { + assertEquals(expectedValues3[i++], node.getValue(), Double.MIN_VALUE); + assertEquals(0.0, node.getResidue(), Double.MIN_VALUE); + assertEquals(0, node.getNumSubResults()); + } + + fr = facetResults.get(4); // a/b, depth=2, K=2 + hasDoctor |= "Doctor".equals(fr.getFacetRequest().getCategoryPath().getComponent(0)); + assertEquals(4, fr.getNumValidDescendants()); + 
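+ // a/b at depth=2 is expected to look exactly like the depth=3 request above, since a/b's children have no + // children of their own in the indexed data.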
parentRes = fr.getFacetResultNode(); + assertEquals(8.0, parentRes.getValue(), Double.MIN_VALUE); + assertEquals(2.0, parentRes.getResidue(), Double.MIN_VALUE); + assertEquals(2, parentRes.getNumSubResults()); + i = 0; + for (FacetResultNode node : parentRes.getSubResults()) { + assertEquals(expectedValues3[i++], node.getValue(), Double.MIN_VALUE); + assertEquals(0.0, node.getResidue(), Double.MIN_VALUE); + assertEquals(0, node.getNumSubResults()); + } + + fr = facetResults.get(5); // a/b, depth=1, K=2 + hasDoctor |= "Doctor".equals(fr.getFacetRequest().getCategoryPath().getComponent(0)); + assertEquals(4, fr.getNumValidDescendants()); + parentRes = fr.getFacetResultNode(); + assertEquals(8.0, parentRes.getValue(), Double.MIN_VALUE); + assertEquals(2.0, parentRes.getResidue(), Double.MIN_VALUE); + assertEquals(2, parentRes.getNumSubResults()); + i = 0; + for (FacetResultNode node : parentRes.getSubResults()) { + assertEquals(expectedValues3[i++], node.getValue(), Double.MIN_VALUE); + assertEquals(0.0, node.getResidue(), Double.MIN_VALUE); + assertEquals(0, node.getNumSubResults()); + } + + fr = facetResults.get(6); // a/b, depth=0, K=2 + hasDoctor |= "Doctor".equals(fr.getFacetRequest().getCategoryPath().getComponent(0)); + assertEquals(0, fr.getNumValidDescendants()); // 0 descendants but rootnode + parentRes = fr.getFacetResultNode(); + assertEquals(8.0, parentRes.getValue(), Double.MIN_VALUE); + assertEquals(0.0, parentRes.getResidue(), Double.MIN_VALUE); + assertEquals(0, parentRes.getNumSubResults()); + hasDoctor |= "Doctor".equals(fr.getFacetRequest().getCategoryPath().getComponent(0)); + + // doctor, depth=1, K=2 + assertFalse("Shouldn't have found anything for a FacetRequest " + + "of a facet that doesn't exist in the index.", hasDoctor); + assertEquals("Shouldn't have found more than seven request.", 7, facetResults.size()); + } + + } + + private void prvt_add(DefaultFacetIndexingParams iParams, IndexWriter iw, + TaxonomyWriter tw, String... strings) throws IOException, + CorruptIndexException { + ArrayList cps = new ArrayList(); + CategoryPath cp = new CategoryPath(strings); + cps.add(cp); + Document d = new Document(); + new CategoryDocumentBuilder(tw, iParams).setCategoryPaths(cps).build(d); + d.add(new Field("content", "alpha", Store.YES, Index.ANALYZED, TermVector.NO)); + iw.addDocument(d); + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/TestTopKResultsHandler.java b/modules/facet/src/test/org/apache/lucene/facet/search/TestTopKResultsHandler.java new file mode 100644 index 00000000000..6ecb63a30fc --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/search/TestTopKResultsHandler.java @@ -0,0 +1,239 @@ +package org.apache.lucene.facet.search; + +import java.util.Arrays; +import java.util.List; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.junit.Test; + +import org.apache.lucene.facet.search.params.CountFacetRequest; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class TestTopKResultsHandler extends BaseTestTopK { + + private static final CategoryPath[] CATEGORIES = { + new CategoryPath( "a", "b"), + new CategoryPath( "a", "b", "1"), + new CategoryPath( "a", "b", "1"), + new CategoryPath( "a", "b", "2"), + new CategoryPath( "a", "b", "2"), + new CategoryPath( "a", "b", "3"), + new CategoryPath( "a", "b", "4"), + new CategoryPath( "a", "c"), + new CategoryPath( "a", "c"), + new CategoryPath( "a", "c"), + new CategoryPath( "a", "c"), + new CategoryPath( "a", "c"), + new CategoryPath( "a", "c", "1"), + }; + + @Override + protected String getContent(int doc) { + return ALPHA; + } + + @Override + protected int numDocsToIndex() { + return CATEGORIES.length; + } + + @Override + protected List getCategories(int doc) { + return Arrays.asList(CATEGORIES[doc]); + } + + /** + * Strait forward test: Adding specific documents with specific facets and + * counting them in the most basic form. + */ + @Test + public void testSimple() throws Exception { + for (int partitionSize : partitionSizes) { + initIndex(partitionSize); + + // do different facet counts and compare to control + FacetSearchParams sParams = getFacetedSearchParams(partitionSize); + + sParams.addFacetRequest(new CountFacetRequest(new CategoryPath("a"), 100)); + CountFacetRequest cfra = new CountFacetRequest(new CategoryPath("a"), 100); + cfra.setDepth(3); + sParams.addFacetRequest(cfra); + sParams.addFacetRequest(new CountFacetRequest(new CategoryPath("a", "b"), 100)); + sParams.addFacetRequest(new CountFacetRequest(new CategoryPath("a", "b", "1"), 100)); + sParams.addFacetRequest(new CountFacetRequest(new CategoryPath("a", "c"), 100)); + + FacetsCollector fc = new FacetsCollector(sParams, indexReader, taxoReader) { + @Override + protected FacetsAccumulator initFacetsAccumulator(FacetSearchParams facetSearchParams, IndexReader indexReader, TaxonomyReader taxonomyReader) { + FacetsAccumulator fa = new StandardFacetsAccumulator(facetSearchParams, indexReader, taxonomyReader); + fa.setComplementThreshold(FacetsAccumulator.DISABLE_COMPLEMENT); + return fa; + } + }; + + searcher.search(new MatchAllDocsQuery(), fc); + long start = System.currentTimeMillis(); + List facetResults = fc.getFacetResults(); + long end = System.currentTimeMillis(); + + if (VERBOSE) { + System.out.println("Time: " + (end - start)); + } + + FacetResult fr = facetResults.get(0); + FacetResultNode parentRes = fr.getFacetResultNode(); + assertEquals(13.0, parentRes.getValue(), Double.MIN_VALUE); + FacetResultNode[] frn = resultNodesAsArray(parentRes); + assertEquals(7.0, frn[0].getValue(), Double.MIN_VALUE); + assertEquals(6.0, frn[1].getValue(), Double.MIN_VALUE); + + fr = facetResults.get(1); + parentRes = fr.getFacetResultNode(); + assertEquals(13.0, parentRes.getValue(), Double.MIN_VALUE); + frn = resultNodesAsArray(parentRes); + assertEquals(7.0, frn[0].getValue(), Double.MIN_VALUE); + assertEquals(6.0, frn[1].getValue(), 
Double.MIN_VALUE); + assertEquals(2.0, frn[2].getValue(), Double.MIN_VALUE); + assertEquals(2.0, frn[3].getValue(), Double.MIN_VALUE); + assertEquals(1.0, frn[4].getValue(), Double.MIN_VALUE); + assertEquals(1.0, frn[5].getValue(), Double.MIN_VALUE); + + fr = facetResults.get(2); + parentRes = fr.getFacetResultNode(); + assertEquals(7.0, parentRes.getValue(), Double.MIN_VALUE); + frn = resultNodesAsArray(parentRes); + assertEquals(2.0, frn[0].getValue(), Double.MIN_VALUE); + assertEquals(2.0, frn[1].getValue(), Double.MIN_VALUE); + assertEquals(1.0, frn[2].getValue(), Double.MIN_VALUE); + assertEquals(1.0, frn[3].getValue(), Double.MIN_VALUE); + + fr = facetResults.get(3); + parentRes = fr.getFacetResultNode(); + assertEquals(2.0, parentRes.getValue(), Double.MIN_VALUE); + frn = resultNodesAsArray(parentRes); + assertEquals(0, frn.length); + + fr = facetResults.get(4); + parentRes = fr.getFacetResultNode(); + assertEquals(6.0, parentRes.getValue(), Double.MIN_VALUE); + frn = resultNodesAsArray(parentRes); + assertEquals(1.0, frn[0].getValue(), Double.MIN_VALUE); + } + } + + /** + * Creating an index, and matching the results of a top-K = Integer.MAX_VALUE request against a top-1000 request + */ + @Test + public void testGetMaxIntFacets() throws Exception { + for (int partitionSize : partitionSizes) { + initIndex(partitionSize); + + // do different facet counts and compare to control + CategoryPath path = new CategoryPath("a", "b"); + FacetSearchParams sParams = getFacetedSearchParams(partitionSize); + sParams.addFacetRequest(new CountFacetRequest(path, Integer.MAX_VALUE)); + + FacetsCollector fc = new FacetsCollector(sParams, indexReader, taxoReader) { + @Override + protected FacetsAccumulator initFacetsAccumulator(FacetSearchParams facetSearchParams, IndexReader indexReader, TaxonomyReader taxonomyReader) { + FacetsAccumulator fa = new StandardFacetsAccumulator(facetSearchParams, indexReader, taxonomyReader); + fa.setComplementThreshold(FacetsAccumulator.DISABLE_COMPLEMENT); + return fa; + } + }; + + searcher.search(new MatchAllDocsQuery(), fc); + long start = System.currentTimeMillis(); + List<FacetResult> results = fc.getFacetResults(); + long end = System.currentTimeMillis(); + + if (VERBOSE) { + System.out.println("Time: " + (end - start)); + } + + assertEquals("Should only be one result as there's only one request", 1, results.size()); + FacetResult res = results.get(0); + assertEquals(path + " should only have 4 descendants", 4, res.getNumValidDescendants()); + + // As a control base results, ask for top-1000 results + FacetSearchParams sParams2 = getFacetedSearchParams(partitionSize); + sParams2.addFacetRequest(new CountFacetRequest(path, Integer.MAX_VALUE)); + + FacetsCollector fc2 = new FacetsCollector(sParams2, indexReader, taxoReader) { + @Override + protected FacetsAccumulator initFacetsAccumulator(FacetSearchParams facetSearchParams, IndexReader indexReader, TaxonomyReader taxonomyReader) { + FacetsAccumulator fa = new StandardFacetsAccumulator(facetSearchParams, indexReader, taxonomyReader); + fa.setComplementThreshold(FacetsAccumulator.DISABLE_COMPLEMENT); + return fa; + } + }; + + searcher.search(new MatchAllDocsQuery(), fc2); + List<FacetResult> baseResults = fc2.getFacetResults(); + FacetResult baseRes = baseResults.get(0); + + // Removing the first line, which holds the REQUEST and is surely different between the two + String baseResultString = baseRes.toString(); + baseResultString = baseResultString.substring(baseResultString.indexOf('\n')); + + // Removing the first line + String resultString =
res.toString(); + resultString = resultString.substring(resultString.indexOf('\n')); + + assertTrue("Results for k=MAX_VALUE do not match the regular results for k=1000!!", + baseResultString.equals(resultString)); + + closeAll(); + } + } + + @Test + public void testSimpleSearchForNonexistentFacet() throws Exception { + for (int partitionSize : partitionSizes) { + initIndex(partitionSize); + + CategoryPath path = new CategoryPath("Miau Hattulla"); + FacetSearchParams sParams = getFacetedSearchParams(partitionSize); + sParams.addFacetRequest(new CountFacetRequest(path, 10)); + + FacetsCollector fc = new FacetsCollector(sParams, indexReader, taxoReader); + + searcher.search(new MatchAllDocsQuery(), fc); + + long start = System.currentTimeMillis(); + List facetResults = fc.getFacetResults(); + long end = System.currentTimeMillis(); + + if (VERBOSE) { + System.out.println("Time: " + (end - start)); + } + + assertEquals("Shouldn't have found anything for a FacetRequest " + + "of a facet that doesn't exist in the index.", 0, facetResults.size()); + + } + } +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/TestTopKResultsHandlerRandom.java b/modules/facet/src/test/org/apache/lucene/facet/search/TestTopKResultsHandlerRandom.java new file mode 100644 index 00000000000..e7e0031abc2 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/search/TestTopKResultsHandlerRandom.java @@ -0,0 +1,151 @@ +package org.apache.lucene.facet.search; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Query; +import org.junit.Test; + +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class TestTopKResultsHandlerRandom extends BaseTestTopK { + + /** + * Try out faceted search in it's most basic form (no sampling nor complement + * that is). In this test lots (and lots..) of randomly generated data is + * being indexed, and later on an "over-all" faceted search is performed. 
The + * results are checked against the DF of each facet by itself + */ + @Test + public void testCountsComplementDisabled() throws Exception { + doTestCounts(false); + } + + private void doTestCounts(boolean doComplement) throws Exception, + IOException, IllegalAccessException, InstantiationException { + for (int partitionSize : partitionSizes) { + initIndex(partitionSize); + + List<FacetResult> facetResults = countFacets(partitionSize, 100000, doComplement); + assertCountsAndCardinality(facetCountsTruth(), facetResults); + + closeAll(); + } + } + + /** + * Try out faceted search with complements. In this test lots (and lots) of + * randomly generated data is indexed, and later on a "beta" faceted + * search is performed - retrieving ~90% of the documents, so complement counting takes + * place here. The results are checked against a regular (i.e. + * no-complement, no-sampling) faceted search with the same parameters. + */ + @Test + public void testCountsComplementEnforced() throws Exception { + doTestCounts(true); + } + + private List<FacetResult> countFacets(int partitionSize, int numResults, final boolean doComplement) + throws IOException, IllegalAccessException, InstantiationException { + Query q = new MatchAllDocsQuery(); + FacetSearchParams facetSearchParams = searchParamsWithRequests(numResults, partitionSize); + FacetsCollector fc = new FacetsCollector(facetSearchParams, indexReader, taxoReader) { + @Override + protected FacetsAccumulator initFacetsAccumulator( + FacetSearchParams facetSearchParams, IndexReader indexReader, + TaxonomyReader taxonomyReader) { + FacetsAccumulator accumulator = new StandardFacetsAccumulator(facetSearchParams, indexReader, taxonomyReader); + double complement = doComplement ? FacetsAccumulator.FORCE_COMPLEMENT : FacetsAccumulator.DISABLE_COMPLEMENT; + accumulator.setComplementThreshold(complement); + return accumulator; + } + }; + searcher.search(q, fc); + List<FacetResult> facetResults = fc.getFacetResults(); + return facetResults; + } + + /** + * Test that the top results are indeed returned, ordered the same as in the full + * results, even when some facets have equal counts.
+ */ + @Test + public void testTopCountsOrder() throws Exception { + for (int partitionSize : partitionSizes) { + initIndex(partitionSize); + + List<FacetResult> allFacetResults = countFacets(partitionSize, 100000, false); + + HashMap<String, Integer> all = new HashMap<String, Integer>(); + int maxNumNodes = 0; + int k = 0; + for (FacetResult fr : allFacetResults) { + FacetResultNode topResNode = fr.getFacetResultNode(); + maxNumNodes = Math.max(maxNumNodes, topResNode.getNumSubResults()); + int prevCount = Integer.MAX_VALUE; + int pos = 0; + for (FacetResultNode frn: topResNode.getSubResults()) { + assertTrue("wrong counts order: prev="+prevCount+" curr="+frn.getValue(), prevCount>=frn.getValue()); + prevCount = (int) frn.getValue(); + String key = k+"--"+frn.getLabel()+"=="+frn.getValue(); + if (VERBOSE) { + System.out.println(frn.getLabel() + " - " + frn.getValue() + " "+key+" "+pos); + } + all.put(key, pos++); // will use this later to verify order of sub-results + } + k++; + } + + // verify that when asking for fewer results, they are always the ones with highest counts + // also verify that the order is stable + for (int n=1; n<maxNumNodes; n++) { + List<FacetResult> someResults = countFacets(partitionSize, n, false); + k = 0; + for (FacetResult fr : someResults) { + FacetResultNode topResNode = fr.getFacetResultNode(); + assertTrue("too many results: n="+n+" but got "+topResNode.getNumSubResults(), n>=topResNode.getNumSubResults()); + int pos = 0; + for (FacetResultNode frn: topResNode.getSubResults()) { + String key = k+"--"+frn.getLabel()+"=="+frn.getValue(); + if (VERBOSE) { + System.out.println(frn.getLabel() + " - " + frn.getValue() + " "+key+" "+pos); + } + Integer origPos = all.get(key); + assertNotNull("missing in all results: "+frn,origPos); + assertEquals("wrong order of sub-results!",pos++, origPos.intValue()); // verify order of sub-results + } + k++; + } + } + + closeAll(); // done with this partition + } + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/TestTotalFacetCounts.java b/modules/facet/src/test/org/apache/lucene/facet/search/TestTotalFacetCounts.java new file mode 100644 index 00000000000..4b73f56d85f --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/search/TestTotalFacetCounts.java @@ -0,0 +1,114 @@ +package org.apache.lucene.facet.search; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.util._TestUtil; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.FacetTestUtils; +import org.apache.lucene.facet.FacetTestUtils.IndexTaxonomyReaderPair; +import org.apache.lucene.facet.FacetTestUtils.IndexTaxonomyWriterPair; +import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class TestTotalFacetCounts extends LuceneTestCase { + + private static void initCache(int numEntries) { + TotalFacetCountsCache.getSingleton().clear(); + TotalFacetCountsCache.getSingleton().setCacheSize(numEntries); // Set to keep one in mem + } + + @Test + public void testWriteRead() throws IOException { + doTestWriteRead(14); + doTestWriteRead(100); + doTestWriteRead(7); + doTestWriteRead(3); + doTestWriteRead(1); + } + + private void doTestWriteRead(final int partitionSize) throws IOException { + initCache(1); + + // Create temporary RAMDirectories + Directory[][] dirs = FacetTestUtils.createIndexTaxonomyDirs(1); + // Create our index/taxonomy writers + IndexTaxonomyWriterPair[] writers = FacetTestUtils + .createIndexTaxonomyWriterPair(dirs); + DefaultFacetIndexingParams iParams = new DefaultFacetIndexingParams() { + @Override + protected int fixedPartitionSize() { + return partitionSize; + } + }; + // The counts that the TotalFacetCountsArray should have after adding + // the below facets to the index. + int[] expectedCounts = new int[] { 0, 3, 1, 3, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1 }; + + // Add a facet to the index + TestTotalFacetCountsCache.addFacets(iParams, writers[0].indexWriter, writers[0].taxWriter, "a", "b"); + TestTotalFacetCountsCache.addFacets(iParams, writers[0].indexWriter, writers[0].taxWriter, "c", "d"); + TestTotalFacetCountsCache.addFacets(iParams, writers[0].indexWriter, writers[0].taxWriter, "a", "e"); + TestTotalFacetCountsCache.addFacets(iParams, writers[0].indexWriter, writers[0].taxWriter, "a", "d"); + TestTotalFacetCountsCache.addFacets(iParams, writers[0].indexWriter, writers[0].taxWriter, "c", "g"); + TestTotalFacetCountsCache.addFacets(iParams, writers[0].indexWriter, writers[0].taxWriter, "c", "z"); + TestTotalFacetCountsCache.addFacets(iParams, writers[0].indexWriter, writers[0].taxWriter, "b", "a"); + TestTotalFacetCountsCache.addFacets(iParams, writers[0].indexWriter, writers[0].taxWriter, "1", "2"); + TestTotalFacetCountsCache.addFacets(iParams, writers[0].indexWriter, writers[0].taxWriter, "b", "c"); + + // Commit Changes + writers[0].commit(); + writers[0].close(); + + IndexTaxonomyReaderPair[] readers = + FacetTestUtils.createIndexTaxonomyReaderPair(dirs); + + int[] intArray = new int[iParams.getPartitionSize()]; + + TotalFacetCountsCache tfcc = TotalFacetCountsCache.getSingleton(); + File tmpFile = _TestUtil.createTempFile("test", "tmp", TEMP_DIR); + tfcc.store(tmpFile, readers[0].indexReader, readers[0].taxReader, iParams, null); + tfcc.clear(); // not really required because TFCC overrides on load(), but in the test we need not rely on this. + tfcc.load(tmpFile, readers[0].indexReader, readers[0].taxReader, iParams); + + // now retrieve the one just loaded + TotalFacetCounts totalCounts = + tfcc.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null); + + int partition = 0; + for (int i=0; i cps = new ArrayList(); + cps.add(new CategoryPath(strings)); + CategoryDocumentBuilder builder = new CategoryDocumentBuilder(tw, iParams); + iw.addDocument(builder.setCategoryPaths(cps).build(new Document())); + } + + /** Clears the cache and sets its size to one. 
*/ + static void initCache() { + TFC.clear(); + TFC.setCacheSize(1); // Set to keep one in memory + } + + @Override + @Before + public void setUp() throws Exception { + super.setUp(); + initCache(); + } + + /** + * Run many instances of {@link MultiCLSearcher} in parallel, results should + * be sane. Each instance has a random delay for reading bytes, to ensure + * that threads finish in different order than started. + */ + @Test + public void testGeneralSynchronization() throws Exception { + int[] numThreads = new int[] { 2, 3, 5, 8 }; + int[] sleepMillis = new int[] { -1, 1, 20, 33 }; + int[] cacheSize = new int[] { 0,1,2,3,5 }; + for (int size : cacheSize) { + for (int sleep : sleepMillis) { + for (int nThreads : numThreads) { + doTestGeneralSynchronization(nThreads, sleep, size); + } + } + } + } + + private void doTestGeneralSynchronization(int numThreads, int sleepMillis, + int cacheSize) throws Exception, CorruptIndexException, IOException, + InterruptedException { + TFC.setCacheSize(cacheSize); + SlowRAMDirectory slowIndexDir = new SlowRAMDirectory(-1, random); + SlowRAMDirectory slowTaxoDir = new SlowRAMDirectory(-1, random); + + // Index documents without the "slowness" + MultiCLIndexer.index(slowIndexDir, slowTaxoDir); + + slowIndexDir.setSleepMillis(sleepMillis); + slowTaxoDir.setSleepMillis(sleepMillis); + + // Open the slow readers + IndexReader slowIndexReader = IndexReader.open(slowIndexDir); + TaxonomyReader slowTaxoReader = new LuceneTaxonomyReader(slowTaxoDir); + + // Class to perform search and return results as threads + class Multi extends Thread { + private List results; + private FacetIndexingParams iParams; + private IndexReader indexReader; + private TaxonomyReader taxoReader; + + public Multi(IndexReader indexReader, TaxonomyReader taxoReader, + FacetIndexingParams iParams) { + this.indexReader = indexReader; + this.taxoReader = taxoReader; + this.iParams = iParams; + } + + public ExampleResult getResults() { + ExampleResult exampleRes = new ExampleResult(); + exampleRes.setFacetResults(results); + return exampleRes; + } + + @Override + public void run() { + try { + results = MultiCLSearcher.searchWithFacets(indexReader, taxoReader, iParams); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + } + + // Instantiate threads, but do not start them + Multi[] multis = new Multi[numThreads]; + for (int i = 0; i < numThreads - 1; i++) { + multis[i] = new Multi(slowIndexReader, slowTaxoReader, MultiCLIndexer.MULTI_IPARAMS); + } + // The last thread uses ONLY the DefaultFacetIndexingParams so that + // it references a different TFC cache. This will still result + // in valid results, but will only search one of the category lists + // instead of all of them. + multis[numThreads - 1] = new Multi(slowIndexReader, slowTaxoReader, new DefaultFacetIndexingParams()); + + // Gentleman, start your engines + for (Multi m : multis) { + m.start(); + } + + // Wait for threads and get results + ExampleResult[] multiResults = new ExampleResult[numThreads]; + for (int i = 0; i < numThreads; i++) { + multis[i].join(); + multiResults[i] = multis[i].getResults(); + } + + // Each of the (numThreads-1) should have the same predictable + // results, which we test for here. 
+ for (int i = 0; i < numThreads - 1; i++) { + ExampleResult eResults = multiResults[i]; + TestMultiCLExample.assertCorrectMultiResults(eResults); + } + + // The last thread, which only searched over the + // DefaultFacetIndexingParams, + // has its own results + ExampleResult eResults = multiResults[numThreads - 1]; + List results = eResults.getFacetResults(); + assertEquals(3, results.size()); + String[] expLabels = new String[] { "5", "5/5", "6/2" }; + double[] expValues = new double[] { 0.0, 0.0, 1.0 }; + for (int i = 0; i < 3; i++) { + FacetResult result = results.get(i); + assertNotNull("Result should not be null", result); + FacetResultNode resNode = result.getFacetResultNode(); + assertEquals("Invalid label", expLabels[i], resNode.getLabel().toString()); + assertEquals("Invalid value", expValues[i], resNode.getValue(), 0.0); + assertEquals("Invalid number of subresults", 0, resNode.getNumSubResults()); + } + // we're done, close the index reader and the taxonomy. + slowIndexReader.close(); + slowTaxoReader.close(); + } + + /** + * Simple test to make sure the TotalFacetCountsManager updates the + * TotalFacetCounts array only when it is supposed to, and whether it + * is recomputed or read from disk. + */ + @Test + public void testGenerationalConsistency() throws Exception { + // Create temporary RAMDirectories + Directory[][] dirs = FacetTestUtils.createIndexTaxonomyDirs(1); + + // Create our index/taxonomy writers + IndexTaxonomyWriterPair[] writers = FacetTestUtils.createIndexTaxonomyWriterPair(dirs); + DefaultFacetIndexingParams iParams = new DefaultFacetIndexingParams(); + + // Add a facet to the index + addFacets(iParams, writers[0].indexWriter, writers[0].taxWriter, "a", "b"); + + // Commit Changes + writers[0].indexWriter.commit(); + writers[0].taxWriter.commit(); + + // Open readers + IndexTaxonomyReaderPair[] readers = FacetTestUtils.createIndexTaxonomyReaderPair(dirs); + + // As this is the first time we have invoked the TotalFacetCountsManager, + // we should expect to compute and not read from disk. + TotalFacetCounts totalCounts = + TFC.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null); + int prevGen = assertRecomputed(totalCounts, 0, "after first attempt to get it!"); + + // Repeating same operation should pull from the cache - not recomputed. + assertTrue("Should be obtained from cache at 2nd attempt",totalCounts == + TFC.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null)); + + // Repeat the same operation as above. 
but clear first - now should recompute again + initCache(); + totalCounts = TFC.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null); + prevGen = assertRecomputed(totalCounts, prevGen, "after cache clear, 3rd attempt to get it!"); + + //store to file + File outputFile = _TestUtil.createTempFile("test", "tmp", TEMP_DIR); + initCache(); + TFC.store(outputFile, readers[0].indexReader, readers[0].taxReader, iParams, null); + totalCounts = TFC.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null); + prevGen = assertRecomputed(totalCounts, prevGen, "after cache clear, 4th attempt to get it!"); + + //clear and load + initCache(); + TFC.load(outputFile, readers[0].indexReader, readers[0].taxReader, iParams); + totalCounts = TFC.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null); + prevGen = assertReadFromDisc(totalCounts, prevGen, "after 5th attempt to get it!"); + + // Add a new facet to the index, commit and refresh readers + addFacets(iParams, writers[0].indexWriter, writers[0].taxWriter, "c", "d"); + writers[0].indexWriter.close(); + writers[0].taxWriter.close(); + + readers[0].taxReader.refresh(); + IndexReader r2 = readers[0].indexReader.reopen(); + // Hold on to the 'original' reader so we can do some checks with it + IndexReader origReader = null; + + assertTrue("Reader must be updated!", readers[0].indexReader != r2); + + // Set the 'original' reader + origReader = readers[0].indexReader; + // Set the new master index Reader + readers[0].indexReader = r2; + + // Try to get total-counts the originalReader AGAIN, just for sanity. Should pull from the cache - not recomputed. + assertTrue("Should be obtained from cache at 6th attempt",totalCounts == + TFC.getTotalCounts(origReader, readers[0].taxReader, iParams, null)); + + // now use the new reader - should recompute + totalCounts = TFC.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null); + prevGen = assertRecomputed(totalCounts, prevGen, "after updating the index - 7th attempt!"); + + // try again - should not recompute + assertTrue("Should be obtained from cache at 8th attempt",totalCounts == + TFC.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null)); + + // delete a doc from the reader and commit - should recompute + origReader.close(); + origReader = readers[0].indexReader; + readers[0].indexReader = IndexReader.open(origReader.directory(),false); + initCache(); + totalCounts = TFC.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null); + prevGen = assertRecomputed(totalCounts, prevGen, "after opening a writable reader - 9th attempt!"); + // now do the delete + readers[0].indexReader.deleteDocument(1); + readers[0].indexReader.commit(null); + totalCounts = TFC.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null); + prevGen = assertRecomputed(totalCounts, prevGen, "after deleting docs the index - 10th attempt!"); + + origReader.close(); + readers[0].close(); + r2.close(); + outputFile.delete(); + } + + private int assertReadFromDisc(TotalFacetCounts totalCounts, int prevGen, String errMsg) { + assertEquals("should read from disk "+errMsg, CreationType.Loaded, totalCounts.createType4test); + int gen4test = totalCounts.gen4test; + assertTrue("should read from disk "+errMsg, gen4test > prevGen); + return gen4test; + } + + private int assertRecomputed(TotalFacetCounts totalCounts, int prevGen, String errMsg) { + assertEquals("should recompute "+errMsg, CreationType.Computed, 
totalCounts.createType4test); + int gen4test = totalCounts.gen4test; + assertTrue("should recompute "+errMsg, gen4test > prevGen); + return gen4test; + } + + /** + * This test is to address a bug (Tracker #146354) in a previous version. If a TFC cache is + * written to disk, and then the taxonomy grows (but the index does not change), + * and then the TFC cache is re-read from disk, there will be an exception + * thrown, as the integers are read off of the disk according to taxonomy + * size, which has changed. + */ + @Test + public void testGrowingTaxonomy() throws Exception { + // Create temporary RAMDirectories + Directory[][] dirs = FacetTestUtils.createIndexTaxonomyDirs(1); + // Create our index/taxonomy writers + IndexTaxonomyWriterPair[] writers = FacetTestUtils + .createIndexTaxonomyWriterPair(dirs); + DefaultFacetIndexingParams iParams = new DefaultFacetIndexingParams() { + @Override + protected int fixedPartitionSize() { + return 2; + } + }; + // Add a facet to the index + addFacets(iParams, writers[0].indexWriter, writers[0].taxWriter, "a", "b"); + // Commit Changes + writers[0].indexWriter.commit(); + writers[0].taxWriter.commit(); + + IndexTaxonomyReaderPair[] readers = FacetTestUtils.createIndexTaxonomyReaderPair(dirs); + + // Create TFC and write cache to disk + File outputFile = _TestUtil.createTempFile("test", "tmp", TEMP_DIR); + TFC.store(outputFile, readers[0].indexReader, readers[0].taxReader, iParams, null); + + // Make the taxonomy grow without touching the index + for (int i = 0; i < 10; i++) { + writers[0].taxWriter.addCategory(new CategoryPath("foo", Integer.toString(i))); + } + writers[0].taxWriter.commit(); + readers[0].taxReader.refresh(); + + initCache(); + + // With the bug, this next call should result in an exception + TFC.load(outputFile, readers[0].indexReader, readers[0].taxReader, iParams); + TotalFacetCounts totalCounts = TFC.getTotalCounts( + readers[0].indexReader, readers[0].taxReader, iParams, null); + assertReadFromDisc(totalCounts, 0, "after reading from disk."); + outputFile.delete(); + } + + /** + * Test that a new TFC is only calculated and placed in memory (by two + * threads who want it at the same time) only once. + */ + @Test + public void testMemoryCacheSynchronization() throws Exception { + SlowRAMDirectory indexDir = new SlowRAMDirectory(-1, null); + SlowRAMDirectory taxoDir = new SlowRAMDirectory(-1, null); + + // Write index using 'normal' directories + IndexWriter w = new IndexWriter(indexDir, new IndexWriterConfig( + TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT))); + LuceneTaxonomyWriter tw = new LuceneTaxonomyWriter(taxoDir); + DefaultFacetIndexingParams iParams = new DefaultFacetIndexingParams(); + // Add documents and facets + for (int i = 0; i < 1000; i++) { + addFacets(iParams, w, tw, "facet", Integer.toString(i)); + } + w.close(); + tw.close(); + + indexDir.setSleepMillis(1); + taxoDir.setSleepMillis(1); + + IndexReader r = IndexReader.open(indexDir); + LuceneTaxonomyReader tr = new LuceneTaxonomyReader(taxoDir); + + // Create and start threads. Thread1 should lock the cache and calculate + // the TFC array. The second thread should block until the first is + // done, then successfully retrieve from the cache without recalculating + // or reading from disk. 
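+ // (TFCThread is presumably a small helper thread defined elsewhere in this test; judging by its use below, it + // just calls TFC.getTotalCounts(...) for the given readers and keeps the result in its 'tfc' field.)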
+ TFCThread tfcCalc1 = new TFCThread(r, tr, iParams); + TFCThread tfcCalc2 = new TFCThread(r, tr, iParams); + tfcCalc1.start(); + // Give thread 1 a head start to ensure correct sequencing for testing + Thread.sleep(5); + tfcCalc2.start(); + + tfcCalc1.join(); + tfcCalc2.join(); + + // Since this test ends up with references to the same TFC object, we + // can only test the times to make sure that they are the same. + assertRecomputed(tfcCalc1.tfc, 0, "thread 1 should recompute"); + assertRecomputed(tfcCalc2.tfc, 0, "thread 2 should recompute"); + assertTrue("Both results should be the same (as their inputs are the same objects)", + tfcCalc1.tfc == tfcCalc2.tfc); + + r.close(); + tr.close(); + } + + /** + * Simple test to make sure the TotalFacetCountsManager updates the + * TotalFacetCounts array only when it is supposed to, and whether it + * is recomputed or read from disk, but this time with TWO different + * TotalFacetCounts + */ + @Test + public void testMultipleIndices() throws IOException { + // Create temporary RAMDirectories + Directory[][] dirs = FacetTestUtils.createIndexTaxonomyDirs(2); + // Create our index/taxonomy writers + IndexTaxonomyWriterPair[] writers = FacetTestUtils.createIndexTaxonomyWriterPair(dirs); + DefaultFacetIndexingParams iParams = new DefaultFacetIndexingParams(); + + // Add a facet to the index + addFacets(iParams, writers[0].indexWriter, writers[0].taxWriter, "a", "b"); + addFacets(iParams, writers[1].indexWriter, writers[1].taxWriter, "d", "e"); + // Commit Changes + writers[0].indexWriter.commit(); + writers[0].taxWriter.commit(); + writers[1].indexWriter.commit(); + writers[1].taxWriter.commit(); + + // Open two readers + IndexTaxonomyReaderPair[] readers = FacetTestUtils.createIndexTaxonomyReaderPair(dirs); + + // As this is the first time we have invoked the TotalFacetCountsManager, we + // should expect to compute. + TotalFacetCounts totalCounts0 = + TFC.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null); + int prevGen = -1; + prevGen = assertRecomputed(totalCounts0, prevGen, "after attempt 1"); + assertTrue("attempt 1b for same input [0] shout find it in cache", + totalCounts0 == TFC.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null)); + + // 2nd Reader - As this is the first time we have invoked the + // TotalFacetCountsManager, we should expect a state of NEW to be returned. 
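+ // (that is, the counts for this second reader/taxonomy pair should also be freshly computed rather than + // served from the cache, as the assertRecomputed() call below verifies)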
+ TotalFacetCounts totalCounts1 = + TFC.getTotalCounts(readers[1].indexReader, readers[1].taxReader, iParams, null); + prevGen = assertRecomputed(totalCounts1, prevGen, "after attempt 2"); + assertTrue("attempt 2b for same input [1] shout find it in cache", + totalCounts1 == TFC.getTotalCounts(readers[1].indexReader, readers[1].taxReader, iParams, null)); + + // Right now cache size is one, so first TFC is gone and should be recomputed + totalCounts0 = + TFC.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null); + prevGen = assertRecomputed(totalCounts0, prevGen, "after attempt 3"); + + // Similarly will recompute the second result + totalCounts1 = + TFC.getTotalCounts(readers[1].indexReader, readers[1].taxReader, iParams, null); + prevGen = assertRecomputed(totalCounts1, prevGen, "after attempt 4"); + + // Now we set the cache size to two, meaning both should exist in the + // cache simultaneously + TFC.setCacheSize(2); + + // Re-compute totalCounts0 (was evicted from the cache when the cache was smaller) + totalCounts0 = + TFC.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null); + prevGen = assertRecomputed(totalCounts0, prevGen, "after attempt 5"); + + // now both are in the larger cache and should not be recomputed + totalCounts1 = TFC.getTotalCounts(readers[1].indexReader, + readers[1].taxReader, iParams, null); + assertTrue("with cache of size 2 res no. 0 should come from cache", + totalCounts0 == TFC.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null)); + assertTrue("with cache of size 2 res no. 1 should come from cache", + totalCounts1 == TFC.getTotalCounts(readers[1].indexReader, readers[1].taxReader, iParams, null)); + + readers[0].close(); + readers[1].close(); + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/association/AssociationsFacetRequestTest.java b/modules/facet/src/test/org/apache/lucene/facet/search/association/AssociationsFacetRequestTest.java new file mode 100644 index 00000000000..4f76af6c907 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/search/association/AssociationsFacetRequestTest.java @@ -0,0 +1,180 @@ +package org.apache.lucene.facet.search.association; + +import java.util.List; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.analysis.core.KeywordAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.facet.enhancements.EnhancementsDocumentBuilder; +import org.apache.lucene.facet.enhancements.association.AssociationEnhancement; +import org.apache.lucene.facet.enhancements.association.AssociationFloatProperty; +import org.apache.lucene.facet.enhancements.association.AssociationIntProperty; +import org.apache.lucene.facet.enhancements.params.DefaultEnhancementsIndexingParams; +import org.apache.lucene.facet.index.CategoryContainer; +import org.apache.lucene.facet.search.FacetsCollector; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.params.association.AssociationFloatSumFacetRequest; +import 
org.apache.lucene.facet.search.params.association.AssociationIntSumFacetRequest; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** Test for associations */ +public class AssociationsFacetRequestTest extends LuceneTestCase { + + private static Directory dir = new RAMDirectory(); + private static Directory taxoDir = new RAMDirectory(); + + private static final CategoryPath aint = new CategoryPath("int", "a"); + private static final CategoryPath bint = new CategoryPath("int", "b"); + private static final CategoryPath afloat = new CategoryPath("float", "a"); + private static final CategoryPath bfloat = new CategoryPath("float", "b"); + + @BeforeClass + public static void beforeClassAssociationsFacetRequestTest() throws Exception { + // preparations - index, taxonomy, content + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new KeywordAnalyzer())); + + TaxonomyWriter taxoWriter = new LuceneTaxonomyWriter(taxoDir); + + EnhancementsDocumentBuilder builder = new EnhancementsDocumentBuilder( + taxoWriter, new DefaultEnhancementsIndexingParams( + new AssociationEnhancement())); + + // index documents, 50% have only 'b' and all have 'a' + for (int i = 0; i < 100; i++) { + Document doc = new Document(); + CategoryContainer container = new CategoryContainer(); + container.addCategory(aint, new AssociationIntProperty(2)); + container.addCategory(afloat, new AssociationFloatProperty(0.5f)); + if (i % 2 == 0) { // 50 + container.addCategory(bint, new AssociationIntProperty(3)); + container.addCategory(bfloat, new AssociationFloatProperty(0.2f)); + } + builder.setCategories(container).build(doc); + writer.addDocument(doc); + } + + taxoWriter.close(); + writer.close(); + } + + @AfterClass + public static void afterClassAssociationsFacetRequestTest() throws Exception { + dir.close(); + taxoDir.close(); + } + + @Test + public void testIntSumAssociation() throws Exception { + IndexReader reader = IndexReader.open(dir, true); + LuceneTaxonomyReader taxo = new LuceneTaxonomyReader(taxoDir); + + // facet requests for two facets + FacetSearchParams fsp = new FacetSearchParams(); + fsp.addFacetRequest(new AssociationIntSumFacetRequest(aint, 10)); + fsp.addFacetRequest(new AssociationIntSumFacetRequest(bint, 10)); + + Query q = new MatchAllDocsQuery(); + + FacetsCollector fc = new FacetsCollector(fsp, reader, taxo); + + new IndexSearcher(reader).search(q, fc); + List res = fc.getFacetResults(); + + 
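+ // Expected sums, given the indexing loop above: all 100 docs carry int/a with weight 2 (sum 200), and only + // the 50 even-numbered docs carry int/b with weight 3 (sum 150).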
assertNotNull("No results!",res); + assertEquals("Wrong number of results!",2, res.size()); + assertEquals("Wrong count for category 'a'!",200, (int) res.get(0).getFacetResultNode().getValue()); + assertEquals("Wrong count for category 'b'!",150, (int) res.get(1).getFacetResultNode().getValue()); + + taxo.close(); + reader.close(); + } + + @Test + public void testFloatSumAssociation() throws Exception { + + IndexReader reader = IndexReader.open(dir, true); + LuceneTaxonomyReader taxo = new LuceneTaxonomyReader(taxoDir); + + // facet requests for two facets + FacetSearchParams fsp = new FacetSearchParams(); + fsp.addFacetRequest(new AssociationFloatSumFacetRequest(afloat, 10)); + fsp.addFacetRequest(new AssociationFloatSumFacetRequest(bfloat, 10)); + + Query q = new MatchAllDocsQuery(); + + FacetsCollector fc = new FacetsCollector(fsp, reader, taxo); + + new IndexSearcher(reader).search(q, fc); + List res = fc.getFacetResults(); + + assertNotNull("No results!",res); + assertEquals("Wrong number of results!",2, res.size()); + assertEquals("Wrong count for category 'a'!",50f, (float) res.get(0).getFacetResultNode().getValue(), 0.00001); + assertEquals("Wrong count for category 'b'!",10f, (float) res.get(1).getFacetResultNode().getValue(), 0.00001); + + taxo.close(); + reader.close(); + } + + @Test + public void testDifferentAggregatorsSameCategoryList() throws Exception { + // Same category list cannot be aggregated by two different aggregators. If + // you want to do that, you need to separate the categories into two + // category list (you'll still have one association list). + IndexReader reader = IndexReader.open(dir, true); + LuceneTaxonomyReader taxo = new LuceneTaxonomyReader(taxoDir); + + // facet requests for two facets + FacetSearchParams fsp = new FacetSearchParams(); + fsp.addFacetRequest(new AssociationIntSumFacetRequest(aint, 10)); + fsp.addFacetRequest(new AssociationIntSumFacetRequest(bint, 10)); + fsp.addFacetRequest(new AssociationFloatSumFacetRequest(afloat, 10)); + fsp.addFacetRequest(new AssociationFloatSumFacetRequest(bfloat, 10)); + + Query q = new MatchAllDocsQuery(); + + FacetsCollector fc = new FacetsCollector(fsp, reader, taxo); + + new IndexSearcher(reader).search(q, fc); + try { + fc.getFacetResults(); + fail("different aggregators for same category list should not be supported"); + } catch (RuntimeException e) { + // ok - expected + } + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/params/FacetRequestTest.java b/modules/facet/src/test/org/apache/lucene/facet/search/params/FacetRequestTest.java new file mode 100644 index 00000000000..8a5e2bcdfd9 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/search/params/FacetRequestTest.java @@ -0,0 +1,90 @@ +package org.apache.lucene.facet.search.params; + +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.RAMDirectory; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.search.FacetResultsHandler; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class FacetRequestTest extends LuceneTestCase { + + @Test(expected=IllegalArgumentException.class) + public void testIllegalNumResults() throws Exception { + new CountFacetRequest(new CategoryPath("a", "b"), 0); + } + + @Test(expected=IllegalArgumentException.class) + public void testIllegalCategoryPath() throws Exception { + new CountFacetRequest(null, 1); + } + + @Test + public void testHashAndEquals() { + CountFacetRequest fr1 = new CountFacetRequest(new CategoryPath("a"), 8); + CountFacetRequest fr2 = new CountFacetRequest(new CategoryPath("a"), 8); + assertEquals("hashCode() should agree on both objects", fr1.hashCode(), fr2.hashCode()); + assertTrue("equals() should return true", fr1.equals(fr2)); + fr1.setDepth(10); + assertFalse("equals() should return false as fr1.depth != fr2.depth", fr1.equals(fr2)); + } + + @Test + public void testGetFacetResultHandlerDifferentTaxonomy() throws Exception { + FacetRequest fr = new CountFacetRequest(new CategoryPath("a"), 10); + RAMDirectory dir1 = new RAMDirectory(); + RAMDirectory dir2 = new RAMDirectory(); + // create empty indexes, so that LTR ctor won't complain about a missing index. + new IndexWriter(dir1, new IndexWriterConfig(TEST_VERSION_CURRENT, null)).close(); + new IndexWriter(dir2, new IndexWriterConfig(TEST_VERSION_CURRENT, null)).close(); + TaxonomyReader tr1 = new LuceneTaxonomyReader(dir1); + TaxonomyReader tr2 = new LuceneTaxonomyReader(dir2); + FacetResultsHandler frh1 = fr.createFacetResultsHandler(tr1); + FacetResultsHandler frh2 = fr.createFacetResultsHandler(tr2); + assertTrue("should not return the same FacetResultHandler instance for different TaxonomyReader instances", frh1 != frh2); + } + + @Test + public void testImmutability() throws Exception { + // Tests that after a FRH is created by FR, changes to FR are not reflected + // in the FRH. + FacetRequest fr = new CountFacetRequest(new CategoryPath("a"), 10); + RAMDirectory dir = new RAMDirectory(); + // create empty indexes, so that LTR ctor won't complain about a missing index. 
+ new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, null)).close(); + TaxonomyReader tr = new LuceneTaxonomyReader(dir); + FacetResultsHandler frh = fr.createFacetResultsHandler(tr); + fr.setDepth(10); + assertEquals(FacetRequest.DEFAULT_DEPTH, frh.getFacetRequest().getDepth()); + } + + @Test + public void testClone() throws Exception { + FacetRequest fr = new CountFacetRequest(new CategoryPath("a"), 10); + FacetRequest clone = (FacetRequest) fr.clone(); + fr.setDepth(10); + assertEquals("depth should not have been affected in the clone", FacetRequest.DEFAULT_DEPTH, clone.getDepth()); + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/params/FacetSearchParamsTest.java b/modules/facet/src/test/org/apache/lucene/facet/search/params/FacetSearchParamsTest.java new file mode 100644 index 00000000000..fc0778d3ba4 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/search/params/FacetSearchParamsTest.java @@ -0,0 +1,75 @@ +package org.apache.lucene.facet.search.params; + +import org.apache.lucene.store.RAMDirectory; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; +import org.apache.lucene.facet.util.PartitionsUtils; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class FacetSearchParamsTest extends LuceneTestCase { + + @Test + public void testDefaultSettings() throws Exception { + FacetSearchParams fsp = new FacetSearchParams(); + assertEquals("unexpected default facet indexing params class", DefaultFacetIndexingParams.class.getName(), fsp.getFacetIndexingParams().getClass().getName()); + assertEquals("no facet requests should be added by default", 0, fsp.getFacetRequests().size()); + RAMDirectory dir = new RAMDirectory(); + new LuceneTaxonomyWriter(dir).close(); + TaxonomyReader tr = new LuceneTaxonomyReader(dir); + assertEquals("unexpected partition offset for 0 categories", 1, PartitionsUtils.partitionOffset(fsp, 1, tr)); + assertEquals("unexpected partition size for 0 categories", 1, PartitionsUtils.partitionSize(fsp,tr)); + } + + @Test + public void testAddFacetRequest() throws Exception { + FacetSearchParams fsp = new FacetSearchParams(); + fsp.addFacetRequest(new CountFacetRequest(new CategoryPath("a", "b"), 1)); + assertEquals("expected 1 facet request", 1, fsp.getFacetRequests().size()); + } + + @Test + public void testPartitionSizeWithCategories() throws Exception { + FacetSearchParams fsp = new FacetSearchParams(); + RAMDirectory dir = new RAMDirectory(); + TaxonomyWriter tw = new LuceneTaxonomyWriter(dir); + tw.addCategory(new CategoryPath("a")); + tw.commit(); + tw.close(); + TaxonomyReader tr = new LuceneTaxonomyReader(dir); + assertEquals("unexpected partition offset for 1 categories", 2, PartitionsUtils.partitionOffset(fsp, 1, tr)); + assertEquals("unexpected partition size for 1 categories", 2, PartitionsUtils.partitionSize(fsp,tr)); + } + + @Test + public void testSearchParamsWithNullRequest() throws Exception { + FacetSearchParams fsp = new FacetSearchParams(); + try { + fsp.addFacetRequest(null); + fail("FacetSearchParams should throw IllegalArgumentException when trying to add a null FacetRequest"); + } catch (IllegalArgumentException e) { + } + } +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/params/MultiIteratorsPerCLParamsTest.java b/modules/facet/src/test/org/apache/lucene/facet/search/params/MultiIteratorsPerCLParamsTest.java new file mode 100644 index 00000000000..850218ec4a7 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/search/params/MultiIteratorsPerCLParamsTest.java @@ -0,0 +1,263 @@ +package org.apache.lucene.facet.search.params; + +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import org.apache.lucene.analysis.core.KeywordAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.index.CategoryDocumentBuilder; +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.DefaultFacetIndexingParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.search.CategoryListIterator; +import org.apache.lucene.facet.search.FacetArrays; +import org.apache.lucene.facet.search.FacetResultsHandler; +import org.apache.lucene.facet.search.FacetsAccumulator; +import org.apache.lucene.facet.search.ScoredDocIDs; +import 
org.apache.lucene.facet.search.StandardFacetsAccumulator; +import org.apache.lucene.facet.search.TopKFacetResultsHandler; +import org.apache.lucene.facet.search.cache.CategoryListCache; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.search.results.FacetResultNode; +import org.apache.lucene.facet.search.results.IntermediateFacetResult; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; +import org.apache.lucene.facet.util.ScoredDocIdsUtils; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Test faceted search with creation of multiple category list iterators by the + * same CLP, depending on the provided facet request + */ +public class MultiIteratorsPerCLParamsTest extends LuceneTestCase { + + CategoryPath[][] perDocCategories = new CategoryPath[][] { + { new CategoryPath("author", "Mark Twain"), + new CategoryPath("date", "2010") }, + { new CategoryPath("author", "Robert Frost"), + new CategoryPath("date", "2009") }, + { new CategoryPath("author", "Artur Miller"), + new CategoryPath("date", "2010") }, + { new CategoryPath("author", "Edgar Allan Poe"), + new CategoryPath("date", "2009") }, + { new CategoryPath("author", "Henry James"), + new CategoryPath("date", "2010") } }; + + String countForbiddenDimension; + + @Test + public void testCLParamMultiIteratorsByRequest() throws Exception { + doTestCLParamMultiIteratorsByRequest(false); + } + + @Test + public void testCLParamMultiIteratorsByRequestCacheCLI() throws Exception { + doTestCLParamMultiIteratorsByRequest(true); + } + + private void doTestCLParamMultiIteratorsByRequest(boolean cacheCLI) throws Exception, + CorruptIndexException, IOException { + // Create a CLP which generates different CLIs according to the + // FacetRequest's dimension + CategoryListParams clp = new CategoryListParams(); + FacetIndexingParams iParams = new DefaultFacetIndexingParams(clp); + Directory indexDir = new RAMDirectory(); + Directory taxoDir = new RAMDirectory(); + populateIndex(iParams, indexDir, taxoDir); + + TaxonomyReader taxo = new LuceneTaxonomyReader(taxoDir); + IndexReader reader = IndexReader.open(indexDir); + + CategoryListCache clCache = null; + if (cacheCLI) { + // caching the iteratorr, so: + // 1: create the cached iterator, using original params + clCache = new CategoryListCache(); + clCache.loadAndRegister(clp, reader, taxo, iParams); + } + + ScoredDocIDs allDocs = ScoredDocIdsUtils + .createAllDocsScoredDocIDs(reader); + + // Search index with 'author' should filter ONLY ordinals 
whose parent + // is 'author' + countForbiddenDimension = "date"; + validateFacetedSearch(iParams, taxo, reader, clCache, allDocs, "author", 5, 5); + + // Search index with 'date' should filter ONLY ordinals whose parent is + // 'date' + countForbiddenDimension = "author"; + validateFacetedSearch(iParams, taxo, reader, clCache, allDocs, "date", 5, 2); + + // Search index with both 'date' and 'author' + countForbiddenDimension = null; + validateFacetedSearch(iParams, taxo, reader, clCache, allDocs, new String[] { + "author", "date" }, new int[] { 5, 5 }, new int[] { 5, 2 }); + } + + private void validateFacetedSearch(FacetIndexingParams iParams, + TaxonomyReader taxo, IndexReader reader, CategoryListCache clCache, + ScoredDocIDs allDocs, String dimension, int expectedValue, int expectedNumDescendants) throws IOException { + validateFacetedSearch(iParams, taxo, reader, clCache, allDocs, + new String[] { dimension }, new int[] { expectedValue }, + new int[] { expectedNumDescendants }); + } + + private void validateFacetedSearch(FacetIndexingParams iParams, + TaxonomyReader taxo, IndexReader reader, CategoryListCache clCache, ScoredDocIDs allDocs, + String[] dimension, int[] expectedValue, + int[] expectedNumDescendants) + throws IOException { + FacetSearchParams sParams = new FacetSearchParams(iParams); + sParams.setClCache(clCache); + for (String dim : dimension) { + sParams.addFacetRequest(new PerDimCountFacetRequest( + new CategoryPath(dim), 10)); + } + FacetsAccumulator acc = new StandardFacetsAccumulator(sParams, reader, taxo); + + // no use to test this with complement since at that mode all facets are taken + acc.setComplementThreshold(FacetsAccumulator.DISABLE_COMPLEMENT); + + List results = acc.accumulate(allDocs); + assertEquals("Wrong #results", dimension.length, results.size()); + + for (int i = 0; i < results.size(); i++) { + FacetResult res = results.get(i); + assertEquals("wrong num-descendants for dimension " + dimension[i], + expectedNumDescendants[i], res.getNumValidDescendants()); + FacetResultNode resNode = res.getFacetResultNode(); + assertEquals("wrong value for dimension " + dimension[i], + expectedValue[i], (int) resNode.getValue()); + } + } + + private void populateIndex(FacetIndexingParams iParams, Directory indexDir, + Directory taxoDir) throws Exception { + IndexWriter writer = new IndexWriter(indexDir, new IndexWriterConfig(TEST_VERSION_CURRENT, new KeywordAnalyzer())); + TaxonomyWriter taxoWriter = new LuceneTaxonomyWriter(taxoDir); + + for (CategoryPath[] categories : perDocCategories) { + writer.addDocument(new CategoryDocumentBuilder(taxoWriter, iParams) + .setCategoryPaths(Arrays.asList(categories)).build( + new Document())); + + } + taxoWriter.commit(); + writer.commit(); + taxoWriter.close(); + writer.close(); + } + + private class PerDimCountFacetRequest extends CountFacetRequest { + + public PerDimCountFacetRequest(CategoryPath path, int num) { + super(path, num); + } + + @Override + public CategoryListIterator createCategoryListIterator(IndexReader reader, + TaxonomyReader taxo, FacetSearchParams sParams, int partition) throws IOException { + // categories of certain dimension only + return new PerDimensionCLI(taxo, super.createCategoryListIterator( + reader, taxo, sParams, partition), getCategoryPath()); + } + + @Override + /** Override this method just for verifying that only specified facets are iterated.. 
*/ + public FacetResultsHandler createFacetResultsHandler( + TaxonomyReader taxonomyReader) { + return new TopKFacetResultsHandler(taxonomyReader, this) { + @Override + public IntermediateFacetResult fetchPartitionResult( + FacetArrays facetArrays, int offset) throws IOException { + final IntermediateFacetResult res = super.fetchPartitionResult(facetArrays, offset); + if (countForbiddenDimension!=null) { + int ord = taxonomyReader.getOrdinal(new CategoryPath(countForbiddenDimension)); + assertEquals("Should not have accumulated for dimension '"+countForbiddenDimension+"'!",0,facetArrays.getIntArray()[ord]); + } + return res; + } + }; + } + } + + /** + * a CLI which filters another CLI for the dimension of the provided + * category-path + */ + private static class PerDimensionCLI implements CategoryListIterator { + private final CategoryListIterator superCLI; + private final int[] parentArray; + private final int parentOrdinal; + + PerDimensionCLI(TaxonomyReader taxo, CategoryListIterator superCLI, + CategoryPath requestedPath) throws IOException { + this.superCLI = superCLI; + if (requestedPath == null) { + parentOrdinal = 0; + } else { + CategoryPath cp = new CategoryPath(requestedPath.getComponent(0)); + parentOrdinal = taxo.getOrdinal(cp); + } + parentArray = taxo.getParentArray(); + } + + public boolean init() throws IOException { + return superCLI.init(); + } + + public long nextCategory() throws IOException { + long next; + while ((next = superCLI.nextCategory()) <= Integer.MAX_VALUE + && !isInDimension((int) next)) { + } + + return next; + } + + /** look for original parent ordinal, meaning same dimension */ + private boolean isInDimension(int ordinal) { + while (ordinal > 0) { + if (ordinal == parentOrdinal) { + return true; + } + ordinal = parentArray[ordinal]; + } + return false; + } + + public boolean skipTo(int docId) throws IOException { + return superCLI.skipTo(docId); + } + } +} \ No newline at end of file diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/sampling/BaseSampleTestTopK.java b/modules/facet/src/test/org/apache/lucene/facet/search/sampling/BaseSampleTestTopK.java new file mode 100644 index 00000000000..f578c91e7c9 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/search/sampling/BaseSampleTestTopK.java @@ -0,0 +1,135 @@ +package org.apache.lucene.facet.search.sampling; + +import java.io.IOException; +import java.util.List; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.junit.Test; + +import org.apache.lucene.search.MultiCollector; +import org.apache.lucene.facet.search.BaseTestTopK; +import org.apache.lucene.facet.search.FacetsAccumulator; +import org.apache.lucene.facet.search.FacetsCollector; +import org.apache.lucene.facet.search.ScoredDocIDs; +import org.apache.lucene.facet.search.ScoredDocIdCollector; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.search.results.FacetResult; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public abstract class BaseSampleTestTopK extends BaseTestTopK { + + /** Number of top results */ + protected static final int K = 2; + + /** since there is a chance that this test would fail even if the code is correct, retry the sampling */ + protected static final int RETRIES = 4; + + protected abstract FacetsAccumulator getSamplingAccumulator(Sampler sampler, + TaxonomyReader taxoReader, IndexReader indexReader, + FacetSearchParams searchParams); + + @Test + public void testCountUsingComplementSampling() throws Exception { + doTestWithSamping(true); + } + + @Test + public void testCountUsingSampling() throws Exception { + doTestWithSamping(false); + } + + /** + * Try out faceted search with sampling enabled and complements either disabled or enforced + * Lots of randomly generated data is being indexed, and later on a "90% docs" faceted search + * is performed. The results are compared to non-sampled ones. + */ + private void doTestWithSamping(boolean complement) throws Exception, IOException { + for (int partitionSize : partitionSizes) { + initIndex(partitionSize); + + // Get all of the documents and run the query, then do different + // facet counts and compare to control + Query q = new TermQuery(new Term(CONTENT_FIELD, BETA)); // 90% of the docs + ScoredDocIdCollector docCollector = ScoredDocIdCollector.create(searcher.maxDoc(), false); + + FacetSearchParams expectedSearchParams = searchParamsWithRequests(K, partitionSize); + FacetsCollector fc = new FacetsCollector(expectedSearchParams, indexReader, taxoReader); + + searcher.search(q, MultiCollector.wrap(docCollector, fc)); + + List expectedResults = fc.getFacetResults(); + + // complement with sampling! + final Sampler sampler = createSampler(docCollector.getScoredDocIDs()); + + FacetSearchParams samplingSearchParams = searchParamsWithRequests(K, partitionSize); + + // try several times in case of failure, because the test has a chance to fail + // if the top K facets are not sufficiently common with the sample set + for (int n=RETRIES; n>0; n--) { + FacetsCollector samplingFC = samplingCollector(complement, sampler, samplingSearchParams); + + searcher.search(q, samplingFC); + List sampledResults = samplingFC.getFacetResults(); + + try { + assertSameResults(expectedResults, sampledResults); + break; // succeeded + } catch (Exception e) { + if (n<=1) { // otherwise try again + throw e; + } + } + } + } + } + + private FacetsCollector samplingCollector( + final boolean complement, + final Sampler sampler, + FacetSearchParams samplingSearchParams) { + FacetsCollector samplingFC = new FacetsCollector(samplingSearchParams, indexReader, taxoReader) { + @Override + protected FacetsAccumulator initFacetsAccumulator( + FacetSearchParams facetSearchParams, IndexReader indexReader, + TaxonomyReader taxonomyReader) { + FacetsAccumulator acc = getSamplingAccumulator(sampler, taxonomyReader, indexReader, facetSearchParams); + acc.setComplementThreshold(complement ? 
FacetsAccumulator.FORCE_COMPLEMENT : FacetsAccumulator.DISABLE_COMPLEMENT); + return acc; + } + }; + return samplingFC; + } + + private Sampler createSampler(ScoredDocIDs scoredDocIDs) { + SamplingParams samplingParams = new SamplingParams(); + samplingParams.setSampleRatio(0.8); + samplingParams.setMinSampleSize(100); + samplingParams.setMaxSampleSize(10000); + samplingParams.setSampingThreshold(11000); //force sampling + samplingParams.setOversampleFactor(5.0); + Sampler sampler = new Sampler(samplingParams); + assertTrue("must enable sampling for this test!",sampler.shouldSample(scoredDocIDs)); + return sampler; + } +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/search/sampling/SamplingAccumulatorTest.java b/modules/facet/src/test/org/apache/lucene/facet/search/sampling/SamplingAccumulatorTest.java new file mode 100644 index 00000000000..4bdaf8c6b56 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/search/sampling/SamplingAccumulatorTest.java @@ -0,0 +1,35 @@ +package org.apache.lucene.facet.search.sampling; + +import org.apache.lucene.index.IndexReader; + +import org.apache.lucene.facet.search.FacetsAccumulator; +import org.apache.lucene.facet.search.params.FacetSearchParams; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class SamplingAccumulatorTest extends BaseSampleTestTopK { + + @Override + protected FacetsAccumulator getSamplingAccumulator(Sampler sampler, + TaxonomyReader taxoReader, IndexReader indexReader, + FacetSearchParams searchParams) { + return new SamplingAccumulator(sampler, searchParams, indexReader, + taxoReader); + } +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/taxonomy/TestCategoryPath.java b/modules/facet/src/test/org/apache/lucene/facet/taxonomy/TestCategoryPath.java new file mode 100644 index 00000000000..784c07cbd2b --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/taxonomy/TestCategoryPath.java @@ -0,0 +1,901 @@ +package org.apache.lucene.facet.taxonomy; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.taxonomy.CategoryPath; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class TestCategoryPath extends LuceneTestCase { + + @Test + public void testBasic() { + CategoryPath p = new CategoryPath(0,0); + assertEquals(0, p.length()); + for (int i=0; i<1000; i++) { + p.add("hello"); + assertEquals(i+1, p.length()); + } + } + + @Test + public void testConstructorCapacity() { + CategoryPath p = new CategoryPath(0,0); + assertEquals(0, p.capacityChars()); + assertEquals(0, p.capacityComponents()); + assertEquals(0, p.length()); + p = new CategoryPath(5,18); + assertEquals(5, p.capacityChars()); + assertEquals(18, p.capacityComponents()); + assertEquals(0, p.length()); + p = new CategoryPath(27,13); + assertEquals(27, p.capacityChars()); + assertEquals(13, p.capacityComponents()); + assertEquals(0, p.length()); + } + + @Test + public void testClear() { + CategoryPath p = new CategoryPath(0,0); + p.add("hi"); + p.add("there"); + assertEquals(2, p.length()); + p.clear(); + assertEquals(0, p.length()); + p.add("yo!"); + assertEquals(1, p.length()); + } + + @Test + public void testTrim() { + CategoryPath p = new CategoryPath(0,0); + p.add("this"); + p.add("message"); + p.add("will"); + p.add("self"); + p.add("destruct"); + p.add("in"); + p.add("five"); + p.add("seconds"); + assertEquals(8, p.length()); + p.trim(3); + assertEquals(5, p.length()); + p.trim(0); // no-op + assertEquals(5, p.length()); + p.trim(-3); // no-op + assertEquals(5, p.length()); + p.trim(1); + assertEquals(4, p.length()); + p.trim(8); // clear + assertEquals(0, p.length()); + p.add("yo!"); + assertEquals(1, p.length()); + p.trim(1); // clear + assertEquals(0, p.length()); + } + + @Test + public void testComponentsLimit() { + // Test that we can add up to 2^15-1 components + CategoryPath p = new CategoryPath(0,0); + for (int i=0; i<32767; i++) { + p.add(""); + assertEquals(i+1, p.length()); + } + // Also see that in the current implementation, this is actually + // the limit: if we add one more component, things break (because + // we used a short to hold ncomponents). See that it breaks in the + // way we expect it to: + p.add(""); // this still works, but... + assertEquals(-32768, p.length()); // now the length is wrong and negative + } + + @Test + public void testCharsLimit() { + // Test that we can add up to 2^15-1 characters + CategoryPath p = new CategoryPath(0,0); + for (int i=0; i<8192; i++) { + p.add("aaaa"); + } + // Also see that in the current implementation, this is actually the + // limit: If we add one more character, things break (because ends[] + // is an array of shorts), and we actually get an exception. + try { + p.add("a"); + fail("Should have thrown an exception"); + } catch (ArrayIndexOutOfBoundsException e) { + // good. 
+ } + } + + @Test + public void testToString() { + CategoryPath p = new CategoryPath(0,0); + // When the category is empty, we expect an empty string + assertEquals("", p.toString('/')); + // This is (deliberately, in our implementation) indistinguishable + // from the case of a single empty component: + p.add(""); + assertEquals("", p.toString('/')); + // Check just one category (so no delimiter needed): + p.clear(); + p.add("hello"); + assertEquals("hello", p.toString('/')); + // Now for two categories: + p.clear(); + p.add("hello"); + p.add("world"); + assertEquals("hello/world", p.toString('/')); + // And for a thousand... + p.clear(); + p.add("0"); + StringBuilder expected = new StringBuilder("0"); + for (int i=1; i<1000; i++) { + String num = Integer.toString(i); + p.add(num); + expected.append('/'); + expected.append(num); + } + assertEquals(expected.toString(), p.toString('/')); + // Check that toString() without a parameter just defaults to '/': + assertEquals(expected.toString(), p.toString()); + } + + // testing toString() and its variants already test most of the appendTo() + // code, but not all of it (the "eclemma" code-coverage tool discovered + // this for us). Here we complete the coverage of the appendTo() methods: + @Test + public void testAppendTo() throws IOException { + CategoryPath p = new CategoryPath(0,0); + StringBuilder sb = new StringBuilder(); + p.appendTo(sb, '/'); + assertEquals(0, sb.length()); + p.appendTo(sb, '/', -1); + assertEquals(0, sb.length()); + p.appendTo(sb, '/', 1); + assertEquals(0, sb.length()); + p.appendTo(sb, '/', -1, 1); + assertEquals(0, sb.length()); + } + + @Test + public void testLastComponent() { + CategoryPath p = new CategoryPath(1000,1000); + // When the category is empty, we expect a null + assertNull(p.lastComponent()); + for (int i=0; i<=100; i++) { + String num = Integer.toString(i); + p.add(num); + assertEquals(num, p.lastComponent()); + } + } + + @Test + public void testGetComponent() { + CategoryPath p = new CategoryPath(1000,1000); + // When the category is empty, we expect a null + assertNull(p.getComponent(0)); + assertNull(p.getComponent(1)); + assertNull(p.getComponent(-1)); + for (int i=0; i<=100; i++) { + p.add(Integer.toString(i)); + for (int j=0; j<=i; j++) { + assertEquals(j, Integer.parseInt(p.getComponent(j))); + } + assertNull(p.getComponent(-1)); + assertNull(p.getComponent(i+1)); + } + } + + @Test + public void testToStringPrefix() { + CategoryPath p = new CategoryPath(0,0); + p.add("hi"); + p.add("there"); + p.add("man"); + assertEquals("hi/there/man", p.toString('/')); + assertEquals("", p.toString('/', 0)); + assertEquals("hi", p.toString('/', 1)); + assertEquals("hi/there", p.toString('/', 2)); + assertEquals("hi/there/man", p.toString('/', 3)); + assertEquals("hi/there/man", p.toString('/', 4)); + assertEquals("hi/there/man", p.toString('/', -1)); + } + + @Test + public void testToStringSubpath() { + CategoryPath p = new CategoryPath(0,0); + assertEquals("", p.toString('/', 0, 0)); + p.add("hi"); + p.add("there"); + p.add("man"); + assertEquals("", p.toString('/', 0, 0)); + assertEquals("hi", p.toString('/', 0, 1)); + assertEquals("hi/there", p.toString('/', 0, 2)); + assertEquals("hi/there/man", p.toString('/', 0, 3)); + assertEquals("hi/there/man", p.toString('/', 0, 4)); + assertEquals("hi/there/man", p.toString('/', 0, -1)); + assertEquals("hi/there/man", p.toString('/', -1, -1)); + assertEquals("there/man", p.toString('/', 1, -1)); + assertEquals("man", p.toString('/', 2, -1)); + assertEquals("", 
p.toString('/', 3, -1)); + assertEquals("there/man", p.toString('/', 1, 3)); + assertEquals("there", p.toString('/', 1, 2)); + assertEquals("", p.toString('/', 1, 1)); + } + + @Test + public void testDelimiterConstructor() { + // Test that the constructor that takes a string and a delimiter + // works correctly. Also check that it allocates exactly the needed + // needed size for the array - not more. + CategoryPath p = new CategoryPath("", '/'); + assertEquals(p.length(), 0); + assertEquals(p.capacityChars(), 0); + assertEquals(p.capacityComponents(), 0); + p = new CategoryPath("hello", '/'); + assertEquals(p.length(), 1); + assertEquals(p.capacityChars(), 5); + assertEquals(p.capacityComponents(), 1); + assertEquals(p.toString('@'), "hello"); + p = new CategoryPath("hi/there", '/'); + assertEquals(p.length(), 2); + assertEquals(p.capacityChars(), 7); + assertEquals(p.capacityComponents(), 2); + assertEquals(p.toString('@'), "hi@there"); + p = new CategoryPath("how/are/you/doing?", '/'); + assertEquals(p.length(), 4); + assertEquals(p.capacityChars(), 15); + assertEquals(p.capacityComponents(), 4); + assertEquals(p.toString('@'), "how@are@you@doing?"); + } + + @Test + public void testDefaultConstructor() { + // test that the default constructor (no parameters) currently + // defaults to creating an object with a 0 initial capacity. + // If we change this default later, we also need to change this + // test. + CategoryPath p = new CategoryPath(); + assertEquals(0, p.capacityChars()); + assertEquals(0, p.capacityComponents()); + assertEquals(0, p.length()); + assertEquals("", p.toString('/')); + } + + @Test + public void testAddEmpty() { + // In the current implementation, p.add("") should add en empty + // component (which is, admitingly, not a useful case. On the other + // hand, p.add("", delimiter) should add no components at all. + // Verify this: + CategoryPath p = new CategoryPath(0, 0); + p.add(""); + assertEquals(1, p.length()); + p.add(""); + assertEquals(2, p.length()); + p.add("", '/'); + assertEquals(2, p.length()); + p.clear(); + p.add("", '/'); + assertEquals(0, p.length()); + } + + @Test + public void testDelimiterAdd() { + // Test that the add() that takes a string and a delimiter + // works correctly. Note that unlike the constructor test above, + // we can't expect the capacity to grow to exactly the length of + // the given category, so we do not test this. 
+ CategoryPath p = new CategoryPath(0, 0); + p.add("", '/'); + assertEquals(0, p.length()); + assertEquals("", p.toString('@'), ""); + p.clear(); + p.add("hello", '/'); + assertEquals(p.length(), 1); + assertEquals(p.toString('@'), "hello"); + p.clear(); + p.add("hi/there", '/'); + assertEquals(p.length(), 2); + assertEquals(p.toString('@'), "hi@there"); + p.clear(); + p.add("how/are/you/doing?", '/'); + assertEquals(p.length(), 4); + assertEquals(p.toString('@'), "how@are@you@doing?"); + // See that this is really an add, not replace: + p.clear(); + p.add("hi/there", '/'); + assertEquals(p.length(), 2); + assertEquals(p.toString('@'), "hi@there"); + p.add("how/are/you/doing", '/'); + assertEquals(p.length(), 6); + assertEquals(p.toString('@'), "hi@there@how@are@you@doing"); + } + + @Test + public void testCopyConstructor() { + CategoryPath p = new CategoryPath(0,0); + int expectedchars=0; + for (int i=0; i<1000; i++) { + CategoryPath clone = new CategoryPath(p); + assertEquals(p.length(), clone.length()); + assertEquals(p.toString('/'), clone.toString('/')); + // verify that the newly created clone has exactly the right + // capacity, with no spare (while the original path p probably + // does have spare) + assertEquals(i, clone.capacityComponents()); + assertEquals(expectedchars, clone.capacityChars()); + // Finally, add another component to the path, for the next + // round of this loop + String num = Integer.toString(i); + p.add(num); + expectedchars+=num.length(); + } + } + + @Test + public void testPrefixCopyConstructor() { + CategoryPath p = new CategoryPath(0,0); + p.add("hi"); + p.add("there"); + p.add("man"); + assertEquals(p.length(), 3); + + CategoryPath p1 = new CategoryPath(p,2); + assertEquals(2, p1.length()); + assertEquals("hi/there", p1.toString('/')); + // the new prefix object should only take the space it needs: + assertEquals(2, p1.capacityComponents()); + assertEquals(7, p1.capacityChars()); + + p1 = new CategoryPath(p,1); + assertEquals(1, p1.length()); + assertEquals("hi", p1.toString('/')); + assertEquals(1, p1.capacityComponents()); + assertEquals(2, p1.capacityChars()); + + p1 = new CategoryPath(p,0); + assertEquals(0, p1.length()); + assertEquals("", p1.toString('/')); + assertEquals(0, p1.capacityComponents()); + assertEquals(0, p1.capacityChars()); + + // with all the following lengths, the prefix should be the whole path: + int[] lengths = { 3, -1, 4 }; + for (int i=0; i0) { + expectedCharsNeeded++; + } + assertEquals(expectedCharsNeeded, p.charsNeededForFullPath()); + } + } + + @Test + public void testCopyToCharArray() { + String[] components = { "hello", "world", "yo" }; + CategoryPath p = new CategoryPath(components); + char[] charArray = new char[p.charsNeededForFullPath()]; + int numCharsCopied = 0; + + numCharsCopied = p.copyToCharArray(charArray, 0, 0, '.'); + assertEquals(0, numCharsCopied); + assertEquals("", new String(charArray, 0, numCharsCopied)); + + numCharsCopied = p.copyToCharArray(charArray, 0, 1, '.'); + assertEquals(5, numCharsCopied); + assertEquals("hello", new String(charArray, 0, numCharsCopied)); + + numCharsCopied = p.copyToCharArray(charArray, 0, 3, '.'); + assertEquals(14, numCharsCopied); + assertEquals("hello.world.yo", new String(charArray, 0, numCharsCopied)); + + numCharsCopied = p.copyToCharArray(charArray, 0, -1, '.'); + assertEquals(14, numCharsCopied); + assertEquals("hello.world.yo", new String(charArray, 0, numCharsCopied)); + numCharsCopied = p.copyToCharArray(charArray, 0, 4, '.'); + assertEquals(14, numCharsCopied); 
+ assertEquals("hello.world.yo", new String(charArray, 0, numCharsCopied)); + } + + @Test + public void testCharSerialization() throws Exception { + CategoryPath[] testCategories = { + new CategoryPath("hi", "there", "man"), + new CategoryPath("hello"), + new CategoryPath("what's", "up"), + // See that an empty category, which generates a (char)0, + // doesn't cause any problems in the middle of the serialization: + new CategoryPath(), + new CategoryPath("another", "example"), + new CategoryPath(), + new CategoryPath() + }; + StringBuilder sb = new StringBuilder(); + for (int i=0; i 0); + pother = new CategoryPath("a/b/c", '/'); + assertTrue(pother.compareTo(p) < 0); + pother = new CategoryPath("a/b/c/e", '/'); + assertTrue(pother.compareTo(p) > 0); + pother = new CategoryPath("a/b/c//e", '/'); + assertTrue(pother.compareTo(p) < 0); + } + + private static class CS implements CharSequence { + public CS(String s) { + this.ca = new char[s.length()]; + s.getChars(0, s.length(), this.ca, 0); + } + public char charAt(int index) { + return this.ca[index]; + } + public int length() { + return this.ca.length; + } + public CharSequence subSequence(int start, int end) { + return null; // not used. + } + private char[] ca; + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyCombined.java b/modules/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyCombined.java new file mode 100644 index 00000000000..fb70255f889 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyCombined.java @@ -0,0 +1,1071 @@ +package org.apache.lucene.facet.taxonomy; + +import java.io.IOException; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Arrays; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.LockObtainFailedException; +import org.apache.lucene.store.RAMDirectory; +import org.junit.Ignore; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.taxonomy.TaxonomyReader.ChildrenArrays; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; +import org.apache.lucene.util.SlowRAMDirectory; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class TestTaxonomyCombined extends LuceneTestCase { + + /** The following categories will be added to the taxonomy by + fillTaxonomy(), and tested by all tests below: + */ + private final static String[][] categories = { + { "Author", "Tom Clancy" }, + { "Author", "Richard Dawkins" }, + { "Author", "Richard Adams" }, + { "Price", "10", "11" }, + { "Price", "10", "12" }, + { "Price", "20", "27" }, + { "Date", "2006", "05" }, + { "Date", "2005" }, + { "Date", "2006" }, + { "Subject", "Nonfiction", "Children", "Animals" }, + { "Author", "Stephen Jay Gould" }, + { "Author", "\u05e0\u05d3\u05d1\u3042\u0628" }, + }; + + /** When adding the above categories with TaxonomyWriter.addCategory(), + the following paths are expected to be returned: + (note that currently the full path is not returned, and therefore + not tested - rather, just the last component, the ordinal, is returned + and tested. + */ + private final static int[][] expectedPaths = { + { 1, 2 }, + { 1, 3 }, + { 1, 4 }, + { 5, 6, 7 }, + { 5, 6, 8 }, + { 5, 9, 10 }, + { 11, 12, 13 }, + { 11, 14 }, + { 11, 12 }, + { 15, 16, 17, 18 }, + { 1, 19 }, + { 1, 20 } + }; + + /** The taxonomy index is expected to then contain the following + generated categories, with increasing ordinals (note how parent + categories are be added automatically when subcategories are added). + */ + private final static String[][] expectedCategories = { + { }, // the root category + { "Author" }, + { "Author", "Tom Clancy" }, + { "Author", "Richard Dawkins" }, + { "Author", "Richard Adams" }, + { "Price" }, + { "Price", "10" }, + { "Price", "10", "11" }, + { "Price", "10", "12" }, + { "Price", "20" }, + { "Price", "20", "27" }, + { "Date" }, + { "Date", "2006" }, + { "Date", "2006", "05" }, + { "Date", "2005" }, + { "Subject" }, + { "Subject", "Nonfiction" }, + { "Subject", "Nonfiction", "Children" }, + { "Subject", "Nonfiction", "Children", "Animals" }, + { "Author", "Stephen Jay Gould" }, + { "Author", "\u05e0\u05d3\u05d1\u3042\u0628" }, + }; + + /** fillTaxonomy adds the categories in the categories[] array, and asserts + that the additions return exactly the ordinals (in the past - paths) + specified in expectedPaths[]. + Note that this assumes that fillTaxonomy() is called on an empty taxonomy + index. Calling it after something else was already added to the taxonomy + index will surely have this method fail. + */ + public static void fillTaxonomy(TaxonomyWriter tw) throws IOException { + for (int i = 0; i < categories.length; i++) { + int ordinal = tw.addCategory(new CategoryPath(categories[i])); + int expectedOrdinal = expectedPaths[i][expectedPaths[i].length-1]; + if (ordinal!=expectedOrdinal) { + fail("For category "+showcat(categories[i])+" expected ordinal "+ + expectedOrdinal+", but got "+ordinal); + } + } + } + + public static String showcat(String[] path) { + if (path==null) { + return ""; + } + if (path.length==0) { + return ""; + } + if (path.length==1 && path[0].length()==0) { + return "<\"\">"; + } + StringBuilder sb = new StringBuilder(path[0]); + for (int i=1; i"; + } + if (path.length()==0) { + return ""; + } + return "<"+path.toString('/')+">"; + } + + /** Basic tests for TaxonomyWriter. Basically, we test that + IndexWriter.addCategory works, i.e. returns the expected ordinals + (this is tested by calling the fillTaxonomy() method above). + We do not test here that after writing the index can be read - + this will be done in more tests below. 
+ */ + @Test + public void testWriter() throws Exception { + Directory indexDir = new RAMDirectory(); + TaxonomyWriter tw = new LuceneTaxonomyWriter(indexDir); + fillTaxonomy(tw); + // Also check TaxonomyWriter.getSize() - see that the taxonomy's size + // is what we expect it to be. + assertEquals(expectedCategories.length, tw.getSize()); + tw.close(); + } + + /** testWriterTwice is exactly like testWriter, except that after adding + all the categories, we add them again, and see that we get the same + old ids again - not new categories. + */ + @Test + public void testWriterTwice() throws Exception { + Directory indexDir = new RAMDirectory(); + TaxonomyWriter tw = new LuceneTaxonomyWriter(indexDir); + fillTaxonomy(tw); + // run fillTaxonomy again - this will try to add the same categories + // again, and check that we see the same ordinal paths again, not + // different ones. + fillTaxonomy(tw); + // Let's check the number of categories again, to see that no + // extraneous categories were created: + assertEquals(expectedCategories.length, tw.getSize()); + tw.close(); + } + + /** testWriterTwice2 is similar to testWriterTwice, except that the index + is closed and reopened before attempting to write to it the same + categories again. While testWriterTwice can get along with writing + and reading correctly just to the cache, testWriterTwice2 checks also + the actual disk read part of the writer: + */ + @Test + public void testWriterTwice2() throws Exception { + Directory indexDir = new RAMDirectory(); + TaxonomyWriter tw = new LuceneTaxonomyWriter(indexDir); + fillTaxonomy(tw); + tw.close(); + tw = new LuceneTaxonomyWriter(indexDir); + // run fillTaxonomy again - this will try to add the same categories + // again, and check that we see the same ordinals again, not different + // ones, and that the number of categories hasn't grown by the new + // additions + fillTaxonomy(tw); + assertEquals(expectedCategories.length, tw.getSize()); + tw.close(); + } + + /** + * testWriterTwice3 is yet another test which tests creating a taxonomy + * in two separate writing sessions. This test used to fail because of + * a bug involving commit(), explained below, and now should succeed. + * + * @throws Exception + */ + @Test + public void testWriterTwice3() throws Exception { + Directory indexDir = new RAMDirectory(); + // First, create and fill the taxonomy + TaxonomyWriter tw = new LuceneTaxonomyWriter(indexDir); + fillTaxonomy(tw); + tw.close(); + // Now, open the same taxonomy and add the same categories again. + // After a few categories, the LuceneTaxonomyWriter implementation + // will stop looking for each category on disk, and rather read them + // all into memory and close it's reader. The bug was that it closed + // the reader, but forgot that it did (because it didn't set the reader + // reference to null). + tw = new LuceneTaxonomyWriter(indexDir); + fillTaxonomy(tw); + // Add one new category, just to make commit() do something: + tw.addCategory(new CategoryPath("hi")); + // Do a commit(). Here was a bug - if tw had a reader open, it should + // be reopened after the commit. However, in our case the reader should + // not be open (as explained above) but because it was not set to null, + // we forgot that, tried to reopen it, and got an AlreadyClosedException. 
+ tw.commit(); + assertEquals(expectedCategories.length+1, tw.getSize()); + tw.close(); + } + + /** Another set of tests for the writer, which don't use an array and + * try to distill the different cases, and therefore may be more helpful + * for debugging a problem than testWriter() which is hard to know why + * or where it failed. + */ + @Test + public void testWriterSimpler() throws Exception { + Directory indexDir = new RAMDirectory(); + TaxonomyWriter tw = new LuceneTaxonomyWriter(indexDir); + assertEquals(1, tw.getSize()); // the root only + // Test that adding a new top-level category works + assertEquals(1, tw.addCategory(new CategoryPath("a"))); + assertEquals(2, tw.getSize()); + // Test that adding the same category again is noticed, and the + // same ordinal (and not a new one) is returned. + assertEquals(1, tw.addCategory(new CategoryPath("a"))); + assertEquals(2, tw.getSize()); + // Test that adding another top-level category returns a new ordinal, + // not the same one + assertEquals(2, tw.addCategory(new CategoryPath("b"))); + assertEquals(3, tw.getSize()); + // Test that adding a category inside one of the above adds just one + // new ordinal: + assertEquals(3, tw.addCategory(new CategoryPath("a","c"))); + assertEquals(4, tw.getSize()); + // Test that adding the same second-level category doesn't do anything: + assertEquals(3, tw.addCategory(new CategoryPath("a","c"))); + assertEquals(4, tw.getSize()); + // Test that adding a second-level category with two new components + // indeed adds two categories + assertEquals(5, tw.addCategory(new CategoryPath("d","e"))); + assertEquals(6, tw.getSize()); + // Verify that the parents were added above in the order we expected + assertEquals(4, tw.addCategory(new CategoryPath("d"))); + // Similar, but inside a category that already exists: + assertEquals(7, tw.addCategory(new CategoryPath("b", "d","e"))); + assertEquals(8, tw.getSize()); + // And now inside two levels of categories that already exist: + assertEquals(8, tw.addCategory(new CategoryPath("b", "d","f"))); + assertEquals(9, tw.getSize()); + + tw.close(); + } + + /** Test writing an empty index, and seeing that a reader finds in it + the root category, and only it. We check all the methods on that + root category return the expected results. + */ + @Test + public void testRootOnly() throws Exception { + Directory indexDir = new RAMDirectory(); + TaxonomyWriter tw = new LuceneTaxonomyWriter(indexDir); + // right after opening the index, it should already contain the + // root, so have size 1: + assertEquals(1, tw.getSize()); + tw.close(); + TaxonomyReader tr = new LuceneTaxonomyReader(indexDir); + assertEquals(1, tr.getSize()); + assertEquals(0, tr.getPath(0).length()); + assertEquals(TaxonomyReader.INVALID_ORDINAL, tr.getParent(0)); + assertEquals(0, tr.getOrdinal(new CategoryPath())); + tr.close(); + } + + /** The following test is exactly the same as testRootOnly, except we + * do not close the writer before opening the reader. We want to see + * that the root is visible to the reader not only after the writer is + * closed, but immediately after it is created. 
+ */ + @Test + public void testRootOnly2() throws Exception { + Directory indexDir = new RAMDirectory(); + TaxonomyWriter tw = new LuceneTaxonomyWriter(indexDir); + tw.commit(); + TaxonomyReader tr = new LuceneTaxonomyReader(indexDir); + assertEquals(1, tr.getSize()); + assertEquals(0, tr.getPath(0).length()); + assertEquals(TaxonomyReader.INVALID_ORDINAL, tr.getParent(0)); + assertEquals(0, tr.getOrdinal(new CategoryPath())); + tw.close(); + tr.close(); + } + + /** Basic tests for TaxonomyReader's category <=> ordinal transformations + (getSize(), getCategory() and getOrdinal()). + We test that after writing the index, it can be read and all the + categories and ordinals are there just as we expected them to be. + */ + @Test + public void testReaderBasic() throws Exception { + Directory indexDir = new RAMDirectory(); + TaxonomyWriter tw = new LuceneTaxonomyWriter(indexDir); + fillTaxonomy(tw); + tw.close(); + TaxonomyReader tr = new LuceneTaxonomyReader(indexDir); + + // test TaxonomyReader.getSize(): + assertEquals(expectedCategories.length, tr.getSize()); + + // test round trips of ordinal => category => ordinal + for (int i=0; i category conversions. + + Note: At the moment, the parent methods in the reader are deprecated, + but this does not mean they should not be tested! Until they are + removed (*if* they are removed), these tests should remain to see + that they still work correctly. + */ + + @Test + public void testReaderParent() throws Exception { + Directory indexDir = new RAMDirectory(); + TaxonomyWriter tw = new LuceneTaxonomyWriter(indexDir); + fillTaxonomy(tw); + tw.close(); + TaxonomyReader tr = new LuceneTaxonomyReader(indexDir); + + // check that the parent of the root ordinal is the invalid ordinal: + assertEquals(TaxonomyReader.INVALID_ORDINAL, tr.getParent(0)); + + // check parent of non-root ordinals: + for (int ordinal=1; ordinal category conversions from TaxonomyReader. + * + * The difference between testWriterParent1 and testWriterParent2 is that + * the former closes the taxonomy writer before reopening it, while the + * latter does not. + * + * This test code is virtually identical to that of testReaderParent(). 
+ */ + @Test + public void testWriterParent1() throws Exception { + Directory indexDir = new RAMDirectory(); + TaxonomyWriter tw = new LuceneTaxonomyWriter(indexDir); + fillTaxonomy(tw); + tw.close(); + tw = new LuceneTaxonomyWriter(indexDir); + TaxonomyReader tr = new LuceneTaxonomyReader(indexDir); + + checkWriterParent(tr, tw); + + tw.close(); + tr.close(); + } + + @Test + public void testWriterParent2() throws Exception { + Directory indexDir = new RAMDirectory(); + TaxonomyWriter tw = new LuceneTaxonomyWriter(indexDir); + fillTaxonomy(tw); + tw.commit(); + TaxonomyReader tr = new LuceneTaxonomyReader(indexDir); + + checkWriterParent(tr, tw); + + tw.close(); + tr.close(); + } + + private void checkWriterParent(TaxonomyReader tr, TaxonomyWriter tw) throws Exception { + // check that the parent of the root ordinal is the invalid ordinal: + assertEquals(TaxonomyReader.INVALID_ORDINAL, tw.getParent(0)); + + // check parent of non-root ordinals: + for (int ordinal = 1; ordinal < tr.getSize(); ordinal++) { + CategoryPath me = tr.getPath(ordinal); + int parentOrdinal = tw.getParent(ordinal); + CategoryPath parent = tr.getPath(parentOrdinal); + if (parent == null) { + fail("Parent of " + ordinal + " is " + parentOrdinal + + ", but this is not a valid category."); + } + // verify that the parent is indeed my parent, according to the + // strings + if (!new CategoryPath(me, me.length() - 1).equals(parent)) { + fail("Got parent " + parentOrdinal + " for ordinal " + ordinal + + " but categories are " + showcat(parent) + " and " + + showcat(me) + " respectively."); + } + } + + // check parent of of invalid ordinals: + try { + tw.getParent(-1); + fail("getParent for -1 should throw exception"); + } catch (ArrayIndexOutOfBoundsException e) { + // ok + } + try { + tw.getParent(TaxonomyReader.INVALID_ORDINAL); + fail("getParent for INVALID_ORDINAL should throw exception"); + } catch (ArrayIndexOutOfBoundsException e) { + // ok + } + try { + int parent = tw.getParent(tr.getSize()); + fail("getParent for getSize() should throw exception, but returned " + + parent); + } catch (ArrayIndexOutOfBoundsException e) { + // ok + } + } + + /** Tests TaxonomyReader's getParentArray() method. We do not test this + method directly, but rather just compare its results to those from + other methods (which we have already tested above). + */ + @Test + public void testReaderParentArray() throws Exception { + Directory indexDir = new RAMDirectory(); + TaxonomyWriter tw = new LuceneTaxonomyWriter(indexDir); + fillTaxonomy(tw); + tw.close(); + TaxonomyReader tr = new LuceneTaxonomyReader(indexDir); + int[] parents = tr.getParentArray(); + assertEquals(tr.getSize(), parents.length); + for (int i=0; i expectedChildren = new ArrayList(); + for (int j=expectedCategories.length-1; j>=0; j--) { + if (expectedCategories[j].length != expectedCategories[i].length+1) { + continue; // not longer by 1, so can't be a child + } + boolean ischild=true; + for (int k=0; ki; j--) { + if (tr.getParent(j)==i) { + break; // found youngest child + } + } + if (j==i) { // no child found + j=TaxonomyReader.INVALID_ORDINAL; + } + assertEquals(j, youngestChildArray[i]); + } + + // test that the "older sibling" is indeed the least oldest one - and + // not a too old one or -1 (so we didn't miss some children in the + // middle or the end of the chain). 
+ for (int i=0; i=0; j--) { + if (tr.getParent(j)==tr.getParent(i)) { + break; // found youngest older sibling + } + } + if (j<0) { // no sibling found + j=TaxonomyReader.INVALID_ORDINAL; + } + assertEquals(j, olderSiblingArray[i]); + } + + tr.close(); + } + + /** + * Test how getChildrenArrays() deals with the taxonomy's growth: + */ + @Test + public void testChildrenArraysGrowth() throws Exception { + Directory indexDir = new RAMDirectory(); + TaxonomyWriter tw = new LuceneTaxonomyWriter(indexDir); + tw.addCategory(new CategoryPath("hi", "there")); + tw.commit(); + TaxonomyReader tr = new LuceneTaxonomyReader(indexDir); + ChildrenArrays ca = tr.getChildrenArrays(); + assertEquals(3, tr.getSize()); + assertEquals(3, ca.getOlderSiblingArray().length); + assertEquals(3, ca.getYoungestChildArray().length); + assertTrue(Arrays.equals(new int[] { 1, 2, -1 }, ca.getYoungestChildArray())); + assertTrue(Arrays.equals(new int[] { -1, -1, -1 }, ca.getOlderSiblingArray())); + tw.addCategory(new CategoryPath("hi", "ho")); + tw.addCategory(new CategoryPath("hello")); + tw.commit(); + // Before refresh, nothing changed.. + ChildrenArrays newca = tr.getChildrenArrays(); + assertSame(newca, ca); // we got exactly the same object + assertEquals(3, tr.getSize()); + assertEquals(3, ca.getOlderSiblingArray().length); + assertEquals(3, ca.getYoungestChildArray().length); + // After the refresh, things change: + tr.refresh(); + ca = tr.getChildrenArrays(); + assertEquals(5, tr.getSize()); + assertEquals(5, ca.getOlderSiblingArray().length); + assertEquals(5, ca.getYoungestChildArray().length); + assertTrue(Arrays.equals(new int[] { 4, 3, -1, -1, -1 }, ca.getYoungestChildArray())); + assertTrue(Arrays.equals(new int[] { -1, -1, -1, 2, 1 }, ca.getOlderSiblingArray())); + tw.close(); + tr.close(); + } + + /** + * Test that getParentArrays is valid when retrieved during refresh + */ + @Test + @Ignore + public void testTaxonomyReaderRefreshRaces() throws Exception { + // compute base child arrays - after first chunk, and after the other + Directory indexDirBase = new RAMDirectory(); + TaxonomyWriter twBase = new LuceneTaxonomyWriter(indexDirBase); + twBase.addCategory(new CategoryPath("a", "0")); + final CategoryPath abPath = new CategoryPath("a", "b"); + twBase.addCategory(abPath); + twBase.commit(); + TaxonomyReader trBase = new LuceneTaxonomyReader(indexDirBase); + + final ChildrenArrays ca1 = trBase.getChildrenArrays(); + + final int abOrd = trBase.getOrdinal(abPath); + final int abYoungChildBase1 = ca1.getYoungestChildArray()[abOrd]; + + for (int i=0; i < 1<<10; i++) { //1024 facets + twBase.addCategory(new CategoryPath("a", "b", Integer.toString(i))); + } + twBase.commit(); + + trBase.refresh(); + + final ChildrenArrays ca2 = trBase.getChildrenArrays(); + final int abYoungChildBase2 = ca2.getYoungestChildArray()[abOrd]; + + for (int retry=0; retry<100; retry++) { + assertConsistentYoungestChild(abPath, abOrd, abYoungChildBase1, abYoungChildBase2, retry); + } + } + + private void assertConsistentYoungestChild(final CategoryPath abPath, + final int abOrd, final int abYoungChildBase1, final int abYoungChildBase2, final int retry) + throws Exception { + SlowRAMDirectory indexDir = new SlowRAMDirectory(-1,null); // no slowness for intialization + TaxonomyWriter tw = new LuceneTaxonomyWriter(indexDir); + tw.addCategory(new CategoryPath("a", "0")); + tw.addCategory(abPath); + tw.commit(); + + final TaxonomyReader tr = new LuceneTaxonomyReader(indexDir); + for (int i=0; i < 1<<10; i++) { //1024 facets + final 
CategoryPath cp = new CategoryPath("a", "b", Integer.toString(i)); + tw.addCategory(cp); + assertEquals("Ordinal of "+cp+" must be invalid until Taxonomy Reader was refreshed", TaxonomyReader.INVALID_ORDINAL, tr.getOrdinal(cp)); + } + tw.commit(); + + final boolean[] stop = new boolean[] { false }; + final Throwable[] error = new Throwable[] { null }; + final int retrieval[] = { 0 }; + + Thread thread = new Thread("Child Arrays Verifier") { + @Override + public void run() { + setPriority(1+getPriority()); + try { + while (!stop[0]) { + int lastOrd = tr.getParentArray().length-1; + assertNotNull("path of last-ord "+lastOrd+" is not found!",tr.getPath(lastOrd)); + assertChildrenArrays(tr.getChildrenArrays(),retry,retrieval[0]++); + } + } catch (Throwable e) { + error[0] = e; + stop[0] = true; + } + } + + private void assertChildrenArrays(ChildrenArrays ca, int retry, int retrieval) { + final int abYoungChild = ca.getYoungestChildArray()[abOrd]; + assertTrue( + "Retry "+retry+": retrieval: "+retrieval+": wrong youngest child for category "+abPath+" (ord="+abOrd+ + ") - must be either "+abYoungChildBase1+" or "+abYoungChildBase2+" but was: "+abYoungChild, + abYoungChildBase1==abYoungChild || + abYoungChildBase2==ca.getYoungestChildArray()[abOrd]); + } + }; + thread.start(); + + indexDir.setSleepMillis(1); // some delay for refresh + tr.refresh(); + + stop[0] = true; + thread.join(); + assertNull("Unexpcted exception at retry "+retry+" retrieval "+retrieval[0]+": \n"+stackTraceStr(error[0]), error[0]); + + tw.close(); + tr.close(); + } + + /** Grab the stack trace into a string since the exception was thrown in a thread and we want the assert + * outside the thread to show the stack trace in case of failure. */ + private String stackTraceStr(final Throwable error) { + if (error == null) { + return ""; + } + StringWriter sw = new StringWriter(); + PrintWriter pw = new PrintWriter(sw); + error.printStackTrace(pw); + pw.close(); + return sw.toString(); + } + + /** Test that if separate reader and writer objects are opened, new + categories written into the writer are available to a reader only + after a commit(). + Note that this test obviously doesn't cover all the different + concurrency scenarios, all different methods, and so on. We may + want to write more tests of this sort. + + This test simulates what would happen when there are two separate + processes, one doing indexing, and the other searching, and each opens + its own object (with obviously no connection between the objects) using + the same disk files. Note, though, that this test does not test what + happens when the two processes do their actual work at exactly the same + time. + It also doesn't test multi-threading. 
+ */ + @Test + public void testSeparateReaderAndWriter() throws Exception { + Directory indexDir = new RAMDirectory(); + TaxonomyWriter tw = new LuceneTaxonomyWriter(indexDir); + tw.commit(); + TaxonomyReader tr = new LuceneTaxonomyReader(indexDir); + + int author = 1; + + // getParent() and getSize() test: + try { + tr.getParent(author); + fail("Initially, getParent for "+author+" should throw exception"); + } catch (ArrayIndexOutOfBoundsException e) { + // ok + } + assertEquals(1, tr.getSize()); // the empty taxonomy has size 1 (the root) + tw.addCategory(new CategoryPath("Author")); + try { + tr.getParent(author); + fail("Before commit() and refresh(), getParent for "+author+" should still throw exception"); + } catch (ArrayIndexOutOfBoundsException e) { + // ok + } + assertEquals(1, tr.getSize()); // still root only... + tr.refresh(); // this is not enough, because tw.commit() hasn't been done yet + try { + tr.getParent(author); + fail("Before commit() and refresh(), getParent for "+author+" should still throw exception"); + } catch (ArrayIndexOutOfBoundsException e) { + // ok + } + assertEquals(1, tr.getSize()); // still root only... + tw.commit(); + try { + tr.getParent(author); + fail("Before refresh(), getParent for "+author+" should still throw exception"); + } catch (ArrayIndexOutOfBoundsException e) { + // ok + } + assertEquals(1, tr.getSize()); // still root only... + tr.refresh(); + try { + assertEquals(TaxonomyReader.ROOT_ORDINAL, tr.getParent(author)); + // ok + } catch (ArrayIndexOutOfBoundsException e) { + fail("After category addition, commit() and refresh(), getParent for "+author+" should NOT throw exception"); + } + assertEquals(2, tr.getSize()); // finally, see there are two categories + + // now, add another category, and verify that after commit and refresh + // the parent of this category is correct (this requires the reader + // to correctly update its prefetched parent vector), and that the + // old information also wasn't ruined: + tw.addCategory(new CategoryPath("Author", "Richard Dawkins")); + int dawkins = 2; + tw.commit(); + tr.refresh(); + assertEquals(author, tr.getParent(dawkins)); + assertEquals(TaxonomyReader.ROOT_ORDINAL, tr.getParent(author)); + assertEquals(TaxonomyReader.INVALID_ORDINAL, tr.getParent(TaxonomyReader.ROOT_ORDINAL)); + assertEquals(3, tr.getSize()); + tw.close(); + tr.close(); + } + + @Test + public void testSeparateReaderAndWriter2() throws Exception { + Directory indexDir = new RAMDirectory(); + TaxonomyWriter tw = new LuceneTaxonomyWriter(indexDir); + tw.commit(); + TaxonomyReader tr = new LuceneTaxonomyReader(indexDir); + + // Test getOrdinal(): + CategoryPath author = new CategoryPath("Author"); + + assertEquals(1, tr.getSize()); // the empty taxonomy has size 1 (the root) + assertEquals(TaxonomyReader.INVALID_ORDINAL, tr.getOrdinal(author)); + tw.addCategory(author); + // before commit and refresh, no change: + assertEquals(TaxonomyReader.INVALID_ORDINAL, tr.getOrdinal(author)); + assertEquals(1, tr.getSize()); // still root only... + tr.refresh(); // this is not enough, because tw.commit() hasn't been done yet + assertEquals(TaxonomyReader.INVALID_ORDINAL, tr.getOrdinal(author)); + assertEquals(1, tr.getSize()); // still root only... + tw.commit(); + // still not enough before refresh: + assertEquals(TaxonomyReader.INVALID_ORDINAL, tr.getOrdinal(author)); + assertEquals(1, tr.getSize()); // still root only... 
+ tr.refresh(); // finally
+ assertEquals(1, tr.getOrdinal(author));
+ assertEquals(2, tr.getSize()); // now root plus the Author category
+ tw.close();
+ tr.close();
+ }
+
+ /**
+ * Test what happens if we try to write to a locked taxonomy writer,
+ * and see that we can unlock it and continue.
+ */
+ @Test
+ public void testWriterLock() throws Exception {
+ Directory indexDir = new RAMDirectory();
+ TaxonomyWriter tw = new LuceneTaxonomyWriter(indexDir);
+ tw.addCategory(new CategoryPath("hi", "there"));
+ tw.commit();
+ // We deliberately do not close the writer now; we keep it open and
+ // locked.
+ // Verify that the writer worked:
+ TaxonomyReader tr = new LuceneTaxonomyReader(indexDir);
+ assertEquals(2, tr.getOrdinal(new CategoryPath("hi", "there")));
+ // Try to open a second writer, with the first one locking the directory.
+ // We expect to get a LockObtainFailedException.
+ try {
+ new LuceneTaxonomyWriter(indexDir);
+ fail("should have failed to write in locked directory");
+ } catch (LockObtainFailedException e) {
+ // this is what we expect to happen.
+ }
+ // Remove the lock, and now the open should succeed, and we can
+ // write to the new writer.
+ LuceneTaxonomyWriter.unlock(indexDir);
+ TaxonomyWriter tw2 = new LuceneTaxonomyWriter(indexDir);
+ tw2.addCategory(new CategoryPath("hey"));
+ tw2.close();
+ // See that the writer indeed wrote:
+ tr.refresh();
+ assertEquals(3, tr.getOrdinal(new CategoryPath("hey")));
+ tr.close();
+ }
+
+ /**
+ * fillTaxonomyCheckPaths adds the categories in the categories[] array,
+ * and asserts that the additions return exactly the paths specified in
+ * expectedPaths[]. This is the same as fillTaxonomy(), but it also checks
+ * the correctness of getParent(), not just addCategory().
+ * Note that this assumes that fillTaxonomyCheckPaths() is called on an empty
+ * taxonomy index. Calling it after something else was already added to the
+ * taxonomy index will cause this method to fail.
+ */ + public static void fillTaxonomyCheckPaths(TaxonomyWriter tw) throws IOException { + for (int i = 0; i < categories.length; i++) { + int ordinal = tw.addCategory(new CategoryPath(categories[i])); + int expectedOrdinal = expectedPaths[i][expectedPaths[i].length-1]; + if (ordinal!=expectedOrdinal) { + fail("For category "+showcat(categories[i])+" expected ordinal "+ + expectedOrdinal+", but got "+ordinal); + } + for (int j=expectedPaths[i].length-2; j>=0; j--) { + ordinal = tw.getParent(ordinal); + expectedOrdinal = expectedPaths[i][j]; + if (ordinal!=expectedOrdinal) { + fail("For category "+showcat(categories[i])+" expected ancestor level "+ + (expectedPaths[i].length-1-j)+" was "+expectedOrdinal+ + ", but got "+ordinal); + } + } + } + } + + // After fillTaxonomy returned successfully, checkPaths() checks that + // the getParent() calls return as expected, from the table + public static void checkPaths(TaxonomyWriter tw) throws IOException { + for (int i = 0; i < categories.length; i++) { + int ordinal = expectedPaths[i][expectedPaths[i].length-1]; + for (int j=expectedPaths[i].length-2; j>=0; j--) { + ordinal = tw.getParent(ordinal); + int expectedOrdinal = expectedPaths[i][j]; + if (ordinal!=expectedOrdinal) { + fail("For category "+showcat(categories[i])+" expected ancestor level "+ + (expectedPaths[i].length-1-j)+" was "+expectedOrdinal+ + ", but got "+ordinal); + } + } + assertEquals(TaxonomyReader.ROOT_ORDINAL, tw.getParent(expectedPaths[i][0])); + } + assertEquals(TaxonomyReader.INVALID_ORDINAL, tw.getParent(TaxonomyReader.ROOT_ORDINAL)); + } + + /** + * Basic test for TaxonomyWriter.getParent(). This is similar to testWriter + * above, except we also check the parents of the added categories, not just + * the categories themselves. + */ + @Test + public void testWriterCheckPaths() throws Exception { + Directory indexDir = new RAMDirectory(); + TaxonomyWriter tw = new LuceneTaxonomyWriter(indexDir); + fillTaxonomyCheckPaths(tw); + // Also check TaxonomyWriter.getSize() - see that the taxonomy's size + // is what we expect it to be. + assertEquals(expectedCategories.length, tw.getSize()); + tw.close(); + } + + /** + * testWriterCheckPaths2 is the path-checking variant of testWriterTwice + * and testWriterTwice2. After adding all the categories, we add them again, + * and see that we get the same old ids and paths. We repeat the path checking + * yet again after closing and opening the index for writing again - to see + * that the reading of existing data from disk works as well. + */ + @Test + public void testWriterCheckPaths2() throws Exception { + Directory indexDir = new RAMDirectory(); + TaxonomyWriter tw = new LuceneTaxonomyWriter(indexDir); + fillTaxonomy(tw); + checkPaths(tw); + fillTaxonomy(tw); + checkPaths(tw); + tw.close(); + + tw = new LuceneTaxonomyWriter(indexDir); + checkPaths(tw); + fillTaxonomy(tw); + checkPaths(tw); + tw.close(); + } + +// TODO (Facet): test multiple readers, one writer. Have the multiple readers +// using the same object (simulating threads) or different objects +// (simulating processes). 
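Note on the tests above: the visibility protocol they keep exercising can be condensed into a short sketch. It reuses only calls already shown in this file and is illustrative, not part of the patch: a separately opened reader sees new categories only after both a writer commit() and a reader refresh().

    Directory dir = new RAMDirectory();
    TaxonomyWriter tw = new LuceneTaxonomyWriter(dir);
    tw.commit();                                  // creates the taxonomy (root only, size == 1)
    TaxonomyReader tr = new LuceneTaxonomyReader(dir);

    tw.addCategory(new CategoryPath("Author"));   // not yet visible to tr
    // tr.getOrdinal(new CategoryPath("Author")) == TaxonomyReader.INVALID_ORDINAL here
    tw.commit();                                  // persist the addition
    tr.refresh();                                 // pick it up on the reader side
    int ord = tr.getOrdinal(new CategoryPath("Author"));  // now a valid ordinal
    int parent = tr.getParent(ord);               // == TaxonomyReader.ROOT_ORDINAL

    tw.close();
    tr.close();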
+} diff --git a/modules/facet/src/test/org/apache/lucene/facet/taxonomy/lucene/TestAddTaxonomies.java b/modules/facet/src/test/org/apache/lucene/facet/taxonomy/lucene/TestAddTaxonomies.java new file mode 100644 index 00000000000..0c06699b11e --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/taxonomy/lucene/TestAddTaxonomies.java @@ -0,0 +1,234 @@ +package org.apache.lucene.facet.taxonomy.lucene; + +import java.io.File; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyReader; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter.DiskOrdinalMap; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter.MemoryOrdinalMap; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter.OrdinalMap; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class TestAddTaxonomies extends LuceneTestCase { + + @Test + public void test1() throws Exception { + Directory dir1 = new RAMDirectory(); + LuceneTaxonomyWriter tw1 = new LuceneTaxonomyWriter(dir1); + tw1.addCategory(new CategoryPath("Author", "Mark Twain")); + tw1.addCategory(new CategoryPath("Animals", "Dog")); + Directory dir2 = new RAMDirectory(); + LuceneTaxonomyWriter tw2 = new LuceneTaxonomyWriter(dir2); + tw2.addCategory(new CategoryPath("Author", "Rob Pike")); + tw2.addCategory(new CategoryPath("Aardvarks", "Bob")); + tw2.close(); + Directory dir3 = new RAMDirectory(); + LuceneTaxonomyWriter tw3 = new LuceneTaxonomyWriter(dir3); + tw3.addCategory(new CategoryPath("Author", "Zebra Smith")); + tw3.addCategory(new CategoryPath("Aardvarks", "Bob")); + tw3.addCategory(new CategoryPath("Aardvarks", "Aaron")); + tw3.close(); + + MemoryOrdinalMap[] maps = new MemoryOrdinalMap[2]; + maps[0] = new MemoryOrdinalMap(); + maps[1] = new MemoryOrdinalMap(); + + tw1.addTaxonomies(new Directory[] { dir2, dir3 }, maps); + tw1.close(); + + TaxonomyReader tr = new LuceneTaxonomyReader(dir1); + + // Test that the merged taxonomy now contains what we expect: + // First all the categories of the original taxonomy, in their original order: + assertEquals(tr.getPath(0).toString(), ""); + assertEquals(tr.getPath(1).toString(), "Author"); + assertEquals(tr.getPath(2).toString(), "Author/Mark Twain"); + assertEquals(tr.getPath(3).toString(), "Animals"); + assertEquals(tr.getPath(4).toString(), "Animals/Dog"); + // Then the categories new in the new taxonomy, in alphabetical order: + assertEquals(tr.getPath(5).toString(), "Aardvarks"); + assertEquals(tr.getPath(6).toString(), "Aardvarks/Aaron"); + assertEquals(tr.getPath(7).toString(), "Aardvarks/Bob"); + assertEquals(tr.getPath(8).toString(), "Author/Rob Pike"); + assertEquals(tr.getPath(9).toString(), "Author/Zebra Smith"); + assertEquals(tr.getSize(), 10); + + // Test that the maps contain what we expect + int[] map0 = maps[0].getMap(); + assertEquals(5, map0.length); + assertEquals(0, map0[0]); + assertEquals(1, map0[1]); + assertEquals(8, map0[2]); + assertEquals(5, map0[3]); + assertEquals(7, map0[4]); + + int[] map1 = maps[1].getMap(); + assertEquals(6, map1.length); + assertEquals(0, map1[0]); + assertEquals(1, map1[1]); + assertEquals(9, map1[2]); + assertEquals(5, map1[3]); + assertEquals(7, map1[4]); + assertEquals(6, map1[5]); + } + + // A more comprehensive and big random test. 
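Before the larger randomized test below, the merge flow that test1 above pins down can be summarized as follows (an illustrative sketch reusing only the calls from test1; not part of the patch): addTaxonomies() folds the categories of the given directories into the target taxonomy and fills one OrdinalMap per source, so ordinals recorded against a source taxonomy can be translated to the merged one.

    LuceneTaxonomyWriter target = new LuceneTaxonomyWriter(dir1);
    MemoryOrdinalMap[] maps = { new MemoryOrdinalMap(), new MemoryOrdinalMap() };
    target.addTaxonomies(new Directory[] { dir2, dir3 }, maps);
    target.close();

    int[] map0 = maps[0].getMap();
    // map0[ordinalInDir2] is the ordinal of the same category in the merged taxonomy;
    // e.g. in test1, "Author/Rob Pike" (ordinal 2 in dir2) becomes ordinal 8 after the merge.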
+ @Test + public void testbig() throws Exception { + dotest(2, 1000, 5000, false); + dotest(10, 10000, 100, false); + dotest(50, 20, 100, false); + dotest(10, 1000, 10000, false); + dotest(50, 20, 10000, false); + dotest(1, 20, 10000, false); + dotest(10, 1, 10000, false); + dotest(10, 1000, 20000, true); + } + + private void dotest(int ntaxonomies, int ncats, int range, boolean disk) throws Exception { + Directory dirs[] = new Directory[ntaxonomies]; + Directory copydirs[] = new Directory[ntaxonomies]; + + for (int i=0; i1) { + for (int i=0; i= copytr.getSize()); + } else { + assertEquals(copytr.getSize(), tr.getSize()); + } + for (int j=0; j copytr.getSize()) { + String prev = tr.getPath(copytr.getSize()).toString(); + for (int j=copytr.getSize()+1; j openReaders = new HashSet(); + + int iwriter=0; + Set openWriters = new HashSet(); + + LeakChecker() { } + + public LuceneTaxonomyWriter openWriter(Directory dir) throws CorruptIndexException, LockObtainFailedException, IOException { + return new InstrumentedTaxonomyWriter(dir); + } + + public LuceneTaxonomyReader openReader(Directory dir) throws CorruptIndexException, LockObtainFailedException, IOException { + return new InstrumentedTaxonomyReader(dir); + } + + public int nopen() { + int ret=0; + for (int i: openReaders) { + System.err.println("reader "+i+" still open"); + ret++; + } + for (int i: openWriters) { + System.err.println("writer "+i+" still open"); + ret++; + } + return ret; + } + + private class InstrumentedTaxonomyWriter extends LuceneTaxonomyWriter { + public InstrumentedTaxonomyWriter(Directory dir) throws CorruptIndexException, LockObtainFailedException, IOException { + super(dir); + } + @Override + protected IndexReader openReader() throws IOException { + return new InstrumentedIndexReader(super.openReader()); + } + @Override + protected void openLuceneIndex (Directory directory, OpenMode openMode) + throws CorruptIndexException, LockObtainFailedException, IOException { + indexWriter = new InstrumentedIndexWriter(directory, + new IndexWriterConfig(TEST_VERSION_CURRENT, new KeywordAnalyzer()) + .setOpenMode(openMode)); + } + + } + + private class InstrumentedTaxonomyReader extends LuceneTaxonomyReader { + public InstrumentedTaxonomyReader(Directory dir) throws CorruptIndexException, LockObtainFailedException, IOException { + super(dir); + } + @Override + protected IndexReader openIndexReader(Directory dir) throws CorruptIndexException, IOException { + return new InstrumentedIndexReader(IndexReader.open(dir,true)); + } + + } + + private class InstrumentedIndexReader extends FilterIndexReader { + int mynum; + public InstrumentedIndexReader(IndexReader in) { + super(in); + this.in = in; + mynum = ireader++; + openReaders.add(mynum); + // System.err.println("opened "+mynum); + } + @Override + public synchronized IndexReader reopen() throws CorruptIndexException, IOException { + IndexReader n = in.reopen(); + if (n==in) { + return this; + } + return new InstrumentedIndexReader(n); + } + + // Unfortunately, IndexReader.close() is marked final so we can't + // change it! Fortunately, close() calls (if the object wasn't + // already closed) doClose() so we can override it to do our thing - + // just like FilterIndexReader does. + @Override + public void doClose() throws IOException { + in.close(); + if (!openReaders.contains(mynum)) { // probably can't happen... 
+ fail("Reader #"+mynum+" was closed twice!"); + } + openReaders.remove(mynum); + // System.err.println("closed "+mynum); + } + } + private class InstrumentedIndexWriter extends IndexWriter { + int mynum; + public InstrumentedIndexWriter(Directory d, IndexWriterConfig conf) throws CorruptIndexException, LockObtainFailedException, IOException { + super(d, conf); + mynum = iwriter++; + openWriters.add(mynum); + // System.err.println("openedw "+mynum); + } + + @Override + public void close() throws IOException { + super.close(); + if (!openWriters.contains(mynum)) { // probably can't happen... + fail("Writer #"+mynum+" was closed twice!"); + } + openWriters.remove(mynum); + // System.err.println("closedw "+mynum); + } + } + } +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/taxonomy/lucene/TestLuceneTaxonomyWriter.java b/modules/facet/src/test/org/apache/lucene/facet/taxonomy/lucene/TestLuceneTaxonomyWriter.java new file mode 100644 index 00000000000..7519538b329 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/taxonomy/lucene/TestLuceneTaxonomyWriter.java @@ -0,0 +1,90 @@ +package org.apache.lucene.facet.taxonomy.lucene; + +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.lucene.LuceneTaxonomyWriter; +import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class TestLuceneTaxonomyWriter extends LuceneTestCase { + + // A No-Op TaxonomyWriterCache which always discards all given categories, and + // always returns true in put(), to indicate some cache entries were cleared. + private static class NoOpCache implements TaxonomyWriterCache { + + NoOpCache() { } + + public void close() {} + public int get(CategoryPath categoryPath) { return -1; } + public int get(CategoryPath categoryPath, int length) { return get(categoryPath); } + public boolean put(CategoryPath categoryPath, int ordinal) { return true; } + public boolean put(CategoryPath categoryPath, int prefixLen, int ordinal) { return true; } + public boolean hasRoom(int numberOfEntries) { return false; } + + } + + @Test + public void testCommit() throws Exception { + // Verifies that nothing is committed to the underlying Directory, if + // commit() wasn't called. 
+ Directory dir = new RAMDirectory(); + LuceneTaxonomyWriter ltw = new LuceneTaxonomyWriter(dir, OpenMode.CREATE_OR_APPEND, new NoOpCache()); + assertFalse(IndexReader.indexExists(dir)); + ltw.commit(); // first commit, so that an index will be created + ltw.addCategory(new CategoryPath("a")); + + IndexReader r = IndexReader.open(dir); + assertEquals("No categories should have been committed to the underlying directory", 1, r.numDocs()); + r.close(); + ltw.close(); + dir.close(); + } + + @Test + public void testCommitUserData() throws Exception { + // Verifies that committed data is retrievable + Directory dir = new RAMDirectory(); + LuceneTaxonomyWriter ltw = new LuceneTaxonomyWriter(dir, OpenMode.CREATE_OR_APPEND, new NoOpCache()); + assertFalse(IndexReader.indexExists(dir)); + ltw.commit(); // first commit, so that an index will be created + ltw.addCategory(new CategoryPath("a")); + ltw.addCategory(new CategoryPath("b")); + Map userCommitData = new HashMap(); + userCommitData.put("testing", "1 2 3"); + ltw.commit(userCommitData); + ltw.close(); + IndexReader r = IndexReader.open(dir); + assertEquals("2 categories plus root should have been committed to the underlying directory", 3, r.numDocs()); + Map readUserCommitData = r.getCommitUserData(); + assertTrue("wrong value extracted from commit data", + "1 2 3".equals(readUserCommitData.get("testing"))); + r.close(); + dir.close(); + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/cl2o/TestCharBlockArray.java b/modules/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/cl2o/TestCharBlockArray.java new file mode 100644 index 00000000000..58e11821247 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/cl2o/TestCharBlockArray.java @@ -0,0 +1,92 @@ +package org.apache.lucene.facet.taxonomy.writercache.cl2o; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.taxonomy.writercache.cl2o.CharBlockArray; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
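A short note on the TestLuceneTaxonomyWriter tests above: the cache handed to the writer is a plug-in point, and commit() can carry application metadata. The sketch below reuses only the constructor and calls shown in those tests (NoOpCache is the test's own no-op cache) and is illustrative, not part of the patch.

    Directory dir = new RAMDirectory();
    LuceneTaxonomyWriter ltw =
        new LuceneTaxonomyWriter(dir, OpenMode.CREATE_OR_APPEND, new NoOpCache());
    ltw.addCategory(new CategoryPath("a"));

    Map<String, String> userData = new HashMap<String, String>();
    userData.put("testing", "1 2 3");
    ltw.commit(userData);                         // commit together with user data
    ltw.close();

    IndexReader r = IndexReader.open(dir);
    String value = r.getCommitUserData().get("testing");  // "1 2 3"
    r.close();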
+ */ + +public class TestCharBlockArray extends LuceneTestCase { + + @Test public void testArray() throws Exception { + CharBlockArray array = new CharBlockArray(); + StringBuilder builder = new StringBuilder(); + + final int n = 100 * 1000; + + byte[] buffer = new byte[50]; + + for (int i = 0; i < n; i++) { + random.nextBytes(buffer); + int size = 1 + random.nextInt(50); + + String s = new String(buffer, 0, size); + array.append(s); + builder.append(s); + } + + for (int i = 0; i < n; i++) { + random.nextBytes(buffer); + int size = 1 + random.nextInt(50); + + String s = new String(buffer, 0, size); + array.append((CharSequence)s); + builder.append(s); + } + + for (int i = 0; i < n; i++) { + random.nextBytes(buffer); + int size = 1 + random.nextInt(50); + + String s = new String(buffer, 0, size); + for (int j = 0; j < s.length(); j++) { + array.append(s.charAt(j)); + } + builder.append(s); + } + + assertEqualsInternal("GrowingCharArray<->StringBuilder mismatch.", builder, array); + + File f = new File("GrowingCharArrayTest.tmp"); + BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(f)); + array.flush(out); + out.flush(); + out.close(); + + BufferedInputStream in = new BufferedInputStream(new FileInputStream(f)); + array = CharBlockArray.open(in); + assertEqualsInternal("GrowingCharArray<->StringBuilder mismatch after flush/load.", builder, array); + in.close(); + f.delete(); + } + + private static void assertEqualsInternal(String msg, StringBuilder expected, CharBlockArray actual) { + assertEquals(msg, expected.length(), actual.length()); + for (int i = 0; i < expected.length(); i++) { + assertEquals(msg, expected.charAt(i), actual.charAt(i)); + } + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/cl2o/TestCompactLabelToOrdinal.java b/modules/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/cl2o/TestCompactLabelToOrdinal.java new file mode 100644 index 00000000000..5c432fc4c27 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/cl2o/TestCompactLabelToOrdinal.java @@ -0,0 +1,123 @@ +package org.apache.lucene.facet.taxonomy.writercache.cl2o; + +import java.io.File; +import java.util.HashMap; +import java.util.Map; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.writercache.cl2o.CompactLabelToOrdinal; +import org.apache.lucene.facet.taxonomy.writercache.cl2o.LabelToOrdinal; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
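Note on TestCharBlockArray above: the class under test is an append-only character buffer that can be persisted to a stream and reloaded with identical contents. A minimal sketch follows, using only the calls the test exercises; it is illustrative, not part of the patch, and the temporary file name is arbitrary.

    CharBlockArray chars = new CharBlockArray();
    chars.append("hello");                        // append a String / CharSequence
    chars.append('!');                            // or a single char
    // chars.length() == 6, chars.charAt(0) == 'h'

    File f = new File("chars.tmp");
    BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(f));
    chars.flush(out);                             // persist
    out.close();

    BufferedInputStream in = new BufferedInputStream(new FileInputStream(f));
    CharBlockArray reloaded = CharBlockArray.open(in);  // same length() and charAt() values
    in.close();
    f.delete();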
+ */ + +public class TestCompactLabelToOrdinal extends LuceneTestCase { + + @Test + public void testL2O() throws Exception { + LabelToOrdinal map = new LabelToOrdinalMap(); + + CompactLabelToOrdinal compact = new CompactLabelToOrdinal(2000000, 0.15f, 3); + + final int n = 100 * 1000; + final int numUniqueValues = 50 * 1000; + + String[] uniqueValues = new String[numUniqueValues]; + byte[] buffer = new byte[50]; + + for (int i = 0; i < numUniqueValues;) { + random.nextBytes(buffer); + int size = 1 + random.nextInt(50); + + uniqueValues[i] = new String(buffer, 0, size); + if (uniqueValues[i].indexOf(CompactLabelToOrdinal.TerminatorChar) == -1) { + i++; + } + } + + TEMP_DIR.mkdirs(); + File f = new File(TEMP_DIR, "CompactLabelToOrdinalTest.tmp"); + int flushInterval = 10; + + for (int i = 0; i < n * 10; i++) { + if (i > 0 && i % flushInterval == 0) { + compact.flush(f); + compact = CompactLabelToOrdinal.open(f, 0.15f, 3); + assertTrue(f.delete()); + if (flushInterval < (n / 10)) { + flushInterval *= 10; + } + } + + int index = random.nextInt(numUniqueValues); + CategoryPath label = new CategoryPath(uniqueValues[index], '/'); + + int ord1 = map.getOrdinal(label); + int ord2 = compact.getOrdinal(label); + + //System.err.println(ord1+" "+ord2); + + assertEquals(ord1, ord2); + + if (ord1 == LabelToOrdinal.InvalidOrdinal) { + ord1 = compact.getNextOrdinal(); + + map.addLabel(label, ord1); + compact.addLabel(label, ord1); + } + } + + for (int i = 0; i < numUniqueValues; i++) { + CategoryPath label = new CategoryPath(uniqueValues[i], '/'); + int ord1 = map.getOrdinal(label); + int ord2 = compact.getOrdinal(label); + assertEquals(ord1, ord2); + } + } + + private static class LabelToOrdinalMap extends LabelToOrdinal { + private Map map = new HashMap(); + + LabelToOrdinalMap() { } + + @Override + public void addLabel(CategoryPath label, int ordinal) { + map.put(new CategoryPath(label), ordinal); + } + + @Override + public void addLabel(CategoryPath label, int prefixLen, int ordinal) { + map.put(new CategoryPath(label, prefixLen), ordinal); + } + + @Override + public int getOrdinal(CategoryPath label) { + Integer value = map.get(label); + return (value != null) ? value.intValue() : LabelToOrdinal.InvalidOrdinal; + } + + @Override + public int getOrdinal(CategoryPath label, int prefixLen) { + Integer value = map.get(new CategoryPath(label, prefixLen)); + return (value != null) ? 
value.intValue() : LabelToOrdinal.InvalidOrdinal; + } + + } +} diff --git a/modules/facet/src/test/org/apache/lucene/facet/util/TestScoredDocIDsUtils.java b/modules/facet/src/test/org/apache/lucene/facet/util/TestScoredDocIDsUtils.java new file mode 100644 index 00000000000..8652eed3d46 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/facet/util/TestScoredDocIDsUtils.java @@ -0,0 +1,239 @@ +package org.apache.lucene.facet.util; + +import java.io.IOException; + +import org.apache.lucene.analysis.core.KeywordAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.OpenBitSetDISI; +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.facet.search.ScoredDocIDs; +import org.apache.lucene.facet.search.ScoredDocIDsIterator; +import org.apache.lucene.facet.search.ScoredDocIdCollector; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
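Note on TestCompactLabelToOrdinal above: the test validates CompactLabelToOrdinal against a plain HashMap-backed reference implementation, and the label-to-ordinal protocol it relies on is small. The sketch below reuses only the calls from the test and is illustrative, not part of the patch; the file name is arbitrary.

    CompactLabelToOrdinal l2o = new CompactLabelToOrdinal(2000000, 0.15f, 3);
    CategoryPath label = new CategoryPath("Author/Mark Twain", '/');

    int ord = l2o.getOrdinal(label);
    if (ord == LabelToOrdinal.InvalidOrdinal) {   // unseen label: assign the next free ordinal
      ord = l2o.getNextOrdinal();
      l2o.addLabel(label, ord);
    }

    File f = new File("l2o.tmp");
    l2o.flush(f);                                 // persist the cache
    l2o = CompactLabelToOrdinal.open(f, 0.15f, 3);  // reload; existing labels keep their ordinals
    f.delete();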
+ */ + +public class TestScoredDocIDsUtils extends LuceneTestCase { + + @Test + public void testComplementIterator() throws Exception { + final int n = 100000; + final OpenBitSet bits = new OpenBitSet(n); + for (int i = 0; i < 5 * n; i++) { + bits.flip(random.nextInt(n)); + } + + OpenBitSet verify = new OpenBitSet(n); + verify.or(bits); + + ScoredDocIDs scoredDocIDs = ScoredDocIdsUtils.createScoredDocIds(bits, n); + + IndexReader reader = createReaderWithNDocs(n); + try { + assertEquals(n - verify.cardinality(), ScoredDocIdsUtils.getComplementSet(scoredDocIDs, + reader).size()); + } finally { + reader.close(); + } + } + + @Test + public void testAllDocs() throws Exception { + int maxDoc = 3; + IndexReader reader = createReaderWithNDocs(maxDoc); + try { + ScoredDocIDs all = ScoredDocIdsUtils.createAllDocsScoredDocIDs(reader); + assertEquals("invalid size", maxDoc, all.size()); + ScoredDocIDsIterator iter = all.iterator(); + int doc = 0; + while (iter.next()) { + assertEquals("invalid doc ID: " + iter.getDocID(), doc++, iter.getDocID()); + assertEquals("invalid score: " + iter.getScore(), ScoredDocIDsIterator.DEFAULT_SCORE, iter.getScore(), 0.0f); + } + assertEquals("invalid maxDoc: " + doc, maxDoc, doc); + + DocIdSet docIDs = all.getDocIDs(); + assertTrue("should be cacheable", docIDs.isCacheable()); + DocIdSetIterator docIDsIter = docIDs.iterator(); + assertEquals("nextDoc() hasn't been called yet", -1, docIDsIter.docID()); + assertEquals(0, docIDsIter.nextDoc()); + assertEquals(1, docIDsIter.advance(1)); + // if advance is smaller than current doc, advance to cur+1. + assertEquals(2, docIDsIter.advance(0)); + } finally { + reader.close(); + } + } + + @Test + public void testWithDeletions() throws Exception { + int N_DOCS = 100; + + DocumentFactory docFactory = new DocumentFactory(N_DOCS) { + @Override + public boolean markedDeleted(int docNum) { + return (docNum % 3 == 0 || // every 3rd documents, including first + docNum == numDocs - 1 || // last document + docNum == numDocs / 2 || // 3 consecutive documents in the middle + docNum == 1 + numDocs / 2 || + docNum == 2 + numDocs / 2); + } + + // every 6th document (starting from the 2nd) would contain 'alpha' + @Override + public boolean haveAlpha(int docNum) { + return (docNum % 6 == 1); + } + }; + + IndexReader reader = createReaderWithNDocs(N_DOCS, docFactory); + try { + int numErasedDocs = reader.numDeletedDocs(); + + ScoredDocIDs allDocs = ScoredDocIdsUtils.createAllDocsScoredDocIDs(reader); + ScoredDocIDsIterator it = allDocs.iterator(); + int numIteratedDocs = 0; + while (it.next()) { + numIteratedDocs++; + int docNum = it.getDocID(); + assertFalse( + "Deleted docs must not appear in the allDocsScoredDocIds set", + docFactory.markedDeleted(docNum)); + } + + assertEquals("Wrong number of (live) documents", allDocs.size(), numIteratedDocs); + + assertEquals("Wrong number of (live) documents", N_DOCS + - numErasedDocs, numIteratedDocs); + + // Get all 'alpha' documents + ScoredDocIdCollector collector = ScoredDocIdCollector.create(reader.maxDoc(), false); + Query q = new TermQuery(new Term(DocumentFactory.field, DocumentFactory.alphaTxt)); + new IndexSearcher(reader).search(q, collector); + + ScoredDocIDs scoredDocIds = collector.getScoredDocIDs(); + OpenBitSet resultSet = new OpenBitSetDISI(scoredDocIds.getDocIDs().iterator(), reader.maxDoc()); + + // Getting the complement set of the query result + ScoredDocIDs complementSet = ScoredDocIdsUtils.getComplementSet(scoredDocIds, reader); + + assertEquals("Number of documents in complement 
set mismatch", + reader.numDocs() - scoredDocIds.size(), complementSet.size()); + + // now make sure the documents in the complement set are not deleted + // and not in the original result set + ScoredDocIDsIterator compIterator = complementSet.iterator(); + Bits deleted = MultiFields.getDeletedDocs(reader); + while (compIterator.next()) { + int docNum = compIterator.getDocID(); + assertFalse( + "Complement-Set must not contain deleted documents (doc="+docNum+")", + deleted != null && deleted.get(docNum)); + assertFalse( + "Complement-Set must not contain deleted documents (doc="+docNum+")", + docFactory.markedDeleted(docNum)); + assertFalse( + "Complement-Set must not contain docs from the original set (doc="+docNum+")", + resultSet.fastGet(docNum)); + } + } finally { + reader.close(); + } + } + + /** + * Creates an index with n documents, this method is meant for testing purposes ONLY + * Node that this reader is NOT read-only and document can be deleted. + */ + static IndexReader createReaderWithNDocs(int nDocs) throws IOException { + return createReaderWithNDocs(nDocs, new DocumentFactory(nDocs)); + } + + private static class DocumentFactory { + protected final static String field = "content"; + protected final static String delTxt = "delete"; + protected final static String alphaTxt = "alpha"; + + private final static Field deletionMark = new Field(field, delTxt, Store.NO, Index.NOT_ANALYZED_NO_NORMS); + private final static Field alphaContent = new Field(field, alphaTxt, Store.NO, Index.NOT_ANALYZED_NO_NORMS); + + protected final int numDocs; + + public DocumentFactory(int totalNumDocs) { + this.numDocs = totalNumDocs; + } + + public boolean markedDeleted(int docNum) { + return false; + } + + public Document getDoc(int docNum) { + Document doc = new Document(); + if (markedDeleted(docNum)) { + doc.add(deletionMark); + } + + if (haveAlpha(docNum)) { + doc.add(alphaContent); + } + return doc; + } + + public boolean haveAlpha(int docNum) { + return false; + } + } + + static IndexReader createReaderWithNDocs(int nDocs, DocumentFactory docFactory) throws IOException { + Directory ramDir = new RAMDirectory(); + + // Create the index + IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(TEST_VERSION_CURRENT, new KeywordAnalyzer())); + for (int docNum = 0; docNum < nDocs; docNum++) { + writer.addDocument(docFactory.getDoc(docNum)); + } + writer.commit(); + writer.close(); + + // Delete documents marked for deletion + IndexReader reader = IndexReader.open(ramDir, false); + reader.deleteDocuments(new Term(DocumentFactory.field, DocumentFactory.delTxt)); + reader.close(); + + // Open a fresh read-only reader with the deletions in place + return IndexReader.open(ramDir, true); + } +} diff --git a/modules/facet/src/test/org/apache/lucene/util/SlowRAMDirectory.java b/modules/facet/src/test/org/apache/lucene/util/SlowRAMDirectory.java new file mode 100644 index 00000000000..7fb3a016232 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/util/SlowRAMDirectory.java @@ -0,0 +1,166 @@ +package org.apache.lucene.util; + +import java.io.IOException; +import java.util.Random; + +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.ThreadInterruptedException; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Test utility - slow directory + */ +public class SlowRAMDirectory extends RAMDirectory { + + private static final int IO_SLEEP_THRESHOLD = 50; + + private Random random; + private int sleepMillis; + + public void setSleepMillis(int sleepMillis) { + this.sleepMillis = sleepMillis; + } + + public SlowRAMDirectory(int sleepMillis, Random random) { + this.sleepMillis = sleepMillis; + this.random = random; + } + + @Override + public IndexOutput createOutput(String name) throws IOException { + if (sleepMillis != -1) { + return new SlowIndexOutput(super.createOutput(name)); + } + + return super.createOutput(name); + } + + @Override + public IndexInput openInput(String name) throws IOException { + if (sleepMillis != -1) { + return new SlowIndexInput(super.openInput(name)); + } + return super.openInput(name); + } + + @Override + public IndexInput openInput(String name, int bufferSize) throws IOException { + if (sleepMillis != -1) { + return new SlowIndexInput(super.openInput(name, bufferSize)); + } + return super.openInput(name, bufferSize); + } + + void doSleep(int length) { + int sTime = length<10 ? sleepMillis : (int) (sleepMillis * Math.log(length)); + if (random!=null) { + sTime = random.nextInt(sTime); + } + try { + Thread.sleep(sTime); + } catch (InterruptedException e) { + throw new ThreadInterruptedException(e); + } + } + + /** + * Delegate class to wrap an IndexInput and delay reading bytes by some + * specified time. + */ + private class SlowIndexInput extends IndexInput { + private IndexInput ii; + private int numRead = 0; + + public SlowIndexInput(IndexInput ii) { + this.ii = ii; + } + + @Override + public byte readByte() throws IOException { + if (numRead >= IO_SLEEP_THRESHOLD) { + doSleep(0); + numRead = 0; + } + ++numRead; + return ii.readByte(); + } + + @Override + public void readBytes(byte[] b, int offset, int len) throws IOException { + if (numRead >= IO_SLEEP_THRESHOLD) { + doSleep(len); + numRead = 0; + } + numRead += len; + ii.readBytes(b, offset, len); + } + + @Override public Object clone() { return ii.clone(); } + @Override public void close() throws IOException { ii.close(); } + @Override public boolean equals(Object o) { return ii.equals(o); } + @Override public long getFilePointer() { return ii.getFilePointer(); } + @Override public int hashCode() { return ii.hashCode(); } + @Override public long length() { return ii.length(); } + @Override public void seek(long pos) throws IOException { ii.seek(pos); } + + } + + /** + * Delegate class to wrap an IndexOutput and delay writing bytes by some + * specified time. 
+ */ + private class SlowIndexOutput extends IndexOutput { + + private IndexOutput io; + private int numWrote; + + public SlowIndexOutput(IndexOutput io) { + this.io = io; + } + + @Override + public void writeByte(byte b) throws IOException { + if (numWrote >= IO_SLEEP_THRESHOLD) { + doSleep(0); + numWrote = 0; + } + ++numWrote; + io.writeByte(b); + } + + @Override + public void writeBytes(byte[] b, int offset, int length) throws IOException { + if (numWrote >= IO_SLEEP_THRESHOLD) { + doSleep(length); + numWrote = 0; + } + numWrote += length; + io.writeBytes(b, offset, length); + } + + @Override public void close() throws IOException { io.close(); } + @Override public void flush() throws IOException { io.flush(); } + @Override public long getFilePointer() { return io.getFilePointer(); } + @Override public long length() throws IOException { return io.length(); } + @Override public void seek(long pos) throws IOException { io.seek(pos); } + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/util/UnsafeByteArrayInputStreamTest.java b/modules/facet/src/test/org/apache/lucene/util/UnsafeByteArrayInputStreamTest.java new file mode 100644 index 00000000000..ff13bd065ef --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/util/UnsafeByteArrayInputStreamTest.java @@ -0,0 +1,139 @@ +package org.apache.lucene.util; + +import java.io.IOException; +import java.util.Arrays; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.UnsafeByteArrayInputStream; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
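Note on SlowRAMDirectory above: it wraps RAMDirectory I/O so that roughly every 50th read or write sleeps, which is used earlier in this patch to provoke a refresh() while a verifier thread is reading the taxonomy. A typical-use sketch follows; it is illustrative only, and the meaning of -1 (leave I/O unwrapped) is taken from the code above.

    SlowRAMDirectory indexDir = new SlowRAMDirectory(-1, null);  // no delay while building
    // ... populate the taxonomy / index ...
    indexDir.setSleepMillis(1);                                  // slow down subsequent I/O
    // ... run the operation whose timing window is under test, e.g. TaxonomyReader.refresh() ...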
+ */ + +public class UnsafeByteArrayInputStreamTest extends LuceneTestCase { + + @Test + public void testSimple() throws IOException { + int length = 256; + byte[] buffer = new byte[length]; + for (int i = 0; i < length; ++i) { + buffer[i] = (byte) i; + } + byte[] result = new byte[buffer.length]; + UnsafeByteArrayInputStream ubais = new UnsafeByteArrayInputStream(buffer); + + int index = 0; + int by = ubais.read(); + while (by >= 0) { + result[index++] = (byte) (by); + by = ubais.read(); + } + + assertEquals(length, index); + assertTrue(Arrays.equals(buffer, result)); + } + + @Test + public void testStartPos() throws IOException { + int length = 100; + byte[] buffer = new byte[length]; + for (int i = 0; i < length; ++i) { + buffer[i] = (byte) i; + } + int startPos = 5; + byte[] result = new byte[buffer.length]; + UnsafeByteArrayInputStream ubais = new UnsafeByteArrayInputStream(buffer, startPos, length); + + int index = 0; + int by = ubais.read(); + while (by >= 0) { + result[index++] = (byte) (by); + by = ubais.read(); + } + + assertEquals(length - startPos, index); + for (int i = startPos; i < length; i++) { + assertEquals(buffer[i], result[i - startPos]); + } + } + + @Test + public void testReinit() throws IOException { + int length = 100; + byte[] buffer = new byte[length]; + for (int i = 0; i < length; ++i) { + buffer[i] = (byte) i; + } + byte[] result = new byte[buffer.length]; + UnsafeByteArrayInputStream ubais = new UnsafeByteArrayInputStream(buffer); + + int index = 0; + int by = ubais.read(); + while (by >= 0) { + result[index++] = (byte) (by); + by = ubais.read(); + } + + assertEquals(length, index); + assertTrue(Arrays.equals(buffer, result)); + + int length2 = 50; + byte[] buffer2 = new byte[length2]; + for (int i = 0; i < length2; ++i) { + buffer2[i] = (byte) (90 + i); + } + byte[] result2 = new byte[buffer2.length]; + ubais.reInit(buffer2); + + int index2 = 0; + int by2 = ubais.read(); + while (by2 >= 0) { + result2[index2++] = (byte) (by2); + by2 = ubais.read(); + } + + assertEquals(length2, index2); + assertTrue(Arrays.equals(buffer2, result2)); + } + + @Test + public void testDefaultCtor() throws Exception { + UnsafeByteArrayInputStream ubais = new UnsafeByteArrayInputStream(); + assertEquals(0, ubais.available()); + assertEquals(-1, ubais.read()); + } + + @Test + public void testMark() throws Exception { + byte[] bytes = new byte[] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + UnsafeByteArrayInputStream ubais = new UnsafeByteArrayInputStream(bytes); + assertTrue(ubais.markSupported()); + int markIndex = 3; + // Advance the index + for (int i = 0; i < markIndex; i++) { + ubais.read(); + } + ubais.mark(markIndex); + for (int i = markIndex; i < bytes.length; i++) { + ubais.read(); + } + ubais.reset(); + assertEquals(bytes.length - markIndex, ubais.available()); + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/util/UnsafeByteArrayOutputStreamTest.java b/modules/facet/src/test/org/apache/lucene/util/UnsafeByteArrayOutputStreamTest.java new file mode 100644 index 00000000000..a472c44c08b --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/util/UnsafeByteArrayOutputStreamTest.java @@ -0,0 +1,211 @@ +package org.apache.lucene.util; + +import java.io.IOException; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.UnsafeByteArrayOutputStream; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
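Note on UnsafeByteArrayInputStreamTest above: the stream reads directly from the supplied byte[], optionally from a start offset, supports mark()/reset(), and can be re-pointed at another buffer with reInit() instead of allocating a new stream. A minimal sketch, reusing only the calls from the test; illustrative, not part of the patch.

    byte[] first = { 1, 2, 3 };
    UnsafeByteArrayInputStream in = new UnsafeByteArrayInputStream(first);
    int b;
    while ((b = in.read()) >= 0) { /* reads 1, 2, 3; then read() returns -1 */ }

    byte[] second = { 4, 5 };
    in.reInit(second);           // reuse the same stream object for another buffer
    in.mark(0);                  // remember the current position
    in.read();                   // 4
    in.reset();                  // back to the mark; available() == 2 again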
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class UnsafeByteArrayOutputStreamTest extends LuceneTestCase { + + @Test + public void testSimpleWrite() throws IOException { + int length = 100; + byte[] buffer = new byte[length]; + UnsafeByteArrayOutputStream ubaos = new UnsafeByteArrayOutputStream(buffer); + + for (int i = 0; i < 100; i++) { + ubaos.write((byte) i); + } + + byte[] result = ubaos.toByteArray(); + + assertEquals(length, ubaos.length()); + + for (int j = 0; j < length; ++j) { + assertEquals(result[j], j); + } + } + + @Test + public void testArrayWrite() throws IOException { + int length = 100; + byte[] buffer = new byte[length]; + UnsafeByteArrayOutputStream ubaos = new UnsafeByteArrayOutputStream(buffer); + + for (int i = 0; i < 100; i++) { + ubaos.write((byte) i); + } + + int length2 = 10; + byte[] buffer2 = new byte[length2]; + for (int i = 0; i < length2; i++) { + buffer2[i] = (byte) (8 + i); + } + + ubaos.write(buffer2); + + byte[] result = ubaos.toByteArray(); + + assertEquals(length + length2, ubaos.length()); + + for (int j = 0; j < length; ++j) { + assertEquals(result[j], j); + } + for (int j = 0; j < length2; ++j) { + assertEquals(result[j + length], buffer2[j]); + } + } + + @Test + public void testArrayWriteStartNotZero() throws IOException { + int length = 100; + byte[] buffer = new byte[length]; + UnsafeByteArrayOutputStream ubaos = new UnsafeByteArrayOutputStream(buffer); + + for (int i = 0; i < 100; i++) { + ubaos.write((byte) i); + } + + int length2 = 1000; + byte[] buffer2 = new byte[length2]; + for (int i = 0; i < length2; i++) { + buffer2[i] = (byte) (8 + i); + } + + int length3 = 5; + int start = 2; + ubaos.write(buffer2, start, length3); + + byte[] result = ubaos.toByteArray(); + + assertEquals(length + length3, ubaos.length()); + + for (int j = 0; j < length; ++j) { + assertEquals(result[j], j); + } + for (int j = 0; j < length3; ++j) { + assertEquals(result[j + length], buffer2[j + start]); + } + } + + @Test + public void testBufferGrow() throws IOException { + int length = 100; + byte[] buffer = new byte[length / 10]; + UnsafeByteArrayOutputStream ubaos = new UnsafeByteArrayOutputStream(buffer); + + for (int i = 0; i < length; i++) { + ubaos.write((byte) i); + } + + byte[] result = ubaos.toByteArray(); + + assertEquals(length, ubaos.length()); + + for (int j = 0; j < length; ++j) { + assertEquals(result[j], j); + } + + buffer = ubaos.toByteArray(); + + int length2 = 10; + byte[] buffer2 = new byte[length2]; + for (int i = 0; i < length2; i++) { + buffer2[i] = (byte) (8 + i); + } + + ubaos.reInit(buffer2); + for (int i = 0; i < length2; i++) { + ubaos.write(7 + i); + } + + byte[] result2 = ubaos.toByteArray(); + + assertEquals(length2, ubaos.length()); + + for (int j = 0; j < length2; ++j) { + assertEquals(result2[j], j + 7); + } + + for (int i = 0; i < length; i++) { + assertEquals(buffer[i], i); + } 
+ } + + @Test + public void testStartPos() throws Exception { + byte[] buf = new byte[10]; + for (int i = 0; i < buf.length; i++) { + buf[i] = (byte) i; + } + + int startPos = 3; + UnsafeByteArrayOutputStream ubaos = new UnsafeByteArrayOutputStream(buf, startPos); + int numValues = 5; + for (int i = 0; i < numValues; i++) { + ubaos.write((i + 1) * 2); + } + + // the length of the buffer should be whatever was written after startPos + // and before that. + assertEquals("invalid buffer length", startPos + numValues, ubaos.length()); + + assertEquals("invalid startPos", startPos, ubaos.getStartPos()); + + byte[] bytes = ubaos.toByteArray(); + for (int i = 0; i < startPos; i++) { + assertEquals(i, bytes[i]); + } + + for (int i = startPos, j = 0; j < numValues; i++, j++) { + assertEquals((j + 1) * 2, bytes[i]); + } + + for (int i = startPos + numValues; i < buf.length; i++) { + assertEquals(i, bytes[i]); + } + + } + + @Test + public void testDefaultCtor() throws Exception { + UnsafeByteArrayOutputStream ubaos = new UnsafeByteArrayOutputStream(); + int numValues = 5; + for (int i = 0; i < numValues; i++) { + ubaos.write(i); + } + + assertEquals("invalid buffer length", numValues, ubaos.length()); + + byte[] bytes = ubaos.toByteArray(); + for (int i = 0; i < numValues; i++) { + assertEquals(i, bytes[i]); + } + } + + @Test(expected=IllegalArgumentException.class) + public void testIllegalBufferSize() throws Exception { + UnsafeByteArrayOutputStream ubaos = new UnsafeByteArrayOutputStream(); + ubaos.reInit(new byte[0]); + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/util/Vint8Test.java b/modules/facet/src/test/org/apache/lucene/util/Vint8Test.java new file mode 100644 index 00000000000..905f03586ed --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/util/Vint8Test.java @@ -0,0 +1,141 @@ +package org.apache.lucene.util; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.Vint8; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Tests the {@link Vint8} class. + */ +public class Vint8Test extends LuceneTestCase { + + /** + * Tests the position wrapper. + * @throws Exception For any reason. 
+ */ + @Test + public void testPosition() throws Exception { + Vint8.Position pos = new Vint8.Position(); + assertEquals(0, pos.pos); + pos = new Vint8.Position(12345); + assertEquals(12345, pos.pos); + } + + private static int[] testValues = { + -1000000000, + -1, 0, (1 << 7) - 1, 1 << 7, (1 << 14) - 1, 1 << 14, + (1 << 21) - 1, 1 << 21, (1 << 28) - 1, 1 << 28 + }; + private static int[] bytesNeededTestValues = { + 5, 5, 1, 1, 2, 2, 3, 3, 4, 4, 5 + }; + + /** + * Tests the {@code bytesNeeded} method. + */ + @Test + public void testBytesNeeded() { + assertEquals(5, Vint8.MAXIMUM_BYTES_NEEDED); + for (int j = 0; j < testValues.length; j++) { + assertEquals(bytesNeededTestValues[j], Vint8.bytesNeeded(testValues[j])); + } + } + + /** + * Tests encoding and decoding to and from a stream. + */ + @Test + public void testStreamEncodingAndDecoding() throws IOException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(256); + int expectedSize = 0; + for (int j = 0; j < testValues.length; j++) { + Vint8.encode(testValues[j], baos); + expectedSize += bytesNeededTestValues[j]; + } + assertEquals(expectedSize, baos.size()); + ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); + for (int j = 0; j < testValues.length; j++) { + assertEquals(testValues[j], Vint8.decode(bais)); + } + assertEquals(0, bais.available()); + } + + /** + * Tests encoding and decoding to and from an array. + */ + @Test + public void testArrayEncodingAndDecoding() throws IOException { + byte[] byteArray = new byte[256]; + int position = 0, expectedSize = 0; + for (int j = 0; j < testValues.length; j++) { + position += Vint8.encode(testValues[j], byteArray, position); + expectedSize += bytesNeededTestValues[j]; + } + assertEquals(expectedSize, position); + Vint8.Position pos = new Vint8.Position(); + for (int j = 0; j < testValues.length; j++) { + assertEquals(testValues[j], Vint8.decode(byteArray, pos)); + } + assertEquals(expectedSize, pos.pos); + } + + /** + * The result of encoding the test values with the current algorithm. If these + * values are changed to match an algorithm change, compatibility with legacy + * data will be broken. + */ + private static final byte[] encodedTestValues = { + -4, -93, -108, -20, 0, -1, -1, -1, -1, 127, 0, 127, -127, 0, -1, 127, + -127, -128, 0, -1, -1, 127, -127, -128, -128, 0, -1, -1, -1, 127, -127, + -128, -128, -128, 0 + }; + + /** + * Tests algorithm. 
+ */ + @Test + public void testLegacyCompatibility() throws IOException { + /* To generate the encoded test values: + byte[] byteArray = new byte[256]; + int position = 0, expectedSize = 0; + for (int j = 0; j < testValues.length; j++) { + position += Vint8.encode(testValues[j], byteArray, position); + expectedSize += bytesNeededTestValues[j]; + } + assertEquals(expectedSize, position); + Vint8.Position pos = new Vint8.Position(); + for (int j = 0; j < expectedSize; j++) { + System.out.print(byteArray[j] + ", "); + } + System.out.flush(); + pos.pos = 0; + */ + Vint8.Position pos = new Vint8.Position(); + for (int j = 0; j < testValues.length; j++) { + assertEquals(testValues[j], Vint8.decode(encodedTestValues, pos)); + } + } + +} // end class Vint8Test diff --git a/modules/facet/src/test/org/apache/lucene/util/collections/ArrayHashMapTest.java b/modules/facet/src/test/org/apache/lucene/util/collections/ArrayHashMapTest.java new file mode 100644 index 00000000000..8db8cf092d1 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/util/collections/ArrayHashMapTest.java @@ -0,0 +1,269 @@ +package org.apache.lucene.util.collections; + +import java.util.HashSet; +import java.util.Iterator; +import java.util.Random; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.collections.ArrayHashMap; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
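Note on Vint8Test above: judging by the bytesNeeded expectations, Vint8 is a variable-length encoding that spends one byte per 7 bits of magnitude, so values below 1&lt;&lt;7 take one byte, values below 1&lt;&lt;14 take two, and so on, with negative values always taking the maximum of 5 bytes. A worked sketch follows, reusing only the calls from the test; illustrative, not part of the patch.

    ByteArrayOutputStream out = new ByteArrayOutputStream();
    Vint8.encode(127, out);                       // (1 << 7) - 1  -> 1 byte
    Vint8.encode(128, out);                       //  1 << 7       -> 2 bytes
    Vint8.encode(-1, out);                        // negative      -> 5 bytes (Vint8.MAXIMUM_BYTES_NEEDED)
    // out.size() == 8

    ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray());
    int a = Vint8.decode(in);                     // 127
    int b = Vint8.decode(in);                     // 128
    int c = Vint8.decode(in);                     // -1

    // The array-based variants keep track of the position themselves:
    byte[] buf = new byte[Vint8.MAXIMUM_BYTES_NEEDED];
    int written = Vint8.encode(300, buf, 0);      // returns the number of bytes written (2)
    Vint8.Position pos = new Vint8.Position();
    int decoded = Vint8.decode(buf, pos);         // 300; pos.pos == written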
+ */ + +public class ArrayHashMapTest extends LuceneTestCase { + + public static final int RANDOM_TEST_NUM_ITERATIONS = 100; // set to 100,000 for deeper test + + @Test + public void test0() { + ArrayHashMap map = new ArrayHashMap(); + + assertNull(map.get(0)); + + for (int i = 0; i < 100; ++i) { + int value = 100 + i; + assertFalse(map.containsValue(value)); + map.put(i, value); + assertTrue(map.containsValue(value)); + assertNotNull(map.get(i)); + } + + assertEquals(100, map.size()); + for (int i = 0; i < 100; ++i) { + assertTrue(map.containsKey(i)); + assertEquals(100 + i, map.get(i).intValue()); + + } + + for (int i = 10; i < 90; ++i) { + map.remove(i); + assertNull(map.get(i)); + } + + assertEquals(20, map.size()); + for (int i = 0; i < 100; ++i) { + assertEquals(map.containsKey(i), !(i >= 10 && i < 90)); + } + + for (int i = 5; i < 85; ++i) { + map.put(i, Integer.valueOf(5 + i)); + } + assertEquals(95, map.size()); + for (int i = 0; i < 100; ++i) { + assertEquals(map.containsKey(i), !(i >= 85 && i < 90)); + } + for (int i = 0; i < 5; ++i) { + assertEquals(map.get(i).intValue(), (100 + i)); + } + for (int i = 5; i < 85; ++i) { + assertEquals(map.get(i).intValue(), (5 + i)); + } + for (int i = 90; i < 100; ++i) { + assertEquals(map.get(i).intValue(), (100 + i)); + } + } + + @Test + public void test1() { + ArrayHashMap map = new ArrayHashMap(); + + for (int i = 0; i < 100; ++i) { + map.put(i, Integer.valueOf(100 + i)); + } + + HashSet set = new HashSet(); + + for (Iterator iterator = map.iterator(); iterator.hasNext();) { + set.add(iterator.next()); + } + + assertEquals(set.size(), map.size()); + for (int i = 0; i < 100; ++i) { + assertTrue(set.contains(Integer.valueOf(100 + i))); + } + + set.clear(); + for (Iterator iterator = map.iterator(); iterator.hasNext();) { + Integer integer = iterator.next(); + if (integer % 2 == 1) { + iterator.remove(); + continue; + } + set.add(integer); + } + assertEquals(set.size(), map.size()); + for (int i = 0; i < 100; i += 2) { + assertTrue(set.contains(Integer.valueOf(100 + i))); + } + } + + @Test + public void test2() { + ArrayHashMap map = new ArrayHashMap(); + + assertTrue(map.isEmpty()); + assertNull(map.get(0)); + for (int i = 0; i < 128; ++i) { + int value = i * 4096; + assertFalse(map.containsValue(value)); + map.put(i, value); + assertTrue(map.containsValue(value)); + assertNotNull(map.get(i)); + assertFalse(map.isEmpty()); + } + + assertEquals(128, map.size()); + for (int i = 0; i < 128; ++i) { + assertTrue(map.containsKey(i)); + assertEquals(i * 4096, map.get(i).intValue()); + } + + for (int i = 0; i < 200; i += 2) { + map.remove(i); + } + assertEquals(64, map.size()); + for (int i = 1; i < 128; i += 2) { + assertTrue(map.containsKey(i)); + assertEquals(i * 4096, map.get(i).intValue()); + map.remove(i); + } + assertTrue(map.isEmpty()); + } + + @Test + public void test3() { + ArrayHashMap map = new ArrayHashMap(); + int length = 100; + for (int i = 0; i < length; ++i) { + map.put(i * 64, 100 + i); + } + HashSet keySet = new HashSet(); + for (Iterator iit = map.keyIterator(); iit.hasNext();) { + keySet.add(iit.next()); + } + assertEquals(length, keySet.size()); + for (int i = 0; i < length; ++i) { + assertTrue(keySet.contains(i * 64)); + } + + HashSet valueSet = new HashSet(); + for (Iterator iit = map.iterator(); iit.hasNext();) { + valueSet.add(iit.next()); + } + assertEquals(length, valueSet.size()); + Object[] array = map.toArray(); + assertEquals(length, array.length); + for (Object value : array) { + 
assertTrue(valueSet.contains(value)); + } + + Integer[] array2 = new Integer[80]; + array2 = map.toArray(array2); + for (int value : array2) { + assertTrue(valueSet.contains(value)); + } + Integer[] array3 = new Integer[120]; + array3 = map.toArray(array3); + for (int i = 0; i < length; ++i) { + assertTrue(valueSet.contains(array3[i])); + } + assertNull(array3[length]); + + for (int i = 0; i < length; ++i) { + assertTrue(map.containsValue(i + 100)); + assertTrue(map.containsKey(i * 64)); + } + + for (Iterator iit = map.keyIterator(); iit.hasNext();) { + iit.next(); + iit.remove(); + } + assertTrue(map.isEmpty()); + assertEquals(0, map.size()); + + } + + // now with random data.. and lots of it + @Test + public void test4() { + ArrayHashMap map = new ArrayHashMap(); + int length = RANDOM_TEST_NUM_ITERATIONS; + + // for a repeatable random sequence + long seed = random.nextLong(); + Random random = new Random(seed); + + for (int i = 0; i < length; ++i) { + int value = random.nextInt(Integer.MAX_VALUE); + map.put(i * 128, value); + } + + assertEquals(length, map.size()); + + // now repeat + random.setSeed(seed); + + for (int i = 0; i < length; ++i) { + int value = random.nextInt(Integer.MAX_VALUE); + assertTrue(map.containsValue(value)); + assertTrue(map.containsKey(i * 128)); + assertEquals(Integer.valueOf(value), map.remove(i * 128)); + } + assertEquals(0, map.size()); + assertTrue(map.isEmpty()); + } + + @Test + public void testEquals() { + ArrayHashMap map1 = new ArrayHashMap(100); + ArrayHashMap map2 = new ArrayHashMap(100); + assertEquals("Empty maps should be equal", map1, map2); + assertEquals("hashCode() for empty maps should be equal", + map1.hashCode(), map2.hashCode()); + + for (int i = 0; i < 100; ++i) { + map1.put(i, Float.valueOf(1f/i)); + map2.put(i, Float.valueOf(1f/i)); + } + assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + + for (int i = 10; i < 20; i++) { + map1.remove(i); + } + assertFalse("Different maps should not be equal", map1.equals(map2)); + + for (int i = 19; i >=10; --i) { + map2.remove(i); + } + assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + + map1.put(-1,-1f); + map2.put(-1,-1.1f); + assertFalse("Different maps should not be equal", map1.equals(map2)); + + map2.put(-1,-1f); + assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + } +} diff --git a/modules/facet/src/test/org/apache/lucene/util/collections/FloatToObjectMapTest.java b/modules/facet/src/test/org/apache/lucene/util/collections/FloatToObjectMapTest.java new file mode 100644 index 00000000000..beef37f95a4 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/util/collections/FloatToObjectMapTest.java @@ -0,0 +1,267 @@ +package org.apache.lucene.util.collections; + +import org.junit.Test; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Random; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.collections.FloatIterator; +import org.apache.lucene.util.collections.FloatToObjectMap; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class FloatToObjectMapTest extends LuceneTestCase { + + @Test + public void test0() { + FloatToObjectMap map = new FloatToObjectMap(); + + assertNull(map.get(0)); + + for (int i = 0; i < 100; ++i) { + int value = 100 + i; + assertFalse(map.containsValue(value)); + map.put(i * 1.1f, value); + assertTrue(map.containsValue(value)); + assertNotNull(map.get(i * 1.1f)); + } + + assertEquals(100, map.size()); + for (int i = 0; i < 100; ++i) { + assertTrue(map.containsKey(i*1.1f)); + assertEquals(100 + i, map.get(i*1.1f).intValue()); + + } + + for (int i = 10; i < 90; ++i) { + map.remove(i*1.1f); + assertNull(map.get(i*1.1f)); + } + + assertEquals(20, map.size()); + for (int i = 0; i < 100; ++i) { + assertEquals(map.containsKey(i*1.1f), !(i >= 10 && i < 90)); + } + + for (int i = 5; i < 85; ++i) { + map.put(i*1.1f, Integer.valueOf(5 + i)); + } + assertEquals(95, map.size()); + for (int i = 0; i < 100; ++i) { + assertEquals(map.containsKey(i*1.1f), !(i >= 85 && i < 90)); + } + for (int i = 0; i < 5; ++i) { + assertEquals(map.get(i*1.1f).intValue(), (100 + i)); + } + for (int i = 5; i < 85; ++i) { + assertEquals(map.get(i*1.1f).intValue(), (5 + i)); + } + for (int i = 90; i < 100; ++i) { + assertEquals(map.get(i*1.1f).intValue(), (100 + i)); + } + } + + @Test + public void test1() { + FloatToObjectMap map = new FloatToObjectMap(); + + for (int i = 0; i < 100; ++i) { + map.put(i*1.1f, Integer.valueOf(100 + i)); + } + + HashSet set = new HashSet(); + + for (Iterator iterator = map.iterator(); iterator.hasNext();) { + set.add(iterator.next()); + } + + assertEquals(set.size(), map.size()); + for (int i = 0; i < 100; ++i) { + assertTrue(set.contains(Integer.valueOf(100 + i))); + } + + set.clear(); + for (Iterator iterator = map.iterator(); iterator.hasNext();) { + Integer integer = iterator.next(); + if (integer % 2 == 1) { + iterator.remove(); + continue; + } + set.add(integer); + } + assertEquals(set.size(), map.size()); + for (int i = 0; i < 100; i += 2) { + assertTrue(set.contains(Integer.valueOf(100 + i))); + } + } + + @Test + public void test2() { + FloatToObjectMap map = new FloatToObjectMap(); + + assertTrue(map.isEmpty()); + assertNull(map.get(0)); + for (int i = 0; i < 128; ++i) { + int value = i * 4096; + assertFalse(map.containsValue(value)); + map.put(i*1.1f, value); + assertTrue(map.containsValue(value)); + assertNotNull(map.get(i*1.1f)); + assertFalse(map.isEmpty()); + } + + assertEquals(128, map.size()); + for (int i = 0; i < 128; ++i) { + assertTrue(map.containsKey(i*1.1f)); + assertEquals(i * 4096, map.get(i*1.1f).intValue()); + } + + for (int i = 0; i < 200; i += 2) { + map.remove(i*1.1f); + } + assertEquals(64, map.size()); + for (int i = 1; i < 128; i += 2) { + assertTrue(map.containsKey(i*1.1f)); + assertEquals(i * 4096, map.get(i*1.1f).intValue()); + map.remove(i*1.1f); + } + assertTrue(map.isEmpty()); + } + + @Test + public void test3() { + FloatToObjectMap map = new FloatToObjectMap(); + 
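+ // populate the map, then exercise keyIterator(), the value iterator, and toArray()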
int length = 100; + for (int i = 0; i < length; ++i) { + map.put(i * 64*1.1f, 100 + i); + } + HashSet keySet = new HashSet(); + for (FloatIterator iit = map.keyIterator(); iit.hasNext();) { + keySet.add(iit.next()); + } + assertEquals(length, keySet.size()); + for (int i = 0; i < length; ++i) { + assertTrue(keySet.contains(i * 64*1.1f)); + } + + HashSet valueSet = new HashSet(); + for (Iterator iit = map.iterator(); iit.hasNext();) { + valueSet.add(iit.next()); + } + assertEquals(length, valueSet.size()); + Object[] array = map.toArray(); + assertEquals(length, array.length); + for (Object value : array) { + assertTrue(valueSet.contains(value)); + } + + Integer[] array2 = new Integer[80]; + array2 = map.toArray(array2); + for (int value : array2) { + assertTrue(valueSet.contains(value)); + } + Integer[] array3 = new Integer[120]; + array3 = map.toArray(array3); + for (int i = 0; i < length; ++i) { + assertTrue(valueSet.contains(array3[i])); + } + assertNull(array3[length]); + + for (int i = 0; i < length; ++i) { + assertTrue(map.containsValue(i + 100)); + assertTrue(map.containsKey(i * 64*1.1f)); + } + + for (FloatIterator iit = map.keyIterator(); iit.hasNext();) { + iit.next(); + iit.remove(); + } + assertTrue(map.isEmpty()); + assertEquals(0, map.size()); + + } + + // now with random data.. and lots of it + @Test + public void test4() { + FloatToObjectMap map = new FloatToObjectMap(); + int length = ArrayHashMapTest.RANDOM_TEST_NUM_ITERATIONS; + + // for a repeatable random sequence + long seed = random.nextLong(); + Random random = new Random(seed); + + for (int i = 0; i < length; ++i) { + int value = random.nextInt(Integer.MAX_VALUE); + map.put(i * 128*1.1f, value); + } + + assertEquals(length, map.size()); + + // now repeat + random.setSeed(seed); + + for (int i = 0; i < length; ++i) { + int value = random.nextInt(Integer.MAX_VALUE); + assertTrue(map.containsValue(value)); + assertTrue(map.containsKey(i * 128*1.1f)); + assertEquals(Integer.valueOf(value), map.remove(i * 128*1.1f)); + } + assertEquals(0, map.size()); + assertTrue(map.isEmpty()); + } + + @Test + public void testEquals() { + FloatToObjectMap map1 = new FloatToObjectMap(100); + FloatToObjectMap map2 = new FloatToObjectMap(100); + assertEquals("Empty maps should be equal", map1, map2); + assertEquals("hashCode() for empty maps should be equal", + map1.hashCode(), map2.hashCode()); + + for (int i = 0; i < 100; ++i) { + map1.put(i * 1.1f, 100 + i); + map2.put(i * 1.1f, 100 + i); + } + assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + + for (int i = 10; i < 20; i++) { + map1.remove(i * 1.1f); + } + assertFalse("Different maps should not be equal", map1.equals(map2)); + + for (int i = 19; i >=10; --i) { + map2.remove(i * 1.1f); + } + assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + + map1.put(-1.1f,-1); + map2.put(-1.1f,-2); + assertFalse("Different maps should not be equal", map1.equals(map2)); + + map2.put(-1.1f,-1); + assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + } +} diff --git a/modules/facet/src/test/org/apache/lucene/util/collections/IntArrayTest.java b/modules/facet/src/test/org/apache/lucene/util/collections/IntArrayTest.java new file mode 100644 index 00000000000..9111ef8a0bd --- 
/dev/null +++ b/modules/facet/src/test/org/apache/lucene/util/collections/IntArrayTest.java @@ -0,0 +1,126 @@ +package org.apache.lucene.util.collections; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.collections.IntArray; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class IntArrayTest extends LuceneTestCase { + + @Test + public void test0() { + IntArray array = new IntArray(); + + assertEquals(0, array.size()); + + for (int i = 0; i < 100; ++i) { + array.addToArray(i); + } + + assertEquals(100, array.size()); + for (int i = 0; i < 100; ++i) { + assertEquals(i, array.get(i)); + } + + assertTrue(array.equals(array)); + } + + @Test + public void test1() { + IntArray array = new IntArray(); + IntArray array2 = new IntArray(); + + assertEquals(0, array.size()); + + for (int i = 0; i < 100; ++i) { + array.addToArray(99-i); + array2.addToArray(99-i); + } + + assertEquals(100, array.size()); + for (int i = 0; i < 100; ++i) { + assertEquals(i, array.get(99-i)); + } + + array.sort(); + for (int i = 0; i < 100; ++i) { + assertEquals(i, array.get(i)); + } + + assertTrue(array.equals(array2)); + } + + @Test + public void test2() { + IntArray array = new IntArray(); + IntArray array2 = new IntArray(); + IntArray array3 = new IntArray(); + + for (int i = 0; i < 100; ++i) { + array.addToArray(i); + } + + for (int i = 0; i < 100; ++i) { + array2.addToArray(i*2); + } + + for (int i = 0; i < 50; ++i) { + array3.addToArray(i*2); + } + + assertFalse(array.equals(array2)); + + array.intersect(array2); + assertTrue(array.equals(array3)); + assertFalse(array.equals(array2)); + } + + @Test + public void testSet() { + int[] original = new int[] { 2,4,6,8,10,12,14 }; + int[] toSet = new int[] { 1,3,5,7,9,11}; + + IntArray arr = new IntArray(); + for (int val : original) { + arr.addToArray(val); + } + + for (int i = 0; i < toSet.length; i++ ) { + int val = toSet[i]; + arr.set(i, val); + } + + // Test to see if the set worked correctly + for (int i = 0; i < toSet.length; i++ ) { + assertEquals(toSet[i], arr.get(i)); + } + + // Now attempt to set something outside of the array + try { + arr.set(100, 99); + fail("IntArray.set should have thrown an exception for attempting to set outside the array"); + } catch (ArrayIndexOutOfBoundsException e) { + // We expected this to happen so let it fall through + // silently + } + + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/util/collections/IntHashSetTest.java b/modules/facet/src/test/org/apache/lucene/util/collections/IntHashSetTest.java new file mode 100644 index 00000000000..d00453155a1 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/util/collections/IntHashSetTest.java @@ -0,0 +1,226 @@ +package 
org.apache.lucene.util.collections; + +import java.util.HashSet; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.collections.IntHashSet; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class IntHashSetTest extends LuceneTestCase { + + @Test + public void test0() { + IntHashSet set0 = new IntHashSet(); + + assertEquals(0, set0.size()); + assertTrue(set0.isEmpty()); + set0.add(0); + assertEquals(1, set0.size()); + assertFalse(set0.isEmpty()); + set0.remove(0); + assertEquals(0, set0.size()); + assertTrue(set0.isEmpty()); + } + + @Test + public void test1() { + IntHashSet set0 = new IntHashSet(); + + assertEquals(0, set0.size()); + assertTrue(set0.isEmpty()); + for (int i = 0; i < 1000; ++i) { + set0.add(i); + } + assertEquals(1000, set0.size()); + assertFalse(set0.isEmpty()); + for (int i = 0; i < 1000; ++i) { + assertTrue(set0.contains(i)); + } + + set0.clear(); + assertEquals(0, set0.size()); + assertTrue(set0.isEmpty()); + + } + + @Test + public void test2() { + IntHashSet set0 = new IntHashSet(); + + assertEquals(0, set0.size()); + assertTrue(set0.isEmpty()); + for (int i = 0; i < 1000; ++i) { + set0.add(1); + set0.add(-382); + } + assertEquals(2, set0.size()); + assertFalse(set0.isEmpty()); + set0.remove(-382); + set0.remove(1); + assertEquals(0, set0.size()); + assertTrue(set0.isEmpty()); + + } + + @Test + public void test3() { + IntHashSet set0 = new IntHashSet(); + + assertEquals(0, set0.size()); + assertTrue(set0.isEmpty()); + for (int i = 0; i < 1000; ++i) { + set0.add(i); + } + + for (int i = 0; i < 1000; i += 2) { + set0.remove(i); + } + + assertEquals(500, set0.size()); + for (int i = 0; i < 1000; ++i) { + if (i % 2 == 0) { + assertFalse(set0.contains(i)); + } else { + assertTrue(set0.contains(i)); + } + } + + } + + @Test + public void test4() { + IntHashSet set1 = new IntHashSet(); + HashSet set2 = new HashSet(); + for (int i = 0; i < ArrayHashMapTest.RANDOM_TEST_NUM_ITERATIONS; ++i) { + int value = random.nextInt() % 500; + boolean shouldAdd = random.nextBoolean(); + if (shouldAdd) { + set1.add(value); + set2.add(value); + } else { + set1.remove(value); + set2.remove(value); + } + } + assertEquals(set2.size(), set1.size()); + int i = 0; + for (int value : set2) { + assertTrue(set1.contains(value)); + i++; + } + } + + @Test + public void testRegularJavaSet() { + HashSet set = new HashSet(); + for (int j = 0; j < 100; ++j) { + for (int i = 0; i < ArrayHashMapTest.RANDOM_TEST_NUM_ITERATIONS; ++i) { + int value = random.nextInt() % 5000; + boolean shouldAdd = random.nextBoolean(); + if (shouldAdd) { + set.add(value); + } else { + set.remove(value); + } + } + set.clear(); + } + } + + @Test + public void testMySet() { + IntHashSet set = new IntHashSet(); + for 
(int j = 0; j < 100; ++j) { + for (int i = 0; i < ArrayHashMapTest.RANDOM_TEST_NUM_ITERATIONS; ++i) { + int value = random.nextInt() % 5000; + boolean shouldAdd = random.nextBoolean(); + if (shouldAdd) { + set.add(value); + } else { + set.remove(value); + } + } + set.clear(); + } + } + + @Test + public void testToArray() { + IntHashSet set = new IntHashSet(); + for (int j = 0; j < 100; ++j) { + for (int i = 0; i < ArrayHashMapTest.RANDOM_TEST_NUM_ITERATIONS; ++i) { + int value = random.nextInt() % 5000; + boolean shouldAdd = random.nextBoolean(); + if (shouldAdd) { + set.add(value); + } else { + set.remove(value); + } + } + int[] vals = set.toArray(); + assertEquals(set.size(), vals.length); + + int[] realValues = new int[set.size()]; + int[] unrealValues = set.toArray(realValues); + assertEquals(realValues, unrealValues); + for (int value : vals) { + assertTrue(set.remove(value)); + } + for (int i = 0; i < vals.length; ++i) { + assertEquals(vals[i], realValues[i]); + } + } + } + + @Test + public void testZZRegularJavaSet() { + HashSet set = new HashSet(); + for (int j = 0; j < 100; ++j) { + for (int i = 0; i < ArrayHashMapTest.RANDOM_TEST_NUM_ITERATIONS; ++i) { + int value = random.nextInt() % 5000; + boolean shouldAdd = random.nextBoolean(); + if (shouldAdd) { + set.add(value); + } else { + set.remove(value); + } + } + set.clear(); + } + } + + @Test + public void testZZMySet() { + IntHashSet set = new IntHashSet(); + for (int j = 0; j < 100; ++j) { + for (int i = 0; i < ArrayHashMapTest.RANDOM_TEST_NUM_ITERATIONS; ++i) { + int value = random.nextInt() % 5000; + boolean shouldAdd = random.nextBoolean(); + if (shouldAdd) { + set.add(value); + } else { + set.remove(value); + } + } + set.clear(); + } + } +} diff --git a/modules/facet/src/test/org/apache/lucene/util/collections/IntToDoubleMapTest.java b/modules/facet/src/test/org/apache/lucene/util/collections/IntToDoubleMapTest.java new file mode 100644 index 00000000000..b2e28fa7ef3 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/util/collections/IntToDoubleMapTest.java @@ -0,0 +1,272 @@ +package org.apache.lucene.util.collections; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.collections.DoubleIterator; +import org.apache.lucene.util.collections.IntIterator; +import org.apache.lucene.util.collections.IntToDoubleMap; + +import java.util.HashSet; +import java.util.Random; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class IntToDoubleMapTest extends LuceneTestCase { + private static void assertGround(double value) { + assertEquals(IntToDoubleMap.GROUND, value, Double.MAX_VALUE); + } + + @Test + public void test0() { + IntToDoubleMap map = new IntToDoubleMap(); + + assertGround(map.get(0)); + + for (int i = 0; i < 100; ++i) { + int value = 100 + i; + assertFalse(map.containsValue(value)); + map.put(i, value); + assertTrue(map.containsValue(value)); + assertNotNull(map.get(i)); + } + + assertEquals(100, map.size()); + for (int i = 0; i < 100; ++i) { + assertTrue(map.containsKey(i)); + assertEquals(100 + i, map.get(i), Double.MAX_VALUE); + + } + + for (int i = 10; i < 90; ++i) { + map.remove(i); + assertGround(map.get(i)); + } + + assertEquals(20, map.size()); + for (int i = 0; i < 100; ++i) { + assertEquals(map.containsKey(i), !(i >= 10 && i < 90)); + } + + for (int i = 5; i < 85; ++i) { + map.put(i, Integer.valueOf(5 + i)); + } + assertEquals(95, map.size()); + for (int i = 0; i < 100; ++i) { + assertEquals(map.containsKey(i), !(i >= 85 && i < 90)); + } + for (int i = 0; i < 5; ++i) { + assertEquals(map.get(i), (100 + i), Double.MAX_VALUE); + } + for (int i = 5; i < 85; ++i) { + assertEquals(map.get(i), (5 + i), Double.MAX_VALUE); + } + for (int i = 90; i < 100; ++i) { + assertEquals(map.get(i), (100 + i), Double.MAX_VALUE); + } + } + + @Test + public void test1() { + IntToDoubleMap map = new IntToDoubleMap(); + + for (int i = 0; i < 100; ++i) { + map.put(i, Integer.valueOf(100 + i)); + } + + HashSet set = new HashSet(); + + for (DoubleIterator iterator = map.iterator(); iterator.hasNext();) { + set.add(iterator.next()); + } + + assertEquals(set.size(), map.size()); + for (int i = 0; i < 100; ++i) { + assertTrue(set.contains(Double.valueOf(100+i))); + } + + set.clear(); + for (DoubleIterator iterator = map.iterator(); iterator.hasNext();) { + double d = iterator.next(); + if (d % 2 == 1) { + iterator.remove(); + continue; + } + set.add(d); + } + assertEquals(set.size(), map.size()); + for (int i = 0; i < 100; i+=2) { + assertTrue(set.contains(Double.valueOf(100+i))); + } + } + + @Test + public void test2() { + IntToDoubleMap map = new IntToDoubleMap(); + + assertTrue(map.isEmpty()); + assertGround(map.get(0)); + for (int i = 0; i < 128; ++i) { + int value = i * 4096; + assertFalse(map.containsValue(value)); + map.put(i, value); + assertTrue(map.containsValue(value)); + assertNotNull(map.get(i)); + assertFalse(map.isEmpty()); + } + + assertEquals(128, map.size()); + for (int i = 0; i < 128; ++i) { + assertTrue(map.containsKey(i)); + assertEquals(i * 4096, map.get(i), Double.MAX_VALUE); + } + + for (int i = 0 ; i < 200; i+=2) { + map.remove(i); + } + assertEquals(64, map.size()); + for (int i = 1; i < 128; i+=2) { + assertTrue(map.containsKey(i)); + assertEquals(i * 4096, map.get(i), Double.MAX_VALUE); + map.remove(i); + } + assertTrue(map.isEmpty()); + } + + @Test + public void test3() { + IntToDoubleMap map = new IntToDoubleMap(); + int length = 100; + for (int i = 0; i < length; ++i) { + map.put(i*64, 100 + i); + } + HashSet keySet = new HashSet(); + for (IntIterator iit = map.keyIterator(); iit.hasNext(); ) { + keySet.add(iit.next()); + } + assertEquals(length, keySet.size()); + for (int i = 0; i < length; ++i) { + assertTrue(keySet.contains(i * 64)); + } + + HashSet valueSet = new HashSet(); + for (DoubleIterator iit = map.iterator(); iit.hasNext(); ) { + valueSet.add(iit.next()); + } + assertEquals(length, valueSet.size()); + double[] array = map.toArray(); + assertEquals(length, 
array.length); + for (double value: array) { + assertTrue(valueSet.contains(value)); + } + + double[] array2 = new double[80]; + array2 = map.toArray(array2); + assertEquals(length, array2.length); + for (double value: array2) { + assertTrue(valueSet.contains(value)); + } + + double[] array3 = new double[120]; + array3 = map.toArray(array3); + for (int i = 0 ;i < length; ++i) { + assertTrue(valueSet.contains(array3[i])); + } + + for (int i = 0; i < length; ++i) { + assertTrue(map.containsValue(i + 100)); + assertTrue(map.containsKey(i*64)); + } + + for (IntIterator iit = map.keyIterator(); iit.hasNext(); ) { + iit.next(); + iit.remove(); + } + assertTrue(map.isEmpty()); + assertEquals(0, map.size()); + + } + + // now with random data.. and lots of it + @Test + public void test4() { + IntToDoubleMap map = new IntToDoubleMap(); + int length = ArrayHashMapTest.RANDOM_TEST_NUM_ITERATIONS; + // for a repeatable random sequence + long seed = random.nextLong(); + Random random = new Random(seed); + + for (int i = 0; i < length; ++i) { + int value = random.nextInt(Integer.MAX_VALUE); + map.put(i*128, value); + } + + assertEquals(length, map.size()); + + // now repeat + random.setSeed(seed); + + for (int i = 0; i < length; ++i) { + int value = random.nextInt(Integer.MAX_VALUE); + assertTrue(map.containsValue(value)); + assertTrue(map.containsKey(i*128)); + assertEquals(0, Double.compare(value, map.remove(i*128))); + } + assertEquals(0, map.size()); + assertTrue(map.isEmpty()); + } + + @Test + public void testEquals() { + IntToDoubleMap map1 = new IntToDoubleMap(100); + IntToDoubleMap map2 = new IntToDoubleMap(100); + assertEquals("Empty maps should be equal", map1, map2); + assertEquals("hashCode() for empty maps should be equal", + map1.hashCode(), map2.hashCode()); + + for (int i = 0; i < 100; ++i) { + map1.put(i, Float.valueOf(1f/i)); + map2.put(i, Float.valueOf(1f/i)); + } + assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + + for (int i = 10; i < 20; i++) { + map1.remove(i); + } + assertFalse("Different maps should not be equal", map1.equals(map2)); + + for (int i = 19; i >=10; --i) { + map2.remove(i); + } + assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + + map1.put(-1,-1f); + map2.put(-1,-1.1f); + assertFalse("Different maps should not be equal", map1.equals(map2)); + + map2.put(-1,-1f); + assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/util/collections/IntToIntMapTest.java b/modules/facet/src/test/org/apache/lucene/util/collections/IntToIntMapTest.java new file mode 100644 index 00000000000..a61e64ce269 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/util/collections/IntToIntMapTest.java @@ -0,0 +1,272 @@ +package org.apache.lucene.util.collections; + +import org.junit.Test; +import java.util.HashSet; +import java.util.Random; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.collections.IntIterator; +import org.apache.lucene.util.collections.IntToIntMap; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class IntToIntMapTest extends LuceneTestCase { + + private static void assertGround(int value) { + assertEquals(IntToIntMap.GROUD, value); + } + + @Test + public void test0() { + IntToIntMap map = new IntToIntMap(); + + assertGround(map.get(0)); + + for (int i = 0; i < 100; ++i) { + int value = 100 + i; + assertFalse(map.containsValue(value)); + map.put(i, value); + assertTrue(map.containsValue(value)); + assertNotNull(map.get(i)); + } + + assertEquals(100, map.size()); + for (int i = 0; i < 100; ++i) { + assertTrue(map.containsKey(i)); + assertEquals(100 + i, map.get(i)); + + } + + for (int i = 10; i < 90; ++i) { + map.remove(i); + assertGround(map.get(i)); + } + + assertEquals(20, map.size()); + for (int i = 0; i < 100; ++i) { + assertEquals(map.containsKey(i), !(i >= 10 && i < 90)); + } + + for (int i = 5; i < 85; ++i) { + map.put(i, Integer.valueOf(5 + i)); + } + assertEquals(95, map.size()); + for (int i = 0; i < 100; ++i) { + assertEquals(map.containsKey(i), !(i >= 85 && i < 90)); + } + for (int i = 0; i < 5; ++i) { + assertEquals(map.get(i), (100 + i)); + } + for (int i = 5; i < 85; ++i) { + assertEquals(map.get(i), (5 + i)); + } + for (int i = 90; i < 100; ++i) { + assertEquals(map.get(i), (100 + i)); + } + } + + @Test + public void test1() { + IntToIntMap map = new IntToIntMap(); + + for (int i = 0; i < 100; ++i) { + map.put(i, Integer.valueOf(100 + i)); + } + + HashSet set = new HashSet(); + + for (IntIterator iterator = map.iterator(); iterator.hasNext();) { + set.add(iterator.next()); + } + + assertEquals(set.size(), map.size()); + for (int i = 0; i < 100; ++i) { + assertTrue(set.contains(Integer.valueOf(100+i))); + } + + set.clear(); + for (IntIterator iterator = map.iterator(); iterator.hasNext();) { + Integer integer = iterator.next(); + if (integer % 2 == 1) { + iterator.remove(); + continue; + } + set.add(integer); + } + assertEquals(set.size(), map.size()); + for (int i = 0; i < 100; i+=2) { + assertTrue(set.contains(Integer.valueOf(100+i))); + } + } + + @Test + public void test2() { + IntToIntMap map = new IntToIntMap(); + + assertTrue(map.isEmpty()); + assertGround(map.get(0)); + for (int i = 0; i < 128; ++i) { + int value = i * 4096; + assertFalse(map.containsValue(value)); + map.put(i, value); + assertTrue(map.containsValue(value)); + assertNotNull(map.get(i)); + assertFalse(map.isEmpty()); + } + + assertEquals(128, map.size()); + for (int i = 0; i < 128; ++i) { + assertTrue(map.containsKey(i)); + assertEquals(i * 4096, map.get(i)); + } + + for (int i = 0 ; i < 200; i+=2) { + map.remove(i); + } + assertEquals(64, map.size()); + for (int i = 1; i < 128; i+=2) { + assertTrue(map.containsKey(i)); + assertEquals(i * 4096, map.get(i)); + map.remove(i); + } + assertTrue(map.isEmpty()); + } + + @Test + public void test3() { + IntToIntMap map = new IntToIntMap(); + int length = 
100; + for (int i = 0; i < length; ++i) { + map.put(i*64, 100 + i); + } + HashSet keySet = new HashSet(); + for (IntIterator iit = map.keyIterator(); iit.hasNext(); ) { + keySet.add(iit.next()); + } + assertEquals(length, keySet.size()); + for (int i = 0; i < length; ++i) { + assertTrue(keySet.contains(i * 64)); + } + + HashSet valueSet = new HashSet(); + for (IntIterator iit = map.iterator(); iit.hasNext(); ) { + valueSet.add(iit.next()); + } + assertEquals(length, valueSet.size()); + int[] array = map.toArray(); + assertEquals(length, array.length); + for (int value: array) { + assertTrue(valueSet.contains(value)); + } + + int[] array2 = new int[80]; + array2 = map.toArray(array2); + assertEquals(length, array2.length); + for (int value: array2) { + assertTrue(valueSet.contains(value)); + } + + int[] array3 = new int[120]; + array3 = map.toArray(array3); + for (int i = 0 ;i < length; ++i) { + assertTrue(valueSet.contains(array3[i])); + } + + for (int i = 0; i < length; ++i) { + assertTrue(map.containsValue(i + 100)); + assertTrue(map.containsKey(i*64)); + } + + for (IntIterator iit = map.keyIterator(); iit.hasNext(); ) { + iit.next(); + iit.remove(); + } + assertTrue(map.isEmpty()); + assertEquals(0, map.size()); + + } + + // now with random data.. and lots of it + @Test + public void test4() { + IntToIntMap map = new IntToIntMap(); + int length = ArrayHashMapTest.RANDOM_TEST_NUM_ITERATIONS; + + // for a repeatable random sequence + long seed = random.nextLong(); + Random random = new Random(seed); + + for (int i = 0; i < length; ++i) { + int value = random.nextInt(Integer.MAX_VALUE); + map.put(i*128, value); + } + + assertEquals(length, map.size()); + + // now repeat + random.setSeed(seed); + + for (int i = 0; i < length; ++i) { + int value = random.nextInt(Integer.MAX_VALUE); + + assertTrue(map.containsValue(value)); + assertTrue(map.containsKey(i*128)); + assertEquals(value, map.remove(i*128)); + } + assertEquals(0, map.size()); + assertTrue(map.isEmpty()); + } + + @Test + public void testEquals() { + IntToIntMap map1 = new IntToIntMap(100); + IntToIntMap map2 = new IntToIntMap(100); + assertEquals("Empty maps should be equal", map1, map2); + assertEquals("hashCode() for empty maps should be equal", + map1.hashCode(), map2.hashCode()); + + for (int i = 0; i < 100; ++i) { + map1.put(i, 100 + i); + map2.put(i, 100 + i); + } + assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + + for (int i = 10; i < 20; i++) { + map1.remove(i); + } + assertFalse("Different maps should not be equal", map1.equals(map2)); + + for (int i = 19; i >=10; --i) { + map2.remove(i); + } + assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + + map1.put(-1,-1); + map2.put(-1,-2); + assertFalse("Different maps should not be equal", map1.equals(map2)); + + map2.put(-1,-1); + assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + } +} diff --git a/modules/facet/src/test/org/apache/lucene/util/collections/IntToObjectMapTest.java b/modules/facet/src/test/org/apache/lucene/util/collections/IntToObjectMapTest.java new file mode 100644 index 00000000000..a1cb69b3205 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/util/collections/IntToObjectMapTest.java @@ -0,0 +1,268 @@ +package 
org.apache.lucene.util.collections; + +import java.util.HashSet; +import java.util.Iterator; +import java.util.Random; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.collections.IntIterator; +import org.apache.lucene.util.collections.IntToObjectMap; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class IntToObjectMapTest extends LuceneTestCase { + + @Test + public void test0() { + IntToObjectMap map = new IntToObjectMap(); + + assertNull(map.get(0)); + + for (int i = 0; i < 100; ++i) { + int value = 100 + i; + assertFalse(map.containsValue(value)); + map.put(i, value); + assertTrue(map.containsValue(value)); + assertNotNull(map.get(i)); + } + + assertEquals(100, map.size()); + for (int i = 0; i < 100; ++i) { + assertTrue(map.containsKey(i)); + assertEquals(100 + i, map.get(i).intValue()); + + } + + for (int i = 10; i < 90; ++i) { + map.remove(i); + assertNull(map.get(i)); + } + + assertEquals(20, map.size()); + for (int i = 0; i < 100; ++i) { + assertEquals(map.containsKey(i), !(i >= 10 && i < 90)); + } + + for (int i = 5; i < 85; ++i) { + map.put(i, Integer.valueOf(5 + i)); + } + assertEquals(95, map.size()); + for (int i = 0; i < 100; ++i) { + assertEquals(map.containsKey(i), !(i >= 85 && i < 90)); + } + for (int i = 0; i < 5; ++i) { + assertEquals(map.get(i).intValue(), (100 + i)); + } + for (int i = 5; i < 85; ++i) { + assertEquals(map.get(i).intValue(), (5 + i)); + } + for (int i = 90; i < 100; ++i) { + assertEquals(map.get(i).intValue(), (100 + i)); + } + } + + @Test + public void test1() { + IntToObjectMap map = new IntToObjectMap(); + + for (int i = 0; i < 100; ++i) { + map.put(i, Integer.valueOf(100 + i)); + } + + HashSet set = new HashSet(); + + for (Iterator iterator = map.iterator(); iterator.hasNext();) { + set.add(iterator.next()); + } + + assertEquals(set.size(), map.size()); + for (int i = 0; i < 100; ++i) { + assertTrue(set.contains(Integer.valueOf(100 + i))); + } + + set.clear(); + for (Iterator iterator = map.iterator(); iterator.hasNext();) { + Integer integer = iterator.next(); + if (integer % 2 == 1) { + iterator.remove(); + continue; + } + set.add(integer); + } + assertEquals(set.size(), map.size()); + for (int i = 0; i < 100; i += 2) { + assertTrue(set.contains(Integer.valueOf(100 + i))); + } + } + + @Test + public void test2() { + IntToObjectMap map = new IntToObjectMap(); + + assertTrue(map.isEmpty()); + assertNull(map.get(0)); + for (int i = 0; i < 128; ++i) { + int value = i * 4096; + assertFalse(map.containsValue(value)); + map.put(i, value); + assertTrue(map.containsValue(value)); + assertNotNull(map.get(i)); + assertFalse(map.isEmpty()); + } + + assertEquals(128, map.size()); + for (int i = 0; i < 128; ++i) { + 
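+ // every key inserted above should still map to its original value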
assertTrue(map.containsKey(i)); + assertEquals(i * 4096, map.get(i).intValue()); + } + + for (int i = 0; i < 200; i += 2) { + map.remove(i); + } + assertEquals(64, map.size()); + for (int i = 1; i < 128; i += 2) { + assertTrue(map.containsKey(i)); + assertEquals(i * 4096, map.get(i).intValue()); + map.remove(i); + } + assertTrue(map.isEmpty()); + } + + @Test + public void test3() { + IntToObjectMap map = new IntToObjectMap(); + int length = 100; + for (int i = 0; i < length; ++i) { + map.put(i * 64, 100 + i); + } + HashSet keySet = new HashSet(); + for (IntIterator iit = map.keyIterator(); iit.hasNext();) { + keySet.add(iit.next()); + } + assertEquals(length, keySet.size()); + for (int i = 0; i < length; ++i) { + assertTrue(keySet.contains(i * 64)); + } + + HashSet valueSet = new HashSet(); + for (Iterator iit = map.iterator(); iit.hasNext();) { + valueSet.add(iit.next()); + } + assertEquals(length, valueSet.size()); + Object[] array = map.toArray(); + assertEquals(length, array.length); + for (Object value : array) { + assertTrue(valueSet.contains(value)); + } + + Integer[] array2 = new Integer[80]; + array2 = map.toArray(array2); + for (int value : array2) { + assertTrue(valueSet.contains(value)); + } + Integer[] array3 = new Integer[120]; + array3 = map.toArray(array3); + for (int i = 0; i < length; ++i) { + assertTrue(valueSet.contains(array3[i])); + } + assertNull(array3[length]); + + for (int i = 0; i < length; ++i) { + assertTrue(map.containsValue(i + 100)); + assertTrue(map.containsKey(i * 64)); + } + + for (IntIterator iit = map.keyIterator(); iit.hasNext();) { + iit.next(); + iit.remove(); + } + assertTrue(map.isEmpty()); + assertEquals(0, map.size()); + + } + + // now with random data.. and lots of it + @Test + public void test4() { + IntToObjectMap map = new IntToObjectMap(); + int length = ArrayHashMapTest.RANDOM_TEST_NUM_ITERATIONS; + + // for a repeatable random sequence + long seed = random.nextLong(); + Random random = new Random(seed); + + for (int i = 0; i < length; ++i) { + int value = random.nextInt(Integer.MAX_VALUE); + map.put(i * 128, value); + } + + assertEquals(length, map.size()); + + // now repeat + random.setSeed(seed); + + for (int i = 0; i < length; ++i) { + int value = random.nextInt(Integer.MAX_VALUE); + assertTrue(map.containsValue(value)); + assertTrue(map.containsKey(i * 128)); + assertEquals(Integer.valueOf(value), map.remove(i * 128)); + } + assertEquals(0, map.size()); + assertTrue(map.isEmpty()); + } + + @Test + public void testEquals() { + IntToObjectMap map1 = new IntToObjectMap(100); + IntToObjectMap map2 = new IntToObjectMap(100); + assertEquals("Empty maps should be equal", map1, map2); + assertEquals("hashCode() for empty maps should be equal", + map1.hashCode(), map2.hashCode()); + + for (int i = 0; i < 100; ++i) { + map1.put(i, Double.valueOf(1f/i)); + map2.put(i, Double.valueOf(1f/i)); + } + assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + + for (int i = 10; i < 20; i++) { + map1.remove(i); + } + assertFalse("Different maps should not be equal", map1.equals(map2)); + + for (int i = 19; i >=10; --i) { + map2.remove(i); + } + assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + + map1.put(-1,-1d); + map2.put(-1,-1.1d); + assertFalse("Different maps should not be equal", map1.equals(map2)); + + map2.put(-1,-1d); + 
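+ // once the differing value is overwritten, the two maps are identical again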
assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + } +} diff --git a/modules/facet/src/test/org/apache/lucene/util/collections/ObjectToFloatMapTest.java b/modules/facet/src/test/org/apache/lucene/util/collections/ObjectToFloatMapTest.java new file mode 100644 index 00000000000..faeb8ef15ef --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/util/collections/ObjectToFloatMapTest.java @@ -0,0 +1,280 @@ +package org.apache.lucene.util.collections; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.collections.FloatIterator; +import org.apache.lucene.util.collections.ObjectToFloatMap; + +import java.util.HashSet; +import java.util.Iterator; +import java.util.Random; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class ObjectToFloatMapTest extends LuceneTestCase { + + @Test + public void test0() { + ObjectToFloatMap map = new ObjectToFloatMap(); + + assertNaN(map.get(0)); + + for (int i = 0; i < 100; ++i) { + int value = 100 + i; + assertFalse(map.containsValue(value)); + map.put(i, value); + assertTrue(map.containsValue(value)); + assertNotNaN(map.get(i)); + } + + assertEquals(100, map.size()); + for (int i = 0; i < 100; ++i) { + assertTrue(map.containsKey(i)); + assertEquals(100 + i, map.get(i), 1E-5); + + } + + for (int i = 10; i < 90; ++i) { + map.remove(i); + assertNaN(map.get(i)); + } + + assertEquals(20, map.size()); + for (int i = 0; i < 100; ++i) { + assertEquals(map.containsKey(i), !(i >= 10 && i < 90)); + } + + for (int i = 5; i < 85; ++i) { + map.put(i, Integer.valueOf(5 + i)); + } + assertEquals(95, map.size()); + for (int i = 0; i < 100; ++i) { + assertEquals(map.containsKey(i), !(i >= 85 && i < 90)); + } + for (int i = 0; i < 5; ++i) { + assertEquals(map.get(i), (100 + i), 1E-5); + } + for (int i = 5; i < 85; ++i) { + assertEquals(map.get(i), (5 + i), 1E-5); + } + for (int i = 90; i < 100; ++i) { + assertEquals(map.get(i), (100 + i), 1E-5); + } + } + + private static void assertNaN(float f) { + assertTrue(Float.isNaN(f)); + } + + private static void assertNotNaN(float f) { + assertFalse(Float.isNaN(f)); + } + + @Test + public void test1() { + ObjectToFloatMap map = new ObjectToFloatMap(); + + for (int i = 0; i < 100; ++i) { + map.put(i, Integer.valueOf(100 + i)); + } + + HashSet set = new HashSet(); + + for (FloatIterator iterator = map.iterator(); iterator.hasNext();) { + set.add(iterator.next()); + } + + assertEquals(set.size(), map.size()); + for (int i = 0; i < 100; ++i) { + assertTrue(set.contains(Float.valueOf(100+i))); + } + + set.clear(); + for (FloatIterator iterator = map.iterator(); iterator.hasNext();) { + Float value = 
iterator.next(); + if (value % 2 == 1) { + iterator.remove(); + continue; + } + set.add(value); + } + assertEquals(set.size(), map.size()); + for (int i = 0; i < 100; i+=2) { + assertTrue(set.contains(Float.valueOf(100+i))); + } + } + + @Test + public void test2() { + ObjectToFloatMap map = new ObjectToFloatMap(); + + assertTrue(map.isEmpty()); + assertNaN(map.get(0)); + for (int i = 0; i < 128; ++i) { + int value = i * 4096; + assertFalse(map.containsValue(value)); + map.put(i, value); + assertTrue(map.containsValue(value)); + assertNotNaN(map.get(i)); + assertFalse(map.isEmpty()); + } + + assertEquals(128, map.size()); + for (int i = 0; i < 128; ++i) { + assertTrue(map.containsKey(i)); + assertEquals(i * 4096, map.get(i), 1E-5); + } + + for (int i = 0 ; i < 200; i+=2) { + map.remove(i); + } + assertEquals(64, map.size()); + for (int i = 1; i < 128; i+=2) { + assertTrue(map.containsKey(i)); + assertEquals(i * 4096, map.get(i), 1E-5); + map.remove(i); + } + assertTrue(map.isEmpty()); + } + + @Test + public void test3() { + ObjectToFloatMap map = new ObjectToFloatMap(); + int length = 100; + for (int i = 0; i < length; ++i) { + map.put(i*64, 100 + i); + } + HashSet keySet = new HashSet(); + for (Iterator iit = map.keyIterator(); iit.hasNext(); ) { + keySet.add(iit.next()); + } + assertEquals(length, keySet.size()); + for (int i = 0; i < length; ++i) { + assertTrue(keySet.contains(i * 64)); + } + + HashSet valueSet = new HashSet(); + for (FloatIterator iit = map.iterator(); iit.hasNext(); ) { + valueSet.add(iit.next()); + } + assertEquals(length, valueSet.size()); + float[] array = map.toArray(); + assertEquals(length, array.length); + for (float value: array) { + assertTrue(valueSet.contains(value)); + } + + float[] array2 = new float[80]; + array2 = map.toArray(array2); + assertEquals(80, array2.length); + for (float value: array2) { + assertTrue(valueSet.contains(value)); + } + + float[] array3 = new float[120]; + array3 = map.toArray(array3); + for (int i = 0 ;i < length; ++i) { + assertTrue(valueSet.contains(array3[i])); + } + assertNaN(array3[length]); + + for (int i = 0; i < length; ++i) { + assertTrue(map.containsValue(i + 100)); + assertTrue(map.containsKey(i*64)); + } + + for (Iterator iit = map.keyIterator(); iit.hasNext(); ) { + iit.next(); + iit.remove(); + } + assertTrue(map.isEmpty()); + assertEquals(0, map.size()); + + } + + // now with random data.. 
and lots of it + @Test + public void test4() { + ObjectToFloatMap map = new ObjectToFloatMap(); + int length = ArrayHashMapTest.RANDOM_TEST_NUM_ITERATIONS; + + // for a repeatable random sequence + long seed = random.nextLong(); + Random random = new Random(seed); + + for (int i = 0; i < length; ++i) { + int value = random.nextInt(Integer.MAX_VALUE); + map.put(i*128, value); + } + + assertEquals(length, map.size()); + + // now repeat + random.setSeed(seed); + + for (int i = 0; i < length; ++i) { + int value = random.nextInt(Integer.MAX_VALUE); + + assertTrue(map.containsValue(value)); + assertTrue(map.containsKey(i*128)); + assertEquals(0, Float.compare(value, map.remove(i*128))); + } + assertEquals(0, map.size()); + assertTrue(map.isEmpty()); + } + + @Test + public void testEquals() { + ObjectToFloatMap map1 = new ObjectToFloatMap(100); + ObjectToFloatMap map2 = new ObjectToFloatMap(100); + assertEquals("Empty maps should be equal", map1, map2); + assertEquals("hashCode() for empty maps should be equal", + map1.hashCode(), map2.hashCode()); + + for (int i = 0; i < 100; ++i) { + map1.put(i, Float.valueOf(1f/i)); + map2.put(i, Float.valueOf(1f/i)); + } + assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + + for (int i = 10; i < 20; i++) { + map1.remove(i); + } + assertFalse("Different maps should not be equal", map1.equals(map2)); + + for (int i = 19; i >=10; --i) { + map2.remove(i); + } + assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + + map1.put(-1,-1f); + map2.put(-1,-1.1f); + assertFalse("Different maps should not be equal", map1.equals(map2)); + + map2.put(-1,-1f); + assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/util/collections/ObjectToIntMapTest.java b/modules/facet/src/test/org/apache/lucene/util/collections/ObjectToIntMapTest.java new file mode 100644 index 00000000000..d2a7ff2f298 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/util/collections/ObjectToIntMapTest.java @@ -0,0 +1,278 @@ +package org.apache.lucene.util.collections; + +import java.util.HashSet; +import java.util.Iterator; +import java.util.Random; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.collections.IntIterator; +import org.apache.lucene.util.collections.ObjectToIntMap; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class ObjectToIntMapTest extends LuceneTestCase { + + @Test + public void test0() { + ObjectToIntMap map = new ObjectToIntMap(); + + assertIntegerMaxValue(map.get(0)); + + for (int i = 0; i < 100; ++i) { + int value = 100 + i; + assertFalse(map.containsValue(value)); + map.put(i, value); + assertTrue(map.containsValue(value)); + assertNotIntegerMaxValue(map.get(i)); + } + + assertEquals(100, map.size()); + for (int i = 0; i < 100; ++i) { + assertTrue(map.containsKey(i)); + assertEquals(100 + i, map.get(i), 1E-5); + + } + + for (int i = 10; i < 90; ++i) { + map.remove(i); + assertIntegerMaxValue(map.get(i)); + } + + assertEquals(20, map.size()); + for (int i = 0; i < 100; ++i) { + assertEquals(map.containsKey(i), !(i >= 10 && i < 90)); + } + + for (int i = 5; i < 85; ++i) { + map.put(i, Integer.valueOf(5 + i)); + } + assertEquals(95, map.size()); + for (int i = 0; i < 100; ++i) { + assertEquals(map.containsKey(i), !(i >= 85 && i < 90)); + } + for (int i = 0; i < 5; ++i) { + assertEquals(map.get(i), (100 + i), 1E-5); + } + for (int i = 5; i < 85; ++i) { + assertEquals(map.get(i), (5 + i), 1E-5); + } + for (int i = 90; i < 100; ++i) { + assertEquals(map.get(i), (100 + i), 1E-5); + } + } + + private static void assertIntegerMaxValue(int i) { + assertTrue(i == Integer.MAX_VALUE); + } + + private static void assertNotIntegerMaxValue(int i) { + assertFalse(i == Integer.MAX_VALUE); + } + + @Test + public void test1() { + ObjectToIntMap map = new ObjectToIntMap(); + + for (int i = 0; i < 100; ++i) { + map.put(i, Integer.valueOf(100 + i)); + } + + HashSet set = new HashSet(); + + for (IntIterator iterator = map.iterator(); iterator.hasNext();) { + set.add(iterator.next()); + } + + assertEquals(set.size(), map.size()); + for (int i = 0; i < 100; ++i) { + assertTrue(set.contains(Integer.valueOf(100+i))); + } + + set.clear(); + for (IntIterator iterator = map.iterator(); iterator.hasNext();) { + Integer value = iterator.next(); + if (value % 2 == 1) { + iterator.remove(); + continue; + } + set.add(value); + } + assertEquals(set.size(), map.size()); + for (int i = 0; i < 100; i+=2) { + assertTrue(set.contains(Integer.valueOf(100+i))); + } + } + + @Test + public void test2() { + ObjectToIntMap map = new ObjectToIntMap(); + + assertTrue(map.isEmpty()); + assertIntegerMaxValue(map.get(0)); + for (int i = 0; i < 128; ++i) { + int value = i * 4096; + assertFalse(map.containsValue(value)); + map.put(i, value); + assertTrue(map.containsValue(value)); + assertNotIntegerMaxValue(map.get(i)); + assertFalse(map.isEmpty()); + } + + assertEquals(128, map.size()); + for (int i = 0; i < 128; ++i) { + assertTrue(map.containsKey(i)); + assertEquals(i * 4096, map.get(i), 1E-5); + } + + for (int i = 0 ; i < 200; i+=2) { + map.remove(i); + } + assertEquals(64, map.size()); + for (int i = 1; i < 128; i+=2) { + assertTrue(map.containsKey(i)); + assertEquals(i * 4096, map.get(i), 1E-5); + map.remove(i); + } + assertTrue(map.isEmpty()); + } + + @Test + public void test3() { + ObjectToIntMap map = new ObjectToIntMap(); + int length = 100; + for (int i = 0; i < length; ++i) { + map.put(i*64, 100 + i); + } + HashSet keySet = new HashSet(); + for (Iterator iit = map.keyIterator(); iit.hasNext(); ) { + keySet.add(iit.next()); + } + assertEquals(length, keySet.size()); + for (int i = 0; i < length; ++i) { + assertTrue(keySet.contains(i * 64)); + } + + HashSet valueSet = new HashSet(); + for (IntIterator iit = map.iterator(); iit.hasNext(); ) { + valueSet.add(iit.next()); + } + assertEquals(length, valueSet.size()); + 
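+ // toArray() should return exactly the values collected from the iterator above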
int[] array = map.toArray(); + assertEquals(length, array.length); + for (int value: array) { + assertTrue(valueSet.contains(value)); + } + + int[] array2 = new int[80]; + array2 = map.toArray(array2); + assertEquals(80, array2.length); + for (int value: array2) { + assertTrue(valueSet.contains(value)); + } + + int[] array3 = new int[120]; + array3 = map.toArray(array3); + for (int i = 0 ;i < length; ++i) { + assertTrue(valueSet.contains(array3[i])); + } + assertIntegerMaxValue(array3[length]); + + for (int i = 0; i < length; ++i) { + assertTrue(map.containsValue(i + 100)); + assertTrue(map.containsKey(i*64)); + } + + for (Iterator iit = map.keyIterator(); iit.hasNext(); ) { + iit.next(); + iit.remove(); + } + assertTrue(map.isEmpty()); + assertEquals(0, map.size()); + + } + + // now with random data.. and lots of it + @Test + public void test4() { + ObjectToIntMap map = new ObjectToIntMap(); + int length = ArrayHashMapTest.RANDOM_TEST_NUM_ITERATIONS; + + // for a repeatable random sequence + long seed = random.nextLong(); + Random random = new Random(seed); + + for (int i = 0; i < length; ++i) { + int value = random.nextInt(Integer.MAX_VALUE); + map.put(i*128, value); + } + + assertEquals(length, map.size()); + + // now repeat + random.setSeed(seed); + + for (int i = 0; i < length; ++i) { + int value = random.nextInt(Integer.MAX_VALUE); + assertTrue(map.containsValue(value)); + assertTrue(map.containsKey(i*128)); + assertEquals(value, map.remove(i*128)); + } + assertEquals(0, map.size()); + assertTrue(map.isEmpty()); + } + + @Test + public void testEquals() { + ObjectToIntMap map1 = new ObjectToIntMap(100); + ObjectToIntMap map2 = new ObjectToIntMap(100); + assertEquals("Empty maps should be equal", map1, map2); + assertEquals("hashCode() for empty maps should be equal", + map1.hashCode(), map2.hashCode()); + + for (int i = 0; i < 100; ++i) { + map1.put(i * 1.1f, 100 + i); + map2.put(i * 1.1f, 100 + i); + } + assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + + for (int i = 10; i < 20; i++) { + map1.remove(i * 1.1f); + } + assertFalse("Different maps should not be equal", map1.equals(map2)); + + for (int i = 19; i >=10; --i) { + map2.remove(i * 1.1f); + } + assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + + map1.put(-1.1f,-1); + map2.put(-1.1f,-2); + assertFalse("Different maps should not be equal", map1.equals(map2)); + + map2.put(-1.1f,-1); + assertEquals("Identical maps should be equal", map1, map2); + assertEquals("hashCode() for identical maps should be equal", + map1.hashCode(), map2.hashCode()); + } +} diff --git a/modules/facet/src/test/org/apache/lucene/util/collections/TestLRUHashMap.java b/modules/facet/src/test/org/apache/lucene/util/collections/TestLRUHashMap.java new file mode 100644 index 00000000000..48221b335c3 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/util/collections/TestLRUHashMap.java @@ -0,0 +1,61 @@ +package org.apache.lucene.util.collections; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.collections.LRUHashMap; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class TestLRUHashMap extends LuceneTestCase { + // testLRU() tests that the specified size limit is indeed honored, and + // the remaining objects in the map are indeed those that have been most + // recently used + @Test + public void testLRU() throws Exception { + LRUHashMap lru = new LRUHashMap(3); + assertEquals(0, lru.size()); + lru.put("one", "Hello world"); + assertEquals(1, lru.size()); + lru.put("two", "Hi man"); + assertEquals(2, lru.size()); + lru.put("three", "Bonjour"); + assertEquals(3, lru.size()); + lru.put("four", "Shalom"); + assertEquals(3, lru.size()); + assertNotNull(lru.get("three")); + assertNotNull(lru.get("two")); + assertNotNull(lru.get("four")); + assertNull(lru.get("one")); + lru.put("five", "Yo!"); + assertEquals(3, lru.size()); + assertNull(lru.get("three")); // three was the least recently used, so it got evicted + assertNotNull(lru.get("five")); + lru.get("four"); + lru.put("six", "hi"); + lru.put("seven", "hey dude"); + assertEquals(3, lru.size()); + assertNull(lru.get("one")); + assertNull(lru.get("two")); + assertNull(lru.get("three")); + assertNotNull(lru.get("four")); + assertNull(lru.get("five")); + assertNotNull(lru.get("six")); + assertNotNull(lru.get("seven")); + } +} diff --git a/modules/facet/src/test/org/apache/lucene/util/encoding/EncodingSpeed.java b/modules/facet/src/test/org/apache/lucene/util/encoding/EncodingSpeed.java new file mode 100644 index 00000000000..a78478aefc5 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/util/encoding/EncodingSpeed.java @@ -0,0 +1,659 @@ +package org.apache.lucene.util.encoding; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.text.NumberFormat; +import java.util.Arrays; + +import org.apache.lucene.util.encoding.DGapIntEncoder; +import org.apache.lucene.util.encoding.EightFlagsIntEncoder; +import org.apache.lucene.util.encoding.FourFlagsIntEncoder; +import org.apache.lucene.util.encoding.IntDecoder; +import org.apache.lucene.util.encoding.IntEncoder; +import org.apache.lucene.util.encoding.NOnesIntEncoder; +import org.apache.lucene.util.encoding.SortingIntEncoder; +import org.apache.lucene.util.encoding.UniqueValuesIntEncoder; +import org.apache.lucene.util.encoding.VInt8IntEncoder; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class EncodingSpeed { + + private static int[] data3630 = null; + private static int[] data9910 = null; + private static int[] data501871 = null; + private static int[] data10k = null; + private static String resultsFormat = "%-20s %10s %20d %26s %20d %26s"; + private static String headerFormat = "%-20s %10s %20s %26s %20s %26s"; + private static int integers = 100000000; + + private static NumberFormat nf; + + /** + * @param args + * @throws IOException + */ + public static void main(String[] args) throws IOException { + testFacetIDs(data3630, 3630); + testFacetIDs(data9910, 9910); + testFacetIDs(data10k, 10000); + testFacetIDs(data501871, 501871); + } + + private static void testFacetIDs(int[] facetIDs, int docID) + throws IOException { + int loopFactor = integers / facetIDs.length; + System.out + .println("\nEstimating ~" + + integers + + " Integers compression time by\nEncoding/decoding facets' ID payload of docID = " + + docID + " (unsorted, length of: " + facetIDs.length + + ") " + loopFactor + " times."); + + System.out.println(); + String header = String.format(headerFormat, "Encoder", "Bits/Int", + "Encode Time", "Encode Time", "Decode Time", "Decode Time"); + + System.out.println(header); + String header2 = String.format(headerFormat, "", "", "[milliseconds]", + "[microsecond / int]", "[milliseconds]", "[microsecond / int]"); + + System.out.println(header2); + + char[] separator = header.toCharArray(); + Arrays.fill(separator, '-'); + System.out.println(separator); + + encoderTest(new VInt8IntEncoder(), facetIDs, loopFactor); + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new VInt8IntEncoder())), facetIDs, loopFactor); + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder()))), facetIDs, loopFactor); + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new EightFlagsIntEncoder()))), facetIDs, loopFactor); + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new FourFlagsIntEncoder()))), facetIDs, loopFactor); + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(3)))), facetIDs, loopFactor); + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(4)))), facetIDs, loopFactor); + + System.out.println(); + } + + private static void encoderTest(IntEncoder encoder, int[] data, + int loopFactor) throws IOException { + + long startTime, endTime; + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + + // -- Looping 100 times as a warm up -------------------------- + for (int i = 100; i != 0; --i) { + baos.reset(); + encoder.reInit(baos); + for (int value : data) { + encoder.encode(value); + } + encoder.close(); + } + // ----------------------------------------------------------- + + startTime = System.currentTimeMillis(); + for (int factor = loopFactor; factor > 0; --factor) { + baos.reset(); + encoder.reInit(baos); + for (int value : data) { + encoder.encode(value); + } + encoder.close(); + } + endTime = System.currentTimeMillis(); + + 
long encodeTime = endTime - startTime; + + ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); + IntDecoder decoder = encoder.createMatchingDecoder(); + decoder.reInit(bais); + + // -- Looping 100 times as a warm up -------------------------- + for (int i = 100; i != 0; --i) { + bais.mark(baos.size()); + while (decoder.decode() != IntDecoder.EOS) { + } + bais.reset(); + decoder.reInit(bais); + } + // ----------------------------------------------------------- + + decoder.reInit(bais); + startTime = System.currentTimeMillis(); + for (int i = loopFactor; i > 0; --i) { + bais.mark(baos.size()); + while (decoder.decode() != IntDecoder.EOS) { + } + bais.reset(); + decoder.reInit(bais); + } + + endTime = System.currentTimeMillis(); + long decodeTime = endTime - startTime; + + System.out.println(String.format(resultsFormat, encoder, nf.format(baos + .size() + * 8.0 / data.length), encodeTime, nf.format(encodeTime + * 1000000.0 / (loopFactor * data.length)), decodeTime, nf + .format(decodeTime * 1000000.0 / (loopFactor * data.length)))); + } + + static { + nf = NumberFormat.getInstance(); + nf.setMaximumFractionDigits(4); + nf.setMinimumFractionDigits(4); + + data9910 = new int[] { 2, 4, 149085, 11, 12292, 69060, 69061, 149309, + 99090, 568, 5395, 149310, 3911, 149311, 149312, 148752, 1408, + 1410, 1411, 1412, 4807, 1413, 1414, 1417, 1415, 1416, 1418, + 1420, 470, 4808, 1422, 1423, 1424, 4809, 4810, 1427, 1429, + 1430, 4811, 1432, 1433, 3752, 1435, 3753, 1437, 1439, 1440, + 4812, 1442, 1443, 4813, 1445, 1446, 1447, 4814, 4815, 1450, + 4816, 353, 1452, 89004, 1624, 1625, 2052, 1626, 1627, 63991, + 725, 726, 727, 728, 35543, 729, 730, 731, 1633, 733, 734, 735, + 37954, 737, 738, 76315, 23068, 76316, 1634, 740, 741, 742, 744, + 745, 76317, 15645, 748, 17488, 2904, 89005, 752, 753, 89006, + 754, 755, 756, 757, 41, 261, 758, 89007, 760, 762, 763, 89008, + 764, 765, 766, 85930, 165, 768, 149313, 33593, 149314, 149315, + 81589, 39456, 15467, 1296, 149316, 39457, 2235, 144, 2236, + 2309, 3050, 2237, 2311, 89692, 2240, 2241, 2243, 2244, 2245, + 2246, 2314, 12856, 2248, 2250, 2251, 2253, 2254, 12857, 7677, + 12858, 39149, 2257, 23147, 3303, 2258, 7422, 2322, 2262, 2317, + 2263, 7423, 24264, 2232, 89693, 12862, 89694, 12863, 12864, + 23201, 2329, 33019, 2255, 12865, 3517, 2492, 2277, 2280, 2267, + 2260, 25368, 12866, 2281, 2282, 2283, 12867, 2284, 9055, 2287, + 125133, 2337, 2286, 2288, 2338, 125134, 2290, 125135, 12869, + 965, 966, 1298, 17945, 1300, 970, 971, 972, 973, 974, 296, + 17946, 1303, 1391, 902, 1304, 1395, 1308, 1309, 1310, 1312, + 967, 9414, 1315, 1317, 1318, 9415, 1321, 23592, 1322, 22433, + 1323, 1324, 1326, 109241, 31225, 1330, 1331, 2540, 27196, 1332, + 1334, 1335, 11999, 414, 340, 3651, 44040, 31995, 1344, 1343, + 4618, 116770, 116771, 1474, 1349, 42122, 14199, 149317, 451, + 149318, 29, 14200, 14198, 14201, 1979, 1980, 1981, 3132, 3147, + 34090, 1987, 12770, 1329, 80818, 80819, 1988, 23522, 1986, + 15880, 1985, 32975, 1992, 1993, 7165, 3141, 3143, 86346, 1982, + 1984, 3145, 86347, 78064, 23456, 29578, 3136, 17752, 4710, + 4711, 4712, 149319, 424, 4713, 95735, 4715, 149320, 4717, 4718, + 149321, 192, 149322, 108126, 29976, 5404, 38059, 5406, 2030, + 289, 1804, 1557, 1558, 94080, 29651, 94317, 1561, 1562, 1563, + 1565, 24632, 1927, 1928, 1566, 1570, 1571, 1572, 1573, 1574, + 1575, 94318, 1576, 2674, 9351, 94319, 94320, 2677, 2678, 29654, + 2946, 2945, 2682, 2683, 2947, 3102, 3402, 3104, 4780, 3106, + 3107, 3108, 3109, 3110, 3111, 3112, 3113, 3114, 3116, 3117, + 3118, 
19610, 44805, 3119, 3407, 3121, 3122, 3124, 3126, 3127, + 41745, 41746, 3130, 459, 460, 461, 462, 463, 464, 466, 467, + 40306, 468, 471, 472, 40307, 4467, 475, 476, 477, 478, 479, + 40308, 481, 482, 20129, 483, 484, 485, 486, 4473, 488, 489, + 458, 491, 40309, 494, 495, 496, 497, 499, 500, 501, 502, 355, + 356, 1549, 358, 359, 360, 37971, 362, 2579, 2581, 24578, 2583, + 24579, 2586, 2587, 2588, 2590, 2591, 24580, 24581, 3666, 24582, + 2594, 24583, 2596, 2597, 24584, 2599, 18013, 24585, 2601, + 49361, 280, 3969, 11651, 11652, 3926, 5103, 11653, 11654, + 11655, 6896, 417, 168, 149323, 11268, 11657, 38089, 59517, + 149324, 38092, 149325, 5110, 38094, 59520, 38096, 38097, 28916, + 59703, 4992, 149326, 32383, 2478, 3985, 2479, 2480, 2481, 2482, + 2483, 2484, 2485, 2486, 24146, 22184, 2488, 2489, 2490, 2494, + 2493, 18043, 2495, 2542, 2497, 5062, 2499, 2501, 24147, 24148, + 2504, 2505, 2506, 2507, 2508, 394, 2660, 2509, 2511, 24149, + 2512, 2513, 2514, 3988, 4410, 3989, 2518, 2522, 2521, 24150, + 12082, 2524, 3990, 24151, 387, 24152, 2529, 2530, 2528, 3991, + 24153, 2534, 24154, 2536, 24155, 2538, 22510, 6332, 3554, 5309, + 7700, 6333, 6334, 6335, 6336, 6337, 5693, 117020, 6339, 149327, + 149328, 149329, 6340, 6343, 117022, 4324, 283, 284, 285, 286, + 2688, 287, 2689, 288, 8880, 290, 291, 2690, 292, 295, 294, + 24543, 13899, 297, 298, 299, 300, 303, 301, 59178, 302, 8881, + 34403, 13900, 17811, 305, 307, 306, 308, 2727, 368, 364, + 110416, 1587, 366, 367, 2692, 26624, 7233, 9082, 35684, 7250, + 13902, 304, 13903, 991, 110417, 273, 274, 275, 276, 277, 278, + 41095, 281, 282, 4419, 2768, 229, 230, 231, 232, 233, 234, 235, + 236, 237, 1065, 239, 2745, 2746, 240, 9250, 241, 242, 244, 245, + 9251, 246, 247, 248, 249, 250, 251, 253, 254, 255, 9252, 257, + 258, 259, 9253, 9254, 2751, 265, 266, 267, 268, 9255, 9256, + 270, 271, 9257, 238, 1024, 829, 1025, 1026, 1028, 1029, 1030, + 9258, 1032, 1033, 1034, 1027, 1035, 1036, 9259, 1037, 1038, + 1039, 4594, 4429, 1041, 1042, 1043, 70332, 1045, 1046, 1047, + 1048, 21128, 1050, 122787, 72433, 1052, 2762, 1054, 1055, 1056, + 9548, 1057, 71311, 1058, 1059, 1060, 61413, 2765, 4436, 1064, + 1066, 11610, 3485, 22357, 104580, 149330, 149331, 15471, 5679, + 5680, 687, 5683, 5684, 953, 8849, 102120, 149332, 5688, 5689, + 149333, 6920, 60202, 33855, 33856, 33857, 19163, 33858, 3491, + 149334, 914, 2202, 916, 917, 919, 920, 921, 922, 3568, 924, + 925, 926, 927, 928, 929, 8752, 931, 932, 933, 934, 3570, 1876, + 9138, 1877, 1878, 2210, 1880, 1881, 3571, 1883, 1884, 2212, + 1886, 2214, 1888, 1889, 1890, 8753, 1891, 1892, 1893, 1894, + 1895, 1896, 1898, 2217, 3572, 1901, 1902, 688, 2219, 107, 1904, + 1905, 3573, 1907, 3323, 1909, 1910, 1911, 8754, 1912, 55911, + 1913, 1914, 3574, 1741, 3575, 1916, 2226, 3576, 1919, 2227, + 1920, 3577, 3578, 2229, 1923, 85396, 174, 175, 114875, 178, + 180, 181, 182, 1477, 185, 186, 172, 187, 188, 85397, 85398, + 190, 191, 891, 893, 19778, 18068, 895, 897, 896, 25985, 894, + 900, 361, 1206, 193, 194, 195, 196, 197, 198, 199, 200, 55009, + 201, 33266, 29064, 204, 205, 40129, 206, 207, 208, 2842, 209, + 210, 211, 212, 149335, 870, 871, 18005, 872, 18006, 874, 875, + 876, 1479, 1480, 1481, 879, 881, 57212, 2779, 57213, 886, 887, + 57214, 57215, 889, 890, 806, 69305, 808, 809, 86327, 812, 813, + 814, 815, 26724, 816, 69307, 43484, 818, 819, 63904, 820, 821, + 822, 86328, 13498, 824, 825, 12218, 149336, 49042, 4464, 4466, + 35536, 73245, 73246, 474, 73247, 480, 46247, 29624, 21086, + 73248, 490, 493, 73249, 73250, 401, 403, 405, 2860, 15483, + 
74826, 408, 409, 74827, 410, 411, 413, 74828, 415, 2863, 68707, + 33284, 2865, 2866, 2867, 2868, 2869, 2870, 17976, 3032, 38498, + 7350, 2876, 2874, 24506, 918, 923, 64562, 64563, 32648, 930, + 1875, 32649, 1879, 32650, 1882, 1887, 32651, 64564, 32652, + 1897, 32653, 18170, 1900, 32654, 1906, 1915, 64565, 1921, 1922, + 90662, 2234, 37449, 8886, 37450, 7418, 37451, 37452, 37453, + 37454, 1609, 1610, 1611, 1612, 113456, 1212, 1616, 1617, + 113457, 1615, 1619, 113458, 1620, 8747, 113459, 8748, 42233, + 78065, 42235, 2149, 42236, 78066, 42237, 42238, 4335, 42239, + 78067, 42241, 78068, 42243, 78069, 42244, 78070, 54587, 12993, + 2040, 1130, 1131, 51172, 1133, 1134, 1135, 1136, 1137, 1138, + 1139, 1140, 1141, 149337, 1115, 5178, 149338, 452, 7784, 21522, + 1361, 103718, 149339, 15990, 79432, 149340, 4232, 149341, + 15998, 53917, 15996, 53918, 149342, 149343, 97544, 53920, + 97546, 841, 1954, 842, 41926, 844, 2589, 845, 846, 27370, 848, + 849, 41927, 25165, 852, 1956, 854, 856, 1957, 855, 1959, 35170, + 23055, 75673, 116783, 857, 116784, 851, 116785, 858, 859, 860, + 861, 57422, 1964, 864, 866, 867, 1965, 1966, 1968, 1969, 2989, + 116786, 1972, 1973, 116787, 1975, 1976, 1977, 2580, 39540, + 2585, 39541, 21755, 39542, 2592, 34859, 2593, 39543, 38540, + 2595, 39544, 149344, 35433, 81849, 35434, 40257, 873, 877, + 2778, 32040, 882, 883, 884, 885, 888, 3358, 1559, 1560, 1438, + 25387, 1569, 38135, 66925, 2673, 3095, 2679, 59053, 25443, + 34369, 1983, 17749, 9343, 1989, 13565, 31525, 61690, 18165, + 17751, 78234, 26506, 9348, 20307, 18154, 3133, 2572, 3134, + 12131, 19770, 48724, 25759, 13549, 65465, 19936, 13545, 25645, + 4786, 15756, 19547, 1581, 92226, 1362, 21524, 13059, 23717, + 149345, 20198, 27123, 149346, 149347, 26030, 27126, 27652, + 10538, 1667, 40282, 14134, 40284, 16368, 149348, 40287, 8870, + 40288, 149349, 40289, 149350, 149351, 40295, 10424, 7012, + 13178, 45608, 10423, 13181, 4201, 672, 13182, 10174, 10607, + 13183, 580, 149352, 149353, 96298, 53691, 3721, 66048, 21584, + 149354, 48206, 48207, 149355, 1405, 1406, 1407, 11162, 577, + 149356, 6941, 6942, 16583, 1284, 10511, 16584, 16585, 422, 423, + 1249, 1244, 1245, 1247, 2544, 1248, 1250, 2545, 1252, 2547, + 1253, 2549, 1259, 1257, 1258, 1260, 1261, 2551, 1262, 1263, + 1264, 1265, 2553, 1266, 17795, 2554, 17796, 1270, 1271, 1273, + 17797, 2556, 1275, 1276, 2557, 1277, 1278, 1279, 1280, 1282, + 68, 69, 5080, 5256, 6869, 10148, 6960, 10150, 149357, 10152, + 14413, 149358, 14414, 56037, 651, 56038, 131797, 555, 14415, + 14416, 149359, 149360, 56042, 14418, 149361, 149, 56043, 97512, + 34512, 797, 7396, 9395, 9396, 9397, 63417, 805, 23984, 13665, + 10452, 55147, 5656, 53, 4348, 4349, 4350, 148488, 13669, 6527, + 149362, 11374, 11376, 11377, 8092, 11378, 11380, 152, 5013, + 8093, 561, 11381, 5623, 4176, 26840, 3564, 3565, 3708, 3567, + 18783, 18784, 4039, 10540, 18786, 30100, 30101, 1528, 149363, + 19561, 19562, 19563, 19564, 1110, 134146, 10600, 149364, 10602, + 149365, 149366, 10603, 10604, 4981, 57075, 37508, 149367, + 34589, 1209, 149368, 19592, 19593, 7620, 9674, 3481, 10240, + 81835, 8001, 33872, 8907, 55155, 1585, 31731, 49694, 25760, + 31733, 903, 904, 2539, 49695, 1194, 1195, 1196, 31734, 1197, + 1198, 1199, 1593, 899, 1200, 1201, 9276, 1202, 40181, 40482, + 55718, 80833, 24596, 3669, 15699, 55720, 55721, 40481, 3672, + 39826, 80363, 2602, 2603, 2604, 62126, 2605, 2606, 2607, 8714, + 2608, 2609, 2610, 2612, 149369, 2894, 15241, 15242, 15262, + 5384, 20290, 20291, 7792, 20295, 64413, 39236, 18011, 71494, + 898, 51015, 19782, 
105107, 149370, 7634, 149371, 149372, + 115458, 22821, 19894, 2213, 66926 }; + + data3630 = new int[] { 2, 4, 86133, 11, 16505, 86134, 86135, 86136, + 1290, 86137, 86138, 32473, 19346, 32474, 4922, 32475, 86139, + 16914, 86140, 86141, 86142, 86143, 32478, 86144, 86145, 32480, + 4884, 4887, 32481, 86146, 16572, 86147, 16295, 165, 86148, + 3183, 21920, 21921, 21922, 555, 4006, 32484, 21925, 21926, + 13775, 86149, 13777, 85833, 85834, 13779, 13773, 13780, 75266, + 17674, 13784, 13785, 13786, 13787, 13788, 6258, 86150, 13790, + 75267, 13793, 13794, 13795, 312, 4914, 4915, 6222, 86151, 4845, + 4883, 4918, 4894, 4919, 86152, 4921, 6223, 6224, 6225, 6226, + 67909, 6229, 18170, 6230, 5198, 25625, 6231, 6232, 6233, 1808, + 6234, 6235, 6236, 41376, 6238, 6239, 67911, 6240, 86153, 6243, + 6244, 83549, 6246, 6247, 6248, 6249, 782, 444, 6251, 6250, + 19863, 28963, 310, 2234, 144, 2236, 2309, 69437, 2311, 2325, + 2241, 69438, 69439, 2244, 2245, 2246, 23504, 2314, 69440, + 36603, 2250, 2268, 2271, 2251, 2254, 2255, 2257, 2240, 36604, + 84726, 36605, 84727, 2262, 2263, 18431, 38853, 2317, 2149, + 2326, 2327, 2329, 3980, 2275, 2277, 2258, 84728, 2260, 84729, + 84730, 13766, 36607, 2282, 2283, 84731, 2284, 2286, 2287, 2337, + 7424, 2288, 2338, 3522, 2290, 84733, 32902, 371, 37708, 2096, + 3065, 3066, 375, 377, 374, 378, 2100, 86154, 381, 382, 58795, + 379, 383, 384, 385, 4449, 387, 388, 389, 390, 9052, 391, 18358, + 2107, 394, 2111, 2108, 393, 2109, 395, 86155, 86156, 397, 2113, + 398, 399, 400, 273, 274, 275, 40980, 276, 277, 31716, 279, 280, + 31717, 281, 282, 1628, 1623, 1624, 1625, 2052, 1626, 725, 727, + 728, 729, 730, 731, 1633, 733, 734, 735, 86157, 737, 738, 739, + 1634, 3563, 3564, 3565, 1667, 12461, 76276, 3567, 5413, 77622, + 5415, 5416, 5417, 5418, 107, 86158, 7784, 15363, 153, 3723, + 2713, 7786, 3835, 7787, 86159, 7789, 7791, 7792, 7794, 86160, + 7796, 86161, 6708, 7798, 7799, 7800, 7801, 7802, 7803, 1665, + 43150, 15365, 1581, 5656, 43152, 80258, 7450, 39922, 86162, + 51587, 9059, 4606, 396, 86163, 86164, 7250, 401, 403, 2860, + 33281, 2964, 408, 9119, 409, 86165, 7669, 2861, 410, 413, + 86166, 414, 415, 33282, 405, 33283, 7498, 2865, 7230, 33284, + 2866, 86167, 2867, 47518, 2868, 86168, 2869, 2870, 4712, 7096, + 28484, 6913, 6914, 6915, 6916, 37169, 37170, 7103, 28269, 6919, + 86169, 45431, 6922, 7104, 6923, 7108, 6924, 6925, 6926, 6927, + 6928, 86170, 86171, 86172, 6930, 6931, 6932, 6934, 6935, 6936, + 451, 6937, 6938, 4756, 3554, 5309, 8145, 3586, 16417, 9767, + 14126, 25854, 6580, 10174, 86173, 5519, 21309, 8561, 20938, + 10386, 86174, 781, 2030, 16419, 30323, 16420, 16421, 16424, + 86175, 86176, 86177, 28871, 86178, 28872, 63980, 6329, 49561, + 4271, 38778, 86179, 86180, 20126, 16245, 193, 195, 196, 197, + 56973, 199, 200, 201, 202, 203, 204, 56974, 56975, 205, 206, + 4662, 207, 208, 209, 210, 211, 212, 47901, 641, 642, 643, 1380, + 1079, 47902, 1381, 1081, 1082, 1083, 47903, 1382, 47904, 1087, + 47905, 965, 966, 1298, 968, 1387, 1300, 50288, 971, 972, 973, + 974, 23974, 22183, 1390, 23313, 1389, 1391, 902, 23029, 296, + 1304, 1395, 1303, 1309, 1308, 50289, 1312, 50290, 50291, 1315, + 1317, 9270, 19796, 3605, 1320, 1321, 44946, 1322, 1323, 50292, + 967, 1587, 1326, 1331, 17482, 633, 29115, 53858, 29118, 29119, + 62624, 44494, 6965, 6966, 6959, 6967, 71562, 6969, 23459, + 23460, 17464, 4225, 23461, 23462, 23463, 5893, 23464, 17467, + 17468, 23465, 12562, 1405, 1406, 1407, 960, 961, 962, 687, 963, + 86181, 86182, 5997, 10812, 11976, 11977, 1850, 577, 13393, + 10810, 13394, 65040, 86183, 
3935, 3936, 3937, 710, 86184, 5785, + 5786, 29949, 5787, 5788, 283, 284, 2687, 285, 286, 287, 2689, + 288, 289, 8880, 290, 2690, 13899, 991, 292, 295, 42007, 35616, + 63103, 298, 299, 3520, 297, 9024, 303, 301, 302, 300, 31345, + 3719, 304, 305, 306, 307, 308, 368, 364, 85002, 9026, 63105, + 367, 39596, 25835, 19746, 293, 294, 26505, 85003, 18377, 56785, + 10122, 10123, 10124, 86185, 39863, 86186, 10125, 39865, 4066, + 4067, 24257, 4068, 4070, 86187, 4073, 4074, 86188, 4076, 7538, + 4077, 86189, 4078, 4079, 7540, 7541, 4084, 4085, 7542, 86190, + 4086, 86191, 4087, 4088, 86192, 7545, 44874, 7821, 44875, + 86193, 4286, 86194, 51470, 17609, 1408, 47486, 1411, 1412, + 47487, 1413, 1414, 1417, 1415, 47488, 1416, 1418, 1420, 470, + 1422, 1423, 1424, 5001, 5002, 47489, 1427, 1429, 1430, 31811, + 1432, 1433, 47490, 1435, 3753, 1437, 1439, 1440, 47491, 1443, + 47492, 1446, 5004, 5005, 1450, 47493, 353, 1452, 42145, 3103, + 3402, 3104, 3105, 4780, 3106, 3107, 3108, 12157, 3111, 42146, + 42147, 3114, 4782, 42148, 3116, 3117, 42149, 42150, 3407, 3121, + 3122, 18154, 3126, 3127, 3128, 3410, 3130, 3411, 3412, 3415, + 24241, 3417, 3418, 3449, 42151, 3421, 3422, 7587, 42152, 3424, + 3427, 3428, 3448, 3430, 3432, 42153, 42154, 41648, 1991, 407, + 57234, 411, 2862, 57235, 2863, 18368, 57236, 2874, 7350, 4115, + 2876, 2877, 17975, 86195, 4116, 2881, 2882, 2883, 2886, 463, + 870, 872, 873, 874, 875, 8783, 8784, 877, 1480, 1481, 459, + 2778, 881, 8785, 2779, 8786, 8787, 8788, 886, 887, 8789, 889, + 8790, 86196, 6920, 86197, 5080, 5081, 7395, 7396, 9395, 9396, + 1528, 42737, 805, 86198, 1209, 13595, 4126, 9680, 34368, 9682, + 86199, 86200, 174, 175, 176, 177, 178, 179, 180, 182, 183, + 1477, 31138, 186, 172, 187, 188, 189, 190, 191, 458, 871, + 31294, 31295, 27604, 31296, 31297, 882, 883, 884, 31298, 890, + 1089, 1488, 1489, 1092, 1093, 1094, 1095, 1096, 1097, 1490, + 1098, 1495, 1502, 1099, 1100, 1101, 1493, 2997, 12223, 1103, + 2654, 1498, 1499, 1500, 80615, 80616, 80617, 33359, 86201, + 9294, 1501, 86202, 1506, 1507, 23454, 38802, 38803, 1014, + 86203, 5583, 5584, 651, 74717, 5586, 5587, 5588, 5589, 74720, + 5590, 38808, 33527, 78330, 10930, 5119, 10931, 1000, 10928, + 10932, 10933, 10934, 10935, 5863, 10936, 86204, 10938, 10939, + 86205, 192, 194, 38754, 38755, 198, 38756, 38757, 38758, 2842, + 640, 22780, 22781, 1080, 86206, 86207, 1084, 1086, 1088, 63916, + 9412, 970, 9413, 9414, 9415, 9416, 9417, 1310, 7168, 7169, + 1318, 9418, 1324, 39159, 1804, 1557, 24850, 41499, 1560, 41500, + 1562, 1563, 1565, 1927, 1928, 1566, 1569, 1570, 1571, 1572, + 1573, 1574, 1575, 1576, 2674, 2677, 2678, 2679, 2946, 2682, + 2676, 2683, 2947, 1156, 1157, 1158, 1467, 1160, 1468, 1469, + 1161, 1162, 1163, 4369, 1165, 1166, 1167, 12923, 2917, 1169, + 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177, 18153, 8359, + 1178, 1164, 1191, 1180, 12924, 86208, 86209, 54817, 66962, + 2476, 86210, 86211, 41820, 41821, 41822, 41824, 1130, 1131, + 1132, 32692, 1134, 34848, 1136, 1133, 1137, 1138, 1139, 1140, + 1141, 1143, 1144, 1145, 34849, 2639, 34850, 1146, 1147, 1148, + 34851, 1150, 1151, 1152, 1153, 1154, 1155, 1678, 1679, 1680, + 1681, 40870, 2059, 1685, 1686, 32686, 14970, 1688, 1689, 86212, + 1692, 1682, 1693, 1695, 1696, 1698, 12955, 8909, 41690, 1700, + 41691, 86213, 30949, 41692, 1703, 1704, 1705, 41693, 14976, + 1708, 2071, 1709, 1710, 1711, 1712, 1727, 86214, 86215, 86216, + 1715, 86217, 1714, 1717, 1690, 41697, 86218, 1720, 86219, 2073, + 41699, 1724, 2075, 1726, 1729, 1730, 1732, 2078, 2223, 1735, + 1713, 41700, 1737, 14977, 1739, 
1740, 1741, 2080, 1743, 1744, + 1745, 1746, 1747, 1748, 1749, 1750, 1751, 41701, 1752, 1753, + 1909, 86220, 2085, 1754, 19548, 86221, 19551, 5733, 3856, 5190, + 4581, 25145, 86222, 86223, 4846, 86224, 4861, 86225, 86226, + 86227, 25150, 86228, 86229, 13820, 2027, 4898, 4899, 4901, + 2135, 4902, 4868, 4904, 86230, 4905, 25155, 4907, 86231, 4909, + 4910, 4911, 4912, 86232, 6220, 81357, 86233, 2589, 73877, + 29706, 6227, 6228, 86234, 6237, 86235, 6241, 6242, 1812, 13808, + 13809, 70908, 2293, 2294, 86236, 2295, 2296, 2297, 22947, + 16511, 2299, 2300, 2301, 13097, 73079, 86237, 13099, 50121, + 86238, 86239, 13101, 86240, 2424, 4725, 4726, 4727, 4728, 4729, + 4730, 86241, 26881, 10944, 4734, 4735, 4736, 26239, 26240, + 71408, 86242, 57401, 71410, 26244, 5344, 26245, 86243, 4102, + 71414, 11091, 6736, 86244, 6737, 6738, 38152, 6740, 6741, 6742, + 6298, 6743, 6745, 6746, 20867, 6749, 20616, 86245, 9801, 65297, + 20617, 65298, 20619, 5629, 65299, 20621, 20622, 8385, 20623, + 20624, 5191, 20625, 20626, 442, 443, 445, 27837, 77681, 86246, + 27839, 86247, 86248, 41435, 66511, 2478, 2479, 2480, 2481, + 2482, 2483, 2484, 2485, 2486, 2487, 2488, 2489, 2490, 2494, + 2493, 33025, 12084, 2542, 2497, 2499, 2501, 2503, 2504, 2505, + 33026, 2506, 2507, 2508, 2509, 2511, 1787, 12080, 2513, 2514, + 3988, 3176, 3989, 2518, 2521, 9285, 2522, 2524, 2525, 3990, + 2527, 2528, 27499, 2529, 2530, 3991, 2532, 2534, 2535, 18038, + 2536, 2538, 2495, 46077, 61493, 61494, 1006, 713, 4971, 4972, + 4973, 4975, 4976, 650, 170, 7549, 7550, 7551, 7552, 7553, + 86249, 7936, 956, 11169, 11170, 1249, 1244, 1245, 1247, 2544, + 1250, 2545, 1252, 2547, 1253, 1254, 2549, 39636, 1259, 1257, + 1258, 39637, 1260, 1261, 2551, 1262, 1263, 848, 86250, 86251, + 854, 74596, 856, 1957, 86252, 855, 1959, 1961, 857, 86253, 851, + 859, 860, 862, 1964, 864, 865, 866, 867, 1965, 1966, 1967, + 1968, 1969, 86254, 1971, 1972, 1973, 1974, 1975, 1976, 1977, + 841, 1954, 842, 2978, 846, 847, 849, 850, 852, 1956, 17452, + 71941, 86255, 86256, 73665, 1471, 13690, 185, 503, 504, 2342, + 505, 506, 4378, 508, 4379, 17313, 510, 511, 512, 520, 513, + 4384, 17314, 514, 515, 46158, 17317, 518, 34269, 519, 4386, + 523, 524, 525, 46159, 528, 529, 17319, 531, 532, 533, 534, 535, + 7482, 537, 538, 5267, 536, 539, 541, 540, 19858, 17320, 17321, + 906, 907, 908, 17322, 910, 17323, 912, 15850, 913, 4398, 17324, + 86257, 278, 2948, 2949, 2950, 3007, 2951, 2952, 2953, 2954, + 2955, 3013, 35352, 3014, 3015, 2962, 3016, 33505, 39118, 3017, + 3018, 20492, 4000, 3021, 3022, 35353, 39293, 3024, 18443, 3029, + 9467, 20529, 39119, 8380, 2965, 3030, 3043, 22714, 39120, 2956, + 3035, 39121, 3037, 3038, 2688, 86258, 36675, 30894, 24505, + 8888, 13541, 49728, 27660, 9082, 27661, 365, 366, 2232, 76098, + 7233, 1494, 17391, 606, 607, 611, 610, 612, 614, 615, 613, 616, + 9117, 617, 618, 21155, 1789, 619, 620, 7636, 12019, 621, 622, + 1793, 623, 625, 624, 631, 626, 627, 21578, 21103, 628, 21579, + 629, 9122, 9123, 12189, 9289, 3168, 3169, 630, 632, 634, 21580, + 9121, 635, 636, 637, 21581, 12781, 1801, 638, 639, 1559, 24343, + 9419, 9420, 795, 796, 1611, 86259, 1612, 21551, 21552, 3741, + 1617, 3742, 1615, 1619, 1620, 6301, 3744, 1622, 67685, 8521, + 55937, 9025, 27663, 8881, 13581, 86260, 11592, 44720, 86261, + 63231, 50873, 42925, 52332, 86262, 72706, 17705, 17707, 17708, + 3401, 40217, 1248, 40218, 86263, 7098, 86264, 86265, 1264, + 86266, 1266, 1267, 1268, 1269, 86267, 1271, 1272, 1273, 1274, + 2556, 1275, 1276, 1277, 1278, 1279, 1280, 1282, 1283, 22680, + 11889, 86268, 45662, 7038, 
86269, 19315, 45663, 45664, 86270, + 5855, 34002, 49245, 10447, 5663, 86271, 15429, 53877, 49249, + 86272, 86273, 86274, 60128, 60453, 60129, 5552, 31923, 43407, + 4287, 17980, 64977, 86275, 86276, 8234, 86277, 3649, 8240, + 1330, 11999, 1332, 27618, 1334, 1335, 340, 3651, 25640, 18165, + 1343, 4618, 1474, 3653, 75921, 1349, 53519, 1779, 45454, 22778, + 40153, 67677, 63826, 45455, 15128, 67678, 67679, 1792, 67680, + 3171, 47816, 45457, 9288, 59891, 67681, 25703, 35731, 35732, + 369, 35713, 35714, 35715, 34652, 35716, 31681, 35717, 12779, + 35718, 35719, 11992, 806, 807, 808, 43499, 43500, 810, 776, + 812, 813, 814, 241, 43501, 43502, 816, 755, 43503, 818, 819, + 820, 43504, 821, 822, 823, 824, 825, 826, 43505, 43506, 43507, + 828, 829, 20083, 43508, 43509, 832, 833, 834, 835, 86278, + 19984, 19985, 86279, 24125, 19986, 86280, 19988, 86281, 5414, + 86282, 85808, 5479, 5420, 5421, 5422, 5423, 63800, 86283, + 86284, 30965, 86285, 416, 1510, 5740, 5741, 81991, 86286, + 28938, 50149, 1003, 55512, 14306, 6960, 688, 86287, 14307, + 5399, 5400, 17783, 24118, 720, 86288, 44913, 24557, 667, 24876, + 6529, 24877, 24878, 24879, 24880, 31847, 20671, 4011, 171, 580, + 86289, 3863, 914, 2202, 916, 917, 918, 919, 921, 922, 923, + 7585, 925, 7586, 926, 927, 928, 7588, 929, 930, 931, 932, 933, + 934, 1875, 1876, 7589, 7590, 1878, 1879, 7591, 7592, 1882, + 1883, 1884, 2212, 7593, 1887, 1888, 1889, 1890, 1891, 1892, + 1893, 1894, 1895, 1896, 1897, 1898, 2217, 1900, 7594, 1902, + 2219, 7595, 1905, 1906, 1907, 3323, 7596, 1911, 1912, 7597, + 1914, 1915, 1916, 2226, 1919, 7598, 2227, 1920, 1921, 7599, + 7600, 4708, 1923, 355, 356, 1549, 358, 32077, 360, 32078, + 21117, 362, 19043, 71677, 5716, 86290, 49790, 86291, 86292, + 86293, 49792, 86294, 86295, 49794, 86296, 86297, 86298, 86299, + 11882, 86300, 49798, 86301, 49800, 49801, 49802, 49803, 453, + 49804, 8591, 6794, 49806, 18989, 49807, 49808, 16308, 49809, + 86302, 86303, 10105, 86304, 5285, 10106, 10107, 6557, 86305, + 23571, 10109, 38883, 10110, 5401, 86306, 67557, 16430, 67558, + 40171, 16433, 25878, 86307, 21762, 23, 86308, 86309, 21766, + 86310, 86311, 5149, 3926, 21768, 21769, 47826, 942, 46985, + 6588, 58867, 6589, 6590, 86312, 6592, 6006, 53855, 9565, 359, + 86313, 2845, 876, 879, 27556, 27557, 885, 27558, 888, 2847, + 27559, 2115, 2116, 2117, 53962, 57839, 315, 316, 317, 318, 319, + 86314, 321, 322, 2122, 323, 2123, 324, 325, 328, 326, 327, + 40542, 329, 330, 18079, 18080, 331, 1790, 7382, 332, 7380, + 7236, 23413, 23414, 18924, 18925, 333, 335, 336, 39750, 337, + 86315, 339, 341, 342, 343, 16264, 16265, 6615, 86316, 86317, + 86318, 86319, 16269, 10538, 33226, 86320, 16272, 5824, 16273, + 16274, 16276, 16277, 16278, 16279, 16280, 14517, 1547, 6463, + 3394, 49677, 659, 10380, 30013, 10382, 10378, 10379, 10383, + 10384, 10385, 86321, 4139, 13370, 13371, 86322, 86323, 11878, + 64509, 15141, 15142, 15143, 32737, 14183, 15144, 39101, 42768, + 5645, 32738, 801, 803, 804, 86324, 14707, 86325, 6601, 12402, + 712, 12403, 2936, 1447, 15477, 1410, 44872, 1550, 8614, 15478, + 15479, 15480, 15481, 4811, 3752, 1442, 15482, 8818, 1445, 5006, + 16304, 32277, 16305, 16306, 86326, 16307, 53691, 69305, 809, + 86327, 815, 26724, 69307, 43484, 63904, 86328, 13498, 827, + 86329, 831, 2857, 836, 86330, 86331, 837, 838, 839, 840, 228, + 229, 43722, 230, 231, 43723, 234, 235, 236, 237, 238, 239, + 2745, 2746, 240, 242, 243, 244, 43724, 19788, 246, 247, 21134, + 248, 250, 251, 252, 253, 254, 255, 256, 257, 258, 43725, 43726, + 41, 43727, 262, 43728, 2751, 264, 265, 266, 267, 
268, 269, 270, + 271, 272, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, + 1033, 1034, 43729, 1035, 43730, 1037, 21821, 2926, 14388, + 10432, 14389, 14390, 14391, 14392, 86332, 14394, 14395, 2035, + 2169, 86333, 14397, 14398, 14399, 14400, 52, 14401, 14402, + 7077, 21822, 14405, 14406, 14396, 86334, 17356, 17357, 84679, + 84680, 76383, 17360, 17361, 86335, 38801, 2060, 30850, 12963, + 1684, 1687, 2061, 14978, 1694, 43387, 1697, 1699, 2067, 1701, + 1702, 1706, 43388, 43389, 76325, 1716, 1718, 26832, 1719, 1723, + 2081, 2063, 1728, 39059, 76326, 1731, 86336, 1736, 76327, 1738, + 19657, 6579, 6581, 6582, 6583, 6584, 6585, 29979, 1818, 28239, + 68, 69, 3391, 86337, 10266, 63528, 86338, 10269, 10270, 10271, + 10272, 86339, 86340, 63530, 63531, 63532, 63533, 10273, 63534, + 86341, 10681, 10682, 86342, 9673, 86343, 10683, 460, 461, 462, + 467, 4464, 4466, 3729, 471, 472, 468, 81634, 474, 81635, 475, + 476, 477, 479, 480, 81636, 81637, 482, 17442, 81638, 81639, + 484, 485, 486, 4473, 488, 489, 490, 493, 466, 494, 495, 496, + 497, 499, 500, 501, 502, 34376, 86344, 63836, 56281, 1707, + 20416, 61452, 56282, 1755, 56283, 56284, 18508, 53650, 63444, + 86345, 3579, 63445, 3677, 1979, 1980, 1981, 3132, 3147, 34090, + 1987, 12770, 1329, 80818, 80819, 1988, 23522, 1986, 15880, + 1985, 32975, 1992, 1993, 7165, 3141, 3143, 86346, 1982, 1984, + 3145, 86347, 78064, 55453, 2656, 2657, 35634, 35635, 2167, + 43479 }; + + data10k = new int[] { 2, 4, 149900, 11, 70236, 149901, 149902, 6721, + 149929, 29212, 34600, 149930, 149931, 149932, 141696, 149908, + 149909, 149910 }; + + data501871 = new int[] { 1368366, 1368367, 1817408, 11, 2513, 1817409, + 1817410, 1817411, 1382349, 126700, 1817412, 5539, 21862, 21863, + 21864, 1233, 1127, 121, 15254, 15255, 357, 449, 15256, 8817, + 15257, 15258, 1406, 1096, 281, 4826, 4827, 223, 166, 2372, 168, + 169, 2219, 170, 171, 1176, 172, 173, 2222, 3035, 177, 178, 179, + 180, 181, 183, 3036, 2378, 1157, 1158, 2380, 1160, 1161, 1162, + 2384, 1164, 1165, 1166, 1167, 1168, 2385, 3037, 1171, 1172, + 1173, 2238, 1175, 1177, 1178, 1179, 1180, 1181, 2243, 3038, + 1182, 2244, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, + 59766, 471, 7349, 3599, 2847, 59767, 59768, 59769, 59770, + 59771, 59772, 59773, 59774, 59775, 2625, 852, 853, 2632, 854, + 855, 856, 2284, 857, 862, 1031, 859, 860, 861, 866, 1033, 867, + 1035, 868, 870, 2294, 871, 2295, 873, 874, 875, 876, 877, 878, + 879, 66632, 66633, 66634, 66635, 14823, 66636, 66637, 3763, + 77345, 1370, 3764, 3765, 3766, 5666, 3768, 3770, 16892, 3771, + 3772, 3773, 3244, 3246, 3247, 1504, 266, 29250, 24764, 29251, + 689, 12844, 8068, 29252, 38918, 750, 751, 770, 3704, 753, 754, + 765, 755, 3708, 757, 758, 759, 760, 3710, 761, 762, 763, 3712, + 766, 767, 768, 769, 771, 3719, 4380, 3722, 3723, 3725, 4381, + 3727, 3728, 3731, 3732, 764, 4382, 2316, 334, 1637, 4383, 4384, + 4385, 4386, 4387, 184, 185, 1134, 186, 1135, 187, 188, 1138, + 197, 191, 3517, 193, 194, 195, 196, 208, 3519, 198, 9210, 937, + 9211, 9212, 916, 917, 117, 118, 919, 122, 921, 123, 124, 125, + 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 924, 137, + 138, 139, 140, 141, 588, 928, 142, 143, 144, 929, 146, 147, + 148, 149, 150, 151, 3775, 3776, 3777, 3778, 3780, 3781, 3783, + 3784, 3785, 3796, 4169, 3788, 4170, 3790, 3791, 3793, 3803, + 3794, 3797, 4171, 3799, 3800, 3801, 3802, 3804, 4172, 3806, + 4173, 4174, 3811, 4175, 3813, 3829, 3815, 3816, 3817, 4176, + 4177, 3820, 3821, 3822, 2168, 3039, 2460, 2170, 2459, 2174, + 2175, 2176, 2461, 2462, 2463, 3040, 2466, 2467, 
2469, 2468, + 2470, 3041, 2472, 3042, 3043, 3044, 3045, 231, 881, 882, 1219, + 884, 2038, 886, 887, 888, 891, 892, 1221, 894, 895, 1222, 2039, + 899, 1225, 900, 901, 902, 2492, 2494, 2495, 2496, 4052, 2498, + 2502, 2500, 2501, 2503, 2504, 4653, 5514, 18671, 10350, 1122, + 44317, 44318, 44319, 44320, 44321, 44322, 44323, 44324, 7923, + 1422, 10284, 10285, 6146, 9803, 10286, 466, 5998, 696, 3257, + 6043, 6195, 6196, 6197, 6198, 6199, 6200, 6201, 7029, 4405, + 4864, 450, 349, 11214, 3548, 1092, 5728, 7395, 6533, 1123, + 5736, 1115, 6535, 6536, 2739, 2832, 2833, 2834, 2835, 2836, + 23972, 2837, 23973, 2839, 2840, 2691, 1339, 20116, 3219, 8210, + 3170, 3171, 3172, 3173, 2094, 2095, 2096, 2097, 2099, 2100, + 2102, 3174, 2104, 1372, 2105, 2107, 2108, 2109, 2110, 2113, + 2114, 2115, 2117, 2118, 3221, 3222, 2122, 2123, 2124, 4611, + 2125, 2126, 2127, 2128, 2129, 2130, 2131, 575, 576, 2132, 4612, + 2134, 2135, 2136, 4368, 5931, 5932, 5933, 5934, 5935, 5936, + 5937, 5938, 5939, 2902, 4057, 4058, 4059, 4060, 4062, 4063, + 4064, 4654, 4655, 4067, 4068, 4069, 4656, 4657, 4073, 4658, + 4074, 4075, 4659, 4660, 4661, 4076, 4662, 4663, 4664, 4078, + 4079, 4080, 4665, 4082, 4083, 4084, 4666, 4086, 4087, 4088, + 544, 545, 546, 547, 548, 549, 550, 559, 1227, 552, 553, 5035, + 555, 554, 1228, 556, 1229, 557, 558, 560, 561, 562, 563, 564, + 565, 1230, 566, 567, 568, 569, 570, 572, 573, 222, 7461, 2059, + 2060, 2061, 5664, 2062, 7463, 16997, 2065, 2066, 2067, 2068, + 2069, 2070, 2072, 2073, 2074, 2075, 2076, 2077, 2078, 7464, + 2079, 2080, 2081, 7465, 2082, 2083, 2084, 2085, 2086, 2087, + 199, 206, 200, 203, 205, 211, 1140, 3699, 209, 214, 215, 216, + 217, 218, 777, 778, 779, 780, 2298, 781, 782, 783, 784, 785, + 787, 788, 384, 789, 790, 791, 2677, 793, 794, 795, 796, 797, + 2307, 798, 799, 801, 802, 3645, 803, 4337, 805, 3648, 3649, + 807, 808, 3651, 810, 812, 813, 814, 815, 816, 3654, 818, 819, + 13780, 930, 932, 4221, 935, 936, 938, 2197, 939, 940, 941, + 2200, 943, 1591, 1952, 2630, 1592, 2631, 1602, 1607, 1595, + 1596, 1597, 1598, 1599, 1955, 1601, 1603, 1956, 1605, 1606, + 1608, 1610, 1638, 20608, 968, 969, 970, 971, 972, 973, 974, + 975, 2729, 2730, 977, 2731, 979, 980, 981, 982, 983, 984, 3506, + 987, 989, 990, 991, 2732, 2733, 6051, 6053, 6055, 910, 6056, + 4339, 4340, 577, 4341, 579, 580, 581, 616, 584, 585, 586, 4342, + 4343, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 5046, + 599, 600, 5047, 601, 602, 603, 604, 605, 5053, 608, 609, 610, + 5055, 612, 613, 5056, 615, 617, 618, 619, 620, 621, 622, 623, + 624, 6882, 627, 628, 629, 630, 631, 5330, 633, 634, 635, 636, + 637, 639, 640, 7870, 632, 34480, 13118, 903, 904, 905, 907, + 2616, 2617, 2618, 2619, 2620, 2621, 2622, 2623, 2624, 2643, + 1685, 1686, 1687, 1688, 1690, 1691, 2644, 2645, 1695, 2646, + 1699, 2647, 2648, 1702, 2649, 2650, 1706, 22082, 5516, 4307, + 2203, 1995, 1996, 1998, 1999, 2206, 2002, 2003, 4407, 2005, + 4408, 2007, 2008, 2009, 2010, 2011, 4409, 2013, 2014, 2015, + 2017, 3227, 3149, 6025, 22913, 22914, 3228, 7925, 10123, 10124, + 10125, 10127, 16978, 14094, 1593, 4869, 4870, 3477, 3844, 3845, + 9923, 3846, 3847, 39767, 39768, 39769, 3541, 39770, 39771, + 14179, 39772, 39773, 39774, 42558, 1043, 4203, 42559, 42560, + 42561, 42562, 42563, 42564, 11018, 42565, 42566, 4589, 4590, + 4591, 4312, 18283, 4317, 4318, 4319, 12659, 11706, 11707, + 53395, 53396, 29410, 8040, 8041, 915, 20105, 22952, 22953, + 20596, 4161, 3047, 3048, 3049, 3050, 3051, 3052, 3053, 3054, + 3055, 1474, 3056, 3057, 3058, 3059, 3060, 3061, 2549, 2551, + 3062, 3063, 
3064, 3065, 3066, 3067, 3068, 3069, 515, 3070, + 3071, 3072, 3073, 3074, 3075, 3076, 3077, 3078, 3079, 3080, + 3081, 3082, 506, 3083, 3084, 3085, 3086, 3087, 3088, 3089, + 3090, 3091, 527, 528, 2995, 530, 531, 533, 534, 535, 537, 538 }; + } + +} diff --git a/modules/facet/src/test/org/apache/lucene/util/encoding/EncodingTest.java b/modules/facet/src/test/org/apache/lucene/util/encoding/EncodingTest.java new file mode 100644 index 00000000000..9d02a803749 --- /dev/null +++ b/modules/facet/src/test/org/apache/lucene/util/encoding/EncodingTest.java @@ -0,0 +1,424 @@ +package org.apache.lucene.util.encoding; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.HashSet; +import java.util.TreeSet; + +import org.junit.Test; + +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.encoding.DGapIntEncoder; +import org.apache.lucene.util.encoding.EightFlagsIntEncoder; +import org.apache.lucene.util.encoding.FourFlagsIntEncoder; +import org.apache.lucene.util.encoding.IntDecoder; +import org.apache.lucene.util.encoding.IntEncoder; +import org.apache.lucene.util.encoding.NOnesIntEncoder; +import org.apache.lucene.util.encoding.SimpleIntEncoder; +import org.apache.lucene.util.encoding.SortingIntEncoder; +import org.apache.lucene.util.encoding.UniqueValuesIntEncoder; +import org.apache.lucene.util.encoding.VInt8IntEncoder; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class EncodingTest extends LuceneTestCase { + + static int[] data = null; + + private static TreeSet dataSet = new TreeSet(); + static { + setData(); + } + + @Test + public void testVInt8() throws Exception { + encoderTest(new VInt8IntEncoder()); + + // cover negative numbers; + IntEncoder enc = new VInt8IntEncoder(); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + enc.reInit(baos); + enc.encode(-1); + + IntDecoder dec = enc.createMatchingDecoder(); + dec.reInit(new ByteArrayInputStream(baos.toByteArray())); + assertEquals(-1, dec.decode()); + } + + @Test + public void testSimpleInt() { + encoderTest(new SimpleIntEncoder()); + } + + @Test + public void testSortingUniqueValues() { + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new VInt8IntEncoder()))); + } + + @Test + public void testSortingUniqueDGap() { + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder())))); + } + + @Test + public void testSortingUniqueDGapEightFlags() { + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new EightFlagsIntEncoder())))); + } + + @Test + public void testSortingUniqueDGapFourFlags() { + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new FourFlagsIntEncoder())))); + } + + @Test + public void testSortingUniqueDGapNOnes4() { + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(4))))); + } + + @Test + public void testSortingUniqueDGapNOnes3() { + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(3))))); + } + + private static void encoderTest(IntEncoder encoder) { + + // ensure toString is implemented + String toString = encoder.toString(); + assertFalse(toString.startsWith(encoder.getClass().getName() + "@")); + IntDecoder decoder = encoder.createMatchingDecoder(); + toString = decoder.toString(); + assertFalse(toString.startsWith(decoder.getClass().getName() + "@")); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + + try { + encoding(encoder, baos); + decoding(baos, encoder.createMatchingDecoder()); + } catch (Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + + baos.reset(); + + try { + encoding(encoder, baos); + decoding(baos, encoder.createMatchingDecoder()); + } catch (Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + } + + private static void encoding(IntEncoder encoder, ByteArrayOutputStream baos) throws IOException { + encoder.reInit(baos); + for (int value : data) { + encoder.encode(value); + } + encoder.close(); + + baos.reset(); + encoder.reInit(baos); + for (int value : data) { + encoder.encode(value); + } + encoder.close(); + } + + private static void decoding(ByteArrayOutputStream baos, IntDecoder decoder) + throws IOException, InstantiationException, IllegalAccessException { + ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); + decoder.reInit(bais); + + HashSet set = new HashSet(); + long value = 0; + while ((value = decoder.decode()) != IntDecoder.EOS) { + set.add(value); + } + assertEquals(dataSet.size(), set.size()); + assertTrue(set.equals(dataSet)); + + set.clear(); + bais.reset(); + decoder.reInit(bais); + value = 0; + while ((value = decoder.decode()) != IntDecoder.EOS) { + set.add(value); + } + assertEquals(dataSet.size(), set.size()); + assertTrue(set.equals(dataSet)); + + } + + private static void setData() { + data = new int[] { 2, 4, 86133, 11, 16505, 
86134, 86135, 86136, 1290, + 86137, 86138, 32473, 19346, 32474, 4922, 32475, 86139, 16914, + 86140, 86141, 86142, 86143, 32478, 86144, 86145, 32480, 4884, + 4887, 32481, 86146, 16572, 86147, 16295, 165, 86148, 3183, + 21920, 21921, 21922, 555, 4006, 32484, 21925, 21926, 13775, + 86149, 13777, 85833, 85834, 13779, 13773, 13780, 75266, 17674, + 13784, 13785, 13786, 13787, 13788, 6258, 86150, 13790, 75267, + 13793, 13794, 13795, 312, 4914, 4915, 6222, 86151, 4845, 4883, + 4918, 4894, 4919, 86152, 4921, 6223, 6224, 6225, 6226, 67909, + 6229, 18170, 6230, 5198, 25625, 6231, 6232, 6233, 1808, 6234, + 6235, 6236, 41376, 6238, 6239, 67911, 6240, 86153, 6243, 6244, + 83549, 6246, 6247, 6248, 6249, 782, 444, 6251, 6250, 19863, + 28963, 310, 2234, 144, 2236, 2309, 69437, 2311, 2325, 2241, + 69438, 69439, 2244, 2245, 2246, 23504, 2314, 69440, 36603, + 2250, 2268, 2271, 2251, 2254, 2255, 2257, 2240, 36604, 84726, + 36605, 84727, 2262, 2263, 18431, 38853, 2317, 2149, 2326, 2327, + 2329, 3980, 2275, 2277, 2258, 84728, 2260, 84729, 84730, 13766, + 36607, 2282, 2283, 84731, 2284, 2286, 2287, 2337, 7424, 2288, + 2338, 3522, 2290, 84733, 32902, 371, 37708, 2096, 3065, 3066, + 375, 377, 374, 378, 2100, 86154, 381, 382, 58795, 379, 383, + 384, 385, 4449, 387, 388, 389, 390, 9052, 391, 18358, 2107, + 394, 2111, 2108, 393, 2109, 395, 86155, 86156, 397, 2113, 398, + 399, 400, 273, 274, 275, 40980, 276, 277, 31716, 279, 280, + 31717, 281, 282, 1628, 1623, 1624, 1625, 2052, 1626, 725, 727, + 728, 729, 730, 731, 1633, 733, 734, 735, 86157, 737, 738, 739, + 1634, 3563, 3564, 3565, 1667, 12461, 76276, 3567, 5413, 77622, + 5415, 5416, 5417, 5418, 107, 86158, 7784, 15363, 153, 3723, + 2713, 7786, 3835, 7787, 86159, 7789, 7791, 7792, 7794, 86160, + 7796, 86161, 6708, 7798, 7799, 7800, 7801, 7802, 7803, 1665, + 43150, 15365, 1581, 5656, 43152, 80258, 7450, 39922, 86162, + 51587, 9059, 4606, 396, 86163, 86164, 7250, 401, 403, 2860, + 33281, 2964, 408, 9119, 409, 86165, 7669, 2861, 410, 413, + 86166, 414, 415, 33282, 405, 33283, 7498, 2865, 7230, 33284, + 2866, 86167, 2867, 47518, 2868, 86168, 2869, 2870, 4712, 7096, + 28484, 6913, 6914, 6915, 6916, 37169, 37170, 7103, 28269, 6919, + 86169, 45431, 6922, 7104, 6923, 7108, 6924, 6925, 6926, 6927, + 6928, 86170, 86171, 86172, 6930, 6931, 6932, 6934, 6935, 6936, + 451, 6937, 6938, 4756, 3554, 5309, 8145, 3586, 16417, 9767, + 14126, 25854, 6580, 10174, 86173, 5519, 21309, 8561, 20938, + 10386, 86174, 781, 2030, 16419, 30323, 16420, 16421, 16424, + 86175, 86176, 86177, 28871, 86178, 28872, 63980, 6329, 49561, + 4271, 38778, 86179, 86180, 20126, 16245, 193, 195, 196, 197, + 56973, 199, 200, 201, 202, 203, 204, 56974, 56975, 205, 206, + 4662, 207, 208, 209, 210, 211, 212, 47901, 641, 642, 643, 1380, + 1079, 47902, 1381, 1081, 1082, 1083, 47903, 1382, 47904, 1087, + 47905, 965, 966, 1298, 968, 1387, 1300, 50288, 971, 972, 973, + 974, 23974, 22183, 1390, 23313, 1389, 1391, 902, 23029, 296, + 1304, 1395, 1303, 1309, 1308, 50289, 1312, 50290, 50291, 1315, + 1317, 9270, 19796, 3605, 1320, 1321, 44946, 1322, 1323, 50292, + 967, 1587, 1326, 1331, 17482, 633, 29115, 53858, 29118, 29119, + 62624, 44494, 6965, 6966, 6959, 6967, 71562, 6969, 23459, + 23460, 17464, 4225, 23461, 23462, 23463, 5893, 23464, 17467, + 17468, 23465, 12562, 1405, 1406, 1407, 960, 961, 962, 687, 963, + 86181, 86182, 5997, 10812, 11976, 11977, 1850, 577, 13393, + 10810, 13394, 65040, 86183, 3935, 3936, 3937, 710, 86184, 5785, + 5786, 29949, 5787, 5788, 283, 284, 2687, 285, 286, 287, 2689, + 288, 289, 8880, 290, 2690, 
13899, 991, 292, 295, 42007, 35616, + 63103, 298, 299, 3520, 297, 9024, 303, 301, 302, 300, 31345, + 3719, 304, 305, 306, 307, 308, 368, 364, 85002, 9026, 63105, + 367, 39596, 25835, 19746, 293, 294, 26505, 85003, 18377, 56785, + 10122, 10123, 10124, 86185, 39863, 86186, 10125, 39865, 4066, + 4067, 24257, 4068, 4070, 86187, 4073, 4074, 86188, 4076, 7538, + 4077, 86189, 4078, 4079, 7540, 7541, 4084, 4085, 7542, 86190, + 4086, 86191, 4087, 4088, 86192, 7545, 44874, 7821, 44875, + 86193, 4286, 86194, 51470, 17609, 1408, 47486, 1411, 1412, + 47487, 1413, 1414, 1417, 1415, 47488, 1416, 1418, 1420, 470, + 1422, 1423, 1424, 5001, 5002, 47489, 1427, 1429, 1430, 31811, + 1432, 1433, 47490, 1435, 3753, 1437, 1439, 1440, 47491, 1443, + 47492, 1446, 5004, 5005, 1450, 47493, 353, 1452, 42145, 3103, + 3402, 3104, 3105, 4780, 3106, 3107, 3108, 12157, 3111, 42146, + 42147, 3114, 4782, 42148, 3116, 3117, 42149, 42150, 3407, 3121, + 3122, 18154, 3126, 3127, 3128, 3410, 3130, 3411, 3412, 3415, + 24241, 3417, 3418, 3449, 42151, 3421, 3422, 7587, 42152, 3424, + 3427, 3428, 3448, 3430, 3432, 42153, 42154, 41648, 1991, 407, + 57234, 411, 2862, 57235, 2863, 18368, 57236, 2874, 7350, 4115, + 2876, 2877, 17975, 86195, 4116, 2881, 2882, 2883, 2886, 463, + 870, 872, 873, 874, 875, 8783, 8784, 877, 1480, 1481, 459, + 2778, 881, 8785, 2779, 8786, 8787, 8788, 886, 887, 8789, 889, + 8790, 86196, 6920, 86197, 5080, 5081, 7395, 7396, 9395, 9396, + 1528, 42737, 805, 86198, 1209, 13595, 4126, 9680, 34368, 9682, + 86199, 86200, 174, 175, 176, 177, 178, 179, 180, 182, 183, + 1477, 31138, 186, 172, 187, 188, 189, 190, 191, 458, 871, + 31294, 31295, 27604, 31296, 31297, 882, 883, 884, 31298, 890, + 1089, 1488, 1489, 1092, 1093, 1094, 1095, 1096, 1097, 1490, + 1098, 1495, 1502, 1099, 1100, 1101, 1493, 2997, 12223, 1103, + 2654, 1498, 1499, 1500, 80615, 80616, 80617, 33359, 86201, + 9294, 1501, 86202, 1506, 1507, 23454, 38802, 38803, 1014, + 86203, 5583, 5584, 651, 74717, 5586, 5587, 5588, 5589, 74720, + 5590, 38808, 33527, 78330, 10930, 5119, 10931, 1000, 10928, + 10932, 10933, 10934, 10935, 5863, 10936, 86204, 10938, 10939, + 86205, 192, 194, 38754, 38755, 198, 38756, 38757, 38758, 2842, + 640, 22780, 22781, 1080, 86206, 86207, 1084, 1086, 1088, 63916, + 9412, 970, 9413, 9414, 9415, 9416, 9417, 1310, 7168, 7169, + 1318, 9418, 1324, 39159, 1804, 1557, 24850, 41499, 1560, 41500, + 1562, 1563, 1565, 1927, 1928, 1566, 1569, 1570, 1571, 1572, + 1573, 1574, 1575, 1576, 2674, 2677, 2678, 2679, 2946, 2682, + 2676, 2683, 2947, 1156, 1157, 1158, 1467, 1160, 1468, 1469, + 1161, 1162, 1163, 4369, 1165, 1166, 1167, 12923, 2917, 1169, + 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177, 18153, 8359, + 1178, 1164, 1191, 1180, 12924, 86208, 86209, 54817, 66962, + 2476, 86210, 86211, 41820, 41821, 41822, 41824, 1130, 1131, + 1132, 32692, 1134, 34848, 1136, 1133, 1137, 1138, 1139, 1140, + 1141, 1143, 1144, 1145, 34849, 2639, 34850, 1146, 1147, 1148, + 34851, 1150, 1151, 1152, 1153, 1154, 1155, 1678, 1679, 1680, + 1681, 40870, 2059, 1685, 1686, 32686, 14970, 1688, 1689, 86212, + 1692, 1682, 1693, 1695, 1696, 1698, 12955, 8909, 41690, 1700, + 41691, 86213, 30949, 41692, 1703, 1704, 1705, 41693, 14976, + 1708, 2071, 1709, 1710, 1711, 1712, 1727, 86214, 86215, 86216, + 1715, 86217, 1714, 1717, 1690, 41697, 86218, 1720, 86219, 2073, + 41699, 1724, 2075, 1726, 1729, 1730, 1732, 2078, 2223, 1735, + 1713, 41700, 1737, 14977, 1739, 1740, 1741, 2080, 1743, 1744, + 1745, 1746, 1747, 1748, 1749, 1750, 1751, 41701, 1752, 1753, + 1909, 86220, 2085, 1754, 19548, 
86221, 19551, 5733, 3856, 5190, + 4581, 25145, 86222, 86223, 4846, 86224, 4861, 86225, 86226, + 86227, 25150, 86228, 86229, 13820, 2027, 4898, 4899, 4901, + 2135, 4902, 4868, 4904, 86230, 4905, 25155, 4907, 86231, 4909, + 4910, 4911, 4912, 86232, 6220, 81357, 86233, 2589, 73877, + 29706, 6227, 6228, 86234, 6237, 86235, 6241, 6242, 1812, 13808, + 13809, 70908, 2293, 2294, 86236, 2295, 2296, 2297, 22947, + 16511, 2299, 2300, 2301, 13097, 73079, 86237, 13099, 50121, + 86238, 86239, 13101, 86240, 2424, 4725, 4726, 4727, 4728, 4729, + 4730, 86241, 26881, 10944, 4734, 4735, 4736, 26239, 26240, + 71408, 86242, 57401, 71410, 26244, 5344, 26245, 86243, 4102, + 71414, 11091, 6736, 86244, 6737, 6738, 38152, 6740, 6741, 6742, + 6298, 6743, 6745, 6746, 20867, 6749, 20616, 86245, 9801, 65297, + 20617, 65298, 20619, 5629, 65299, 20621, 20622, 8385, 20623, + 20624, 5191, 20625, 20626, 442, 443, 445, 27837, 77681, 86246, + 27839, 86247, 86248, 41435, 66511, 2478, 2479, 2480, 2481, + 2482, 2483, 2484, 2485, 2486, 2487, 2488, 2489, 2490, 2494, + 2493, 33025, 12084, 2542, 2497, 2499, 2501, 2503, 2504, 2505, + 33026, 2506, 2507, 2508, 2509, 2511, 1787, 12080, 2513, 2514, + 3988, 3176, 3989, 2518, 2521, 9285, 2522, 2524, 2525, 3990, + 2527, 2528, 27499, 2529, 2530, 3991, 2532, 2534, 2535, 18038, + 2536, 2538, 2495, 46077, 61493, 61494, 1006, 713, 4971, 4972, + 4973, 4975, 4976, 650, 170, 7549, 7550, 7551, 7552, 7553, + 86249, 7936, 956, 11169, 11170, 1249, 1244, 1245, 1247, 2544, + 1250, 2545, 1252, 2547, 1253, 1254, 2549, 39636, 1259, 1257, + 1258, 39637, 1260, 1261, 2551, 1262, 1263, 848, 86250, 86251, + 854, 74596, 856, 1957, 86252, 855, 1959, 1961, 857, 86253, 851, + 859, 860, 862, 1964, 864, 865, 866, 867, 1965, 1966, 1967, + 1968, 1969, 86254, 1971, 1972, 1973, 1974, 1975, 1976, 1977, + 841, 1954, 842, 2978, 846, 847, 849, 850, 852, 1956, 17452, + 71941, 86255, 86256, 73665, 1471, 13690, 185, 503, 504, 2342, + 505, 506, 4378, 508, 4379, 17313, 510, 511, 512, 520, 513, + 4384, 17314, 514, 515, 46158, 17317, 518, 34269, 519, 4386, + 523, 524, 525, 46159, 528, 529, 17319, 531, 532, 533, 534, 535, + 7482, 537, 538, 5267, 536, 539, 541, 540, 19858, 17320, 17321, + 906, 907, 908, 17322, 910, 17323, 912, 15850, 913, 4398, 17324, + 86257, 278, 2948, 2949, 2950, 3007, 2951, 2952, 2953, 2954, + 2955, 3013, 35352, 3014, 3015, 2962, 3016, 33505, 39118, 3017, + 3018, 20492, 4000, 3021, 3022, 35353, 39293, 3024, 18443, 3029, + 9467, 20529, 39119, 8380, 2965, 3030, 3043, 22714, 39120, 2956, + 3035, 39121, 3037, 3038, 2688, 86258, 36675, 30894, 24505, + 8888, 13541, 49728, 27660, 9082, 27661, 365, 366, 2232, 76098, + 7233, 1494, 17391, 606, 607, 611, 610, 612, 614, 615, 613, 616, + 9117, 617, 618, 21155, 1789, 619, 620, 7636, 12019, 621, 622, + 1793, 623, 625, 624, 631, 626, 627, 21578, 21103, 628, 21579, + 629, 9122, 9123, 12189, 9289, 3168, 3169, 630, 632, 634, 21580, + 9121, 635, 636, 637, 21581, 12781, 1801, 638, 639, 1559, 24343, + 9419, 9420, 795, 796, 1611, 86259, 1612, 21551, 21552, 3741, + 1617, 3742, 1615, 1619, 1620, 6301, 3744, 1622, 67685, 8521, + 55937, 9025, 27663, 8881, 13581, 86260, 11592, 44720, 86261, + 63231, 50873, 42925, 52332, 86262, 72706, 17705, 17707, 17708, + 3401, 40217, 1248, 40218, 86263, 7098, 86264, 86265, 1264, + 86266, 1266, 1267, 1268, 1269, 86267, 1271, 1272, 1273, 1274, + 2556, 1275, 1276, 1277, 1278, 1279, 1280, 1282, 1283, 22680, + 11889, 86268, 45662, 7038, 86269, 19315, 45663, 45664, 86270, + 5855, 34002, 49245, 10447, 5663, 86271, 15429, 53877, 49249, + 86272, 86273, 86274, 
60128, 60453, 60129, 5552, 31923, 43407, + 4287, 17980, 64977, 86275, 86276, 8234, 86277, 3649, 8240, + 1330, 11999, 1332, 27618, 1334, 1335, 340, 3651, 25640, 18165, + 1343, 4618, 1474, 3653, 75921, 1349, 53519, 1779, 45454, 22778, + 40153, 67677, 63826, 45455, 15128, 67678, 67679, 1792, 67680, + 3171, 47816, 45457, 9288, 59891, 67681, 25703, 35731, 35732, + 369, 35713, 35714, 35715, 34652, 35716, 31681, 35717, 12779, + 35718, 35719, 11992, 806, 807, 808, 43499, 43500, 810, 776, + 812, 813, 814, 241, 43501, 43502, 816, 755, 43503, 818, 819, + 820, 43504, 821, 822, 823, 824, 825, 826, 43505, 43506, 43507, + 828, 829, 20083, 43508, 43509, 832, 833, 834, 835, 86278, + 19984, 19985, 86279, 24125, 19986, 86280, 19988, 86281, 5414, + 86282, 85808, 5479, 5420, 5421, 5422, 5423, 63800, 86283, + 86284, 30965, 86285, 416, 1510, 5740, 5741, 81991, 86286, + 28938, 50149, 1003, 55512, 14306, 6960, 688, 86287, 14307, + 5399, 5400, 17783, 24118, 720, 86288, 44913, 24557, 667, 24876, + 6529, 24877, 24878, 24879, 24880, 31847, 20671, 4011, 171, 580, + 86289, 3863, 914, 2202, 916, 917, 918, 919, 921, 922, 923, + 7585, 925, 7586, 926, 927, 928, 7588, 929, 930, 931, 932, 933, + 934, 1875, 1876, 7589, 7590, 1878, 1879, 7591, 7592, 1882, + 1883, 1884, 2212, 7593, 1887, 1888, 1889, 1890, 1891, 1892, + 1893, 1894, 1895, 1896, 1897, 1898, 2217, 1900, 7594, 1902, + 2219, 7595, 1905, 1906, 1907, 3323, 7596, 1911, 1912, 7597, + 1914, 1915, 1916, 2226, 1919, 7598, 2227, 1920, 1921, 7599, + 7600, 4708, 1923, 355, 356, 1549, 358, 32077, 360, 32078, + 21117, 362, 19043, 71677, 5716, 86290, 49790, 86291, 86292, + 86293, 49792, 86294, 86295, 49794, 86296, 86297, 86298, 86299, + 11882, 86300, 49798, 86301, 49800, 49801, 49802, 49803, 453, + 49804, 8591, 6794, 49806, 18989, 49807, 49808, 16308, 49809, + 86302, 86303, 10105, 86304, 5285, 10106, 10107, 6557, 86305, + 23571, 10109, 38883, 10110, 5401, 86306, 67557, 16430, 67558, + 40171, 16433, 25878, 86307, 21762, 23, 86308, 86309, 21766, + 86310, 86311, 5149, 3926, 21768, 21769, 47826, 942, 46985, + 6588, 58867, 6589, 6590, 86312, 6592, 6006, 53855, 9565, 359, + 86313, 2845, 876, 879, 27556, 27557, 885, 27558, 888, 2847, + 27559, 2115, 2116, 2117, 53962, 57839, 315, 316, 317, 318, 319, + 86314, 321, 322, 2122, 323, 2123, 324, 325, 328, 326, 327, + 40542, 329, 330, 18079, 18080, 331, 1790, 7382, 332, 7380, + 7236, 23413, 23414, 18924, 18925, 333, 335, 336, 39750, 337, + 86315, 339, 341, 342, 343, 16264, 16265, 6615, 86316, 86317, + 86318, 86319, 16269, 10538, 33226, 86320, 16272, 5824, 16273, + 16274, 16276, 16277, 16278, 16279, 16280, 14517, 1547, 6463, + 3394, 49677, 659, 10380, 30013, 10382, 10378, 10379, 10383, + 10384, 10385, 86321, 4139, 13370, 13371, 86322, 86323, 11878, + 64509, 15141, 15142, 15143, 32737, 14183, 15144, 39101, 42768, + 5645, 32738, 801, 803, 804, 86324, 14707, 86325, 6601, 12402, + 712, 12403, 2936, 1447, 15477, 1410, 44872, 1550, 8614, 15478, + 15479, 15480, 15481, 4811, 3752, 1442, 15482, 8818, 1445, 5006, + 16304, 32277, 16305, 16306, 86326, 16307, 53691, 69305, 809, + 86327, 815, 26724, 69307, 43484, 63904, 86328, 13498, 827, + 86329, 831, 2857, 836, 86330, 86331, 837, 838, 839, 840, 228, + 229, 43722, 230, 231, 43723, 234, 235, 236, 237, 238, 239, + 2745, 2746, 240, 242, 243, 244, 43724, 19788, 246, 247, 21134, + 248, 250, 251, 252, 253, 254, 255, 256, 257, 258, 43725, 43726, + 41, 43727, 262, 43728, 2751, 264, 265, 266, 267, 268, 269, 270, + 271, 272, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, + 1033, 1034, 43729, 1035, 43730, 1037, 
21821, 2926, 14388, + 10432, 14389, 14390, 14391, 14392, 86332, 14394, 14395, 2035, + 2169, 86333, 14397, 14398, 14399, 14400, 52, 14401, 14402, + 7077, 21822, 14405, 14406, 14396, 86334, 17356, 17357, 84679, + 84680, 76383, 17360, 17361, 86335, 38801, 2060, 30850, 12963, + 1684, 1687, 2061, 14978, 1694, 43387, 1697, 1699, 2067, 1701, + 1702, 1706, 43388, 43389, 76325, 1716, 1718, 26832, 1719, 1723, + 2081, 2063, 1728, 39059, 76326, 1731, 86336, 1736, 76327, 1738, + 19657, 6579, 6581, 6582, 6583, 6584, 6585, 29979, 1818, 28239, + 68, 69, 3391, 86337, 10266, 63528, 86338, 10269, 10270, 10271, + 10272, 86339, 86340, 63530, 63531, 63532, 63533, 10273, 63534, + 86341, 10681, 10682, 86342, 9673, 86343, 10683, 460, 461, 462, + 467, 4464, 4466, 3729, 471, 472, 468, 81634, 474, 81635, 475, + 476, 477, 479, 480, 81636, 81637, 482, 17442, 81638, 81639, + 484, 485, 486, 4473, 488, 489, 490, 493, 466, 494, 495, 496, + 497, 499, 500, 501, 502, 34376, 86344, 63836, 56281, 1707, + 20416, 61452, 56282, 1755, 56283, 56284, 18508, 53650, 63444, + 86345, 3579, 63445, 3677, 1979, 1980, 1981, 3132, 3147, 34090, + 1987, 12770, 1329, 80818, 80819, 1988, 23522, 1986, 15880, + 1985, 32975, 1992, 1993, 7165, 3141, 3143, 86346, 1982, 1984, + 3145, 86347, 78064, 55453, 2656, 2657, 35634, 35635, 2167, + 43479, + // ensure there is a representative number for any # of int bytes + 1, 1 << 8 + 1, 1 << 16 + 1, 1 << 24 + 1 }; +// data = new int[]{1, 2, 3, 4}; + for (int value : data) { + dataSet.add(new Long(value)); + } + } + +}
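
For readers of this patch, the encode/decode round trip that EncodingTest exercises boils down to the sketch below. It is illustrative only: the class name EncodingRoundTripSketch and the sample values are not part of the patch, and it assumes nothing beyond the IntEncoder/IntDecoder calls used in the tests above (reInit, encode, close, createMatchingDecoder, decode, IntDecoder.EOS). The chained encoder mirrors testSortingUniqueDGap: values are sorted, de-duplicated, turned into gaps, and each gap is written as a VInt8.

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.IOException;

    import org.apache.lucene.util.encoding.DGapIntEncoder;
    import org.apache.lucene.util.encoding.IntDecoder;
    import org.apache.lucene.util.encoding.IntEncoder;
    import org.apache.lucene.util.encoding.SortingIntEncoder;
    import org.apache.lucene.util.encoding.UniqueValuesIntEncoder;
    import org.apache.lucene.util.encoding.VInt8IntEncoder;

    public class EncodingRoundTripSketch {
      public static void main(String[] args) throws IOException {
        // Same chain as testSortingUniqueDGap: sort, drop duplicates,
        // turn the sorted values into gaps, then VInt8-encode each gap.
        IntEncoder encoder = new SortingIntEncoder(
            new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder())));

        ByteArrayOutputStream out = new ByteArrayOutputStream();
        encoder.reInit(out);              // bind the encoder to the output stream
        for (int value : new int[] { 15, 3, 7, 3, 255 }) {
          encoder.encode(value);          // input may be unsorted and contain duplicates
        }
        encoder.close();                  // flush buffered values to the stream

        IntDecoder decoder = encoder.createMatchingDecoder();
        decoder.reInit(new ByteArrayInputStream(out.toByteArray()));
        long value;
        while ((value = decoder.decode()) != IntDecoder.EOS) {
          System.out.println(value);      // expected: 3, 7, 15, 255 (sorted, unique)
        }
      }
    }

EncodingSpeed above benchmarks these same encoder chains against recorded facet-ID payloads, reporting bits per int and encode/decode time per int for each combination.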