From de17206f02ada12c56b8e8910e9804a80bb07f69 Mon Sep 17 00:00:00 2001 From: Shai Erera Date: Fri, 11 Jan 2013 13:09:06 +0000 Subject: [PATCH] LUCENE-4620: IntEncoder/Decoder bulk API git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1432028 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 4 + lucene/facet/build.xml | 9 +- ....java => CategoryAssociationsIndexer.java} | 10 +- ...ain.java => CategoryAssociationsMain.java} | 14 +- ...java => CategoryAssociationsSearcher.java} | 15 +- ...ls.java => CategoryAssociationsUtils.java} | 2 +- .../facet/example/simple/SimpleUtils.java | 4 - .../AssociationsCategoryListBuilder.java | 92 ---- .../associations/AssociationsFacetFields.java | 21 +- .../associations/AssociationsListBuilder.java | 89 ++++ .../AssociationsPayloadIterator.java | 16 +- .../CategoryAssociationsContainer.java | 5 + .../CategoryFloatAssociation.java | 5 + .../associations/CategoryIntAssociation.java | 5 + .../FloatAssociationsPayloadIterator.java | 24 +- .../IntAssociationsPayloadIterator.java | 28 +- .../facet/index/CategoryListBuilder.java | 161 +----- .../facet/index/CountingListBuilder.java | 160 ++++++ .../lucene/facet/index/FacetFields.java | 36 +- .../index/OrdinalMappingAtomicReader.java | 27 +- .../index/params/CategoryListParams.java | 4 +- .../index/params/FacetIndexingParams.java | 15 +- .../params/PerDimensionIndexingParams.java | 4 +- .../facet/search/CategoryListIterator.java | 49 +- ...r.java => PayloadCategoryListIteraor.java} | 56 +-- .../lucene/facet/search/PayloadIterator.java | 36 +- .../search/StandardFacetsAccumulator.java | 14 +- .../facet/search/aggregator/Aggregator.java | 17 +- .../ComplementCountingAggregator.java | 13 +- .../search/aggregator/CountingAggregator.java | 29 +- .../search/aggregator/ScoringAggregator.java | 17 +- .../AssociationFloatSumAggregator.java | 22 +- .../AssociationIntSumAggregator.java | 22 +- .../facet/search/cache/CategoryListData.java | 66 +-- .../search/params/CountFacetRequest.java | 3 +- .../facet/util/MultiCategoryListIterator.java | 36 +- .../java/org/apache/lucene/util/Vint8.java | 229 --------- .../util/encoding/ChunksIntEncoder.java | 69 ++- .../lucene/util/encoding/DGapIntDecoder.java | 33 +- .../lucene/util/encoding/DGapIntEncoder.java | 25 +- .../util/encoding/EightFlagsIntDecoder.java | 68 ++- .../util/encoding/EightFlagsIntEncoder.java | 46 +- .../util/encoding/FourFlagsIntDecoder.java | 67 +-- .../util/encoding/FourFlagsIntEncoder.java | 45 +- .../lucene/util/encoding/IntDecoder.java | 59 ++- .../lucene/util/encoding/IntEncoder.java | 104 ++-- .../util/encoding/IntEncoderFilter.java | 25 +- .../lucene/util/encoding/NOnesIntDecoder.java | 79 +-- .../lucene/util/encoding/NOnesIntEncoder.java | 73 +-- .../util/encoding/SimpleIntDecoder.java | 41 +- .../util/encoding/SimpleIntEncoder.java | 32 +- .../util/encoding/SortingIntEncoder.java | 45 +- .../util/encoding/UniqueValuesIntEncoder.java | 35 +- .../apache/lucene/util/encoding/VInt8.java | 138 +++++ .../lucene/util/encoding/VInt8IntDecoder.java | 29 +- .../lucene/util/encoding/VInt8IntEncoder.java | 32 +- .../apache/lucene/util/encoding/package.html | 132 +---- .../facet/example/TestAssociationExample.java | 6 +- .../search/CategoryListIteratorTest.java | 85 ++-- .../lucene/facet/search/DrillDownTest.java | 4 +- .../facet/search/TestCategoryListCache.java | 20 +- .../params/MultiCategoryListIteratorTest.java | 126 +++++ .../params/MultiIteratorsPerCLParamsTest.java | 271 ---------- .../org/apache/lucene/util/Vint8Test.java | 141 ------ .../lucene/util/encoding/EncodingSpeed.java | 96 ++-- .../lucene/util/encoding/EncodingTest.java | 473 ++++-------------- .../lucene/util/encoding/Vint8Test.java | 54 ++ 67 files changed, 1429 insertions(+), 2383 deletions(-) rename lucene/facet/src/examples/org/apache/lucene/facet/example/association/{AssociationIndexer.java => CategoryAssociationsIndexer.java} (93%) rename lucene/facet/src/examples/org/apache/lucene/facet/example/association/{AssociationMain.java => CategoryAssociationsMain.java} (83%) rename lucene/facet/src/examples/org/apache/lucene/facet/example/association/{AssociationSearcher.java => CategoryAssociationsSearcher.java} (90%) rename lucene/facet/src/examples/org/apache/lucene/facet/example/association/{AssociationUtils.java => CategoryAssociationsUtils.java} (98%) delete mode 100644 lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsCategoryListBuilder.java create mode 100644 lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsListBuilder.java create mode 100644 lucene/facet/src/java/org/apache/lucene/facet/index/CountingListBuilder.java rename lucene/facet/src/java/org/apache/lucene/facet/search/{PayloadIntDecodingIterator.java => PayloadCategoryListIteraor.java} (52%) delete mode 100644 lucene/facet/src/java/org/apache/lucene/util/Vint8.java create mode 100644 lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8.java create mode 100644 lucene/facet/src/test/org/apache/lucene/facet/search/params/MultiCategoryListIteratorTest.java delete mode 100644 lucene/facet/src/test/org/apache/lucene/facet/search/params/MultiIteratorsPerCLParamsTest.java delete mode 100644 lucene/facet/src/test/org/apache/lucene/util/Vint8Test.java create mode 100644 lucene/facet/src/test/org/apache/lucene/util/encoding/Vint8Test.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 9de829ba36c..9ac8f0c3e65 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -111,6 +111,10 @@ Changes in backwards compatibility policy which are called after a doc/field/term has been completely added. (Adrien Grand, Robert Muir) +* LUCENE-4620: IntEncoder/Decoder were changed to do bulk encoding/decoding. As a + result, few other classes such as Aggregator and CategoryListIterator were + changed to handle bulk category ordinals. (Shai Erera) + New Features * LUCENE-4226: New experimental StoredFieldsFormat that compresses chunks of diff --git a/lucene/facet/build.xml b/lucene/facet/build.xml index 209f5c7e2f2..9cdd5e995bb 100644 --- a/lucene/facet/build.xml +++ b/lucene/facet/build.xml @@ -81,5 +81,12 @@ - + + + + + + + + diff --git a/lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationIndexer.java b/lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsIndexer.java similarity index 93% rename from lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationIndexer.java rename to lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsIndexer.java index 145d13e7826..85c34783cde 100644 --- a/lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationIndexer.java +++ b/lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsIndexer.java @@ -39,7 +39,7 @@ import org.apache.lucene.store.Directory; * * @lucene.experimental */ -public class AssociationIndexer { +public class CategoryAssociationsIndexer { /** * Create an index, and adds to it sample documents and categories. @@ -75,13 +75,11 @@ public class AssociationIndexer { ++nFacetsAdded; } // and also those with associations - CategoryPath[] associationsPaths = AssociationUtils.categories[docNum]; - CategoryAssociation[] associationsValues = AssociationUtils.associations[docNum]; + CategoryPath[] associationsPaths = CategoryAssociationsUtils.categories[docNum]; + CategoryAssociation[] associationsValues = CategoryAssociationsUtils.associations[docNum]; for (int i = 0; i < associationsPaths.length; i++) { associations.setAssociation(associationsPaths[i], associationsValues[i]); - ExampleUtils.log("\t $$$$ Association: (" - + associationsPaths[i] + "," + associationsValues[i] - + ")"); + ExampleUtils.log("\t $$$$ Association: (" + associationsPaths[i] + "," + associationsValues[i] + ")"); ++nFacetsAdded; } diff --git a/lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationMain.java b/lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsMain.java similarity index 83% rename from lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationMain.java rename to lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsMain.java index 619377b48a6..bdd99be72dd 100644 --- a/lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationMain.java +++ b/lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsMain.java @@ -31,15 +31,15 @@ import org.apache.lucene.facet.search.results.FacetResult; * * @lucene.experimental */ -public class AssociationMain { +public class CategoryAssociationsMain { /** * Driver for the simple sample. * @throws Exception on error (no detailed exception handling here for sample simplicity */ public static void main(String[] args) throws Exception { - new AssociationMain().runSumIntAssociationSample(); - new AssociationMain().runSumFloatAssociationSample(); + new CategoryAssociationsMain().runSumIntAssociationSample(); + new CategoryAssociationsMain().runSumFloatAssociationSample(); ExampleUtils.log("DONE"); } @@ -51,10 +51,10 @@ public class AssociationMain { // index the sample documents ExampleUtils.log("index the sample documents..."); - AssociationIndexer.index(indexDir, taxoDir); + CategoryAssociationsIndexer.index(indexDir, taxoDir); ExampleUtils.log("search the sample documents..."); - List facetRes = AssociationSearcher.searchSumIntAssociation(indexDir, taxoDir); + List facetRes = CategoryAssociationsSearcher.searchSumIntAssociation(indexDir, taxoDir); ExampleResult res = new ExampleResult(); res.setFacetResults(facetRes); @@ -69,10 +69,10 @@ public class AssociationMain { // index the sample documents ExampleUtils.log("index the sample documents..."); - AssociationIndexer.index(indexDir, taxoDir); + CategoryAssociationsIndexer.index(indexDir, taxoDir); ExampleUtils.log("search the sample documents..."); - List facetRes = AssociationSearcher.searchSumFloatAssociation(indexDir, taxoDir); + List facetRes = CategoryAssociationsSearcher.searchSumFloatAssociation(indexDir, taxoDir); ExampleResult res = new ExampleResult(); res.setFacetResults(facetRes); diff --git a/lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationSearcher.java b/lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsSearcher.java similarity index 90% rename from lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationSearcher.java rename to lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsSearcher.java index 1e643df2c50..57c9524266b 100644 --- a/lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationSearcher.java +++ b/lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsSearcher.java @@ -37,18 +37,15 @@ import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; * * @lucene.experimental */ -public class AssociationSearcher { +public class CategoryAssociationsSearcher { /** Search an index with a sum of int-association. */ - public static List searchSumIntAssociation(Directory indexDir, - Directory taxoDir) throws Exception { + public static List searchSumIntAssociation(Directory indexDir, Directory taxoDir) throws Exception { // prepare index reader IndexReader indexReader = DirectoryReader.open(indexDir); TaxonomyReader taxo = new DirectoryTaxonomyReader(taxoDir); - AssociationIntSumFacetRequest facetRequest = new AssociationIntSumFacetRequest( - new CategoryPath("tags"), 10); - + AssociationIntSumFacetRequest facetRequest = new AssociationIntSumFacetRequest(new CategoryPath("tags"), 10); List res = SimpleSearcher.searchWithRequest(indexReader, taxo, null, facetRequest); // close readers @@ -59,14 +56,12 @@ public class AssociationSearcher { } /** Search an index with a sum of float-association. */ - public static List searchSumFloatAssociation(Directory indexDir, - Directory taxoDir) throws Exception { + public static List searchSumFloatAssociation(Directory indexDir, Directory taxoDir) throws Exception { // prepare index reader IndexReader indexReader = DirectoryReader.open(indexDir); TaxonomyReader taxo = new DirectoryTaxonomyReader(taxoDir); - AssociationFloatSumFacetRequest facetRequest = new AssociationFloatSumFacetRequest( - new CategoryPath("genre"), 10); + AssociationFloatSumFacetRequest facetRequest = new AssociationFloatSumFacetRequest(new CategoryPath("genre"), 10); List res = SimpleSearcher.searchWithRequest(indexReader, taxo, null, facetRequest); diff --git a/lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationUtils.java b/lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsUtils.java similarity index 98% rename from lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationUtils.java rename to lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsUtils.java index 022a233a5d6..f2cc8520599 100644 --- a/lucene/facet/src/examples/org/apache/lucene/facet/example/association/AssociationUtils.java +++ b/lucene/facet/src/examples/org/apache/lucene/facet/example/association/CategoryAssociationsUtils.java @@ -25,7 +25,7 @@ import org.apache.lucene.facet.taxonomy.CategoryPath; /** * @lucene.experimental */ -public class AssociationUtils { +public class CategoryAssociationsUtils { /** * Categories: categories[D][N] == category-path with association no. N for diff --git a/lucene/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleUtils.java b/lucene/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleUtils.java index 087f7397ae5..08ae45027ea 100644 --- a/lucene/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleUtils.java +++ b/lucene/facet/src/examples/org/apache/lucene/facet/example/simple/SimpleUtils.java @@ -1,11 +1,7 @@ package org.apache.lucene.facet.example.simple; -import java.util.ArrayList; -import java.util.List; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.WhitespaceAnalyzer; - import org.apache.lucene.facet.example.ExampleUtils; import org.apache.lucene.facet.taxonomy.CategoryPath; diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsCategoryListBuilder.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsCategoryListBuilder.java deleted file mode 100644 index c19898cbbb3..00000000000 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsCategoryListBuilder.java +++ /dev/null @@ -1,92 +0,0 @@ -package org.apache.lucene.facet.associations; - -import java.io.IOException; -import java.util.HashMap; - -import org.apache.lucene.facet.index.CategoryListBuilder; -import org.apache.lucene.facet.index.params.CategoryListParams; -import org.apache.lucene.facet.index.params.FacetIndexingParams; -import org.apache.lucene.facet.taxonomy.CategoryPath; -import org.apache.lucene.facet.taxonomy.TaxonomyWriter; -import org.apache.lucene.store.ByteArrayDataOutput; -import org.apache.lucene.util.BytesRef; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * A {@link CategoryListBuilder} which encodes category-association value pairs - * in addition to regular counting list category ordinals. Every - * category-association pair is written under the respective association's - * {@link CategoryAssociation#getCategoryListID()}. - */ -public class AssociationsCategoryListBuilder extends CategoryListBuilder { - - private final CategoryAssociationsContainer associations; - private final HashMap perAssociationBytes = new HashMap(); - private final ByteArrayDataOutput output = new ByteArrayDataOutput(); - - public AssociationsCategoryListBuilder(CategoryAssociationsContainer associations, - CategoryListParams categoryListParams, FacetIndexingParams indexingParams, TaxonomyWriter taxoWriter) { - super(categoryListParams, indexingParams, taxoWriter); - this.associations = associations; - } - - @Override - public void handle(int ordinal, CategoryPath cp) throws IOException { - super.handle(ordinal, cp); - - // build per-association key BytesRef - CategoryAssociation association = associations.getAssociation(cp); - if (association == null) { - // it is ok to set a null association for a category - it's treated as a - // regular category in that case. - return; - } - - BytesRef bytes = perAssociationBytes.get(association.getCategoryListID()); - if (bytes == null) { - bytes = new BytesRef(); - perAssociationBytes.put(association.getCategoryListID(), bytes); - } - - int maxBytesNeeded = 4 /* int */ + association.maxBytesNeeded(); - if (bytes.bytes.length - bytes.length < maxBytesNeeded) { - bytes.grow(bytes.bytes.length + maxBytesNeeded); - } - - // reset the output to write from bytes.length (current position) until the end - output.reset(bytes.bytes, bytes.length, bytes.bytes.length - bytes.length); - output.writeInt(ordinal); - - // encode the association bytes - association.serialize(output); - - // update BytesRef - bytes.length = output.getPosition(); - } - - @Override - public HashMap finish() { - // build the ordinals list - HashMap result = super.finish(); - // add per association bytes - result.putAll(perAssociationBytes); - return result; - } - -} diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsFacetFields.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsFacetFields.java index 00f61e17149..1ed3985afef 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsFacetFields.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsFacetFields.java @@ -7,7 +7,7 @@ import java.util.Map; import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.TextField; -import org.apache.lucene.facet.index.CategoryListBuilder; +import org.apache.lucene.facet.index.CountingListBuilder; import org.apache.lucene.facet.index.DrillDownStream; import org.apache.lucene.facet.index.FacetFields; import org.apache.lucene.facet.index.params.CategoryListParams; @@ -15,6 +15,8 @@ import org.apache.lucene.facet.index.params.FacetIndexingParams; import org.apache.lucene.facet.taxonomy.CategoryPath; import org.apache.lucene.facet.taxonomy.TaxonomyWriter; import org.apache.lucene.index.FieldInfo.IndexOptions; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -94,15 +96,16 @@ public class AssociationsFacetFields extends FacetFields { return categoryLists; } - /** - * Returns a {@link CategoryListBuilder} for encoding the given categories and - * associations. - */ @Override - protected CategoryListBuilder getCategoryListBuilder(CategoryListParams categoryListParams, - Iterable categories) { - return new AssociationsCategoryListBuilder((CategoryAssociationsContainer) categories, categoryListParams, - indexingParams, taxonomyWriter); + protected Map getCategoryListData(CategoryListParams categoryListParams, IntsRef ordinals, + Iterable categories) throws IOException { + AssociationsListBuilder associations = new AssociationsListBuilder((CategoryAssociationsContainer) categories); + CountingListBuilder counting = new CountingListBuilder(categoryListParams, indexingParams, taxonomyWriter); + // CountingListBuilder modifies the ordinals array, by e.g. adding parent ordinals, sorting etc. + // Therefore first build the associations list and only afterwards the counting list. + final Map res = associations.build(ordinals, categories); + res.putAll(counting.build(ordinals, categories)); + return res; } @Override diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsListBuilder.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsListBuilder.java new file mode 100644 index 00000000000..42a4218f323 --- /dev/null +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsListBuilder.java @@ -0,0 +1,89 @@ +package org.apache.lucene.facet.associations; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.facet.index.CategoryListBuilder; +import org.apache.lucene.facet.index.CountingListBuilder; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A {@link AssociationsListBuilder} which encodes category-association value pairs. + * Every category-association pair is written under the respective association's + * {@link CategoryAssociation#getCategoryListID()}. + *

+ * NOTE: associations list do not encode the counting list data. You + * should use {@link CountingListBuilder} to build that information and then + * merge the results of both {@link #build(IntsRef, Iterable)}. + */ +public class AssociationsListBuilder implements CategoryListBuilder { + + private final CategoryAssociationsContainer associations; + private final ByteArrayDataOutput output = new ByteArrayDataOutput(); + + public AssociationsListBuilder(CategoryAssociationsContainer associations) { + this.associations = associations; + } + + @Override + public Map build(IntsRef ordinals, Iterable categories) throws IOException { + final HashMap res = new HashMap(); + int idx = 0; + for (CategoryPath cp : categories) { + // build per-association key BytesRef + CategoryAssociation association = associations.getAssociation(cp); + + if (association == null) { + // it is ok to set a null association for a category - it's treated as a + // regular category in that case. + ++idx; + continue; + } + + BytesRef bytes = res.get(association.getCategoryListID()); + if (bytes == null) { + bytes = new BytesRef(32); + res.put(association.getCategoryListID(), bytes); + } + + int maxBytesNeeded = 4 /* int */ + association.maxBytesNeeded() + bytes.length; + if (bytes.bytes.length < maxBytesNeeded) { + bytes.grow(maxBytesNeeded); + } + + // reset the output to write from bytes.length (current position) until the end + output.reset(bytes.bytes, bytes.length, bytes.bytes.length - bytes.length); + output.writeInt(ordinals.ints[idx++]); + + // encode the association bytes + association.serialize(output); + + // update BytesRef + bytes.length = output.getPosition(); + } + + return res; + } + +} diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsPayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsPayloadIterator.java index d5ffcd7632d..6ced1c70ce8 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsPayloadIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsPayloadIterator.java @@ -53,20 +53,22 @@ public abstract class AssociationsPayloadIterator } /** - * Skip to the requested document. Returns true iff the document has categort - * association values and they were read successfully. + * Skip to the requested document. Returns true iff the document has category + * association values and they were read successfully. Associations are + * handled through {@link #handleAssociation(int, CategoryAssociation)} by + * extending classes. */ - public boolean setNextDoc(int docId) throws IOException { + protected final boolean setNextDoc(int docID) throws IOException { if (!hasAssociations) { // there are no associations at all return false; } - if (!pi.setdoc(docId)) { // no associations for the requested document + BytesRef bytes = pi.getPayload(docID); + if (bytes == null) { // no associations for the requested document return false; } - - BytesRef associations = pi.getPayload(); - ByteArrayDataInput in = new ByteArrayDataInput(associations.bytes, associations.offset, associations.length); + + ByteArrayDataInput in = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length); while (!in.eof()) { int ordinal = in.readInt(); association.deserialize(in); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryAssociationsContainer.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryAssociationsContainer.java index 27b3e58470b..67e6c4c8e90 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryAssociationsContainer.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryAssociationsContainer.java @@ -55,5 +55,10 @@ public class CategoryAssociationsContainer implements Iterable { public void clear() { categoryAssociations.clear(); } + + @Override + public String toString() { + return categoryAssociations.toString(); + } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryFloatAssociation.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryFloatAssociation.java index bd826dfb60c..459e523e15d 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryFloatAssociation.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryFloatAssociation.java @@ -71,5 +71,10 @@ public class CategoryFloatAssociation implements CategoryAssociation { public float getValue() { return value; } + + @Override + public String toString() { + return getClass().getSimpleName() + "(" + value + ")"; + } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryIntAssociation.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryIntAssociation.java index 6758d7d00fc..1d3b691b0d3 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryIntAssociation.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryIntAssociation.java @@ -72,4 +72,9 @@ public class CategoryIntAssociation implements CategoryAssociation { return value; } + @Override + public String toString() { + return getClass().getSimpleName() + "(" + value + ")"; + } + } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsPayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsPayloadIterator.java index 958dfb49ff7..0708910523d 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsPayloadIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsPayloadIterator.java @@ -40,23 +40,17 @@ public class FloatAssociationsPayloadIterator extends AssociationsPayloadIterato protected void handleAssociation(int ordinal, CategoryFloatAssociation association) { ordinalAssociations.put(ordinal, association.getValue()); } - - @Override - public boolean setNextDoc(int docId) throws IOException { - ordinalAssociations.clear(); - return super.setNextDoc(docId); - } /** - * Get the float association value for the given ordinal, or - * {@link Float#NaN} in case the ordinal has no association value. + * Returns the float association values of the categories that are associated + * with the given document, or {@code null} if the document has no + * associations. + *

+ * NOTE: you are not expected to modify the returned map. */ - public float getAssociation(int ordinal) { - if (ordinalAssociations.containsKey(ordinal)) { - return ordinalAssociations.get(ordinal); - } - - return Float.NaN; + public IntToFloatMap getAssociations(int docID) throws IOException { + ordinalAssociations.clear(); + return setNextDoc(docID) ? ordinalAssociations : null; } - + } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsPayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsPayloadIterator.java index ad983114084..e3bed4f51e5 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsPayloadIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsPayloadIterator.java @@ -31,12 +31,6 @@ public class IntAssociationsPayloadIterator extends AssociationsPayloadIterator< private final IntToIntMap ordinalAssociations = new IntToIntMap(); - /** - * The long-special-value returned for ordinals which have no associated int - * value. It is not in the int range of values making it a valid mark. - */ - public final static long NO_ASSOCIATION = Integer.MAX_VALUE + 1; - public IntAssociationsPayloadIterator(IndexReader reader, String field, CategoryIntAssociation association) throws IOException { super(reader, field, association); @@ -47,22 +41,16 @@ public class IntAssociationsPayloadIterator extends AssociationsPayloadIterator< ordinalAssociations.put(ordinal, association.getValue()); } - @Override - public boolean setNextDoc(int docId) throws IOException { - ordinalAssociations.clear(); - return super.setNextDoc(docId); - } - /** - * Get the integer association value for the given ordinal, or - * {@link #NO_ASSOCIATION} in case the ordinal has no association value. + * Returns the integer association values of the categories that are + * associated with the given document, or {@code null} if the document has no + * associations. + *

+ * NOTE: you are not expected to modify the returned map. */ - public long getAssociation(int ordinal) { - if (ordinalAssociations.containsKey(ordinal)) { - return ordinalAssociations.get(ordinal); - } - - return NO_ASSOCIATION; + public IntToIntMap getAssociations(int docID) throws IOException { + ordinalAssociations.clear(); + return setNextDoc(docID) ? ordinalAssociations : null; } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/CategoryListBuilder.java b/lucene/facet/src/java/org/apache/lucene/facet/index/CategoryListBuilder.java index 509c0c61684..0bea43235b4 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/index/CategoryListBuilder.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/CategoryListBuilder.java @@ -1,19 +1,11 @@ package org.apache.lucene.facet.index; import java.io.IOException; -import java.util.HashMap; -import java.util.Map.Entry; +import java.util.Map; -import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy; -import org.apache.lucene.facet.index.params.CategoryListParams; -import org.apache.lucene.facet.index.params.FacetIndexingParams; import org.apache.lucene.facet.taxonomy.CategoryPath; -import org.apache.lucene.facet.taxonomy.TaxonomyWriter; -import org.apache.lucene.facet.util.PartitionsUtils; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.UnsafeByteArrayOutputStream; -import org.apache.lucene.util.encoding.IntEncoder; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -33,149 +25,14 @@ import org.apache.lucene.util.encoding.IntEncoder; */ /** - * Builds a category list by encoding the category ordinals into one or more - * {@link BytesRef}. Each {@link BytesRef} corresponds to a set of ordinals that - * belong to the same partition. When partitions are not enabled (i.e. - * {@link FacetIndexingParams#getPartitionSize()} returns - * {@link Integer#MAX_VALUE}), only one {@link BytesRef} is returned by this - * class. + * Builds a category list data by encoding the appropriate information for every + * category and ordinal given to {@link #build(IntsRef, Iterable)}. + * + * @lucene.experimental */ -public class CategoryListBuilder { - - /** Specializes encoding ordinals when partitions are enabled/disabled. */ - private static abstract class OrdinalsEncoder { - OrdinalsEncoder() {} - public abstract void encode(int ordinal); - public abstract HashMap finish(); - } +public interface CategoryListBuilder { - private static final class NoPartitionsOrdinalsEncoder extends OrdinalsEncoder { - - private final IntEncoder encoder; - private final UnsafeByteArrayOutputStream ubaos; - private final String name; - - NoPartitionsOrdinalsEncoder(CategoryListParams categoryListParams) { - name = categoryListParams.getTerm().text(); - encoder = categoryListParams.createEncoder(); - ubaos = new UnsafeByteArrayOutputStream(); - encoder.reInit(ubaos); - } - - @Override - public void encode(int ordinal) { - try { - encoder.encode(ordinal); - } catch (IOException e) { - // shouldn't happen as we're writing to byte[] - throw new RuntimeException("unexpected exception", e); - } - } - - @Override - public HashMap finish() { - try { - encoder.close(); - } catch (IOException e) { - // shouldn't happen as we're writing to byte[] - throw new RuntimeException("unexpected exception", e); - } - HashMap result = new HashMap(); - result.put(name, new BytesRef(ubaos.toByteArray(), ubaos.getStartPos(), ubaos.length())); - return result; - } - - } - - private static final class PerPartitionOrdinalsEncoder extends OrdinalsEncoder { - - private final FacetIndexingParams indexingParams; - private final CategoryListParams categoryListParams; - private final int partitionSize; - private final HashMap partitionEncoder = new HashMap(); - private final HashMap partitionBytes = new HashMap(); - - PerPartitionOrdinalsEncoder(FacetIndexingParams indexingParams, CategoryListParams categoryListParams) { - this.indexingParams = indexingParams; - this.categoryListParams = categoryListParams; - this.partitionSize = indexingParams.getPartitionSize(); - } - - @Override - public void encode(int ordinal) { - final String name = PartitionsUtils.partitionNameByOrdinal(indexingParams, categoryListParams, ordinal); - IntEncoder encoder = partitionEncoder.get(name); - if (encoder == null) { - encoder = categoryListParams.createEncoder(); - final UnsafeByteArrayOutputStream ubaos = new UnsafeByteArrayOutputStream(); - encoder.reInit(ubaos); - partitionEncoder.put(name, encoder); - partitionBytes.put(name, ubaos); - } - try { - encoder.encode(ordinal % partitionSize); - } catch (IOException e) { - // shouldn't happen as we're writing to byte[] - throw new RuntimeException("unexpected exception", e); - } - } - - @Override - public HashMap finish() { - // finish encoding - IOUtils.closeWhileHandlingException(partitionEncoder.values()); - - HashMap bytes = new HashMap(); - for (Entry e : partitionBytes.entrySet()) { - UnsafeByteArrayOutputStream ubaos = e.getValue(); - bytes.put(e.getKey(), new BytesRef(ubaos.toByteArray(), ubaos.getStartPos(), ubaos.length())); - } - return bytes; - } - - } - - private final TaxonomyWriter taxoWriter; - private final OrdinalsEncoder ordinalsEncoder; - private final OrdinalPolicy ordinalPolicy; - - public CategoryListBuilder(CategoryListParams categoryListParams, FacetIndexingParams indexingParams, - TaxonomyWriter taxoWriter) { - this.taxoWriter = taxoWriter; - this.ordinalPolicy = indexingParams.getOrdinalPolicy(); - if (indexingParams.getPartitionSize() == Integer.MAX_VALUE) { - ordinalsEncoder = new NoPartitionsOrdinalsEncoder(categoryListParams); - } else { - ordinalsEncoder = new PerPartitionOrdinalsEncoder(indexingParams, categoryListParams); - } - } - - /** - * Encodes the given ordinal as well as any of its parent ordinals (per - * {@link OrdinalPolicy}). - */ - public void handle(int ordinal, CategoryPath cp) throws IOException { - ordinalsEncoder.encode(ordinal); - - // add all parent ordinals, per OrdinalPolicy - int parent = taxoWriter.getParent(ordinal); - while (parent > 0) { - if (ordinalPolicy.shouldAdd(parent)) { - ordinalsEncoder.encode(parent); - } - parent = taxoWriter.getParent(parent); - } - } - - /** - * Returns the encoded ordinals data. Every returned {@link BytesRef} - * corresponds to a single partition (as defined by - * {@link FacetIndexingParams#getPartitionSize()}) and the key denotes the - * partition ID. When no partitions are defined, the returned map includes - * only one value. - */ - public HashMap finish() { - return ordinalsEncoder.finish(); - } + /** Returns the encoded ordinals data. */ + public Map build(IntsRef ordinals, Iterable categories) throws IOException; } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/CountingListBuilder.java b/lucene/facet/src/java/org/apache/lucene/facet/index/CountingListBuilder.java new file mode 100644 index 00000000000..29b87b3f7b2 --- /dev/null +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/CountingListBuilder.java @@ -0,0 +1,160 @@ +package org.apache.lucene.facet.index; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy; +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.FacetIndexingParams; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.util.PartitionsUtils; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.encoding.IntEncoder; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A {@link CategoryListBuilder} which builds a counting list data by encoding + * the category ordinals into one or more {@link BytesRef}. Each + * {@link BytesRef} corresponds to a set of ordinals that belong to the same + * partition. When partitions are not enabled (i.e. + * {@link FacetIndexingParams#getPartitionSize()} returns + * {@link Integer#MAX_VALUE}), only one {@link BytesRef} is returned by this + * class. + *

+ * Counting lists are used usually for computing the weight of categories by + * summing their number of occurrences (hence counting) in a result set. + */ +public class CountingListBuilder implements CategoryListBuilder { + + /** Specializes encoding ordinals when partitions are enabled/disabled. */ + private static abstract class OrdinalsEncoder { + OrdinalsEncoder() {} + public abstract Map encode(IntsRef ordinals); + } + + private static final class NoPartitionsOrdinalsEncoder extends OrdinalsEncoder { + + private final IntEncoder encoder; + private final String name; + + NoPartitionsOrdinalsEncoder(CategoryListParams categoryListParams) { + name = categoryListParams.getTerm().text(); + encoder = categoryListParams.createEncoder(); + } + + @Override + public Map encode(IntsRef ordinals) { + final BytesRef bytes = new BytesRef(128); // should be enough for most common applications + encoder.encode(ordinals, bytes); + return Collections.singletonMap(name, bytes); + } + + } + + private static final class PerPartitionOrdinalsEncoder extends OrdinalsEncoder { + + private final FacetIndexingParams indexingParams; + private final CategoryListParams categoryListParams; + private final int partitionSize; + private final HashMap partitionEncoder = new HashMap(); + + PerPartitionOrdinalsEncoder(FacetIndexingParams indexingParams, CategoryListParams categoryListParams) { + this.indexingParams = indexingParams; + this.categoryListParams = categoryListParams; + this.partitionSize = indexingParams.getPartitionSize(); + } + + @Override + public HashMap encode(IntsRef ordinals) { + // build the partitionOrdinals map + final HashMap partitionOrdinals = new HashMap(); + for (int i = 0; i < ordinals.length; i++) { + int ordinal = ordinals.ints[i]; + final String name = PartitionsUtils.partitionNameByOrdinal(indexingParams, categoryListParams, ordinal); + IntsRef partitionOrds = partitionOrdinals.get(name); + if (partitionOrds == null) { + partitionOrds = new IntsRef(32); + partitionOrdinals.put(name, partitionOrds); + partitionEncoder.put(name, categoryListParams.createEncoder()); + } + partitionOrds.ints[partitionOrds.length++] = ordinal % partitionSize; + } + + HashMap partitionBytes = new HashMap(); + for (Entry e : partitionOrdinals.entrySet()) { + String name = e.getKey(); + final IntEncoder encoder = partitionEncoder.get(name); + final BytesRef bytes = new BytesRef(128); // should be enough for most common applications + encoder.encode(e.getValue(), bytes); + partitionBytes.put(name, bytes); + } + return partitionBytes; + } + + } + + private final OrdinalsEncoder ordinalsEncoder; + private final TaxonomyWriter taxoWriter; + private final OrdinalPolicy ordinalPolicy; + + public CountingListBuilder(CategoryListParams categoryListParams, FacetIndexingParams indexingParams, + TaxonomyWriter taxoWriter) { + this.taxoWriter = taxoWriter; + this.ordinalPolicy = indexingParams.getOrdinalPolicy(); + if (indexingParams.getPartitionSize() == Integer.MAX_VALUE) { + ordinalsEncoder = new NoPartitionsOrdinalsEncoder(categoryListParams); + } else { + ordinalsEncoder = new PerPartitionOrdinalsEncoder(indexingParams, categoryListParams); + } + } + + /** + * Every returned {@link BytesRef} corresponds to a single partition (as + * defined by {@link FacetIndexingParams#getPartitionSize()}) and the key + * denotes the partition ID. When no partitions are defined, the returned map + * contains only one value. + *

+ * NOTE: the {@code ordinals} array is modified by adding parent + * ordinals to it. Also, some encoders may sort the array and remove duplicate + * ordinals. Therefore you may want to invoke this method after you finished + * processing the array for other purposes. + */ + @Override + public Map build(IntsRef ordinals, Iterable categories) throws IOException { + int upto = ordinals.length; // since we add ordinals to IntsRef, iterate upto original length + + for (int i = 0; i < upto; i++) { + int ordinal = ordinals.ints[i]; + int parent = taxoWriter.getParent(ordinal); + while (parent > 0) { + if (ordinalPolicy.shouldAdd(parent)) { + ordinals.ints[ordinals.length++] = parent; + } + parent = taxoWriter.getParent(parent); + } + } + return ordinalsEncoder.encode(ordinals); + } + +} diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java b/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java index 07249aaec99..9fda2396ee5 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java @@ -2,6 +2,7 @@ package org.apache.lucene.facet.index; import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -21,6 +22,7 @@ import org.apache.lucene.facet.taxonomy.CategoryPath; import org.apache.lucene.facet.taxonomy.TaxonomyWriter; import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -69,7 +71,7 @@ public class FacetFields { return true; } - void setCategoriesData(HashMap categoriesData) { + void setCategoriesData(Map categoriesData) { this.categoriesData = categoriesData.entrySet().iterator(); } @@ -132,6 +134,9 @@ public class FacetFields { */ protected Map> createCategoryListMapping( Iterable categories) { + if (indexingParams.getAllCategoryListParams().size() == 1) { + return Collections.singletonMap(indexingParams.getCategoryListParams(null), categories); + } HashMap> categoryLists = new HashMap>(); for (CategoryPath cp : categories) { @@ -147,10 +152,15 @@ public class FacetFields { return categoryLists; } - /** Returns a {@link CategoryListBuilder} for encoding the given categories. */ - protected CategoryListBuilder getCategoryListBuilder(CategoryListParams categoryListParams, - Iterable categories /* needed for AssociationsFacetFields */) { - return new CategoryListBuilder(categoryListParams, indexingParams, taxonomyWriter); + /** + * Returns the category list data, as a mapping from key to {@link BytesRef} + * which includes the encoded data. Every ordinal in {@code ordinals} + * corrspond to a {@link CategoryPath} returned from {@code categories}. + */ + protected Map getCategoryListData(CategoryListParams categoryListParams, + IntsRef ordinals, Iterable categories /* needed for AssociationsFacetFields */) + throws IOException { + return new CountingListBuilder(categoryListParams, indexingParams, taxonomyWriter).build(ordinals, categories); } /** @@ -185,17 +195,25 @@ public class FacetFields { // for each CLP we add a different field for drill-down terms as well as for // counting list data. + IntsRef ordinals = new IntsRef(32); // should be enough for most common applications for (Entry> e : categoryLists.entrySet()) { final CategoryListParams clp = e.getKey(); final String field = clp.getTerm().field(); - // add the counting list data - CategoryListBuilder categoriesPayloadBuilder = getCategoryListBuilder(clp, e.getValue()); + // build category list data + ordinals.length = 0; // reset + int maxNumOrds = 0; for (CategoryPath cp : e.getValue()) { int ordinal = taxonomyWriter.addCategory(cp); - categoriesPayloadBuilder.handle(ordinal , cp); + maxNumOrds += cp.length; // ordinal and potentially all parents + if (ordinals.ints.length < maxNumOrds) { + ordinals.grow(maxNumOrds); + } + ordinals.ints[ordinals.length++] = ordinal; } - HashMap categoriesData = categoriesPayloadBuilder.finish(); + Map categoriesData = getCategoryListData(clp, ordinals, e.getValue()); + + // add the counting list data CountingListStream ts = new CountingListStream(); ts.setCategoriesData(categoriesData); doc.add(new Field(field, ts, COUNTING_LIST_PAYLOAD_TYPE)); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/OrdinalMappingAtomicReader.java b/lucene/facet/src/java/org/apache/lucene/facet/index/OrdinalMappingAtomicReader.java index e2f6a5acd88..522f383e4d4 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/index/OrdinalMappingAtomicReader.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/OrdinalMappingAtomicReader.java @@ -17,10 +17,7 @@ package org.apache.lucene.facet.index; * limitations under the License. */ -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.io.InputStream; import java.util.HashMap; import java.util.Map; @@ -36,6 +33,7 @@ import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.encoding.IntDecoder; import org.apache.lucene.util.encoding.IntEncoder; @@ -187,7 +185,7 @@ public class OrdinalMappingAtomicReader extends FilterAtomicReader { private class OrdinalMappingDocsAndPositionsEnum extends FilterDocsAndPositionsEnum { private final IntEncoder encoder; private final IntDecoder decoder; - private final ByteArrayOutputStream os = new ByteArrayOutputStream(); + private final IntsRef ordinals = new IntsRef(32); private final BytesRef payloadOut = new BytesRef(); public OrdinalMappingDocsAndPositionsEnum(DocsAndPositionsEnum in, CategoryListParams params) { @@ -202,21 +200,14 @@ public class OrdinalMappingAtomicReader extends FilterAtomicReader { if (payload == null) { return payload; } else { - InputStream is = new ByteArrayInputStream(payload.bytes, payload.offset, payload.length); - decoder.reInit(is); - os.reset(); - encoder.reInit(os); - long ordinal; - while ((ordinal = decoder.decode()) != IntDecoder.EOS) { - int newOrdinal = ordinalMap[(int)ordinal]; - encoder.encode(newOrdinal); + decoder.decode(payload, ordinals); + + // map the ordinals + for (int i = 0; i < ordinals.length; i++) { + ordinals.ints[i] = ordinalMap[ordinals.ints[i]]; } - encoder.close(); - // TODO (Facet): avoid copy? - byte out[] = os.toByteArray(); - payloadOut.bytes = out; - payloadOut.offset = 0; - payloadOut.length = out.length; + + encoder.encode(ordinals, payloadOut); return payloadOut; } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java b/lucene/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java index 349da82d7ef..576b9be077a 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java @@ -7,7 +7,7 @@ import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.facet.search.CategoryListIterator; -import org.apache.lucene.facet.search.PayloadIntDecodingIterator; +import org.apache.lucene.facet.search.PayloadCategoryListIteraor; import org.apache.lucene.facet.search.TotalFacetCounts; import org.apache.lucene.facet.util.PartitionsUtils; import org.apache.lucene.util.encoding.DGapIntEncoder; @@ -142,7 +142,7 @@ public class CategoryListParams implements Serializable { int partition) throws IOException { String categoryListTermStr = PartitionsUtils.partitionName(this, partition); Term payloadTerm = new Term(term.field(), categoryListTermStr); - return new PayloadIntDecodingIterator(reader, payloadTerm, + return new PayloadCategoryListIteraor(reader, payloadTerm, createEncoder().createMatchingDecoder()); } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/params/FacetIndexingParams.java b/lucene/facet/src/java/org/apache/lucene/facet/index/params/FacetIndexingParams.java index 62502e75443..3fca0e3a4aa 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/index/params/FacetIndexingParams.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/params/FacetIndexingParams.java @@ -83,18 +83,9 @@ public class FacetIndexingParams { } /** - * The name of the category list to put this category in, or {@code null} if - * this category should not be aggregatable. - *

- * By default, all categories are written to the same category list, but - * applications which know in advance that in some situations only parts of - * the category hierarchy needs to be counted can divide the categories into - * two or more different category lists. - *

- * If {@code null} is returned for a category, it means that this category - * should not appear in any category list, and thus weights for it cannot be - * aggregated. This category can still be used for drill-down, even though the - * its weight is unknown. + * Returns the {@link CategoryListParams} for this {@link CategoryPath}. The + * default implementation returns the same {@link CategoryListParams} for all + * categories (even if {@code category} is {@code null}). * * @see PerDimensionIndexingParams */ diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/params/PerDimensionIndexingParams.java b/lucene/facet/src/java/org/apache/lucene/facet/index/params/PerDimensionIndexingParams.java index 1ccb58d309f..c4b971a5ca9 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/index/params/PerDimensionIndexingParams.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/params/PerDimensionIndexingParams.java @@ -78,7 +78,9 @@ public class PerDimensionIndexingParams extends FacetIndexingParams { /** * Returns the {@link CategoryListParams} for the corresponding dimension - * which is returned by {@code category.getComponent(0)}. + * which is returned by {@code category.getComponent(0)}. If {@code category} + * is {@code null}, or was not specified in the map given to the constructor, + * returns the default {@link CategoryListParams}. */ @Override public CategoryListParams getCategoryListParams(CategoryPath category) { diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/CategoryListIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/CategoryListIterator.java index 79f89909030..5132e930264 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/CategoryListIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/CategoryListIterator.java @@ -2,6 +2,8 @@ package org.apache.lucene.facet.search; import java.io.IOException; +import org.apache.lucene.util.IntsRef; + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -20,20 +22,10 @@ import java.io.IOException; */ /** - * An interface for iterating over a "category list", i.e., the list of - * categories per document. + * An interface for obtaining the category ordinals of documents. *

- * NOTE: - *

    - *
  • This class operates as a key to a Map. Appropriate implementation of - * hashCode() and equals() must be provided. - *
  • {@link #init()} must be called before you consume any categories, or call - * {@link #skipTo(int)}. - *
  • {@link #skipTo(int)} must be called before any calls to - * {@link #nextCategory()}. - *
  • {@link #nextCategory()} returns values < {@link Integer#MAX_VALUE}, so - * you can use it as a stop condition. - *
+ * NOTE: this class operates as a key to a map, and therefore you should + * implement {@code equals()} and {@code hashCode()} for proper behavior. * * @lucene.experimental */ @@ -41,29 +33,20 @@ public interface CategoryListIterator { /** * Initializes the iterator. This method must be called before any calls to - * {@link #skipTo(int)}, and its return value indicates whether there are - * any relevant documents for this iterator. If it returns false, any call - * to {@link #skipTo(int)} will return false as well.
- * NOTE: calling this method twice may result in skipping over - * documents for some implementations. Also, calling it again after all - * documents were consumed may yield unexpected behavior. + * {@link #getOrdinals(int, IntsRef)}, and its return value indicates whether there are + * any relevant documents for this iterator. */ public boolean init() throws IOException; /** - * Skips forward to document docId. Returns true iff this document exists - * and has any categories. This method must be called before calling - * {@link #nextCategory()} for a particular document.
- * NOTE: Users should call this method with increasing docIds, and - * implementations can assume that this is the case. + * Stores the category ordinals of the given document ID in the given + * {@link IntsRef}, starting at position 0 upto {@link IntsRef#length}. Grows + * the {@link IntsRef} if it is not large enough. + * + *

+ * NOTE: if the requested document does not category ordinals + * associated with it, {@link IntsRef#length} is set to zero. */ - public boolean skipTo(int docId) throws IOException; - - /** - * Returns the next category for the current document that is set through - * {@link #skipTo(int)}, or a number higher than {@link Integer#MAX_VALUE}. - * No assumptions can be made on the order of the categories. - */ - public long nextCategory() throws IOException; - + public void getOrdinals(int docID, IntsRef ints) throws IOException; + } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIntDecodingIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadCategoryListIteraor.java similarity index 52% rename from lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIntDecodingIterator.java rename to lucene/facet/src/java/org/apache/lucene/facet/search/PayloadCategoryListIteraor.java index db4803e1019..3deba112f3e 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIntDecodingIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadCategoryListIteraor.java @@ -5,7 +5,7 @@ import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.UnsafeByteArrayInputStream; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.encoding.IntDecoder; /* @@ -26,44 +26,21 @@ import org.apache.lucene.util.encoding.IntDecoder; */ /** - * A payload deserializer comes with its own working space (buffer). One need to - * define the {@link IndexReader} and {@link Term} in which the payload resides. - * The iterator then consumes the payload information of each document and - * decodes it into categories. A typical use case of this class is: - * - *

- * IndexReader reader = [open your reader];
- * Term t = new Term("field", "where-payload-exists");
- * CategoryListIterator cli = new PayloadIntDecodingIterator(reader, t);
- * if (!cli.init()) {
- *   // it means there are no payloads / documents associated with that term.
- *   // Usually a sanity check. However, init() must be called.
- * }
- * DocIdSetIterator disi = [you usually iterate on something else, such as a Scorer];
- * int doc;
- * while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
- *   cli.setdoc(doc);
- *   long category;
- *   while ((category = cli.nextCategory()) < Integer.MAX_VALUE) {
- *   }
- * }
- * 
+ * A {@link CategoryListIterator} which reads the category ordinals from a + * payload. * * @lucene.experimental */ -public class PayloadIntDecodingIterator implements CategoryListIterator { +public class PayloadCategoryListIteraor implements CategoryListIterator { - private final UnsafeByteArrayInputStream ubais; private final IntDecoder decoder; - private final IndexReader indexReader; private final Term term; private final PayloadIterator pi; private final int hashCode; - public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder) throws IOException { + public PayloadCategoryListIteraor(IndexReader indexReader, Term term, IntDecoder decoder) throws IOException { pi = new PayloadIterator(indexReader, term); - ubais = new UnsafeByteArrayInputStream(); this.decoder = decoder; hashCode = indexReader.hashCode() ^ term.hashCode(); this.term = term; @@ -72,10 +49,10 @@ public class PayloadIntDecodingIterator implements CategoryListIterator { @Override public boolean equals(Object other) { - if (!(other instanceof PayloadIntDecodingIterator)) { + if (!(other instanceof PayloadCategoryListIteraor)) { return false; } - PayloadIntDecodingIterator that = (PayloadIntDecodingIterator) other; + PayloadCategoryListIteraor that = (PayloadCategoryListIteraor) other; if (hashCode != that.hashCode) { return false; } @@ -95,21 +72,12 @@ public class PayloadIntDecodingIterator implements CategoryListIterator { } @Override - public long nextCategory() throws IOException { - return decoder.decode(); - } - - @Override - public boolean skipTo(int docId) throws IOException { - if (!pi.setdoc(docId)) { - return false; + public void getOrdinals(int docID, IntsRef ints) throws IOException { + ints.length = 0; + BytesRef payload = pi.getPayload(docID); + if (payload != null) { + decoder.decode(payload, ints); } - - // Initializing the decoding mechanism with the new payload data - BytesRef data = pi.getPayload(); - ubais.reInit(data.bytes, data.offset, data.length + data.offset); - decoder.reInit(ubais); - return true; } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java index a12f2cd91f6..7cc7527280d 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java @@ -34,9 +34,9 @@ import org.apache.lucene.util.BytesRef; * A utility class for iterating through a posting list of a given term and * retrieving the payload of the first position in every document. For * efficiency, this class does not check if documents passed to - * {@link #setdoc(int)} are deleted, since it is usually used to iterate on + * {@link #getPayload(int)} are deleted, since it is usually used to iterate on * payloads of documents that matched a query. If you need to skip over deleted - * documents, you should do so before calling {@link #setdoc(int)}. + * documents, you should do so before calling {@link #getPayload(int)}. * * @lucene.experimental */ @@ -84,8 +84,8 @@ public class PayloadIterator { /** * Initialize the iterator. Should be done before the first call to - * {@link #setdoc(int)}. Returns {@code false} if no category list is found, - * or the category list has no documents. + * {@link #getPayload(int)}. Returns {@code false} if no category list is + * found, or the category list has no documents. */ public boolean init() throws IOException { nextSegment(); @@ -93,30 +93,29 @@ public class PayloadIterator { } /** - * Skip forward to document docId. Return true if this document exists and - * has any payload. - *

- * Users should call this method with increasing docIds, and implementations - * can assume that this is the case. + * Returns the {@link BytesRef payload} of the given document, or {@code null} + * if the document does not exist, there are no more documents in the posting + * list, or the document exists but has not payload. You should call + * {@link #init()} before the first call to this method. */ - public boolean setdoc(int docId) throws IOException { + public BytesRef getPayload(int docID) throws IOException { if (!hasMore) { - return false; + return null; } // re-basing docId->localDocID is done fewer times than currentDoc->globalDoc - int localDocID = docId - curDocBase; + int localDocID = docID - curDocBase; if (curDocID > localDocID) { // document does not exist - return false; + return null; } if (curDocID < localDocID) { // look for the document either in that segment, or others while (hasMore && (curDocID = currentDPE.advance(localDocID)) == DocIdSetIterator.NO_MORE_DOCS) { nextSegment(); // also updates curDocID - localDocID = docId - curDocBase; + localDocID = docID - curDocBase; // nextSegment advances to nextDoc, so check if we still need to advance if (curDocID >= localDocID) { break; @@ -127,7 +126,7 @@ public class PayloadIterator { // 1. we iterated over all segments (hasMore=false) // 2. current segment advanced to a doc, either requested or higher if (!hasMore || curDocID != localDocID) { - return false; + return null; } } @@ -135,12 +134,7 @@ public class PayloadIterator { assert currentDPE.freq() == 1 : "expecting freq=1 (got " + currentDPE.freq() + ") term=" + term + " doc=" + (curDocID + curDocBase); int pos = currentDPE.nextPosition(); assert pos != -1 : "no positions for term=" + term + " doc=" + (curDocID + curDocBase); - data = currentDPE.getPayload(); - return data != null; + return currentDPE.getPayload(); } - public BytesRef getPayload() { - return data; - } - } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java index 78b362ea063..32466269d60 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java @@ -10,6 +10,7 @@ import java.util.logging.Level; import java.util.logging.Logger; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.facet.search.aggregator.Aggregator; import org.apache.lucene.facet.search.params.FacetSearchParams; @@ -231,9 +232,9 @@ public class StandardFacetsAccumulator extends FacetsAccumulator { facetArrays.free(); // to get a cleared array for this partition } - HashMap categoryLists = getCategoryListMap( - facetArrays, partition); + HashMap categoryLists = getCategoryListMap(facetArrays, partition); + IntsRef ordinals = new IntsRef(32); // a reasonable start capacity for most common apps for (Entry entry : categoryLists.entrySet()) { CategoryListIterator categoryList = entry.getKey(); if (!categoryList.init()) { @@ -244,14 +245,11 @@ public class StandardFacetsAccumulator extends FacetsAccumulator { ScoredDocIDsIterator iterator = docids.iterator(); while (iterator.next()) { int docID = iterator.getDocID(); - if (!categoryList.skipTo(docID)) { + categoryList.getOrdinals(docID, ordinals); + if (ordinals.length == 0) { continue; } - categorator.setNextDoc(docID, iterator.getScore()); - long ordinal; - while ((ordinal = categoryList.nextCategory()) <= Integer.MAX_VALUE) { - categorator.aggregate((int) ordinal); - } + categorator.aggregate(docID, iterator.getScore(), ordinals); } } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java index 447ddd9b49d..a3743317cc8 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java @@ -2,6 +2,8 @@ package org.apache.lucene.facet.search.aggregator; import java.io.IOException; +import org.apache.lucene.util.IntsRef; + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -36,16 +38,9 @@ import java.io.IOException; public interface Aggregator { /** - * Specify the document (and its score in the search) that the following - * {@link #aggregate(int)} calls will pertain to. + * Aggregate the ordinals of the given document ID (and its score). The given + * ordinals offset is always zero. */ - void setNextDoc(int docid, float score) throws IOException; - - /** - * Collect (and do whatever an implementation deems appropriate) the - * category given by its ordinal. This category belongs to a document - * given earlier by {@link #setNextDoc(int, float)}. - */ - void aggregate(int ordinal); - + public void aggregate(int docID, float score, IntsRef ordinals) throws IOException; + } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ComplementCountingAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ComplementCountingAggregator.java index 3bf62c4e520..50ca39fa352 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ComplementCountingAggregator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ComplementCountingAggregator.java @@ -1,5 +1,9 @@ package org.apache.lucene.facet.search.aggregator; +import java.io.IOException; + +import org.apache.lucene.util.IntsRef; + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -29,9 +33,12 @@ public class ComplementCountingAggregator extends CountingAggregator { } @Override - public void aggregate(int ordinal) { - assert counterArray[ordinal]!=0:"complement aggregation: count is about to become negative for ordinal "+ordinal; - --counterArray[ordinal]; + public void aggregate(int docID, float score, IntsRef ordinals) throws IOException { + for (int i = 0; i < ordinals.length; i++) { + int ord = ordinals.ints[i]; + assert counterArray[ord] != 0 : "complement aggregation: count is about to become negative for ordinal " + ord; + --counterArray[ord]; + } } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java index 5dd54a06b3a..8cd71595dc8 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java @@ -1,5 +1,9 @@ package org.apache.lucene.facet.search.aggregator; +import java.io.IOException; + +import org.apache.lucene.util.IntsRef; + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -27,21 +31,17 @@ package org.apache.lucene.facet.search.aggregator; public class CountingAggregator implements Aggregator { protected int[] counterArray; - - @Override - public void aggregate(int ordinal) { - ++counterArray[ordinal]; - } - - @Override - public void setNextDoc(int docid, float score) { - // There's nothing for us to do here since we only increment the count by 1 - // in this aggregator. - } - + public CountingAggregator(int[] counterArray) { this.counterArray = counterArray; } + + @Override + public void aggregate(int docID, float score, IntsRef ordinals) throws IOException { + for (int i = 0; i < ordinals.length; i++) { + counterArray[ordinals.ints[i]]++; + } + } @Override public boolean equals(Object obj) { @@ -54,8 +54,7 @@ public class CountingAggregator implements Aggregator { @Override public int hashCode() { - int hashCode = counterArray == null ? 0 : counterArray.hashCode(); - - return hashCode; + return counterArray == null ? 0 : counterArray.hashCode(); } + } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java index ef788968138..6c3dc492703 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java @@ -1,5 +1,9 @@ package org.apache.lucene.facet.search.aggregator; +import java.io.IOException; + +import org.apache.lucene.util.IntsRef; + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -26,7 +30,6 @@ package org.apache.lucene.facet.search.aggregator; public class ScoringAggregator implements Aggregator { private final float[] scoreArray; - private float score; private final int hashCode; public ScoringAggregator(float[] counterArray) { @@ -35,10 +38,12 @@ public class ScoringAggregator implements Aggregator { } @Override - public void aggregate(int ordinal) { - scoreArray[ordinal] += score; + public void aggregate(int docID, float score, IntsRef ordinals) throws IOException { + for (int i = 0; i < ordinals.length; i++) { + scoreArray[ordinals.ints[i]] += score; + } } - + @Override public boolean equals(Object obj) { if (obj == null || obj.getClass() != this.getClass()) { @@ -53,8 +58,4 @@ public class ScoringAggregator implements Aggregator { return hashCode; } - @Override - public void setNextDoc(int docid, float score) { - this.score = score; - } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationFloatSumAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationFloatSumAggregator.java index 57814d7cf57..22ebfecdba9 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationFloatSumAggregator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationFloatSumAggregator.java @@ -7,6 +7,8 @@ import org.apache.lucene.facet.associations.FloatAssociationsPayloadIterator; import org.apache.lucene.facet.index.params.CategoryListParams; import org.apache.lucene.facet.search.aggregator.Aggregator; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.collections.IntToFloatMap; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -48,13 +50,18 @@ public class AssociationFloatSumAggregator implements Aggregator { } @Override - public void aggregate(int ordinal) { - float association = associations.getAssociation(ordinal); - if (!Float.isNaN(association)) { - sumArray[ordinal] += association; + public void aggregate(int docID, float score, IntsRef ordinals) throws IOException { + IntToFloatMap values = associations.getAssociations(docID); + if (values != null) { + for (int i = 0; i < ordinals.length; i++) { + int ord = ordinals.ints[i]; + if (values.containsKey(ord)) { + sumArray[ord] += values.get(ord); + } + } } } - + @Override public boolean equals(Object obj) { if (obj == null || obj.getClass() != this.getClass()) { @@ -69,9 +76,4 @@ public class AssociationFloatSumAggregator implements Aggregator { return field.hashCode(); } - @Override - public void setNextDoc(int docid, float score) throws IOException { - associations.setNextDoc(docid); - } - } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationIntSumAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationIntSumAggregator.java index 42260597376..2f12080a35b 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationIntSumAggregator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationIntSumAggregator.java @@ -7,6 +7,8 @@ import org.apache.lucene.facet.associations.IntAssociationsPayloadIterator; import org.apache.lucene.facet.index.params.CategoryListParams; import org.apache.lucene.facet.search.aggregator.Aggregator; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.collections.IntToIntMap; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -48,13 +50,18 @@ public class AssociationIntSumAggregator implements Aggregator { } @Override - public void aggregate(int ordinal) { - long association = associations.getAssociation(ordinal); - if (association != IntAssociationsPayloadIterator.NO_ASSOCIATION) { - sumArray[ordinal] += association; + public void aggregate(int docID, float score, IntsRef ordinals) throws IOException { + IntToIntMap values = associations.getAssociations(docID); + if (values != null) { + for (int i = 0; i < ordinals.length; i++) { + int ord = ordinals.ints[i]; + if (values.containsKey(ord)) { + sumArray[ord] += values.get(ord); + } + } } } - + @Override public boolean equals(Object obj) { if (obj == null || obj.getClass() != this.getClass()) { @@ -69,9 +76,4 @@ public class AssociationIntSumAggregator implements Aggregator { return field.hashCode(); } - @Override - public void setNextDoc(int docid, float score) throws IOException { - associations.setNextDoc(docid); - } - } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java b/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java index cc0daa4f6fe..4dc5c694a8c 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java @@ -2,13 +2,12 @@ package org.apache.lucene.facet.search.cache; import java.io.IOException; -import org.apache.lucene.index.IndexReader; - import org.apache.lucene.facet.index.params.CategoryListParams; import org.apache.lucene.facet.index.params.FacetIndexingParams; import org.apache.lucene.facet.search.CategoryListIterator; import org.apache.lucene.facet.taxonomy.TaxonomyReader; -import org.apache.lucene.util.collections.IntArray; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -56,33 +55,26 @@ public class CategoryListData { protected CategoryListData() { } - /** - * Compute category list data for caching for faster iteration. - */ + /** Compute category list data for caching for faster iteration. */ CategoryListData(IndexReader reader, TaxonomyReader taxo, FacetIndexingParams iparams, CategoryListParams clp) throws IOException { final int maxDoc = reader.maxDoc(); int[][][]dpf = new int[maxDoc][][]; int numPartitions = (int)Math.ceil(taxo.getSize()/(double)iparams.getPartitionSize()); - IntArray docCategories = new IntArray(); - for (int part=0; part 0) { + if (dpf[doc] == null) { dpf[doc] = new int[numPartitions][]; } - long category; - while ((category = cli.nextCategory()) <= Integer.MAX_VALUE) { - docCategories.addToArray((int)category); - } - final int size = docCategories.size(); - dpf[doc][part] = new int[size]; - for (int i=0; ipart; + return dpc != null && dpc.length > part; } @Override - public long nextCategory() throws IOException { - if (nextCategoryIndex >= dpc[currDoc][part].length) { - return 1L+Integer.MAX_VALUE; + public void getOrdinals(int docID, IntsRef ints) throws IOException { + ints.length = 0; + if (dpc.length > docID && dpc[docID] != null && dpc[docID][part] != null) { + if (ints.ints.length < dpc[docID][part].length) { + ints.grow(dpc[docID][part].length); + } + ints.length = 0; + for (int i = 0; i < dpc[docID][part].length; i++) { + ints.ints[ints.length++] = dpc[docID][part][i]; + } } - return dpc[currDoc][part][nextCategoryIndex++]; - } - - @Override - public boolean skipTo(int docId) throws IOException { - final boolean res = dpc.length>docId && dpc[docId]!=null && dpc[docId][part]!=null; - if (res) { - currDoc = docId; - nextCategoryIndex = 0; - } - return res; } } + } \ No newline at end of file diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java b/lucene/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java index 9a55244a5ee..71c7df7f0c4 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java @@ -48,8 +48,7 @@ public class CountFacetRequest extends FacetRequest { @Override public Aggregator createAggregator(boolean useComplements, - FacetArrays arrays, IndexReader reader, - TaxonomyReader taxonomy) { + FacetArrays arrays, IndexReader reader, TaxonomyReader taxonomy) { // we rely on that, if needed, result is cleared by arrays! int[] a = arrays.getIntArray(); if (useComplements) { diff --git a/lucene/facet/src/java/org/apache/lucene/facet/util/MultiCategoryListIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/util/MultiCategoryListIterator.java index 13fc778bad1..402ba3dcf69 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/util/MultiCategoryListIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/util/MultiCategoryListIterator.java @@ -5,6 +5,7 @@ import java.util.ArrayList; import java.util.List; import org.apache.lucene.facet.search.CategoryListIterator; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -33,16 +34,13 @@ public class MultiCategoryListIterator implements CategoryListIterator { private final CategoryListIterator[] iterators; private final List validIterators; - private final List perDocValidIterators; /** Receives the iterators to iterate on */ public MultiCategoryListIterator(CategoryListIterator... iterators) { this.iterators = iterators; this.validIterators = new ArrayList(); - this.perDocValidIterators = new ArrayList(); } - /** Fails if all given iterators fail to init */ @Override public boolean init() throws IOException { for (CategoryListIterator cli : iterators) { @@ -52,35 +50,17 @@ public class MultiCategoryListIterator implements CategoryListIterator { } return !validIterators.isEmpty(); } - - /** - * Return a value larger than {@link Integer#MAX_VALUE} only if all - * iterators are exhausted - */ + @Override - public long nextCategory() throws IOException { - while (!perDocValidIterators.isEmpty()) { - long value = perDocValidIterators.get(0).nextCategory(); - if (value <= Integer.MAX_VALUE) { - return value; - } - perDocValidIterators.remove(0); - } - return 0x100000000L; - } - - /** - * Fails only if skipTo on all the provided iterators returned {@code false} - */ - @Override - public boolean skipTo(int docId) throws IOException { - perDocValidIterators.clear(); + public void getOrdinals(int docID, IntsRef ints) throws IOException { + IntsRef tmp = new IntsRef(ints.length); for (CategoryListIterator cli : validIterators) { - if (cli.skipTo(docId)) { - perDocValidIterators.add(cli); + cli.getOrdinals(docID, tmp); + if (ints.ints.length < ints.length + tmp.length) { + ints.grow(ints.length + tmp.length); } + ints.length += tmp.length; } - return !perDocValidIterators.isEmpty(); } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/Vint8.java b/lucene/facet/src/java/org/apache/lucene/util/Vint8.java deleted file mode 100644 index 40dd96daebd..00000000000 --- a/lucene/facet/src/java/org/apache/lucene/util/Vint8.java +++ /dev/null @@ -1,229 +0,0 @@ -package org.apache.lucene.util; - -import java.io.EOFException; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Variable-length encoding of 32-bit integers, into 8-bit bytes. A number is encoded as follows: - *

    - *
  • If it is less than 127 and non-negative (i.e., if the number uses only 7 bits), it is encoded as - * as single byte: 0bbbbbbb. - *
  • If its highest nonzero bit is greater than bit 6 (0x40), it is represented as a series of - * bytes, each byte's - * 7 LSB containing bits from the original value, with the MSB set for all but the last - * byte. The first encoded byte contains the highest nonzero bits from the - * original; the second byte contains the next 7 MSB; and so on, with the last byte - * containing the 7 LSB of the original. - *
- * Examples: - *
    - *
  1. n = 117 = 1110101: This has fewer than 8 significant bits, and so is encoded as - * 01110101 = 0x75. - *
  2. n = 100000 = (binary) 11000011010100000. This has 17 significant bits, and so needs - * three Vint8 bytes. Left-zero-pad it to a multiple of 7 bits, then split it into chunks of 7 - * and add an MSB, 0 for the last byte, 1 for the others: 1|0000110 1|0001101 0|0100000 - * = 0x86 0x8D 0x20. - *
- * This encoder/decoder will correctly handle any 32-bit integer, but for negative numbers, - * and positive numbers with more than 28 significant bits, encoding requires 5 bytes; this - * is not an efficient encoding scheme for large - * positive numbers or any negative number. - *

- * Compatibility:
- * This class has been used in products that have shipped to customers, and is needed to - * decode legacy data. Do not modify this class in ways that will break compatibility. - * - * @lucene.experimental - */ -public class Vint8 { - - /** - * Because Java lacks call-by-reference, this class boxes the decoding position, which - * is initially set by the caller, and returned after decoding, incremented by the number - * of bytes processed. - */ - public static class Position { - /** - * Creates a position value set to zero. - */ - public Position() { - // The initial position is zero by default. - } - /** - * Creates a position set to {@code initialPosition}. - * @param initialPosition The starting decoding position in the source buffer. - */ - public Position(int initialPosition) { - this.pos = initialPosition; - } - /** - * The value passed by reference. - */ - public int pos; - } - - /** - * Returns the number of bytes needed to encode {@code number}. - * @param number The number whose encoded length is needed. - * @return The number of bytes needed to encode {@code number}. - */ - public static int bytesNeeded(int number) { - if ((number & ~0x7F) == 0) { - return 1; - } else if ((number & ~0x3FFF) == 0) { - return 2; - } else if ((number & ~0x1FFFFF) == 0) { - return 3; - } else if ((number & ~0xFFFFFFF) == 0) { - return 4; - } else { - return 5; - } - } - - /** - * The maximum number of bytes needed to encode a number using {@code Vint8}. - */ - public static final int MAXIMUM_BYTES_NEEDED = 5; - - /** - * Encodes {@code number} to {@code out}. - * @param number The value to be written in encoded form, to {@code out}. - * @param out The output stream receiving the encoded bytes. - * @exception IOException If there is a problem writing to {@code out}. - */ - public static void encode(int number, OutputStream out) throws IOException { - if ((number & ~0x7F) == 0) { - out.write(number); - } else if ((number & ~0x3FFF) == 0) { - out.write(0x80 | (number >> 7)); - out.write(0x7F & number); - } else if ((number & ~0x1FFFFF) == 0) { - out.write(0x80 | (number >> 14)); - out.write(0x80 | (number >> 7)); - out.write(0x7F & number); - } else if ((number & ~0xFFFFFFF) == 0) { - out.write(0x80 | (number >> 21)); - out.write(0x80 | (number >> 14)); - out.write(0x80 | (number >> 7)); - out.write(0x7F & number); - } else { - out.write(0x80 | (number >> 28)); - out.write(0x80 | (number >> 21)); - out.write(0x80 | (number >> 14)); - out.write(0x80 | (number >> 7)); - out.write(0x7F & number); - } - } - - /** - * Encodes {@code number} into {@code dest}, starting at offset {@code start} from - * the beginning of the array. This method assumes {@code dest} is large enough to - * hold the required number of bytes. - * @param number The number to be encoded. - * @param dest The destination array. - * @param start The starting offset in the array. - * @return The number of bytes used in the array. - */ - public static int encode(int number, byte[] dest, int start) { - if ((number & ~0x7F) == 0) { - dest[start] = (byte) number; - return 1; - } else if ((number & ~0x3FFF) == 0) { - dest[start] = (byte) (0x80 | ((number & 0x3F80) >> 7)); - dest[start + 1] = (byte) (number & 0x7F); - return 2; - } else if ((number & ~0x1FFFFF) == 0) { - dest[start] = (byte) (0x80 | ((number & 0x1FC000) >> 14)); - dest[start + 1] = (byte) (0x80 | ((number & 0x3F80) >> 7)); - dest[start + 2] = (byte) (number & 0x7F); - return 3; - } else if ((number & ~0xFFFFFFF) == 0) { - dest[start] = (byte) (0x80 | ((number & 0xFE00000) >> 21)); - dest[start + 1] = (byte) (0x80 | ((number & 0x1FC000) >> 14)); - dest[start + 2] = (byte) (0x80 | ((number & 0x3F80) >> 7)); - dest[start + 3] = (byte) (number & 0x7F); - return 4; - } else { - dest[start] = (byte) (0x80 | ((number & 0xF0000000) >> 28)); - dest[start + 1] = (byte) (0x80 | ((number & 0xFE00000) >> 21)); - dest[start + 2] = (byte) (0x80 | ((number & 0x1FC000) >> 14)); - dest[start + 3] = (byte) (0x80 | ((number & 0x3F80) >> 7)); - dest[start + 4] = (byte) (number & 0x7F); - return 5; - } - } - - /** - * Decodes a 32-bit integer from {@code bytes}, beginning at offset {@code pos.pos}. - * The decoded value is returned, and {@code pos.pos} is incremented by the number of - * bytes processed. - * @param bytes The byte array containing an encoded value. - * @param pos On entry, the starting position in the array; on return, one greater - * than the position of the last byte decoded in the call. - * @return The decoded value. - */ - public static int decode(byte[] bytes, Position pos) { - int value = 0; - while (true) { - byte first = bytes[pos.pos]; - ++pos.pos; - value |= first & 0x7F; - if ((first & 0x80) == 0) { - return value; - } - value <<= 7; - } - } - - /** - * Decodes a 32-bit integer from bytes read from {@code in}. Bytes are read, - * one at a time, from {@code in}, and it is assumed they represent a 32-bit - * integer encoded using this class's encoding scheme. The decoded value is - * returned. - * @param in The input stream containing the encoded bytes. - * @return The decoded value. - * @exception EOFException If the stream ends before a value has been decoded. - */ - public static int decode(InputStream in) throws IOException { - int value = 0; - while (true) { - int first = in.read(); - if (first < 0) { - throw new EOFException(); - } - value |= first & 0x7F; - if ((first & 0x80) == 0) { - return value; - } - value <<= 7; - } - } - - /** - * The default ctor is made private because all methods of this class are static. - */ - private Vint8() { - // Just making it impossible to instantiate. - } - -} diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/ChunksIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/ChunksIntEncoder.java index c640c53b6cc..a19550c29e5 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/ChunksIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/ChunksIntEncoder.java @@ -1,7 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.OutputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -27,38 +27,31 @@ import java.io.OutputStream; * read more on the two implementations {@link FourFlagsIntEncoder} and * {@link EightFlagsIntEncoder}. *

- * Extensions of this class need to implement {@link #encode(int)} in order to - * build the proper indicator (flags). When enough values were accumulated - * (typically the batch size), extensions can call {@link #encodeChunk()} to - * flush the indicator and the rest of the values. + * Extensions of this class need to implement {@link #encode(IntsRef, BytesRef)} + * in order to build the proper indicator (flags). When enough values were + * accumulated (typically the batch size), extensions can call + * {@link #encodeChunk(BytesRef)} to flush the indicator and the rest of the + * values. *

* NOTE: flags encoders do not accept values ≤ 0 (zero) in their - * {@link #encode(int)}. For performance reasons they do not check that - * condition, however if such value is passed the result stream may be corrupt - * or an exception will be thrown. Also, these encoders perform the best when - * there are many consecutive small values (depends on the encoder + * {@link #encode(IntsRef, BytesRef)}. For performance reasons they do not check + * that condition, however if such value is passed the result stream may be + * corrupt or an exception will be thrown. Also, these encoders perform the best + * when there are many consecutive small values (depends on the encoder * implementation). If that is not the case, the encoder will occupy 1 more byte * for every batch number of integers, over whatever * {@link VInt8IntEncoder} would have occupied. Therefore make sure to check * whether your data fits into the conditions of the specific encoder. *

* For the reasons mentioned above, these encoders are usually chained with - * {@link UniqueValuesIntEncoder} and {@link DGapIntEncoder} in the following - * manner:

- * IntEncoder fourFlags = 
- *         new SortingEncoderFilter(new UniqueValuesIntEncoder(new DGapIntEncoder(new FlagsIntEncoderImpl())));
- * 
+ * {@link UniqueValuesIntEncoder} and {@link DGapIntEncoder}. * * @lucene.experimental */ public abstract class ChunksIntEncoder extends IntEncoder { /** Holds the values which must be encoded, outside the indicator. */ - protected final int[] encodeQueue; - protected int encodeQueueSize = 0; - - /** Encoder used to encode values outside the indicator. */ - protected final IntEncoder encoder = new VInt8IntEncoder(); + protected final IntsRef encodeQueue; /** Represents bits flag byte. */ protected int indicator = 0; @@ -67,39 +60,33 @@ public abstract class ChunksIntEncoder extends IntEncoder { protected byte ordinal = 0; protected ChunksIntEncoder(int chunkSize) { - encodeQueue = new int[chunkSize]; + encodeQueue = new IntsRef(chunkSize); } /** * Encodes the values of the current chunk. First it writes the indicator, and * then it encodes the values outside the indicator. */ - protected void encodeChunk() throws IOException { - out.write(indicator); - for (int i = 0; i < encodeQueueSize; ++i) { - encoder.encode(encodeQueue[i]); + protected void encodeChunk(BytesRef buf) { + // ensure there's enough room in the buffer + int maxBytesRequired = buf.length + 1 + encodeQueue.length * 4; /* indicator + at most 4 bytes per positive VInt */ + if (buf.bytes.length < maxBytesRequired) { + buf.grow(maxBytesRequired); } - encodeQueueSize = 0; - ordinal = 0; - indicator = 0; + + buf.bytes[buf.length++] = ((byte) indicator); + for (int i = 0; i < encodeQueue.length; i++) { + VInt8.encode(encodeQueue.ints[i], buf); + } + + reset(); } @Override - public void close() throws IOException { - if (ordinal != 0) { - encodeChunk(); - } - encoder.close(); - super.close(); - } - - @Override - public void reInit(OutputStream out) { - encoder.reInit(out); - super.reInit(out); + protected void reset() { ordinal = 0; indicator = 0; - encodeQueueSize = 0; + encodeQueue.length = 0; } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntDecoder.java index a37a4b9f4bb..a5b2fb3c28c 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntDecoder.java @@ -1,7 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.InputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,10 +21,8 @@ import java.io.InputStream; */ /** - * An {@link IntDecoder} which wraps another {@link IntDecoder} and reverts the - * d-gap that was encoded by {@link DGapIntEncoder}. The wrapped decoder - * performs the actual decoding, while this class simply adds the decoded value - * to the previous value. + * An {@link IntDecoder} which wraps another decoder and reverts the d-gap that + * was encoded by {@link DGapIntEncoder}. * * @lucene.experimental */ @@ -32,26 +30,23 @@ public class DGapIntDecoder extends IntDecoder { private final IntDecoder decoder; - private int prev = 0; - public DGapIntDecoder(IntDecoder decoder) { this.decoder = decoder; } @Override - public long decode() throws IOException { - long decode = decoder.decode(); - if (decode == EOS) { - return EOS; - } - - return prev += decode; + protected void reset() { + decoder.reset(); } - + @Override - public void reInit(InputStream in) { - decoder.reInit(in); - prev = 0; + protected void doDecode(BytesRef buf, IntsRef values, int upto) { + decoder.doDecode(buf, values, upto); + int prev = 0; + for (int i = 0; i < values.length; i++) { + values.ints[i] += prev; + prev = values.ints[i]; + } } @Override diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntEncoder.java index 8b1a0deffaf..305f975c619 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntEncoder.java @@ -1,7 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.OutputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -27,7 +27,7 @@ import java.io.OutputStream; * space) if the values are 'close' to each other. *

* NOTE: this encoder assumes the values are given to - * {@link #encode(int)} in an ascending sorted manner, which ensures only + * {@link #encode(IntsRef, BytesRef)} in an ascending sorted manner, which ensures only * positive values are encoded and thus yields better performance. If you are * not sure whether the values are sorted or not, it is possible to chain this * encoder with {@link SortingIntEncoder} to ensure the values will be @@ -37,17 +37,20 @@ import java.io.OutputStream; */ public class DGapIntEncoder extends IntEncoderFilter { - private int prev = 0; - /** Initializes with the given encoder. */ public DGapIntEncoder(IntEncoder encoder) { super(encoder); } @Override - public void encode(int value) throws IOException { - encoder.encode(value - prev); - prev = value; + protected void doEncode(IntsRef values, BytesRef buf, int upto) { + int prev = 0; + for (int i = values.offset; i < upto; i++) { + int tmp = values.ints[i]; + values.ints[i] -= prev; + prev = tmp; + } + encoder.doEncode(values, buf, upto); } @Override @@ -55,12 +58,6 @@ public class DGapIntEncoder extends IntEncoderFilter { return new DGapIntDecoder(encoder.createMatchingDecoder()); } - @Override - public void reInit(OutputStream out) { - super.reInit(out); - prev = 0; - } - @Override public String toString() { return "DGap (" + encoder.toString() + ")"; diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntDecoder.java index fe2b9c6d67b..270fcffca52 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntDecoder.java @@ -1,7 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.InputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,20 +21,17 @@ import java.io.InputStream; */ /** - * Decodes data which was encoded by {@link EightFlagsIntEncoder}. Scans - * the indicator, one flag (1-bits) at a time, and decodes extra - * data using {@link VInt8IntDecoder}. + * Decodes values encoded with {@link EightFlagsIntEncoder}. * - * @see EightFlagsIntEncoder * @lucene.experimental */ public class EightFlagsIntDecoder extends IntDecoder { - /** + /* * Holds all combinations of indicator for fast decoding (saves time * on real-time bit manipulation) */ - private static final byte[][] decodeTable = new byte[256][8]; + private static final byte[][] DECODE_TABLE = new byte[256][8]; /** Generating all combinations of indicator into separate flags. */ static { @@ -42,45 +39,36 @@ public class EightFlagsIntDecoder extends IntDecoder { --i; for (int j = 8; j != 0;) { --j; - decodeTable[i][j] = (byte) ((i >>> j) & 0x1); + DECODE_TABLE[i][j] = (byte) ((i >>> j) & 0x1); } } } - private final IntDecoder decoder = new VInt8IntDecoder(); - - /** The indicator for decoding a chunk of 8 integers. */ - private int indicator; - - /** Used as an ordinal of 0 - 7, as the decoder decodes chunks of 8 integers. */ - private int ordinal = 0; - @Override - public long decode() throws IOException { - // If we've decoded 8 integers, read the next indicator. - if ((ordinal & 0x7) == 0) { - indicator = in.read(); - if (indicator < 0) { - return EOS; + protected void doDecode(BytesRef buf, IntsRef values, int upto) { + while (buf.offset < upto) { + // read indicator + int indicator = buf.bytes[buf.offset++] & 0xFF; + int ordinal = 0; + + int capacityNeeded = values.length + 8; + if (values.ints.length < capacityNeeded) { + values.grow(capacityNeeded); + } + + // process indicator, until we read 8 values, or end-of-buffer + while (ordinal != 8) { + if (DECODE_TABLE[indicator][ordinal++] == 0) { + if (buf.offset == upto) { // end of buffer + return; + } + // decode the value from the stream. + values.ints[values.length++] = VInt8.decode(buf) + 2; + } else { + values.ints[values.length++] = 1; + } } - ordinal = 0; } - - if (decodeTable[indicator][ordinal++] == 0) { - // decode the value from the stream. - long decode = decoder.decode(); - return decode == EOS ? EOS : decode + 2; - } - - return 1; - } - - @Override - public void reInit(InputStream in) { - super.reInit(in); - decoder.reInit(in); - ordinal = 0; - indicator = 0; } @Override diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntEncoder.java index ac427725674..143660a4742 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntEncoder.java @@ -1,6 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -20,14 +21,15 @@ import java.io.IOException; */ /** - * A {@link ChunksIntEncoder} which encodes data in chunks of 8. Every group starts with a single - * byte (called indicator) which represents 8 - 1 bit flags, where the value: + * A {@link ChunksIntEncoder} which encodes data in chunks of 8. Every group + * starts with a single byte (called indicator) which represents 8 - 1 bit + * flags, where the value: *

    *
  • 1 means the encoded value is '1' *
  • 0 means the value is encoded using {@link VInt8IntEncoder}, and the * encoded bytes follow the indicator.
    - * Since value 0 is illegal, and 1 is encoded in the indicator, the actual - * value that is encoded is value-2, which saves some more bits. + * Since value 0 is illegal, and 1 is encoded in the indicator, the actual value + * that is encoded is value-2, which saves some more bits. *
* Encoding example: *
    @@ -46,28 +48,36 @@ import java.io.IOException; */ public class EightFlagsIntEncoder extends ChunksIntEncoder { - /** + /* * Holds all combinations of indicator flags for fast encoding (saves * time on bit manipulation at encode time) */ - private static byte[] encodeTable = new byte[] { 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, (byte) 0x80 }; + private static final byte[] ENCODE_TABLE = new byte[] { 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, (byte) 0x80 }; public EightFlagsIntEncoder() { super(8); } @Override - public void encode(int data) throws IOException { - if (data == 1) { - indicator |= encodeTable[ordinal]; - } else { - encodeQueue[encodeQueueSize++] = data - 2; + protected void doEncode(IntsRef values, BytesRef buf, int upto) { + for (int i = values.offset; i < upto; i++) { + int value = values.ints[i]; + if (value == 1) { + indicator |= ENCODE_TABLE[ordinal]; + } else { + encodeQueue.ints[encodeQueue.length++] = value - 2; + } + ++ordinal; + + // encode the chunk and the indicator + if (ordinal == 8) { + encodeChunk(buf); + } } - ++ordinal; - - // If 8 values were encoded thus far, 'flush' them including the indicator. - if ((ordinal & 0x7) == 0) { - encodeChunk(); + + // encode remaining values + if (ordinal != 0) { + encodeChunk(buf); } } @@ -78,7 +88,7 @@ public class EightFlagsIntEncoder extends ChunksIntEncoder { @Override public String toString() { - return "EightFlags (" + encoder.toString() + ")"; + return "EightFlags (VInt)"; } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntDecoder.java index a597f9e2a38..ebc161ac9cd 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntDecoder.java @@ -1,7 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.InputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,11 +21,8 @@ import java.io.InputStream; */ /** - * Decodes data which was encoded by {@link FourFlagsIntEncoder}. Scans - * the indicator, one flag (1-bits) at a time, and decodes extra - * data using {@link VInt8IntDecoder}. + * Decodes values encoded with {@link FourFlagsIntEncoder}. * - * @see FourFlagsIntEncoder * @lucene.experimental */ public class FourFlagsIntDecoder extends IntDecoder { @@ -34,7 +31,7 @@ public class FourFlagsIntDecoder extends IntDecoder { * Holds all combinations of indicator for fast decoding (saves time * on real-time bit manipulation) */ - private final static byte[][] decodeTable = new byte[256][4]; + private final static byte[][] DECODE_TABLE = new byte[256][4]; /** Generating all combinations of indicator into separate flags. */ static { @@ -42,46 +39,36 @@ public class FourFlagsIntDecoder extends IntDecoder { --i; for (int j = 4; j != 0;) { --j; - decodeTable[i][j] = (byte) ((i >>> (j << 1)) & 0x3); + DECODE_TABLE[i][j] = (byte) ((i >>> (j << 1)) & 0x3); } } } - private final IntDecoder decoder = new VInt8IntDecoder(); - - /** The indicator for decoding a chunk of 4 integers. */ - private int indicator; - - /** Used as an ordinal of 0 - 3, as the decoder decodes chunks of 4 integers. */ - private int ordinal = 0; - @Override - public long decode() throws IOException { - // If we've decoded 8 integers, read the next indicator. - if ((ordinal & 0x3) == 0) { - indicator = in.read(); - if (indicator < 0) { - return EOS; + protected void doDecode(BytesRef buf, IntsRef values, int upto) { + while (buf.offset < upto) { + // read indicator + int indicator = buf.bytes[buf.offset++] & 0xFF; + int ordinal = 0; + + int capacityNeeded = values.length + 4; + if (values.ints.length < capacityNeeded) { + values.grow(capacityNeeded); + } + + while (ordinal != 4) { + byte decodeVal = DECODE_TABLE[indicator][ordinal++]; + if (decodeVal == 0) { + if (buf.offset == upto) { // end of buffer + return; + } + // decode the value from the stream. + values.ints[values.length++] = VInt8.decode(buf) + 4; + } else { + values.ints[values.length++] = decodeVal; + } } - ordinal = 0; } - - byte decodeVal = decodeTable[indicator][ordinal++]; - if (decodeVal == 0) { - // decode the value from the stream. - long decode = decoder.decode(); - return decode == EOS ? EOS : decode + 4; - } - - return decodeVal; - } - - @Override - public void reInit(InputStream in) { - super.reInit(in); - decoder.reInit(in); - ordinal = 0; - indicator = 0; } @Override diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntEncoder.java index a1e227d5cdf..535a90fb60c 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntEncoder.java @@ -1,6 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -48,11 +49,11 @@ import java.io.IOException; */ public class FourFlagsIntEncoder extends ChunksIntEncoder { - /** + /* * Holds all combinations of indicator flags for fast encoding (saves * time on bit manipulation @ encode time) */ - private static byte[][] encodeTable = new byte[][] { + private static final byte[][] ENCODE_TABLE = new byte[][] { new byte[] { 0x00, 0x00, 0x00, 0x00 }, new byte[] { 0x01, 0x04, 0x10, 0x40 }, new byte[] { 0x02, 0x08, 0x20, (byte) 0x80 }, @@ -63,26 +64,26 @@ public class FourFlagsIntEncoder extends ChunksIntEncoder { super(4); } - /** - * Small values (<=3) are stored in the indicator while larger - * values are saved for later encoding in the {@link #encodeQueue}. Since - * Vint8 will only encode values larger or equal to 4, the values saves for - * encoded are transformed to (value - 4).
    - * When a chunk is ready (got 4 values), the {@link #encodeChunk()} - * takes control. - */ @Override - public void encode(int data) throws IOException { - if (data <= 3) { - indicator |= encodeTable[data][ordinal]; - } else { - encodeQueue[encodeQueueSize++] = data - 4; + protected void doEncode(IntsRef values, BytesRef buf, int upto) { + for (int i = values.offset; i < upto; i++) { + int value = values.ints[i]; + if (value <= 3) { + indicator |= ENCODE_TABLE[value][ordinal]; + } else { + encodeQueue.ints[encodeQueue.length++] = value - 4; + } + ++ordinal; + + // encode the chunk and the indicator + if (ordinal == 4) { + encodeChunk(buf); + } } - ++ordinal; - - // If 4 values were encoded thus far, 'flush' them including the indicator. - if ((ordinal & 0x3) == 0) { - encodeChunk(); + + // encode remaining values + if (ordinal != 0) { + encodeChunk(buf); } } @@ -93,7 +94,7 @@ public class FourFlagsIntEncoder extends ChunksIntEncoder { @Override public String toString() { - return "FourFlags (" + encoder.toString() + ")"; + return "FourFlags (VInt)"; } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntDecoder.java index faabad4c8ed..acf83cc6158 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntDecoder.java @@ -1,7 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.InputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,33 +21,50 @@ import java.io.InputStream; */ /** - * Decodes integers from a set {@link InputStream}. For re-usability, the - * decoder's input stream can be set by ({@link #reInit(InputStream)}). - * By design, Decoders are NOT thread-safe. + * Decodes integers from a set {@link BytesRef}. * * @lucene.experimental */ public abstract class IntDecoder { - /** A special long value which is used to indicate end-of-stream has reached. */ - public static final long EOS = 0x100000000L; - - /** Input stream from which the encoded bytes are read */ - protected InputStream in; - - /** Sets the input stream from which the encoded data is read. */ - public void reInit(InputStream in) { - this.in = in; + /** + * Performs the actual decoding. Values should be read from + * {@link BytesRef#offset} up to {@code upto}. Also, {@code values} offset and + * length are set to 0 and the encoder is expected to update + * {@link IntsRef#length}, but not {@link IntsRef#offset}. + * + *

    + * NOTE: it is ok to use the buffer's offset as the current position in + * the buffer (and modify it), it will be reset by + * {@link #decode(BytesRef, IntsRef)}. + */ + protected abstract void doDecode(BytesRef buf, IntsRef values, int upto); + + /** + * Called before {@link #doDecode(BytesRef, IntsRef, int)} so that decoders + * can reset their state. + */ + protected void reset() { + // do nothing by default } /** - * Decodes data received from the input stream, and returns one decoded - * integer. If end of stream is reached, {@link #EOS} is returned. - * - * @return one decoded integer as long or {@link #EOS} if end-of-stream - * reached. - * @throws IOException if an I/O error occurs + * Decodes the values from the buffer into the given {@link IntsRef}. Note + * that {@code values.offset} and {@code values.length} are set to 0. */ - public abstract long decode() throws IOException; + public final void decode(BytesRef buf, IntsRef values) { + values.offset = values.length = 0; // must do that because we cannot grow() them otherwise + + // some decoders may use the buffer's offset as a position index, so save + // current offset. + int bufOffset = buf.offset; + + reset(); + doDecode(buf, values, buf.offset + buf.length); + assert values.offset == 0 : "offset should not have been modified by the decoder."; + + // fix offset + buf.offset = bufOffset; + } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoder.java index 4055d750810..0a3197d6c6b 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoder.java @@ -1,8 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.Closeable; -import java.io.IOException; -import java.io.OutputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -22,94 +21,47 @@ import java.io.OutputStream; */ /** - * Encodes integers to a set {@link OutputStream}. Extending classes need to - * override {@link #encode(int)} to encode the value using their encoding - * algorithm. The default implementation of {@link #close()} closes the set - * {@link OutputStream}. - *

    - * The default {@link #IntEncoder() constructor} is provided for convenience - * only. One must call {@link #reInit(OutputStream)} before calling - * {@link #encode(int)} or {@link #close()}. - *

    - * For convenience, each encoder implements {@link #createMatchingDecoder()} for - * easy access to the matching decoder. - *

    - * NOTE: some implementations may buffer the encoded values in memory - * (such as {@link IntEncoderFilter} implementations) and encoding will happen - * only upon calling {@link #close()}. Therefore it is important to always call - * {@link #close()} on the encoder at hand. - *

    - * NOTE: encoders are usually not thread safe, unless specifically - * documented otherwise by an implementation. + * Encodes integers to a set {@link BytesRef}. For convenience, each encoder + * implements {@link #createMatchingDecoder()} for easy access to the matching + * decoder. * * @lucene.experimental */ -public abstract class IntEncoder implements Closeable { +public abstract class IntEncoder { - protected OutputStream out = null; + public IntEncoder() {} /** - * Default constructor, provided here for robustness: if in the future a - * constructor with parameters will be added, this might break custom - * implementations of this class which call this implicit constructor. So we - * make it explicit to avoid any such issue in the future. + * Performs the actual encoding. Values should be read from + * {@link IntsRef#offset} up to {@code upto}. Also, it is guaranteed that + * {@code buf's} offset and length are set to 0 and the encoder is expected to + * update {@link BytesRef#length}, but not {@link BytesRef#offset}. */ - public IntEncoder() { + protected abstract void doEncode(IntsRef values, BytesRef buf, int upto); + + /** + * Called before {@link #doEncode(IntsRef, BytesRef, int)} so that encoders + * can reset their state. + */ + protected void reset() { + // do nothing by default } /** - * Instructs the encoder to finish the encoding process. This method closes - * the output stream which was specified by {@link #reInit(OutputStream) - * reInit}. An implementation may do here additional cleanup required to - * complete the encoding, such as flushing internal buffers, etc.
    - * Once this method was called, no further calls to {@link #encode(int) - * encode} should be made before first calling {@link #reInit(OutputStream) - * reInit}. - *

    - * NOTE: overriding classes should make sure they either call - * super.close() or close the output stream themselves. + * Encodes the values to the given buffer. Note that the buffer's offset and + * length are set to 0. */ - @Override - public void close() throws IOException { - if (out != null) { - out.close(); - } + public final void encode(IntsRef values, BytesRef buf) { + buf.offset = buf.length = 0; + reset(); + doEncode(values, buf, values.offset + values.length); + assert buf.offset == 0 : "offset should not have been modified by the encoder."; } /** - * Encodes an integer to the output stream given in - * {@link #reInit(OutputStream) reInit} - */ - public abstract void encode(int value) throws IOException; - - /** - * Returns an {@link IntDecoder} which matches this encoder. Every encoder - * must return an {@link IntDecoder} and null is not a valid - * value. If an encoder is just a filter, it should at least return its - * wrapped encoder's matching decoder. - *

    - * NOTE: this method should create a new instance of the matching - * decoder and leave the instance sharing to the caller. Returning the same - * instance over and over is risky because encoders and decoders are not - * thread safe. + * Returns an {@link IntDecoder} which can decode the values that were encoded + * with this encoder. */ public abstract IntDecoder createMatchingDecoder(); - /** - * Reinitializes the encoder with the give {@link OutputStream}. For - * re-usability it can be changed without the need to reconstruct a new - * object. - *

    - * NOTE: after calling {@link #close()}, one must call - * this method even if the output stream itself hasn't changed. An example - * case is that the output stream wraps a byte[], and the output stream itself - * is reset, but its instance hasn't changed. Some implementations of - * {@link IntEncoder} may write some metadata about themselves to the output - * stream, and therefore it is imperative that one calls this method before - * encoding any data. - */ - public void reInit(OutputStream out) { - this.out = out; - } - } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoderFilter.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoderFilter.java index 6c7a40340d0..ee2e5db7e9e 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoderFilter.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoderFilter.java @@ -1,7 +1,5 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.OutputStream; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,17 +19,7 @@ import java.io.OutputStream; */ /** - * An abstract implementation of {@link IntEncoder} which is served as a filter - * on the values to encode. An encoder filter wraps another {@link IntEncoder} - * which does the actual encoding. This allows for chaining filters and - * encoders, such as:

    - * new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEnoder()));
    - * {@link UniqueValuesIntEncoder} followed by {@link DGapIntEncoder}
    -  
    - *

    - * The default implementation implements {@link #close()} by closing the wrapped - * encoder and {@link #reInit(OutputStream)} by re-initializing the wrapped - * encoder. + * An abstract implementation of {@link IntEncoder} which wraps another encoder. * * @lucene.experimental */ @@ -44,15 +32,8 @@ public abstract class IntEncoderFilter extends IntEncoder { } @Override - public void close() throws IOException { - // There is no need to call super.close(), since we don't pass the output - // stream to super. - encoder.close(); - } - - @Override - public void reInit(OutputStream out) { - encoder.reInit(out); + public void reset() { + encoder.reset(); } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntDecoder.java index 6d00e049080..1cf33857280 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntDecoder.java @@ -1,7 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.InputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,54 +21,65 @@ import java.io.InputStream; */ /** - * Decodes data which was encoded by {@link NOnesIntEncoder}. Uses a - * {@link FourFlagsIntDecoder} to perform the actual encoding and translates the - * values back as described in {@link NOnesIntEncoder}. + * Decodes values encoded encoded with {@link NOnesIntEncoder}. * - * @see NOnesIntEncoder * @lucene.experimental */ public class NOnesIntDecoder extends FourFlagsIntDecoder { - /** Number of consecutive '1's to generate upon decoding a '2'. */ - private int n; - - private int onesCounter; - + // Number of consecutive '1's to generate upon decoding a '2' + private final int n; + private final IntsRef internalBuffer; + /** * Constructs a decoder with a given N (Number of consecutive '1's which are * translated into a single target value '2'. */ public NOnesIntDecoder(int n) { this.n = n; + // initial size (room for 100 integers) + internalBuffer = new IntsRef(100); } @Override - public long decode() throws IOException { - // If we read '2', we should return n '1's. - if (onesCounter > 0) { - --onesCounter; - return 1; - } - - long decode = super.decode(); - if (decode == 1) { - return 1; - } - if (decode == 2) { - onesCounter = n - 1; - return 1; - } - if (decode == 3) { - return 2; - } - return decode == EOS ? EOS : decode - 1; + protected void reset() { + internalBuffer.length = 0; + super.reset(); } - + @Override - public void reInit(InputStream in) { - super.reInit(in); - onesCounter = 0; + protected void doDecode(BytesRef buf, IntsRef values, int upto) { + super.doDecode(buf, internalBuffer, upto); + if (values.ints.length < internalBuffer.length) { + // need space for internalBuffer.length to internalBuffer.length*N, + // grow mildly at first + values.grow(internalBuffer.length * n/2); + } + + for (int i = 0; i < internalBuffer.length; i++) { + int decode = internalBuffer.ints[i]; + if (decode == 1) { + if (values.length == values.ints.length) { + values.grow(values.length + 10); // grow by few items, however not too many + } + // 1 is 1 + values.ints[values.length++] = 1; + } else if (decode == 2) { + if (values.length + n >= values.ints.length) { + values.grow(values.length + n); // grow by few items, however not too many + } + // '2' means N 1's + for (int j = 0; j < n; j++) { + values.ints[values.length++] = 1; + } + } else { + if (values.length == values.ints.length) { + values.grow(values.length + 10); // grow by few items, however not too many + } + // any other value is val-1 + values.ints[values.length++] = decode - 1; + } + } } @Override diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntEncoder.java index badfe1cd31d..956eea253a3 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntEncoder.java @@ -1,7 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.OutputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -50,11 +50,10 @@ import java.io.OutputStream; */ public class NOnesIntEncoder extends FourFlagsIntEncoder { + private final IntsRef internalBuffer; + /** Number of consecutive '1's to be translated into single target value '2'. */ - private int n; - - /** Counts the number of consecutive ones seen. */ - private int onesCounter = 0; + private final int n; /** * Constructs an encoder with a given value of N (N: Number of consecutive @@ -62,38 +61,48 @@ public class NOnesIntEncoder extends FourFlagsIntEncoder { */ public NOnesIntEncoder(int n) { this.n = n; + internalBuffer = new IntsRef(n); } @Override - public void close() throws IOException { - // We might have ones in our buffer, encode them as neccesary. - while (onesCounter-- > 0) { - super.encode(1); + protected void reset() { + internalBuffer.length = 0; + super.reset(); + } + + @Override + protected void doEncode(IntsRef values, BytesRef buf, int upto) { + // make sure the internal buffer is large enough + if (values.length > internalBuffer.ints.length) { + internalBuffer.grow(values.length); } - - super.close(); - } - - @Override - public void encode(int value) throws IOException { - if (value == 1) { - // Increment the number of consecutive ones seen so far - if (++onesCounter == n) { - super.encode(2); - onesCounter = 0; + + int onesCounter = 0; + for (int i = values.offset; i < upto; i++) { + int value = values.ints[i]; + if (value == 1) { + // every N 1's should be encoded as '2' + if (++onesCounter == n) { + internalBuffer.ints[internalBuffer.length++] = 2; + onesCounter = 0; + } + } else { + // there might have been 1's that we need to encode + while (onesCounter > 0) { + --onesCounter; + internalBuffer.ints[internalBuffer.length++] = 1; + } + + // encode value as value+1 + internalBuffer.ints[internalBuffer.length++] = value + 1; } - return; } - - // If it's not one - there might have been ones we had to encode prior to - // this value + // there might have been 1's that we need to encode while (onesCounter > 0) { --onesCounter; - super.encode(1); + internalBuffer.ints[internalBuffer.length++] = 1; } - - // encode value + 1 --> the translation. - super.encode(value + 1); + super.doEncode(internalBuffer, buf, internalBuffer.length); } @Override @@ -101,12 +110,6 @@ public class NOnesIntEncoder extends FourFlagsIntEncoder { return new NOnesIntDecoder(n); } - @Override - public void reInit(OutputStream out) { - super.reInit(out); - onesCounter = 0; - } - @Override public String toString() { return "NOnes (" + n + ") (" + super.toString() + ")"; diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntDecoder.java index 414af6effa7..af6fce26af7 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntDecoder.java @@ -1,7 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.StreamCorruptedException; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -21,41 +21,24 @@ import java.io.StreamCorruptedException; */ /** - * A simple stream decoder which can decode values encoded with - * {@link SimpleIntEncoder}. + * Decodes values encoded with {@link SimpleIntEncoder}. * * @lucene.experimental */ public class SimpleIntDecoder extends IntDecoder { - /** - * reusable buffer - allocated only once as this is not a thread-safe object - */ - private byte[] buffer = new byte[4]; - @Override - public long decode() throws IOException { - - // we need exactly 4 bytes to decode an int in this decoder impl, otherwise, throw an exception - int offset = 0; - while (offset < 4) { - int nRead = in.read(buffer, offset, 4 - offset); - if (nRead == -1) { - if (offset > 0) { - throw new StreamCorruptedException( - "Need 4 bytes for decoding an int, got only " + offset); - } - return EOS; + protected void doDecode(BytesRef buf, IntsRef values, int upto) { + while (buf.offset < upto) { + if (values.length == values.ints.length) { + values.grow(values.length + 10); // grow by few items, however not too many } - offset += nRead; + values.ints[values.length++] = + ((buf.bytes[buf.offset++] & 0xFF) << 24) | + ((buf.bytes[buf.offset++] & 0xFF) << 16) | + ((buf.bytes[buf.offset++] & 0xFF) << 8) | + (buf.bytes[buf.offset++] & 0xFF); } - - int v = buffer[3] & 0xff; - v |= (buffer[2] << 8) & 0xff00; - v |= (buffer[1] << 16) & 0xff0000; - v |= (buffer[0] << 24) & 0xff000000; - - return v; } @Override diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntEncoder.java index 2e17ef29640..fd6a0206117 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntEncoder.java @@ -1,6 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -26,22 +27,21 @@ import java.io.IOException; */ public class SimpleIntEncoder extends IntEncoder { - /** - * This method makes sure the value wasn't previously encoded by checking - * against the Set. If the value wasn't encoded, it's added to the Set, and - * encoded with {#link Vint8#encode} - * - * @param value - * an integer to be encoded - * @throws IOException - * possibly thrown by the OutputStream - */ @Override - public void encode(int value) throws IOException { - out.write(value >>> 24); - out.write((value >> 16) & 0xFF); - out.write((value >> 8) & 0xFF); - out.write(value & 0xFF); + protected void doEncode(IntsRef values, BytesRef buf, int upto) { + // ensure there's enough room in the buffer + int bytesNeeded = values.length * 4; + if (buf.bytes.length < bytesNeeded) { + buf.grow(bytesNeeded); + } + + for (int i = values.offset; i < upto; i++) { + int value = values.ints[i]; + buf.bytes[buf.length++] = (byte) (value >>> 24); + buf.bytes[buf.length++] = (byte) ((value >> 16) & 0xFF); + buf.bytes[buf.length++] = (byte) ((value >> 8) & 0xFF); + buf.bytes[buf.length++] = (byte) (value & 0xFF); + } } @Override diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/SortingIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/SortingIntEncoder.java index 9f6cee635ce..0ebb06efa85 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/SortingIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/SortingIntEncoder.java @@ -1,9 +1,10 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.OutputStream; import java.util.Arrays; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; + /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -23,47 +24,21 @@ import java.util.Arrays; /** * An {@link IntEncoderFilter} which sorts the values to encode in ascending - * order before encoding them. Encoding therefore happens upon calling - * {@link #close()}. Since this encoder is usually chained with another encoder - * that relies on sorted values, it does not offer a default constructor. + * order before encoding them. * * @lucene.experimental */ public class SortingIntEncoder extends IntEncoderFilter { - private float grow = 2.0f; - private int index = 0; - private int[] set = new int[1024]; - /** Initializes with the given encoder. */ public SortingIntEncoder(IntEncoder encoder) { super(encoder); } @Override - public void close() throws IOException { - if (index == 0) { - return; - } - - Arrays.sort(set, 0, index); - for (int i = 0; i < index; i++) { - encoder.encode(set[i]); - } - encoder.close(); - index = 0; - - super.close(); - } - - @Override - public void encode(int value) throws IOException { - if (index == set.length) { - int[] newSet = new int[(int) (set.length * grow)]; - System.arraycopy(set, 0, newSet, 0, set.length); - set = newSet; - } - set[index++] = value; + protected void doEncode(IntsRef values, BytesRef buf, int upto) { + Arrays.sort(values.ints, values.offset, upto); + encoder.doEncode(values, buf, upto); } @Override @@ -71,12 +46,6 @@ public class SortingIntEncoder extends IntEncoderFilter { return encoder.createMatchingDecoder(); } - @Override - public void reInit(OutputStream out) { - super.reInit(out); - index = 0; - } - @Override public String toString() { return "Sorting (" + encoder.toString() + ")"; diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/UniqueValuesIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/UniqueValuesIntEncoder.java index 3583402295f..c9a6be5c848 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/UniqueValuesIntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/UniqueValuesIntEncoder.java @@ -1,7 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; -import java.io.OutputStream; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -22,7 +22,7 @@ import java.io.OutputStream; /** * An {@link IntEncoderFilter} which ensures only unique values are encoded. The - * implementation assumes the values given to {@link #encode(int)} are sorted. + * implementation assumes the values given to {@link #encode(IntsRef, BytesRef)} are sorted. * If this is not the case, you can chain this encoder with * {@link SortingIntEncoder}. * @@ -30,26 +30,23 @@ import java.io.OutputStream; */ public final class UniqueValuesIntEncoder extends IntEncoderFilter { - /** - * Denotes an illegal value which we can use to init 'prev' to. Since all - * encoded values are integers, this value is init to MAX_INT+1 and is of type - * long. Therefore we are guaranteed not to get this value in encode. - */ - private static final long ILLEGAL_VALUE = Integer.MAX_VALUE + 1; - - private long prev = ILLEGAL_VALUE; - /** Constructs a new instance with the given encoder. */ public UniqueValuesIntEncoder(IntEncoder encoder) { super(encoder); } @Override - public void encode(int value) throws IOException { - if (prev != value) { - encoder.encode(value); - prev = value; + protected void doEncode(IntsRef values, BytesRef buf, int upto) { + int prev = values.ints[values.offset]; + int idx = values.offset + 1; + for (int i = idx; i < upto; i++) { + if (values.ints[i] != prev) { + values.ints[idx++] = values.ints[i]; + prev = values.ints[i]; + } } + values.length = idx - values.offset; + encoder.doEncode(values, buf, idx); } @Override @@ -57,12 +54,6 @@ public final class UniqueValuesIntEncoder extends IntEncoderFilter { return encoder.createMatchingDecoder(); } - @Override - public void reInit(OutputStream out) { - super.reInit(out); - prev = ILLEGAL_VALUE; - } - @Override public String toString() { return "Unique (" + encoder.toString() + ")"; diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8.java new file mode 100644 index 00000000000..267d52bae96 --- /dev/null +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8.java @@ -0,0 +1,138 @@ +package org.apache.lucene.util.encoding; + +import org.apache.lucene.util.BytesRef; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Variable-length encoding of 32-bit integers, into 8-bit bytes. A number is + * encoded as follows: + *

      + *
    • If it is less than 127 and non-negative (i.e., if the number uses only 7 + * bits), it is encoded as as single byte: 0bbbbbbb. + *
    • If its highest nonzero bit is greater than bit 6 (0x40), it is + * represented as a series of bytes, each byte's 7 LSB containing bits from the + * original value, with the MSB set for all but the last byte. The first encoded + * byte contains the highest nonzero bits from the original; the second byte + * contains the next 7 MSB; and so on, with the last byte containing the 7 LSB + * of the original. + *
    + * Examples: + *
      + *
    1. n = 117 = 1110101: This has fewer than 8 significant bits, and so is + * encoded as 01110101 = 0x75. + *
    2. n = 100000 = (binary) 11000011010100000. This has 17 significant bits, + * and so needs three Vint8 bytes. Left-zero-pad it to a multiple of 7 bits, + * then split it into chunks of 7 and add an MSB, 0 for the last byte, 1 for the + * others: 1|0000110 1|0001101 0|0100000 = 0x86 0x8D 0x20. + *
    + * {@link #encode(int, BytesRef)} and {@link #decode(BytesRef)} will correctly + * handle any 32-bit integer, but for negative numbers, and positive numbers + * with more than 28 significant bits, encoding requires 5 bytes; this is not an + * efficient encoding scheme for large positive numbers or any negative number. + * + * @lucene.experimental + */ +public class VInt8 { + + /** The maximum number of bytes needed to encode an integer. */ + public static final int MAXIMUM_BYTES_NEEDED = 5; + + /** + * Decodes an int from the given bytes, starting at {@link BytesRef#offset}. + * Returns the decoded bytes and updates {@link BytesRef#offset}. + */ + public static int decode(BytesRef bytes) { + /* + This is the original code of this method, but a Hotspot bug + corrupted the for-loop of DataInput.readVInt() (see LUCENE-2975) + so the loop was unwounded here too, to be on the safe side + int value = 0; + while (true) { + byte first = bytes.bytes[bytes.offset++]; + value |= first & 0x7F; + if ((first & 0x80) == 0) { + return value; + } + value <<= 7; + } + */ + + // byte 1 + byte b = bytes.bytes[bytes.offset++]; + if (b >= 0) return b; + + // byte 2 + int value = b & 0x7F; + b = bytes.bytes[bytes.offset++]; + value = (value << 7) | b & 0x7F; + if (b >= 0) return value; + + // byte 3 + b = bytes.bytes[bytes.offset++]; + value = (value << 7) | b & 0x7F; + if (b >= 0) return value; + + // byte 4 + b = bytes.bytes[bytes.offset++]; + value = (value << 7) | b & 0x7F; + if (b >= 0) return value; + + // byte 5 + b = bytes.bytes[bytes.offset++]; + return (value << 7) | b & 0x7F; + } + + /** + * Encodes the given number into bytes, starting at {@link BytesRef#length}. + * Assumes that the array is large enough. + */ + public static void encode(int value, BytesRef bytes) { + if ((value & ~0x7F) == 0) { + bytes.bytes[bytes.length] = (byte) value; + bytes.length++; + } else if ((value & ~0x3FFF) == 0) { + bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0x3F80) >> 7)); + bytes.bytes[bytes.length + 1] = (byte) (value & 0x7F); + bytes.length += 2; + } else if ((value & ~0x1FFFFF) == 0) { + bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0x1FC000) >> 14)); + bytes.bytes[bytes.length + 1] = (byte) (0x80 | ((value & 0x3F80) >> 7)); + bytes.bytes[bytes.length + 2] = (byte) (value & 0x7F); + bytes.length += 3; + } else if ((value & ~0xFFFFFFF) == 0) { + bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0xFE00000) >> 21)); + bytes.bytes[bytes.length + 1] = (byte) (0x80 | ((value & 0x1FC000) >> 14)); + bytes.bytes[bytes.length + 2] = (byte) (0x80 | ((value & 0x3F80) >> 7)); + bytes.bytes[bytes.length + 3] = (byte) (value & 0x7F); + bytes.length += 4; + } else { + bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0xF0000000) >> 28)); + bytes.bytes[bytes.length + 1] = (byte) (0x80 | ((value & 0xFE00000) >> 21)); + bytes.bytes[bytes.length + 2] = (byte) (0x80 | ((value & 0x1FC000) >> 14)); + bytes.bytes[bytes.length + 3] = (byte) (0x80 | ((value & 0x3F80) >> 7)); + bytes.bytes[bytes.length + 4] = (byte) (value & 0x7F); + bytes.length += 5; + } + } + + private VInt8() { + // Just making it impossible to instantiate. + } + +} diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntDecoder.java index 2beaf773d78..e9fe5600c9a 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntDecoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntDecoder.java @@ -1,6 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -20,33 +21,19 @@ import java.io.IOException; */ /** - * An {@link IntDecoder} which can decode values encoded by - * {@link VInt8IntEncoder}. + * Decodes values encoded by {@link VInt8IntEncoder}. * * @lucene.experimental */ public class VInt8IntDecoder extends IntDecoder { - private boolean legalEOS = true; - @Override - public long decode() throws IOException { - int value = 0; - while (true) { - int first = in.read(); - if (first < 0) { - if (!legalEOS) { - throw new IOException("Unexpected End-Of-Stream"); - } - return EOS; + protected void doDecode(BytesRef buf, IntsRef values, int upto) { + while (buf.offset < upto) { + if (values.length == values.ints.length) { + values.grow(values.length + 10); // grow by few items, however not too many } - value |= first & 0x7F; - if ((first & 0x80) == 0) { - legalEOS = true; - return value; - } - legalEOS = false; - value <<= 7; + values.ints[values.length++] = VInt8.decode(buf); } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntEncoder.java index 8fe6fbc9e09..7c62bf3e035 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntEncoder.java +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntEncoder.java @@ -1,6 +1,7 @@ package org.apache.lucene.util.encoding; -import java.io.IOException; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -49,27 +50,14 @@ import java.io.IOException; public class VInt8IntEncoder extends IntEncoder { @Override - public void encode(int value) throws IOException { - if ((value & ~0x7F) == 0) { - out.write(value); - } else if ((value & ~0x3FFF) == 0) { - out.write(0x80 | (value >> 7)); - out.write(0x7F & value); - } else if ((value & ~0x1FFFFF) == 0) { - out.write(0x80 | (value >> 14)); - out.write(0x80 | (value >> 7)); - out.write(0x7F & value); - } else if ((value & ~0xFFFFFFF) == 0) { - out.write(0x80 | (value >> 21)); - out.write(0x80 | (value >> 14)); - out.write(0x80 | (value >> 7)); - out.write(0x7F & value); - } else { - out.write(0x80 | (value >> 28)); - out.write(0x80 | (value >> 21)); - out.write(0x80 | (value >> 14)); - out.write(0x80 | (value >> 7)); - out.write(0x7F & value); + protected void doEncode(IntsRef values, BytesRef buf, int upto) { + int maxBytesNeeded = 5 * values.length; // at most 5 bytes per VInt + if (buf.bytes.length < maxBytesNeeded) { + buf.grow(maxBytesNeeded); + } + + for (int i = values.offset; i < upto; i++) { + VInt8.encode(values.ints[i], buf); } } diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/package.html b/lucene/facet/src/java/org/apache/lucene/util/encoding/package.html index 2e50e8f38eb..8a81b258e34 100644 --- a/lucene/facet/src/java/org/apache/lucene/util/encoding/package.html +++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/package.html @@ -25,49 +25,8 @@ mechanisms to create new ones. The super class for all encoders is encoders there is a matching {@link org.apache.lucene.util.encoding.IntDecoder} implementation (not all encoders need a decoder). -

    An encoder encodes the integers that are passed to {@link -org.apache.lucene.util.encoding.IntEncoder#encode(int) encode} into a -set output stream (see {@link -org.apache.lucene.util.encoding.IntEncoder#reInit(OutputStream) -reInit}). One should always call {@link -org.apache.lucene.util.encoding.IntEncoder#close() close} when all -integers have been encoded, to ensure proper finish by the encoder. Some -encoders buffer values in-memory and encode in batches in order to -optimize the encoding, and not closing them may result in loss of -information or corrupt stream. -

    A proper and typical usage of an encoder looks like this: -

    
    -int[] data = <the values to encode>
    -IntEncoder encoder = new VInt8IntEncoder();
    -OutputStream out = new ByteArrayOutputStream();
    -encoder.reInit(out);
    -for (int val : data) {
    -  encoder.encode(val);
    -}
    -encoder.close();
    -
    -// Print the bytes in binary
    -byte[] bytes = out.toByteArray();
    -for (byte b : bytes) {
    -  System.out.println(Integer.toBinaryString(b));
    -}
    -
    -Each encoder also implements {@link -org.apache.lucene.util.encoding.IntEncoder#createMatchingDecoder() -createMatchingDecoder} which returns the matching decoder for this encoder. -As mentioned above, not all encoders have a matching decoder (like some -encoder filters which are explained next), however every encoder should -return a decoder following a call to that method. To complete the -example above, one can easily iterate over the decoded values like this: -
    
    -IntDecoder d = e.createMatchingDecoder();
    -d.reInit(new ByteArrayInputStream(bytes));
    -long val;
    -while ((val = d.decode()) != IntDecoder.EOS) {
    -  System.out.println(val);
    -}
    -
    -

    Some encoders don't perform any encoding at all, or do not include an +

    +Some encoders don't perform any encoding at all, or do not include an encoding logic. Those are called {@link org.apache.lucene.util.encoding.IntEncoderFilter}s. A filter is an encoder which delegates the encoding task to a given encoder, however @@ -76,91 +35,6 @@ example is {@link org.apache.lucene.util.encoding.DGapIntEncoder} which encodes the gaps between values rather than the values themselves. Another example is {@link org.apache.lucene.util.encoding.SortingIntEncoder} which sorts all the -values in ascending order before they are sent for encoding. This -encoder aggregates the values in its {@link -org.apache.lucene.util.encoding.IntEncoder#encode(int) encode} implementation -and decoding only happens upon calling {@link -org.apache.lucene.util.encoding.IntEncoder#close() close}. -

    Extending IntEncoder

    -Extending {@link org.apache.lucene.util.encoding.IntEncoder} is a very -easy task. One only needs to implement {@link -org.apache.lucene.util.encoding.IntEncoder#encode(int) encode} and -{@link org.apache.lucene.util.encoding.IntEncoder#createMatchingDecoder() -createMatchingDecoder} as the base implementation takes care of -re-initializing the output stream and closing it. The following example -illustrates how can one write an encoder (and a matching decoder) which -'tags' the stream with type/ID of the encoder. Such tagging is important -in scenarios where an application uses different encoders for different -streams, and wants to manage some sort of mapping between an encoder ID -to an IntEncoder/Decoder implementation, so a proper decoder will be -initialized on the fly: -
    
    -public class TaggingIntEncoder extends IntEncoderFilter {
    -  
    -  public TaggingIntEncoder(IntEncoder encoder) {
    -    super(encoder);
    -  }
    -  
    -  @Override
    -  public void encode(int value) throws IOException {
    -    encoder.encode(value);
    -  }
    -
    -  @Override
    -  public IntDecoder createMatchingDecoder() {
    -    return new TaggingIntDecoder();
    -  }
    -	
    -  @Override
    -  public void reInit(OutputStream out) {
    -    super.reInit(os);
    -    // Assumes the application has a static EncodersMap class which is able to 
    -    // return a unique ID for a given encoder.
    -    int encoderID = EncodersMap.getID(encoder);
    -    this.out.write(encoderID);
    -  }
    -
    -  @Override
    -  public String toString() {
    -    return "Tagging (" + encoder.toString() + ")";
    -  }
    -
    -}
    -
    -And the matching decoder: -
    
    -public class TaggingIntDecoder extends IntDecoder {
    -  
    -  // Will be initialized upon calling reInit.
    -  private IntDecoder decoder;
    -  
    -  @Override
    -  public void reInit(InputStream in) {
    -    super.reInit(in);
    -    
    -    // Read the ID of the encoder that tagged this stream.
    -    int encoderID = in.read();
    -    
    -    // Assumes EncodersMap can return the proper IntEncoder given the ID.
    -    decoder = EncodersMap.getEncoder(encoderID).createMatchingDecoder();
    -  }
    -	
    -  @Override
    -  public long decode() throws IOException {
    -    return decoder.decode();
    -  }
    -
    -  @Override
    -  public String toString() {
    -    return "Tagging (" + decoder == null ? "none" : decoder.toString() + ")";
    -  }
    -
    -}
    -
    -The example implements TaggingIntEncoder as a filter over another -encoder. Even though it does not do any filtering on the actual values, it feels -right to present it as a filter. Anyway, this is just an example code and one -can choose to implement it however it makes sense to the application. For -simplicity, error checking was omitted from the sample code. +values in ascending order before they are sent for encoding. \ No newline at end of file diff --git a/lucene/facet/src/test/org/apache/lucene/facet/example/TestAssociationExample.java b/lucene/facet/src/test/org/apache/lucene/facet/example/TestAssociationExample.java index c630fe95330..203343859e6 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/example/TestAssociationExample.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/example/TestAssociationExample.java @@ -4,7 +4,7 @@ import org.junit.Test; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.facet.example.ExampleResult; -import org.apache.lucene.facet.example.association.AssociationMain; +import org.apache.lucene.facet.example.association.CategoryAssociationsMain; import org.apache.lucene.facet.search.results.FacetResultNode; /* @@ -35,8 +35,8 @@ public class TestAssociationExample extends LuceneTestCase { @Test public void testAssociationExamples() throws Exception { - assertExampleResult(new AssociationMain().runSumIntAssociationSample(), EXPECTED_INT_SUM_RESULTS); - assertExampleResult(new AssociationMain().runSumFloatAssociationSample(), EXPECTED_FLOAT_SUM_RESULTS); + assertExampleResult(new CategoryAssociationsMain().runSumIntAssociationSample(), EXPECTED_INT_SUM_RESULTS); + assertExampleResult(new CategoryAssociationsMain().runSumFloatAssociationSample(), EXPECTED_FLOAT_SUM_RESULTS); } private void assertExampleResult(ExampleResult res, double[] expectedResults) { diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/CategoryListIteratorTest.java b/lucene/facet/src/test/org/apache/lucene/facet/search/CategoryListIteratorTest.java index ddb4ac54480..7db8056a71c 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/CategoryListIteratorTest.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/CategoryListIteratorTest.java @@ -19,8 +19,8 @@ import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.UnsafeByteArrayOutputStream; import org.apache.lucene.util.encoding.DGapIntEncoder; import org.apache.lucene.util.encoding.IntEncoder; import org.apache.lucene.util.encoding.SortingIntEncoder; @@ -49,17 +49,19 @@ public class CategoryListIteratorTest extends LuceneTestCase { private static final class DataTokenStream extends TokenStream { + private final PayloadAttribute payload = addAttribute(PayloadAttribute.class); + private final BytesRef buf; + private final IntEncoder encoder; + private final CharTermAttribute term = addAttribute(CharTermAttribute.class); + private int idx; - private PayloadAttribute payload = addAttribute(PayloadAttribute.class); - private byte[] buf = new byte[20]; - UnsafeByteArrayOutputStream ubaos = new UnsafeByteArrayOutputStream(buf); - IntEncoder encoder; private boolean exhausted = false; - private CharTermAttribute term = addAttribute(CharTermAttribute.class); public DataTokenStream(String text, IntEncoder encoder) { this.encoder = encoder; term.setEmpty().append(text); + buf = new BytesRef(); + payload.setPayload(buf); } public void setIdx(int idx) { @@ -73,30 +75,26 @@ public class CategoryListIteratorTest extends LuceneTestCase { return false; } - int[] values = data[idx]; - ubaos.reInit(buf); - encoder.reInit(ubaos); - for (int val : values) { - encoder.encode(val); - } - encoder.close(); - payload.setPayload(new BytesRef(buf, 0, ubaos.length())); - + // must copy because encoders may change the buffer + encoder.encode(IntsRef.deepCopyOf(data[idx]), buf); exhausted = true; return true; } } - static final int[][] data = new int[][] { - new int[] { 1, 2 }, new int[] { 3, 4 }, new int[] { 1, 3 }, new int[] { 1, 2, 3, 4 }, + static final IntsRef[] data = new IntsRef[] { + new IntsRef(new int[] { 1, 2 }, 0, 2), + new IntsRef(new int[] { 3, 4 }, 0, 2), + new IntsRef(new int[] { 1, 3 }, 0, 2), + new IntsRef(new int[] { 1, 2, 3, 4 }, 0, 4) }; @Test - public void testPayloadIntDecodingIterator() throws Exception { + public void testPayloadCategoryListIteraor() throws Exception { Directory dir = newDirectory(); - DataTokenStream dts = new DataTokenStream("1",new SortingIntEncoder( - new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder())))); + final IntEncoder encoder = new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder()))); + DataTokenStream dts = new DataTokenStream("1",encoder); RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)).setMergePolicy(newLogMergePolicy())); for (int i = 0; i < data.length; i++) { @@ -108,21 +106,21 @@ public class CategoryListIteratorTest extends LuceneTestCase { IndexReader reader = writer.getReader(); writer.close(); - CategoryListIterator cli = new PayloadIntDecodingIterator(reader, new Term( - "f","1"), dts.encoder.createMatchingDecoder()); + IntsRef ordinals = new IntsRef(); + CategoryListIterator cli = new PayloadCategoryListIteraor(reader, new Term("f","1"), encoder.createMatchingDecoder()); cli.init(); int totalCategories = 0; for (int i = 0; i < data.length; i++) { Set values = new HashSet(); for (int j = 0; j < data[i].length; j++) { - values.add(data[i][j]); + values.add(data[i].ints[j]); } - cli.skipTo(i); - long cat; - while ((cat = cli.nextCategory()) < Integer.MAX_VALUE) { - assertTrue("expected category not found: " + cat, values.contains((int) cat)); - totalCategories ++; + cli.getOrdinals(i, ordinals); + assertTrue("no ordinals for document " + i, ordinals.length > 0); + for (int j = 0; j < ordinals.length; j++) { + assertTrue("expected category not found: " + ordinals.ints[j], values.contains(ordinals.ints[j])); } + totalCategories += ordinals.length; } assertEquals("Missing categories!",10,totalCategories); reader.close(); @@ -135,8 +133,8 @@ public class CategoryListIteratorTest extends LuceneTestCase { @Test public void testPayloadIteratorWithInvalidDoc() throws Exception { Directory dir = newDirectory(); - DataTokenStream dts = new DataTokenStream("1",new SortingIntEncoder( - new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder())))); + final IntEncoder encoder = new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder()))); + DataTokenStream dts = new DataTokenStream("1", encoder); // this test requires that no payloads ever be randomly present! final Analyzer noPayloadsAnalyzer = new Analyzer() { @Override @@ -162,30 +160,27 @@ public class CategoryListIteratorTest extends LuceneTestCase { IndexReader reader = writer.getReader(); writer.close(); - CategoryListIterator cli = new PayloadIntDecodingIterator(reader, new Term( - "f","1"), dts.encoder.createMatchingDecoder()); + IntsRef ordinals = new IntsRef(); + CategoryListIterator cli = new PayloadCategoryListIteraor(reader, new Term("f","1"), encoder.createMatchingDecoder()); assertTrue("Failed to initialize payload iterator", cli.init()); - int totalCats = 0; + int totalCategories = 0; for (int i = 0; i < data.length; i++) { - // doc no. i Set values = new HashSet(); for (int j = 0; j < data[i].length; j++) { - values.add(data[i][j]); + values.add(data[i].ints[j]); } - boolean hasDoc = cli.skipTo(i); - if (hasDoc) { - assertTrue("Document " + i + " must not have a payload!", i == 0); - long cat; - while ((cat = cli.nextCategory()) < Integer.MAX_VALUE) { - assertTrue("expected category not found: " + cat, values.contains((int) cat)); - ++totalCats; + cli.getOrdinals(i, ordinals); + if (i == 0) { + assertTrue("document 0 must have a payload", ordinals.length > 0); + for (int j = 0; j < ordinals.length; j++) { + assertTrue("expected category not found: " + ordinals.ints[j], values.contains(ordinals.ints[j])); } + totalCategories += ordinals.length; } else { - assertFalse("Document " + i + " must have a payload!", i == 0); + assertTrue("only document 0 should have a payload", ordinals.length == 0); } - } - assertEquals("Wrong number of total categories!", 2, totalCats); + assertEquals("Wrong number of total categories!", 2, totalCategories); reader.close(); dir.close(); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/DrillDownTest.java b/lucene/facet/src/test/org/apache/lucene/facet/search/DrillDownTest.java index 6292ea37353..c20a98c41fe 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/DrillDownTest.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/DrillDownTest.java @@ -90,7 +90,9 @@ public class DrillDownTest extends LuceneTestCase { paths.add(new CategoryPath("b")); } FacetFields facetFields = new FacetFields(taxoWriter); - facetFields.addFields(doc, paths); + if (paths.size() > 0) { + facetFields.addFields(doc, paths); + } writer.addDocument(doc); } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/TestCategoryListCache.java b/lucene/facet/src/test/org/apache/lucene/facet/search/TestCategoryListCache.java index 08b48aad3fe..aaafc9abd1e 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/TestCategoryListCache.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/TestCategoryListCache.java @@ -6,6 +6,7 @@ import java.util.List; import java.util.Map; import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.util.IntsRef; import org.junit.After; import org.junit.Before; @@ -118,18 +119,17 @@ public class TestCategoryListCache extends FacetTestBase { @Override public CategoryListIterator iterator(int partition) throws IOException { final CategoryListIterator it = cld.iterator(partition); - return new CategoryListIterator() { + return new CategoryListIterator() { @Override - public boolean skipTo(int docId) throws IOException { - return it.skipTo(docId); - } - @Override - public long nextCategory() throws IOException { - long res = it.nextCategory(); - if (res>Integer.MAX_VALUE) { - return res; + public void getOrdinals(int docID, IntsRef ints) throws IOException { + it.getOrdinals(docID, ints); + for (int i = 0; i < ints.length; i++) { + if (ints.ints[i] > 1) { + ints.ints[i]--; + } else { + ints.ints[i]++; + } } - return res>1 ? res-1 : res+1; } @Override public boolean init() throws IOException { diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/params/MultiCategoryListIteratorTest.java b/lucene/facet/src/test/org/apache/lucene/facet/search/params/MultiCategoryListIteratorTest.java new file mode 100644 index 00000000000..404ac5f54e8 --- /dev/null +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/params/MultiCategoryListIteratorTest.java @@ -0,0 +1,126 @@ +package org.apache.lucene.facet.search.params; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Random; + +import org.apache.lucene.document.Document; +import org.apache.lucene.facet.index.FacetFields; +import org.apache.lucene.facet.index.params.CategoryListParams; +import org.apache.lucene.facet.index.params.PerDimensionIndexingParams; +import org.apache.lucene.facet.search.CategoryListIterator; +import org.apache.lucene.facet.search.PayloadCategoryListIteraor; +import org.apache.lucene.facet.search.cache.CategoryListCache; +import org.apache.lucene.facet.taxonomy.CategoryPath; +import org.apache.lucene.facet.taxonomy.TaxonomyReader; +import org.apache.lucene.facet.taxonomy.TaxonomyWriter; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; +import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; +import org.apache.lucene.facet.util.MultiCategoryListIterator; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.encoding.IntDecoder; +import org.junit.Test; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +public class MultiCategoryListIteratorTest extends LuceneTestCase { + + @Test + public void testMultipleCategoryLists() throws Exception { + Random random = random(); + int numDimensions = atLeast(random, 2); // at least 2 dimensions + String[] dimensions = new String[numDimensions]; + for (int i = 0; i < numDimensions; i++) { + dimensions[i] = "dim" + i; + } + + // build the PerDimensionIndexingParams + HashMap clps = new HashMap(); + for (String dim : dimensions) { + CategoryPath cp = new CategoryPath(dim); + CategoryListParams clp = new CategoryListParams(new Term("$" + dim, CategoryListParams.DEFAULT_TERM.bytes())); + clps.put(cp, clp); + } + PerDimensionIndexingParams indexingParams = new PerDimensionIndexingParams(clps); + + // index some documents + Directory indexDir = newDirectory(); + Directory taxoDir = newDirectory(); + IndexWriter indexWriter = new IndexWriter(indexDir, newIndexWriterConfig(TEST_VERSION_CURRENT, null).setMaxBufferedDocs(2)); + TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir); + FacetFields facetFields = new FacetFields(taxoWriter, indexingParams); + int ndocs = atLeast(random, 10); + for (int i = 0; i < ndocs; i++) { + Document doc = new Document(); + int numCategories = random.nextInt(numDimensions) + 1; + ArrayList categories = new ArrayList(); + for (int j = 0; j < numCategories; j++) { + String dimension = dimensions[random.nextInt(dimensions.length)]; + categories.add(new CategoryPath(dimension, Integer.toString(i))); + } + facetFields.addFields(doc, categories); + indexWriter.addDocument(doc); + } + IOUtils.close(indexWriter, taxoWriter); + + // test the multi iterator + CategoryListCache clCache = null; + if (random.nextBoolean()) { + clCache = new CategoryListCache(); + } + + DirectoryReader indexReader = DirectoryReader.open(indexDir); + TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir); + CategoryListIterator[] iterators = new CategoryListIterator[numDimensions]; + for (int i = 0; i < iterators.length; i++) { + CategoryListParams clp = indexingParams.getCategoryListParams(new CategoryPath(dimensions[i])); + IntDecoder decoder = clp.createEncoder().createMatchingDecoder(); + if (clCache != null && random.nextBoolean()) { + clCache.loadAndRegister(clp, indexReader, taxoReader, indexingParams); + iterators[i] = clCache.get(clp).iterator(0); // no partitions + } else { + iterators[i] = new PayloadCategoryListIteraor(indexReader, clp.getTerm(), decoder); + } + } + MultiCategoryListIterator cli = new MultiCategoryListIterator(iterators); + assertTrue("failed to init multi-iterator", cli.init()); + IntsRef ordinals = new IntsRef(); + int maxDoc = indexReader.maxDoc(); + for (int i = 0; i < maxDoc; i++) { + cli.getOrdinals(i, ordinals); + assertTrue("document " + i + " does not have categories", ordinals.length > 0); + for (int j = 0; j < ordinals.length; j++) { + CategoryPath cp = taxoReader.getPath(ordinals.ints[j]); + assertNotNull("ordinal " + ordinals.ints[j] + " not found in taxonomy", cp); + if (cp.length == 2) { + assertEquals("invalid category for document " + i, i, Integer.parseInt(cp.components[1])); + } + } + } + + IOUtils.close(indexReader, taxoReader); + IOUtils.close(indexDir, taxoDir); + } + +} \ No newline at end of file diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/params/MultiIteratorsPerCLParamsTest.java b/lucene/facet/src/test/org/apache/lucene/facet/search/params/MultiIteratorsPerCLParamsTest.java deleted file mode 100644 index a6d169b3284..00000000000 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/params/MultiIteratorsPerCLParamsTest.java +++ /dev/null @@ -1,271 +0,0 @@ -package org.apache.lucene.facet.search.params; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import org.apache.lucene.analysis.MockAnalyzer; -import org.apache.lucene.analysis.MockTokenizer; -import org.apache.lucene.document.Document; -import org.apache.lucene.facet.index.FacetFields; -import org.apache.lucene.facet.index.params.CategoryListParams; -import org.apache.lucene.facet.index.params.FacetIndexingParams; -import org.apache.lucene.facet.search.CategoryListIterator; -import org.apache.lucene.facet.search.FacetArrays; -import org.apache.lucene.facet.search.FacetResultsHandler; -import org.apache.lucene.facet.search.FacetsAccumulator; -import org.apache.lucene.facet.search.ScoredDocIDs; -import org.apache.lucene.facet.search.StandardFacetsAccumulator; -import org.apache.lucene.facet.search.TopKFacetResultsHandler; -import org.apache.lucene.facet.search.cache.CategoryListCache; -import org.apache.lucene.facet.search.results.FacetResult; -import org.apache.lucene.facet.search.results.FacetResultNode; -import org.apache.lucene.facet.search.results.IntermediateFacetResult; -import org.apache.lucene.facet.taxonomy.CategoryPath; -import org.apache.lucene.facet.taxonomy.TaxonomyReader; -import org.apache.lucene.facet.taxonomy.TaxonomyWriter; -import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; -import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter; -import org.apache.lucene.facet.util.ScoredDocIdsUtils; -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.RandomIndexWriter; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.LuceneTestCase; -import org.junit.Test; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Test faceted search with creation of multiple category list iterators by the - * same CLP, depending on the provided facet request - */ -public class MultiIteratorsPerCLParamsTest extends LuceneTestCase { - - CategoryPath[][] perDocCategories = new CategoryPath[][] { - { new CategoryPath("author", "Mark Twain"), - new CategoryPath("date", "2010") }, - { new CategoryPath("author", "Robert Frost"), - new CategoryPath("date", "2009") }, - { new CategoryPath("author", "Artur Miller"), - new CategoryPath("date", "2010") }, - { new CategoryPath("author", "Edgar Allan Poe"), - new CategoryPath("date", "2009") }, - { new CategoryPath("author", "Henry James"), - new CategoryPath("date", "2010") } }; - - String countForbiddenDimension; - - @Test - public void testCLParamMultiIteratorsByRequest() throws Exception { - doTestCLParamMultiIteratorsByRequest(false); - } - - @Test - public void testCLParamMultiIteratorsByRequestCacheCLI() throws Exception { - doTestCLParamMultiIteratorsByRequest(true); - } - - private void doTestCLParamMultiIteratorsByRequest(boolean cacheCLI) throws Exception { - // Create a CLP which generates different CLIs according to the - // FacetRequest's dimension - CategoryListParams clp = new CategoryListParams(); - FacetIndexingParams iParams = new FacetIndexingParams(clp); - Directory indexDir = newDirectory(); - Directory taxoDir = newDirectory(); - populateIndex(iParams, indexDir, taxoDir); - - TaxonomyReader taxo = new DirectoryTaxonomyReader(taxoDir); - IndexReader reader = DirectoryReader.open(indexDir); - - CategoryListCache clCache = null; - if (cacheCLI) { - // caching the iteratorr, so: - // 1: create the cached iterator, using original params - clCache = new CategoryListCache(); - clCache.loadAndRegister(clp, reader, taxo, iParams); - } - - ScoredDocIDs allDocs = ScoredDocIdsUtils - .createAllDocsScoredDocIDs(reader); - - // Search index with 'author' should filter ONLY ordinals whose parent - // is 'author' - countForbiddenDimension = "date"; - validateFacetedSearch(iParams, taxo, reader, clCache, allDocs, "author", 5, 5); - - // Search index with 'date' should filter ONLY ordinals whose parent is - // 'date' - countForbiddenDimension = "author"; - validateFacetedSearch(iParams, taxo, reader, clCache, allDocs, "date", 5, 2); - - // Search index with both 'date' and 'author' - countForbiddenDimension = null; - validateFacetedSearch(iParams, taxo, reader, clCache, allDocs, new String[] { - "author", "date" }, new int[] { 5, 5 }, new int[] { 5, 2 }); - taxo.close(); - reader.close(); - indexDir.close(); - taxoDir.close(); - } - - private void validateFacetedSearch(FacetIndexingParams iParams, - TaxonomyReader taxo, IndexReader reader, CategoryListCache clCache, - ScoredDocIDs allDocs, String dimension, int expectedValue, int expectedNumDescendants) throws IOException { - validateFacetedSearch(iParams, taxo, reader, clCache, allDocs, - new String[] { dimension }, new int[] { expectedValue }, - new int[] { expectedNumDescendants }); - } - - private void validateFacetedSearch(FacetIndexingParams iParams, - TaxonomyReader taxo, IndexReader reader, final CategoryListCache clCache, - ScoredDocIDs allDocs, String[] dimension, int[] expectedValue, - int[] expectedNumDescendants) throws IOException { - List facetRequests = new ArrayList(); - for (String dim : dimension) { - facetRequests.add(new PerDimCountFacetRequest(new CategoryPath(dim), 10)); - } - FacetSearchParams sParams = new FacetSearchParams(facetRequests, iParams) { - @Override - public CategoryListCache getCategoryListCache() { - return clCache; - } - }; - FacetsAccumulator acc = new StandardFacetsAccumulator(sParams, reader, taxo); - - // no use to test this with complement since at that mode all facets are taken - acc.setComplementThreshold(FacetsAccumulator.DISABLE_COMPLEMENT); - - List results = acc.accumulate(allDocs); - assertEquals("Wrong #results", dimension.length, results.size()); - - for (int i = 0; i < results.size(); i++) { - FacetResult res = results.get(i); - assertEquals("wrong num-descendants for dimension " + dimension[i], - expectedNumDescendants[i], res.getNumValidDescendants()); - FacetResultNode resNode = res.getFacetResultNode(); - assertEquals("wrong value for dimension " + dimension[i], - expectedValue[i], (int) resNode.getValue()); - } - } - - private void populateIndex(FacetIndexingParams iParams, Directory indexDir, - Directory taxoDir) throws Exception { - RandomIndexWriter writer = new RandomIndexWriter(random(), indexDir, - newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random(), MockTokenizer.KEYWORD, false))); - TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir); - - FacetFields facetFields = new FacetFields(taxoWriter, iParams); - for (CategoryPath[] categories : perDocCategories) { - Document doc = new Document(); - facetFields.addFields(doc, Arrays.asList(categories)); - writer.addDocument(doc); - } - taxoWriter.commit(); - writer.commit(); - taxoWriter.close(); - writer.close(); - } - - private class PerDimCountFacetRequest extends CountFacetRequest { - - public PerDimCountFacetRequest(CategoryPath path, int num) { - super(path, num); - } - - @Override - public CategoryListIterator createCategoryListIterator(IndexReader reader, - TaxonomyReader taxo, FacetSearchParams sParams, int partition) throws IOException { - // categories of certain dimension only - return new PerDimensionCLI(taxo, super.createCategoryListIterator( - reader, taxo, sParams, partition), getCategoryPath()); - } - - @Override - /** Override this method just for verifying that only specified facets are iterated.. */ - public FacetResultsHandler createFacetResultsHandler( - TaxonomyReader taxonomyReader) { - return new TopKFacetResultsHandler(taxonomyReader, this) { - @Override - public IntermediateFacetResult fetchPartitionResult( - FacetArrays facetArrays, int offset) throws IOException { - final IntermediateFacetResult res = super.fetchPartitionResult(facetArrays, offset); - if (countForbiddenDimension!=null) { - int ord = taxonomyReader.getOrdinal(new CategoryPath(countForbiddenDimension)); - assertEquals("Should not have accumulated for dimension '"+countForbiddenDimension+"'!",0,facetArrays.getIntArray()[ord]); - } - return res; - } - }; - } - } - - /** - * a CLI which filters another CLI for the dimension of the provided - * category-path - */ - private static class PerDimensionCLI implements CategoryListIterator { - private final CategoryListIterator superCLI; - private final int[] parentArray; - private final int parentOrdinal; - - PerDimensionCLI(TaxonomyReader taxo, CategoryListIterator superCLI, - CategoryPath requestedPath) throws IOException { - this.superCLI = superCLI; - if (requestedPath == null) { - parentOrdinal = 0; - } else { - CategoryPath cp = new CategoryPath(requestedPath.components[0]); - parentOrdinal = taxo.getOrdinal(cp); - } - parentArray = taxo.getParallelTaxonomyArrays().parents(); - } - - @Override - public boolean init() throws IOException { - return superCLI.init(); - } - - @Override - public long nextCategory() throws IOException { - long next; - while ((next = superCLI.nextCategory()) <= Integer.MAX_VALUE - && !isInDimension((int) next)) { - } - - return next; - } - - /** look for original parent ordinal, meaning same dimension */ - private boolean isInDimension(int ordinal) { - while (ordinal > 0) { - if (ordinal == parentOrdinal) { - return true; - } - ordinal = parentArray[ordinal]; - } - return false; - } - - @Override - public boolean skipTo(int docId) throws IOException { - return superCLI.skipTo(docId); - } - } -} \ No newline at end of file diff --git a/lucene/facet/src/test/org/apache/lucene/util/Vint8Test.java b/lucene/facet/src/test/org/apache/lucene/util/Vint8Test.java deleted file mode 100644 index 82141860552..00000000000 --- a/lucene/facet/src/test/org/apache/lucene/util/Vint8Test.java +++ /dev/null @@ -1,141 +0,0 @@ -package org.apache.lucene.util; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; - -import org.junit.Test; - -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.Vint8; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Tests the {@link Vint8} class. - */ -public class Vint8Test extends LuceneTestCase { - - /** - * Tests the position wrapper. - * @throws Exception For any reason. - */ - @Test - public void testPosition() throws Exception { - Vint8.Position pos = new Vint8.Position(); - assertEquals(0, pos.pos); - pos = new Vint8.Position(12345); - assertEquals(12345, pos.pos); - } - - private static int[] testValues = { - -1000000000, - -1, 0, (1 << 7) - 1, 1 << 7, (1 << 14) - 1, 1 << 14, - (1 << 21) - 1, 1 << 21, (1 << 28) - 1, 1 << 28 - }; - private static int[] bytesNeededTestValues = { - 5, 5, 1, 1, 2, 2, 3, 3, 4, 4, 5 - }; - - /** - * Tests the {@code bytesNeeded} method. - */ - @Test - public void testBytesNeeded() { - assertEquals(5, Vint8.MAXIMUM_BYTES_NEEDED); - for (int j = 0; j < testValues.length; j++) { - assertEquals(bytesNeededTestValues[j], Vint8.bytesNeeded(testValues[j])); - } - } - - /** - * Tests encoding and decoding to and from a stream. - */ - @Test - public void testStreamEncodingAndDecoding() throws IOException { - ByteArrayOutputStream baos = new ByteArrayOutputStream(256); - int expectedSize = 0; - for (int j = 0; j < testValues.length; j++) { - Vint8.encode(testValues[j], baos); - expectedSize += bytesNeededTestValues[j]; - } - assertEquals(expectedSize, baos.size()); - ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); - for (int j = 0; j < testValues.length; j++) { - assertEquals(testValues[j], Vint8.decode(bais)); - } - assertEquals(0, bais.available()); - } - - /** - * Tests encoding and decoding to and from an array. - */ - @Test - public void testArrayEncodingAndDecoding() throws IOException { - byte[] byteArray = new byte[256]; - int position = 0, expectedSize = 0; - for (int j = 0; j < testValues.length; j++) { - position += Vint8.encode(testValues[j], byteArray, position); - expectedSize += bytesNeededTestValues[j]; - } - assertEquals(expectedSize, position); - Vint8.Position pos = new Vint8.Position(); - for (int j = 0; j < testValues.length; j++) { - assertEquals(testValues[j], Vint8.decode(byteArray, pos)); - } - assertEquals(expectedSize, pos.pos); - } - - /** - * The result of encoding the test values with the current algorithm. If these - * values are changed to match an algorithm change, compatibility with legacy - * data will be broken. - */ - private static final byte[] encodedTestValues = { - -4, -93, -108, -20, 0, -1, -1, -1, -1, 127, 0, 127, -127, 0, -1, 127, - -127, -128, 0, -1, -1, 127, -127, -128, -128, 0, -1, -1, -1, 127, -127, - -128, -128, -128, 0 - }; - - /** - * Tests algorithm. - */ - @Test - public void testLegacyCompatibility() throws IOException { - /* To generate the encoded test values: - byte[] byteArray = new byte[256]; - int position = 0, expectedSize = 0; - for (int j = 0; j < testValues.length; j++) { - position += Vint8.encode(testValues[j], byteArray, position); - expectedSize += bytesNeededTestValues[j]; - } - assertEquals(expectedSize, position); - Vint8.Position pos = new Vint8.Position(); - for (int j = 0; j < expectedSize; j++) { - System.out.print(byteArray[j] + ", "); - } - System.out.flush(); - pos.pos = 0; - */ - Vint8.Position pos = new Vint8.Position(); - for (int j = 0; j < testValues.length; j++) { - assertEquals(testValues[j], Vint8.decode(encodedTestValues, pos)); - } - } - -} // end class Vint8Test diff --git a/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingSpeed.java b/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingSpeed.java index 96dcea5a5ba..7b58ff4d0d7 100644 --- a/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingSpeed.java +++ b/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingSpeed.java @@ -1,21 +1,12 @@ package org.apache.lucene.util.encoding; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; import java.io.IOException; import java.text.NumberFormat; import java.util.Arrays; import java.util.Locale; -import org.apache.lucene.util.encoding.DGapIntEncoder; -import org.apache.lucene.util.encoding.EightFlagsIntEncoder; -import org.apache.lucene.util.encoding.FourFlagsIntEncoder; -import org.apache.lucene.util.encoding.IntDecoder; -import org.apache.lucene.util.encoding.IntEncoder; -import org.apache.lucene.util.encoding.NOnesIntEncoder; -import org.apache.lucene.util.encoding.SortingIntEncoder; -import org.apache.lucene.util.encoding.UniqueValuesIntEncoder; -import org.apache.lucene.util.encoding.VInt8IntEncoder; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -40,8 +31,8 @@ public class EncodingSpeed { private static int[] data9910 = null; private static int[] data501871 = null; private static int[] data10k = null; - private static String resultsFormat = "%-20s %10s %20d %26s %20d %26s"; - private static String headerFormat = "%-20s %10s %20s %26s %20s %26s"; + private static String resultsFormat = "%-60s %10s %20d %26s %20d %26s"; + private static String headerFormat = "%-60s %10s %20s %26s %20s %26s"; private static int integers = 100000000; private static NumberFormat nf; @@ -53,8 +44,14 @@ public class EncodingSpeed { testFacetIDs(data501871, 501871); } - private static void testFacetIDs(int[] facetIDs, int docID) - throws IOException { + private static IntsRef newIntsRef(int[] data) { + IntsRef res = new IntsRef(data.length); + System.arraycopy(data, 0, res.ints, 0, data.length); + res.length = data.length; + return res; + } + + private static void testFacetIDs(int[] facetIDs, int docID) throws IOException { int loopFactor = integers / facetIDs.length; System.out .println("\nEstimating ~" @@ -88,68 +85,53 @@ public class EncodingSpeed { System.out.println(); } - private static void encoderTest(IntEncoder encoder, int[] data, - int loopFactor) throws IOException { + private static void encoderTest(IntEncoder encoder, int[] values, int loopFactor) throws IOException { - long startTime, endTime; - ByteArrayOutputStream baos = new ByteArrayOutputStream(); + BytesRef bytes = new BytesRef(values.length); // at least one byte per value // -- Looping 100 times as a warm up -------------------------- for (int i = 100; i != 0; --i) { - baos.reset(); - encoder.reInit(baos); - for (int value : data) { - encoder.encode(value); - } - encoder.close(); + IntsRef data = newIntsRef(values); + encoder.encode(data, bytes); } // ----------------------------------------------------------- - startTime = System.currentTimeMillis(); + long encodeTime = 0; for (int factor = loopFactor; factor > 0; --factor) { - baos.reset(); - encoder.reInit(baos); - for (int value : data) { - encoder.encode(value); - } - encoder.close(); + IntsRef data = newIntsRef(values); + long start = System.currentTimeMillis(); + encoder.encode(data, bytes); + encodeTime += System.currentTimeMillis() - start; } - endTime = System.currentTimeMillis(); - long encodeTime = endTime - startTime; - - ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); + IntsRef decoded = new IntsRef(values.length); + int encodedSize = bytes.length; IntDecoder decoder = encoder.createMatchingDecoder(); - decoder.reInit(bais); // -- Looping 100 times as a warm up -------------------------- for (int i = 100; i != 0; --i) { - bais.mark(baos.size()); - while (decoder.decode() != IntDecoder.EOS) { - } - bais.reset(); - decoder.reInit(bais); + decoder.decode(bytes, decoded); } // ----------------------------------------------------------- - decoder.reInit(bais); - startTime = System.currentTimeMillis(); + long decodeTime = 0; for (int i = loopFactor; i > 0; --i) { - bais.mark(baos.size()); - while (decoder.decode() != IntDecoder.EOS) { - } - bais.reset(); - decoder.reInit(bais); + long start = System.currentTimeMillis(); + decoder.decode(bytes, decoded); + decodeTime += System.currentTimeMillis() - start; + } + + if (decoded.length != values.length) { + throw new RuntimeException("wrong num values. expected=" + values.length + " actual=" + decoded.length + + " decoder=" + decoder); } - endTime = System.currentTimeMillis(); - long decodeTime = endTime - startTime; - - System.out.println(String.format(Locale.ROOT, resultsFormat, encoder, nf.format(baos - .size() - * 8.0 / data.length), encodeTime, nf.format(encodeTime - * 1000000.0 / (loopFactor * data.length)), decodeTime, nf - .format(decodeTime * 1000000.0 / (loopFactor * data.length)))); + System.out.println(String.format(Locale.ROOT, resultsFormat, encoder, + nf.format(encodedSize * 8.0 / values.length), + encodeTime, + nf.format(encodeTime * 1000000.0 / (loopFactor * values.length)), + decodeTime, + nf.format(decodeTime * 1000000.0 / (loopFactor * values.length)))); } static { diff --git a/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingTest.java b/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingTest.java index 73e44fa744a..57689d5731c 100644 --- a/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingTest.java +++ b/lucene/facet/src/test/org/apache/lucene/util/encoding/EncodingTest.java @@ -1,14 +1,13 @@ package org.apache.lucene.util.encoding; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; import java.io.IOException; -import java.util.HashSet; -import java.util.TreeSet; - -import org.junit.Test; +import java.util.Arrays; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.LuceneTestCase; +import org.junit.BeforeClass; +import org.junit.Test; /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -29,65 +28,33 @@ import org.apache.lucene.util.LuceneTestCase; public class EncodingTest extends LuceneTestCase { - static int[] data = null; - - private static TreeSet dataSet = new TreeSet(); - static { - setData(); - } - - @Test - public void testVInt8() throws Exception { - encoderTest(new VInt8IntEncoder()); + private static IntsRef uniqueSortedData, data; + + @BeforeClass + public static void beforeClassEncodingTest() throws Exception { + int capacity = atLeast(10000); + data = new IntsRef(capacity); + for (int i = 0; i < 10; i++) { + data.ints[i] = i + 1; // small values + } + for (int i = 10; i < data.ints.length; i++) { + data.ints[i] = random().nextInt(Integer.MAX_VALUE - 1) + 1; // some encoders don't allow 0 + } + data.length = data.ints.length; - // cover negative numbers; - IntEncoder enc = new VInt8IntEncoder(); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - enc.reInit(baos); - enc.encode(-1); - - IntDecoder dec = enc.createMatchingDecoder(); - dec.reInit(new ByteArrayInputStream(baos.toByteArray())); - assertEquals(-1, dec.decode()); + uniqueSortedData = IntsRef.deepCopyOf(data); + Arrays.sort(uniqueSortedData.ints); + uniqueSortedData.length = 0; + int prev = -1; + for (int i = 0; i < uniqueSortedData.ints.length; i++) { + if (uniqueSortedData.ints[i] != prev) { + uniqueSortedData.ints[uniqueSortedData.length++] = uniqueSortedData.ints[i]; + prev = uniqueSortedData.ints[i]; + } + } } - @Test - public void testSimpleInt() { - encoderTest(new SimpleIntEncoder()); - } - - @Test - public void testSortingUniqueValues() { - encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new VInt8IntEncoder()))); - } - - @Test - public void testSortingUniqueDGap() { - encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder())))); - } - - @Test - public void testSortingUniqueDGapEightFlags() { - encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new EightFlagsIntEncoder())))); - } - - @Test - public void testSortingUniqueDGapFourFlags() { - encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new FourFlagsIntEncoder())))); - } - - @Test - public void testSortingUniqueDGapNOnes4() { - encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(4))))); - } - - @Test - public void testSortingUniqueDGapNOnes3() { - encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(3))))); - } - - private static void encoderTest(IntEncoder encoder) { - + private static void encoderTest(IntEncoder encoder, IntsRef data, IntsRef expected) throws IOException { // ensure toString is implemented String toString = encoder.toString(); assertFalse(toString.startsWith(encoder.getClass().getName() + "@")); @@ -95,320 +62,90 @@ public class EncodingTest extends LuceneTestCase { toString = decoder.toString(); assertFalse(toString.startsWith(decoder.getClass().getName() + "@")); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - - try { - encoding(encoder, baos); - decoding(baos, encoder.createMatchingDecoder()); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } - - baos.reset(); - - try { - encoding(encoder, baos); - decoding(baos, encoder.createMatchingDecoder()); - } catch (Exception e) { - e.printStackTrace(); - fail(e.getMessage()); - } + BytesRef bytes = new BytesRef(100); // some initial capacity - encoders should grow the byte[] + IntsRef values = new IntsRef(100); // some initial capacity - decoders should grow the int[] + encoding(encoder, data, bytes); + decoding(bytes, values, encoder.createMatchingDecoder()); + assertTrue(expected.intsEquals(values)); } - private static void encoding(IntEncoder encoder, ByteArrayOutputStream baos) throws IOException { - encoder.reInit(baos); - for (int value : data) { - encoder.encode(value); + private static void encoding(IntEncoder encoder, IntsRef data, BytesRef bytes) throws IOException { + final IntsRef values; + if (random().nextBoolean()) { // randomly set the offset + values = new IntsRef(data.length + 1); + System.arraycopy(data.ints, 0, values.ints, 1, data.length); + values.offset = 1; // ints start at index 1 + values.length = data.length; + } else { + // need to copy the array because it may be modified by encoders (e.g. sorting) + values = IntsRef.deepCopyOf(data); } - encoder.close(); - - baos.reset(); - encoder.reInit(baos); - for (int value : data) { - encoder.encode(value); - } - encoder.close(); + encoder.encode(values, bytes); } - private static void decoding(ByteArrayOutputStream baos, IntDecoder decoder) - throws IOException { - ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray()); - decoder.reInit(bais); - - HashSet set = new HashSet(); - long value = 0; - while ((value = decoder.decode()) != IntDecoder.EOS) { - set.add(value); + private static void decoding(BytesRef bytes, IntsRef values, IntDecoder decoder) throws IOException { + int offset = 0; + if (random().nextBoolean()) { // randomly set the offset and length to other than 0,0 + bytes.grow(bytes.length + 1); // ensure that we have enough capacity to shift values by 1 + bytes.offset = 1; // bytes start at index 1 (must do that after grow) + System.arraycopy(bytes.bytes, 0, bytes.bytes, 1, bytes.length); + offset = 1; } - assertEquals(dataSet.size(), set.size()); - assertTrue(set.equals(dataSet)); - - set.clear(); - bais.reset(); - decoder.reInit(bais); - value = 0; - while ((value = decoder.decode()) != IntDecoder.EOS) { - set.add(value); - } - assertEquals(dataSet.size(), set.size()); - assertTrue(set.equals(dataSet)); - + decoder.decode(bytes, values); + assertEquals(offset, bytes.offset); // decoders should not mess with offsets } - private static void setData() { - data = new int[] { 2, 4, 86133, 11, 16505, 86134, 86135, 86136, 1290, - 86137, 86138, 32473, 19346, 32474, 4922, 32475, 86139, 16914, - 86140, 86141, 86142, 86143, 32478, 86144, 86145, 32480, 4884, - 4887, 32481, 86146, 16572, 86147, 16295, 165, 86148, 3183, - 21920, 21921, 21922, 555, 4006, 32484, 21925, 21926, 13775, - 86149, 13777, 85833, 85834, 13779, 13773, 13780, 75266, 17674, - 13784, 13785, 13786, 13787, 13788, 6258, 86150, 13790, 75267, - 13793, 13794, 13795, 312, 4914, 4915, 6222, 86151, 4845, 4883, - 4918, 4894, 4919, 86152, 4921, 6223, 6224, 6225, 6226, 67909, - 6229, 18170, 6230, 5198, 25625, 6231, 6232, 6233, 1808, 6234, - 6235, 6236, 41376, 6238, 6239, 67911, 6240, 86153, 6243, 6244, - 83549, 6246, 6247, 6248, 6249, 782, 444, 6251, 6250, 19863, - 28963, 310, 2234, 144, 2236, 2309, 69437, 2311, 2325, 2241, - 69438, 69439, 2244, 2245, 2246, 23504, 2314, 69440, 36603, - 2250, 2268, 2271, 2251, 2254, 2255, 2257, 2240, 36604, 84726, - 36605, 84727, 2262, 2263, 18431, 38853, 2317, 2149, 2326, 2327, - 2329, 3980, 2275, 2277, 2258, 84728, 2260, 84729, 84730, 13766, - 36607, 2282, 2283, 84731, 2284, 2286, 2287, 2337, 7424, 2288, - 2338, 3522, 2290, 84733, 32902, 371, 37708, 2096, 3065, 3066, - 375, 377, 374, 378, 2100, 86154, 381, 382, 58795, 379, 383, - 384, 385, 4449, 387, 388, 389, 390, 9052, 391, 18358, 2107, - 394, 2111, 2108, 393, 2109, 395, 86155, 86156, 397, 2113, 398, - 399, 400, 273, 274, 275, 40980, 276, 277, 31716, 279, 280, - 31717, 281, 282, 1628, 1623, 1624, 1625, 2052, 1626, 725, 727, - 728, 729, 730, 731, 1633, 733, 734, 735, 86157, 737, 738, 739, - 1634, 3563, 3564, 3565, 1667, 12461, 76276, 3567, 5413, 77622, - 5415, 5416, 5417, 5418, 107, 86158, 7784, 15363, 153, 3723, - 2713, 7786, 3835, 7787, 86159, 7789, 7791, 7792, 7794, 86160, - 7796, 86161, 6708, 7798, 7799, 7800, 7801, 7802, 7803, 1665, - 43150, 15365, 1581, 5656, 43152, 80258, 7450, 39922, 86162, - 51587, 9059, 4606, 396, 86163, 86164, 7250, 401, 403, 2860, - 33281, 2964, 408, 9119, 409, 86165, 7669, 2861, 410, 413, - 86166, 414, 415, 33282, 405, 33283, 7498, 2865, 7230, 33284, - 2866, 86167, 2867, 47518, 2868, 86168, 2869, 2870, 4712, 7096, - 28484, 6913, 6914, 6915, 6916, 37169, 37170, 7103, 28269, 6919, - 86169, 45431, 6922, 7104, 6923, 7108, 6924, 6925, 6926, 6927, - 6928, 86170, 86171, 86172, 6930, 6931, 6932, 6934, 6935, 6936, - 451, 6937, 6938, 4756, 3554, 5309, 8145, 3586, 16417, 9767, - 14126, 25854, 6580, 10174, 86173, 5519, 21309, 8561, 20938, - 10386, 86174, 781, 2030, 16419, 30323, 16420, 16421, 16424, - 86175, 86176, 86177, 28871, 86178, 28872, 63980, 6329, 49561, - 4271, 38778, 86179, 86180, 20126, 16245, 193, 195, 196, 197, - 56973, 199, 200, 201, 202, 203, 204, 56974, 56975, 205, 206, - 4662, 207, 208, 209, 210, 211, 212, 47901, 641, 642, 643, 1380, - 1079, 47902, 1381, 1081, 1082, 1083, 47903, 1382, 47904, 1087, - 47905, 965, 966, 1298, 968, 1387, 1300, 50288, 971, 972, 973, - 974, 23974, 22183, 1390, 23313, 1389, 1391, 902, 23029, 296, - 1304, 1395, 1303, 1309, 1308, 50289, 1312, 50290, 50291, 1315, - 1317, 9270, 19796, 3605, 1320, 1321, 44946, 1322, 1323, 50292, - 967, 1587, 1326, 1331, 17482, 633, 29115, 53858, 29118, 29119, - 62624, 44494, 6965, 6966, 6959, 6967, 71562, 6969, 23459, - 23460, 17464, 4225, 23461, 23462, 23463, 5893, 23464, 17467, - 17468, 23465, 12562, 1405, 1406, 1407, 960, 961, 962, 687, 963, - 86181, 86182, 5997, 10812, 11976, 11977, 1850, 577, 13393, - 10810, 13394, 65040, 86183, 3935, 3936, 3937, 710, 86184, 5785, - 5786, 29949, 5787, 5788, 283, 284, 2687, 285, 286, 287, 2689, - 288, 289, 8880, 290, 2690, 13899, 991, 292, 295, 42007, 35616, - 63103, 298, 299, 3520, 297, 9024, 303, 301, 302, 300, 31345, - 3719, 304, 305, 306, 307, 308, 368, 364, 85002, 9026, 63105, - 367, 39596, 25835, 19746, 293, 294, 26505, 85003, 18377, 56785, - 10122, 10123, 10124, 86185, 39863, 86186, 10125, 39865, 4066, - 4067, 24257, 4068, 4070, 86187, 4073, 4074, 86188, 4076, 7538, - 4077, 86189, 4078, 4079, 7540, 7541, 4084, 4085, 7542, 86190, - 4086, 86191, 4087, 4088, 86192, 7545, 44874, 7821, 44875, - 86193, 4286, 86194, 51470, 17609, 1408, 47486, 1411, 1412, - 47487, 1413, 1414, 1417, 1415, 47488, 1416, 1418, 1420, 470, - 1422, 1423, 1424, 5001, 5002, 47489, 1427, 1429, 1430, 31811, - 1432, 1433, 47490, 1435, 3753, 1437, 1439, 1440, 47491, 1443, - 47492, 1446, 5004, 5005, 1450, 47493, 353, 1452, 42145, 3103, - 3402, 3104, 3105, 4780, 3106, 3107, 3108, 12157, 3111, 42146, - 42147, 3114, 4782, 42148, 3116, 3117, 42149, 42150, 3407, 3121, - 3122, 18154, 3126, 3127, 3128, 3410, 3130, 3411, 3412, 3415, - 24241, 3417, 3418, 3449, 42151, 3421, 3422, 7587, 42152, 3424, - 3427, 3428, 3448, 3430, 3432, 42153, 42154, 41648, 1991, 407, - 57234, 411, 2862, 57235, 2863, 18368, 57236, 2874, 7350, 4115, - 2876, 2877, 17975, 86195, 4116, 2881, 2882, 2883, 2886, 463, - 870, 872, 873, 874, 875, 8783, 8784, 877, 1480, 1481, 459, - 2778, 881, 8785, 2779, 8786, 8787, 8788, 886, 887, 8789, 889, - 8790, 86196, 6920, 86197, 5080, 5081, 7395, 7396, 9395, 9396, - 1528, 42737, 805, 86198, 1209, 13595, 4126, 9680, 34368, 9682, - 86199, 86200, 174, 175, 176, 177, 178, 179, 180, 182, 183, - 1477, 31138, 186, 172, 187, 188, 189, 190, 191, 458, 871, - 31294, 31295, 27604, 31296, 31297, 882, 883, 884, 31298, 890, - 1089, 1488, 1489, 1092, 1093, 1094, 1095, 1096, 1097, 1490, - 1098, 1495, 1502, 1099, 1100, 1101, 1493, 2997, 12223, 1103, - 2654, 1498, 1499, 1500, 80615, 80616, 80617, 33359, 86201, - 9294, 1501, 86202, 1506, 1507, 23454, 38802, 38803, 1014, - 86203, 5583, 5584, 651, 74717, 5586, 5587, 5588, 5589, 74720, - 5590, 38808, 33527, 78330, 10930, 5119, 10931, 1000, 10928, - 10932, 10933, 10934, 10935, 5863, 10936, 86204, 10938, 10939, - 86205, 192, 194, 38754, 38755, 198, 38756, 38757, 38758, 2842, - 640, 22780, 22781, 1080, 86206, 86207, 1084, 1086, 1088, 63916, - 9412, 970, 9413, 9414, 9415, 9416, 9417, 1310, 7168, 7169, - 1318, 9418, 1324, 39159, 1804, 1557, 24850, 41499, 1560, 41500, - 1562, 1563, 1565, 1927, 1928, 1566, 1569, 1570, 1571, 1572, - 1573, 1574, 1575, 1576, 2674, 2677, 2678, 2679, 2946, 2682, - 2676, 2683, 2947, 1156, 1157, 1158, 1467, 1160, 1468, 1469, - 1161, 1162, 1163, 4369, 1165, 1166, 1167, 12923, 2917, 1169, - 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177, 18153, 8359, - 1178, 1164, 1191, 1180, 12924, 86208, 86209, 54817, 66962, - 2476, 86210, 86211, 41820, 41821, 41822, 41824, 1130, 1131, - 1132, 32692, 1134, 34848, 1136, 1133, 1137, 1138, 1139, 1140, - 1141, 1143, 1144, 1145, 34849, 2639, 34850, 1146, 1147, 1148, - 34851, 1150, 1151, 1152, 1153, 1154, 1155, 1678, 1679, 1680, - 1681, 40870, 2059, 1685, 1686, 32686, 14970, 1688, 1689, 86212, - 1692, 1682, 1693, 1695, 1696, 1698, 12955, 8909, 41690, 1700, - 41691, 86213, 30949, 41692, 1703, 1704, 1705, 41693, 14976, - 1708, 2071, 1709, 1710, 1711, 1712, 1727, 86214, 86215, 86216, - 1715, 86217, 1714, 1717, 1690, 41697, 86218, 1720, 86219, 2073, - 41699, 1724, 2075, 1726, 1729, 1730, 1732, 2078, 2223, 1735, - 1713, 41700, 1737, 14977, 1739, 1740, 1741, 2080, 1743, 1744, - 1745, 1746, 1747, 1748, 1749, 1750, 1751, 41701, 1752, 1753, - 1909, 86220, 2085, 1754, 19548, 86221, 19551, 5733, 3856, 5190, - 4581, 25145, 86222, 86223, 4846, 86224, 4861, 86225, 86226, - 86227, 25150, 86228, 86229, 13820, 2027, 4898, 4899, 4901, - 2135, 4902, 4868, 4904, 86230, 4905, 25155, 4907, 86231, 4909, - 4910, 4911, 4912, 86232, 6220, 81357, 86233, 2589, 73877, - 29706, 6227, 6228, 86234, 6237, 86235, 6241, 6242, 1812, 13808, - 13809, 70908, 2293, 2294, 86236, 2295, 2296, 2297, 22947, - 16511, 2299, 2300, 2301, 13097, 73079, 86237, 13099, 50121, - 86238, 86239, 13101, 86240, 2424, 4725, 4726, 4727, 4728, 4729, - 4730, 86241, 26881, 10944, 4734, 4735, 4736, 26239, 26240, - 71408, 86242, 57401, 71410, 26244, 5344, 26245, 86243, 4102, - 71414, 11091, 6736, 86244, 6737, 6738, 38152, 6740, 6741, 6742, - 6298, 6743, 6745, 6746, 20867, 6749, 20616, 86245, 9801, 65297, - 20617, 65298, 20619, 5629, 65299, 20621, 20622, 8385, 20623, - 20624, 5191, 20625, 20626, 442, 443, 445, 27837, 77681, 86246, - 27839, 86247, 86248, 41435, 66511, 2478, 2479, 2480, 2481, - 2482, 2483, 2484, 2485, 2486, 2487, 2488, 2489, 2490, 2494, - 2493, 33025, 12084, 2542, 2497, 2499, 2501, 2503, 2504, 2505, - 33026, 2506, 2507, 2508, 2509, 2511, 1787, 12080, 2513, 2514, - 3988, 3176, 3989, 2518, 2521, 9285, 2522, 2524, 2525, 3990, - 2527, 2528, 27499, 2529, 2530, 3991, 2532, 2534, 2535, 18038, - 2536, 2538, 2495, 46077, 61493, 61494, 1006, 713, 4971, 4972, - 4973, 4975, 4976, 650, 170, 7549, 7550, 7551, 7552, 7553, - 86249, 7936, 956, 11169, 11170, 1249, 1244, 1245, 1247, 2544, - 1250, 2545, 1252, 2547, 1253, 1254, 2549, 39636, 1259, 1257, - 1258, 39637, 1260, 1261, 2551, 1262, 1263, 848, 86250, 86251, - 854, 74596, 856, 1957, 86252, 855, 1959, 1961, 857, 86253, 851, - 859, 860, 862, 1964, 864, 865, 866, 867, 1965, 1966, 1967, - 1968, 1969, 86254, 1971, 1972, 1973, 1974, 1975, 1976, 1977, - 841, 1954, 842, 2978, 846, 847, 849, 850, 852, 1956, 17452, - 71941, 86255, 86256, 73665, 1471, 13690, 185, 503, 504, 2342, - 505, 506, 4378, 508, 4379, 17313, 510, 511, 512, 520, 513, - 4384, 17314, 514, 515, 46158, 17317, 518, 34269, 519, 4386, - 523, 524, 525, 46159, 528, 529, 17319, 531, 532, 533, 534, 535, - 7482, 537, 538, 5267, 536, 539, 541, 540, 19858, 17320, 17321, - 906, 907, 908, 17322, 910, 17323, 912, 15850, 913, 4398, 17324, - 86257, 278, 2948, 2949, 2950, 3007, 2951, 2952, 2953, 2954, - 2955, 3013, 35352, 3014, 3015, 2962, 3016, 33505, 39118, 3017, - 3018, 20492, 4000, 3021, 3022, 35353, 39293, 3024, 18443, 3029, - 9467, 20529, 39119, 8380, 2965, 3030, 3043, 22714, 39120, 2956, - 3035, 39121, 3037, 3038, 2688, 86258, 36675, 30894, 24505, - 8888, 13541, 49728, 27660, 9082, 27661, 365, 366, 2232, 76098, - 7233, 1494, 17391, 606, 607, 611, 610, 612, 614, 615, 613, 616, - 9117, 617, 618, 21155, 1789, 619, 620, 7636, 12019, 621, 622, - 1793, 623, 625, 624, 631, 626, 627, 21578, 21103, 628, 21579, - 629, 9122, 9123, 12189, 9289, 3168, 3169, 630, 632, 634, 21580, - 9121, 635, 636, 637, 21581, 12781, 1801, 638, 639, 1559, 24343, - 9419, 9420, 795, 796, 1611, 86259, 1612, 21551, 21552, 3741, - 1617, 3742, 1615, 1619, 1620, 6301, 3744, 1622, 67685, 8521, - 55937, 9025, 27663, 8881, 13581, 86260, 11592, 44720, 86261, - 63231, 50873, 42925, 52332, 86262, 72706, 17705, 17707, 17708, - 3401, 40217, 1248, 40218, 86263, 7098, 86264, 86265, 1264, - 86266, 1266, 1267, 1268, 1269, 86267, 1271, 1272, 1273, 1274, - 2556, 1275, 1276, 1277, 1278, 1279, 1280, 1282, 1283, 22680, - 11889, 86268, 45662, 7038, 86269, 19315, 45663, 45664, 86270, - 5855, 34002, 49245, 10447, 5663, 86271, 15429, 53877, 49249, - 86272, 86273, 86274, 60128, 60453, 60129, 5552, 31923, 43407, - 4287, 17980, 64977, 86275, 86276, 8234, 86277, 3649, 8240, - 1330, 11999, 1332, 27618, 1334, 1335, 340, 3651, 25640, 18165, - 1343, 4618, 1474, 3653, 75921, 1349, 53519, 1779, 45454, 22778, - 40153, 67677, 63826, 45455, 15128, 67678, 67679, 1792, 67680, - 3171, 47816, 45457, 9288, 59891, 67681, 25703, 35731, 35732, - 369, 35713, 35714, 35715, 34652, 35716, 31681, 35717, 12779, - 35718, 35719, 11992, 806, 807, 808, 43499, 43500, 810, 776, - 812, 813, 814, 241, 43501, 43502, 816, 755, 43503, 818, 819, - 820, 43504, 821, 822, 823, 824, 825, 826, 43505, 43506, 43507, - 828, 829, 20083, 43508, 43509, 832, 833, 834, 835, 86278, - 19984, 19985, 86279, 24125, 19986, 86280, 19988, 86281, 5414, - 86282, 85808, 5479, 5420, 5421, 5422, 5423, 63800, 86283, - 86284, 30965, 86285, 416, 1510, 5740, 5741, 81991, 86286, - 28938, 50149, 1003, 55512, 14306, 6960, 688, 86287, 14307, - 5399, 5400, 17783, 24118, 720, 86288, 44913, 24557, 667, 24876, - 6529, 24877, 24878, 24879, 24880, 31847, 20671, 4011, 171, 580, - 86289, 3863, 914, 2202, 916, 917, 918, 919, 921, 922, 923, - 7585, 925, 7586, 926, 927, 928, 7588, 929, 930, 931, 932, 933, - 934, 1875, 1876, 7589, 7590, 1878, 1879, 7591, 7592, 1882, - 1883, 1884, 2212, 7593, 1887, 1888, 1889, 1890, 1891, 1892, - 1893, 1894, 1895, 1896, 1897, 1898, 2217, 1900, 7594, 1902, - 2219, 7595, 1905, 1906, 1907, 3323, 7596, 1911, 1912, 7597, - 1914, 1915, 1916, 2226, 1919, 7598, 2227, 1920, 1921, 7599, - 7600, 4708, 1923, 355, 356, 1549, 358, 32077, 360, 32078, - 21117, 362, 19043, 71677, 5716, 86290, 49790, 86291, 86292, - 86293, 49792, 86294, 86295, 49794, 86296, 86297, 86298, 86299, - 11882, 86300, 49798, 86301, 49800, 49801, 49802, 49803, 453, - 49804, 8591, 6794, 49806, 18989, 49807, 49808, 16308, 49809, - 86302, 86303, 10105, 86304, 5285, 10106, 10107, 6557, 86305, - 23571, 10109, 38883, 10110, 5401, 86306, 67557, 16430, 67558, - 40171, 16433, 25878, 86307, 21762, 23, 86308, 86309, 21766, - 86310, 86311, 5149, 3926, 21768, 21769, 47826, 942, 46985, - 6588, 58867, 6589, 6590, 86312, 6592, 6006, 53855, 9565, 359, - 86313, 2845, 876, 879, 27556, 27557, 885, 27558, 888, 2847, - 27559, 2115, 2116, 2117, 53962, 57839, 315, 316, 317, 318, 319, - 86314, 321, 322, 2122, 323, 2123, 324, 325, 328, 326, 327, - 40542, 329, 330, 18079, 18080, 331, 1790, 7382, 332, 7380, - 7236, 23413, 23414, 18924, 18925, 333, 335, 336, 39750, 337, - 86315, 339, 341, 342, 343, 16264, 16265, 6615, 86316, 86317, - 86318, 86319, 16269, 10538, 33226, 86320, 16272, 5824, 16273, - 16274, 16276, 16277, 16278, 16279, 16280, 14517, 1547, 6463, - 3394, 49677, 659, 10380, 30013, 10382, 10378, 10379, 10383, - 10384, 10385, 86321, 4139, 13370, 13371, 86322, 86323, 11878, - 64509, 15141, 15142, 15143, 32737, 14183, 15144, 39101, 42768, - 5645, 32738, 801, 803, 804, 86324, 14707, 86325, 6601, 12402, - 712, 12403, 2936, 1447, 15477, 1410, 44872, 1550, 8614, 15478, - 15479, 15480, 15481, 4811, 3752, 1442, 15482, 8818, 1445, 5006, - 16304, 32277, 16305, 16306, 86326, 16307, 53691, 69305, 809, - 86327, 815, 26724, 69307, 43484, 63904, 86328, 13498, 827, - 86329, 831, 2857, 836, 86330, 86331, 837, 838, 839, 840, 228, - 229, 43722, 230, 231, 43723, 234, 235, 236, 237, 238, 239, - 2745, 2746, 240, 242, 243, 244, 43724, 19788, 246, 247, 21134, - 248, 250, 251, 252, 253, 254, 255, 256, 257, 258, 43725, 43726, - 41, 43727, 262, 43728, 2751, 264, 265, 266, 267, 268, 269, 270, - 271, 272, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, - 1033, 1034, 43729, 1035, 43730, 1037, 21821, 2926, 14388, - 10432, 14389, 14390, 14391, 14392, 86332, 14394, 14395, 2035, - 2169, 86333, 14397, 14398, 14399, 14400, 52, 14401, 14402, - 7077, 21822, 14405, 14406, 14396, 86334, 17356, 17357, 84679, - 84680, 76383, 17360, 17361, 86335, 38801, 2060, 30850, 12963, - 1684, 1687, 2061, 14978, 1694, 43387, 1697, 1699, 2067, 1701, - 1702, 1706, 43388, 43389, 76325, 1716, 1718, 26832, 1719, 1723, - 2081, 2063, 1728, 39059, 76326, 1731, 86336, 1736, 76327, 1738, - 19657, 6579, 6581, 6582, 6583, 6584, 6585, 29979, 1818, 28239, - 68, 69, 3391, 86337, 10266, 63528, 86338, 10269, 10270, 10271, - 10272, 86339, 86340, 63530, 63531, 63532, 63533, 10273, 63534, - 86341, 10681, 10682, 86342, 9673, 86343, 10683, 460, 461, 462, - 467, 4464, 4466, 3729, 471, 472, 468, 81634, 474, 81635, 475, - 476, 477, 479, 480, 81636, 81637, 482, 17442, 81638, 81639, - 484, 485, 486, 4473, 488, 489, 490, 493, 466, 494, 495, 496, - 497, 499, 500, 501, 502, 34376, 86344, 63836, 56281, 1707, - 20416, 61452, 56282, 1755, 56283, 56284, 18508, 53650, 63444, - 86345, 3579, 63445, 3677, 1979, 1980, 1981, 3132, 3147, 34090, - 1987, 12770, 1329, 80818, 80819, 1988, 23522, 1986, 15880, - 1985, 32975, 1992, 1993, 7165, 3141, 3143, 86346, 1982, 1984, - 3145, 86347, 78064, 55453, 2656, 2657, 35634, 35635, 2167, - 43479, - // ensure there is a representative number for any # of int bytes - 1, 1 << 8 + 1, 1 << 16 + 1, 1 << 24 + 1 }; -// data = new int[]{1, 2, 3, 4}; - for (int value : data) { - dataSet.add(new Long(value)); - } + @Test + public void testVInt8() throws Exception { + encoderTest(new VInt8IntEncoder(), data, data); + + // cover negative numbers; + BytesRef bytes = new BytesRef(5); + IntEncoder enc = new VInt8IntEncoder(); + IntsRef values = new IntsRef(1); + values.ints[values.length++] = -1; + enc.encode(values, bytes); + + IntDecoder dec = enc.createMatchingDecoder(); + values.length = 0; + dec.decode(bytes, values); + assertEquals(1, values.length); + assertEquals(-1, values.ints[0]); + } + + @Test + public void testSimpleInt() throws Exception { + encoderTest(new SimpleIntEncoder(), data, data); + } + + @Test + public void testSortingUniqueValues() throws Exception { + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new VInt8IntEncoder())), data, uniqueSortedData); + } + + @Test + public void testSortingUniqueDGap() throws Exception { + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEncoder()))), data, uniqueSortedData); + } + + @Test + public void testSortingUniqueDGapEightFlags() throws Exception { + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new EightFlagsIntEncoder()))), data, uniqueSortedData); + } + + @Test + public void testSortingUniqueDGapFourFlags() throws Exception { + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new FourFlagsIntEncoder()))), data, uniqueSortedData); + } + + @Test + public void testSortingUniqueDGapNOnes4() throws Exception { + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(4)))), data, uniqueSortedData); + } + + @Test + public void testSortingUniqueDGapNOnes3() throws Exception { + encoderTest(new SortingIntEncoder(new UniqueValuesIntEncoder(new DGapIntEncoder(new NOnesIntEncoder(3)))), data, uniqueSortedData); } } diff --git a/lucene/facet/src/test/org/apache/lucene/util/encoding/Vint8Test.java b/lucene/facet/src/test/org/apache/lucene/util/encoding/Vint8Test.java new file mode 100644 index 00000000000..f0556f3c718 --- /dev/null +++ b/lucene/facet/src/test/org/apache/lucene/util/encoding/Vint8Test.java @@ -0,0 +1,54 @@ +package org.apache.lucene.util.encoding; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; +import org.junit.Test; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Tests the {@link VInt8} class. + */ +public class Vint8Test extends LuceneTestCase { + + private static final int[] TEST_VALUES = { + -1000000000, + -1, 0, (1 << 7) - 1, 1 << 7, (1 << 14) - 1, 1 << 14, + (1 << 21) - 1, 1 << 21, (1 << 28) - 1, 1 << 28 + }; + private static int[] BYTES_NEEDED_TEST_VALUES = { + 5, 5, 1, 1, 2, 2, 3, 3, 4, 4, 5 + }; + + @Test + public void testBytesRef() throws Exception { + BytesRef bytes = new BytesRef(256); + int expectedSize = 0; + for (int j = 0; j < TEST_VALUES.length; j++) { + VInt8.encode(TEST_VALUES[j], bytes); + expectedSize += BYTES_NEEDED_TEST_VALUES[j]; + } + assertEquals(expectedSize, bytes.length); + + for (int j = 0; j < TEST_VALUES.length; j++) { + assertEquals(TEST_VALUES[j], VInt8.decode(bytes)); + } + assertEquals(bytes.offset, bytes.length); + } + +}