res = associations.build(ordinals, categories);
+ res.putAll(counting.build(ordinals, categories));
+ return res;
}
@Override
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsListBuilder.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsListBuilder.java
new file mode 100644
index 00000000000..42a4218f323
--- /dev/null
+++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsListBuilder.java
@@ -0,0 +1,89 @@
+package org.apache.lucene.facet.associations;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.facet.index.CategoryListBuilder;
+import org.apache.lucene.facet.index.CountingListBuilder;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A {@link CategoryListBuilder} which encodes category-association value pairs.
+ * Every category-association pair is written under the respective association's
+ * {@link CategoryAssociation#getCategoryListID()}.
+ * <p>
+ * <b>NOTE:</b> associations lists do not encode the counting list data. You
+ * should use {@link CountingListBuilder} to build that information and then
+ * merge the results of both {@link #build(IntsRef, Iterable)}.
+ */
+public class AssociationsListBuilder implements CategoryListBuilder {
+
+  private final CategoryAssociationsContainer associations;
+  private final ByteArrayDataOutput output = new ByteArrayDataOutput();
+
+  public AssociationsListBuilder(CategoryAssociationsContainer associations) {
+    this.associations = associations;
+  }
+
+  /**
+   * Appends, for every category that has an association, the category's ordinal
+   * followed by the serialized association to the {@link BytesRef} kept under
+   * the association's category list ID. The i-th ordinal in {@code ordinals}
+   * must belong to the i-th category returned by {@code categories}.
+   */
+  @Override
+  public Map<String,BytesRef> build(IntsRef ordinals, Iterable<CategoryPath> categories) throws IOException {
+    final HashMap<String,BytesRef> res = new HashMap<String,BytesRef>();
+    int idx = 0;
+    for (CategoryPath cp : categories) {
+      // build per-association key BytesRef
+      CategoryAssociation association = associations.getAssociation(cp);
+      if (association == null) {
+        // it is ok to set a null association for a category - it's treated as a
+        // regular category in that case; its ordinal is still consumed.
+        ++idx;
+        continue;
+      }
+      BytesRef bytes = res.get(association.getCategoryListID());
+      if (bytes == null) {
+        bytes = new BytesRef(32);
+        res.put(association.getCategoryListID(), bytes);
+      }
+
+      // make room for the ordinal (an int) plus whatever the association may need
+      int maxBytesNeeded = 4 /* int */ + association.maxBytesNeeded() + bytes.length;
+      if (bytes.bytes.length < maxBytesNeeded) {
+        bytes.grow(maxBytesNeeded);
+      }
+
+      // reset the output to write from bytes.length (current position) until the end
+      output.reset(bytes.bytes, bytes.length, bytes.bytes.length - bytes.length);
+      output.writeInt(ordinals.ints[idx++]);
+      association.serialize(output); // encode the association bytes
+      bytes.length = output.getPosition(); // update BytesRef
+    }
+    return res;
+  }
+
+}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsPayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsPayloadIterator.java
index d5ffcd7632d..6ced1c70ce8 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsPayloadIterator.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/AssociationsPayloadIterator.java
@@ -53,20 +53,22 @@ public abstract class AssociationsPayloadIterator
}
/**
- * Skip to the requested document. Returns true iff the document has categort
- * association values and they were read successfully.
+ * Skip to the requested document. Returns true iff the document has category
+ * association values and they were read successfully. Associations are
+ * handled through {@link #handleAssociation(int, CategoryAssociation)} by
+ * extending classes.
*/
- public boolean setNextDoc(int docId) throws IOException {
+ protected final boolean setNextDoc(int docID) throws IOException {
if (!hasAssociations) { // there are no associations at all
return false;
}
- if (!pi.setdoc(docId)) { // no associations for the requested document
+ BytesRef bytes = pi.getPayload(docID);
+ if (bytes == null) { // no associations for the requested document
return false;
}
-
- BytesRef associations = pi.getPayload();
- ByteArrayDataInput in = new ByteArrayDataInput(associations.bytes, associations.offset, associations.length);
+
+ ByteArrayDataInput in = new ByteArrayDataInput(bytes.bytes, bytes.offset, bytes.length);
while (!in.eof()) {
int ordinal = in.readInt();
association.deserialize(in);
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryAssociationsContainer.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryAssociationsContainer.java
index 27b3e58470b..67e6c4c8e90 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryAssociationsContainer.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryAssociationsContainer.java
@@ -55,5 +55,10 @@ public class CategoryAssociationsContainer implements Iterable {
public void clear() {
categoryAssociations.clear();
}
+
+ @Override
+ public String toString() {
+ return categoryAssociations.toString(); // delegate to the wrapped categoryAssociations field
+ }
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryFloatAssociation.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryFloatAssociation.java
index bd826dfb60c..459e523e15d 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryFloatAssociation.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryFloatAssociation.java
@@ -71,5 +71,10 @@ public class CategoryFloatAssociation implements CategoryAssociation {
public float getValue() {
return value;
}
+
+  @Override
+  public String toString() {
+    return new StringBuilder(getClass().getSimpleName()).append("(").append(value).append(")").toString(); // SimpleName(value)
+  }
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryIntAssociation.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryIntAssociation.java
index 6758d7d00fc..1d3b691b0d3 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryIntAssociation.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/CategoryIntAssociation.java
@@ -72,4 +72,9 @@ public class CategoryIntAssociation implements CategoryAssociation {
return value;
}
+  @Override
+  public String toString() {
+    return new StringBuilder(getClass().getSimpleName()).append("(").append(value).append(")").toString(); // SimpleName(value)
+  }
+
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsPayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsPayloadIterator.java
index 958dfb49ff7..0708910523d 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsPayloadIterator.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/FloatAssociationsPayloadIterator.java
@@ -40,23 +40,17 @@ public class FloatAssociationsPayloadIterator extends AssociationsPayloadIterato
protected void handleAssociation(int ordinal, CategoryFloatAssociation association) {
ordinalAssociations.put(ordinal, association.getValue());
}
-
- @Override
- public boolean setNextDoc(int docId) throws IOException {
- ordinalAssociations.clear();
- return super.setNextDoc(docId);
- }
/**
- * Get the float association value for the given ordinal, or
- * {@link Float#NaN} in case the ordinal has no association value.
+ * Returns the float association values of the categories that are associated
+ * with the given document, or {@code null} if the document has no
+ * associations.
+ *
+ * NOTE: you are not expected to modify the returned map.
*/
- public float getAssociation(int ordinal) {
- if (ordinalAssociations.containsKey(ordinal)) {
- return ordinalAssociations.get(ordinal);
- }
-
- return Float.NaN;
+  public IntToFloatMap getAssociations(int docID) throws IOException {
+    ordinalAssociations.clear(); // the map is reused, so forget the values of the previous request
+    return !setNextDoc(docID) ? null : ordinalAssociations; // null when nothing was read for docID
   }
-
+
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsPayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsPayloadIterator.java
index ad983114084..e3bed4f51e5 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsPayloadIterator.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/associations/IntAssociationsPayloadIterator.java
@@ -31,12 +31,6 @@ public class IntAssociationsPayloadIterator extends AssociationsPayloadIterator<
private final IntToIntMap ordinalAssociations = new IntToIntMap();
- /**
- * The long-special-value returned for ordinals which have no associated int
- * value. It is not in the int range of values making it a valid mark.
- */
- public final static long NO_ASSOCIATION = Integer.MAX_VALUE + 1;
-
public IntAssociationsPayloadIterator(IndexReader reader, String field, CategoryIntAssociation association)
throws IOException {
super(reader, field, association);
@@ -47,22 +41,16 @@ public class IntAssociationsPayloadIterator extends AssociationsPayloadIterator<
ordinalAssociations.put(ordinal, association.getValue());
}
- @Override
- public boolean setNextDoc(int docId) throws IOException {
- ordinalAssociations.clear();
- return super.setNextDoc(docId);
- }
-
/**
- * Get the integer association value for the given ordinal, or
- * {@link #NO_ASSOCIATION} in case the ordinal has no association value.
+ * Returns the integer association values of the categories that are
+ * associated with the given document, or {@code null} if the document has no
+ * associations.
+ *
+ * NOTE: you are not expected to modify the returned map.
*/
- public long getAssociation(int ordinal) {
- if (ordinalAssociations.containsKey(ordinal)) {
- return ordinalAssociations.get(ordinal);
- }
-
- return NO_ASSOCIATION;
+  public IntToIntMap getAssociations(int docID) throws IOException {
+    ordinalAssociations.clear(); // the map is reused, so forget the values of the previous request
+    return !setNextDoc(docID) ? null : ordinalAssociations; // null when nothing was read for docID
   }
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/CategoryListBuilder.java b/lucene/facet/src/java/org/apache/lucene/facet/index/CategoryListBuilder.java
index 509c0c61684..0bea43235b4 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/index/CategoryListBuilder.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/index/CategoryListBuilder.java
@@ -1,19 +1,11 @@
package org.apache.lucene.facet.index;
import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map.Entry;
+import java.util.Map;
-import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy;
-import org.apache.lucene.facet.index.params.CategoryListParams;
-import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.taxonomy.CategoryPath;
-import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
-import org.apache.lucene.facet.util.PartitionsUtils;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.UnsafeByteArrayOutputStream;
-import org.apache.lucene.util.encoding.IntEncoder;
+import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -33,149 +25,14 @@ import org.apache.lucene.util.encoding.IntEncoder;
*/
/**
- * Builds a category list by encoding the category ordinals into one or more
- * {@link BytesRef}. Each {@link BytesRef} corresponds to a set of ordinals that
- * belong to the same partition. When partitions are not enabled (i.e.
- * {@link FacetIndexingParams#getPartitionSize()} returns
- * {@link Integer#MAX_VALUE}), only one {@link BytesRef} is returned by this
- * class.
+ * Builds a category list data by encoding the appropriate information for every
+ * category and ordinal given to {@link #build(IntsRef, Iterable)}.
+ *
+ * @lucene.experimental
*/
-public class CategoryListBuilder {
-
- /** Specializes encoding ordinals when partitions are enabled/disabled. */
- private static abstract class OrdinalsEncoder {
- OrdinalsEncoder() {}
- public abstract void encode(int ordinal);
- public abstract HashMap finish();
- }
+public interface CategoryListBuilder {
- private static final class NoPartitionsOrdinalsEncoder extends OrdinalsEncoder {
-
- private final IntEncoder encoder;
- private final UnsafeByteArrayOutputStream ubaos;
- private final String name;
-
- NoPartitionsOrdinalsEncoder(CategoryListParams categoryListParams) {
- name = categoryListParams.getTerm().text();
- encoder = categoryListParams.createEncoder();
- ubaos = new UnsafeByteArrayOutputStream();
- encoder.reInit(ubaos);
- }
-
- @Override
- public void encode(int ordinal) {
- try {
- encoder.encode(ordinal);
- } catch (IOException e) {
- // shouldn't happen as we're writing to byte[]
- throw new RuntimeException("unexpected exception", e);
- }
- }
-
- @Override
- public HashMap finish() {
- try {
- encoder.close();
- } catch (IOException e) {
- // shouldn't happen as we're writing to byte[]
- throw new RuntimeException("unexpected exception", e);
- }
- HashMap result = new HashMap();
- result.put(name, new BytesRef(ubaos.toByteArray(), ubaos.getStartPos(), ubaos.length()));
- return result;
- }
-
- }
-
- private static final class PerPartitionOrdinalsEncoder extends OrdinalsEncoder {
-
- private final FacetIndexingParams indexingParams;
- private final CategoryListParams categoryListParams;
- private final int partitionSize;
- private final HashMap partitionEncoder = new HashMap();
- private final HashMap partitionBytes = new HashMap();
-
- PerPartitionOrdinalsEncoder(FacetIndexingParams indexingParams, CategoryListParams categoryListParams) {
- this.indexingParams = indexingParams;
- this.categoryListParams = categoryListParams;
- this.partitionSize = indexingParams.getPartitionSize();
- }
-
- @Override
- public void encode(int ordinal) {
- final String name = PartitionsUtils.partitionNameByOrdinal(indexingParams, categoryListParams, ordinal);
- IntEncoder encoder = partitionEncoder.get(name);
- if (encoder == null) {
- encoder = categoryListParams.createEncoder();
- final UnsafeByteArrayOutputStream ubaos = new UnsafeByteArrayOutputStream();
- encoder.reInit(ubaos);
- partitionEncoder.put(name, encoder);
- partitionBytes.put(name, ubaos);
- }
- try {
- encoder.encode(ordinal % partitionSize);
- } catch (IOException e) {
- // shouldn't happen as we're writing to byte[]
- throw new RuntimeException("unexpected exception", e);
- }
- }
-
- @Override
- public HashMap finish() {
- // finish encoding
- IOUtils.closeWhileHandlingException(partitionEncoder.values());
-
- HashMap bytes = new HashMap();
- for (Entry e : partitionBytes.entrySet()) {
- UnsafeByteArrayOutputStream ubaos = e.getValue();
- bytes.put(e.getKey(), new BytesRef(ubaos.toByteArray(), ubaos.getStartPos(), ubaos.length()));
- }
- return bytes;
- }
-
- }
-
- private final TaxonomyWriter taxoWriter;
- private final OrdinalsEncoder ordinalsEncoder;
- private final OrdinalPolicy ordinalPolicy;
-
- public CategoryListBuilder(CategoryListParams categoryListParams, FacetIndexingParams indexingParams,
- TaxonomyWriter taxoWriter) {
- this.taxoWriter = taxoWriter;
- this.ordinalPolicy = indexingParams.getOrdinalPolicy();
- if (indexingParams.getPartitionSize() == Integer.MAX_VALUE) {
- ordinalsEncoder = new NoPartitionsOrdinalsEncoder(categoryListParams);
- } else {
- ordinalsEncoder = new PerPartitionOrdinalsEncoder(indexingParams, categoryListParams);
- }
- }
-
- /**
- * Encodes the given ordinal as well as any of its parent ordinals (per
- * {@link OrdinalPolicy}).
- */
- public void handle(int ordinal, CategoryPath cp) throws IOException {
- ordinalsEncoder.encode(ordinal);
-
- // add all parent ordinals, per OrdinalPolicy
- int parent = taxoWriter.getParent(ordinal);
- while (parent > 0) {
- if (ordinalPolicy.shouldAdd(parent)) {
- ordinalsEncoder.encode(parent);
- }
- parent = taxoWriter.getParent(parent);
- }
- }
-
- /**
- * Returns the encoded ordinals data. Every returned {@link BytesRef}
- * corresponds to a single partition (as defined by
- * {@link FacetIndexingParams#getPartitionSize()}) and the key denotes the
- * partition ID. When no partitions are defined, the returned map includes
- * only one value.
- */
- public HashMap finish() {
- return ordinalsEncoder.finish();
- }
+  /** Returns the encoded category list data, keyed by the name under which each {@link BytesRef} is stored; {@code ordinals} holds the ordinals of {@code categories}. */
+  public Map<String,BytesRef> build(IntsRef ordinals, Iterable<CategoryPath> categories) throws IOException;
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/CountingListBuilder.java b/lucene/facet/src/java/org/apache/lucene/facet/index/CountingListBuilder.java
new file mode 100644
index 00000000000..29b87b3f7b2
--- /dev/null
+++ b/lucene/facet/src/java/org/apache/lucene/facet/index/CountingListBuilder.java
@@ -0,0 +1,160 @@
+package org.apache.lucene.facet.index;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.lucene.facet.index.categorypolicy.OrdinalPolicy;
+import org.apache.lucene.facet.index.params.CategoryListParams;
+import org.apache.lucene.facet.index.params.FacetIndexingParams;
+import org.apache.lucene.facet.taxonomy.CategoryPath;
+import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
+import org.apache.lucene.facet.util.PartitionsUtils;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.encoding.IntEncoder;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A {@link CategoryListBuilder} which builds a counting list data by encoding
+ * the category ordinals into one or more {@link BytesRef}. Each
+ * {@link BytesRef} corresponds to a set of ordinals that belong to the same
+ * partition. When partitions are not enabled (i.e.
+ * {@link FacetIndexingParams#getPartitionSize()} returns
+ * {@link Integer#MAX_VALUE}), only one {@link BytesRef} is returned by this
+ * class.
+ * <p>
+ * Counting lists are used usually for computing the weight of categories by
+ * summing their number of occurrences (hence counting) in a result set.
+ */
+public class CountingListBuilder implements CategoryListBuilder {
+
+  /** Specializes encoding ordinals when partitions are enabled/disabled. */
+  private static abstract class OrdinalsEncoder {
+    OrdinalsEncoder() {}
+    public abstract Map<String,BytesRef> encode(IntsRef ordinals);
+  }
+
+  private static final class NoPartitionsOrdinalsEncoder extends OrdinalsEncoder {
+
+    private final IntEncoder encoder;
+    private final String name;
+
+    NoPartitionsOrdinalsEncoder(CategoryListParams categoryListParams) {
+      name = categoryListParams.getTerm().text();
+      encoder = categoryListParams.createEncoder();
+    }
+
+    @Override
+    public Map<String,BytesRef> encode(IntsRef ordinals) {
+      final BytesRef bytes = new BytesRef(128); // should be enough for most common applications
+      encoder.encode(ordinals, bytes);
+      return Collections.singletonMap(name, bytes);
+    }
+  }
+
+  private static final class PerPartitionOrdinalsEncoder extends OrdinalsEncoder {
+
+    private final FacetIndexingParams indexingParams;
+    private final CategoryListParams categoryListParams;
+    private final int partitionSize;
+    private final HashMap<String,IntEncoder> partitionEncoder = new HashMap<String,IntEncoder>();
+
+    PerPartitionOrdinalsEncoder(FacetIndexingParams indexingParams, CategoryListParams categoryListParams) {
+      this.indexingParams = indexingParams;
+      this.categoryListParams = categoryListParams;
+      this.partitionSize = indexingParams.getPartitionSize();
+    }
+
+    @Override
+    public HashMap<String,BytesRef> encode(IntsRef ordinals) {
+      // build the partitionOrdinals map
+      final HashMap<String,IntsRef> partitionOrdinals = new HashMap<String,IntsRef>();
+      for (int i = 0; i < ordinals.length; i++) {
+        int ordinal = ordinals.ints[i];
+        final String name = PartitionsUtils.partitionNameByOrdinal(indexingParams, categoryListParams, ordinal);
+        IntsRef partitionOrds = partitionOrdinals.get(name);
+        if (partitionOrds == null) {
+          partitionOrds = new IntsRef(32);
+          partitionOrdinals.put(name, partitionOrds);
+          partitionEncoder.put(name, categoryListParams.createEncoder());
+        }
+        partitionOrds.grow(partitionOrds.length + 1); // 32 is only the initial capacity
+        partitionOrds.ints[partitionOrds.length++] = ordinal % partitionSize;
+      }
+
+      HashMap<String,BytesRef> partitionBytes = new HashMap<String,BytesRef>();
+      for (Entry<String,IntsRef> e : partitionOrdinals.entrySet()) {
+        String name = e.getKey();
+        final IntEncoder encoder = partitionEncoder.get(name);
+        final BytesRef bytes = new BytesRef(128); // should be enough for most common applications
+        encoder.encode(e.getValue(), bytes);
+        partitionBytes.put(name, bytes);
+      }
+      return partitionBytes;
+    }
+  }
+
+  private final OrdinalsEncoder ordinalsEncoder;
+  private final TaxonomyWriter taxoWriter;
+  private final OrdinalPolicy ordinalPolicy;
+
+  public CountingListBuilder(CategoryListParams categoryListParams, FacetIndexingParams indexingParams,
+      TaxonomyWriter taxoWriter) {
+    this.taxoWriter = taxoWriter;
+    this.ordinalPolicy = indexingParams.getOrdinalPolicy();
+    if (indexingParams.getPartitionSize() == Integer.MAX_VALUE) {
+      ordinalsEncoder = new NoPartitionsOrdinalsEncoder(categoryListParams);
+    } else {
+      ordinalsEncoder = new PerPartitionOrdinalsEncoder(indexingParams, categoryListParams);
+    }
+  }
+
+  /**
+   * Every returned {@link BytesRef} corresponds to a single partition (as
+   * defined by {@link FacetIndexingParams#getPartitionSize()}) and the key
+   * denotes the partition ID. When no partitions are defined, the returned map
+   * contains only one value.
+   * <p>
+   * <b>NOTE:</b> the {@code ordinals} array is modified by adding parent
+   * ordinals to it (it is grown when it has no room for them). Also, some
+   * encoders may sort the array and remove duplicate ordinals. Therefore you
+   * may want to invoke this method after you finished processing the array for
+   * other purposes.
+   */
+  @Override
+  public Map<String,BytesRef> build(IntsRef ordinals, Iterable<CategoryPath> categories) throws IOException {
+    int upto = ordinals.length; // since we add ordinals to IntsRef, iterate upto original length
+    for (int i = 0; i < upto; i++) {
+      int ordinal = ordinals.ints[i];
+      int parent = taxoWriter.getParent(ordinal);
+      while (parent > 0) {
+        if (ordinalPolicy.shouldAdd(parent)) {
+          ordinals.grow(ordinals.length + 1); // callers are not required to pre-size the array
+          ordinals.ints[ordinals.length++] = parent;
+        }
+        parent = taxoWriter.getParent(parent);
+      }
+    }
+    return ordinalsEncoder.encode(ordinals);
+  }
+
+}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java b/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java
index 07249aaec99..9fda2396ee5 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java
@@ -2,6 +2,7 @@ package org.apache.lucene.facet.index;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
@@ -21,6 +22,7 @@ import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -69,7 +71,7 @@ public class FacetFields {
return true;
}
- void setCategoriesData(HashMap categoriesData) {
+ void setCategoriesData(Map categoriesData) {
this.categoriesData = categoriesData.entrySet().iterator();
}
@@ -132,6 +134,9 @@ public class FacetFields {
*/
protected Map> createCategoryListMapping(
Iterable categories) {
+ if (indexingParams.getAllCategoryListParams().size() == 1) {
+ return Collections.singletonMap(indexingParams.getCategoryListParams(null), categories);
+ }
HashMap> categoryLists =
new HashMap>();
for (CategoryPath cp : categories) {
@@ -147,10 +152,15 @@ public class FacetFields {
return categoryLists;
}
- /** Returns a {@link CategoryListBuilder} for encoding the given categories. */
- protected CategoryListBuilder getCategoryListBuilder(CategoryListParams categoryListParams,
- Iterable categories /* needed for AssociationsFacetFields */) {
- return new CategoryListBuilder(categoryListParams, indexingParams, taxonomyWriter);
+ /**
+ * Returns the category list data, as a mapping from key to {@link BytesRef}
+ * which includes the encoded data. Every ordinal in {@code ordinals}
+ * corresponds to a {@link CategoryPath} returned from {@code categories}.
+ */
+ protected Map getCategoryListData(CategoryListParams categoryListParams,
+ IntsRef ordinals, Iterable categories /* needed for AssociationsFacetFields */)
+ throws IOException {
+ return new CountingListBuilder(categoryListParams, indexingParams, taxonomyWriter).build(ordinals, categories);
}
/**
@@ -185,17 +195,25 @@ public class FacetFields {
// for each CLP we add a different field for drill-down terms as well as for
// counting list data.
+ IntsRef ordinals = new IntsRef(32); // should be enough for most common applications
for (Entry> e : categoryLists.entrySet()) {
final CategoryListParams clp = e.getKey();
final String field = clp.getTerm().field();
- // add the counting list data
- CategoryListBuilder categoriesPayloadBuilder = getCategoryListBuilder(clp, e.getValue());
+ // build category list data
+ ordinals.length = 0; // reset
+ int maxNumOrds = 0;
for (CategoryPath cp : e.getValue()) {
int ordinal = taxonomyWriter.addCategory(cp);
- categoriesPayloadBuilder.handle(ordinal , cp);
+ maxNumOrds += cp.length; // ordinal and potentially all parents
+ if (ordinals.ints.length < maxNumOrds) {
+ ordinals.grow(maxNumOrds);
+ }
+ ordinals.ints[ordinals.length++] = ordinal;
}
- HashMap categoriesData = categoriesPayloadBuilder.finish();
+ Map categoriesData = getCategoryListData(clp, ordinals, e.getValue());
+
+ // add the counting list data
CountingListStream ts = new CountingListStream();
ts.setCategoriesData(categoriesData);
doc.add(new Field(field, ts, COUNTING_LIST_PAYLOAD_TYPE));
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/OrdinalMappingAtomicReader.java b/lucene/facet/src/java/org/apache/lucene/facet/index/OrdinalMappingAtomicReader.java
index e2f6a5acd88..522f383e4d4 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/index/OrdinalMappingAtomicReader.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/index/OrdinalMappingAtomicReader.java
@@ -17,10 +17,7 @@ package org.apache.lucene.facet.index;
* limitations under the License.
*/
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
import java.io.IOException;
-import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
@@ -36,6 +33,7 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.encoding.IntDecoder;
import org.apache.lucene.util.encoding.IntEncoder;
@@ -187,7 +185,7 @@ public class OrdinalMappingAtomicReader extends FilterAtomicReader {
private class OrdinalMappingDocsAndPositionsEnum extends FilterDocsAndPositionsEnum {
private final IntEncoder encoder;
private final IntDecoder decoder;
- private final ByteArrayOutputStream os = new ByteArrayOutputStream();
+ private final IntsRef ordinals = new IntsRef(32);
private final BytesRef payloadOut = new BytesRef();
public OrdinalMappingDocsAndPositionsEnum(DocsAndPositionsEnum in, CategoryListParams params) {
@@ -202,21 +200,14 @@ public class OrdinalMappingAtomicReader extends FilterAtomicReader {
if (payload == null) {
return payload;
} else {
- InputStream is = new ByteArrayInputStream(payload.bytes, payload.offset, payload.length);
- decoder.reInit(is);
- os.reset();
- encoder.reInit(os);
- long ordinal;
- while ((ordinal = decoder.decode()) != IntDecoder.EOS) {
- int newOrdinal = ordinalMap[(int)ordinal];
- encoder.encode(newOrdinal);
+ decoder.decode(payload, ordinals);
+
+ // map the ordinals
+ for (int i = 0; i < ordinals.length; i++) {
+ ordinals.ints[i] = ordinalMap[ordinals.ints[i]];
}
- encoder.close();
- // TODO (Facet): avoid copy?
- byte out[] = os.toByteArray();
- payloadOut.bytes = out;
- payloadOut.offset = 0;
- payloadOut.length = out.length;
+
+ encoder.encode(ordinals, payloadOut);
return payloadOut;
}
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java b/lucene/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java
index 349da82d7ef..576b9be077a 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/index/params/CategoryListParams.java
@@ -7,7 +7,7 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.facet.search.CategoryListIterator;
-import org.apache.lucene.facet.search.PayloadIntDecodingIterator;
+import org.apache.lucene.facet.search.PayloadCategoryListIteraor;
import org.apache.lucene.facet.search.TotalFacetCounts;
import org.apache.lucene.facet.util.PartitionsUtils;
import org.apache.lucene.util.encoding.DGapIntEncoder;
@@ -142,7 +142,7 @@ public class CategoryListParams implements Serializable {
int partition) throws IOException {
String categoryListTermStr = PartitionsUtils.partitionName(this, partition);
Term payloadTerm = new Term(term.field(), categoryListTermStr);
- return new PayloadIntDecodingIterator(reader, payloadTerm,
+ return new PayloadCategoryListIteraor(reader, payloadTerm,
createEncoder().createMatchingDecoder());
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/params/FacetIndexingParams.java b/lucene/facet/src/java/org/apache/lucene/facet/index/params/FacetIndexingParams.java
index 62502e75443..3fca0e3a4aa 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/index/params/FacetIndexingParams.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/index/params/FacetIndexingParams.java
@@ -83,18 +83,9 @@ public class FacetIndexingParams {
}
/**
- * The name of the category list to put this category in, or {@code null} if
- * this category should not be aggregatable.
- *
- * By default, all categories are written to the same category list, but
- * applications which know in advance that in some situations only parts of
- * the category hierarchy needs to be counted can divide the categories into
- * two or more different category lists.
- *
- * If {@code null} is returned for a category, it means that this category
- * should not appear in any category list, and thus weights for it cannot be
- * aggregated. This category can still be used for drill-down, even though the
- * its weight is unknown.
+ * Returns the {@link CategoryListParams} for this {@link CategoryPath}. The
+ * default implementation returns the same {@link CategoryListParams} for all
+ * categories (even if {@code category} is {@code null}).
*
* @see PerDimensionIndexingParams
*/
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/params/PerDimensionIndexingParams.java b/lucene/facet/src/java/org/apache/lucene/facet/index/params/PerDimensionIndexingParams.java
index 1ccb58d309f..c4b971a5ca9 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/index/params/PerDimensionIndexingParams.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/index/params/PerDimensionIndexingParams.java
@@ -78,7 +78,9 @@ public class PerDimensionIndexingParams extends FacetIndexingParams {
/**
* Returns the {@link CategoryListParams} for the corresponding dimension
- * which is returned by {@code category.getComponent(0)}.
+ * which is returned by {@code category.getComponent(0)}. If {@code category}
+ * is {@code null}, or was not specified in the map given to the constructor,
+ * returns the default {@link CategoryListParams}.
*/
@Override
public CategoryListParams getCategoryListParams(CategoryPath category) {
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/CategoryListIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/CategoryListIterator.java
index 79f89909030..5132e930264 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/search/CategoryListIterator.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/search/CategoryListIterator.java
@@ -2,6 +2,8 @@ package org.apache.lucene.facet.search;
import java.io.IOException;
+import org.apache.lucene.util.IntsRef;
+
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -20,20 +22,10 @@ import java.io.IOException;
*/
/**
- * An interface for iterating over a "category list", i.e., the list of
- * categories per document.
+ * An interface for obtaining the category ordinals of documents.
*
- * NOTE:
- *
- * - This class operates as a key to a Map. Appropriate implementation of
- * hashCode() and equals() must be provided.
- * - {@link #init()} must be called before you consume any categories, or call
- * {@link #skipTo(int)}.
- * - {@link #skipTo(int)} must be called before any calls to
- * {@link #nextCategory()}.
- * - {@link #nextCategory()} returns values < {@link Integer#MAX_VALUE}, so
- * you can use it as a stop condition.
- *
+ * NOTE: this class operates as a key to a map, and therefore you should
+ * implement {@code equals()} and {@code hashCode()} for proper behavior.
*
* @lucene.experimental
*/
@@ -41,29 +33,20 @@ public interface CategoryListIterator {
/**
* Initializes the iterator. This method must be called before any calls to
- * {@link #skipTo(int)}, and its return value indicates whether there are
- * any relevant documents for this iterator. If it returns false, any call
- * to {@link #skipTo(int)} will return false as well.
- * NOTE: calling this method twice may result in skipping over
- * documents for some implementations. Also, calling it again after all
- * documents were consumed may yield unexpected behavior.
+ * {@link #getOrdinals(int, IntsRef)}, and its return value indicates whether there are
+ * any relevant documents for this iterator.
*/
public boolean init() throws IOException;
/**
- * Skips forward to document docId. Returns true iff this document exists
- * and has any categories. This method must be called before calling
- * {@link #nextCategory()} for a particular document.
- * NOTE: Users should call this method with increasing docIds, and
- * implementations can assume that this is the case.
+ * Stores the category ordinals of the given document ID in the given
+ * {@link IntsRef}, starting at position 0 up to {@link IntsRef#length}. Grows
+ * the {@link IntsRef} if it is not large enough.
+ *
+ *
+ * NOTE: if the requested document does not have category ordinals
+ * associated with it, {@link IntsRef#length} is set to zero.
*/
- public boolean skipTo(int docId) throws IOException;
-
- /**
- * Returns the next category for the current document that is set through
- * {@link #skipTo(int)}, or a number higher than {@link Integer#MAX_VALUE}.
- * No assumptions can be made on the order of the categories.
- */
- public long nextCategory() throws IOException;
-
+ public void getOrdinals(int docID, IntsRef ints) throws IOException;
+
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIntDecodingIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadCategoryListIteraor.java
similarity index 52%
rename from lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIntDecodingIterator.java
rename to lucene/facet/src/java/org/apache/lucene/facet/search/PayloadCategoryListIteraor.java
index db4803e1019..3deba112f3e 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIntDecodingIterator.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadCategoryListIteraor.java
@@ -5,7 +5,7 @@ import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.UnsafeByteArrayInputStream;
+import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.encoding.IntDecoder;
/*
@@ -26,44 +26,21 @@ import org.apache.lucene.util.encoding.IntDecoder;
*/
/**
- * A payload deserializer comes with its own working space (buffer). One need to
- * define the {@link IndexReader} and {@link Term} in which the payload resides.
- * The iterator then consumes the payload information of each document and
- * decodes it into categories. A typical use case of this class is:
- *
- *
- * IndexReader reader = [open your reader];
- * Term t = new Term("field", "where-payload-exists");
- * CategoryListIterator cli = new PayloadIntDecodingIterator(reader, t);
- * if (!cli.init()) {
- * // it means there are no payloads / documents associated with that term.
- * // Usually a sanity check. However, init() must be called.
- * }
- * DocIdSetIterator disi = [you usually iterate on something else, such as a Scorer];
- * int doc;
- * while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
- * cli.setdoc(doc);
- * long category;
- * while ((category = cli.nextCategory()) < Integer.MAX_VALUE) {
- * }
- * }
- *
+ * A {@link CategoryListIterator} which reads the category ordinals from a
+ * payload.
*
* @lucene.experimental
*/
-public class PayloadIntDecodingIterator implements CategoryListIterator {
+public class PayloadCategoryListIteraor implements CategoryListIterator {
- private final UnsafeByteArrayInputStream ubais;
private final IntDecoder decoder;
-
private final IndexReader indexReader;
private final Term term;
private final PayloadIterator pi;
private final int hashCode;
- public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder) throws IOException {
+ public PayloadCategoryListIteraor(IndexReader indexReader, Term term, IntDecoder decoder) throws IOException {
pi = new PayloadIterator(indexReader, term);
- ubais = new UnsafeByteArrayInputStream();
this.decoder = decoder;
hashCode = indexReader.hashCode() ^ term.hashCode();
this.term = term;
@@ -72,10 +49,10 @@ public class PayloadIntDecodingIterator implements CategoryListIterator {
@Override
public boolean equals(Object other) {
- if (!(other instanceof PayloadIntDecodingIterator)) {
+ if (!(other instanceof PayloadCategoryListIteraor)) {
return false;
}
- PayloadIntDecodingIterator that = (PayloadIntDecodingIterator) other;
+ PayloadCategoryListIteraor that = (PayloadCategoryListIteraor) other;
if (hashCode != that.hashCode) {
return false;
}
@@ -95,21 +72,12 @@ public class PayloadIntDecodingIterator implements CategoryListIterator {
}
@Override
- public long nextCategory() throws IOException {
- return decoder.decode();
- }
-
- @Override
- public boolean skipTo(int docId) throws IOException {
- if (!pi.setdoc(docId)) {
- return false;
+ public void getOrdinals(int docID, IntsRef ints) throws IOException {
+ ints.length = 0;
+ BytesRef payload = pi.getPayload(docID);
+ if (payload != null) {
+ decoder.decode(payload, ints);
}
-
- // Initializing the decoding mechanism with the new payload data
- BytesRef data = pi.getPayload();
- ubais.reInit(data.bytes, data.offset, data.length + data.offset);
- decoder.reInit(ubais);
- return true;
}
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java
index a12f2cd91f6..7cc7527280d 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java
@@ -34,9 +34,9 @@ import org.apache.lucene.util.BytesRef;
* A utility class for iterating through a posting list of a given term and
* retrieving the payload of the first position in every document. For
* efficiency, this class does not check if documents passed to
- * {@link #setdoc(int)} are deleted, since it is usually used to iterate on
+ * {@link #getPayload(int)} are deleted, since it is usually used to iterate on
* payloads of documents that matched a query. If you need to skip over deleted
- * documents, you should do so before calling {@link #setdoc(int)}.
+ * documents, you should do so before calling {@link #getPayload(int)}.
*
* @lucene.experimental
*/
@@ -84,8 +84,8 @@ public class PayloadIterator {
/**
* Initialize the iterator. Should be done before the first call to
- * {@link #setdoc(int)}. Returns {@code false} if no category list is found,
- * or the category list has no documents.
+ * {@link #getPayload(int)}. Returns {@code false} if no category list is
+ * found, or the category list has no documents.
*/
public boolean init() throws IOException {
nextSegment();
@@ -93,30 +93,29 @@ public class PayloadIterator {
}
/**
- * Skip forward to document docId. Return true if this document exists and
- * has any payload.
- *
- * Users should call this method with increasing docIds, and implementations
- * can assume that this is the case.
+ * Returns the {@link BytesRef payload} of the given document, or {@code null}
+ * if the document does not exist, there are no more documents in the posting
+ * list, or the document exists but has no payload. You should call
+ * {@link #init()} before the first call to this method.
*/
- public boolean setdoc(int docId) throws IOException {
+ public BytesRef getPayload(int docID) throws IOException {
if (!hasMore) {
- return false;
+ return null;
}
// re-basing docId->localDocID is done fewer times than currentDoc->globalDoc
- int localDocID = docId - curDocBase;
+ int localDocID = docID - curDocBase;
if (curDocID > localDocID) {
// document does not exist
- return false;
+ return null;
}
if (curDocID < localDocID) {
// look for the document either in that segment, or others
while (hasMore && (curDocID = currentDPE.advance(localDocID)) == DocIdSetIterator.NO_MORE_DOCS) {
nextSegment(); // also updates curDocID
- localDocID = docId - curDocBase;
+ localDocID = docID - curDocBase;
// nextSegment advances to nextDoc, so check if we still need to advance
if (curDocID >= localDocID) {
break;
@@ -127,7 +126,7 @@ public class PayloadIterator {
// 1. we iterated over all segments (hasMore=false)
// 2. current segment advanced to a doc, either requested or higher
if (!hasMore || curDocID != localDocID) {
- return false;
+ return null;
}
}
@@ -135,12 +134,7 @@ public class PayloadIterator {
assert currentDPE.freq() == 1 : "expecting freq=1 (got " + currentDPE.freq() + ") term=" + term + " doc=" + (curDocID + curDocBase);
int pos = currentDPE.nextPosition();
assert pos != -1 : "no positions for term=" + term + " doc=" + (curDocID + curDocBase);
- data = currentDPE.getPayload();
- return data != null;
+ return currentDPE.getPayload();
}
- public BytesRef getPayload() {
- return data;
- }
-
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java
index 78b362ea063..32466269d60 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/search/StandardFacetsAccumulator.java
@@ -10,6 +10,7 @@ import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.util.IntsRef;
import org.apache.lucene.facet.search.aggregator.Aggregator;
import org.apache.lucene.facet.search.params.FacetSearchParams;
@@ -231,9 +232,9 @@ public class StandardFacetsAccumulator extends FacetsAccumulator {
facetArrays.free(); // to get a cleared array for this partition
}
- HashMap categoryLists = getCategoryListMap(
- facetArrays, partition);
+ HashMap categoryLists = getCategoryListMap(facetArrays, partition);
+ IntsRef ordinals = new IntsRef(32); // a reasonable start capacity for most common apps
for (Entry entry : categoryLists.entrySet()) {
CategoryListIterator categoryList = entry.getKey();
if (!categoryList.init()) {
@@ -244,14 +245,11 @@ public class StandardFacetsAccumulator extends FacetsAccumulator {
ScoredDocIDsIterator iterator = docids.iterator();
while (iterator.next()) {
int docID = iterator.getDocID();
- if (!categoryList.skipTo(docID)) {
+ categoryList.getOrdinals(docID, ordinals);
+ if (ordinals.length == 0) {
continue;
}
- categorator.setNextDoc(docID, iterator.getScore());
- long ordinal;
- while ((ordinal = categoryList.nextCategory()) <= Integer.MAX_VALUE) {
- categorator.aggregate((int) ordinal);
- }
+ categorator.aggregate(docID, iterator.getScore(), ordinals);
}
}
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java
index 447ddd9b49d..a3743317cc8 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/Aggregator.java
@@ -2,6 +2,8 @@ package org.apache.lucene.facet.search.aggregator;
import java.io.IOException;
+import org.apache.lucene.util.IntsRef;
+
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -36,16 +38,9 @@ import java.io.IOException;
public interface Aggregator {
/**
- * Specify the document (and its score in the search) that the following
- * {@link #aggregate(int)} calls will pertain to.
+ * Aggregate the ordinals of the given document ID (and its score). The given
+ * ordinals offset is always zero.
*/
- void setNextDoc(int docid, float score) throws IOException;
-
- /**
- * Collect (and do whatever an implementation deems appropriate) the
- * category given by its ordinal. This category belongs to a document
- * given earlier by {@link #setNextDoc(int, float)}.
- */
- void aggregate(int ordinal);
-
+ public void aggregate(int docID, float score, IntsRef ordinals) throws IOException;
+
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ComplementCountingAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ComplementCountingAggregator.java
index 3bf62c4e520..50ca39fa352 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ComplementCountingAggregator.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ComplementCountingAggregator.java
@@ -1,5 +1,9 @@
package org.apache.lucene.facet.search.aggregator;
+import java.io.IOException;
+
+import org.apache.lucene.util.IntsRef;
+
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -29,9 +33,12 @@ public class ComplementCountingAggregator extends CountingAggregator {
}
@Override
- public void aggregate(int ordinal) {
- assert counterArray[ordinal]!=0:"complement aggregation: count is about to become negative for ordinal "+ordinal;
- --counterArray[ordinal];
+ public void aggregate(int docID, float score, IntsRef ordinals) throws IOException {
+ for (int i = 0; i < ordinals.length; i++) {
+ int ord = ordinals.ints[i];
+ assert counterArray[ord] != 0 : "complement aggregation: count is about to become negative for ordinal " + ord;
+ --counterArray[ord];
+ }
}
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java
index 5dd54a06b3a..8cd71595dc8 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/CountingAggregator.java
@@ -1,5 +1,9 @@
package org.apache.lucene.facet.search.aggregator;
+import java.io.IOException;
+
+import org.apache.lucene.util.IntsRef;
+
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -27,21 +31,17 @@ package org.apache.lucene.facet.search.aggregator;
public class CountingAggregator implements Aggregator {
protected int[] counterArray;
-
- @Override
- public void aggregate(int ordinal) {
- ++counterArray[ordinal];
- }
-
- @Override
- public void setNextDoc(int docid, float score) {
- // There's nothing for us to do here since we only increment the count by 1
- // in this aggregator.
- }
-
+
public CountingAggregator(int[] counterArray) {
this.counterArray = counterArray;
}
+
+ @Override
+ public void aggregate(int docID, float score, IntsRef ordinals) throws IOException {
+ for (int i = 0; i < ordinals.length; i++) {
+ counterArray[ordinals.ints[i]]++;
+ }
+ }
@Override
public boolean equals(Object obj) {
@@ -54,8 +54,7 @@ public class CountingAggregator implements Aggregator {
@Override
public int hashCode() {
- int hashCode = counterArray == null ? 0 : counterArray.hashCode();
-
- return hashCode;
+ return counterArray == null ? 0 : counterArray.hashCode();
}
+
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java
index ef788968138..6c3dc492703 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/ScoringAggregator.java
@@ -1,5 +1,9 @@
package org.apache.lucene.facet.search.aggregator;
+import java.io.IOException;
+
+import org.apache.lucene.util.IntsRef;
+
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -26,7 +30,6 @@ package org.apache.lucene.facet.search.aggregator;
public class ScoringAggregator implements Aggregator {
private final float[] scoreArray;
- private float score;
private final int hashCode;
public ScoringAggregator(float[] counterArray) {
@@ -35,10 +38,12 @@ public class ScoringAggregator implements Aggregator {
}
@Override
- public void aggregate(int ordinal) {
- scoreArray[ordinal] += score;
+ public void aggregate(int docID, float score, IntsRef ordinals) throws IOException {
+ for (int i = 0; i < ordinals.length; i++) {
+ scoreArray[ordinals.ints[i]] += score;
+ }
}
-
+
@Override
public boolean equals(Object obj) {
if (obj == null || obj.getClass() != this.getClass()) {
@@ -53,8 +58,4 @@ public class ScoringAggregator implements Aggregator {
return hashCode;
}
- @Override
- public void setNextDoc(int docid, float score) {
- this.score = score;
- }
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationFloatSumAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationFloatSumAggregator.java
index 57814d7cf57..22ebfecdba9 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationFloatSumAggregator.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationFloatSumAggregator.java
@@ -7,6 +7,8 @@ import org.apache.lucene.facet.associations.FloatAssociationsPayloadIterator;
import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.search.aggregator.Aggregator;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.collections.IntToFloatMap;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -48,13 +50,18 @@ public class AssociationFloatSumAggregator implements Aggregator {
}
@Override
- public void aggregate(int ordinal) {
- float association = associations.getAssociation(ordinal);
- if (!Float.isNaN(association)) {
- sumArray[ordinal] += association;
+ public void aggregate(int docID, float score, IntsRef ordinals) throws IOException {
+ IntToFloatMap values = associations.getAssociations(docID);
+ if (values != null) {
+ for (int i = 0; i < ordinals.length; i++) {
+ int ord = ordinals.ints[i];
+ if (values.containsKey(ord)) {
+ sumArray[ord] += values.get(ord);
+ }
+ }
}
}
-
+
@Override
public boolean equals(Object obj) {
if (obj == null || obj.getClass() != this.getClass()) {
@@ -69,9 +76,4 @@ public class AssociationFloatSumAggregator implements Aggregator {
return field.hashCode();
}
- @Override
- public void setNextDoc(int docid, float score) throws IOException {
- associations.setNextDoc(docid);
- }
-
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationIntSumAggregator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationIntSumAggregator.java
index 42260597376..2f12080a35b 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationIntSumAggregator.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/search/aggregator/associations/AssociationIntSumAggregator.java
@@ -7,6 +7,8 @@ import org.apache.lucene.facet.associations.IntAssociationsPayloadIterator;
import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.search.aggregator.Aggregator;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.collections.IntToIntMap;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -48,13 +50,18 @@ public class AssociationIntSumAggregator implements Aggregator {
}
@Override
- public void aggregate(int ordinal) {
- long association = associations.getAssociation(ordinal);
- if (association != IntAssociationsPayloadIterator.NO_ASSOCIATION) {
- sumArray[ordinal] += association;
+ public void aggregate(int docID, float score, IntsRef ordinals) throws IOException {
+ IntToIntMap values = associations.getAssociations(docID);
+ if (values != null) {
+ for (int i = 0; i < ordinals.length; i++) {
+ int ord = ordinals.ints[i];
+ if (values.containsKey(ord)) {
+ sumArray[ord] += values.get(ord);
+ }
+ }
}
}
-
+
@Override
public boolean equals(Object obj) {
if (obj == null || obj.getClass() != this.getClass()) {
@@ -69,9 +76,4 @@ public class AssociationIntSumAggregator implements Aggregator {
return field.hashCode();
}
- @Override
- public void setNextDoc(int docid, float score) throws IOException {
- associations.setNextDoc(docid);
- }
-
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java b/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java
index cc0daa4f6fe..4dc5c694a8c 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/search/cache/CategoryListData.java
@@ -2,13 +2,12 @@ package org.apache.lucene.facet.search.cache;
import java.io.IOException;
-import org.apache.lucene.index.IndexReader;
-
import org.apache.lucene.facet.index.params.CategoryListParams;
import org.apache.lucene.facet.index.params.FacetIndexingParams;
import org.apache.lucene.facet.search.CategoryListIterator;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
-import org.apache.lucene.util.collections.IntArray;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -56,33 +55,26 @@ public class CategoryListData {
protected CategoryListData() {
}
- /**
- * Compute category list data for caching for faster iteration.
- */
+ /** Compute category list data for caching for faster iteration. */
CategoryListData(IndexReader reader, TaxonomyReader taxo,
FacetIndexingParams iparams, CategoryListParams clp) throws IOException {
final int maxDoc = reader.maxDoc();
int[][][]dpf = new int[maxDoc][][];
int numPartitions = (int)Math.ceil(taxo.getSize()/(double)iparams.getPartitionSize());
- IntArray docCategories = new IntArray();
- for (int part=0; part 0) {
+ if (dpf[doc] == null) {
dpf[doc] = new int[numPartitions][];
}
- long category;
- while ((category = cli.nextCategory()) <= Integer.MAX_VALUE) {
- docCategories.addToArray((int)category);
- }
- final int size = docCategories.size();
- dpf[doc][part] = new int[size];
- for (int i=0; ipart;
+ return dpc != null && dpc.length > part;
}
@Override
- public long nextCategory() throws IOException {
- if (nextCategoryIndex >= dpc[currDoc][part].length) {
- return 1L+Integer.MAX_VALUE;
+ public void getOrdinals(int docID, IntsRef ints) throws IOException {
+ ints.length = 0;
+ if (dpc.length > docID && dpc[docID] != null && dpc[docID][part] != null) {
+ if (ints.ints.length < dpc[docID][part].length) {
+ ints.grow(dpc[docID][part].length);
+ }
+ ints.length = 0;
+ for (int i = 0; i < dpc[docID][part].length; i++) {
+ ints.ints[ints.length++] = dpc[docID][part][i];
+ }
}
- return dpc[currDoc][part][nextCategoryIndex++];
- }
-
- @Override
- public boolean skipTo(int docId) throws IOException {
- final boolean res = dpc.length>docId && dpc[docId]!=null && dpc[docId][part]!=null;
- if (res) {
- currDoc = docId;
- nextCategoryIndex = 0;
- }
- return res;
}
}
+
}
\ No newline at end of file
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java b/lucene/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java
index 9a55244a5ee..71c7df7f0c4 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/search/params/CountFacetRequest.java
@@ -48,8 +48,7 @@ public class CountFacetRequest extends FacetRequest {
@Override
public Aggregator createAggregator(boolean useComplements,
- FacetArrays arrays, IndexReader reader,
- TaxonomyReader taxonomy) {
+ FacetArrays arrays, IndexReader reader, TaxonomyReader taxonomy) {
// we rely on that, if needed, result is cleared by arrays!
int[] a = arrays.getIntArray();
if (useComplements) {
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/util/MultiCategoryListIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/util/MultiCategoryListIterator.java
index 13fc778bad1..402ba3dcf69 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/util/MultiCategoryListIterator.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/util/MultiCategoryListIterator.java
@@ -5,6 +5,7 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.facet.search.CategoryListIterator;
+import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -33,16 +34,13 @@ public class MultiCategoryListIterator implements CategoryListIterator {
private final CategoryListIterator[] iterators;
private final List validIterators;
- private final List perDocValidIterators;
/** Receives the iterators to iterate on */
public MultiCategoryListIterator(CategoryListIterator... iterators) {
this.iterators = iterators;
this.validIterators = new ArrayList();
- this.perDocValidIterators = new ArrayList();
}
- /** Fails if all given iterators fail to init */
@Override
public boolean init() throws IOException {
for (CategoryListIterator cli : iterators) {
@@ -52,35 +50,26 @@ public class MultiCategoryListIterator implements CategoryListIterator {
}
return !validIterators.isEmpty();
}
-
- /**
- * Return a value larger than {@link Integer#MAX_VALUE} only if all
- * iterators are exhausted
- */
+
+ /**
+ * Stores in {@code ints} the concatenation of the ordinals that every
+ * iterator in {@code validIterators} returns for {@code docID}, starting at
+ * position 0. {@code ints.length} is zero if none of them has ordinals for
+ * the document.
+ */
@Override
- public long nextCategory() throws IOException {
- while (!perDocValidIterators.isEmpty()) {
- long value = perDocValidIterators.get(0).nextCategory();
- if (value <= Integer.MAX_VALUE) {
- return value;
- }
- perDocValidIterators.remove(0);
- }
- return 0x100000000L;
- }
-
- /**
- * Fails only if skipTo on all the provided iterators returned {@code false}
- */
- @Override
- public boolean skipTo(int docId) throws IOException {
- perDocValidIterators.clear();
+ public void getOrdinals(int docID, IntsRef ints) throws IOException {
+ IntsRef tmp = new IntsRef(ints.length);
+ ints.length = 0; // discard whatever the caller left from a previous document
for (CategoryListIterator cli : validIterators) {
- if (cli.skipTo(docId)) {
- perDocValidIterators.add(cli);
+ cli.getOrdinals(docID, tmp);
+ if (ints.ints.length < ints.length + tmp.length) {
+ ints.grow(ints.length + tmp.length);
}
+ // append this iterator's ordinals before advancing the length
+ System.arraycopy(tmp.ints, 0, ints.ints, ints.length, tmp.length);
+ ints.length += tmp.length;
}
- return !perDocValidIterators.isEmpty();
}
}
diff --git a/lucene/facet/src/java/org/apache/lucene/util/Vint8.java b/lucene/facet/src/java/org/apache/lucene/util/Vint8.java
deleted file mode 100644
index 40dd96daebd..00000000000
--- a/lucene/facet/src/java/org/apache/lucene/util/Vint8.java
+++ /dev/null
@@ -1,229 +0,0 @@
-package org.apache.lucene.util;
-
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Variable-length encoding of 32-bit integers, into 8-bit bytes. A number is encoded as follows:
- *
- * - If it is less than 127 and non-negative (i.e., if the number uses only 7 bits), it is encoded as
- * as single byte: 0bbbbbbb.
- *
- If its highest nonzero bit is greater than bit 6 (0x40), it is represented as a series of
- * bytes, each byte's
- * 7 LSB containing bits from the original value, with the MSB set for all but the last
- * byte. The first encoded byte contains the highest nonzero bits from the
- * original; the second byte contains the next 7 MSB; and so on, with the last byte
- * containing the 7 LSB of the original.
- *
- * Examples:
- *
- * - n = 117 = 1110101: This has fewer than 8 significant bits, and so is encoded as
- * 01110101 = 0x75.
- *
- n = 100000 = (binary) 11000011010100000. This has 17 significant bits, and so needs
- * three Vint8 bytes. Left-zero-pad it to a multiple of 7 bits, then split it into chunks of 7
- * and add an MSB, 0 for the last byte, 1 for the others: 1|0000110 1|0001101 0|0100000
- * = 0x86 0x8D 0x20.
- *
- * This encoder/decoder will correctly handle any 32-bit integer, but for negative numbers,
- * and positive numbers with more than 28 significant bits, encoding requires 5 bytes; this
- * is not an efficient encoding scheme for large
- * positive numbers or any negative number.
- *
- * Compatibility:
- * This class has been used in products that have shipped to customers, and is needed to
- * decode legacy data. Do not modify this class in ways that will break compatibility.
- *
- * @lucene.experimental
- */
-public class Vint8 {
-
- /**
- * Because Java lacks call-by-reference, this class boxes the decoding position, which
- * is initially set by the caller, and returned after decoding, incremented by the number
- * of bytes processed.
- */
- public static class Position {
- /**
- * Creates a position value set to zero.
- */
- public Position() {
- // The initial position is zero by default.
- }
- /**
- * Creates a position set to {@code initialPosition}.
- * @param initialPosition The starting decoding position in the source buffer.
- */
- public Position(int initialPosition) {
- this.pos = initialPosition;
- }
- /**
- * The value passed by reference.
- */
- public int pos;
- }
-
- /**
- * Returns the number of bytes needed to encode {@code number}.
- * @param number The number whose encoded length is needed.
- * @return The number of bytes needed to encode {@code number}.
- */
- public static int bytesNeeded(int number) {
- if ((number & ~0x7F) == 0) {
- return 1;
- } else if ((number & ~0x3FFF) == 0) {
- return 2;
- } else if ((number & ~0x1FFFFF) == 0) {
- return 3;
- } else if ((number & ~0xFFFFFFF) == 0) {
- return 4;
- } else {
- return 5;
- }
- }
-
- /**
- * The maximum number of bytes needed to encode a number using {@code Vint8}.
- */
- public static final int MAXIMUM_BYTES_NEEDED = 5;
-
- /**
- * Encodes {@code number} to {@code out}.
- * @param number The value to be written in encoded form, to {@code out}.
- * @param out The output stream receiving the encoded bytes.
- * @exception IOException If there is a problem writing to {@code out}.
- */
- public static void encode(int number, OutputStream out) throws IOException {
- if ((number & ~0x7F) == 0) {
- out.write(number);
- } else if ((number & ~0x3FFF) == 0) {
- out.write(0x80 | (number >> 7));
- out.write(0x7F & number);
- } else if ((number & ~0x1FFFFF) == 0) {
- out.write(0x80 | (number >> 14));
- out.write(0x80 | (number >> 7));
- out.write(0x7F & number);
- } else if ((number & ~0xFFFFFFF) == 0) {
- out.write(0x80 | (number >> 21));
- out.write(0x80 | (number >> 14));
- out.write(0x80 | (number >> 7));
- out.write(0x7F & number);
- } else {
- out.write(0x80 | (number >> 28));
- out.write(0x80 | (number >> 21));
- out.write(0x80 | (number >> 14));
- out.write(0x80 | (number >> 7));
- out.write(0x7F & number);
- }
- }
-
- /**
- * Encodes {@code number} into {@code dest}, starting at offset {@code start} from
- * the beginning of the array. This method assumes {@code dest} is large enough to
- * hold the required number of bytes.
- * @param number The number to be encoded.
- * @param dest The destination array.
- * @param start The starting offset in the array.
- * @return The number of bytes used in the array.
- */
- public static int encode(int number, byte[] dest, int start) {
- if ((number & ~0x7F) == 0) {
- dest[start] = (byte) number;
- return 1;
- } else if ((number & ~0x3FFF) == 0) {
- dest[start] = (byte) (0x80 | ((number & 0x3F80) >> 7));
- dest[start + 1] = (byte) (number & 0x7F);
- return 2;
- } else if ((number & ~0x1FFFFF) == 0) {
- dest[start] = (byte) (0x80 | ((number & 0x1FC000) >> 14));
- dest[start + 1] = (byte) (0x80 | ((number & 0x3F80) >> 7));
- dest[start + 2] = (byte) (number & 0x7F);
- return 3;
- } else if ((number & ~0xFFFFFFF) == 0) {
- dest[start] = (byte) (0x80 | ((number & 0xFE00000) >> 21));
- dest[start + 1] = (byte) (0x80 | ((number & 0x1FC000) >> 14));
- dest[start + 2] = (byte) (0x80 | ((number & 0x3F80) >> 7));
- dest[start + 3] = (byte) (number & 0x7F);
- return 4;
- } else {
- dest[start] = (byte) (0x80 | ((number & 0xF0000000) >> 28));
- dest[start + 1] = (byte) (0x80 | ((number & 0xFE00000) >> 21));
- dest[start + 2] = (byte) (0x80 | ((number & 0x1FC000) >> 14));
- dest[start + 3] = (byte) (0x80 | ((number & 0x3F80) >> 7));
- dest[start + 4] = (byte) (number & 0x7F);
- return 5;
- }
- }
-
- /**
- * Decodes a 32-bit integer from {@code bytes}, beginning at offset {@code pos.pos}.
- * The decoded value is returned, and {@code pos.pos} is incremented by the number of
- * bytes processed.
- * @param bytes The byte array containing an encoded value.
- * @param pos On entry, the starting position in the array; on return, one greater
- * than the position of the last byte decoded in the call.
- * @return The decoded value.
- */
- public static int decode(byte[] bytes, Position pos) {
- int value = 0;
- while (true) {
- byte first = bytes[pos.pos];
- ++pos.pos;
- value |= first & 0x7F;
- if ((first & 0x80) == 0) {
- return value;
- }
- value <<= 7;
- }
- }
-
- /**
- * Decodes a 32-bit integer from bytes read from {@code in}. Bytes are read,
- * one at a time, from {@code in}, and it is assumed they represent a 32-bit
- * integer encoded using this class's encoding scheme. The decoded value is
- * returned.
- * @param in The input stream containing the encoded bytes.
- * @return The decoded value.
- * @exception EOFException If the stream ends before a value has been decoded.
- */
- public static int decode(InputStream in) throws IOException {
- int value = 0;
- while (true) {
- int first = in.read();
- if (first < 0) {
- throw new EOFException();
- }
- value |= first & 0x7F;
- if ((first & 0x80) == 0) {
- return value;
- }
- value <<= 7;
- }
- }
-
- /**
- * The default ctor is made private because all methods of this class are static.
- */
- private Vint8() {
- // Just making it impossible to instantiate.
- }
-
-}
diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/ChunksIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/ChunksIntEncoder.java
index c640c53b6cc..a19550c29e5 100644
--- a/lucene/facet/src/java/org/apache/lucene/util/encoding/ChunksIntEncoder.java
+++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/ChunksIntEncoder.java
@@ -1,7 +1,7 @@
package org.apache.lucene.util.encoding;
-import java.io.IOException;
-import java.io.OutputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -27,38 +27,31 @@ import java.io.OutputStream;
* read more on the two implementations {@link FourFlagsIntEncoder} and
* {@link EightFlagsIntEncoder}.
*
- * Extensions of this class need to implement {@link #encode(int)} in order to
- * build the proper indicator (flags). When enough values were accumulated
- * (typically the batch size), extensions can call {@link #encodeChunk()} to
- * flush the indicator and the rest of the values.
+ * Extensions of this class need to implement {@link #encode(IntsRef, BytesRef)}
+ * in order to build the proper indicator (flags). When enough values were
+ * accumulated (typically the batch size), extensions can call
+ * {@link #encodeChunk(BytesRef)} to flush the indicator and the rest of the
+ * values.
*
* NOTE: flags encoders do not accept values ≤ 0 (zero) in their
- * {@link #encode(int)}. For performance reasons they do not check that
- * condition, however if such value is passed the result stream may be corrupt
- * or an exception will be thrown. Also, these encoders perform the best when
- * there are many consecutive small values (depends on the encoder
+ * {@link #encode(IntsRef, BytesRef)}. For performance reasons they do not check
+ * that condition, however if such value is passed the result stream may be
+ * corrupt or an exception will be thrown. Also, these encoders perform the best
+ * when there are many consecutive small values (depends on the encoder
* implementation). If that is not the case, the encoder will occupy 1 more byte
* for every batch number of integers, over whatever
* {@link VInt8IntEncoder} would have occupied. Therefore make sure to check
* whether your data fits into the conditions of the specific encoder.
*
* For the reasons mentioned above, these encoders are usually chained with
- * {@link UniqueValuesIntEncoder} and {@link DGapIntEncoder} in the following
- * manner:
- * IntEncoder fourFlags =
- * new SortingEncoderFilter(new UniqueValuesIntEncoder(new DGapIntEncoder(new FlagsIntEncoderImpl())));
- *
+ * {@link UniqueValuesIntEncoder} and {@link DGapIntEncoder}.
*
* @lucene.experimental
*/
public abstract class ChunksIntEncoder extends IntEncoder {
/** Holds the values which must be encoded, outside the indicator. */
- protected final int[] encodeQueue;
- protected int encodeQueueSize = 0;
-
- /** Encoder used to encode values outside the indicator. */
- protected final IntEncoder encoder = new VInt8IntEncoder();
+ protected final IntsRef encodeQueue;
/** Represents bits flag byte. */
protected int indicator = 0;
@@ -67,39 +60,33 @@ public abstract class ChunksIntEncoder extends IntEncoder {
protected byte ordinal = 0;
protected ChunksIntEncoder(int chunkSize) {
- encodeQueue = new int[chunkSize];
+ encodeQueue = new IntsRef(chunkSize);
}
/**
* Encodes the values of the current chunk. First it writes the indicator, and
* then it encodes the values outside the indicator.
*/
- protected void encodeChunk() throws IOException {
- out.write(indicator);
- for (int i = 0; i < encodeQueueSize; ++i) {
- encoder.encode(encodeQueue[i]);
+ protected void encodeChunk(BytesRef buf) {
+ // ensure there's enough room in the buffer
+ int maxBytesRequired = buf.length + 1 + encodeQueue.length * 5; /* indicator + at most 5 bytes per VInt (values with more than 28 significant bits need 5) */
+ if (buf.bytes.length < maxBytesRequired) {
+ buf.grow(maxBytesRequired);
}
- encodeQueueSize = 0;
- ordinal = 0;
- indicator = 0;
+
+ buf.bytes[buf.length++] = ((byte) indicator);
+ for (int i = 0; i < encodeQueue.length; i++) {
+ VInt8.encode(encodeQueue.ints[i], buf);
+ }
+
+ reset();
}
@Override
- public void close() throws IOException {
- if (ordinal != 0) {
- encodeChunk();
- }
- encoder.close();
- super.close();
- }
-
- @Override
- public void reInit(OutputStream out) {
- encoder.reInit(out);
- super.reInit(out);
+ protected void reset() {
ordinal = 0;
indicator = 0;
- encodeQueueSize = 0;
+ encodeQueue.length = 0;
}
}
diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntDecoder.java
index a37a4b9f4bb..a5b2fb3c28c 100644
--- a/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntDecoder.java
+++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntDecoder.java
@@ -1,7 +1,7 @@
package org.apache.lucene.util.encoding;
-import java.io.IOException;
-import java.io.InputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -21,10 +21,8 @@ import java.io.InputStream;
*/
/**
- * An {@link IntDecoder} which wraps another {@link IntDecoder} and reverts the
- * d-gap that was encoded by {@link DGapIntEncoder}. The wrapped decoder
- * performs the actual decoding, while this class simply adds the decoded value
- * to the previous value.
+ * An {@link IntDecoder} which wraps another decoder and reverts the d-gap that
+ * was encoded by {@link DGapIntEncoder}.
*
* @lucene.experimental
*/
@@ -32,26 +30,23 @@ public class DGapIntDecoder extends IntDecoder {
private final IntDecoder decoder;
- private int prev = 0;
-
public DGapIntDecoder(IntDecoder decoder) {
this.decoder = decoder;
}
@Override
- public long decode() throws IOException {
- long decode = decoder.decode();
- if (decode == EOS) {
- return EOS;
- }
-
- return prev += decode;
+ protected void reset() {
+ decoder.reset();
}
-
+
@Override
- public void reInit(InputStream in) {
- decoder.reInit(in);
- prev = 0;
+ protected void doDecode(BytesRef buf, IntsRef values, int upto) {
+ decoder.doDecode(buf, values, upto);
+ int prev = 0;
+ for (int i = 0; i < values.length; i++) {
+ values.ints[i] += prev;
+ prev = values.ints[i];
+ }
}
@Override
diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntEncoder.java
index 8b1a0deffaf..305f975c619 100644
--- a/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntEncoder.java
+++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/DGapIntEncoder.java
@@ -1,7 +1,7 @@
package org.apache.lucene.util.encoding;
-import java.io.IOException;
-import java.io.OutputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -27,7 +27,7 @@ import java.io.OutputStream;
* space) if the values are 'close' to each other.
*
* NOTE: this encoder assumes the values are given to
- * {@link #encode(int)} in an ascending sorted manner, which ensures only
+ * {@link #encode(IntsRef, BytesRef)} in an ascending sorted manner, which ensures only
* positive values are encoded and thus yields better performance. If you are
* not sure whether the values are sorted or not, it is possible to chain this
* encoder with {@link SortingIntEncoder} to ensure the values will be
@@ -37,17 +37,20 @@ import java.io.OutputStream;
*/
public class DGapIntEncoder extends IntEncoderFilter {
- private int prev = 0;
-
/** Initializes with the given encoder. */
public DGapIntEncoder(IntEncoder encoder) {
super(encoder);
}
@Override
- public void encode(int value) throws IOException {
- encoder.encode(value - prev);
- prev = value;
+ protected void doEncode(IntsRef values, BytesRef buf, int upto) {
+ int prev = 0;
+ for (int i = values.offset; i < upto; i++) {
+ int tmp = values.ints[i];
+ values.ints[i] -= prev;
+ prev = tmp;
+ }
+ encoder.doEncode(values, buf, upto);
}
@Override
@@ -55,12 +58,6 @@ public class DGapIntEncoder extends IntEncoderFilter {
return new DGapIntDecoder(encoder.createMatchingDecoder());
}
- @Override
- public void reInit(OutputStream out) {
- super.reInit(out);
- prev = 0;
- }
-
@Override
public String toString() {
return "DGap (" + encoder.toString() + ")";
diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntDecoder.java
index fe2b9c6d67b..270fcffca52 100644
--- a/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntDecoder.java
+++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntDecoder.java
@@ -1,7 +1,7 @@
package org.apache.lucene.util.encoding;
-import java.io.IOException;
-import java.io.InputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -21,20 +21,17 @@ import java.io.InputStream;
*/
/**
- * Decodes data which was encoded by {@link EightFlagsIntEncoder}. Scans
- * the indicator
, one flag (1-bits) at a time, and decodes extra
- * data using {@link VInt8IntDecoder}.
+ * Decodes values encoded with {@link EightFlagsIntEncoder}.
*
- * @see EightFlagsIntEncoder
* @lucene.experimental
*/
public class EightFlagsIntDecoder extends IntDecoder {
- /**
+ /*
* Holds all combinations of indicator for fast decoding (saves time
* on real-time bit manipulation)
*/
- private static final byte[][] decodeTable = new byte[256][8];
+ private static final byte[][] DECODE_TABLE = new byte[256][8];
/** Generating all combinations of indicator into separate flags. */
static {
@@ -42,45 +39,36 @@ public class EightFlagsIntDecoder extends IntDecoder {
--i;
for (int j = 8; j != 0;) {
--j;
- decodeTable[i][j] = (byte) ((i >>> j) & 0x1);
+ DECODE_TABLE[i][j] = (byte) ((i >>> j) & 0x1);
}
}
}
- private final IntDecoder decoder = new VInt8IntDecoder();
-
- /** The indicator for decoding a chunk of 8 integers. */
- private int indicator;
-
- /** Used as an ordinal of 0 - 7, as the decoder decodes chunks of 8 integers. */
- private int ordinal = 0;
-
@Override
- public long decode() throws IOException {
- // If we've decoded 8 integers, read the next indicator.
- if ((ordinal & 0x7) == 0) {
- indicator = in.read();
- if (indicator < 0) {
- return EOS;
+ protected void doDecode(BytesRef buf, IntsRef values, int upto) {
+ while (buf.offset < upto) {
+ // read indicator
+ int indicator = buf.bytes[buf.offset++] & 0xFF;
+ int ordinal = 0;
+
+ int capacityNeeded = values.length + 8;
+ if (values.ints.length < capacityNeeded) {
+ values.grow(capacityNeeded);
+ }
+
+ // process indicator, until we read 8 values, or end-of-buffer
+ while (ordinal != 8) {
+ if (DECODE_TABLE[indicator][ordinal++] == 0) {
+ if (buf.offset == upto) { // end of buffer
+ return;
+ }
+ // decode the value from the stream.
+ values.ints[values.length++] = VInt8.decode(buf) + 2;
+ } else {
+ values.ints[values.length++] = 1;
+ }
}
- ordinal = 0;
}
-
- if (decodeTable[indicator][ordinal++] == 0) {
- // decode the value from the stream.
- long decode = decoder.decode();
- return decode == EOS ? EOS : decode + 2;
- }
-
- return 1;
- }
-
- @Override
- public void reInit(InputStream in) {
- super.reInit(in);
- decoder.reInit(in);
- ordinal = 0;
- indicator = 0;
}
@Override
diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntEncoder.java
index ac427725674..143660a4742 100644
--- a/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntEncoder.java
+++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/EightFlagsIntEncoder.java
@@ -1,6 +1,7 @@
package org.apache.lucene.util.encoding;
-import java.io.IOException;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -20,14 +21,15 @@ import java.io.IOException;
*/
/**
- * A {@link ChunksIntEncoder} which encodes data in chunks of 8. Every group starts with a single
- * byte (called indicator) which represents 8 - 1 bit flags, where the value:
+ * A {@link ChunksIntEncoder} which encodes data in chunks of 8. Every group
+ * starts with a single byte (called indicator) which represents 8 - 1 bit
+ * flags, where the value:
*
* - 1 means the encoded value is '1'
*
- 0 means the value is encoded using {@link VInt8IntEncoder}, and the
* encoded bytes follow the indicator.
- * Since value 0 is illegal, and 1 is encoded in the indicator, the actual
- * value that is encoded is value-2
, which saves some more bits.
+ * Since value 0 is illegal, and 1 is encoded in the indicator, the actual value
+ * that is encoded is value-2
, which saves some more bits.
*
* Encoding example:
*
@@ -46,28 +48,36 @@ import java.io.IOException;
*/
public class EightFlagsIntEncoder extends ChunksIntEncoder {
- /**
+ /*
* Holds all combinations of indicator flags for fast encoding (saves
* time on bit manipulation at encode time)
*/
- private static byte[] encodeTable = new byte[] { 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, (byte) 0x80 };
+ private static final byte[] ENCODE_TABLE = new byte[] { 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, (byte) 0x80 };
public EightFlagsIntEncoder() {
super(8);
}
@Override
- public void encode(int data) throws IOException {
- if (data == 1) {
- indicator |= encodeTable[ordinal];
- } else {
- encodeQueue[encodeQueueSize++] = data - 2;
+ protected void doEncode(IntsRef values, BytesRef buf, int upto) {
+ for (int i = values.offset; i < upto; i++) {
+ int value = values.ints[i];
+ if (value == 1) {
+ indicator |= ENCODE_TABLE[ordinal];
+ } else {
+ encodeQueue.ints[encodeQueue.length++] = value - 2;
+ }
+ ++ordinal;
+
+ // encode the chunk and the indicator
+ if (ordinal == 8) {
+ encodeChunk(buf);
+ }
}
- ++ordinal;
-
- // If 8 values were encoded thus far, 'flush' them including the indicator.
- if ((ordinal & 0x7) == 0) {
- encodeChunk();
+
+ // encode remaining values
+ if (ordinal != 0) {
+ encodeChunk(buf);
}
}
@@ -78,7 +88,7 @@ public class EightFlagsIntEncoder extends ChunksIntEncoder {
@Override
public String toString() {
- return "EightFlags (" + encoder.toString() + ")";
+ return "EightFlags (VInt)";
}
}
diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntDecoder.java
index a597f9e2a38..ebc161ac9cd 100644
--- a/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntDecoder.java
+++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntDecoder.java
@@ -1,7 +1,7 @@
package org.apache.lucene.util.encoding;
-import java.io.IOException;
-import java.io.InputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -21,11 +21,8 @@ import java.io.InputStream;
*/
/**
- * Decodes data which was encoded by {@link FourFlagsIntEncoder}. Scans
- * the indicator
, one flag (1-bits) at a time, and decodes extra
- * data using {@link VInt8IntDecoder}.
+ * Decodes values encoded with {@link FourFlagsIntEncoder}.
*
- * @see FourFlagsIntEncoder
* @lucene.experimental
*/
public class FourFlagsIntDecoder extends IntDecoder {
@@ -34,7 +31,7 @@ public class FourFlagsIntDecoder extends IntDecoder {
* Holds all combinations of indicator for fast decoding (saves time
* on real-time bit manipulation)
*/
- private final static byte[][] decodeTable = new byte[256][4];
+ private final static byte[][] DECODE_TABLE = new byte[256][4];
/** Generating all combinations of indicator into separate flags. */
static {
@@ -42,46 +39,36 @@ public class FourFlagsIntDecoder extends IntDecoder {
--i;
for (int j = 4; j != 0;) {
--j;
- decodeTable[i][j] = (byte) ((i >>> (j << 1)) & 0x3);
+ DECODE_TABLE[i][j] = (byte) ((i >>> (j << 1)) & 0x3);
}
}
}
- private final IntDecoder decoder = new VInt8IntDecoder();
-
- /** The indicator for decoding a chunk of 4 integers. */
- private int indicator;
-
- /** Used as an ordinal of 0 - 3, as the decoder decodes chunks of 4 integers. */
- private int ordinal = 0;
-
@Override
- public long decode() throws IOException {
- // If we've decoded 8 integers, read the next indicator.
- if ((ordinal & 0x3) == 0) {
- indicator = in.read();
- if (indicator < 0) {
- return EOS;
+ protected void doDecode(BytesRef buf, IntsRef values, int upto) {
+ while (buf.offset < upto) {
+ // read indicator
+ int indicator = buf.bytes[buf.offset++] & 0xFF;
+ int ordinal = 0;
+
+ int capacityNeeded = values.length + 4;
+ if (values.ints.length < capacityNeeded) {
+ values.grow(capacityNeeded);
+ }
+
+ while (ordinal != 4) {
+ byte decodeVal = DECODE_TABLE[indicator][ordinal++];
+ if (decodeVal == 0) {
+ if (buf.offset == upto) { // end of buffer
+ return;
+ }
+ // decode the value from the stream.
+ values.ints[values.length++] = VInt8.decode(buf) + 4;
+ } else {
+ values.ints[values.length++] = decodeVal;
+ }
}
- ordinal = 0;
}
-
- byte decodeVal = decodeTable[indicator][ordinal++];
- if (decodeVal == 0) {
- // decode the value from the stream.
- long decode = decoder.decode();
- return decode == EOS ? EOS : decode + 4;
- }
-
- return decodeVal;
- }
-
- @Override
- public void reInit(InputStream in) {
- super.reInit(in);
- decoder.reInit(in);
- ordinal = 0;
- indicator = 0;
}
@Override
diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntEncoder.java
index a1e227d5cdf..535a90fb60c 100644
--- a/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntEncoder.java
+++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/FourFlagsIntEncoder.java
@@ -1,6 +1,7 @@
package org.apache.lucene.util.encoding;
-import java.io.IOException;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -48,11 +49,11 @@ import java.io.IOException;
*/
public class FourFlagsIntEncoder extends ChunksIntEncoder {
- /**
+ /*
* Holds all combinations of indicator flags for fast encoding (saves
* time on bit manipulation @ encode time)
*/
- private static byte[][] encodeTable = new byte[][] {
+ private static final byte[][] ENCODE_TABLE = new byte[][] {
new byte[] { 0x00, 0x00, 0x00, 0x00 },
new byte[] { 0x01, 0x04, 0x10, 0x40 },
new byte[] { 0x02, 0x08, 0x20, (byte) 0x80 },
@@ -63,26 +64,26 @@ public class FourFlagsIntEncoder extends ChunksIntEncoder {
super(4);
}
- /**
- * Small values (<=3) are stored in the indicator
while larger
- * values are saved for later encoding in the {@link #encodeQueue}. Since
- * Vint8 will only encode values larger or equal to 4, the values saves for
- * encoded are transformed to (value - 4).
- * When a chunk is ready (got 4 values), the {@link #encodeChunk()}
- * takes control.
- */
@Override
- public void encode(int data) throws IOException {
- if (data <= 3) {
- indicator |= encodeTable[data][ordinal];
- } else {
- encodeQueue[encodeQueueSize++] = data - 4;
+ protected void doEncode(IntsRef values, BytesRef buf, int upto) {
+ for (int i = values.offset; i < upto; i++) {
+ int value = values.ints[i];
+ if (value <= 3) {
+ indicator |= ENCODE_TABLE[value][ordinal];
+ } else {
+ encodeQueue.ints[encodeQueue.length++] = value - 4;
+ }
+ ++ordinal;
+
+ // encode the chunk and the indicator
+ if (ordinal == 4) {
+ encodeChunk(buf);
+ }
}
- ++ordinal;
-
- // If 4 values were encoded thus far, 'flush' them including the indicator.
- if ((ordinal & 0x3) == 0) {
- encodeChunk();
+
+ // encode remaining values
+ if (ordinal != 0) {
+ encodeChunk(buf);
}
}
@@ -93,7 +94,7 @@ public class FourFlagsIntEncoder extends ChunksIntEncoder {
@Override
public String toString() {
- return "FourFlags (" + encoder.toString() + ")";
+ return "FourFlags (VInt)";
}
}
diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntDecoder.java
index faabad4c8ed..acf83cc6158 100644
--- a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntDecoder.java
+++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntDecoder.java
@@ -1,7 +1,7 @@
package org.apache.lucene.util.encoding;
-import java.io.IOException;
-import java.io.InputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -21,33 +21,50 @@ import java.io.InputStream;
*/
/**
- * Decodes integers from a set {@link InputStream}. For re-usability, the
- * decoder's input stream can be set by ({@link #reInit(InputStream)}).
- * By design, Decoders are NOT thread-safe.
+ * Decodes integers from a set {@link BytesRef}.
*
* @lucene.experimental
*/
public abstract class IntDecoder {
- /** A special long value which is used to indicate end-of-stream has reached. */
- public static final long EOS = 0x100000000L;
-
- /** Input stream from which the encoded bytes are read */
- protected InputStream in;
-
- /** Sets the input stream from which the encoded data is read. */
- public void reInit(InputStream in) {
- this.in = in;
+ /**
+ * Performs the actual decoding. Values should be read from
+ * {@link BytesRef#offset} up to {@code upto}. Also, {@code values} offset and
+ * length are set to 0 and the decoder is expected to update
+ * {@link IntsRef#length}, but not {@link IntsRef#offset}.
+ *
+ *
+ * NOTE: it is ok to use the buffer's offset as the current position in
+ * the buffer (and modify it), it will be reset by
+ * {@link #decode(BytesRef, IntsRef)}.
+ */
+ protected abstract void doDecode(BytesRef buf, IntsRef values, int upto);
+
+ /**
+ * Called before {@link #doDecode(BytesRef, IntsRef, int)} so that decoders
+ * can reset their state.
+ */
+ protected void reset() {
+ // do nothing by default
}
/**
- * Decodes data received from the input stream, and returns one decoded
- * integer. If end of stream is reached, {@link #EOS} is returned.
- *
- * @return one decoded integer as long or {@link #EOS} if end-of-stream
- * reached.
- * @throws IOException if an I/O error occurs
+ * Decodes the values from the buffer into the given {@link IntsRef}. Note
+ * that {@code values.offset} and {@code values.length} are set to 0.
*/
- public abstract long decode() throws IOException;
+ public final void decode(BytesRef buf, IntsRef values) {
+ values.offset = values.length = 0; // must do that because we cannot grow() them otherwise
+
+ // some decoders may use the buffer's offset as a position index, so save
+ // current offset.
+ int bufOffset = buf.offset;
+
+ reset();
+ doDecode(buf, values, buf.offset + buf.length);
+ assert values.offset == 0 : "offset should not have been modified by the decoder.";
+
+ // fix offset
+ buf.offset = bufOffset;
+ }
}
diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoder.java
index 4055d750810..0a3197d6c6b 100644
--- a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoder.java
+++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoder.java
@@ -1,8 +1,7 @@
package org.apache.lucene.util.encoding;
-import java.io.Closeable;
-import java.io.IOException;
-import java.io.OutputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -22,94 +21,47 @@ import java.io.OutputStream;
*/
/**
- * Encodes integers to a set {@link OutputStream}. Extending classes need to
- * override {@link #encode(int)} to encode the value using their encoding
- * algorithm. The default implementation of {@link #close()} closes the set
- * {@link OutputStream}.
- *
- * The default {@link #IntEncoder() constructor} is provided for convenience
- * only. One must call {@link #reInit(OutputStream)} before calling
- * {@link #encode(int)} or {@link #close()}.
- *
- * For convenience, each encoder implements {@link #createMatchingDecoder()} for
- * easy access to the matching decoder.
- *
- * NOTE: some implementations may buffer the encoded values in memory
- * (such as {@link IntEncoderFilter} implementations) and encoding will happen
- * only upon calling {@link #close()}. Therefore it is important to always call
- * {@link #close()} on the encoder at hand.
- *
- * NOTE: encoders are usually not thread safe, unless specifically
- * documented otherwise by an implementation.
+ * Encodes integers to a set {@link BytesRef}. For convenience, each encoder
+ * implements {@link #createMatchingDecoder()} for easy access to the matching
+ * decoder.
*
* @lucene.experimental
*/
-public abstract class IntEncoder implements Closeable {
+public abstract class IntEncoder {
- protected OutputStream out = null;
+ public IntEncoder() {}
/**
- * Default constructor, provided here for robustness: if in the future a
- * constructor with parameters will be added, this might break custom
- * implementations of this class which call this implicit constructor. So we
- * make it explicit to avoid any such issue in the future.
+ * Performs the actual encoding. Values should be read from
+ * {@link IntsRef#offset} up to {@code upto}. Also, it is guaranteed that
+ * {@code buf's} offset and length are set to 0 and the encoder is expected to
+ * update {@link BytesRef#length}, but not {@link BytesRef#offset}.
*/
- public IntEncoder() {
+ protected abstract void doEncode(IntsRef values, BytesRef buf, int upto);
+
+ /**
+ * Called before {@link #doEncode(IntsRef, BytesRef, int)} so that encoders
+ * can reset their state.
+ */
+ protected void reset() {
+ // do nothing by default
}
/**
- * Instructs the encoder to finish the encoding process. This method closes
- * the output stream which was specified by {@link #reInit(OutputStream)
- * reInit}. An implementation may do here additional cleanup required to
- * complete the encoding, such as flushing internal buffers, etc.
- * Once this method was called, no further calls to {@link #encode(int)
- * encode} should be made before first calling {@link #reInit(OutputStream)
- * reInit}.
- *
- * NOTE: overriding classes should make sure they either call
- * super.close() or close the output stream themselves.
+ * Encodes the values to the given buffer. Note that the buffer's offset and
+ * length are set to 0.
*/
- @Override
- public void close() throws IOException {
- if (out != null) {
- out.close();
- }
+ public final void encode(IntsRef values, BytesRef buf) {
+ buf.offset = buf.length = 0;
+ reset();
+ doEncode(values, buf, values.offset + values.length);
+ assert buf.offset == 0 : "offset should not have been modified by the encoder.";
}
/**
- * Encodes an integer to the output stream given in
- * {@link #reInit(OutputStream) reInit}
- */
- public abstract void encode(int value) throws IOException;
-
- /**
- * Returns an {@link IntDecoder} which matches this encoder. Every encoder
- * must return an {@link IntDecoder} and null is not a valid
- * value. If an encoder is just a filter, it should at least return its
- * wrapped encoder's matching decoder.
- *
- * NOTE: this method should create a new instance of the matching
- * decoder and leave the instance sharing to the caller. Returning the same
- * instance over and over is risky because encoders and decoders are not
- * thread safe.
+ * Returns an {@link IntDecoder} which can decode the values that were encoded
+ * with this encoder.
*/
public abstract IntDecoder createMatchingDecoder();
- /**
- * Reinitializes the encoder with the give {@link OutputStream}. For
- * re-usability it can be changed without the need to reconstruct a new
- * object.
- *
- * NOTE: after calling {@link #close()}, one must call
- * this method even if the output stream itself hasn't changed. An example
- * case is that the output stream wraps a byte[], and the output stream itself
- * is reset, but its instance hasn't changed. Some implementations of
- * {@link IntEncoder} may write some metadata about themselves to the output
- * stream, and therefore it is imperative that one calls this method before
- * encoding any data.
- */
- public void reInit(OutputStream out) {
- this.out = out;
- }
-
}
diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoderFilter.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoderFilter.java
index 6c7a40340d0..ee2e5db7e9e 100644
--- a/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoderFilter.java
+++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/IntEncoderFilter.java
@@ -1,7 +1,5 @@
package org.apache.lucene.util.encoding;
-import java.io.IOException;
-import java.io.OutputStream;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -21,17 +19,7 @@ import java.io.OutputStream;
*/
/**
- * An abstract implementation of {@link IntEncoder} which is served as a filter
- * on the values to encode. An encoder filter wraps another {@link IntEncoder}
- * which does the actual encoding. This allows for chaining filters and
- * encoders, such as:
- * new UniqueValuesIntEncoder(new DGapIntEncoder(new VInt8IntEnoder()));
- * {@link UniqueValuesIntEncoder} followed by {@link DGapIntEncoder}
-
- *
- * The default implementation implements {@link #close()} by closing the wrapped
- * encoder and {@link #reInit(OutputStream)} by re-initializing the wrapped
- * encoder.
+ * An abstract implementation of {@link IntEncoder} which wraps another encoder.
*
* @lucene.experimental
*/
@@ -44,15 +32,8 @@ public abstract class IntEncoderFilter extends IntEncoder {
}
@Override
- public void close() throws IOException {
- // There is no need to call super.close(), since we don't pass the output
- // stream to super.
- encoder.close();
- }
-
- @Override
- public void reInit(OutputStream out) {
- encoder.reInit(out);
+ public void reset() {
+ encoder.reset();
}
}
diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntDecoder.java
index 6d00e049080..1cf33857280 100644
--- a/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntDecoder.java
+++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntDecoder.java
@@ -1,7 +1,7 @@
package org.apache.lucene.util.encoding;
-import java.io.IOException;
-import java.io.InputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -21,54 +21,65 @@ import java.io.InputStream;
*/
/**
- * Decodes data which was encoded by {@link NOnesIntEncoder}. Uses a
- * {@link FourFlagsIntDecoder} to perform the actual encoding and translates the
- * values back as described in {@link NOnesIntEncoder}.
+ * Decodes values encoded with {@link NOnesIntEncoder}.
*
- * @see NOnesIntEncoder
* @lucene.experimental
*/
public class NOnesIntDecoder extends FourFlagsIntDecoder {
- /** Number of consecutive '1's to generate upon decoding a '2'. */
- private int n;
-
- private int onesCounter;
-
+ // Number of consecutive '1's to generate upon decoding a '2'
+ private final int n;
+ private final IntsRef internalBuffer;
+
/**
* Constructs a decoder with a given N (Number of consecutive '1's which are
* translated into a single target value '2'.
*/
public NOnesIntDecoder(int n) {
this.n = n;
+ // initial size (room for 100 integers)
+ internalBuffer = new IntsRef(100);
}
@Override
- public long decode() throws IOException {
- // If we read '2', we should return n '1's.
- if (onesCounter > 0) {
- --onesCounter;
- return 1;
- }
-
- long decode = super.decode();
- if (decode == 1) {
- return 1;
- }
- if (decode == 2) {
- onesCounter = n - 1;
- return 1;
- }
- if (decode == 3) {
- return 2;
- }
- return decode == EOS ? EOS : decode - 1;
+ protected void reset() {
+ internalBuffer.length = 0;
+ super.reset();
}
-
+
@Override
- public void reInit(InputStream in) {
- super.reInit(in);
- onesCounter = 0;
+ protected void doDecode(BytesRef buf, IntsRef values, int upto) {
+ super.doDecode(buf, internalBuffer, upto);
+ if (values.ints.length < internalBuffer.length) {
+ // need space for internalBuffer.length to internalBuffer.length*N,
+ // grow mildly at first
+ values.grow(internalBuffer.length * n/2);
+ }
+
+ for (int i = 0; i < internalBuffer.length; i++) {
+ int decode = internalBuffer.ints[i];
+ if (decode == 1) {
+ if (values.length == values.ints.length) {
+ values.grow(values.length + 10); // grow by few items, however not too many
+ }
+ // 1 is 1
+ values.ints[values.length++] = 1;
+ } else if (decode == 2) {
+ if (values.length + n >= values.ints.length) {
+ values.grow(values.length + n); // grow by few items, however not too many
+ }
+ // '2' means N 1's
+ for (int j = 0; j < n; j++) {
+ values.ints[values.length++] = 1;
+ }
+ } else {
+ if (values.length == values.ints.length) {
+ values.grow(values.length + 10); // grow by few items, however not too many
+ }
+ // any other value is val-1
+ values.ints[values.length++] = decode - 1;
+ }
+ }
}
@Override
diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntEncoder.java
index badfe1cd31d..956eea253a3 100644
--- a/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntEncoder.java
+++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/NOnesIntEncoder.java
@@ -1,7 +1,7 @@
package org.apache.lucene.util.encoding;
-import java.io.IOException;
-import java.io.OutputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -50,11 +50,10 @@ import java.io.OutputStream;
*/
public class NOnesIntEncoder extends FourFlagsIntEncoder {
+ private final IntsRef internalBuffer;
+
/** Number of consecutive '1's to be translated into single target value '2'. */
- private int n;
-
- /** Counts the number of consecutive ones seen. */
- private int onesCounter = 0;
+ private final int n;
/**
* Constructs an encoder with a given value of N (N: Number of consecutive
@@ -62,38 +61,48 @@ public class NOnesIntEncoder extends FourFlagsIntEncoder {
*/
public NOnesIntEncoder(int n) {
this.n = n;
+ internalBuffer = new IntsRef(n);
}
@Override
- public void close() throws IOException {
- // We might have ones in our buffer, encode them as neccesary.
- while (onesCounter-- > 0) {
- super.encode(1);
+ protected void reset() {
+ internalBuffer.length = 0;
+ super.reset();
+ }
+
+ @Override
+ protected void doEncode(IntsRef values, BytesRef buf, int upto) {
+ // make sure the internal buffer is large enough
+ if (values.length > internalBuffer.ints.length) {
+ internalBuffer.grow(values.length);
}
-
- super.close();
- }
-
- @Override
- public void encode(int value) throws IOException {
- if (value == 1) {
- // Increment the number of consecutive ones seen so far
- if (++onesCounter == n) {
- super.encode(2);
- onesCounter = 0;
+
+ int onesCounter = 0;
+ for (int i = values.offset; i < upto; i++) {
+ int value = values.ints[i];
+ if (value == 1) {
+ // every N 1's should be encoded as '2'
+ if (++onesCounter == n) {
+ internalBuffer.ints[internalBuffer.length++] = 2;
+ onesCounter = 0;
+ }
+ } else {
+ // there might have been 1's that we need to encode
+ while (onesCounter > 0) {
+ --onesCounter;
+ internalBuffer.ints[internalBuffer.length++] = 1;
+ }
+
+ // encode value as value+1
+ internalBuffer.ints[internalBuffer.length++] = value + 1;
}
- return;
}
-
- // If it's not one - there might have been ones we had to encode prior to
- // this value
+ // there might have been 1's that we need to encode
while (onesCounter > 0) {
--onesCounter;
- super.encode(1);
+ internalBuffer.ints[internalBuffer.length++] = 1;
}
-
- // encode value + 1 --> the translation.
- super.encode(value + 1);
+ super.doEncode(internalBuffer, buf, internalBuffer.length);
}
@Override
@@ -101,12 +110,6 @@ public class NOnesIntEncoder extends FourFlagsIntEncoder {
return new NOnesIntDecoder(n);
}
- @Override
- public void reInit(OutputStream out) {
- super.reInit(out);
- onesCounter = 0;
- }
-
@Override
public String toString() {
return "NOnes (" + n + ") (" + super.toString() + ")";
diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntDecoder.java
index 414af6effa7..af6fce26af7 100644
--- a/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntDecoder.java
+++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntDecoder.java
@@ -1,7 +1,7 @@
package org.apache.lucene.util.encoding;
-import java.io.IOException;
-import java.io.StreamCorruptedException;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -21,41 +21,24 @@ import java.io.StreamCorruptedException;
*/
/**
- * A simple stream decoder which can decode values encoded with
- * {@link SimpleIntEncoder}.
+ * Decodes values encoded with {@link SimpleIntEncoder}.
*
* @lucene.experimental
*/
public class SimpleIntDecoder extends IntDecoder {
- /**
- * reusable buffer - allocated only once as this is not a thread-safe object
- */
- private byte[] buffer = new byte[4];
-
@Override
- public long decode() throws IOException {
-
- // we need exactly 4 bytes to decode an int in this decoder impl, otherwise, throw an exception
- int offset = 0;
- while (offset < 4) {
- int nRead = in.read(buffer, offset, 4 - offset);
- if (nRead == -1) {
- if (offset > 0) {
- throw new StreamCorruptedException(
- "Need 4 bytes for decoding an int, got only " + offset);
- }
- return EOS;
+ protected void doDecode(BytesRef buf, IntsRef values, int upto) {
+ while (buf.offset < upto) {
+ if (values.length == values.ints.length) {
+ values.grow(values.length + 10); // grow by few items, however not too many
}
- offset += nRead;
+ values.ints[values.length++] =
+ ((buf.bytes[buf.offset++] & 0xFF) << 24) |
+ ((buf.bytes[buf.offset++] & 0xFF) << 16) |
+ ((buf.bytes[buf.offset++] & 0xFF) << 8) |
+ (buf.bytes[buf.offset++] & 0xFF);
}
-
- int v = buffer[3] & 0xff;
- v |= (buffer[2] << 8) & 0xff00;
- v |= (buffer[1] << 16) & 0xff0000;
- v |= (buffer[0] << 24) & 0xff000000;
-
- return v;
}
@Override
diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntEncoder.java
index 2e17ef29640..fd6a0206117 100644
--- a/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntEncoder.java
+++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/SimpleIntEncoder.java
@@ -1,6 +1,7 @@
package org.apache.lucene.util.encoding;
-import java.io.IOException;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -26,22 +27,21 @@ import java.io.IOException;
*/
public class SimpleIntEncoder extends IntEncoder {
- /**
- * This method makes sure the value wasn't previously encoded by checking
- * against the Set. If the value wasn't encoded, it's added to the Set, and
- * encoded with {#link Vint8#encode}
- *
- * @param value
- * an integer to be encoded
- * @throws IOException
- * possibly thrown by the OutputStream
- */
@Override
- public void encode(int value) throws IOException {
- out.write(value >>> 24);
- out.write((value >> 16) & 0xFF);
- out.write((value >> 8) & 0xFF);
- out.write(value & 0xFF);
+ protected void doEncode(IntsRef values, BytesRef buf, int upto) {
+ // ensure there's enough room in the buffer
+ int bytesNeeded = values.length * 4;
+ if (buf.bytes.length < bytesNeeded) {
+ buf.grow(bytesNeeded);
+ }
+
+ for (int i = values.offset; i < upto; i++) {
+ int value = values.ints[i];
+ buf.bytes[buf.length++] = (byte) (value >>> 24);
+ buf.bytes[buf.length++] = (byte) ((value >> 16) & 0xFF);
+ buf.bytes[buf.length++] = (byte) ((value >> 8) & 0xFF);
+ buf.bytes[buf.length++] = (byte) (value & 0xFF);
+ }
}
@Override
diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/SortingIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/SortingIntEncoder.java
index 9f6cee635ce..0ebb06efa85 100644
--- a/lucene/facet/src/java/org/apache/lucene/util/encoding/SortingIntEncoder.java
+++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/SortingIntEncoder.java
@@ -1,9 +1,10 @@
package org.apache.lucene.util.encoding;
-import java.io.IOException;
-import java.io.OutputStream;
import java.util.Arrays;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -23,47 +24,21 @@ import java.util.Arrays;
/**
* An {@link IntEncoderFilter} which sorts the values to encode in ascending
- * order before encoding them. Encoding therefore happens upon calling
- * {@link #close()}. Since this encoder is usually chained with another encoder
- * that relies on sorted values, it does not offer a default constructor.
+ * order before encoding them.
*
* @lucene.experimental
*/
public class SortingIntEncoder extends IntEncoderFilter {
- private float grow = 2.0f;
- private int index = 0;
- private int[] set = new int[1024];
-
/** Initializes with the given encoder. */
public SortingIntEncoder(IntEncoder encoder) {
super(encoder);
}
@Override
- public void close() throws IOException {
- if (index == 0) {
- return;
- }
-
- Arrays.sort(set, 0, index);
- for (int i = 0; i < index; i++) {
- encoder.encode(set[i]);
- }
- encoder.close();
- index = 0;
-
- super.close();
- }
-
- @Override
- public void encode(int value) throws IOException {
- if (index == set.length) {
- int[] newSet = new int[(int) (set.length * grow)];
- System.arraycopy(set, 0, newSet, 0, set.length);
- set = newSet;
- }
- set[index++] = value;
+ protected void doEncode(IntsRef values, BytesRef buf, int upto) {
+ Arrays.sort(values.ints, values.offset, upto);
+ encoder.doEncode(values, buf, upto);
}
@Override
@@ -71,12 +46,6 @@ public class SortingIntEncoder extends IntEncoderFilter {
return encoder.createMatchingDecoder();
}
- @Override
- public void reInit(OutputStream out) {
- super.reInit(out);
- index = 0;
- }
-
@Override
public String toString() {
return "Sorting (" + encoder.toString() + ")";
diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/UniqueValuesIntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/UniqueValuesIntEncoder.java
index 3583402295f..c9a6be5c848 100644
--- a/lucene/facet/src/java/org/apache/lucene/util/encoding/UniqueValuesIntEncoder.java
+++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/UniqueValuesIntEncoder.java
@@ -1,7 +1,7 @@
package org.apache.lucene.util.encoding;
-import java.io.IOException;
-import java.io.OutputStream;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -22,7 +22,7 @@ import java.io.OutputStream;
/**
* An {@link IntEncoderFilter} which ensures only unique values are encoded. The
- * implementation assumes the values given to {@link #encode(int)} are sorted.
+ * implementation assumes the values given to {@link #encode(IntsRef, BytesRef)} are sorted.
* If this is not the case, you can chain this encoder with
* {@link SortingIntEncoder}.
*
@@ -30,26 +30,23 @@ import java.io.OutputStream;
*/
public final class UniqueValuesIntEncoder extends IntEncoderFilter {
- /**
- * Denotes an illegal value which we can use to init 'prev' to. Since all
- * encoded values are integers, this value is init to MAX_INT+1 and is of type
- * long. Therefore we are guaranteed not to get this value in encode.
- */
- private static final long ILLEGAL_VALUE = Integer.MAX_VALUE + 1;
-
- private long prev = ILLEGAL_VALUE;
-
/** Constructs a new instance with the given encoder. */
public UniqueValuesIntEncoder(IntEncoder encoder) {
super(encoder);
}
@Override
- public void encode(int value) throws IOException {
- if (prev != value) {
- encoder.encode(value);
- prev = value;
+ protected void doEncode(IntsRef values, BytesRef buf, int upto) {
+ int prev = values.ints[values.offset];
+ int idx = values.offset + 1;
+ for (int i = idx; i < upto; i++) {
+ if (values.ints[i] != prev) {
+ values.ints[idx++] = values.ints[i];
+ prev = values.ints[i];
+ }
}
+ values.length = idx - values.offset;
+ encoder.doEncode(values, buf, idx);
}
@Override
@@ -57,12 +54,6 @@ public final class UniqueValuesIntEncoder extends IntEncoderFilter {
return encoder.createMatchingDecoder();
}
- @Override
- public void reInit(OutputStream out) {
- super.reInit(out);
- prev = ILLEGAL_VALUE;
- }
-
@Override
public String toString() {
return "Unique (" + encoder.toString() + ")";
diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8.java
new file mode 100644
index 00000000000..267d52bae96
--- /dev/null
+++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8.java
@@ -0,0 +1,138 @@
+package org.apache.lucene.util.encoding;
+
+import org.apache.lucene.util.BytesRef;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Variable-length encoding of 32-bit integers, into 8-bit bytes. A number is
+ * encoded as follows:
+ *
+ * - If it is at most 127 and non-negative (i.e., if the number uses only 7
+ * bits), it is encoded as a single byte: 0bbbbbbb.
+ * - If its highest nonzero bit is greater than bit 6 (0x40), it is
+ * represented as a series of bytes, each byte's 7 LSB containing bits from the
+ * original value, with the MSB set for all but the last byte. The first encoded
+ * byte contains the highest nonzero bits from the original; the second byte
+ * contains the next 7 MSB; and so on, with the last byte containing the 7 LSB
+ * of the original.
+ *
+ * Examples:
+ *
+ * - n = 117 = 1110101: This has fewer than 8 significant bits, and so is
+ * encoded as 01110101 = 0x75.
+ * - n = 100000 = (binary) 11000011010100000. This has 17 significant bits,
+ * and so needs three Vint8 bytes. Left-zero-pad it to a multiple of 7 bits,
+ * then split it into chunks of 7 and add an MSB, 0 for the last byte, 1 for the
+ * others: 1|0000110 1|0001101 0|0100000 = 0x86 0x8D 0x20.
+ *
+ * {@link #encode(int, BytesRef)} and {@link #decode(BytesRef)} will correctly
+ * handle any 32-bit integer, but for negative numbers, and positive numbers
+ * with more than 28 significant bits, encoding requires 5 bytes; this is not an
+ * efficient encoding scheme for large positive numbers or any negative number.
+ *
+ * @lucene.experimental
+ */
+public class VInt8 {
+
+ /** The maximum number of bytes needed to encode an integer. */
+ public static final int MAXIMUM_BYTES_NEEDED = 5;
+
+ /**
+ * Decodes an int from the given bytes, starting at {@link BytesRef#offset}.
+ * Returns the decoded int and advances {@link BytesRef#offset} past the bytes read.
+ */
+ public static int decode(BytesRef bytes) {
+ /*
+ This is the original code of this method, but a Hotspot bug
+ corrupted the for-loop of DataInput.readVInt() (see LUCENE-2975)
+ so the loop was unrolled here too, to be on the safe side
+ int value = 0;
+ while (true) {
+ byte first = bytes.bytes[bytes.offset++];
+ value |= first & 0x7F;
+ if ((first & 0x80) == 0) {
+ return value;
+ }
+ value <<= 7;
+ }
+ */
+
+ // byte 1
+ byte b = bytes.bytes[bytes.offset++];
+ if (b >= 0) return b;
+
+ // byte 2
+ int value = b & 0x7F;
+ b = bytes.bytes[bytes.offset++];
+ value = (value << 7) | b & 0x7F;
+ if (b >= 0) return value;
+
+ // byte 3
+ b = bytes.bytes[bytes.offset++];
+ value = (value << 7) | b & 0x7F;
+ if (b >= 0) return value;
+
+ // byte 4
+ b = bytes.bytes[bytes.offset++];
+ value = (value << 7) | b & 0x7F;
+ if (b >= 0) return value;
+
+ // byte 5
+ b = bytes.bytes[bytes.offset++];
+ return (value << 7) | b & 0x7F;
+ }
+
+ /**
+ * Encodes the given number into bytes, starting at {@link BytesRef#length}.
+ * Assumes that the array is large enough.
+ */
+ public static void encode(int value, BytesRef bytes) {
+ if ((value & ~0x7F) == 0) {
+ bytes.bytes[bytes.length] = (byte) value;
+ bytes.length++;
+ } else if ((value & ~0x3FFF) == 0) {
+ bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0x3F80) >> 7));
+ bytes.bytes[bytes.length + 1] = (byte) (value & 0x7F);
+ bytes.length += 2;
+ } else if ((value & ~0x1FFFFF) == 0) {
+ bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0x1FC000) >> 14));
+ bytes.bytes[bytes.length + 1] = (byte) (0x80 | ((value & 0x3F80) >> 7));
+ bytes.bytes[bytes.length + 2] = (byte) (value & 0x7F);
+ bytes.length += 3;
+ } else if ((value & ~0xFFFFFFF) == 0) {
+ bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0xFE00000) >> 21));
+ bytes.bytes[bytes.length + 1] = (byte) (0x80 | ((value & 0x1FC000) >> 14));
+ bytes.bytes[bytes.length + 2] = (byte) (0x80 | ((value & 0x3F80) >> 7));
+ bytes.bytes[bytes.length + 3] = (byte) (value & 0x7F);
+ bytes.length += 4;
+ } else {
+ bytes.bytes[bytes.length] = (byte) (0x80 | ((value & 0xF0000000) >> 28));
+ bytes.bytes[bytes.length + 1] = (byte) (0x80 | ((value & 0xFE00000) >> 21));
+ bytes.bytes[bytes.length + 2] = (byte) (0x80 | ((value & 0x1FC000) >> 14));
+ bytes.bytes[bytes.length + 3] = (byte) (0x80 | ((value & 0x3F80) >> 7));
+ bytes.bytes[bytes.length + 4] = (byte) (value & 0x7F);
+ bytes.length += 5;
+ }
+ }
+
+ private VInt8() {
+ // Just making it impossible to instantiate.
+ }
+
+}
diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntDecoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntDecoder.java
index 2beaf773d78..e9fe5600c9a 100644
--- a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntDecoder.java
+++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntDecoder.java
@@ -1,6 +1,7 @@
package org.apache.lucene.util.encoding;
-import java.io.IOException;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -20,33 +21,19 @@ import java.io.IOException;
*/
/**
- * An {@link IntDecoder} which can decode values encoded by
- * {@link VInt8IntEncoder}.
+ * Decodes values encoded by {@link VInt8IntEncoder}.
*
* @lucene.experimental
*/
public class VInt8IntDecoder extends IntDecoder {
- private boolean legalEOS = true;
-
@Override
- public long decode() throws IOException {
- int value = 0;
- while (true) {
- int first = in.read();
- if (first < 0) {
- if (!legalEOS) {
- throw new IOException("Unexpected End-Of-Stream");
- }
- return EOS;
+ protected void doDecode(BytesRef buf, IntsRef values, int upto) {
+ while (buf.offset < upto) {
+ if (values.length == values.ints.length) {
+ values.grow(values.length + 10); // grow by few items, however not too many
}
- value |= first & 0x7F;
- if ((first & 0x80) == 0) {
- legalEOS = true;
- return value;
- }
- legalEOS = false;
- value <<= 7;
+ values.ints[values.length++] = VInt8.decode(buf);
}
}
diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntEncoder.java b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntEncoder.java
index 8fe6fbc9e09..7c62bf3e035 100644
--- a/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntEncoder.java
+++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/VInt8IntEncoder.java
@@ -1,6 +1,7 @@
package org.apache.lucene.util.encoding;
-import java.io.IOException;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -49,27 +50,14 @@ import java.io.IOException;
public class VInt8IntEncoder extends IntEncoder {
@Override
- public void encode(int value) throws IOException {
- if ((value & ~0x7F) == 0) {
- out.write(value);
- } else if ((value & ~0x3FFF) == 0) {
- out.write(0x80 | (value >> 7));
- out.write(0x7F & value);
- } else if ((value & ~0x1FFFFF) == 0) {
- out.write(0x80 | (value >> 14));
- out.write(0x80 | (value >> 7));
- out.write(0x7F & value);
- } else if ((value & ~0xFFFFFFF) == 0) {
- out.write(0x80 | (value >> 21));
- out.write(0x80 | (value >> 14));
- out.write(0x80 | (value >> 7));
- out.write(0x7F & value);
- } else {
- out.write(0x80 | (value >> 28));
- out.write(0x80 | (value >> 21));
- out.write(0x80 | (value >> 14));
- out.write(0x80 | (value >> 7));
- out.write(0x7F & value);
+ protected void doEncode(IntsRef values, BytesRef buf, int upto) {
+ int maxBytesNeeded = 5 * values.length; // at most 5 bytes per VInt
+ if (buf.bytes.length < maxBytesNeeded) {
+ buf.grow(maxBytesNeeded);
+ }
+
+ for (int i = values.offset; i < upto; i++) {
+ VInt8.encode(values.ints[i], buf);
}
}
diff --git a/lucene/facet/src/java/org/apache/lucene/util/encoding/package.html b/lucene/facet/src/java/org/apache/lucene/util/encoding/package.html
index 2e50e8f38eb..8a81b258e34 100644
--- a/lucene/facet/src/java/org/apache/lucene/util/encoding/package.html
+++ b/lucene/facet/src/java/org/apache/lucene/util/encoding/package.html
@@ -25,49 +25,8 @@ mechanisms to create new ones. The super class for all encoders is
encoders there is a matching {@link
org.apache.lucene.util.encoding.IntDecoder} implementation (not all
encoders need a decoder).
-An encoder encodes the integers that are passed to {@link
-org.apache.lucene.util.encoding.IntEncoder#encode(int) encode} into a
-set output stream (see {@link
-org.apache.lucene.util.encoding.IntEncoder#reInit(OutputStream)
-reInit}). One should always call {@link
-org.apache.lucene.util.encoding.IntEncoder#close() close} when all
-integers have been encoded, to ensure proper finish by the encoder. Some
-encoders buffer values in-memory and encode in batches in order to
-optimize the encoding, and not closing them may result in loss of
-information or corrupt stream.
-
A proper and typical usage of an encoder looks like this:
-
-int[] data = <the values to encode>
-IntEncoder encoder = new VInt8IntEncoder();
-OutputStream out = new ByteArrayOutputStream();
-encoder.reInit(out);
-for (int val : data) {
- encoder.encode(val);
-}
-encoder.close();
-
-// Print the bytes in binary
-byte[] bytes = out.toByteArray();
-for (byte b : bytes) {
- System.out.println(Integer.toBinaryString(b));
-}
-
-Each encoder also implements {@link
-org.apache.lucene.util.encoding.IntEncoder#createMatchingDecoder()
-createMatchingDecoder} which returns the matching decoder for this encoder.
-As mentioned above, not all encoders have a matching decoder (like some
-encoder filters which are explained next), however every encoder should
-return a decoder following a call to that method. To complete the
-example above, one can easily iterate over the decoded values like this:
-
-IntDecoder d = e.createMatchingDecoder();
-d.reInit(new ByteArrayInputStream(bytes));
-long val;
-while ((val = d.decode()) != IntDecoder.EOS) {
- System.out.println(val);
-}
-
-Some encoders don't perform any encoding at all, or do not include an
+
+Some encoders don't perform any encoding at all, or do not include an
encoding logic. Those are called {@link
org.apache.lucene.util.encoding.IntEncoderFilter}s. A filter is an
encoder which delegates the encoding task to a given encoder, however
@@ -76,91 +35,6 @@ example is {@link org.apache.lucene.util.encoding.DGapIntEncoder}
which encodes the gaps between values rather than the values themselves.
Another example is {@link
org.apache.lucene.util.encoding.SortingIntEncoder} which sorts all the
-values in ascending order before they are sent for encoding. This
-encoder aggregates the values in its {@link
-org.apache.lucene.util.encoding.IntEncoder#encode(int) encode} implementation
-and decoding only happens upon calling {@link
-org.apache.lucene.util.encoding.IntEncoder#close() close}.
-
Extending IntEncoder
-Extending {@link org.apache.lucene.util.encoding.IntEncoder} is a very
-easy task. One only needs to implement {@link
-org.apache.lucene.util.encoding.IntEncoder#encode(int) encode} and
-{@link org.apache.lucene.util.encoding.IntEncoder#createMatchingDecoder()
-createMatchingDecoder} as the base implementation takes care of
-re-initializing the output stream and closing it. The following example
-illustrates how can one write an encoder (and a matching decoder) which
-'tags' the stream with type/ID of the encoder. Such tagging is important
-in scenarios where an application uses different encoders for different
-streams, and wants to manage some sort of mapping between an encoder ID
-to an IntEncoder/Decoder implementation, so a proper decoder will be
-initialized on the fly:
-
-public class TaggingIntEncoder extends IntEncoderFilter {
-
- public TaggingIntEncoder(IntEncoder encoder) {
- super(encoder);
- }
-
- @Override
- public void encode(int value) throws IOException {
- encoder.encode(value);
- }
-
- @Override
- public IntDecoder createMatchingDecoder() {
- return new TaggingIntDecoder();
- }
-
- @Override
- public void reInit(OutputStream out) {
- super.reInit(os);
- // Assumes the application has a static EncodersMap class which is able to
- // return a unique ID for a given encoder.
- int encoderID = EncodersMap.getID(encoder);
- this.out.write(encoderID);
- }
-
- @Override
- public String toString() {
- return "Tagging (" + encoder.toString() + ")";
- }
-
-}
-
-And the matching decoder:
-
-public class TaggingIntDecoder extends IntDecoder {
-
- // Will be initialized upon calling reInit.
- private IntDecoder decoder;
-
- @Override
- public void reInit(InputStream in) {
- super.reInit(in);
-
- // Read the ID of the encoder that tagged this stream.
- int encoderID = in.read();
-
- // Assumes EncodersMap can return the proper IntEncoder given the ID.
- decoder = EncodersMap.getEncoder(encoderID).createMatchingDecoder();
- }
-
- @Override
- public long decode() throws IOException {
- return decoder.decode();
- }
-
- @Override
- public String toString() {
- return "Tagging (" + decoder == null ? "none" : decoder.toString() + ")";
- }
-
-}
-
-The example implements TaggingIntEncoder
as a filter over another
-encoder. Even though it does not do any filtering on the actual values, it feels
-right to present it as a filter. Anyway, this is just an example code and one
-can choose to implement it however it makes sense to the application. For
-simplicity, error checking was omitted from the sample code.
+values in ascending order before they are sent for encoding.