From 77e6e291bf34ffaa6f1afc2d9c64779f4b250b65 Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Wed, 1 Nov 2017 12:58:53 -0400 Subject: [PATCH] LUCENE-7994: use int/int scatter map to count facets when number of hits is small relative to number of unique facet labels --- lucene/CHANGES.txt | 4 + .../apache/lucene/search/QueryRescorer.java | 2 +- lucene/facet/build.xml | 1 + lucene/facet/ivy.xml | 6 + .../lucene/facet/LongValueFacetCounts.java | 128 +++------- .../taxonomy/FastTaxonomyFacetCounts.java | 8 +- .../facet/taxonomy/FloatTaxonomyFacets.java | 12 +- .../facet/taxonomy/IntTaxonomyFacets.java | 149 +++++++++--- .../facet/taxonomy/TaxonomyFacetCounts.java | 4 +- .../TaxonomyFacetSumIntAssociations.java | 4 +- .../lucene/facet/taxonomy/TaxonomyFacets.java | 47 +++- .../writercache/LruTaxonomyWriterCache.java | 4 + .../writercache/TaxonomyWriterCache.java | 4 +- .../writercache/UTF8TaxonomyWriterCache.java | 227 +++++++++--------- .../taxonomy/TestTaxonomyFacetCounts.java | 8 + .../TestConcurrentFacetedIndexing.java | 2 + .../TestDirectoryTaxonomyWriter.java | 2 + .../TestUTF8TaxonomyWriterCache.java | 22 +- lucene/ivy-versions.properties | 2 +- lucene/licenses/hppc-0.7.3.jar.sha1 | 1 + lucene/licenses/hppc-LICENSE-ASL.txt | 177 ++++++++++++++ lucene/licenses/hppc-NOTICE.txt | 0 solr/licenses/hppc-0.7.1.jar.sha1 | 1 - solr/licenses/hppc-0.7.3.jar.sha1 | 1 + 24 files changed, 553 insertions(+), 263 deletions(-) create mode 100644 lucene/licenses/hppc-0.7.3.jar.sha1 create mode 100644 lucene/licenses/hppc-LICENSE-ASL.txt create mode 100644 lucene/licenses/hppc-NOTICE.txt delete mode 100644 solr/licenses/hppc-0.7.1.jar.sha1 create mode 100644 solr/licenses/hppc-0.7.3.jar.sha1 diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f214620576a..d0ed234bef0 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -41,6 +41,10 @@ Optimizations * LUCENE-8018: Smaller FieldInfos memory footprint by not retaining unnecessary references to TreeMap entries. 
(Julian Vassev via Adrien Grand) +* LUCENE-7994: Use int/int scatter map to gather facet counts when the + number of hits is small relative to the number of unique facet labels + (Dawid Weiss, Robert Muir, Mike McCandless) + ======================= Lucene 7.1.0 ======================= Changes in Runtime Behavior diff --git a/lucene/core/src/java/org/apache/lucene/search/QueryRescorer.java b/lucene/core/src/java/org/apache/lucene/search/QueryRescorer.java index a8477e9b0e6..73c37d28320 100644 --- a/lucene/core/src/java/org/apache/lucene/search/QueryRescorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/QueryRescorer.java @@ -85,7 +85,7 @@ public abstract class QueryRescorer extends Rescorer { scorer = weight.scorer(readerContext); } - if(scorer != null) { + if (scorer != null) { int targetDoc = docID - docBase; int actualDoc = scorer.docID(); if (actualDoc < targetDoc) { diff --git a/lucene/facet/build.xml b/lucene/facet/build.xml index 1a568ea369c..e91f1629802 100644 --- a/lucene/facet/build.xml +++ b/lucene/facet/build.xml @@ -28,6 +28,7 @@ + diff --git a/lucene/facet/ivy.xml b/lucene/facet/ivy.xml index 326f71f90fc..249c78e36c8 100644 --- a/lucene/facet/ivy.xml +++ b/lucene/facet/ivy.xml @@ -18,4 +18,10 @@ --> + + + + + + diff --git a/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java index 7d7a32bacd5..ac3f5b34a04 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java @@ -40,6 +40,10 @@ import org.apache.lucene.search.LongValuesSource; import org.apache.lucene.util.InPlaceMergeSorter; import org.apache.lucene.util.PriorityQueue; +import com.carrotsearch.hppc.LongIntScatterMap; +import com.carrotsearch.hppc.cursors.LongIntCursor; + + /** {@link Facets} implementation that computes counts for * all uniqute long values, more efficiently counting small 
values (0-1023) using an int array, * and switching to a HashMap for values above 1023. @@ -53,7 +57,7 @@ public class LongValueFacetCounts extends Facets { private final int[] counts = new int[1024]; /** Used for all values that are >= 1K. */ - private final HashTable hashCounts = new HashTable(); + private final LongIntScatterMap hashCounts = new LongIntScatterMap(); private final String field; @@ -253,7 +257,7 @@ public class LongValueFacetCounts extends Facets { if (value >= 0 && value < counts.length) { counts[(int) value]++; } else { - hashCounts.add(value, 1); + hashCounts.addTo(value, 1); } } @@ -276,7 +280,7 @@ public class LongValueFacetCounts extends Facets { /** Returns the specified top number of facets, sorted by count. */ public FacetResult getTopChildrenSortByCount(int topN) { - PriorityQueue pq = new PriorityQueue(Math.min(topN, counts.length + hashCounts.size)) { + PriorityQueue pq = new PriorityQueue(Math.min(topN, counts.length + hashCounts.size())) { @Override protected boolean lessThan(Entry a, Entry b) { // sort by count descending, breaking ties by value ascending: @@ -298,15 +302,15 @@ public class LongValueFacetCounts extends Facets { } } - if (hashCounts.size != 0) { - childCount += hashCounts.size; - for (int i = 0; i < hashCounts.values.length; i++) { - int count = hashCounts.counts[i]; + if (hashCounts.size() != 0) { + childCount += hashCounts.size(); + for (LongIntCursor c : hashCounts) { + int count = c.value; if (count != 0) { if (e == null) { e = new Entry(); } - e.value = hashCounts.values[i]; + e.value = c.key; e.count = count; e = pq.insertWithOverflow(e); } @@ -328,47 +332,47 @@ public class LongValueFacetCounts extends Facets { List labelValues = new ArrayList<>(); // compact & sort hash table's arrays by value + int[] hashCounts = new int[this.hashCounts.size()]; + long[] hashValues = new long[this.hashCounts.size()]; + int upto = 0; - for (int i = 0; i < hashCounts.values.length; i++) { - if (hashCounts.counts[i] != 0) { - 
hashCounts.counts[upto] = hashCounts.counts[i]; - hashCounts.values[upto] = hashCounts.values[i]; + for (LongIntCursor c : this.hashCounts) { + if (c.value != 0) { + hashCounts[upto] = c.value; + hashValues[upto] = c.key; upto++; } } - // zero fill all remaining counts so if we are called again we don't mistake these as real values - Arrays.fill(hashCounts.counts, upto, hashCounts.counts.length, 0); - - assert upto == hashCounts.size : "upto=" + upto + " hashCounts.size=" + hashCounts.size; + assert upto == this.hashCounts.size() : "upto=" + upto + " hashCounts.size=" + this.hashCounts.size(); new InPlaceMergeSorter() { @Override public int compare(int i, int j) { - return Long.compare(hashCounts.values[i], hashCounts.values[j]); + return Long.compare(hashValues[i], hashValues[j]); } @Override public void swap(int i, int j) { - int x = hashCounts.counts[i]; - hashCounts.counts[i] = hashCounts.counts[j]; - hashCounts.counts[j] = x; + int x = hashCounts[i]; + hashCounts[i] = hashCounts[j]; + hashCounts[j] = x; - long y = hashCounts.values[j]; - hashCounts.values[j] = hashCounts.values[i]; - hashCounts.values[i] = y; + long y = hashValues[j]; + hashValues[j] = hashValues[i]; + hashValues[i] = y; } }.sort(0, upto); boolean countsAdded = false; for (int i = 0; i < upto; i++) { - if (countsAdded == false && hashCounts.values[i] >= counts.length) { + if (countsAdded == false && hashValues[i] >= counts.length) { countsAdded = true; appendCounts(labelValues); } - labelValues.add(new LabelAndValue(Long.toString(hashCounts.values[i]), - hashCounts.counts[i])); + labelValues.add(new LabelAndValue(Long.toString(hashValues[i]), + hashCounts[i])); } if (countsAdded == false) { @@ -413,13 +417,13 @@ public class LongValueFacetCounts extends Facets { } } - if (hashCounts.size != 0) { - for (int i = 0; i < hashCounts.values.length; i++) { - if (hashCounts.counts[i] != 0) { + if (hashCounts.size() != 0) { + for (LongIntCursor c : hashCounts) { + if (c.value != 0) { b.append(" "); - 
b.append(hashCounts.values[i]); + b.append(c.key); b.append(" -> count="); - b.append(hashCounts.counts[i]); + b.append(c.value); b.append('\n'); } } @@ -427,66 +431,4 @@ public class LongValueFacetCounts extends Facets { return b.toString(); } - - /** Native typed hash table. */ - static class HashTable { - - static final float LOAD_FACTOR = 0.7f; - - long[] values; // values identifying a value - int[] counts; - int mask; - int size; - int threshold; - - HashTable() { - int capacity = 64; // must be a power of 2 - values = new long[capacity]; - counts = new int[capacity]; - mask = capacity - 1; - size = 0; - threshold = (int) (capacity * LOAD_FACTOR); - } - - private int hash(long v) { - int h = (int) (v ^ (v >>> 32)); - h = (31 * h) & mask; // * 31 to try to use the whole table, even if values are dense - return h; - } - - void add(long value, int inc) { - if (size >= threshold) { - rehash(); - } - final int h = hash(value); - for (int slot = h;; slot = (slot + 1) & mask) { - if (counts[slot] == 0) { - values[slot] = value; - ++size; - } else if (values[slot] != value) { - continue; - } - counts[slot] += inc; - break; - } - } - - private void rehash() { - final long[] oldValues = values; - final int[] oldCounts = counts; - - final int newCapacity = values.length * 2; - values = new long[newCapacity]; - counts = new int[newCapacity]; - mask = newCapacity - 1; - threshold = (int) (LOAD_FACTOR * newCapacity); - size = 0; - - for (int i = 0; i < oldValues.length; ++i) { - if (oldCounts[i] > 0) { - add(oldValues[i], oldCounts[i]); - } - } - } - } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java index d560d40927a..47673361cb8 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java @@ -50,7 +50,7 @@ public 
class FastTaxonomyFacetCounts extends IntTaxonomyFacets { * FacetsConfig#setIndexFieldName} to change the index * field name for certain dimensions. */ public FastTaxonomyFacetCounts(String indexFieldName, TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc) throws IOException { - super(indexFieldName, taxoReader, config); + super(indexFieldName, taxoReader, config, fc); count(fc.getMatchingDocs()); } @@ -60,7 +60,7 @@ public class FastTaxonomyFacetCounts extends IntTaxonomyFacets { * the same result as searching on {@link MatchAllDocsQuery}, * but faster */ public FastTaxonomyFacetCounts(String indexFieldName, IndexReader reader, TaxonomyReader taxoReader, FacetsConfig config) throws IOException { - super(indexFieldName, taxoReader, config); + super(indexFieldName, taxoReader, config, null); countAll(reader); } @@ -85,7 +85,7 @@ public class FastTaxonomyFacetCounts extends IntTaxonomyFacets { byte b = bytes[offset++]; if (b >= 0) { prev = ord = ((ord << 7) | b) + prev; - ++values[ord]; + increment(ord); ord = 0; } else { ord = (ord << 7) | (b & 0x7F); @@ -120,7 +120,7 @@ public class FastTaxonomyFacetCounts extends IntTaxonomyFacets { byte b = bytes[offset++]; if (b >= 0) { prev = ord = ((ord << 7) | b) + prev; - ++values[ord]; + increment(ord); ord = 0; } else { ord = (ord << 7) | (b & 0x7F); diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java index 2e8231d8d6c..371e3277fc5 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FloatTaxonomyFacets.java @@ -29,6 +29,8 @@ import org.apache.lucene.facet.TopOrdAndFloatQueue; * to a per-ords float[]. */ public abstract class FloatTaxonomyFacets extends TaxonomyFacets { + // TODO: also use native hash map for sparse collection, like IntTaxonomyFacets + /** Per-ordinal value. 
*/ protected final float[] values; @@ -41,6 +43,7 @@ public abstract class FloatTaxonomyFacets extends TaxonomyFacets { /** Rolls up any single-valued hierarchical dimensions. */ protected void rollup() throws IOException { // Rollup any necessary dims: + int[] children = getChildren(); for(Map.Entry ent : config.getDimConfigs().entrySet()) { String dim = ent.getKey(); DimConfig ft = ent.getValue(); @@ -52,7 +55,9 @@ public abstract class FloatTaxonomyFacets extends TaxonomyFacets { } } - private float rollup(int ord) { + private float rollup(int ord) throws IOException { + int[] children = getChildren(); + int[] siblings = getSiblings(); float sum = 0; while (ord != TaxonomyReader.INVALID_ORDINAL) { float childValue = values[ord] + rollup(children[ord]); @@ -97,6 +102,9 @@ public abstract class FloatTaxonomyFacets extends TaxonomyFacets { TopOrdAndFloatQueue q = new TopOrdAndFloatQueue(Math.min(taxoReader.getSize(), topN)); float bottomValue = 0; + int[] children = getChildren(); + int[] siblings = getSiblings(); + int ord = children[dimOrd]; float sumValues = 0; int childCount = 0; @@ -146,4 +154,4 @@ public abstract class FloatTaxonomyFacets extends TaxonomyFacets { return new FacetResult(dim, path, sumValues, labelValues, childCount); } -} \ No newline at end of file +} diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java index 1b0a96d4a20..71b628cb607 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/IntTaxonomyFacets.java @@ -20,28 +20,87 @@ import java.io.IOException; import java.util.Map; import org.apache.lucene.facet.FacetResult; -import org.apache.lucene.facet.FacetsConfig; +import org.apache.lucene.facet.FacetsCollector.MatchingDocs; +import org.apache.lucene.facet.FacetsCollector; import org.apache.lucene.facet.FacetsConfig.DimConfig; 
+import org.apache.lucene.facet.FacetsConfig; import org.apache.lucene.facet.LabelAndValue; import org.apache.lucene.facet.TopOrdAndIntQueue; +import com.carrotsearch.hppc.IntIntScatterMap; +import com.carrotsearch.hppc.cursors.IntIntCursor; + /** Base class for all taxonomy-based facets that aggregate * to a per-ords int[]. */ public abstract class IntTaxonomyFacets extends TaxonomyFacets { /** Per-ordinal value. */ - protected final int[] values; + private final int[] values; + private final IntIntScatterMap sparseValues; /** Sole constructor. */ - protected IntTaxonomyFacets(String indexFieldName, TaxonomyReader taxoReader, FacetsConfig config) throws IOException { + protected IntTaxonomyFacets(String indexFieldName, TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc) throws IOException { super(indexFieldName, taxoReader, config); - values = new int[taxoReader.getSize()]; + + if (useHashTable(fc, taxoReader)) { + sparseValues = new IntIntScatterMap(); + values = null; + } else { + sparseValues = null; + values = new int[taxoReader.getSize()]; + } } - + + /** Return true if a sparse hash table should be used for counting, instead of a dense int[]. */ + protected boolean useHashTable(FacetsCollector fc, TaxonomyReader taxoReader) { + if (taxoReader.getSize() < 1024) { + // small number of unique values: use an array + return false; + } + + if (fc == null) { + // counting all docs: use an array + return false; + } + + int maxDoc = 0; + int sumTotalHits = 0; + for (MatchingDocs docs : fc.getMatchingDocs()) { + sumTotalHits += docs.totalHits; + maxDoc += docs.context.reader().maxDoc(); + } + + // if our result set is < 10% of the index, we collect sparsely (use hash map): + return sumTotalHits < maxDoc/10; + } + + /** Increment the count for this ordinal by 1. */ + protected void increment(int ordinal) { + increment(ordinal, 1); + } + + /** Increment the count for this ordinal by {@code amount}.. 
*/ + protected void increment(int ordinal, int amount) { + if (sparseValues != null) { + sparseValues.addTo(ordinal, amount); + } else { + values[ordinal] += amount; + } + } + + private int getValue(int ordinal) { + if (sparseValues != null) { + return sparseValues.get(ordinal); + } else { + return values[ordinal]; + } + } + /** Rolls up any single-valued hierarchical dimensions. */ protected void rollup() throws IOException { // Rollup any necessary dims: + int[] children = null; for(Map.Entry ent : config.getDimConfigs().entrySet()) { String dim = ent.getKey(); DimConfig ft = ent.getValue(); @@ -50,18 +109,23 @@ public abstract class IntTaxonomyFacets extends TaxonomyFacets { // It can be -1 if this field was declared in the // config but never indexed: if (dimRootOrd > 0) { - values[dimRootOrd] += rollup(children[dimRootOrd]); + if (children == null) { + // lazy init + children = getChildren(); + } + increment(dimRootOrd, rollup(children[dimRootOrd])); } } } } - private int rollup(int ord) { + private int rollup(int ord) throws IOException { + int[] children = getChildren(); + int[] siblings = getSiblings(); int sum = 0; while (ord != TaxonomyReader.INVALID_ORDINAL) { - int childValue = values[ord] + rollup(children[ord]); - values[ord] = childValue; - sum += childValue; + increment(ord, rollup(children[ord])); + sum += getValue(ord); ord = siblings[ord]; } return sum; @@ -83,7 +147,7 @@ public abstract class IntTaxonomyFacets extends TaxonomyFacets { if (ord < 0) { return -1; } - return values[ord]; + return getValue(ord); } @Override @@ -102,29 +166,58 @@ public abstract class IntTaxonomyFacets extends TaxonomyFacets { int bottomValue = 0; - int ord = children[dimOrd]; int totValue = 0; int childCount = 0; TopOrdAndIntQueue.OrdAndValue reuse = null; - while(ord != TaxonomyReader.INVALID_ORDINAL) { - if (values[ord] > 0) { - totValue += values[ord]; - childCount++; - if (values[ord] > bottomValue) { - if (reuse == null) { - reuse = new 
TopOrdAndIntQueue.OrdAndValue(); - } - reuse.ord = ord; - reuse.value = values[ord]; - reuse = q.insertWithOverflow(reuse); - if (q.size() == topN) { - bottomValue = q.top().value; + + // TODO: would be faster if we had a "get the following children" API? then we + // can make a single pass over the hashmap + + if (sparseValues != null) { + for (IntIntCursor c : sparseValues) { + int count = c.value; + int ord = c.key; + if (parents[ord] == dimOrd && count > 0) { + totValue += count; + childCount++; + if (count > bottomValue) { + if (reuse == null) { + reuse = new TopOrdAndIntQueue.OrdAndValue(); + } + reuse.ord = ord; + reuse.value = count; + reuse = q.insertWithOverflow(reuse); + if (q.size() == topN) { + bottomValue = q.top().value; + } } } } + } else { + int[] children = getChildren(); + int[] siblings = getSiblings(); + int ord = children[dimOrd]; + while(ord != TaxonomyReader.INVALID_ORDINAL) { + int value = values[ord]; + if (value > 0) { + totValue += value; + childCount++; + if (value > bottomValue) { + if (reuse == null) { + reuse = new TopOrdAndIntQueue.OrdAndValue(); + } + reuse.ord = ord; + reuse.value = value; + reuse = q.insertWithOverflow(reuse); + if (q.size() == topN) { + bottomValue = q.top().value; + } + } + } - ord = siblings[ord]; + ord = siblings[ord]; + } } if (totValue == 0) { @@ -133,7 +226,7 @@ public abstract class IntTaxonomyFacets extends TaxonomyFacets { if (dimConfig.multiValued) { if (dimConfig.requireDimCount) { - totValue = values[dimOrd]; + totValue = getValue(dimOrd); } else { // Our sum'd value is not correct, in general: totValue = -1; @@ -151,4 +244,4 @@ public abstract class IntTaxonomyFacets extends TaxonomyFacets { return new FacetResult(dim, path, totValue, labelValues, childCount); } -} \ No newline at end of file +} diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetCounts.java index ce82e27b176..891851e5514 
100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/TaxonomyFacetCounts.java @@ -39,7 +39,7 @@ public class TaxonomyFacetCounts extends IntTaxonomyFacets { * {@link OrdinalsReader}; otherwise use {@link * FastTaxonomyFacetCounts}. */ public TaxonomyFacetCounts(OrdinalsReader ordinalsReader, TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc) throws IOException { - super(ordinalsReader.getIndexFieldName(), taxoReader, config); + super(ordinalsReader.getIndexFieldName(), taxoReader, config, fc); this.ordinalsReader = ordinalsReader; count(fc.getMatchingDocs()); } @@ -54,7 +54,7 @@ public class TaxonomyFacetCounts extends IntTaxonomyFacets { while ((doc = docs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { ords.get(doc, scratch); for(int i=0;i getAllDims(int topN) throws IOException { + int[] children = getChildren(); + int[] siblings = getSiblings(); int ord = children[TaxonomyReader.ROOT_ORDINAL]; List results = new ArrayList<>(); while (ord != TaxonomyReader.INVALID_ORDINAL) { @@ -101,5 +137,4 @@ public abstract class TaxonomyFacets extends Facets { Collections.sort(results, BY_VALUE_THEN_DIM); return results; } - } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/LruTaxonomyWriterCache.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/LruTaxonomyWriterCache.java index 6dc8cd23724..1182ffd4049 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/LruTaxonomyWriterCache.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/LruTaxonomyWriterCache.java @@ -89,6 +89,10 @@ public class LruTaxonomyWriterCache implements TaxonomyWriterCache { cache = null; } + public int size() { + return cache.getSize(); + } + @Override public synchronized int get(FacetLabel categoryPath) { Integer res = cache.get(categoryPath); diff --git 
a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/TaxonomyWriterCache.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/TaxonomyWriterCache.java index f31042648a8..b72db214063 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/TaxonomyWriterCache.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/TaxonomyWriterCache.java @@ -94,5 +94,7 @@ public interface TaxonomyWriterCache { * assume that the cache is still operable after this method returns. */ public void clear(); - + + /** How many labels are currently stored in the cache. */ + public int size(); } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/UTF8TaxonomyWriterCache.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/UTF8TaxonomyWriterCache.java index 70bb6bad7e6..1c0adf72215 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/UTF8TaxonomyWriterCache.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/UTF8TaxonomyWriterCache.java @@ -31,128 +31,133 @@ import org.apache.lucene.util.UnicodeUtil; /** A "cache" that never frees memory, and stores labels in a BytesRefHash (utf-8 encoding). 
*/ public final class UTF8TaxonomyWriterCache implements TaxonomyWriterCache, Accountable { - private final ThreadLocal bytes = new ThreadLocal() { - @Override - protected BytesRefBuilder initialValue() { - return new BytesRefBuilder(); - } - }; - private final Counter bytesUsed = Counter.newCounter(); - private final BytesRefHash map = new BytesRefHash(new ByteBlockPool(new DirectTrackingAllocator(bytesUsed))); + private final ThreadLocal bytes = new ThreadLocal() { + @Override + protected BytesRefBuilder initialValue() { + return new BytesRefBuilder(); + } + }; + private final Counter bytesUsed = Counter.newCounter(); + private final BytesRefHash map = new BytesRefHash(new ByteBlockPool(new DirectTrackingAllocator(bytesUsed))); - private final static int ORDINALS_PAGE_SIZE = 65536; - private final static int ORDINALS_PAGE_MASK = ORDINALS_PAGE_SIZE - 1; + private final static int ORDINALS_PAGE_SIZE = 65536; + private final static int ORDINALS_PAGE_MASK = ORDINALS_PAGE_SIZE - 1; - private volatile int[][] ordinals; + private volatile int[][] ordinals; - // How many labels we are storing: - private int count; + // How many labels we are storing: + private int count; - // How many pages in ordinals we've allocated: - private int pageCount; + // How many pages in ordinals we've allocated: + private int pageCount; - /** Sole constructor. */ - public UTF8TaxonomyWriterCache() { - ordinals = new int[1][]; - ordinals[0] = new int[ORDINALS_PAGE_SIZE]; + /** Sole constructor. 
*/ + public UTF8TaxonomyWriterCache() { + ordinals = new int[1][]; + ordinals[0] = new int[ORDINALS_PAGE_SIZE]; + } + + @Override + public int get(FacetLabel label) { + BytesRef bytes = toBytes(label); + int id; + synchronized (this) { + id = map.find(bytes); } - - @Override - public int get(FacetLabel label) { - BytesRef bytes = toBytes(label); - int id; - synchronized (this) { - id = map.find(bytes); - } - if (id == -1) { - return LabelToOrdinal.INVALID_ORDINAL; - } - int page = id / ORDINALS_PAGE_SIZE; - int offset = id % ORDINALS_PAGE_MASK; - return ordinals[page][offset]; + if (id == -1) { + return LabelToOrdinal.INVALID_ORDINAL; } + int page = id / ORDINALS_PAGE_SIZE; + int offset = id % ORDINALS_PAGE_MASK; + return ordinals[page][offset]; + } - // Called only from assert - private boolean assertSameOrdinal(FacetLabel label, int id, int ord) { - id = -id - 1; - int page = id / ORDINALS_PAGE_SIZE; - int offset = id % ORDINALS_PAGE_MASK; - int oldOrd = ordinals[page][offset]; - if (oldOrd != ord) { - throw new IllegalArgumentException("label " + label + " was already cached, with old ord=" + oldOrd + " versus new ord=" + ord); - } - return true; + // Called only from assert + private boolean assertSameOrdinal(FacetLabel label, int id, int ord) { + id = -id - 1; + int page = id / ORDINALS_PAGE_SIZE; + int offset = id % ORDINALS_PAGE_MASK; + int oldOrd = ordinals[page][offset]; + if (oldOrd != ord) { + throw new IllegalArgumentException("label " + label + " was already cached, with old ord=" + oldOrd + " versus new ord=" + ord); } + return true; + } - @Override - public boolean put(FacetLabel label, int ord) { - BytesRef bytes = toBytes(label); - int id; - synchronized (this) { - id = map.add(bytes); - if (id < 0) { - assert assertSameOrdinal(label, id, ord); - return false; - } - assert id == count; - int page = id / ORDINALS_PAGE_SIZE; - int offset = id % ORDINALS_PAGE_MASK; - if (page == pageCount) { - if (page == ordinals.length) { - int[][] newOrdinals = new 
int[ArrayUtil.oversize(page+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)][]; - System.arraycopy(ordinals, 0, newOrdinals, 0, ordinals.length); - ordinals = newOrdinals; - } - ordinals[page] = new int[ORDINALS_PAGE_MASK]; - pageCount++; - } - ordinals[page][offset] = ord; - count++; - - // we never prune from the cache - return false; - } - } - - @Override - public boolean isFull() { - // we are never full + @Override + public boolean put(FacetLabel label, int ord) { + BytesRef bytes = toBytes(label); + int id; + synchronized (this) { + id = map.add(bytes); + if (id < 0) { + assert assertSameOrdinal(label, id, ord); return false; - } - - @Override - public synchronized void clear() { - map.clear(); - map.reinit(); - ordinals = new int[1][]; - ordinals[0] = new int[ORDINALS_PAGE_SIZE]; - count = 0; - pageCount = 0; - assert bytesUsed.get() == 0; - } - - @Override - public synchronized long ramBytesUsed() { - return bytesUsed.get() + pageCount * ORDINALS_PAGE_SIZE * RamUsageEstimator.NUM_BYTES_INT; - } - - @Override - public void close() { - } - - private static final byte DELIM_CHAR = (byte) 0x1F; - - private BytesRef toBytes(FacetLabel label) { - BytesRefBuilder bytes = this.bytes.get(); - bytes.clear(); - for (int i = 0; i < label.length; i++) { - String part = label.components[i]; - if (i > 0) { - bytes.append(DELIM_CHAR); - } - bytes.grow(bytes.length() + UnicodeUtil.maxUTF8Length(part.length())); - bytes.setLength(UnicodeUtil.UTF16toUTF8(part, 0, part.length(), bytes.bytes(), bytes.length())); + } + assert id == count; + int page = id / ORDINALS_PAGE_SIZE; + int offset = id % ORDINALS_PAGE_MASK; + if (page == pageCount) { + if (page == ordinals.length) { + int[][] newOrdinals = new int[ArrayUtil.oversize(page+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)][]; + System.arraycopy(ordinals, 0, newOrdinals, 0, ordinals.length); + ordinals = newOrdinals; } - return bytes.get(); + ordinals[page] = new int[ORDINALS_PAGE_MASK]; + pageCount++; + } + ordinals[page][offset] = 
ord; + count++; + + // we never prune from the cache + return false; } + } + + @Override + public boolean isFull() { + // we are never full + return false; + } + + @Override + public synchronized void clear() { + map.clear(); + map.reinit(); + ordinals = new int[1][]; + ordinals[0] = new int[ORDINALS_PAGE_SIZE]; + count = 0; + pageCount = 0; + assert bytesUsed.get() == 0; + } + + /** How many labels are currently stored in the cache. */ + public int size() { + return count; + } + + @Override + public synchronized long ramBytesUsed() { + return bytesUsed.get() + pageCount * ORDINALS_PAGE_SIZE * RamUsageEstimator.NUM_BYTES_INT; + } + + @Override + public void close() { + } + + private static final byte DELIM_CHAR = (byte) 0x1F; + + private BytesRef toBytes(FacetLabel label) { + BytesRefBuilder bytes = this.bytes.get(); + bytes.clear(); + for (int i = 0; i < label.length; i++) { + String part = label.components[i]; + if (i > 0) { + bytes.append(DELIM_CHAR); + } + bytes.grow(bytes.length() + UnicodeUtil.maxUTF8Length(part.length())); + bytes.setLength(UnicodeUtil.UTF16toUTF8(part, 0, part.length(), bytes.bytes(), bytes.length())); + } + return bytes.get(); + } } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetCounts.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetCounts.java index 3bb480d1fa9..a12563e1d1f 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetCounts.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyFacetCounts.java @@ -105,6 +105,10 @@ public class TestTaxonomyFacetCounts extends FacetTestCase { Facets facets = getAllFacets(FacetsConfig.DEFAULT_INDEX_FIELD_NAME, searcher, taxoReader, config); + // Publish Date is hierarchical, so we should have loaded all 3 int[]: + assertTrue(((TaxonomyFacets) facets).siblingsLoaded()); + assertTrue(((TaxonomyFacets) facets).childrenLoaded()); + // Retrieve & verify results: 
assertEquals("dim=Publish Date path=[] value=5 childCount=3\n 2010 (2)\n 2012 (2)\n 1999 (1)\n", facets.getTopChildren(10, "Publish Date").toString()); assertEquals("dim=Author path=[] value=5 childCount=4\n Lisa (2)\n Bob (1)\n Susan (1)\n Frank (1)\n", facets.getTopChildren(10, "Author").toString()); @@ -330,6 +334,10 @@ public class TestTaxonomyFacetCounts extends FacetTestCase { assertEquals(1, facets.getSpecificValue("dim", "test\u001Fone")); assertEquals(1, facets.getSpecificValue("dim", "test\u001Etwo")); + // no hierarchy + assertFalse(((TaxonomyFacets) facets).siblingsLoaded()); + assertFalse(((TaxonomyFacets) facets).childrenLoaded()); + FacetResult result = facets.getTopChildren(10, "dim"); assertEquals("dim=dim path=[] value=-1 childCount=2\n test\u001Fone (1)\n test\u001Etwo (1)\n", result.toString()); writer.close(); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestConcurrentFacetedIndexing.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestConcurrentFacetedIndexing.java index bf10a3b4470..9c628d51177 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestConcurrentFacetedIndexing.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestConcurrentFacetedIndexing.java @@ -51,6 +51,8 @@ public class TestConcurrentFacetedIndexing extends FacetTestCase { public boolean isFull() { return true; } @Override public void clear() {} + @Override + public int size() { return 0; } }; diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java index 1f330e00a6d..1b5b826423c 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java @@ -65,6 +65,8 @@ public 
class TestDirectoryTaxonomyWriter extends FacetTestCase { public boolean isFull() { return true; } @Override public void clear() {} + @Override + public int size() { return 0; } }; diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestUTF8TaxonomyWriterCache.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestUTF8TaxonomyWriterCache.java index 31a5e49662c..c36abe1ab3b 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestUTF8TaxonomyWriterCache.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestUTF8TaxonomyWriterCache.java @@ -45,19 +45,19 @@ public class TestUTF8TaxonomyWriterCache extends FacetTestCase { int numParts = TestUtil.nextInt(random(), 1, 5); StringBuilder b = new StringBuilder(); for (int i=0;i<numParts;i++) { - String part = null; - while (true) { - part = TestUtil.randomRealisticUnicodeString(random(), 16); - part = part.replace("/", ""); - if (part.length() > 0) { - break; - } + String part = null; + while (true) { + part = TestUtil.randomRealisticUnicodeString(random(), 16); + part = part.replace("/", ""); + if (part.length() > 0) { + break; } + } - if (i > 0) { - b.append('/'); - } - b.append(part); + if (i > 0) { + b.append('/'); + } + b.append(part); } uniqueValuesSet.add(b.toString()); } diff --git a/lucene/ivy-versions.properties b/lucene/ivy-versions.properties index d020231e5d3..b06adac5d3c 100644 --- a/lucene/ivy-versions.properties +++ b/lucene/ivy-versions.properties @@ -9,7 +9,7 @@ com.carrotsearch.randomizedtesting.version = 2.5.3 /com.carrotsearch.randomizedtesting/junit4-ant = ${com.carrotsearch.randomizedtesting.version} /com.carrotsearch.randomizedtesting/randomizedtesting-runner = ${com.carrotsearch.randomizedtesting.version} -/com.carrotsearch/hppc = 0.7.1 +/com.carrotsearch/hppc = 0.7.3 /com.cybozu.labs/langdetect = 1.1-20120112 diff --git a/lucene/licenses/hppc-0.7.3.jar.sha1 b/lucene/licenses/hppc-0.7.3.jar.sha1 new file mode 100644 index 00000000000..df3f33a32ce --- /dev/null +++ b/lucene/licenses/hppc-0.7.3.jar.sha1 @@ -0,0 +1 @@ +1a9c77da84ac7db6a78b49c60947983490ece324 diff
--git a/lucene/licenses/hppc-LICENSE-ASL.txt b/lucene/licenses/hppc-LICENSE-ASL.txt new file mode 100644 index 00000000000..f433b1a53f5 --- /dev/null +++ b/lucene/licenses/hppc-LICENSE-ASL.txt @@ -0,0 +1,177 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS diff --git a/lucene/licenses/hppc-NOTICE.txt b/lucene/licenses/hppc-NOTICE.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/solr/licenses/hppc-0.7.1.jar.sha1 b/solr/licenses/hppc-0.7.1.jar.sha1 deleted file mode 100644 index d3e00e1c476..00000000000 --- a/solr/licenses/hppc-0.7.1.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -8b5057f74ea378c0150a1860874a3ebdcb713767 diff --git a/solr/licenses/hppc-0.7.3.jar.sha1 b/solr/licenses/hppc-0.7.3.jar.sha1 new file mode 100644 index 00000000000..df3f33a32ce --- /dev/null +++ b/solr/licenses/hppc-0.7.3.jar.sha1 @@ -0,0 +1 @@ +1a9c77da84ac7db6a78b49c60947983490ece324