mirror of https://github.com/apache/lucene.git
LUCENE-10517: Improve performance of SortedSetDV faceting by iterating on class types (#812)
This commit is contained in:
parent
08f848a582
commit
3bcc40efe9
|
@ -306,7 +306,12 @@ public class StringValueFacetCounts extends Facets {
|
|||
assert ordinalMap == null;
|
||||
|
||||
LeafReaderContext context = leaves.get(0);
|
||||
countOneSegment(docValues, context.ord, null, context.reader().getLiveDocs());
|
||||
Bits liveDocs = context.reader().getLiveDocs();
|
||||
if (liveDocs == null) {
|
||||
countOneSegmentNHLD(docValues, context.ord);
|
||||
} else {
|
||||
countOneSegment(docValues, context.ord, null, liveDocs);
|
||||
}
|
||||
} else {
|
||||
// Since we have more than one segment, we should have a non-null ordinalMap and docValues
|
||||
// should be a MultiSortedSetDocValues instance:
|
||||
|
@ -318,7 +323,12 @@ public class StringValueFacetCounts extends Facets {
|
|||
|
||||
for (int i = 0; i < numLeaves; i++) {
|
||||
LeafReaderContext context = leaves.get(i);
|
||||
countOneSegment(multiValues.values[i], context.ord, null, context.reader().getLiveDocs());
|
||||
Bits liveDocs = context.reader().getLiveDocs();
|
||||
if (liveDocs == null) {
|
||||
countOneSegmentNHLD(multiValues.values[i], context.ord);
|
||||
} else {
|
||||
countOneSegment(multiValues.values[i], context.ord, null, liveDocs);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -339,7 +349,8 @@ public class StringValueFacetCounts extends Facets {
|
|||
// all doc values for this segment:
|
||||
DocIdSetIterator it;
|
||||
if (hits == null) {
|
||||
it = (liveDocs != null) ? FacetUtils.liveDocsDISI(valuesIt, liveDocs) : valuesIt;
|
||||
assert liveDocs != null;
|
||||
it = FacetUtils.liveDocsDISI(valuesIt, liveDocs);
|
||||
} else {
|
||||
it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt));
|
||||
}
|
||||
|
@ -440,6 +451,86 @@ public class StringValueFacetCounts extends Facets {
|
|||
}
|
||||
}
|
||||
|
||||
// Variant of countOneSegment, that has No Hits or Live Docs
|
||||
/**
 * Counts every document that has a value in one segment, with no hit filtering and no live-docs
 * filtering ("NHLD" = No Hits, no Live Docs). The doc values iterator is walked directly via
 * {@code nextDoc()}; the call sites visible in this class only take this path when the segment's
 * live docs are {@code null}.
 *
 * <p>{@code totalDocCount} is incremented once per document visited, and per-ordinal counts are
 * recorded through {@code increment}. When the {@code ordinalMap} field is non-null, counts are
 * first accumulated per segment ordinal and then remapped to global ordinals.
 *
 * @param multiValues doc values for this segment; unwrapped to a single-valued view when possible
 * @param segmentOrd index passed to {@code ordinalMap.getGlobalOrds} to select this segment's
 *     ordinal mapping; unused when {@code ordinalMap} is null
 * @throws IOException if reading the doc values fails
 */
private void countOneSegmentNHLD(SortedSetDocValues multiValues, int segmentOrd)
|
||||
throws IOException {
|
||||
|
||||
// It's slightly more efficient to work against SortedDocValues if the field is actually
|
||||
// single-valued (see: LUCENE-5309)
|
||||
SortedDocValues singleValues = DocValues.unwrapSingleton(multiValues);
|
||||
|
||||
if (ordinalMap == null) {
|
||||
// If there's no ordinal map we don't need to map segment ordinals to globals, so counting
|
||||
// is very straightforward: segment ordinals are used as-is.
|
||||
if (singleValues != null) {
|
||||
for (int doc = singleValues.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = singleValues.nextDoc()) {
|
||||
increment(singleValues.ordValue());
|
||||
totalDocCount++;
|
||||
}
|
||||
} else {
|
||||
for (int doc = multiValues.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = multiValues.nextDoc()) {
|
||||
// Guards totalDocCount so a doc with several ords is still counted only once.
boolean countedDocInTotal = false;
|
||||
for (int term = (int) multiValues.nextOrd();
|
||||
term != SortedSetDocValues.NO_MORE_ORDS;
|
||||
term = (int) multiValues.nextOrd()) {
|
||||
increment(term);
|
||||
if (countedDocInTotal == false) {
|
||||
totalDocCount++;
|
||||
countedDocInTotal = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// We need to map segment ordinals to globals. This variant has no hits to weigh against
|
||||
// the segment's cardinality, so there is only one approach here: count per segment
|
||||
// ordinal first, then remap each non-zero count once.
|
||||
final LongValues ordMap = ordinalMap.getGlobalOrds(segmentOrd);
|
||||
int segmentCardinality = (int) multiValues.getValueCount();
|
||||
|
||||
// First count in seg-ord space.
|
||||
// Every document with a value is visited, so the segment is counted densely into an
|
||||
// array with one slot per segment ordinal. This defers the ordMap lookup to one call
|
||||
// per distinct ordinal instead of one per (doc, ord) pair.
|
||||
final int[] segCounts = new int[segmentCardinality];
|
||||
if (singleValues != null) {
|
||||
for (int doc = singleValues.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = singleValues.nextDoc()) {
|
||||
segCounts[singleValues.ordValue()]++;
|
||||
totalDocCount++;
|
||||
}
|
||||
} else {
|
||||
for (int doc = multiValues.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = multiValues.nextDoc()) {
|
||||
// Guards totalDocCount so a doc with several ords is still counted only once.
boolean countedDocInTotal = false;
|
||||
for (int term = (int) multiValues.nextOrd();
|
||||
term != SortedSetDocValues.NO_MORE_ORDS;
|
||||
term = (int) multiValues.nextOrd()) {
|
||||
segCounts[term]++;
|
||||
if (countedDocInTotal == false) {
|
||||
totalDocCount++;
|
||||
countedDocInTotal = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Then, migrate to global ords (skipping ordinals that were never seen):
|
||||
for (int ord = 0; ord < segmentCardinality; ord++) {
|
||||
int count = segCounts[ord];
|
||||
if (count != 0) {
|
||||
increment((int) ordMap.get(ord), count);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Adds one to the count for {@code ordinal} by delegating to {@code increment(ordinal, 1)}.
 *
 * @param ordinal the ordinal whose count is bumped
 */
private void increment(int ordinal) {
|
||||
increment(ordinal, 1);
|
||||
}
|
||||
|
|
|
@ -258,15 +258,39 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
|
|||
if (hits != null && hits.totalHits < numSegOrds / 10) {
|
||||
// Remap every ord to global ord as we iterate:
|
||||
if (singleValues != null) {
|
||||
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
||||
counts.incrementAndGet((int) ordMap.get(singleValues.ordValue()));
|
||||
if (singleValues == it) {
|
||||
for (int doc = singleValues.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = singleValues.nextDoc()) {
|
||||
counts.incrementAndGet((int) ordMap.get(singleValues.ordValue()));
|
||||
}
|
||||
} else {
|
||||
for (int doc = it.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = it.nextDoc()) {
|
||||
counts.incrementAndGet((int) ordMap.get(singleValues.ordValue()));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
||||
int term = (int) multiValues.nextOrd();
|
||||
while (term != SortedSetDocValues.NO_MORE_ORDS) {
|
||||
counts.incrementAndGet((int) ordMap.get(term));
|
||||
term = (int) multiValues.nextOrd();
|
||||
if (multiValues == it) {
|
||||
for (int doc = multiValues.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = multiValues.nextDoc()) {
|
||||
for (int term = (int) multiValues.nextOrd();
|
||||
term != SortedSetDocValues.NO_MORE_ORDS;
|
||||
term = (int) multiValues.nextOrd()) {
|
||||
counts.incrementAndGet((int) ordMap.get(term));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int doc = it.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = it.nextDoc()) {
|
||||
for (int term = (int) multiValues.nextOrd();
|
||||
term != SortedSetDocValues.NO_MORE_ORDS;
|
||||
term = (int) multiValues.nextOrd()) {
|
||||
counts.incrementAndGet((int) ordMap.get(term));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -275,15 +299,39 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
|
|||
// First count in seg-ord space:
|
||||
final int[] segCounts = new int[numSegOrds];
|
||||
if (singleValues != null) {
|
||||
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
||||
segCounts[singleValues.ordValue()]++;
|
||||
if (singleValues == it) {
|
||||
for (int doc = singleValues.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = singleValues.nextDoc()) {
|
||||
segCounts[singleValues.ordValue()]++;
|
||||
}
|
||||
} else {
|
||||
for (int doc = it.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = it.nextDoc()) {
|
||||
segCounts[singleValues.ordValue()]++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
||||
int term = (int) multiValues.nextOrd();
|
||||
while (term != SortedSetDocValues.NO_MORE_ORDS) {
|
||||
segCounts[term]++;
|
||||
term = (int) multiValues.nextOrd();
|
||||
if (multiValues == it) {
|
||||
for (int doc = multiValues.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = multiValues.nextDoc()) {
|
||||
for (int term = (int) multiValues.nextOrd();
|
||||
term != SortedSetDocValues.NO_MORE_ORDS;
|
||||
term = (int) multiValues.nextOrd()) {
|
||||
segCounts[term]++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int doc = it.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = it.nextDoc()) {
|
||||
for (int term = (int) multiValues.nextOrd();
|
||||
term != SortedSetDocValues.NO_MORE_ORDS;
|
||||
term = (int) multiValues.nextOrd()) {
|
||||
segCounts[term]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -300,15 +348,35 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
|
|||
// No ord mapping (e.g., single segment index):
|
||||
// just aggregate directly into counts:
|
||||
if (singleValues != null) {
|
||||
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
||||
counts.incrementAndGet(singleValues.ordValue());
|
||||
if (singleValues == it) {
|
||||
for (int doc = singleValues.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = singleValues.nextDoc()) {
|
||||
counts.incrementAndGet(singleValues.ordValue());
|
||||
}
|
||||
} else {
|
||||
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
||||
counts.incrementAndGet(singleValues.ordValue());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
||||
int term = (int) multiValues.nextOrd();
|
||||
while (term != SortedSetDocValues.NO_MORE_ORDS) {
|
||||
counts.incrementAndGet(term);
|
||||
term = (int) multiValues.nextOrd();
|
||||
if (multiValues == it) {
|
||||
for (int doc = multiValues.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = multiValues.nextDoc()) {
|
||||
for (int term = (int) multiValues.nextOrd();
|
||||
term != SortedSetDocValues.NO_MORE_ORDS;
|
||||
term = (int) multiValues.nextOrd()) {
|
||||
counts.incrementAndGet(term);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
||||
for (int term = (int) multiValues.nextOrd();
|
||||
term != SortedSetDocValues.NO_MORE_ORDS;
|
||||
term = (int) multiValues.nextOrd()) {
|
||||
counts.incrementAndGet(term);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -278,6 +278,83 @@ public class SortedSetDocValuesFacetCounts extends Facets {
|
|||
return childOrdsResult.dimCount;
|
||||
}
|
||||
|
||||
// Variant of countOneSegment, that has No Hits or Live Docs
|
||||
/**
 * Counts every document that has a value for {@code field} in one segment, with no hit filtering
 * and no live-docs filtering ("NHLD" = No Hits, no Live Docs). The doc values iterator is walked
 * directly via {@code nextDoc()}, and results are accumulated into the {@code counts} array.
 *
 * @param ordinalMap mapping from segment ordinals to global ordinals, or {@code null} when
 *     segment ordinals can be used directly as indexes into {@code counts}
 * @param reader the segment reader to pull sorted-set doc values from
 * @param segOrd index passed to {@code ordinalMap.getGlobalOrds} to select this segment's
 *     ordinal mapping; unused when {@code ordinalMap} is null
 * @throws IOException if reading the doc values fails
 */
private void countOneSegmentNHLD(OrdinalMap ordinalMap, LeafReader reader, int segOrd)
|
||||
throws IOException {
|
||||
SortedSetDocValues multiValues = DocValues.getSortedSet(reader, field);
|
||||
// NOTE(review): confirm whether DocValues.getSortedSet can actually return null here.
if (multiValues == null) {
|
||||
// nothing to count
|
||||
return;
|
||||
}
|
||||
|
||||
// It's slightly more efficient to work against SortedDocValues if the field is actually
|
||||
// single-valued (see: LUCENE-5309)
|
||||
SortedDocValues singleValues = DocValues.unwrapSingleton(multiValues);
|
||||
|
||||
// TODO: yet another option is to count all segs
|
||||
// first, only in seg-ord space, and then do a
|
||||
// merge-sort-PQ in the end to only "resolve to
|
||||
// global" those seg ords that can compete, if we know
|
||||
// we just want top K? I.e., this is the same algo
|
||||
// that'd be used for merging facets across shards
|
||||
// (distributed faceting). But this has much higher
|
||||
// temp RAM requirements (sum of number of ords across all
|
||||
// segs).
|
||||
if (ordinalMap != null) {
|
||||
final LongValues ordMap = ordinalMap.getGlobalOrds(segOrd);
|
||||
int numSegOrds = (int) multiValues.getValueCount();
|
||||
|
||||
// First count in seg-ord space (one slot per segment ordinal):
|
||||
final int[] segCounts = new int[numSegOrds];
|
||||
if (singleValues != null) {
|
||||
for (int doc = singleValues.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = singleValues.nextDoc()) {
|
||||
segCounts[singleValues.ordValue()]++;
|
||||
}
|
||||
} else {
|
||||
for (int doc = multiValues.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = multiValues.nextDoc()) {
|
||||
int term = (int) multiValues.nextOrd();
|
||||
while (term != SortedSetDocValues.NO_MORE_ORDS) {
|
||||
segCounts[term]++;
|
||||
term = (int) multiValues.nextOrd();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Then, migrate to global ords (skipping ordinals that were never seen):
|
||||
for (int ord = 0; ord < numSegOrds; ord++) {
|
||||
int count = segCounts[ord];
|
||||
if (count != 0) {
|
||||
// ordMap.get(ord) yields the index into counts for this segment-local ord.
|
||||
counts[(int) ordMap.get(ord)] += count;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// No ord mapping (e.g., single segment index):
|
||||
// just aggregate directly into counts, indexed by segment ordinal:
|
||||
if (singleValues != null) {
|
||||
for (int doc = singleValues.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = singleValues.nextDoc()) {
|
||||
counts[singleValues.ordValue()]++;
|
||||
}
|
||||
} else {
|
||||
for (int doc = multiValues.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = multiValues.nextDoc()) {
|
||||
int term = (int) multiValues.nextOrd();
|
||||
while (term != SortedSetDocValues.NO_MORE_ORDS) {
|
||||
counts[term]++;
|
||||
term = (int) multiValues.nextOrd();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void countOneSegment(
|
||||
OrdinalMap ordinalMap, LeafReader reader, int segOrd, MatchingDocs hits, Bits liveDocs)
|
||||
throws IOException {
|
||||
|
@ -294,7 +371,8 @@ public class SortedSetDocValuesFacetCounts extends Facets {
|
|||
|
||||
DocIdSetIterator it;
|
||||
if (hits == null) {
|
||||
it = (liveDocs != null) ? FacetUtils.liveDocsDISI(valuesIt, liveDocs) : valuesIt;
|
||||
assert liveDocs != null;
|
||||
it = FacetUtils.liveDocsDISI(valuesIt, liveDocs);
|
||||
} else {
|
||||
it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt));
|
||||
}
|
||||
|
@ -420,8 +498,13 @@ public class SortedSetDocValuesFacetCounts extends Facets {
|
|||
|
||||
for (LeafReaderContext context : state.getReader().leaves()) {
|
||||
|
||||
countOneSegment(
|
||||
ordinalMap, context.reader(), context.ord, null, context.reader().getLiveDocs());
|
||||
Bits liveDocs = context.reader().getLiveDocs();
|
||||
if (liveDocs == null) {
|
||||
countOneSegmentNHLD(ordinalMap, context.reader(), context.ord);
|
||||
} else {
|
||||
countOneSegment(
|
||||
ordinalMap, context.reader(), context.ord, null, context.reader().getLiveDocs());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -126,23 +126,41 @@ public class FastTaxonomyFacetCounts extends IntTaxonomyFacets {
|
|||
|
||||
NumericDocValues singleValued = DocValues.unwrapSingleton(multiValued);
|
||||
if (singleValued != null) {
|
||||
for (int doc = singleValued.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = singleValued.nextDoc()) {
|
||||
if (liveDocs != null && liveDocs.get(doc) == false) {
|
||||
continue;
|
||||
if (liveDocs == null) {
|
||||
for (int doc = singleValued.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = singleValued.nextDoc()) {
|
||||
values[(int) singleValued.longValue()]++;
|
||||
}
|
||||
} else {
|
||||
for (int doc = singleValued.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = singleValued.nextDoc()) {
|
||||
if (liveDocs.get(doc) == false) {
|
||||
continue;
|
||||
}
|
||||
values[(int) singleValued.longValue()]++;
|
||||
}
|
||||
values[(int) singleValued.longValue()]++;
|
||||
}
|
||||
} else {
|
||||
for (int doc = multiValued.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = multiValued.nextDoc()) {
|
||||
if (liveDocs != null && liveDocs.get(doc) == false) {
|
||||
continue;
|
||||
if (liveDocs == null) {
|
||||
for (int doc = multiValued.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = multiValued.nextDoc()) {
|
||||
for (int i = 0; i < multiValued.docValueCount(); i++) {
|
||||
values[(int) multiValued.nextValue()]++;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < multiValued.docValueCount(); i++) {
|
||||
values[(int) multiValued.nextValue()]++;
|
||||
} else {
|
||||
for (int doc = multiValued.nextDoc();
|
||||
doc != DocIdSetIterator.NO_MORE_DOCS;
|
||||
doc = multiValued.nextDoc()) {
|
||||
if (liveDocs.get(doc) == false) {
|
||||
continue;
|
||||
}
|
||||
for (int i = 0; i < multiValued.docValueCount(); i++) {
|
||||
values[(int) multiValued.nextValue()]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue