LUCENE-10517: Improve performance of SortedSetDV faceting by iterating on class types (#812)

This commit is contained in:
Chris Hegarty 2022-04-21 17:39:53 +01:00 committed by GitHub
parent 08f848a582
commit 3bcc40efe9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 300 additions and 40 deletions

View File

@ -306,7 +306,12 @@ public class StringValueFacetCounts extends Facets {
assert ordinalMap == null;
LeafReaderContext context = leaves.get(0);
countOneSegment(docValues, context.ord, null, context.reader().getLiveDocs());
Bits liveDocs = context.reader().getLiveDocs();
if (liveDocs == null) {
countOneSegmentNHLD(docValues, context.ord);
} else {
countOneSegment(docValues, context.ord, null, liveDocs);
}
} else {
// Since we have more than one segment, we should have a non-null ordinalMap and docValues
// should be a MultiSortedSetDocValues instance:
@ -318,7 +323,12 @@ public class StringValueFacetCounts extends Facets {
for (int i = 0; i < numLeaves; i++) {
LeafReaderContext context = leaves.get(i);
countOneSegment(multiValues.values[i], context.ord, null, context.reader().getLiveDocs());
Bits liveDocs = context.reader().getLiveDocs();
if (liveDocs == null) {
countOneSegmentNHLD(multiValues.values[i], context.ord);
} else {
countOneSegment(multiValues.values[i], context.ord, null, liveDocs);
}
}
}
}
@ -339,7 +349,8 @@ public class StringValueFacetCounts extends Facets {
// all doc values for this segment:
DocIdSetIterator it;
if (hits == null) {
it = (liveDocs != null) ? FacetUtils.liveDocsDISI(valuesIt, liveDocs) : valuesIt;
assert liveDocs != null;
it = FacetUtils.liveDocsDISI(valuesIt, liveDocs);
} else {
it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt));
}
@ -440,6 +451,86 @@ public class StringValueFacetCounts extends Facets {
}
}
/**
 * Variant of countOneSegment for the case with No Hits and no Live Docs: every document that the
 * doc values iterator returns is counted, with no filtering applied.
 *
 * @param multiValues the doc values of the segment being counted
 * @param segmentOrd ordinal of the segment, used to look up its segment-to-global ordinal mapping
 *     when an ordinal map is present
 * @throws IOException if reading the doc values fails
 */
private void countOneSegmentNHLD(SortedSetDocValues multiValues, int segmentOrd)
    throws IOException {
  // Working against SortedDocValues is slightly more efficient when the field is actually
  // single-valued (see: LUCENE-5309). A null result means we stay on the multi-valued view.
  final SortedDocValues singleValues = DocValues.unwrapSingleton(multiValues);

  if (ordinalMap == null) {
    // Without an ordinal map the segment ordinals are used as-is, so we can count directly.
    if (singleValues == null) {
      while (multiValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        int segOrd = (int) multiValues.nextOrd();
        if (segOrd != SortedSetDocValues.NO_MORE_ORDS) {
          // The first ordinal of a doc is also what makes the doc count towards the total.
          increment(segOrd);
          totalDocCount++;
          for (segOrd = (int) multiValues.nextOrd();
              segOrd != SortedSetDocValues.NO_MORE_ORDS;
              segOrd = (int) multiValues.nextOrd()) {
            increment(segOrd);
          }
        }
      }
    } else {
      while (singleValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        increment(singleValues.ordValue());
        totalDocCount++;
      }
    }
    return;
  }

  // Segment ordinals have to be translated to global ordinals. Every doc with a value is
  // visited here, so we tally densely in segment-ordinal space first and translate each
  // distinct ordinal once afterwards.
  final LongValues segmentToGlobal = ordinalMap.getGlobalOrds(segmentOrd);
  final int[] tally = new int[(int) multiValues.getValueCount()];

  if (singleValues == null) {
    while (multiValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      int segOrd = (int) multiValues.nextOrd();
      if (segOrd != SortedSetDocValues.NO_MORE_ORDS) {
        tally[segOrd]++;
        totalDocCount++;
        for (segOrd = (int) multiValues.nextOrd();
            segOrd != SortedSetDocValues.NO_MORE_ORDS;
            segOrd = (int) multiValues.nextOrd()) {
          tally[segOrd]++;
        }
      }
    }
  } else {
    while (singleValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      tally[singleValues.ordValue()]++;
      totalDocCount++;
    }
  }

  // Move the non-zero per-segment tallies over to their global ordinals.
  for (int segOrd = 0; segOrd < tally.length; segOrd++) {
    final int seen = tally[segOrd];
    if (seen != 0) {
      increment((int) segmentToGlobal.get(segOrd), seen);
    }
  }
}
/**
 * Convenience overload that bumps the count of {@code ordinal} by one, by delegating to the
 * two-argument {@code increment} with an amount of 1.
 *
 * @param ordinal the ordinal whose count is bumped
 */
private void increment(int ordinal) {
increment(ordinal, 1);
}

View File

@ -258,15 +258,39 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
if (hits != null && hits.totalHits < numSegOrds / 10) {
// Remap every ord to global ord as we iterate:
if (singleValues != null) {
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
counts.incrementAndGet((int) ordMap.get(singleValues.ordValue()));
if (singleValues == it) {
for (int doc = singleValues.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = singleValues.nextDoc()) {
counts.incrementAndGet((int) ordMap.get(singleValues.ordValue()));
}
} else {
for (int doc = it.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = it.nextDoc()) {
counts.incrementAndGet((int) ordMap.get(singleValues.ordValue()));
}
}
} else {
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
int term = (int) multiValues.nextOrd();
while (term != SortedSetDocValues.NO_MORE_ORDS) {
counts.incrementAndGet((int) ordMap.get(term));
term = (int) multiValues.nextOrd();
if (multiValues == it) {
for (int doc = multiValues.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = multiValues.nextDoc()) {
for (int term = (int) multiValues.nextOrd();
term != SortedSetDocValues.NO_MORE_ORDS;
term = (int) multiValues.nextOrd()) {
counts.incrementAndGet((int) ordMap.get(term));
}
}
} else {
for (int doc = it.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = it.nextDoc()) {
for (int term = (int) multiValues.nextOrd();
term != SortedSetDocValues.NO_MORE_ORDS;
term = (int) multiValues.nextOrd()) {
counts.incrementAndGet((int) ordMap.get(term));
}
}
}
}
@ -275,15 +299,39 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
// First count in seg-ord space:
final int[] segCounts = new int[numSegOrds];
if (singleValues != null) {
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
segCounts[singleValues.ordValue()]++;
if (singleValues == it) {
for (int doc = singleValues.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = singleValues.nextDoc()) {
segCounts[singleValues.ordValue()]++;
}
} else {
for (int doc = it.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = it.nextDoc()) {
segCounts[singleValues.ordValue()]++;
}
}
} else {
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
int term = (int) multiValues.nextOrd();
while (term != SortedSetDocValues.NO_MORE_ORDS) {
segCounts[term]++;
term = (int) multiValues.nextOrd();
if (multiValues == it) {
for (int doc = multiValues.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = multiValues.nextDoc()) {
for (int term = (int) multiValues.nextOrd();
term != SortedSetDocValues.NO_MORE_ORDS;
term = (int) multiValues.nextOrd()) {
segCounts[term]++;
}
}
} else {
for (int doc = it.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = it.nextDoc()) {
for (int term = (int) multiValues.nextOrd();
term != SortedSetDocValues.NO_MORE_ORDS;
term = (int) multiValues.nextOrd()) {
segCounts[term]++;
}
}
}
}
@ -300,15 +348,35 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
// No ord mapping (e.g., single segment index):
// just aggregate directly into counts:
if (singleValues != null) {
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
counts.incrementAndGet(singleValues.ordValue());
if (singleValues == it) {
for (int doc = singleValues.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = singleValues.nextDoc()) {
counts.incrementAndGet(singleValues.ordValue());
}
} else {
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
counts.incrementAndGet(singleValues.ordValue());
}
}
} else {
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
int term = (int) multiValues.nextOrd();
while (term != SortedSetDocValues.NO_MORE_ORDS) {
counts.incrementAndGet(term);
term = (int) multiValues.nextOrd();
if (multiValues == it) {
for (int doc = multiValues.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = multiValues.nextDoc()) {
for (int term = (int) multiValues.nextOrd();
term != SortedSetDocValues.NO_MORE_ORDS;
term = (int) multiValues.nextOrd()) {
counts.incrementAndGet(term);
}
}
} else {
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
for (int term = (int) multiValues.nextOrd();
term != SortedSetDocValues.NO_MORE_ORDS;
term = (int) multiValues.nextOrd()) {
counts.incrementAndGet(term);
}
}
}
}

View File

@ -278,6 +278,83 @@ public class SortedSetDocValuesFacetCounts extends Facets {
return childOrdsResult.dimCount;
}
/**
 * Variant of countOneSegment for the case with No Hits and no Live Docs: every document that the
 * doc values iterator returns is counted, with no filtering applied.
 *
 * @param ordinalMap segment-to-global ordinal mapping, or {@code null} when segment ordinals can
 *     be used directly (e.g., single segment index)
 * @param reader reader of the segment to count
 * @param segOrd ordinal of the segment, used to look up its mapping in {@code ordinalMap}
 * @throws IOException if reading the doc values fails
 */
private void countOneSegmentNHLD(OrdinalMap ordinalMap, LeafReader reader, int segOrd)
    throws IOException {
  final SortedSetDocValues multiValues = DocValues.getSortedSet(reader, field);
  if (multiValues == null) {
    // Nothing to count for this segment.
    return;
  }

  // Working against SortedDocValues is slightly more efficient when the field is actually
  // single-valued (see: LUCENE-5309). A null result means we stay on the multi-valued view.
  final SortedDocValues singleValues = DocValues.unwrapSingleton(multiValues);

  // TODO: yet another option is to count all segs first, only in seg-ord space, and then do a
  // merge-sort-PQ in the end to only "resolve to global" those seg ords that can compete, if we
  // know we just want top K? ie, this is the same algo that'd be used for merging facets across
  // shards (distributed faceting). but this has much higher temp ram req'ts (sum of number of
  // ords across all segs)

  if (ordinalMap == null) {
    // No ord mapping (e.g., single segment index): aggregate straight into counts.
    if (singleValues == null) {
      while (multiValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        for (int term = (int) multiValues.nextOrd();
            term != SortedSetDocValues.NO_MORE_ORDS;
            term = (int) multiValues.nextOrd()) {
          counts[term]++;
        }
      }
    } else {
      while (singleValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        counts[singleValues.ordValue()]++;
      }
    }
    return;
  }

  final LongValues segmentToGlobal = ordinalMap.getGlobalOrds(segOrd);
  final int[] tally = new int[(int) multiValues.getValueCount()];

  // Step one: tally in segment-ordinal space.
  if (singleValues == null) {
    while (multiValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      for (int term = (int) multiValues.nextOrd();
          term != SortedSetDocValues.NO_MORE_ORDS;
          term = (int) multiValues.nextOrd()) {
        tally[term]++;
      }
    }
  } else {
    while (singleValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      tally[singleValues.ordValue()]++;
    }
  }

  // Step two: fold the non-zero tallies into counts at their global ordinals.
  for (int ord = 0; ord < tally.length; ord++) {
    final int seen = tally[ord];
    if (seen != 0) {
      counts[(int) segmentToGlobal.get(ord)] += seen;
    }
  }
}
private void countOneSegment(
OrdinalMap ordinalMap, LeafReader reader, int segOrd, MatchingDocs hits, Bits liveDocs)
throws IOException {
@ -294,7 +371,8 @@ public class SortedSetDocValuesFacetCounts extends Facets {
DocIdSetIterator it;
if (hits == null) {
it = (liveDocs != null) ? FacetUtils.liveDocsDISI(valuesIt, liveDocs) : valuesIt;
assert liveDocs != null;
it = FacetUtils.liveDocsDISI(valuesIt, liveDocs);
} else {
it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt));
}
@ -420,8 +498,13 @@ public class SortedSetDocValuesFacetCounts extends Facets {
for (LeafReaderContext context : state.getReader().leaves()) {
countOneSegment(
ordinalMap, context.reader(), context.ord, null, context.reader().getLiveDocs());
Bits liveDocs = context.reader().getLiveDocs();
if (liveDocs == null) {
countOneSegmentNHLD(ordinalMap, context.reader(), context.ord);
} else {
countOneSegment(
ordinalMap, context.reader(), context.ord, null, context.reader().getLiveDocs());
}
}
}

View File

@ -126,23 +126,41 @@ public class FastTaxonomyFacetCounts extends IntTaxonomyFacets {
NumericDocValues singleValued = DocValues.unwrapSingleton(multiValued);
if (singleValued != null) {
for (int doc = singleValued.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = singleValued.nextDoc()) {
if (liveDocs != null && liveDocs.get(doc) == false) {
continue;
if (liveDocs == null) {
for (int doc = singleValued.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = singleValued.nextDoc()) {
values[(int) singleValued.longValue()]++;
}
} else {
for (int doc = singleValued.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = singleValued.nextDoc()) {
if (liveDocs.get(doc) == false) {
continue;
}
values[(int) singleValued.longValue()]++;
}
values[(int) singleValued.longValue()]++;
}
} else {
for (int doc = multiValued.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = multiValued.nextDoc()) {
if (liveDocs != null && liveDocs.get(doc) == false) {
continue;
if (liveDocs == null) {
for (int doc = multiValued.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = multiValued.nextDoc()) {
for (int i = 0; i < multiValued.docValueCount(); i++) {
values[(int) multiValued.nextValue()]++;
}
}
for (int i = 0; i < multiValued.docValueCount(); i++) {
values[(int) multiValued.nextValue()]++;
} else {
for (int doc = multiValued.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = multiValued.nextDoc()) {
if (liveDocs.get(doc) == false) {
continue;
}
for (int i = 0; i < multiValued.docValueCount(); i++) {
values[(int) multiValued.nextValue()]++;
}
}
}
}