LUCENE-10517: Improve performance of SortedSetDV faceting by iterating on class types (#812)

2022-04-21 17:39:53 +01:00 · 2022-04-21 17:39:53 +01:00 · 3bcc40efe9
parent 08f848a582
commit 3bcc40efe9
4 changed files with 300 additions and 40 deletions
--- a/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/StringValueFacetCounts.java
@ -306,7 +306,12 @@ public class StringValueFacetCounts extends Facets {
      assert ordinalMap == null;

      LeafReaderContext context = leaves.get(0);
-      countOneSegment(docValues, context.ord, null, context.reader().getLiveDocs());
+      Bits liveDocs = context.reader().getLiveDocs();
+      if (liveDocs == null) {
+        countOneSegmentNHLD(docValues, context.ord);
+      } else {
+        countOneSegment(docValues, context.ord, null, liveDocs);
+      }
    } else {
      // Since we have more than one segment, we should have a non-null ordinalMap and docValues
      // should be a MultiSortedSetDocValues instance:
@ -318,7 +323,12 @@ public class StringValueFacetCounts extends Facets {

      for (int i = 0; i < numLeaves; i++) {
        LeafReaderContext context = leaves.get(i);
-        countOneSegment(multiValues.values[i], context.ord, null, context.reader().getLiveDocs());
+        Bits liveDocs = context.reader().getLiveDocs();
+        if (liveDocs == null) {
+          countOneSegmentNHLD(multiValues.values[i], context.ord);
+        } else {
+          countOneSegment(multiValues.values[i], context.ord, null, liveDocs);
+        }
      }
    }
  }
@ -339,7 +349,8 @@ public class StringValueFacetCounts extends Facets {
    // all doc values for this segment:
    DocIdSetIterator it;
    if (hits == null) {
-      it = (liveDocs != null) ? FacetUtils.liveDocsDISI(valuesIt, liveDocs) : valuesIt;
+      assert liveDocs != null;
+      it = FacetUtils.liveDocsDISI(valuesIt, liveDocs);
    } else {
      it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt));
    }
@ -440,6 +451,86 @@ public class StringValueFacetCounts extends Facets {
    }
  }

+  // Variant of countOneSegment, that has No Hits or Live Docs
+  private void countOneSegmentNHLD(SortedSetDocValues multiValues, int segmentOrd)
+      throws IOException {
+
+    // It's slightly more efficient to work against SortedDocValues if the field is actually
+    // single-valued (see: LUCENE-5309)
+    SortedDocValues singleValues = DocValues.unwrapSingleton(multiValues);
+
+    if (ordinalMap == null) {
+      // If there's no ordinal map we don't need to map segment ordinals to globals, so counting
+      // is very straight-forward:
+      if (singleValues != null) {
+        for (int doc = singleValues.nextDoc();
+            doc != DocIdSetIterator.NO_MORE_DOCS;
+            doc = singleValues.nextDoc()) {
+          increment(singleValues.ordValue());
+          totalDocCount++;
+        }
+      } else {
+        for (int doc = multiValues.nextDoc();
+            doc != DocIdSetIterator.NO_MORE_DOCS;
+            doc = multiValues.nextDoc()) {
+          boolean countedDocInTotal = false;
+          for (int term = (int) multiValues.nextOrd();
+              term != SortedSetDocValues.NO_MORE_ORDS;
+              term = (int) multiValues.nextOrd()) {
+            increment(term);
+            if (countedDocInTotal == false) {
+              totalDocCount++;
+              countedDocInTotal = true;
+            }
+          }
+        }
+      }
+    } else {
+      // We need to map segment ordinals to globals. We have two different approaches to this
+      // depending on how many hits we have to count relative to how many unique doc val ordinals
+      // there are in this segment:
+      final LongValues ordMap = ordinalMap.getGlobalOrds(segmentOrd);
+      int segmentCardinality = (int) multiValues.getValueCount();
+
+      // First count in seg-ord space.
+      // At this point, we're either counting all ordinals or our heuristic suggests that
+      // we expect to visit a large percentage of the unique ordinals (lots of hits relative
+      // to the segment cardinality), so we count the segment densely:
+      final int[] segCounts = new int[segmentCardinality];
+      if (singleValues != null) {
+        for (int doc = singleValues.nextDoc();
+            doc != DocIdSetIterator.NO_MORE_DOCS;
+            doc = singleValues.nextDoc()) {
+          segCounts[singleValues.ordValue()]++;
+          totalDocCount++;
+        }
+      } else {
+        for (int doc = multiValues.nextDoc();
+            doc != DocIdSetIterator.NO_MORE_DOCS;
+            doc = multiValues.nextDoc()) {
+          boolean countedDocInTotal = false;
+          for (int term = (int) multiValues.nextOrd();
+              term != SortedSetDocValues.NO_MORE_ORDS;
+              term = (int) multiValues.nextOrd()) {
+            segCounts[term]++;
+            if (countedDocInTotal == false) {
+              totalDocCount++;
+              countedDocInTotal = true;
+            }
+          }
+        }
+      }
+
+      // Then, migrate to global ords:
+      for (int ord = 0; ord < segmentCardinality; ord++) {
+        int count = segCounts[ord];
+        if (count != 0) {
+          increment((int) ordMap.get(ord), count);
+        }
+      }
+    }
+  }
+
  private void increment(int ordinal) {
    increment(ordinal, 1);
  }
--- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/ConcurrentSortedSetDocValuesFacetCounts.java
@ -258,15 +258,39 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
        if (hits != null && hits.totalHits < numSegOrds / 10) {
          // Remap every ord to global ord as we iterate:
          if (singleValues != null) {
-            for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
-              counts.incrementAndGet((int) ordMap.get(singleValues.ordValue()));
+            if (singleValues == it) {
+              for (int doc = singleValues.nextDoc();
+                  doc != DocIdSetIterator.NO_MORE_DOCS;
+                  doc = singleValues.nextDoc()) {
+                counts.incrementAndGet((int) ordMap.get(singleValues.ordValue()));
+              }
+            } else {
+              for (int doc = it.nextDoc();
+                  doc != DocIdSetIterator.NO_MORE_DOCS;
+                  doc = it.nextDoc()) {
+                counts.incrementAndGet((int) ordMap.get(singleValues.ordValue()));
+              }
            }
          } else {
-            for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
-              int term = (int) multiValues.nextOrd();
-              while (term != SortedSetDocValues.NO_MORE_ORDS) {
-                counts.incrementAndGet((int) ordMap.get(term));
-                term = (int) multiValues.nextOrd();
+            if (multiValues == it) {
+              for (int doc = multiValues.nextDoc();
+                  doc != DocIdSetIterator.NO_MORE_DOCS;
+                  doc = multiValues.nextDoc()) {
+                for (int term = (int) multiValues.nextOrd();
+                    term != SortedSetDocValues.NO_MORE_ORDS;
+                    term = (int) multiValues.nextOrd()) {
+                  counts.incrementAndGet((int) ordMap.get(term));
+                }
+              }
+            } else {
+              for (int doc = it.nextDoc();
+                  doc != DocIdSetIterator.NO_MORE_DOCS;
+                  doc = it.nextDoc()) {
+                for (int term = (int) multiValues.nextOrd();
+                    term != SortedSetDocValues.NO_MORE_ORDS;
+                    term = (int) multiValues.nextOrd()) {
+                  counts.incrementAndGet((int) ordMap.get(term));
+                }
              }
            }
          }
@ -275,15 +299,39 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
          // First count in seg-ord space:
          final int[] segCounts = new int[numSegOrds];
          if (singleValues != null) {
-            for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
-              segCounts[singleValues.ordValue()]++;
+            if (singleValues == it) {
+              for (int doc = singleValues.nextDoc();
+                  doc != DocIdSetIterator.NO_MORE_DOCS;
+                  doc = singleValues.nextDoc()) {
+                segCounts[singleValues.ordValue()]++;
+              }
+            } else {
+              for (int doc = it.nextDoc();
+                  doc != DocIdSetIterator.NO_MORE_DOCS;
+                  doc = it.nextDoc()) {
+                segCounts[singleValues.ordValue()]++;
+              }
            }
          } else {
-            for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
-              int term = (int) multiValues.nextOrd();
-              while (term != SortedSetDocValues.NO_MORE_ORDS) {
-                segCounts[term]++;
-                term = (int) multiValues.nextOrd();
+            if (multiValues == it) {
+              for (int doc = multiValues.nextDoc();
+                  doc != DocIdSetIterator.NO_MORE_DOCS;
+                  doc = multiValues.nextDoc()) {
+                for (int term = (int) multiValues.nextOrd();
+                    term != SortedSetDocValues.NO_MORE_ORDS;
+                    term = (int) multiValues.nextOrd()) {
+                  segCounts[term]++;
+                }
+              }
+            } else {
+              for (int doc = it.nextDoc();
+                  doc != DocIdSetIterator.NO_MORE_DOCS;
+                  doc = it.nextDoc()) {
+                for (int term = (int) multiValues.nextOrd();
+                    term != SortedSetDocValues.NO_MORE_ORDS;
+                    term = (int) multiValues.nextOrd()) {
+                  segCounts[term]++;
+                }
              }
            }
          }
@ -300,15 +348,35 @@ public class ConcurrentSortedSetDocValuesFacetCounts extends Facets {
        // No ord mapping (e.g., single segment index):
        // just aggregate directly into counts:
        if (singleValues != null) {
-          for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
-            counts.incrementAndGet(singleValues.ordValue());
+          if (singleValues == it) {
+            for (int doc = singleValues.nextDoc();
+                doc != DocIdSetIterator.NO_MORE_DOCS;
+                doc = singleValues.nextDoc()) {
+              counts.incrementAndGet(singleValues.ordValue());
+            }
+          } else {
+            for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+              counts.incrementAndGet(singleValues.ordValue());
+            }
          }
        } else {
-          for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
-            int term = (int) multiValues.nextOrd();
-            while (term != SortedSetDocValues.NO_MORE_ORDS) {
-              counts.incrementAndGet(term);
-              term = (int) multiValues.nextOrd();
+          if (multiValues == it) {
+            for (int doc = multiValues.nextDoc();
+                doc != DocIdSetIterator.NO_MORE_DOCS;
+                doc = multiValues.nextDoc()) {
+              for (int term = (int) multiValues.nextOrd();
+                  term != SortedSetDocValues.NO_MORE_ORDS;
+                  term = (int) multiValues.nextOrd()) {
+                counts.incrementAndGet(term);
+              }
+            }
+          } else {
+            for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
+              for (int term = (int) multiValues.nextOrd();
+                  term != SortedSetDocValues.NO_MORE_ORDS;
+                  term = (int) multiValues.nextOrd()) {
+                counts.incrementAndGet(term);
+              }
            }
          }
        }
--- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
@ -278,6 +278,83 @@ public class SortedSetDocValuesFacetCounts extends Facets {
    return childOrdsResult.dimCount;
  }

+  // Variant of countOneSegment, that has No Hits or Live Docs
+  private void countOneSegmentNHLD(OrdinalMap ordinalMap, LeafReader reader, int segOrd)
+      throws IOException {
+    SortedSetDocValues multiValues = DocValues.getSortedSet(reader, field);
+    if (multiValues == null) {
+      // nothing to count
+      return;
+    }
+
+    // It's slightly more efficient to work against SortedDocValues if the field is actually
+    // single-valued (see: LUCENE-5309)
+    SortedDocValues singleValues = DocValues.unwrapSingleton(multiValues);
+
+    // TODO: yet another option is to count all segs
+    // first, only in seg-ord space, and then do a
+    // merge-sort-PQ in the end to only "resolve to
+    // global" those seg ords that can compete, if we know
+    // we just want top K?  ie, this is the same algo
+    // that'd be used for merging facets across shards
+    // (distributed faceting).  but this has much higher
+    // temp ram req'ts (sum of number of ords across all
+    // segs)
+    if (ordinalMap != null) {
+      final LongValues ordMap = ordinalMap.getGlobalOrds(segOrd);
+      int numSegOrds = (int) multiValues.getValueCount();
+
+      // First count in seg-ord space:
+      final int[] segCounts = new int[numSegOrds];
+      if (singleValues != null) {
+        for (int doc = singleValues.nextDoc();
+            doc != DocIdSetIterator.NO_MORE_DOCS;
+            doc = singleValues.nextDoc()) {
+          segCounts[singleValues.ordValue()]++;
+        }
+      } else {
+        for (int doc = multiValues.nextDoc();
+            doc != DocIdSetIterator.NO_MORE_DOCS;
+            doc = multiValues.nextDoc()) {
+          int term = (int) multiValues.nextOrd();
+          while (term != SortedSetDocValues.NO_MORE_ORDS) {
+            segCounts[term]++;
+            term = (int) multiValues.nextOrd();
+          }
+        }
+      }
+
+      // Then, migrate to global ords:
+      for (int ord = 0; ord < numSegOrds; ord++) {
+        int count = segCounts[ord];
+        if (count != 0) {
+          // ordinalMap.getGlobalOrd(segOrd, ord));
+          counts[(int) ordMap.get(ord)] += count;
+        }
+      }
+    } else {
+      // No ord mapping (e.g., single segment index):
+      // just aggregate directly into counts:
+      if (singleValues != null) {
+        for (int doc = singleValues.nextDoc();
+            doc != DocIdSetIterator.NO_MORE_DOCS;
+            doc = singleValues.nextDoc()) {
+          counts[singleValues.ordValue()]++;
+        }
+      } else {
+        for (int doc = multiValues.nextDoc();
+            doc != DocIdSetIterator.NO_MORE_DOCS;
+            doc = multiValues.nextDoc()) {
+          int term = (int) multiValues.nextOrd();
+          while (term != SortedSetDocValues.NO_MORE_ORDS) {
+            counts[term]++;
+            term = (int) multiValues.nextOrd();
+          }
+        }
+      }
+    }
+  }
+
  private void countOneSegment(
      OrdinalMap ordinalMap, LeafReader reader, int segOrd, MatchingDocs hits, Bits liveDocs)
      throws IOException {
@ -294,7 +371,8 @@ public class SortedSetDocValuesFacetCounts extends Facets {

    DocIdSetIterator it;
    if (hits == null) {
-      it = (liveDocs != null) ? FacetUtils.liveDocsDISI(valuesIt, liveDocs) : valuesIt;
+      assert liveDocs != null;
+      it = FacetUtils.liveDocsDISI(valuesIt, liveDocs);
    } else {
      it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits.iterator(), valuesIt));
    }
@ -420,8 +498,13 @@ public class SortedSetDocValuesFacetCounts extends Facets {

    for (LeafReaderContext context : state.getReader().leaves()) {

-      countOneSegment(
-          ordinalMap, context.reader(), context.ord, null, context.reader().getLiveDocs());
+      Bits liveDocs = context.reader().getLiveDocs();
+      if (liveDocs == null) {
+        countOneSegmentNHLD(ordinalMap, context.reader(), context.ord);
+      } else {
+        countOneSegment(
+            ordinalMap, context.reader(), context.ord, null, context.reader().getLiveDocs());
+      }
    }
  }

--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/FastTaxonomyFacetCounts.java
@ -126,23 +126,41 @@ public class FastTaxonomyFacetCounts extends IntTaxonomyFacets {

      NumericDocValues singleValued = DocValues.unwrapSingleton(multiValued);
      if (singleValued != null) {
-        for (int doc = singleValued.nextDoc();
-            doc != DocIdSetIterator.NO_MORE_DOCS;
-            doc = singleValued.nextDoc()) {
-          if (liveDocs != null && liveDocs.get(doc) == false) {
-            continue;
+        if (liveDocs == null) {
+          for (int doc = singleValued.nextDoc();
+              doc != DocIdSetIterator.NO_MORE_DOCS;
+              doc = singleValued.nextDoc()) {
+            values[(int) singleValued.longValue()]++;
+          }
+        } else {
+          for (int doc = singleValued.nextDoc();
+              doc != DocIdSetIterator.NO_MORE_DOCS;
+              doc = singleValued.nextDoc()) {
+            if (liveDocs.get(doc) == false) {
+              continue;
+            }
+            values[(int) singleValued.longValue()]++;
          }
-          values[(int) singleValued.longValue()]++;
        }
      } else {
-        for (int doc = multiValued.nextDoc();
-            doc != DocIdSetIterator.NO_MORE_DOCS;
-            doc = multiValued.nextDoc()) {
-          if (liveDocs != null && liveDocs.get(doc) == false) {
-            continue;
+        if (liveDocs == null) {
+          for (int doc = multiValued.nextDoc();
+              doc != DocIdSetIterator.NO_MORE_DOCS;
+              doc = multiValued.nextDoc()) {
+            for (int i = 0; i < multiValued.docValueCount(); i++) {
+              values[(int) multiValued.nextValue()]++;
+            }
          }
-          for (int i = 0; i < multiValued.docValueCount(); i++) {
-            values[(int) multiValued.nextValue()]++;
+        } else {
+          for (int doc = multiValued.nextDoc();
+              doc != DocIdSetIterator.NO_MORE_DOCS;
+              doc = multiValued.nextDoc()) {
+            if (liveDocs.get(doc) == false) {
+              continue;
+            }
+            for (int i = 0; i < multiValued.docValueCount(); i++) {
+              values[(int) multiValued.nextValue()]++;
+            }
          }
        }
      }