LUCENE-10122 Use NumericDocValue to store taxonomy parent array instead of custom term positions (#451)

2021-11-17 16:32:34 -08:00 · 2021-11-17 16:32:34 -08:00 · b4476e4318
parent bae095ae48
commit b4476e4318
4 changed files with 32 additions and 116 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -251,7 +251,7 @@ Improvements
 * LUCENE-9633: Improve match highlighter behavior for degenerate intervals (on non-existing positions).
  (Dawid Weiss)

-* LUCENE-9618: Do not call IntervalIterator.nextInterval after NO_MORE_DOCS is returned. (Haoyu Zhai)
+* LUCENE-9618: Do not call IntervalIterator.nextInterval after NO_MORE_DOCS is returned. (Patrick Zhai)

 * LUCENE-9576: Improve ConcurrentMergeScheduler settings by default, assuming modern I/O.
  Previously Lucene was too conservative, jumping through hoops to detect if disks were SSD-backed.
@ -460,6 +460,8 @@ Build
 Other
 ---------------------

+* LUCENE-10122: Use NumericDocValues to store taxonomy parent array (Patrick Zhai)
+
 * LUCENE-10136: allow 'var' declarations in source code (Dawid Weiss)

 * LUCENE-9570, LUCENE-9564: Apply google java format and enforce it on source Java files.
@ -468,7 +470,7 @@ Other

 * LUCENE-9631: Properly override slice() on subclasses of OffsetRange. (Dawid Weiss)

-* LUCENE-9391: Upgrade HPPC to 0.8.2. (Haoyu Zhai)
+* LUCENE-9391: Upgrade HPPC to 0.8.2. (Patrick Zhai)

 * LUCENE-10021: Upgrade HPPC to 0.9.0. Replace usage of ...ScatterMap to ...HashMap. (Patrick Zhai)

@ -529,7 +531,7 @@ Improvements
 * LUCENE-9662: Make CheckIndex concurrent by parallelizing index check across segments.
  (Zach Chen, Mike McCandless, Dawid Weiss, Robert Muir)

-* LUCENE-10103: Make QueryCache respect Accountable queries. (Haoyu Zhai)
+* LUCENE-10103: Make QueryCache respect Accountable queries. (Patrick Zhai)

 Optimizations
 ---------------------
@ -741,7 +743,7 @@ New Features
  (Cameron VandenBerg)

 * LUCENE-9694: New tool for creating a deterministic index to enable benchmarking changes
-  on a consistent multi-segment index even when they require re-indexing. (Haoyu Zhai)
+  on a consistent multi-segment index even when they require re-indexing. (Patrick Zhai)

 * LUCENE-9385: Add FacetsConfig option to control which drill-down
  terms are indexed for a FacetLabel (Zachary Chen)
@ -944,7 +946,7 @@ Improvements

 * LUCENE-8574: Add a new ExpressionValueSource which will enforce only one value per name
  per hit in dependencies, ExpressionFunctionValues will no longer
-  recompute already computed values (Haoyu Zhai)
+  recompute already computed values (Patrick Zhai)

 * LUCENE-9416: Fix CheckIndex to print an invalid non-zero norm as
  unsigned long when detecting corruption.
@ -1017,7 +1019,7 @@ Bug Fixes
 Documentation
 ---------------------

-* LUCENE-9424: Add a performance warning to AttributeSource.captureState javadocs (Haoyu Zhai)
+* LUCENE-9424: Add a performance warning to AttributeSource.captureState javadocs (Patrick Zhai)

 Changes in Runtime Behavior
 ---------------------
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java
@ -16,12 +16,8 @@
 */
 package org.apache.lucene.facet.taxonomy.directory;

-import org.apache.lucene.util.BytesRef;
-
 /** @lucene.experimental */
 abstract class Consts {
  static final String FULL = "$full_path$";
-  static final String FIELD_PAYLOADS = "$payloads$";
-  static final String PAYLOAD_PARENT = "p";
-  static final BytesRef PAYLOAD_PARENT_BYTES_REF = new BytesRef(PAYLOAD_PARENT);
+  static final String FIELD_PARENT_ORDINAL_NDV = "$parent_ndv$";
 }
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
@ -27,15 +27,11 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.Objects;
 import java.util.concurrent.atomic.AtomicInteger;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.BinaryDocValuesField;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.NumericDocValuesField;
 import org.apache.lucene.document.StringField;
-import org.apache.lucene.document.TextField;
 import org.apache.lucene.facet.FacetsConfig;
 import org.apache.lucene.facet.taxonomy.FacetLabel;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
@ -97,9 +93,6 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
  // Records the taxonomy index epoch, updated on replaceTaxonomy as well.
  private long indexEpoch;

-  private SinglePositionTokenStream parentStream =
-      new SinglePositionTokenStream(Consts.PAYLOAD_PARENT);
-  private Field parentStreamField;
  private Field fullPathField;
  private int cacheMissesUntilFill = 11;
  private boolean shouldFillCache = true;
@ -176,9 +169,6 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
      ++indexEpoch;
    }

-    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
-    ft.setOmitNorms(true);
-    parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream, ft);
    fullPathField = new StringField(Consts.FULL, "", Field.Store.NO);

    nextID = indexWriter.getDocStats().maxDoc;
@ -457,18 +447,9 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
   * effectively synchronized as well.
   */
  private int addCategoryDocument(FacetLabel categoryPath, int parent) throws IOException {
-    // Before Lucene 2.9, position increments >=0 were supported, so we
-    // added 1 to parent to allow the parent -1 (the parent of the root).
-    // Unfortunately, starting with Lucene 2.9, after LUCENE-1542, this is
-    // no longer enough, since 0 is not encoded consistently either (see
-    // comment in SinglePositionTokenStream). But because we must be
-    // backward-compatible with existing indexes, we can't just fix what
-    // we write here (e.g., to write parent+2), and need to do a workaround
-    // in the reader (which knows that anyway only category 0 has a parent
-    // -1).
-    parentStream.set(Math.max(parent + 1, 1));
    Document d = new Document();
-    d.add(parentStreamField);
+    /* Lucene 9 switches to NumericDocValuesField for storing parent ordinals */
+    d.add(new NumericDocValuesField(Consts.FIELD_PARENT_ORDINAL_NDV, parent));

    String fieldPath = FacetsConfig.pathToString(categoryPath.components, categoryPath.length);
    fullPathField.setStringValue(fieldPath);
@ -497,50 +478,6 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
    return id;
  }

-  private static class SinglePositionTokenStream extends TokenStream {
-    private CharTermAttribute termAtt;
-    private PositionIncrementAttribute posIncrAtt;
-    private boolean returned;
-    private int val;
-    private final String word;
-
-    public SinglePositionTokenStream(String word) {
-      termAtt = addAttribute(CharTermAttribute.class);
-      posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-      this.word = word;
-      returned = true;
-    }
-
-    /**
-     * Set the value we want to keep, as the position increment. Note that when
-     * TermPositions.nextPosition() is later used to retrieve this value, val-1 will be returned,
-     * not val.
-     *
-     * <p>IMPORTANT NOTE: Before Lucene 2.9, val&gt;=0 were safe (for val==0, the retrieved position
-     * would be -1). But starting with Lucene 2.9, this unfortunately changed, and only val&gt;0 are
-     * safe. val=0 can still be used, but don't count on the value you retrieve later (it could be 0
-     * or -1, depending on circumstances or versions). This change is described in Lucene's JIRA:
-     * LUCENE-1542.
-     */
-    public void set(int val) {
-      this.val = val;
-      returned = false;
-    }
-
-    @Override
-    public boolean incrementToken() throws IOException {
-      if (returned) {
-        return false;
-      }
-      clearAttributes();
-      posIncrAtt.setPositionIncrement(val);
-      termAtt.setEmpty();
-      termAtt.append(word);
-      returned = true;
-      return true;
-    }
-  }
-
  private void addToCache(FacetLabel categoryPath, int id) throws IOException {
    if (cache.put(categoryPath, id)) {
      // If cache.put() returned true, it means the cache was limited in
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/TaxonomyIndexArrays.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/TaxonomyIndexArrays.java
@ -25,9 +25,8 @@ import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
 import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiTerms;
-import org.apache.lucene.index.PostingsEnum;
-import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.NumericDocValues;
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.Accountables;
 import org.apache.lucene.util.ArrayUtil;
@ -58,13 +57,6 @@ class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable
    parents = new int[reader.maxDoc()];
    if (parents.length > 0) {
      initParents(reader, 0);
-      // Starting Lucene 2.9, following the change LUCENE-1542, we can
-      // no longer reliably read the parent "-1" (see comment in
-      // LuceneTaxonomyWriter.SinglePositionTokenStream). We have no way
-      // to fix this in indexing without breaking backward-compatibility
-      // with existing indexes, so what we'll do instead is just
-      // hard-code the parent of ordinal 0 to be -1, and assume (as is
-      // indeed the case) that no other parent can be -1.
      parents[0] = TaxonomyReader.INVALID_ORDINAL;
    }
  }
@ -130,38 +122,27 @@ class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable
      return;
    }

-    // it's ok to use MultiTerms because we only iterate on one posting list.
-    // breaking it to loop over the leaves() only complicates code for no
-    // apparent gain.
-    PostingsEnum positions =
-        MultiTerms.getTermPostingsEnum(
-            reader, Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF, PostingsEnum.PAYLOADS);
-
-    // shouldn't really happen, if it does, something's wrong
-    if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
+    for (LeafReaderContext leafContext : reader.leaves()) {
+      int leafDocNum = leafContext.reader().maxDoc();
+      if (leafContext.docBase + leafDocNum <= first) {
+        // skip this leaf if it does not contain new categories
+        continue;
+      }
+      NumericDocValues parentValues =
+          leafContext.reader().getNumericDocValues(Consts.FIELD_PARENT_ORDINAL_NDV);
+      if (parentValues == null) {
        throw new CorruptIndexException(
-          "Missing parent data for category " + first, reader.toString());
+            "Parent data field " + Consts.FIELD_PARENT_ORDINAL_NDV + " not exists",
+            leafContext.reader().toString());
      }

-    int num = reader.maxDoc();
-    for (int i = first; i < num; i++) {
-      if (positions.docID() == i) {
-        if (positions.freq() == 0) { // shouldn't happen
+      for (int doc = Math.max(first - leafContext.docBase, 0); doc < leafDocNum; doc++) {
+        if (parentValues.advanceExact(doc) == false) {
          throw new CorruptIndexException(
-              "Missing parent data for category " + i, reader.toString());
+              "Missing parent data for category " + (doc + leafContext.docBase), reader.toString());
        }
-
-        parents[i] = positions.nextPosition();
-
-        if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
-          if (i + 1 < num) {
-            throw new CorruptIndexException(
-                "Missing parent data for category " + (i + 1), reader.toString());
-          }
-          break;
-        }
-      } else { // this shouldn't happen
-        throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
+        // we're putting an int and converting it back so it should be safe
+        parents[doc + leafContext.docBase] = Math.toIntExact(parentValues.longValue());
      }
    }
  }