From f339e24e8ed64bf64ef4bb05111d252ecdacde1e Mon Sep 17 00:00:00 2001
From: Stefan Vodita <41467371+stefanvodita@users.noreply.github.com>
Date: Thu, 8 Feb 2024 11:02:12 +0000
Subject: [PATCH] Index arbitrary fields in taxonomy docs (#12337)
---
lucene/CHANGES.txt | 3 +
.../facet/taxonomy/directory/Consts.java | 5 +
.../directory/DirectoryTaxonomyReader.java | 2 +-
.../directory/DirectoryTaxonomyWriter.java | 40 ++++
...dexingEnrichedDirectoryTaxonomyWriter.java | 105 ++++++++++
.../facet/taxonomy/TestOrdinalData.java | 186 ++++++++++++++++++
6 files changed, 340 insertions(+), 1 deletion(-)
create mode 100644 lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/ReindexingEnrichedDirectoryTaxonomyWriter.java
create mode 100644 lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestOrdinalData.java
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index eb16b3be8c1..181d4e27157 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -197,6 +197,9 @@ New Features
better-scoring nodes are available, or the best candidate is below a score of `traversalSimilarity` in the lowest
level. (Aditya Prakash, Kaival Parikh)
+* GITHUB#12336: Index additional data per facet label in the taxonomy. (Shai Erera, Egor Potemkin, Mike McCandless,
+ Stefan Vodita)
+
Improvements
---------------------
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java
index c52805c5df1..cfcca124858 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java
@@ -17,9 +17,14 @@
package org.apache.lucene.facet.taxonomy.directory;
/**
+ * This class holds constants used by the directory taxonomy implementations.
+ *
* @lucene.experimental
*/
abstract class Consts {
+ /** The name of the field containing the full path of a taxonomy document. */
static final String FULL = "$full_path$";
+
+ /** The name of the field containing the ordinal of the parent of a taxonomy document. */
static final String FIELD_PARENT_ORDINAL_NDV = "$parent_ndv$";
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
index 0d641d396ec..e2cf114da0b 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
@@ -243,7 +243,7 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
* Expert: returns the underlying {@link DirectoryReader} instance that is used by this {@link
* TaxonomyReader}.
*/
- protected DirectoryReader getInternalIndexReader() {
+ public DirectoryReader getInternalIndexReader() {
ensureOpen();
return indexReader;
}
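
Making getInternalIndexReader() public is what lets applications read back fields attached to the ordinal documents by the enrichment hook added later in this patch. A minimal sketch, assuming the ordinal docs carry a NumericDocValuesField named "score"; the class name and field name here are illustrative, not part of the patch:

import java.io.IOException;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.store.Directory;

class OrdinalDataLookup {
  /** Returns the "score" doc value stored on the ordinal doc for the given label, or -1. */
  static long readScore(Directory taxoDir, FacetLabel label) throws IOException {
    try (DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir)) {
      int ordinal = taxoReader.getOrdinal(label);
      if (ordinal == TaxonomyReader.INVALID_ORDINAL) {
        return -1;
      }
      for (LeafReaderContext ctx : taxoReader.getInternalIndexReader().leaves()) {
        int docId = ordinal - ctx.docBase;
        if (docId < 0 || docId >= ctx.reader().maxDoc()) {
          continue; // the ordinal's doc lives in a different segment
        }
        NumericDocValues scores = ctx.reader().getNumericDocValues("score");
        if (scores != null && scores.advanceExact(docId)) {
          return scores.longValue();
        }
      }
      return -1; // no score was indexed for this label
    }
  }
}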
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
index 86f3d18deed..bd4934fbb04 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
@@ -27,6 +27,7 @@ import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.BiConsumer;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -436,6 +437,14 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
}
}
+ /**
+ * Child classes can implement this method to modify the document corresponding to a category path
+ * before indexing it.
+ *
+ * @lucene.experimental
+ */
+ protected void enrichOrdinalDocument(Document d, FacetLabel categoryPath) {}
+
/**
* Note that the methods calling addCategoryDocument() are synchronized, so this method is
* effectively synchronized as well.
@@ -453,6 +462,9 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
d.add(fullPathField);
+ // add arbitrary ordinal data to the doc
+ enrichOrdinalDocument(d, categoryPath);
+
indexWriter.addDocument(d);
int id = nextID.getAndIncrement();
@@ -878,6 +890,34 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
++indexEpoch;
}
+ /**
+ * Delete the taxonomy and reset all state for this writer.
+ *
+ *
+ * <p>To keep using the same main index, you would have to regenerate the taxonomy, taking care
+ * that ordinals are indexed in the same order as before. An example of this can be found in
+ * {@link ReindexingEnrichedDirectoryTaxonomyWriter#reindexWithNewOrdinalData(BiConsumer)}.
+ *
+ * @lucene.experimental
+ */
+ synchronized void deleteAll() throws IOException {
+ indexWriter.deleteAll();
+ shouldRefreshReaderManager = true;
+ initReaderManager(); // ensure that it's initialized
+ refreshReaderManager();
+ nextID.set(0);
+ taxoArrays = null; // must nullify so that it's re-computed next time it's needed
+
+ // need to clear the cache, so that addCategory won't accidentally return
+ // old categories that are in the cache.
+ cache.clear();
+ cacheIsComplete = false;
+ shouldFillCache = true;
+ cacheMisses.set(0);
+
+ // update indexEpoch, since deleting everything is effectively the same as recreating the taxonomy
+ ++indexEpoch;
+ }
+
/** Returns the {@link Directory} of this taxonomy writer. */
public Directory getDirectory() {
return dir;
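
For reference, a hypothetical subclass sketch showing how the new enrichOrdinalDocument() hook can attach extra data to each category's ordinal document; the class name and the "$depth$" field are assumptions made for illustration only:

import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.store.Directory;

class DepthEnrichedTaxonomyWriter extends DirectoryTaxonomyWriter {
  DepthEnrichedTaxonomyWriter(Directory d) throws IOException {
    super(d);
  }

  @Override
  protected void enrichOrdinalDocument(Document doc, FacetLabel categoryPath) {
    // Store each category's depth alongside its ordinal document.
    doc.add(new NumericDocValuesField("$depth$", categoryPath.length));
  }
}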
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/ReindexingEnrichedDirectoryTaxonomyWriter.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/ReindexingEnrichedDirectoryTaxonomyWriter.java
new file mode 100644
index 00000000000..91b7291b276
--- /dev/null
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/ReindexingEnrichedDirectoryTaxonomyWriter.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.facet.taxonomy.directory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.BiConsumer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.facet.taxonomy.FacetLabel;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.IOUtils;
+
+/**
+ * Use this {@link org.apache.lucene.facet.taxonomy.TaxonomyWriter} to append arbitrary fields to
+ * the ordinal documents in the taxonomy. To update the custom data added to the docs, call
+ * {@link #reindexWithNewOrdinalData(BiConsumer)}.
+ *
+ * @lucene.experimental
+ */
+public class ReindexingEnrichedDirectoryTaxonomyWriter extends DirectoryTaxonomyWriter {
+ private BiConsumer<FacetLabel, Document> ordinalDataAppender;
+
+ /** Create a taxonomy writer that will allow editing the ordinal docs before indexing them. */
+ public ReindexingEnrichedDirectoryTaxonomyWriter(
+ Directory d, BiConsumer<FacetLabel, Document> ordinalDataAppender) throws IOException {
+ super(d);
+ this.ordinalDataAppender = ordinalDataAppender;
+ }
+
+ /** Add fields specified by the {@link #ordinalDataAppender} to the provided {@link Document}. */
+ @Override
+ protected void enrichOrdinalDocument(Document d, FacetLabel categoryPath) {
+ if (ordinalDataAppender != null) {
+ ordinalDataAppender.accept(categoryPath, d);
+ }
+ }
+
+ /**
+ * Make a list of all labels in the taxonomy. The index of each label in this list is the ordinal
+ * which corresponds to it.
+ */
+ private List<FacetLabel> recordPathsInOrder(Directory d) throws IOException {
+ List<FacetLabel> paths = new ArrayList<>();
+
+ DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(d);
+ IndexReader taxoIndexReader = taxoReader.getInternalIndexReader();
+
+ for (LeafReaderContext ctx : taxoIndexReader.leaves()) {
+ LeafReader leafReader = ctx.reader();
+ int[] ordinals = new int[leafReader.maxDoc()];
+ for (int i = 0; i < ordinals.length; i++) {
+ ordinals[i] = ctx.docBase + i;
+ }
+ FacetLabel[] labels = taxoReader.getBulkPath(ordinals);
+ for (FacetLabel label : labels) {
+ paths.add(label);
+ }
+ }
+
+ IOUtils.close(taxoReader);
+ return paths;
+ }
+
+ /**
+ * Delete the existing taxonomy index and recreate it using new ordinal data. The ordinals
+ * themselves will be preserved, so the caller does not need to update references to them in the
+ * main index.
+ */
+ public synchronized void reindexWithNewOrdinalData(
+ BiConsumer<FacetLabel, Document> ordinalDataAppender) throws IOException {
+ ensureOpen();
+ this.ordinalDataAppender = ordinalDataAppender;
+ Directory d = getDirectory();
+
+ // Record paths in order.
+ List<FacetLabel> ordinalToPath = recordPathsInOrder(d);
+
+ // Delete old taxonomy files.
+ deleteAll();
+
+ // Index paths in order - they will use the new appender.
+ for (FacetLabel categoryPath : ordinalToPath) {
+ addCategory(categoryPath);
+ }
+ commit();
+ }
+}
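
A usage sketch of the new writer (the directory path, field name, and values below are illustrative assumptions): index per-ordinal data through the appender, then rebuild the taxonomy with different data while keeping ordinals stable:

import java.io.IOException;
import java.nio.file.Paths;
import java.util.function.BiConsumer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.directory.ReindexingEnrichedDirectoryTaxonomyWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

class EnrichedTaxonomyExample {
  static void run() throws IOException {
    // Appender invoked for every new category's ordinal document.
    BiConsumer<FacetLabel, Document> appender =
        (label, doc) -> {
          if (label.length > 0) {
            doc.add(new NumericDocValuesField("popularity", 1L)); // placeholder value
          }
        };
    try (Directory taxoDir = FSDirectory.open(Paths.get("/path/to/taxo"));
        ReindexingEnrichedDirectoryTaxonomyWriter taxoWriter =
            new ReindexingEnrichedDirectoryTaxonomyWriter(taxoDir, appender)) {
      // ... add categories, e.g. via FacetsConfig.build(taxoWriter, doc), as usual ...
      taxoWriter.commit();

      // Later: rebuild the taxonomy with updated per-ordinal data. Ordinals are preserved,
      // so documents in the main index keep referring to the same ordinals.
      taxoWriter.reindexWithNewOrdinalData((label, doc) -> { /* add updated fields */ });
    }
  }
}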
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestOrdinalData.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestOrdinalData.java
new file mode 100644
index 00000000000..47750b773b2
--- /dev/null
+++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestOrdinalData.java
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.facet.taxonomy;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.function.BiConsumer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.facet.FacetField;
+import org.apache.lucene.facet.FacetTestCase;
+import org.apache.lucene.facet.FacetsConfig;
+import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
+import org.apache.lucene.facet.taxonomy.directory.ReindexingEnrichedDirectoryTaxonomyWriter;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.IOUtils;
+import org.junit.After;
+import org.junit.Before;
+
+public class TestOrdinalData extends FacetTestCase {
+ Directory taxoDir;
+ DirectoryTaxonomyReader taxoReader;
+ IndexReader taxoIndexReader;
+ ReindexingEnrichedDirectoryTaxonomyWriter taxoWriter;
+
+ private static final Map<String, Long> labelToScore =
+ Map.of(
+ "Bob", 42L,
+ "Lisa", 35L);
+
+ private static class OrdinalDataAppender implements BiConsumer<FacetLabel, Document> {
+ private final Map<String, Long> scores;
+
+ private OrdinalDataAppender(Map<String, Long> scores) {
+ this.scores = scores;
+ }
+
+ @Override
+ public void accept(FacetLabel facetLabel, Document doc) {
+ if (facetLabel.length == 0) {
+ return;
+ }
+ Long score = scores.get(facetLabel.components[facetLabel.length - 1]);
+ if (score != null) {
+ doc.add(new NumericDocValuesField("score", score));
+ doc.add(new StringField("hasScore?", "yes", Field.Store.NO));
+ } else {
+ doc.add(new StringField("hasScore?", "no", Field.Store.NO));
+ }
+ }
+ }
+
+ @Before
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+
+ Directory indexDir = newDirectory();
+ taxoDir = newDirectory();
+
+ IndexWriter indexWriter = new IndexWriter(indexDir, new IndexWriterConfig());
+ taxoWriter =
+ new ReindexingEnrichedDirectoryTaxonomyWriter(
+ taxoDir, new OrdinalDataAppender(labelToScore));
+
+ FacetsConfig facetsConfig = new FacetsConfig();
+ facetsConfig.setHierarchical("Author", true);
+ facetsConfig.setMultiValued("Author", true);
+ facetsConfig.setHierarchical("Publish Date", true);
+ facetsConfig.setMultiValued("Publish Date", true);
+
+ Document doc;
+
+ doc = new Document();
+ doc.add(new FacetField("Author", "Bob"));
+ doc.add(new FacetField("Publish Date", "2010", "10", "15"));
+ indexWriter.addDocument(facetsConfig.build(taxoWriter, doc));
+
+ doc = new Document();
+ doc.add(new FacetField("Author", "Lisa"));
+ doc.add(new FacetField("Publish Date", "2010", "10", "20"));
+ indexWriter.addDocument(facetsConfig.build(taxoWriter, doc));
+
+ IOUtils.close(indexWriter, indexDir);
+ taxoWriter.commit();
+
+ taxoReader = new DirectoryTaxonomyReader(taxoDir);
+ taxoIndexReader = taxoReader.getInternalIndexReader();
+ }
+
+ @After
+ @Override
+ public void tearDown() throws Exception {
+ super.tearDown();
+
+ IOUtils.close(taxoWriter);
+ IOUtils.close(taxoReader);
+ IOUtils.close(taxoDir);
+ }
+
+ public void testDocValue() throws IOException {
+ // Each unique label will have been assigned a doc.
+ // Additionally, we have the root node of the taxonomy.
+ assertEquals(9, taxoIndexReader.maxDoc());
+ for (LeafReaderContext ctx : taxoIndexReader.leaves()) {
+ LeafReader leafReader = ctx.reader();
+ NumericDocValues scores = leafReader.getNumericDocValues("score");
+ if (scores == null) {
+ continue;
+ }
+ for (int ord = 0; ord < leafReader.maxDoc(); ord++) {
+ if (scores.advanceExact(ord) == false) {
+ continue;
+ }
+ FacetLabel label = taxoReader.getPath(ctx.docBase + ord);
+ Long score = labelToScore.get(label.components[label.length - 1]);
+ if (score == null) {
+ throw new IOException("Unexpected score for " + Arrays.toString(label.components));
+ }
+ assertEquals((long) score, scores.longValue());
+ }
+ }
+ }
+
+ private void validateSearchResults(IndexSearcher searcher, Map<Query, Integer> queriesAndCounts)
+ throws IOException {
+ for (Map.Entry<Query, Integer> queryAndCount : queriesAndCounts.entrySet()) {
+ Query q = queryAndCount.getKey();
+ int count = queryAndCount.getValue();
+ TopDocs td = searcher.search(q, Integer.MAX_VALUE);
+ assertEquals(count, td.totalHits.value);
+ }
+ }
+
+ public void testSearchableField() throws IOException {
+ IndexSearcher taxoSearcher = newSearcher(taxoIndexReader);
+ validateSearchResults(
+ taxoSearcher,
+ Map.of(
+ new TermQuery(new Term("hasScore?", "yes")), 2,
+ new TermQuery(new Term("hasScore?", "no")), 6));
+ }
+
+ public void testReindex() throws IOException {
+ taxoWriter.reindexWithNewOrdinalData(new OrdinalDataAppender(new HashMap<>()));
+ taxoReader.close();
+ taxoReader = new DirectoryTaxonomyReader(taxoDir);
+ taxoIndexReader = taxoReader.getInternalIndexReader();
+
+ IndexSearcher taxoSearcher = newSearcher(taxoIndexReader);
+ validateSearchResults(
+ taxoSearcher,
+ Map.of(
+ new TermQuery(new Term("hasScore?", "yes")), 0,
+ new TermQuery(new Term("hasScore?", "no")), 8));
+ }
+}