Index arbitrary fields in taxonomy docs (#12337)

Stefan Vodita 2024-02-08 11:02:12 +00:00 committed by GitHub
parent bff5ac0ed0
commit f339e24e8e
6 changed files with 340 additions and 1 deletion

View File

@@ -197,6 +197,9 @@ New Features
better-scoring nodes are available, or the best candidate is below a score of `traversalSimilarity` in the lowest
level. (Aditya Prakash, Kaival Parikh)
* GITHUB#12336: Index additional data per facet label in the taxonomy. (Shai Erera, Egor Potemkin, Mike McCandless,
Stefan Vodita)
Improvements
---------------------

View File

@@ -17,9 +17,14 @@
package org.apache.lucene.facet.taxonomy.directory;
/**
* This class holds constants used by the directory taxonomy implementations.
*
* @lucene.experimental
*/
abstract class Consts {
/** The name of the field containing the full path of a taxonomy document. */
static final String FULL = "$full_path$";
/** The name of the field containing the ordinal of the parent of a taxonomy document. */
static final String FIELD_PARENT_ORDINAL_NDV = "$parent_ndv$";
}

View File

@@ -243,7 +243,7 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
* Expert: returns the underlying {@link DirectoryReader} instance that is used by this {@link
* TaxonomyReader}.
*/
- protected DirectoryReader getInternalIndexReader() {
+ public DirectoryReader getInternalIndexReader() {
ensureOpen();
return indexReader;
}
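Widening this accessor from protected to public lets code outside the package read per-ordinal fields directly from the taxonomy's backing index. A minimal sketch of that pattern, assuming an existing taxonomy directory `taxoDir`; the doc-values field name "score" is illustrative, mirroring the test at the end of this commit:

DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
DirectoryReader taxoIndexReader = taxoReader.getInternalIndexReader();
for (LeafReaderContext ctx : taxoIndexReader.leaves()) {
  // Custom fields appended to ordinal documents are readable like any other indexed field.
  NumericDocValues scores = ctx.reader().getNumericDocValues("score");
  if (scores != null && scores.advanceExact(0)) {
    long score = scores.longValue(); // value for the first ordinal in this segment
  }
}
taxoReader.close();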

View File

@@ -27,6 +27,7 @@ import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.BiConsumer;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -436,6 +437,14 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
}
}
/**
* Child classes can implement this method to modify the document corresponding to a category path
* before indexing it.
*
* @lucene.experimental
*/
protected void enrichOrdinalDocument(Document d, FacetLabel categoryPath) {}
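For illustration, a hedged sketch of a subclass using this hook; the class and the "$depth" field name are hypothetical, not part of this commit:

class DepthEnrichedTaxonomyWriter extends DirectoryTaxonomyWriter {
  DepthEnrichedTaxonomyWriter(Directory d) throws IOException {
    super(d);
  }

  @Override
  protected void enrichOrdinalDocument(Document d, FacetLabel categoryPath) {
    // Tag each ordinal document with the depth of its category path.
    d.add(new NumericDocValuesField("$depth", categoryPath.length));
  }
}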
/**
* Note that the methods calling addCategoryDocument() are synchronized, so this method is
* effectively synchronized as well.
@@ -453,6 +462,9 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
d.add(fullPathField);
// add arbitrary ordinal data to the doc
enrichOrdinalDocument(d, categoryPath);
indexWriter.addDocument(d);
int id = nextID.getAndIncrement();
@@ -878,6 +890,34 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
++indexEpoch;
}
/**
* Delete the taxonomy and reset all state for this writer.
*
* <p>To keep using the same main index, you would have to regenerate the taxonomy, taking care
* that ordinals are indexed in the same order as before. An example of this can be found in
* {@link ReindexingEnrichedDirectoryTaxonomyWriter#reindexWithNewOrdinalData(BiConsumer)}.
*
* @lucene.experimental
*/
synchronized void deleteAll() throws IOException {
indexWriter.deleteAll();
shouldRefreshReaderManager = true;
initReaderManager(); // ensure that it's initialized
refreshReaderManager();
nextID.set(0);
taxoArrays = null; // must nullify so that it's re-computed next time it's needed
// need to clear the cache, so that addCategory won't accidentally return
// old categories that are in the cache.
cache.clear();
cacheIsComplete = false;
shouldFillCache = true;
cacheMisses.set(0);
// update indexEpoch, as replacing the taxonomy is just like recreating it
++indexEpoch;
}
/** Returns the {@link Directory} of this taxonomy writer. */
public Directory getDirectory() {
return dir;

View File

@@ -0,0 +1,105 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.taxonomy.directory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.function.BiConsumer;
import org.apache.lucene.document.Document;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
/**
* Use this {@link org.apache.lucene.facet.taxonomy.TaxonomyWriter} to append arbitrary fields to
* the ordinal documents in the taxonomy. To update the custom data added to the docs, call
* {@link #reindexWithNewOrdinalData(BiConsumer)}.
*
* @lucene.experimental
*/
public class ReindexingEnrichedDirectoryTaxonomyWriter extends DirectoryTaxonomyWriter {
private BiConsumer<FacetLabel, Document> ordinalDataAppender;
/** Create a taxonomy writer that will allow editing the ordinal docs before indexing them. */
public ReindexingEnrichedDirectoryTaxonomyWriter(
Directory d, BiConsumer<FacetLabel, Document> ordinalDataAppender) throws IOException {
super(d);
this.ordinalDataAppender = ordinalDataAppender;
}
/** Add fields specified by the {@link #ordinalDataAppender} to the provided {@link Document}. */
@Override
protected void enrichOrdinalDocument(Document d, FacetLabel categoryPath) {
if (ordinalDataAppender != null) {
ordinalDataAppender.accept(categoryPath, d);
}
}
/**
* Make a list of all labels in the taxonomy. The index of each label in this list is the ordinal
* which corresponds to it.
*/
private List<FacetLabel> recordPathsInOrder(Directory d) throws IOException {
List<FacetLabel> paths = new ArrayList<>();
DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(d);
IndexReader taxoIndexReader = taxoReader.getInternalIndexReader();
for (LeafReaderContext ctx : taxoIndexReader.leaves()) {
LeafReader leafReader = ctx.reader();
int[] ordinals = new int[leafReader.maxDoc()];
for (int i = 0; i < ordinals.length; i++) {
ordinals[i] = ctx.docBase + i;
}
FacetLabel[] labels = taxoReader.getBulkPath(ordinals);
for (FacetLabel label : labels) {
paths.add(label);
}
}
IOUtils.close(taxoReader);
return paths;
}
/**
* Delete the existing taxonomy index and recreate it using new ordinal data. The ordinals
* themselves will be preserved, so the caller does not need to update references to them in the
* main index.
*/
public synchronized void reindexWithNewOrdinalData(
BiConsumer<FacetLabel, Document> ordinalDataAppender) throws IOException {
ensureOpen();
this.ordinalDataAppender = ordinalDataAppender;
Directory d = getDirectory();
// Record paths in order.
List<FacetLabel> ordinalToPath = recordPathsInOrder(d);
// Delete old taxonomy files.
deleteAll();
// Index paths in order - they will use the new appender.
for (FacetLabel categoryPath : ordinalToPath) {
addCategory(categoryPath);
}
commit();
}
}
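Putting the new writer together, usage might look like this hedged sketch; the "popularity" field and lookup map are illustrative, not part of this commit:

Map<String, Long> popularity = Map.of("Bob", 7L, "Lisa", 3L);
BiConsumer<FacetLabel, Document> appender =
    (label, doc) -> {
      if (label.length > 0) {
        Long p = popularity.get(label.components[label.length - 1]);
        if (p != null) {
          doc.add(new NumericDocValuesField("popularity", p));
        }
      }
    };
ReindexingEnrichedDirectoryTaxonomyWriter taxoWriter =
    new ReindexingEnrichedDirectoryTaxonomyWriter(taxoDir, appender);
// ... add facet documents through FacetsConfig.build(taxoWriter, doc) as usual ...
taxoWriter.commit();
// Later: swap in different per-ordinal data; the ordinals themselves are preserved.
taxoWriter.reindexWithNewOrdinalData((label, doc) -> { /* append replacement fields */ });
taxoWriter.close();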

View File

@@ -0,0 +1,186 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.taxonomy;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.function.BiConsumer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.facet.FacetField;
import org.apache.lucene.facet.FacetTestCase;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.ReindexingEnrichedDirectoryTaxonomyWriter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.junit.After;
import org.junit.Before;
public class TestOrdinalData extends FacetTestCase {
Directory taxoDir;
DirectoryTaxonomyReader taxoReader;
IndexReader taxoIndexReader;
ReindexingEnrichedDirectoryTaxonomyWriter taxoWriter;
private static final Map<String, Long> labelToScore =
Map.of(
"Bob", 42L,
"Lisa", 35L);
private static class OrdinalDataAppender implements BiConsumer<FacetLabel, Document> {
private final Map<String, Long> scores;
private OrdinalDataAppender(Map<String, Long> scores) {
this.scores = scores;
}
@Override
public void accept(FacetLabel facetLabel, Document doc) {
if (facetLabel.length == 0) {
return;
}
Long score = scores.get(facetLabel.components[facetLabel.length - 1]);
if (score != null) {
doc.add(new NumericDocValuesField("score", score));
doc.add(new StringField("hasScore?", "yes", Field.Store.NO));
} else {
doc.add(new StringField("hasScore?", "no", Field.Store.NO));
}
}
}
@Before
@Override
public void setUp() throws Exception {
super.setUp();
Directory indexDir = newDirectory();
taxoDir = newDirectory();
IndexWriter indexWriter = new IndexWriter(indexDir, new IndexWriterConfig());
taxoWriter =
new ReindexingEnrichedDirectoryTaxonomyWriter(
taxoDir, new OrdinalDataAppender(labelToScore));
FacetsConfig facetsConfig = new FacetsConfig();
facetsConfig.setHierarchical("Author", true);
facetsConfig.setMultiValued("Author", true);
facetsConfig.setHierarchical("Publish Date", true);
facetsConfig.setMultiValued("Publish Date", true);
Document doc;
doc = new Document();
doc.add(new FacetField("Author", "Bob"));
doc.add(new FacetField("Publish Date", "2010", "10", "15"));
indexWriter.addDocument(facetsConfig.build(taxoWriter, doc));
doc = new Document();
doc.add(new FacetField("Author", "Lisa"));
doc.add(new FacetField("Publish Date", "2010", "10", "20"));
indexWriter.addDocument(facetsConfig.build(taxoWriter, doc));
IOUtils.close(indexWriter, indexDir);
taxoWriter.commit();
taxoReader = new DirectoryTaxonomyReader(taxoDir);
taxoIndexReader = taxoReader.getInternalIndexReader();
}
@After
@Override
public void tearDown() throws Exception {
super.tearDown();
IOUtils.close(taxoWriter);
IOUtils.close(taxoReader);
IOUtils.close(taxoDir);
}
public void testDocValue() throws IOException {
// Each unique label in the two indexed docs gets its own taxonomy doc: Author, Author/Bob,
// Author/Lisa, Publish Date, Publish Date/2010, Publish Date/2010/10, Publish Date/2010/10/15,
// and Publish Date/2010/10/20. Together with the taxonomy root, that makes 9 docs.
assertEquals(9, taxoIndexReader.maxDoc());
for (LeafReaderContext ctx : taxoIndexReader.leaves()) {
LeafReader leafReader = ctx.reader();
NumericDocValues scores = leafReader.getNumericDocValues("score");
if (scores == null) {
continue;
}
for (int ord = 0; ord < leafReader.maxDoc(); ord++) {
if (scores.advanceExact(ord) == false) {
continue;
}
FacetLabel label = taxoReader.getPath(ctx.docBase + ord);
Long score = labelToScore.get(label.components[label.length - 1]);
if (score == null) {
throw new IOException("Unexpected score for " + Arrays.toString(label.components));
}
assertEquals((long) score, scores.longValue());
}
}
}
private void validateSearchResults(IndexSearcher searcher, Map<Query, Integer> queriesAndCounts)
throws IOException {
for (Map.Entry<Query, Integer> queryAndCount : queriesAndCounts.entrySet()) {
Query q = queryAndCount.getKey();
int count = queryAndCount.getValue();
TopDocs td = searcher.search(q, Integer.MAX_VALUE);
assertEquals(count, td.totalHits.value);
}
}
public void testSearchableField() throws IOException {
IndexSearcher taxoSearcher = newSearcher(taxoIndexReader);
validateSearchResults(
taxoSearcher,
Map.of(
new TermQuery(new Term("hasScore?", "yes")), 2,
new TermQuery(new Term("hasScore?", "no")), 6));
}
public void testReindex() throws IOException {
taxoWriter.reindexWithNewOrdinalData(new OrdinalDataAppender(new HashMap<>()));
taxoReader.close();
taxoReader = new DirectoryTaxonomyReader(taxoDir);
taxoIndexReader = taxoReader.getInternalIndexReader();
IndexSearcher taxoSearcher = newSearcher(taxoIndexReader);
validateSearchResults(
taxoSearcher,
Map.of(
new TermQuery(new Term("hasScore?", "yes")), 0,
new TermQuery(new Term("hasScore?", "no")), 8));
}
}