From f339e24e8ed64bf64ef4bb05111d252ecdacde1e Mon Sep 17 00:00:00 2001
From: Stefan Vodita <41467371+stefanvodita@users.noreply.github.com>
Date: Thu, 8 Feb 2024 11:02:12 +0000
Subject: [PATCH] Index arbitrary fields in taxonomy docs (#12337)
---
lucene/CHANGES.txt | 3 +
.../facet/taxonomy/directory/Consts.java | 5 +
.../directory/DirectoryTaxonomyReader.java | 2 +-
.../directory/DirectoryTaxonomyWriter.java | 40 ++++
...dexingEnrichedDirectoryTaxonomyWriter.java | 105 ++++++++++
.../facet/taxonomy/TestOrdinalData.java | 186 ++++++++++++++++++
6 files changed, 340 insertions(+), 1 deletion(-)
create mode 100644 lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/ReindexingEnrichedDirectoryTaxonomyWriter.java
create mode 100644 lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestOrdinalData.java
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index eb16b3be8c1..181d4e27157 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -197,6 +197,9 @@ New Features
better-scoring nodes are available, or the best candidate is below a score of `traversalSimilarity` in the lowest
level. (Aditya Prakash, Kaival Parikh)
+* GITHUB#12336: Index additional data per facet label in the taxonomy. (Shai Erera, Egor Potemkin, Mike McCandless,
+ Stefan Vodita)
+
Improvements
---------------------
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java
index c52805c5df1..cfcca124858 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java
@@ -17,9 +17,14 @@
package org.apache.lucene.facet.taxonomy.directory;
/**
+ * This class holds constants used by the directory taxonomy implementations.
+ *
* @lucene.experimental
*/
abstract class Consts {
+ /** The name of the field containing the full path of a taxonomy document. */
static final String FULL = "$full_path$";
+
+ /** The name of the field containing the ordinal of the parent of a taxonomy document. */
static final String FIELD_PARENT_ORDINAL_NDV = "$parent_ndv$";
}
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
index 0d641d396ec..e2cf114da0b 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
@@ -243,7 +243,7 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
* Expert: returns the underlying {@link DirectoryReader} instance that is used by this {@link
* TaxonomyReader}.
*/
- protected DirectoryReader getInternalIndexReader() {
+ public DirectoryReader getInternalIndexReader() {
ensureOpen();
return indexReader;
}
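
Making getInternalIndexReader() public is what lets applications read back fields attached to the ordinal documents by the enrichment hook added later in this patch. A minimal sketch, assuming the ordinal docs carry a NumericDocValuesField named "score"; the class name and field name here are illustrative, not part of the patch:

import java.io.IOException;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.store.Directory;

class OrdinalDataLookup {
  /** Returns the "score" doc value stored on the ordinal doc for the given label, or -1. */
  static long readScore(Directory taxoDir, FacetLabel label) throws IOException {
    try (DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir)) {
      int ordinal = taxoReader.getOrdinal(label);
      if (ordinal == TaxonomyReader.INVALID_ORDINAL) {
        return -1;
      }
      for (LeafReaderContext ctx : taxoReader.getInternalIndexReader().leaves()) {
        int docId = ordinal - ctx.docBase;
        if (docId < 0 || docId >= ctx.reader().maxDoc()) {
          continue; // the ordinal's doc lives in a different segment
        }
        NumericDocValues scores = ctx.reader().getNumericDocValues("score");
        if (scores != null && scores.advanceExact(docId)) {
          return scores.longValue();
        }
      }
      return -1; // no score was indexed for this label
    }
  }
}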
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
index 86f3d18deed..bd4934fbb04 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
@@ -27,6 +27,7 @@ import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.BiConsumer;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -436,6 +437,14 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
}
}
+ /**
+ * Child classes can implement this method to modify the document corresponding to a category path
+ * before indexing it.
+ *
+ * @lucene.experimental
+ */
+ protected void enrichOrdinalDocument(Document d, FacetLabel categoryPath) {}
+
/**
* Note that the methods calling addCategoryDocument() are synchronized, so this method is
* effectively synchronized as well.
@@ -453,6 +462,9 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
d.add(fullPathField);
+ // add arbitrary ordinal data to the doc
+ enrichOrdinalDocument(d, categoryPath);
+
indexWriter.addDocument(d);
int id = nextID.getAndIncrement();
@@ -878,6 +890,34 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
++indexEpoch;
}
+ /**
+ * Delete the taxonomy and reset all state for this writer.
+ *
+ *
+ * <p>To keep using the same main index, you would have to regenerate the taxonomy, taking care
+ * that ordinals are indexed in the same order as before. An example of this can be found in
+ * {@link ReindexingEnrichedDirectoryTaxonomyWriter#reindexWithNewOrdinalData(BiConsumer)}.
+ *
+ * @lucene.experimental
+ */
+ synchronized void deleteAll() throws IOException {
+ indexWriter.deleteAll();
+ shouldRefreshReaderManager = true;
+ initReaderManager(); // ensure that it's initialized
+ refreshReaderManager();
+ nextID.set(0);
+ taxoArrays = null; // must nullify so that it's re-computed next time it's needed
+
+ // need to clear the cache, so that addCategory won't accidentally return
+ // old categories that are in the cache.
+ cache.clear();
+ cacheIsComplete = false;
+ shouldFillCache = true;
+ cacheMisses.set(0);
+
+ // update indexEpoch, since deleting everything is effectively the same as recreating the taxonomy
+ ++indexEpoch;
+ }
+
/** Returns the {@link Directory} of this taxonomy writer. */
public Directory getDirectory() {
return dir;
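
For reference, a hypothetical subclass sketch showing how the new enrichOrdinalDocument() hook can attach extra data to each category's ordinal document; the class name and the "$depth$" field are assumptions made for illustration only:

import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.store.Directory;

class DepthEnrichedTaxonomyWriter extends DirectoryTaxonomyWriter {
  DepthEnrichedTaxonomyWriter(Directory d) throws IOException {
    super(d);
  }

  @Override
  protected void enrichOrdinalDocument(Document doc, FacetLabel categoryPath) {
    // Store each category's depth alongside its ordinal document.
    doc.add(new NumericDocValuesField("$depth$", categoryPath.length));
  }
}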
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/ReindexingEnrichedDirectoryTaxonomyWriter.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/ReindexingEnrichedDirectoryTaxonomyWriter.java
new file mode 100644
index 00000000000..91b7291b276
--- /dev/null
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/ReindexingEnrichedDirectoryTaxonomyWriter.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.facet.taxonomy.directory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.BiConsumer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.facet.taxonomy.FacetLabel;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.IOUtils;
+
+/**
+ * Use this {@link org.apache.lucene.facet.taxonomy.TaxonomyWriter} to append arbitrary fields to
+ * the ordinal documents in the taxonomy. To update the custom data added to the docs, call
+ * {@link #reindexWithNewOrdinalData(BiConsumer)}.
+ *
+ * @lucene.experimental
+ */
+public class ReindexingEnrichedDirectoryTaxonomyWriter extends DirectoryTaxonomyWriter {
+ private BiConsumer<FacetLabel, Document> ordinalDataAppender;
+
+ /** Create a taxonomy writer that will allow editing the ordinal docs before indexing them. */
+ public ReindexingEnrichedDirectoryTaxonomyWriter(
+ Directory d, BiConsumer<FacetLabel, Document> ordinalDataAppender) throws IOException {
+ super(d);
+ this.ordinalDataAppender = ordinalDataAppender;
+ }
+
+ /** Add fields specified by the {@link #ordinalDataAppender} to the provided {@link Document}. */
+ @Override
+ protected void enrichOrdinalDocument(Document d, FacetLabel categoryPath) {
+ if (ordinalDataAppender != null) {
+ ordinalDataAppender.accept(categoryPath, d);
+ }
+ }
+
+ /**
+ * Make a list of all labels in the taxonomy. The index of each label in this list is the ordinal
+ * which corresponds to it.
+ */
+ private List<FacetLabel> recordPathsInOrder(Directory d) throws IOException {
+ List<FacetLabel> paths = new ArrayList<>();
+
+ DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(d);
+ IndexReader taxoIndexReader = taxoReader.getInternalIndexReader();
+
+ for (LeafReaderContext ctx : taxoIndexReader.leaves()) {
+ LeafReader leafReader = ctx.reader();
+ int[] ordinals = new int[leafReader.maxDoc()];
+ for (int i = 0; i < ordinals.length; i++) {
+ ordinals[i] = ctx.docBase + i;
+ }
+ FacetLabel[] labels = taxoReader.getBulkPath(ordinals);
+ for (FacetLabel label : labels) {
+ paths.add(label);
+ }
+ }
+
+ IOUtils.close(taxoReader);
+ return paths;
+ }
+
+ /**
+ * Delete the existing taxonomy index and recreate it using new ordinal data. The ordinals
+ * themselves will be preserved, so the caller does not need to update references to them in the
+ * main index.
+ */
+ public synchronized void reindexWithNewOrdinalData(
+ BiConsumer<FacetLabel, Document> ordinalDataAppender) throws IOException {
+ ensureOpen();
+ this.ordinalDataAppender = ordinalDataAppender;
+ Directory d = getDirectory();
+
+ // Record paths in order.
+ List<FacetLabel> ordinalToPath = recordPathsInOrder(d);
+
+ // Delete old taxonomy files.
+ deleteAll();
+
+ // Index paths in order - they will use the new appender.
+ for (FacetLabel categoryPath : ordinalToPath) {
+ addCategory(categoryPath);
+ }
+ commit();
+ }
+}
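
A usage sketch of the new writer (the directory path, field name, and values below are illustrative assumptions): index per-ordinal data through the appender, then rebuild the taxonomy with different data while keeping ordinals stable:

import java.io.IOException;
import java.nio.file.Paths;
import java.util.function.BiConsumer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.directory.ReindexingEnrichedDirectoryTaxonomyWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

class EnrichedTaxonomyExample {
  static void run() throws IOException {
    // Appender invoked for every new category's ordinal document.
    BiConsumer<FacetLabel, Document> appender =
        (label, doc) -> {
          if (label.length > 0) {
            doc.add(new NumericDocValuesField("popularity", 1L)); // placeholder value
          }
        };
    try (Directory taxoDir = FSDirectory.open(Paths.get("/path/to/taxo"));
        ReindexingEnrichedDirectoryTaxonomyWriter taxoWriter =
            new ReindexingEnrichedDirectoryTaxonomyWriter(taxoDir, appender)) {
      // ... add categories, e.g. via FacetsConfig.build(taxoWriter, doc), as usual ...
      taxoWriter.commit();

      // Later: rebuild the taxonomy with updated per-ordinal data. Ordinals are preserved,
      // so documents in the main index keep referring to the same ordinals.
      taxoWriter.reindexWithNewOrdinalData((label, doc) -> { /* add updated fields */ });
    }
  }
}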
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestOrdinalData.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestOrdinalData.java
new file mode 100644
index 00000000000..47750b773b2
--- /dev/null
+++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestOrdinalData.java
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.facet.taxonomy;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.function.BiConsumer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.facet.FacetField;
+import org.apache.lucene.facet.FacetTestCase;
+import org.apache.lucene.facet.FacetsConfig;
+import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
+import org.apache.lucene.facet.taxonomy.directory.ReindexingEnrichedDirectoryTaxonomyWriter;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.IOUtils;
+import org.junit.After;
+import org.junit.Before;
+
+public class TestOrdinalData extends FacetTestCase {
+ Directory taxoDir;
+ DirectoryTaxonomyReader taxoReader;
+ IndexReader taxoIndexReader;
+ ReindexingEnrichedDirectoryTaxonomyWriter taxoWriter;
+
+ private static final Map<String, Long> labelToScore =
+ Map.of(
+ "Bob", 42L,
+ "Lisa", 35L);
+
+ private static class OrdinalDataAppender implements BiConsumer<FacetLabel, Document> {
+ private final Map<String, Long> scores;
+
+ private OrdinalDataAppender(Map<String, Long> scores) {
+ this.scores = scores;
+ }
+
+ @Override
+ public void accept(FacetLabel facetLabel, Document doc) {
+ if (facetLabel.length == 0) {
+ return;
+ }
+ Long score = scores.get(facetLabel.components[facetLabel.length - 1]);
+ if (score != null) {
+ doc.add(new NumericDocValuesField("score", score));
+ doc.add(new StringField("hasScore?", "yes", Field.Store.NO));
+ } else {
+ doc.add(new StringField("hasScore?", "no", Field.Store.NO));
+ }
+ }
+ }
+
+ @Before
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+
+ Directory indexDir = newDirectory();
+ taxoDir = newDirectory();
+
+ IndexWriter indexWriter = new IndexWriter(indexDir, new IndexWriterConfig());
+ taxoWriter =
+ new ReindexingEnrichedDirectoryTaxonomyWriter(
+ taxoDir, new OrdinalDataAppender(labelToScore));
+
+ FacetsConfig facetsConfig = new FacetsConfig();
+ facetsConfig.setHierarchical("Author", true);
+ facetsConfig.setMultiValued("Author", true);
+ facetsConfig.setHierarchical("Publish Date", true);
+ facetsConfig.setMultiValued("Publish Date", true);
+
+ Document doc;
+
+ doc = new Document();
+ doc.add(new FacetField("Author", "Bob"));
+ doc.add(new FacetField("Publish Date", "2010", "10", "15"));
+ indexWriter.addDocument(facetsConfig.build(taxoWriter, doc));
+
+ doc = new Document();
+ doc.add(new FacetField("Author", "Lisa"));
+ doc.add(new FacetField("Publish Date", "2010", "10", "20"));
+ indexWriter.addDocument(facetsConfig.build(taxoWriter, doc));
+
+ IOUtils.close(indexWriter, indexDir);
+ taxoWriter.commit();
+
+ taxoReader = new DirectoryTaxonomyReader(taxoDir);
+ taxoIndexReader = taxoReader.getInternalIndexReader();
+ }
+
+ @After
+ @Override
+ public void tearDown() throws Exception {
+ super.tearDown();
+
+ IOUtils.close(taxoWriter);
+ IOUtils.close(taxoReader);
+ IOUtils.close(taxoDir);
+ }
+
+ public void testDocValue() throws IOException {
+ // Each unique label will have been assigned a doc.
+ // Additionally, we have the root node of the taxonomy.
+ assertEquals(9, taxoIndexReader.maxDoc());
+ for (LeafReaderContext ctx : taxoIndexReader.leaves()) {
+ LeafReader leafReader = ctx.reader();
+ NumericDocValues scores = leafReader.getNumericDocValues("score");
+ if (scores == null) {
+ continue;
+ }
+ for (int ord = 0; ord < leafReader.maxDoc(); ord++) {
+ if (scores.advanceExact(ord) == false) {
+ continue;
+ }
+ FacetLabel label = taxoReader.getPath(ctx.docBase + ord);
+ Long score = labelToScore.get(label.components[label.length - 1]);
+ if (score == null) {
+ throw new IOException("Unexpected score for " + Arrays.toString(label.components));
+ }
+ assertEquals((long) score, scores.longValue());
+ }
+ }
+ }
+
+ private void validateSearchResults(IndexSearcher searcher, Map<Query, Integer> queriesAndCounts)
+ throws IOException {
+ for (Map.Entry<Query, Integer> queryAndCount : queriesAndCounts.entrySet()) {
+ Query q = queryAndCount.getKey();
+ int count = queryAndCount.getValue();
+ TopDocs td = searcher.search(q, Integer.MAX_VALUE);
+ assertEquals(count, td.totalHits.value);
+ }
+ }
+
+ public void testSearchableField() throws IOException {
+ IndexSearcher taxoSearcher = newSearcher(taxoIndexReader);
+ validateSearchResults(
+ taxoSearcher,
+ Map.of(
+ new TermQuery(new Term("hasScore?", "yes")), 2,
+ new TermQuery(new Term("hasScore?", "no")), 6));
+ }
+
+ public void testReindex() throws IOException {
+ taxoWriter.reindexWithNewOrdinalData(new OrdinalDataAppender(new HashMap<>()));
+ taxoReader.close();
+ taxoReader = new DirectoryTaxonomyReader(taxoDir);
+ taxoIndexReader = taxoReader.getInternalIndexReader();
+
+ IndexSearcher taxoSearcher = newSearcher(taxoIndexReader);
+ validateSearchResults(
+ taxoSearcher,
+ Map.of(
+ new TermQuery(new Term("hasScore?", "yes")), 0,
+ new TermQuery(new Term("hasScore?", "no")), 8));
+ }
+}