From f339e24e8ed64bf64ef4bb05111d252ecdacde1e Mon Sep 17 00:00:00 2001
From: Stefan Vodita <41467371+stefanvodita@users.noreply.github.com>
Date: Thu, 8 Feb 2024 11:02:12 +0000
Subject: [PATCH] Index arbitrary fields in taxonomy docs (#12337)

---
 lucene/CHANGES.txt                            |   3 +
 .../facet/taxonomy/directory/Consts.java      |   5 +
 .../directory/DirectoryTaxonomyReader.java    |   2 +-
 .../directory/DirectoryTaxonomyWriter.java    |  40 ++++
 ...dexingEnrichedDirectoryTaxonomyWriter.java | 128 ++++++++++++
 .../facet/taxonomy/TestOrdinalData.java       | 186 ++++++++++++++++++
 6 files changed, 363 insertions(+), 1 deletion(-)
 create mode 100644 lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/ReindexingEnrichedDirectoryTaxonomyWriter.java
 create mode 100644 lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestOrdinalData.java

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index eb16b3be8c1..181d4e27157 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -197,6 +197,9 @@ New Features
   better-scoring nodes are available, or the best candidate is below a score of `traversalSimilarity` in the lowest
   level. (Aditya Prakash, Kaival Parikh)
 
+* GITHUB#12336: Index additional data per facet label in the taxonomy. (Shai Erera, Egor Potemkin, Mike McCandless,
+  Stefan Vodita)
+
 Improvements
 ---------------------
 
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java
index c52805c5df1..cfcca124858 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/Consts.java
@@ -17,9 +17,14 @@
 package org.apache.lucene.facet.taxonomy.directory;
 
 /**
+ * This class holds constants used by the directory taxonomy implementations.
+ *
  * @lucene.experimental
  */
 abstract class Consts {
+  /** The name of the field containing the full path of a taxonomy document. */
   static final String FULL = "$full_path$";
+
+  /** The name of the field containing the ordinal of the parent of a taxonomy document. */
   static final String FIELD_PARENT_ORDINAL_NDV = "$parent_ndv$";
 }
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
index 0d641d396ec..e2cf114da0b 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
@@ -243,7 +243,7 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
    * Expert: returns the underlying {@link DirectoryReader} instance that is used by this {@link
    * TaxonomyReader}.
    */
-  protected DirectoryReader getInternalIndexReader() {
+  public DirectoryReader getInternalIndexReader() {
     ensureOpen();
     return indexReader;
   }
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
index 86f3d18deed..bd4934fbb04 100644
--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java
@@ -27,6 +27,7 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.Objects;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.BiConsumer;
 import org.apache.lucene.document.BinaryDocValuesField;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
@@ -436,6 +437,14 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
     }
   }
 
+  /**
+   * Child classes can implement this method to modify the document corresponding to a category path
+   * before indexing it.
+   *
+   * @lucene.experimental
+   */
+  protected void enrichOrdinalDocument(Document d, FacetLabel categoryPath) {}
+
   /**
    * Note that the methods calling addCategoryDocument() are synchronized, so this method is
    * effectively synchronized as well.
@@ -453,6 +462,9 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
 
     d.add(fullPathField);
 
+    // add arbitrary ordinal data to the doc
+    enrichOrdinalDocument(d, categoryPath);
+
     indexWriter.addDocument(d);
     int id = nextID.getAndIncrement();
 
@@ -878,6 +890,34 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
     ++indexEpoch;
   }
 
+  /**
+   * Delete the taxonomy and reset all state for this writer.
+   *
+   * <p>To keep using the same main index, you would have to regenerate the taxonomy, taking care
+   * that ordinals are indexed in the same order as before. An example of this can be found in
+   * {@link ReindexingEnrichedDirectoryTaxonomyWriter#reindexWithNewOrdinalData(BiConsumer)}.
+   *
+   * @lucene.experimental
+   */
+  synchronized void deleteAll() throws IOException {
+    indexWriter.deleteAll();
+    shouldRefreshReaderManager = true;
+    initReaderManager(); // ensure that it's initialized
+    refreshReaderManager();
+    nextID.set(0);
+    taxoArrays = null; // must nullify so that it's re-computed next time it's needed
+
+    // need to clear the cache, so that addCategory won't accidentally return
+    // old categories that are in the cache.
+    cache.clear();
+    cacheIsComplete = false;
+    shouldFillCache = true;
+    cacheMisses.set(0);
+
+    // update indexEpoch, since deleting the whole taxonomy is just like recreating it
+    ++indexEpoch;
+  }
+
   /** Returns the {@link Directory} of this taxonomy writer. */
   public Directory getDirectory() {
     return dir;
diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/ReindexingEnrichedDirectoryTaxonomyWriter.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/ReindexingEnrichedDirectoryTaxonomyWriter.java
new file mode 100644
index 00000000000..91b7291b276
--- /dev/null
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/ReindexingEnrichedDirectoryTaxonomyWriter.java
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.facet.taxonomy.directory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.BiConsumer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.facet.taxonomy.FacetLabel;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.store.Directory;
+
+/**
+ * Use this {@link org.apache.lucene.facet.taxonomy.TaxonomyWriter} to append arbitrary fields to
+ * the ordinal documents in the taxonomy. To update the custom data added to the docs, it is
+ * required to {@link #reindexWithNewOrdinalData(BiConsumer)}.
+ *
+ * @lucene.experimental
+ */
+public class ReindexingEnrichedDirectoryTaxonomyWriter extends DirectoryTaxonomyWriter {
+  private BiConsumer<FacetLabel, Document> ordinalDataAppender;
+
+  /**
+   * Create a taxonomy writer that will allow editing the ordinal docs before indexing them.
+   *
+   * @param d the directory passed on to {@link DirectoryTaxonomyWriter}
+   * @param ordinalDataAppender callback invoked with each category path and its document before
+   *     the document is indexed; if {@code null}, documents are left unchanged
+   * @throws IOException if the {@link DirectoryTaxonomyWriter} constructor fails
+   */
+  public ReindexingEnrichedDirectoryTaxonomyWriter(
+      Directory d, BiConsumer<FacetLabel, Document> ordinalDataAppender) throws IOException {
+    super(d);
+    this.ordinalDataAppender = ordinalDataAppender;
+  }
+
+  /** Add fields specified by the {@link #ordinalDataAppender} to the provided {@link Document}. */
+  @Override
+  protected void enrichOrdinalDocument(Document d, FacetLabel categoryPath) {
+    // NOTE(review): null is tolerated; presumably this can run before the constructor assigns
+    // the appender (e.g. from the superclass constructor) - confirm.
+    if (ordinalDataAppender != null) {
+      ordinalDataAppender.accept(categoryPath, d);
+    }
+  }
+
+  /**
+   * Make a list of all labels in the taxonomy. The index of each label in this list is the ordinal
+   * which corresponds to it.
+   */
+  private List<FacetLabel> recordPathsInOrder(Directory d) throws IOException {
+    List<FacetLabel> paths = new ArrayList<>();
+
+    // try-with-resources releases the reader even if reading the paths fails part-way
+    try (DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(d)) {
+      IndexReader taxoIndexReader = taxoReader.getInternalIndexReader();
+
+      for (LeafReaderContext ctx : taxoIndexReader.leaves()) {
+        LeafReader leafReader = ctx.reader();
+        // doc ids of this leaf, shifted by docBase, are the ordinals to look up
+        int[] ordinals = new int[leafReader.maxDoc()];
+        for (int i = 0; i < ordinals.length; i++) {
+          ordinals[i] = ctx.docBase + i;
+        }
+        FacetLabel[] labels = taxoReader.getBulkPath(ordinals);
+        for (FacetLabel label : labels) {
+          paths.add(label);
+        }
+      }
+    }
+
+    return paths;
+  }
+
+  /**
+   * Delete the existing taxonomy index and recreate it using new ordinal data. The ordinals
+   * themselves will be preserved, so the caller does not need to update references to them in the
+   * main index.
+   *
+   * <p>Pending changes are committed first: the existing paths are read back from the {@link
+   * Directory}, so categories that were never committed could otherwise be lost.
+   *
+   * @param ordinalDataAppender callback used to enrich the re-indexed documents; it replaces the
+   *     appender this writer was using
+   * @throws IOException if committing, reading or re-indexing the taxonomy fails
+   */
+  public synchronized void reindexWithNewOrdinalData(
+      BiConsumer<FacetLabel, Document> ordinalDataAppender) throws IOException {
+    ensureOpen();
+
+    // The paths are read through a reader opened on the directory, which is expected to see
+    // committed categories only, so flush everything added so far.
+    commit();
+
+    // Record paths in order.
+    List<FacetLabel> ordinalToPath = recordPathsInOrder(getDirectory());
+
+    // Swap appenders only after the old paths were read, so a failure above leaves this writer
+    // configured as before.
+    this.ordinalDataAppender = ordinalDataAppender;
+
+    // Delete old taxonomy files.
+    deleteAll();
+
+    // Index paths in order - they will use the new appender.
+    for (FacetLabel categoryPath : ordinalToPath) {
+      addCategory(categoryPath);
+    }
+    commit();
+  }
+}
diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestOrdinalData.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestOrdinalData.java
new file mode 100644
index 00000000000..47750b773b2
--- /dev/null
+++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestOrdinalData.java
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.facet.taxonomy;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.function.BiConsumer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.facet.FacetField;
+import org.apache.lucene.facet.FacetTestCase;
+import org.apache.lucene.facet.FacetsConfig;
+import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
+import org.apache.lucene.facet.taxonomy.directory.ReindexingEnrichedDirectoryTaxonomyWriter;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.IOUtils;
+import org.junit.After;
+import org.junit.Before;
+
+/** Tests for per-ordinal data indexed by {@link ReindexingEnrichedDirectoryTaxonomyWriter}. */
+public class TestOrdinalData extends FacetTestCase {
+  Directory taxoDir;
+  DirectoryTaxonomyReader taxoReader;
+  IndexReader taxoIndexReader;
+  ReindexingEnrichedDirectoryTaxonomyWriter taxoWriter;
+
+  private static final Map<String, Long> labelToScore =
+      Map.of(
+          "Bob", 42L,
+          "Lisa", 35L);
+
+  private static class OrdinalDataAppender implements BiConsumer<FacetLabel, Document> {
+    private final Map<String, Long> scores;
+
+    private OrdinalDataAppender(Map<String, Long> scores) {
+      this.scores = scores;
+    }
+
+    @Override
+    public void accept(FacetLabel facetLabel, Document doc) {
+      if (facetLabel.length == 0) {
+        return;
+      }
+      Long score = scores.get(facetLabel.components[facetLabel.length - 1]);
+      if (score != null) {
+        doc.add(new NumericDocValuesField("score", score));
+        doc.add(new StringField("hasScore?", "yes", Field.Store.NO));
+      } else {
+        doc.add(new StringField("hasScore?", "no", Field.Store.NO));
+      }
+    }
+  }
+
+  @Before
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+
+    Directory indexDir = newDirectory();
+    taxoDir = newDirectory();
+
+    IndexWriter indexWriter = new IndexWriter(indexDir, new IndexWriterConfig());
+    taxoWriter =
+        new ReindexingEnrichedDirectoryTaxonomyWriter(
+            taxoDir, new OrdinalDataAppender(labelToScore));
+
+    FacetsConfig facetsConfig = new FacetsConfig();
+    facetsConfig.setHierarchical("Author", true);
+    facetsConfig.setMultiValued("Author", true);
+    facetsConfig.setHierarchical("Publish Date", true);
+    facetsConfig.setMultiValued("Publish Date", true);
+
+    Document doc;
+
+    doc = new Document();
+    doc.add(new FacetField("Author", "Bob"));
+    doc.add(new FacetField("Publish Date", "2010", "10", "15"));
+    indexWriter.addDocument(facetsConfig.build(taxoWriter, doc));
+
+    doc = new Document();
+    doc.add(new FacetField("Author", "Lisa"));
+    doc.add(new FacetField("Publish Date", "2010", "10", "20"));
+    indexWriter.addDocument(facetsConfig.build(taxoWriter, doc));
+
+    IOUtils.close(indexWriter, indexDir);
+    taxoWriter.commit();
+
+    taxoReader = new DirectoryTaxonomyReader(taxoDir);
+    taxoIndexReader = taxoReader.getInternalIndexReader();
+  }
+
+  @After
+  @Override
+  public void tearDown() throws Exception {
+    super.tearDown();
+
+    // a single call attempts to close every resource, even if closing an earlier one fails
+    IOUtils.close(taxoWriter, taxoReader, taxoDir);
+  }
+
+  public void testDocValue() throws IOException {
+    // Each unique label will have been assigned a doc.
+    // Additionally, we have the root node of the taxonomy.
+    assertEquals(9, taxoIndexReader.maxDoc());
+    for (LeafReaderContext ctx : taxoIndexReader.leaves()) {
+      LeafReader leafReader = ctx.reader();
+      NumericDocValues scores = leafReader.getNumericDocValues("score");
+      if (scores == null) {
+        continue;
+      }
+      for (int ord = 0; ord < leafReader.maxDoc(); ord++) {
+        if (scores.advanceExact(ord) == false) {
+          continue;
+        }
+        FacetLabel label = taxoReader.getPath(ctx.docBase + ord);
+        Long score = labelToScore.get(label.components[label.length - 1]);
+        if (score == null) {
+          throw new IOException("Unexpected score for " + Arrays.toString(label.components));
+        }
+        assertEquals((long) score, scores.longValue());
+      }
+    }
+  }
+
+  private void validateSearchResults(IndexSearcher searcher, Map<Query, Integer> queriesAndCounts)
+      throws IOException {
+    for (Map.Entry<Query, Integer> queryAndCount : queriesAndCounts.entrySet()) {
+      Query q = queryAndCount.getKey();
+      int count = queryAndCount.getValue();
+      TopDocs td = searcher.search(q, Integer.MAX_VALUE);
+      assertEquals(count, td.totalHits.value);
+    }
+  }
+
+  public void testSearchableField() throws IOException {
+    IndexSearcher taxoSearcher = newSearcher(taxoIndexReader);
+    validateSearchResults(
+        taxoSearcher,
+        Map.of(
+            new TermQuery(new Term("hasScore?", "yes")), 2,
+            new TermQuery(new Term("hasScore?", "no")), 6));
+  }
+
+  public void testReindex() throws IOException {
+    taxoWriter.reindexWithNewOrdinalData(new OrdinalDataAppender(new HashMap<>()));
+    taxoReader.close();
+    taxoReader = new DirectoryTaxonomyReader(taxoDir);
+    taxoIndexReader = taxoReader.getInternalIndexReader();
+
+    IndexSearcher taxoSearcher = newSearcher(taxoIndexReader);
+    validateSearchResults(
+        taxoSearcher,
+        Map.of(
+            new TermQuery(new Term("hasScore?", "yes")), 0,
+            new TermQuery(new Term("hasScore?", "no")), 8));
+  }
+}