From 10ebc099c846c7d96f4ff5f9b7853df850fa8442 Mon Sep 17 00:00:00 2001 From: Gautam Worah Date: Fri, 15 Apr 2022 10:45:02 -0700 Subject: [PATCH] LUCENE-10482 Allow users to create their own DirectoryTaxonomyReaders with empty taxoArrays instead of letting the taxoEpoch decide (#762) --- lucene/CHANGES.txt | 5 + .../directory/DirectoryTaxonomyReader.java | 20 +- ...tAlwaysRefreshDirectoryTaxonomyReader.java | 194 ++++++++++++++++++ 3 files changed, 214 insertions(+), 5 deletions(-) create mode 100644 lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestAlwaysRefreshDirectoryTaxonomyReader.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 921410d0763..d526aed3a3a 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -61,11 +61,16 @@ Other (Tomoko Uchida) ======================= Lucene 9.2.0 ======================= + API Changes --------------------- * LUCENE-10325: Facets API extended to support getTopFacets. (Yuting Gan) +* LUCENE-10482: Allow users to create their own DirectoryTaxonomyReaders with empty taxoArrays instead of letting the + taxoEpoch decide. Add a test case that demonstrates the inconsistencies caused when you reuse taxoArrays on older + checkpoints. (Gautam Worah) + New Features --------------------- diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java index 576cb3dcdd9..984e13d67db 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java @@ -78,10 +78,20 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab private volatile TaxonomyIndexArrays taxoArrays; /** - * Called only from {@link #doOpenIfChanged()}. If the taxonomy has been recreated, you should - * pass {@code null} as the caches and parent/children arrays. + * Expert: Use this method to explicitly force the {@link DirectoryTaxonomyReader} to use specific + * parent/children arrays and caches. + * + *

Called from {@link #doOpenIfChanged()}. If the taxonomy has been recreated, you should pass + * {@code null} as the caches and parent/children arrays. + * + * @param indexReader An indexReader that is opened in the desired Directory + * @param taxoWriter The {@link DirectoryTaxonomyWriter} from which to obtain newly added + * categories, in real-time. + * @param ordinalCache a FacetLabel to Integer ordinal mapping if it already exists + * @param categoryCache an ordinal to FacetLabel mapping if it already exists + * @param taxoArrays taxonomy arrays that store the parent, siblings, children information */ - DirectoryTaxonomyReader( + protected DirectoryTaxonomyReader( DirectoryReader indexReader, DirectoryTaxonomyWriter taxoWriter, LRUHashMap ordinalCache, @@ -207,7 +217,7 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab final DirectoryTaxonomyReader newtr; if (recreated) { - // if recreated, do not reuse anything from this instace. the information + // if recreated, do not reuse anything from this instance. the information // will be lazily computed by the new instance when needed. newtr = new DirectoryTaxonomyReader(r2, taxoWriter, null, null, null); } else { @@ -238,7 +248,7 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab * Expert: returns the underlying {@link DirectoryReader} instance that is used by this {@link * TaxonomyReader}. */ - DirectoryReader getInternalIndexReader() { + protected DirectoryReader getInternalIndexReader() { ensureOpen(); return indexReader; } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestAlwaysRefreshDirectoryTaxonomyReader.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestAlwaysRefreshDirectoryTaxonomyReader.java new file mode 100644 index 00000000000..822500b3630 --- /dev/null +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestAlwaysRefreshDirectoryTaxonomyReader.java @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.facet.taxonomy.directory; + +import static com.carrotsearch.randomizedtesting.RandomizedTest.sleep; +import static org.apache.lucene.tests.mockfile.ExtrasFS.isExtra; + +import java.io.IOException; +import java.nio.file.Path; +import java.time.Instant; +import java.util.List; +import java.util.function.Function; +import org.apache.lucene.facet.FacetTestCase; +import org.apache.lucene.facet.FacetsCollector; +import org.apache.lucene.facet.FacetsConfig; +import org.apache.lucene.facet.taxonomy.FacetLabel; +import org.apache.lucene.facet.taxonomy.SearcherTaxonomyManager; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.IOUtils; + +// Nefarious FS will delay/stop deletion of index files and this test specifically does that +@LuceneTestCase.SuppressFileSystems({"WindowsFS", "VirusCheckingFS"}) +public class TestAlwaysRefreshDirectoryTaxonomyReader extends FacetTestCase { + + /** + * Tests the behavior of the {@link AlwaysRefreshDirectoryTaxonomyReader} by testing if the + * associated {@link SearcherTaxonomyManager} can successfully refresh and serve queries if the + * underlying taxonomy index is changed to an older checkpoint. Ideally, each checkpoint should be + * self-sufficient and should allow serving search queries when {@link + * SearcherTaxonomyManager#maybeRefresh()} is called. + * + *

It does not check whether the private taxoArrays were actually recreated or no. We are + * (correctly) hiding away that complexity away from the user. + */ + private void testAlwaysRefreshDirectoryTaxonomyReader( + Function dtrProducer, Class exceptionType) + throws IOException { + final Path taxoPath1 = createTempDir(String.valueOf(Instant.now())); + final Directory dir1 = newFSDirectory(taxoPath1); + + final DirectoryTaxonomyWriter tw1 = + new DirectoryTaxonomyWriter(dir1, IndexWriterConfig.OpenMode.CREATE); + tw1.addCategory(new FacetLabel("a")); + tw1.commit(); // commit1 + + final Path taxoPath2 = createTempDir(String.valueOf(Instant.now())); + final Directory commit1 = newFSDirectory(taxoPath2); + // copy all index files from dir1 + for (String file : dir1.listAll()) { + if (isExtra(file) == false) { + // the test framework creates these devious extra files just to chaos test the edge cases + commit1.copyFrom(dir1, file, file, IOContext.READ); + } + } + + tw1.addCategory(new FacetLabel("b")); + tw1.commit(); // commit2 + tw1.close(); + + final DirectoryReader dr1 = DirectoryReader.open(dir1); + final DirectoryTaxonomyReader dtr1 = dtrProducer.apply(dir1); + final SearcherTaxonomyManager mgr = new SearcherTaxonomyManager(dr1, dtr1, null); + + final FacetsConfig config = new FacetsConfig(); + SearcherTaxonomyManager.SearcherAndTaxonomy pair = mgr.acquire(); + final FacetsCollector sfc = new FacetsCollector(); + /** + * the call flow here initializes {@link DirectoryTaxonomyReader#taxoArrays}. These reused + * `taxoArrays` form the basis of the inconsistency * + */ + getTaxonomyFacetCounts(pair.taxonomyReader, config, sfc); + + // now try to go back to checkpoint 1 and refresh the SearcherTaxonomyManager + + // delete all files from commit2 + for (String file : dir1.listAll()) { + dir1.deleteFile(file); + } + + while (dir1.getPendingDeletions().isEmpty() == false) { + // make the test more robust to the OS taking more time to actually delete files + sleep(5); + } + + // copy all index files from commit1 + for (String file : commit1.listAll()) { + if (isExtra(file) == false) { + dir1.copyFrom(commit1, file, file, IOContext.READ); + } + } + + if (exceptionType != null) { + expectThrows(exceptionType, mgr::maybeRefresh); + } else { + mgr.maybeRefresh(); + pair = mgr.acquire(); + assertEquals(new FacetLabel("a"), pair.taxonomyReader.getPath(1)); + assertEquals(-1, pair.taxonomyReader.getOrdinal(new FacetLabel("b"))); + } + + mgr.release(pair); + IOUtils.close(mgr, dtr1, dr1); + // closing commit1 and dir1 throws exceptions because of checksum mismatches + IOUtils.deleteFiles(commit1, List.of(commit1.listAll())); + IOUtils.deleteFiles(dir1, List.of(dir1.listAll())); + IOUtils.close(commit1, dir1); + } + + public void testAlwaysRefreshDirectoryTaxonomyReader() throws IOException { + testAlwaysRefreshDirectoryTaxonomyReader( + (dir) -> { + try { + return new DirectoryTaxonomyReader(dir); + } catch (IOException e) { + e.printStackTrace(); + } + return null; + }, + ArrayIndexOutOfBoundsException.class); + testAlwaysRefreshDirectoryTaxonomyReader( + (dir) -> { + try { + return new AlwaysRefreshDirectoryTaxonomyReader(dir); + } catch (IOException e) { + e.printStackTrace(); + } + return null; + }, + null); + } + + /** + * A modified DirectoryTaxonomyReader that always recreates a new {@link + * AlwaysRefreshDirectoryTaxonomyReader} instance when {@link + * AlwaysRefreshDirectoryTaxonomyReader#doOpenIfChanged()} is called. This enables us to easily go + * forward or backward in time by re-computing the ordinal space during each refresh. This results + * in an always O(#facet_label) taxonomy array construction time when refresh is called. + */ + private class AlwaysRefreshDirectoryTaxonomyReader extends DirectoryTaxonomyReader { + + AlwaysRefreshDirectoryTaxonomyReader(Directory directory) throws IOException { + super(directory); + } + + AlwaysRefreshDirectoryTaxonomyReader(DirectoryReader indexReader) throws IOException { + super(indexReader, null, null, null, null); + } + + @Override + protected DirectoryTaxonomyReader doOpenIfChanged() throws IOException { + boolean success = false; + + // the getInternalIndexReader() function performs the ensureOpen() check + final DirectoryReader reader = DirectoryReader.openIfChanged(super.getInternalIndexReader()); + if (reader == null) { + return null; // no changes in the directory at all, nothing to do + } + + try { + // It is important that we create an AlwaysRefreshDirectoryTaxonomyReader here and not a + // DirectoryTaxonomyReader. + // Returning a AlwaysRefreshDirectoryTaxonomyReader ensures that the recreated taxonomy + // reader also uses the overridden doOpenIfChanged + // method (that always recomputes values). + final AlwaysRefreshDirectoryTaxonomyReader newTaxonomyReader = + new AlwaysRefreshDirectoryTaxonomyReader(reader); + success = true; + return newTaxonomyReader; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(reader); + } + } + } + } +}