LUCENE-10482 Allow users to create their own DirectoryTaxonomyReaders with empty taxoArrays instead of letting the taxoEpoch decide (#762)

2022-04-15 10:45:02 -07:00 · 2022-04-15 10:45:02 -07:00 · 10ebc099c8
parent 2e941fcfed
commit 10ebc099c8
3 changed files with 214 additions and 5 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -61,11 +61,16 @@ Other
  (Tomoko Uchida)

 ======================= Lucene 9.2.0 =======================
+
 API Changes
 ---------------------

 * LUCENE-10325: Facets API extended to support getTopFacets. (Yuting Gan)

+* LUCENE-10482: Allow users to create their own DirectoryTaxonomyReaders with empty taxoArrays instead of letting the
+  taxoEpoch decide. Add a test case that demonstrates the inconsistencies caused when you reuse taxoArrays on older
+  checkpoints. (Gautam Worah)
+
 New Features
 ---------------------

--- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
@ -78,10 +78,20 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
  private volatile TaxonomyIndexArrays taxoArrays;

  /**
-   * Called only from {@link #doOpenIfChanged()}. If the taxonomy has been recreated, you should
-   * pass {@code null} as the caches and parent/children arrays.
+   * Expert: Use this method to explicitly force the {@link DirectoryTaxonomyReader} to use specific
+   * parent/children arrays and caches.
+   *
+   * <p>Called from {@link #doOpenIfChanged()}. If the taxonomy has been recreated, you should pass
+   * {@code null} as the caches and parent/children arrays.
+   *
+   * @param indexReader An indexReader that is opened in the desired Directory
+   * @param taxoWriter The {@link DirectoryTaxonomyWriter} from which to obtain newly added
+   *     categories, in real-time.
+   * @param ordinalCache a FacetLabel to Integer ordinal mapping if it already exists
+   * @param categoryCache an ordinal to FacetLabel mapping if it already exists
+   * @param taxoArrays taxonomy arrays that store the parent, siblings, children information
   */
-  DirectoryTaxonomyReader(
+  protected DirectoryTaxonomyReader(
      DirectoryReader indexReader,
      DirectoryTaxonomyWriter taxoWriter,
      LRUHashMap<FacetLabel, Integer> ordinalCache,
@ -207,7 +217,7 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab

      final DirectoryTaxonomyReader newtr;
      if (recreated) {
-        // if recreated, do not reuse anything from this instace. the information
+        // if recreated, do not reuse anything from this instance. the information
        // will be lazily computed by the new instance when needed.
        newtr = new DirectoryTaxonomyReader(r2, taxoWriter, null, null, null);
      } else {
@ -238,7 +248,7 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
   * Expert: returns the underlying {@link DirectoryReader} instance that is used by this {@link
   * TaxonomyReader}.
   */
-  DirectoryReader getInternalIndexReader() {
+  protected DirectoryReader getInternalIndexReader() {
    ensureOpen();
    return indexReader;
  }
--- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestAlwaysRefreshDirectoryTaxonomyReader.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestAlwaysRefreshDirectoryTaxonomyReader.java
@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.facet.taxonomy.directory;
+
+import static com.carrotsearch.randomizedtesting.RandomizedTest.sleep;
+import static org.apache.lucene.tests.mockfile.ExtrasFS.isExtra;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.time.Instant;
+import java.util.List;
+import java.util.function.Function;
+import org.apache.lucene.facet.FacetTestCase;
+import org.apache.lucene.facet.FacetsCollector;
+import org.apache.lucene.facet.FacetsConfig;
+import org.apache.lucene.facet.taxonomy.FacetLabel;
+import org.apache.lucene.facet.taxonomy.SearcherTaxonomyManager;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.apache.lucene.util.IOUtils;
+
+// Nefarious FS will delay/stop deletion of index files and this test specifically does that
+@LuceneTestCase.SuppressFileSystems({"WindowsFS", "VirusCheckingFS"})
+public class TestAlwaysRefreshDirectoryTaxonomyReader extends FacetTestCase {
+
+  /**
+   * Tests the behavior of the {@link AlwaysRefreshDirectoryTaxonomyReader} by testing if the
+   * associated {@link SearcherTaxonomyManager} can successfully refresh and serve queries if the
+   * underlying taxonomy index is changed to an older checkpoint. Ideally, each checkpoint should be
+   * self-sufficient and should allow serving search queries when {@link
+   * SearcherTaxonomyManager#maybeRefresh()} is called.
+   *
+   * <p>It does not check whether the private taxoArrays were actually recreated or no. We are
+   * (correctly) hiding away that complexity away from the user.
+   */
+  private <T extends Throwable> void testAlwaysRefreshDirectoryTaxonomyReader(
+      Function<Directory, DirectoryTaxonomyReader> dtrProducer, Class<T> exceptionType)
+      throws IOException {
+    final Path taxoPath1 = createTempDir(String.valueOf(Instant.now()));
+    final Directory dir1 = newFSDirectory(taxoPath1);
+
+    final DirectoryTaxonomyWriter tw1 =
+        new DirectoryTaxonomyWriter(dir1, IndexWriterConfig.OpenMode.CREATE);
+    tw1.addCategory(new FacetLabel("a"));
+    tw1.commit(); // commit1
+
+    final Path taxoPath2 = createTempDir(String.valueOf(Instant.now()));
+    final Directory commit1 = newFSDirectory(taxoPath2);
+    // copy all index files from dir1
+    for (String file : dir1.listAll()) {
+      if (isExtra(file) == false) {
+        // the test framework creates these devious extra files just to chaos test the edge cases
+        commit1.copyFrom(dir1, file, file, IOContext.READ);
+      }
+    }
+
+    tw1.addCategory(new FacetLabel("b"));
+    tw1.commit(); // commit2
+    tw1.close();
+
+    final DirectoryReader dr1 = DirectoryReader.open(dir1);
+    final DirectoryTaxonomyReader dtr1 = dtrProducer.apply(dir1);
+    final SearcherTaxonomyManager mgr = new SearcherTaxonomyManager(dr1, dtr1, null);
+
+    final FacetsConfig config = new FacetsConfig();
+    SearcherTaxonomyManager.SearcherAndTaxonomy pair = mgr.acquire();
+    final FacetsCollector sfc = new FacetsCollector();
+    /**
+     * the call flow here initializes {@link DirectoryTaxonomyReader#taxoArrays}. These reused
+     * `taxoArrays` form the basis of the inconsistency *
+     */
+    getTaxonomyFacetCounts(pair.taxonomyReader, config, sfc);
+
+    // now try to go back to checkpoint 1 and refresh the SearcherTaxonomyManager
+
+    // delete all files from commit2
+    for (String file : dir1.listAll()) {
+      dir1.deleteFile(file);
+    }
+
+    while (dir1.getPendingDeletions().isEmpty() == false) {
+      // make the test more robust to the OS taking more time to actually delete files
+      sleep(5);
+    }
+
+    // copy all index files from commit1
+    for (String file : commit1.listAll()) {
+      if (isExtra(file) == false) {
+        dir1.copyFrom(commit1, file, file, IOContext.READ);
+      }
+    }
+
+    if (exceptionType != null) {
+      expectThrows(exceptionType, mgr::maybeRefresh);
+    } else {
+      mgr.maybeRefresh();
+      pair = mgr.acquire();
+      assertEquals(new FacetLabel("a"), pair.taxonomyReader.getPath(1));
+      assertEquals(-1, pair.taxonomyReader.getOrdinal(new FacetLabel("b")));
+    }
+
+    mgr.release(pair);
+    IOUtils.close(mgr, dtr1, dr1);
+    // closing commit1 and dir1 throws exceptions because of checksum mismatches
+    IOUtils.deleteFiles(commit1, List.of(commit1.listAll()));
+    IOUtils.deleteFiles(dir1, List.of(dir1.listAll()));
+    IOUtils.close(commit1, dir1);
+  }
+
+  public void testAlwaysRefreshDirectoryTaxonomyReader() throws IOException {
+    testAlwaysRefreshDirectoryTaxonomyReader(
+        (dir) -> {
+          try {
+            return new DirectoryTaxonomyReader(dir);
+          } catch (IOException e) {
+            e.printStackTrace();
+          }
+          return null;
+        },
+        ArrayIndexOutOfBoundsException.class);
+    testAlwaysRefreshDirectoryTaxonomyReader(
+        (dir) -> {
+          try {
+            return new AlwaysRefreshDirectoryTaxonomyReader(dir);
+          } catch (IOException e) {
+            e.printStackTrace();
+          }
+          return null;
+        },
+        null);
+  }
+
+  /**
+   * A modified DirectoryTaxonomyReader that always recreates a new {@link
+   * AlwaysRefreshDirectoryTaxonomyReader} instance when {@link
+   * AlwaysRefreshDirectoryTaxonomyReader#doOpenIfChanged()} is called. This enables us to easily go
+   * forward or backward in time by re-computing the ordinal space during each refresh. This results
+   * in an always O(#facet_label) taxonomy array construction time when refresh is called.
+   */
+  private class AlwaysRefreshDirectoryTaxonomyReader extends DirectoryTaxonomyReader {
+
+    AlwaysRefreshDirectoryTaxonomyReader(Directory directory) throws IOException {
+      super(directory);
+    }
+
+    AlwaysRefreshDirectoryTaxonomyReader(DirectoryReader indexReader) throws IOException {
+      super(indexReader, null, null, null, null);
+    }
+
+    @Override
+    protected DirectoryTaxonomyReader doOpenIfChanged() throws IOException {
+      boolean success = false;
+
+      // the getInternalIndexReader() function performs the ensureOpen() check
+      final DirectoryReader reader = DirectoryReader.openIfChanged(super.getInternalIndexReader());
+      if (reader == null) {
+        return null; // no changes in the directory at all, nothing to do
+      }
+
+      try {
+        // It is important that we create an AlwaysRefreshDirectoryTaxonomyReader here and not a
+        // DirectoryTaxonomyReader.
+        // Returning a AlwaysRefreshDirectoryTaxonomyReader ensures that the recreated taxonomy
+        // reader also uses the overridden doOpenIfChanged
+        // method (that always recomputes values).
+        final AlwaysRefreshDirectoryTaxonomyReader newTaxonomyReader =
+            new AlwaysRefreshDirectoryTaxonomyReader(reader);
+        success = true;
+        return newTaxonomyReader;
+      } finally {
+        if (!success) {
+          IOUtils.closeWhileHandlingException(reader);
+        }
+      }
+    }
+  }
+}