mirror of https://github.com/apache/lucene.git
LUCENE-10482 Allow users to create their own DirectoryTaxonomyReaders with empty taxoArrays instead of letting the taxoEpoch decide (#762)
This commit is contained in:
parent
2e941fcfed
commit
10ebc099c8
|
@ -61,11 +61,16 @@ Other
|
|||
(Tomoko Uchida)
|
||||
|
||||
======================= Lucene 9.2.0 =======================
|
||||
|
||||
API Changes
|
||||
---------------------
|
||||
|
||||
* LUCENE-10325: Facets API extended to support getTopFacets. (Yuting Gan)
|
||||
|
||||
* LUCENE-10482: Allow users to create their own DirectoryTaxonomyReaders with empty taxoArrays instead of letting the
|
||||
taxoEpoch decide. Add a test case that demonstrates the inconsistencies caused when you reuse taxoArrays on older
|
||||
checkpoints. (Gautam Worah)
|
||||
|
||||
New Features
|
||||
---------------------
|
||||
|
||||
|
|
|
@ -78,10 +78,20 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
|
|||
private volatile TaxonomyIndexArrays taxoArrays;
|
||||
|
||||
/**
|
||||
* Called only from {@link #doOpenIfChanged()}. If the taxonomy has been recreated, you should
|
||||
* pass {@code null} as the caches and parent/children arrays.
|
||||
* Expert: Use this method to explicitly force the {@link DirectoryTaxonomyReader} to use specific
|
||||
* parent/children arrays and caches.
|
||||
*
|
||||
* <p>Called from {@link #doOpenIfChanged()}. If the taxonomy has been recreated, you should pass
|
||||
* {@code null} as the caches and parent/children arrays.
|
||||
*
|
||||
* @param indexReader An indexReader that is opened in the desired Directory
|
||||
* @param taxoWriter The {@link DirectoryTaxonomyWriter} from which to obtain newly added
|
||||
* categories, in real-time.
|
||||
* @param ordinalCache a FacetLabel to Integer ordinal mapping if it already exists
|
||||
* @param categoryCache an ordinal to FacetLabel mapping if it already exists
|
||||
* @param taxoArrays taxonomy arrays that store the parent, siblings, children information
|
||||
*/
|
||||
DirectoryTaxonomyReader(
|
||||
protected DirectoryTaxonomyReader(
|
||||
DirectoryReader indexReader,
|
||||
DirectoryTaxonomyWriter taxoWriter,
|
||||
LRUHashMap<FacetLabel, Integer> ordinalCache,
|
||||
|
@ -207,7 +217,7 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
|
|||
|
||||
final DirectoryTaxonomyReader newtr;
|
||||
if (recreated) {
|
||||
// if recreated, do not reuse anything from this instace. the information
|
||||
// if recreated, do not reuse anything from this instance. the information
|
||||
// will be lazily computed by the new instance when needed.
|
||||
newtr = new DirectoryTaxonomyReader(r2, taxoWriter, null, null, null);
|
||||
} else {
|
||||
|
@ -238,7 +248,7 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
|
|||
* Expert: returns the underlying {@link DirectoryReader} instance that is used by this {@link
|
||||
* TaxonomyReader}.
|
||||
*/
|
||||
DirectoryReader getInternalIndexReader() {
|
||||
protected DirectoryReader getInternalIndexReader() {
|
||||
ensureOpen();
|
||||
return indexReader;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,194 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.facet.taxonomy.directory;
|
||||
|
||||
import static com.carrotsearch.randomizedtesting.RandomizedTest.sleep;
|
||||
import static org.apache.lucene.tests.mockfile.ExtrasFS.isExtra;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.time.Instant;
|
||||
import java.util.List;
|
||||
import java.util.function.Function;
|
||||
import org.apache.lucene.facet.FacetTestCase;
|
||||
import org.apache.lucene.facet.FacetsCollector;
|
||||
import org.apache.lucene.facet.FacetsConfig;
|
||||
import org.apache.lucene.facet.taxonomy.FacetLabel;
|
||||
import org.apache.lucene.facet.taxonomy.SearcherTaxonomyManager;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.IOContext;
|
||||
import org.apache.lucene.tests.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
|
||||
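// These mock file systems delay or block the deletion of index files, and this test deliberately
// deletes index files to roll the taxonomy back to an older commit, so they are suppressed here.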
|
||||
@LuceneTestCase.SuppressFileSystems({"WindowsFS", "VirusCheckingFS"})
|
||||
public class TestAlwaysRefreshDirectoryTaxonomyReader extends FacetTestCase {
|
||||
|
||||
/**
|
||||
* Tests the behavior of the {@link AlwaysRefreshDirectoryTaxonomyReader} by testing if the
|
||||
* associated {@link SearcherTaxonomyManager} can successfully refresh and serve queries if the
|
||||
* underlying taxonomy index is changed to an older checkpoint. Ideally, each checkpoint should be
|
||||
* self-sufficient and should allow serving search queries when {@link
|
||||
* SearcherTaxonomyManager#maybeRefresh()} is called.
|
||||
*
|
||||
* <p>It does not check whether the private taxoArrays were actually recreated or no. We are
|
||||
* (correctly) hiding away that complexity away from the user.
|
||||
*/
|
||||
private <T extends Throwable> void testAlwaysRefreshDirectoryTaxonomyReader(
|
||||
Function<Directory, DirectoryTaxonomyReader> dtrProducer, Class<T> exceptionType)
|
||||
throws IOException {
|
||||
final Path taxoPath1 = createTempDir(String.valueOf(Instant.now()));
|
||||
final Directory dir1 = newFSDirectory(taxoPath1);
|
||||
|
||||
final DirectoryTaxonomyWriter tw1 =
|
||||
new DirectoryTaxonomyWriter(dir1, IndexWriterConfig.OpenMode.CREATE);
|
||||
tw1.addCategory(new FacetLabel("a"));
|
||||
tw1.commit(); // commit1
|
||||
|
||||
final Path taxoPath2 = createTempDir(String.valueOf(Instant.now()));
|
||||
final Directory commit1 = newFSDirectory(taxoPath2);
|
||||
// copy all index files from dir1
|
||||
for (String file : dir1.listAll()) {
|
||||
if (isExtra(file) == false) {
|
||||
// the test framework creates these devious extra files just to chaos test the edge cases
|
||||
commit1.copyFrom(dir1, file, file, IOContext.READ);
|
||||
}
|
||||
}
|
||||
|
||||
tw1.addCategory(new FacetLabel("b"));
|
||||
tw1.commit(); // commit2
|
||||
tw1.close();
|
||||
|
||||
final DirectoryReader dr1 = DirectoryReader.open(dir1);
|
||||
final DirectoryTaxonomyReader dtr1 = dtrProducer.apply(dir1);
|
||||
final SearcherTaxonomyManager mgr = new SearcherTaxonomyManager(dr1, dtr1, null);
|
||||
|
||||
final FacetsConfig config = new FacetsConfig();
|
||||
SearcherTaxonomyManager.SearcherAndTaxonomy pair = mgr.acquire();
|
||||
final FacetsCollector sfc = new FacetsCollector();
|
||||
/**
|
||||
* the call flow here initializes {@link DirectoryTaxonomyReader#taxoArrays}. These reused
|
||||
* `taxoArrays` form the basis of the inconsistency *
|
||||
*/
|
||||
getTaxonomyFacetCounts(pair.taxonomyReader, config, sfc);
|
||||
|
||||
// now try to go back to checkpoint 1 and refresh the SearcherTaxonomyManager
|
||||
|
||||
// delete all files from commit2
|
||||
for (String file : dir1.listAll()) {
|
||||
dir1.deleteFile(file);
|
||||
}
|
||||
|
||||
while (dir1.getPendingDeletions().isEmpty() == false) {
|
||||
// make the test more robust to the OS taking more time to actually delete files
|
||||
sleep(5);
|
||||
}
|
||||
|
||||
// copy all index files from commit1
|
||||
for (String file : commit1.listAll()) {
|
||||
if (isExtra(file) == false) {
|
||||
dir1.copyFrom(commit1, file, file, IOContext.READ);
|
||||
}
|
||||
}
|
||||
|
||||
if (exceptionType != null) {
|
||||
expectThrows(exceptionType, mgr::maybeRefresh);
|
||||
} else {
|
||||
mgr.maybeRefresh();
|
||||
pair = mgr.acquire();
|
||||
assertEquals(new FacetLabel("a"), pair.taxonomyReader.getPath(1));
|
||||
assertEquals(-1, pair.taxonomyReader.getOrdinal(new FacetLabel("b")));
|
||||
}
|
||||
|
||||
mgr.release(pair);
|
||||
IOUtils.close(mgr, dtr1, dr1);
|
||||
// closing commit1 and dir1 throws exceptions because of checksum mismatches
|
||||
IOUtils.deleteFiles(commit1, List.of(commit1.listAll()));
|
||||
IOUtils.deleteFiles(dir1, List.of(dir1.listAll()));
|
||||
IOUtils.close(commit1, dir1);
|
||||
}
|
||||
|
||||
public void testAlwaysRefreshDirectoryTaxonomyReader() throws IOException {
|
||||
testAlwaysRefreshDirectoryTaxonomyReader(
|
||||
(dir) -> {
|
||||
try {
|
||||
return new DirectoryTaxonomyReader(dir);
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return null;
|
||||
},
|
||||
ArrayIndexOutOfBoundsException.class);
|
||||
testAlwaysRefreshDirectoryTaxonomyReader(
|
||||
(dir) -> {
|
||||
try {
|
||||
return new AlwaysRefreshDirectoryTaxonomyReader(dir);
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return null;
|
||||
},
|
||||
null);
|
||||
}
|
||||
|
||||
/**
|
||||
* A modified DirectoryTaxonomyReader that always recreates a new {@link
|
||||
* AlwaysRefreshDirectoryTaxonomyReader} instance when {@link
|
||||
* AlwaysRefreshDirectoryTaxonomyReader#doOpenIfChanged()} is called. This enables us to easily go
|
||||
* forward or backward in time by re-computing the ordinal space during each refresh. This results
|
||||
* in an always O(#facet_label) taxonomy array construction time when refresh is called.
|
||||
*/
|
||||
private class AlwaysRefreshDirectoryTaxonomyReader extends DirectoryTaxonomyReader {
|
||||
|
||||
AlwaysRefreshDirectoryTaxonomyReader(Directory directory) throws IOException {
|
||||
super(directory);
|
||||
}
|
||||
|
||||
AlwaysRefreshDirectoryTaxonomyReader(DirectoryReader indexReader) throws IOException {
|
||||
super(indexReader, null, null, null, null);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected DirectoryTaxonomyReader doOpenIfChanged() throws IOException {
|
||||
boolean success = false;
|
||||
|
||||
// the getInternalIndexReader() function performs the ensureOpen() check
|
||||
final DirectoryReader reader = DirectoryReader.openIfChanged(super.getInternalIndexReader());
|
||||
if (reader == null) {
|
||||
return null; // no changes in the directory at all, nothing to do
|
||||
}
|
||||
|
||||
try {
|
||||
// It is important that we create an AlwaysRefreshDirectoryTaxonomyReader here and not a
|
||||
// DirectoryTaxonomyReader.
|
||||
// Returning a AlwaysRefreshDirectoryTaxonomyReader ensures that the recreated taxonomy
|
||||
// reader also uses the overridden doOpenIfChanged
|
||||
// method (that always recomputes values).
|
||||
final AlwaysRefreshDirectoryTaxonomyReader newTaxonomyReader =
|
||||
new AlwaysRefreshDirectoryTaxonomyReader(reader);
|
||||
success = true;
|
||||
return newTaxonomyReader;
|
||||
} finally {
|
||||
if (!success) {
|
||||
IOUtils.closeWhileHandlingException(reader);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue