Minor refactoring and cleanup to taxonomy index code (#11775)

This commit is contained in:
Shai Erera 2022-09-21 13:08:33 +03:00 committed by GitHub
parent add309bb40
commit bcc116057d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 90 additions and 108 deletions

View File

@ -32,7 +32,7 @@ public class FacetLabel implements Comparable<FacetLabel> {
/*
* copied from DocumentWriterPerThread -- if a FacetLabel is resolved to a
* drill-down term which is encoded to a larger term than that length, it is
* silently dropped! Therefore we limit the number of characters to MAX/4 to
* silently dropped! Therefore, we limit the number of characters to MAX/4 to
* be on the safe side.
*/
/** The maximum number of characters a {@link FacetLabel} can have. */
@ -48,10 +48,10 @@ public class FacetLabel implements Comparable<FacetLabel> {
/** The number of components of this {@link FacetLabel}. */
public final int length;
// Used by subpath
// Used by sub-path
private FacetLabel(final FacetLabel copyFrom, final int prefixLen) {
// while the code which calls this method is safe, at some point a test
// tripped on AIOOBE in toString, but we failed to reproduce. adding the
// While the code which calls this method is safe, at some point a test
// tripped on AIOOBE in toString, but we failed to reproduce. Adding
// assert as a safety check.
assert prefixLen >= 0 && prefixLen <= copyFrom.components.length
: "prefixLen cannot be negative nor larger than the given components' length: prefixLen="
@ -103,7 +103,7 @@ public class FacetLabel implements Comparable<FacetLabel> {
/** Compares this path with another {@link FacetLabel} for lexicographic order. */
@Override
public int compareTo(FacetLabel other) {
final int len = length < other.length ? length : other.length;
final int len = Math.min(length, other.length);
for (int i = 0, j = 0; i < len; i++, j++) {
int cmp = components[i].compareTo(other.components[j]);
if (cmp < 0) {
@ -120,7 +120,7 @@ public class FacetLabel implements Comparable<FacetLabel> {
@Override
public boolean equals(Object obj) {
if (!(obj instanceof FacetLabel)) {
if (obj instanceof FacetLabel == false) {
return false;
}

View File

@ -62,10 +62,10 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
private static final int DEFAULT_CACHE_VALUE = 4000;
// NOTE: very coarse estimate!
private static final int BYTES_PER_CACHE_ENTRY =
4 * RamUsageEstimator.NUM_BYTES_OBJECT_REF
+ 4 * RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
+ 8 * Character.BYTES;
private static final long BYTES_PER_CACHE_ENTRY =
4L * RamUsageEstimator.NUM_BYTES_OBJECT_REF
+ 4L * RamUsageEstimator.NUM_BYTES_OBJECT_HEADER
+ 8L * Character.BYTES;
private final DirectoryTaxonomyWriter taxoWriter;
private final long taxoEpoch; // used in doOpenIfChanged
@ -91,7 +91,7 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
* @param categoryCache an ordinal to FacetLabel mapping if it already exists
* @param taxoArrays taxonomy arrays that store the parent, siblings, children information
*/
protected DirectoryTaxonomyReader(
DirectoryTaxonomyReader(
DirectoryReader indexReader,
DirectoryTaxonomyWriter taxoWriter,
LRUHashMap<FacetLabel, Integer> ordinalCache,
@ -103,14 +103,9 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
this.taxoEpoch = taxoWriter == null ? -1 : taxoWriter.getTaxonomyEpoch();
// use the same instance of the cache, note the protective code in getOrdinal and getPath
this.ordinalCache =
ordinalCache == null
? new LRUHashMap<FacetLabel, Integer>(DEFAULT_CACHE_VALUE)
: ordinalCache;
this.ordinalCache = ordinalCache == null ? new LRUHashMap<>(DEFAULT_CACHE_VALUE) : ordinalCache;
this.categoryCache =
categoryCache == null
? new LRUHashMap<Integer, FacetLabel>(DEFAULT_CACHE_VALUE)
: categoryCache;
categoryCache == null ? new LRUHashMap<>(DEFAULT_CACHE_VALUE) : categoryCache;
this.taxoArrays = taxoArrays != null ? new TaxonomyIndexArrays(indexReader, taxoArrays) : null;
}
@ -151,16 +146,6 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
categoryCache = new LRUHashMap<>(DEFAULT_CACHE_VALUE);
}
private synchronized void initTaxoArrays() throws IOException {
if (taxoArrays == null) {
// according to Java Concurrency in Practice, this might perform better on
// some JVMs, because the array initialization doesn't happen on the
// volatile member.
TaxonomyIndexArrays tmpArrays = new TaxonomyIndexArrays(indexReader);
taxoArrays = tmpArrays;
}
}
@Override
protected void doClose() throws IOException {
indexReader.close();
@ -215,18 +200,18 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
}
}
final DirectoryTaxonomyReader newtr;
final DirectoryTaxonomyReader newTaxoReader;
if (recreated) {
// if recreated, do not reuse anything from this instance. the information
// will be lazily computed by the new instance when needed.
newtr = new DirectoryTaxonomyReader(r2, taxoWriter, null, null, null);
newTaxoReader = new DirectoryTaxonomyReader(r2, taxoWriter, null, null, null);
} else {
newtr =
newTaxoReader =
new DirectoryTaxonomyReader(r2, taxoWriter, ordinalCache, categoryCache, taxoArrays);
}
success = true;
return newtr;
return newTaxoReader;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(r2);
@ -256,10 +241,18 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
@Override
public ParallelTaxonomyArrays getParallelTaxonomyArrays() throws IOException {
ensureOpen();
if (taxoArrays == null) {
initTaxoArrays();
// By copying to a local variable we only perform a volatile read once (if it's not null)
TaxonomyIndexArrays arrays = taxoArrays;
if (arrays == null) {
synchronized (this) {
arrays = taxoArrays;
if (arrays == null) {
arrays = new TaxonomyIndexArrays(indexReader);
taxoArrays = arrays;
}
}
}
return taxoArrays;
return arrays;
}
@Override
@ -279,14 +272,14 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
synchronized (ordinalCache) {
Integer res = ordinalCache.get(cp);
if (res != null) {
if (res.intValue() < indexReader.maxDoc()) {
if (res < indexReader.maxDoc()) {
// Since the cache is shared with DTR instances allocated from
// doOpenIfChanged, we need to ensure that the ordinal is one that
// this DTR instance recognizes.
return res.intValue();
return res;
} else {
// if we get here, it means that the category was found in the cache,
// but is not recognized by this TR instance. Therefore there's no
// but is not recognized by this TR instance. Therefore, there's no
// need to continue search for the path on disk, because we won't find
// it there too.
return TaxonomyReader.INVALID_ORDINAL;
@ -306,13 +299,13 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
ret = docs.docID();
// we only store the fact that a category exists, not its inexistence.
// We only store the fact that a category exists, not otherwise.
// This is required because the caches are shared with new DTR instances
// that are allocated from doOpenIfChanged. Therefore, if we only store
// information about found categories, we cannot accidently tell a new
// information about found categories, we cannot accidentally tell a new
// generation of DTR that a category does not exist.
synchronized (ordinalCache) {
ordinalCache.put(cp, Integer.valueOf(ret));
ordinalCache.put(cp, ret);
}
}
@ -325,7 +318,7 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
// Since the cache is shared with DTR instances allocated from
// doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
// instance recognizes. Therefore we do this check up front, before we hit
// instance recognizes. Therefore, we do this check up front, before we hit
// the cache.
checkOrdinalBounds(ordinal);
@ -398,8 +391,7 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
* was created using StoredFields (with no performance gains) and uses DocValues based iteration
* when the index is based on BinaryDocValues. Lucene switched to BinaryDocValues in version 9.0
*
* @param ordinals Array of ordinals that are assigned to categories inserted into the taxonomy
* index
* @param ordinals Array of category ordinals that were added to the taxonomy index
*/
@Override
public FacetLabel[] getBulkPath(int... ordinals) throws IOException {
@ -464,7 +456,7 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
return super.getBulkPath(ordinals);
}
}
// values is leaf specific so you only advance till you reach the target within the leaf
// values is leaf specific, so you only advance till you reach the target within the leaf
boolean success = values.advanceExact(ordinals[i] - leafReaderDocBase);
assert success;
bulkPath[originalPosition[i]] =
@ -558,8 +550,8 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
public String toString(int max) {
ensureOpen();
StringBuilder sb = new StringBuilder();
int upperl = Math.min(max, indexReader.maxDoc());
for (int i = 0; i < upperl; i++) {
int limit = Math.min(max, indexReader.maxDoc());
for (int i = 0; i < limit; i++) {
try {
FacetLabel category = this.getPath(i);
if (category == null) {
@ -570,7 +562,7 @@ public class DirectoryTaxonomyReader extends TaxonomyReader implements Accountab
sb.append(i).append(": EMPTY STRING!! \n");
continue;
}
sb.append(i).append(": ").append(category.toString()).append("\n");
sb.append(i).append(": ").append(category).append("\n");
} catch (
@SuppressWarnings("unused")
IOException e) {

View File

@ -89,11 +89,12 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
private final IndexWriter indexWriter;
private final TaxonomyWriterCache cache;
private final AtomicInteger cacheMisses = new AtomicInteger(0);
private final AtomicInteger nextID = new AtomicInteger(0);
private final Field fullPathField;
// Records the taxonomy index epoch, updated on replaceTaxonomy as well.
private long indexEpoch;
private Field fullPathField;
private int cacheMissesUntilFill = 11;
private boolean shouldFillCache = true;
@ -108,14 +109,13 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
* When the cache is <B>not</B> complete, and we can't find a category in the cache, we still need
* to look for it in the on-disk index; Therefore when the cache is not complete, we need to open
* a "reader" to the taxonomy index. The cache becomes incomplete if it was never filled with the
* existing categories, or if a put() to the cache ever returned true (meaning that some of the
* cached data was cleared).
* existing categories, or if a put() to the cache ever returned true (meaning that some cached
* data was cleared).
*/
private volatile boolean cacheIsComplete;
private volatile boolean isClosed = false;
private volatile TaxonomyIndexArrays taxoArrays;
private volatile int nextID;
/**
* Construct a Taxonomy writer.
@ -169,14 +169,14 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
fullPathField = new StringField(Consts.FULL, "", Field.Store.NO);
nextID = indexWriter.getDocStats().maxDoc;
nextID.set(indexWriter.getDocStats().maxDoc);
if (cache == null) {
cache = defaultTaxonomyWriterCache();
}
this.cache = cache;
if (nextID == 0) {
if (nextID.get() == 0) {
cacheIsComplete = true;
// Make sure that the taxonomy always contain the root category
// with category id 0.
@ -424,9 +424,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
} else {
parent = TaxonomyReader.INVALID_ORDINAL;
}
int id = addCategoryDocument(cp, parent);
return id;
return addCategoryDocument(cp, parent);
}
/**
@ -439,7 +437,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
}
/**
* Note that the methods calling addCategoryDocument() are synchornized, so this method is
* Note that the methods calling addCategoryDocument() are synchronized, so this method is
* effectively synchronized as well.
*/
private int addCategoryDocument(FacetLabel categoryPath, int parent) throws IOException {
@ -455,11 +453,8 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
d.add(fullPathField);
// Note that we do no pass an Analyzer here because the fields that are
// added to the Document are untokenized or contains their own TokenStream.
// Therefore the IndexWriter's Analyzer has no effect.
indexWriter.addDocument(d);
int id = nextID++;
int id = nextID.getAndIncrement();
// added a category document, mark that ReaderManager is not up-to-date
shouldRefreshReaderManager = true;
@ -491,7 +486,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
// addCategoryDocument -- when this method returns, we must know that the
// reader manager's state is current. also, it sets shouldRefresh to false,
// and this cannot overlap with addCatDoc too.
// NOTE: since this method is sync'ed, it can call maybeRefresh, instead of
// NOTE: since this method is synced, it can call maybeRefresh, instead of
// maybeRefreshBlocking. If ever this is changed, make sure to change the
// call too.
if (shouldRefreshReaderManager && initializedReaderManager) {
@ -567,7 +562,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
@Override
public int getSize() {
ensureOpen();
return nextID;
return nextID.get();
}
/**
@ -656,33 +651,33 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
}
private TaxonomyIndexArrays getTaxoArrays() throws IOException {
if (taxoArrays == null) {
// By copying to a local variable we only perform a volatile read once (if it's not null)
TaxonomyIndexArrays arrays = taxoArrays;
if (arrays == null) {
synchronized (this) {
if (taxoArrays == null) {
arrays = taxoArrays;
if (arrays == null) {
initReaderManager();
DirectoryReader reader = readerManager.acquire();
try {
// according to Java Concurrency, this might perform better on some
// JVMs, since the object initialization doesn't happen on the
// volatile member.
TaxonomyIndexArrays tmpArrays = new TaxonomyIndexArrays(reader);
taxoArrays = tmpArrays;
arrays = new TaxonomyIndexArrays(reader);
} finally {
readerManager.release(reader);
}
taxoArrays = arrays;
}
}
}
return taxoArrays;
return arrays;
}
@Override
public int getParent(int ordinal) throws IOException {
ensureOpen();
// Note: the following if() just enforces that a user can never ask
// for the parent of a nonexistant category - even if the parent array
// for the parent of a nonexistent category - even if the parent array
// was allocated bigger than it really needs to be.
Objects.checkIndex(ordinal, nextID);
Objects.checkIndex(ordinal, nextID.get());
int[] parents = getTaxoArrays().parents();
assert ordinal < parents.length
@ -697,11 +692,9 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
*/
public void addTaxonomy(Directory taxoDir, OrdinalMap map) throws IOException {
ensureOpen();
DirectoryReader r = DirectoryReader.open(taxoDir);
try {
try (DirectoryReader r = DirectoryReader.open(taxoDir)) {
final int size = r.numDocs();
final OrdinalMap ordinalMap = map;
ordinalMap.setSize(size);
map.setSize(size);
int base = 0;
PostingsEnum docs = null;
for (final LeafReaderContext ctx : r.leaves()) {
@ -713,13 +706,11 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
FacetLabel cp = new FacetLabel(FacetsConfig.stringToPath(te.term().utf8ToString()));
final int ordinal = addCategory(cp);
docs = te.postings(docs, PostingsEnum.NONE);
ordinalMap.addMapping(docs.nextDoc() + base, ordinal);
map.addMapping(docs.nextDoc() + base, ordinal);
}
base += ar.maxDoc(); // no deletions, so we're ok
}
ordinalMap.addDone();
} finally {
r.close();
map.addDone();
}
}
@ -736,22 +727,22 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
* the same time, it is recommended to put the first taxonomy's map in memory, and all the rest on
* disk (later to be automatically read into memory one by one, when needed).
*/
public static interface OrdinalMap {
public interface OrdinalMap {
/**
* Set the size of the map. This MUST be called before addMapping(). It is assumed (but not
* verified) that addMapping() will then be called exactly 'size' times, with different
* origOrdinals between 0 and size-1.
*/
public void setSize(int size) throws IOException;
void setSize(int size) throws IOException;
/** Record a mapping. */
public void addMapping(int origOrdinal, int newOrdinal) throws IOException;
void addMapping(int origOrdinal, int newOrdinal) throws IOException;
/**
* Call addDone() to say that all addMapping() have been done. In some implementations this
* might free some resources.
*/
public void addDone() throws IOException;
void addDone() throws IOException;
/**
* Return the map from the taxonomy's original (consecutive) ordinals to the new taxonomy's
@ -760,7 +751,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
* needed. Calling it will also free all resources that the map might be holding (such as
* temporary disk space), other than the returned int[].
*/
public int[] getMap() throws IOException;
int[] getMap() throws IOException;
}
/** {@link OrdinalMap} maintained in memory */
@ -793,13 +784,15 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
/** {@link OrdinalMap} maintained on file system */
public static final class DiskOrdinalMap implements OrdinalMap {
Path tmpfile;
DataOutputStream out;
private final Path mapFile;
private int[] map = null;
private DataOutputStream out;
/** Sole constructor. */
public DiskOrdinalMap(Path tmpfile) throws IOException {
this.tmpfile = tmpfile;
out = new DataOutputStream(new BufferedOutputStream(Files.newOutputStream(tmpfile)));
public DiskOrdinalMap(Path mapFile) throws IOException {
this.mapFile = mapFile;
out = new DataOutputStream(new BufferedOutputStream(Files.newOutputStream(mapFile)));
}
@Override
@ -821,8 +814,6 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
}
}
int[] map = null;
@Override
public int[] getMap() throws IOException {
if (map != null) {
@ -830,20 +821,20 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
}
addDone(); // in case this wasn't previously called
try (DataInputStream in =
new DataInputStream(new BufferedInputStream(Files.newInputStream(tmpfile)))) {
new DataInputStream(new BufferedInputStream(Files.newInputStream(mapFile)))) {
map = new int[in.readInt()];
// NOTE: The current code assumes here that the map is complete,
// i.e., every ordinal gets one and exactly one value. Otherwise,
// we may run into an EOF here, or vice versa, not read everything.
// NOTE: The current code assumes that the map is complete,
// i.e. that every ordinal gets exactly one value. Otherwise,
// we may run into an EOF here, or not read everything.
for (int i = 0; i < map.length; i++) {
int origordinal = in.readInt();
int newordinal = in.readInt();
map[origordinal] = newordinal;
int origOrdinal = in.readInt();
int newOrdinal = in.readInt();
map[origOrdinal] = newOrdinal;
}
}
// Delete the temporary file, which is no longer needed.
Files.delete(tmpfile);
Files.delete(mapFile);
return map;
}
@ -863,8 +854,8 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
/**
* Replaces the current taxonomy with the given one. This method should generally be called in
* conjunction with {@link IndexWriter#addIndexes(Directory...)} to replace both the taxonomy as
* well as the search index content.
* conjunction with {@link IndexWriter#addIndexes(Directory...)} to replace both the taxonomy and
* the search index content.
*/
public synchronized void replaceTaxonomy(Directory taxoDir) throws IOException {
// replace the taxonomy by doing IW optimized operations
@ -873,7 +864,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
shouldRefreshReaderManager = true;
initReaderManager(); // ensure that it's initialized
refreshReaderManager();
nextID = indexWriter.getDocStats().maxDoc;
nextID.set(indexWriter.getDocStats().maxDoc);
taxoArrays = null; // must nullify so that it's re-computed next time it's needed
// need to clear the cache, so that addCategory won't accidentally return

View File

@ -78,7 +78,7 @@ class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable
}
}
private final synchronized void initChildrenSiblings(TaxonomyIndexArrays copyFrom) {
private synchronized void initChildrenSiblings(TaxonomyIndexArrays copyFrom) {
if (!initializedChildren) { // must do this check !
children = new int[parents.length];
siblings = new int[parents.length];
@ -141,7 +141,6 @@ class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable
throw new CorruptIndexException(
"Missing parent data for category " + (doc + leafContext.docBase), reader.toString());
}
// we're putting an int and converting it back so it should be safe
parents[doc + leafContext.docBase] = Math.toIntExact(parentValues.longValue());
}
}
@ -204,7 +203,7 @@ class TaxonomyIndexArrays extends ParallelTaxonomyArrays implements Accountable
@Override
public synchronized long ramBytesUsed() {
long ramBytesUsed =
RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + 3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF + 1;
RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + 3L * RamUsageEstimator.NUM_BYTES_OBJECT_REF + 1;
ramBytesUsed += RamUsageEstimator.shallowSizeOf(parents);
if (children != null) {
ramBytesUsed += RamUsageEstimator.shallowSizeOf(children);