Remove UTF8TaxonomyWriterCache (#12092)

Removes the never-evicting UTF8TaxonomyWriterCache, changing the default to LruTaxonomyWriterCache
This commit is contained in:
Vigya Sharma 2023-01-18 13:20:26 -08:00 committed by GitHub
parent 318b002e0b
commit dc33ade76d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 15 additions and 317 deletions

View File

@ -8,6 +8,9 @@ http://s.apache.org/luceneversions
API Changes
---------------------
* LUCENE-12092: Remove deprecated UTF8TaxonomyWriterCache. Please use LruTaxonomyWriterCache
instead. (Vigya Sharma)
* LUCENE-10010: AutomatonQuery, CompiledAutomaton, RunAutomaton, RegExp
classes no longer determinize NFAs. Instead it is the responsibility
of the caller to determinize. (Robert Muir)

View File

@ -38,7 +38,6 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.UTF8TaxonomyWriterCache;
import org.apache.lucene.index.CorruptIndexException; // javadocs
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
@ -85,6 +84,8 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
*/
public static final String INDEX_EPOCH = "index.epoch";
private static final int DEFAULT_CACHE_SIZE = 4000;
private final Directory dir;
private final IndexWriter indexWriter;
private final TaxonomyWriterCache cache;
@ -128,9 +129,8 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
* APPEND_OR_CREATE</code> appends to an existing index if there is one, otherwise it creates
* a new index.
* @param cache A {@link TaxonomyWriterCache} implementation which determines the in-memory
* caching policy. See for example {@link LruTaxonomyWriterCache} and {@link
* UTF8TaxonomyWriterCache}. If null or missing, {@link #defaultTaxonomyWriterCache()} is
* used.
* caching policy. See for example {@link LruTaxonomyWriterCache}. If null or missing, {@link
* #defaultTaxonomyWriterCache()} is used.
* @throws CorruptIndexException if the taxonomy is corrupted.
* @throws LockObtainFailedException if the taxonomy is locked by another writer.
* @throws IOException if another error occurred.
@ -267,11 +267,10 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
* Defines the default {@link TaxonomyWriterCache} to use in constructors which do not specify
* one.
*
* <p>The current default is {@link UTF8TaxonomyWriterCache}, i.e., the entire taxonomy is cached
* in memory while building it.
* <p>The current default is {@link LruTaxonomyWriterCache}
*/
public static TaxonomyWriterCache defaultTaxonomyWriterCache() {
return new UTF8TaxonomyWriterCache();
return new LruTaxonomyWriterCache(DEFAULT_CACHE_SIZE);
}
/** Create this with {@code OpenMode.CREATE_OR_APPEND}. */

View File

@ -1,174 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.taxonomy.writercache;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.UnicodeUtil;
/** A "cache" that never frees memory, and stores labels in a BytesRefHash (utf-8 encoding). */
public final class UTF8TaxonomyWriterCache implements TaxonomyWriterCache, Accountable {
private final ThreadLocal<BytesRefBuilder> bytes =
new ThreadLocal<BytesRefBuilder>() {
@Override
protected BytesRefBuilder initialValue() {
return new BytesRefBuilder();
}
};
private final Counter bytesUsed = Counter.newCounter();
private final BytesRefHash map =
new BytesRefHash(new ByteBlockPool(new DirectTrackingAllocator(bytesUsed)));
private static final int PAGE_BITS = 16;
private static final int PAGE_SIZE = 1 << PAGE_BITS;
private static final int PAGE_MASK = PAGE_SIZE - 1;
private volatile int[][] ordinals;
// How many labels we are storing:
private int count;
// How many pages in ordinals we've allocated:
private int pageCount;
/** Sole constructor. */
public UTF8TaxonomyWriterCache() {
ordinals = new int[1][];
ordinals[0] = new int[PAGE_SIZE];
}
@Override
public int get(FacetLabel label) {
BytesRef bytes = toBytes(label);
int id;
synchronized (this) {
id = map.find(bytes);
}
if (id == -1) {
return LabelToOrdinal.INVALID_ORDINAL;
}
int page = id >>> PAGE_BITS;
int offset = id & PAGE_MASK;
return ordinals[page][offset];
}
// Called only from assert
private boolean assertSameOrdinal(FacetLabel label, int id, int ord) {
id = -id - 1;
int page = id >>> PAGE_BITS;
int offset = id & PAGE_MASK;
int oldOrd = ordinals[page][offset];
if (oldOrd != ord) {
throw new IllegalArgumentException(
"label "
+ label
+ " was already cached, with old ord="
+ oldOrd
+ " versus new ord="
+ ord);
}
return true;
}
@Override
public boolean put(FacetLabel label, int ord) {
BytesRef bytes = toBytes(label);
int id;
synchronized (this) {
id = map.add(bytes);
if (id < 0) {
assert assertSameOrdinal(label, id, ord);
return false;
}
assert id == count;
int page = id >>> PAGE_BITS;
int offset = id & PAGE_MASK;
if (page == pageCount) {
if (page == ordinals.length) {
int[][] newOrdinals =
new int[ArrayUtil.oversize(page + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)][];
System.arraycopy(ordinals, 0, newOrdinals, 0, ordinals.length);
ordinals = newOrdinals;
}
ordinals[page] = new int[PAGE_SIZE];
pageCount++;
}
ordinals[page][offset] = ord;
count++;
// we never prune from the cache
return false;
}
}
@Override
public boolean isFull() {
// we are never full
return false;
}
@Override
public synchronized void clear() {
map.clear();
map.reinit();
ordinals = new int[1][];
ordinals[0] = new int[PAGE_SIZE];
count = 0;
pageCount = 0;
assert bytesUsed.get() == 0;
}
/** How many labels are currently stored in the cache. */
@Override
public int size() {
return count;
}
@Override
public synchronized long ramBytesUsed() {
return bytesUsed.get() + pageCount * (long) PAGE_SIZE * Integer.BYTES;
}
@Override
public void close() {}
private static final byte DELIM_CHAR = (byte) 0x1F;
private BytesRef toBytes(FacetLabel label) {
BytesRefBuilder bytes = this.bytes.get();
bytes.clear();
for (int i = 0; i < label.length; i++) {
String part = label.components[i];
if (i > 0) {
bytes.append(DELIM_CHAR);
}
bytes.grow(bytes.length() + UnicodeUtil.maxUTF8Length(part.length()));
bytes.setLength(
UnicodeUtil.UTF16toUTF8(part, 0, part.length(), bytes.bytes(), bytes.length()));
}
return bytes.get();
}
}

View File

@ -27,7 +27,6 @@ import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.UTF8TaxonomyWriterCache;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
@ -79,8 +78,8 @@ public class TestConcurrentFacetedIndexing extends FacetTestCase {
static TaxonomyWriterCache newTaxoWriterCache(int ndocs) {
final double d = random().nextDouble();
if (d < 0.7) {
// this is the fastest, yet most memory consuming
return new UTF8TaxonomyWriterCache();
// same as LruTaxonomyWriterCache but with the default cache size
return DirectoryTaxonomyWriter.defaultTaxonomyWriterCache();
} else if (TEST_NIGHTLY && d > 0.98) {
// this is the slowest, but tests the writer concurrency when no caching is done.
// only pick it during NIGHTLY tests, and even then, with very low chances.

View File

@ -33,7 +33,6 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.UTF8TaxonomyWriterCache;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
@ -170,7 +169,7 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
// Verifies that if rollback is called, DTW is closed.
Directory dir = newDirectory();
DirectoryTaxonomyWriter dtw = new DirectoryTaxonomyWriter(dir);
assertTrue(dtw.getCache() instanceof UTF8TaxonomyWriterCache);
assertTrue(dtw.getCache() instanceof LruTaxonomyWriterCache);
dtw.addCategory(new FacetLabel("a"));
dtw.rollback();
// should not have succeeded to add a category following rollback.
@ -300,8 +299,8 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
final double d = random().nextDouble();
final TaxonomyWriterCache cache;
if (d < 0.7) {
// this is the fastest, yet most memory consuming
cache = new UTF8TaxonomyWriterCache();
// same as LruTaxonomyWriterCache but with the default cache size
cache = DirectoryTaxonomyWriter.defaultTaxonomyWriterCache();
} else if (TEST_NIGHTLY && d > 0.98) {
// this is the slowest, but tests the writer concurrency when no caching is done.
// only pick it during NIGHTLY tests, and even then, with very low chances.
@ -506,8 +505,7 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
Directory indexDir = newDirectory(), taxoDir = newDirectory();
IndexWriter indexWriter =
new IndexWriter(indexDir, newIndexWriterConfig(new MockAnalyzer(random())));
DirectoryTaxonomyWriter taxoWriter =
new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE, new UTF8TaxonomyWriterCache());
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE);
FacetsConfig config = new FacetsConfig();
// Add one huge label:

View File

@ -1,127 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.taxonomy.writercache;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.apache.lucene.facet.FacetTestCase;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.tests.util.TestUtil;
public class TestUTF8TaxonomyWriterCache extends FacetTestCase {
public void testPageOverflow() throws Exception {
UTF8TaxonomyWriterCache cache = new UTF8TaxonomyWriterCache();
for (int ord = 0; ord < 65536 * 2; ord++) {
cache.put(new FacetLabel("foo:", Integer.toString(ord)), ord);
}
for (int ord = 0; ord < 65536 * 2; ord++) {
assertEquals(ord, cache.get(new FacetLabel("foo:", Integer.toString(ord))));
}
}
public void testRandom() throws Exception {
LabelToOrdinal map = new LabelToOrdinalMap();
UTF8TaxonomyWriterCache cache = new UTF8TaxonomyWriterCache();
final int n = atLeast(10 * 1000);
final int numUniqueValues = 50 * 1000;
Random random = random();
Set<String> uniqueValuesSet = new HashSet<>();
while (uniqueValuesSet.size() < numUniqueValues) {
int numParts = TestUtil.nextInt(random(), 1, 5);
StringBuilder b = new StringBuilder();
for (int i = 0; i < numParts; i++) {
String part = null;
while (true) {
part = TestUtil.randomRealisticUnicodeString(random(), 16);
part = part.replace("/", "");
if (part.length() > 0) {
break;
}
}
if (i > 0) {
b.append('/');
}
b.append(part);
}
uniqueValuesSet.add(b.toString());
}
String[] uniqueValues = uniqueValuesSet.toArray(new String[0]);
int ordUpto = 0;
for (int i = 0; i < n; i++) {
int index = random.nextInt(numUniqueValues);
FacetLabel label;
String s = uniqueValues[index];
if (s.length() == 0) {
label = new FacetLabel();
} else {
label = new FacetLabel(s.split("/"));
}
int ord1 = map.getOrdinal(label);
int ord2 = cache.get(label);
assertEquals(ord1, ord2);
if (ord1 == LabelToOrdinal.INVALID_ORDINAL) {
ord1 = ordUpto++;
map.addLabel(label, ord1);
cache.put(label, ord1);
}
}
for (int i = 0; i < numUniqueValues; i++) {
FacetLabel label;
String s = uniqueValues[i];
if (s.length() == 0) {
label = new FacetLabel();
} else {
label = new FacetLabel(s.split("/"));
}
int ord1 = map.getOrdinal(label);
int ord2 = cache.get(label);
assertEquals(ord1, ord2);
}
}
private static class LabelToOrdinalMap extends LabelToOrdinal {
private Map<FacetLabel, Integer> map = new HashMap<>();
LabelToOrdinalMap() {}
@Override
public void addLabel(FacetLabel label, int ordinal) {
map.put(label, ordinal);
}
@Override
public int getOrdinal(FacetLabel label) {
Integer value = map.get(label);
return (value != null) ? value.intValue() : LabelToOrdinal.INVALID_ORDINAL;
}
}
}