Remove UTF8TaxonomyWriterCache (#12092)
Removes the never-evicting UTF8TaxonomyWriterCache, changing the default to LruTaxonomyWriterCache
parent 318b002e0b
commit dc33ade76d
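For code that constructed the removed cache explicitly, migration is a one-line change: pass an LruTaxonomyWriterCache (or no cache at all, to get the new default) to DirectoryTaxonomyWriter. A minimal sketch; the directory path and the 100_000 cache size are illustrative assumptions, not values from this commit:

import java.nio.file.Paths;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class MigrateTaxoCache {
  public static void main(String[] args) throws Exception {
    Directory taxoDir = FSDirectory.open(Paths.get("/tmp/taxo")); // illustrative path
    // Before: new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE, new UTF8TaxonomyWriterCache());
    // After: an LRU cache; size it to your taxonomy's expected cardinality (assumption).
    DirectoryTaxonomyWriter taxoWriter =
        new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE, new LruTaxonomyWriterCache(100_000));
    taxoWriter.close();
  }
}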
@@ -8,6 +8,9 @@ http://s.apache.org/luceneversions
 
 API Changes
 ---------------------
 
+* LUCENE-12092: Remove deprecated UTF8TaxonomyWriterCache. Please use LruTaxonomyWriterCache
+  instead. (Vigya Sharma)
+
 * LUCENE-10010: AutomatonQuery, CompiledAutomaton, RunAutomaton, RegExp
   classes no longer determinize NFAs. Instead it is the responsibility
   of the caller to determinize. (Robert Muir)
@@ -38,7 +38,6 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
 import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
 import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
 import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
-import org.apache.lucene.facet.taxonomy.writercache.UTF8TaxonomyWriterCache;
 import org.apache.lucene.index.CorruptIndexException; // javadocs
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
@@ -85,6 +84,8 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
    */
   public static final String INDEX_EPOCH = "index.epoch";
 
+  private static final int DEFAULT_CACHE_SIZE = 4000;
+
   private final Directory dir;
   private final IndexWriter indexWriter;
   private final TaxonomyWriterCache cache;
@@ -128,9 +129,8 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
    *     APPEND_OR_CREATE</code> appends to an existing index if there is one, otherwise it creates
    *     a new index.
    * @param cache A {@link TaxonomyWriterCache} implementation which determines the in-memory
-   *     caching policy. See for example {@link LruTaxonomyWriterCache} and {@link
-   *     UTF8TaxonomyWriterCache}. If null or missing, {@link #defaultTaxonomyWriterCache()} is
-   *     used.
+   *     caching policy. See for example {@link LruTaxonomyWriterCache}. If null or missing, {@link
+   *     #defaultTaxonomyWriterCache()} is used.
    * @throws CorruptIndexException if the taxonomy is corrupted.
    * @throws LockObtainFailedException if the taxonomy is locked by another writer.
    * @throws IOException if another error occurred.
@@ -267,11 +267,10 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
    * Defines the default {@link TaxonomyWriterCache} to use in constructors which do not specify
    * one.
    *
-   * <p>The current default is {@link UTF8TaxonomyWriterCache}, i.e., the entire taxonomy is cached
-   * in memory while building it.
+   * <p>The current default is {@link LruTaxonomyWriterCache}
    */
   public static TaxonomyWriterCache defaultTaxonomyWriterCache() {
-    return new UTF8TaxonomyWriterCache();
+    return new LruTaxonomyWriterCache(DEFAULT_CACHE_SIZE);
   }
 
   /** Create this with {@code OpenMode.CREATE_OR_APPEND}. */
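In practice this means defaultTaxonomyWriterCache() now returns a bounded LRU cache of DEFAULT_CACHE_SIZE (4000) labels: a miss on a taxonomy larger than that falls back to a taxonomy-index lookup instead of growing the heap without bound. A short sketch of both options; ByteBuffersDirectory and the 1_000_000 size are assumptions for illustration:

import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class DefaultCacheDemo {
  public static void main(String[] args) throws Exception {
    // Option 1: rely on the new default (LRU, 4000 entries).
    TaxonomyWriterCache def = DirectoryTaxonomyWriter.defaultTaxonomyWriterCache();
    def.close();

    // Option 2: a large LRU approximates the old never-evicting cache
    // for taxonomies below that cardinality (size is an assumption).
    Directory taxoDir = new ByteBuffersDirectory();
    DirectoryTaxonomyWriter writer =
        new DirectoryTaxonomyWriter(
            taxoDir, OpenMode.CREATE_OR_APPEND, new LruTaxonomyWriterCache(1_000_000));
    writer.close();
  }
}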
@@ -1,174 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.facet.taxonomy.writercache;
-
-import org.apache.lucene.facet.taxonomy.FacetLabel;
-import org.apache.lucene.util.Accountable;
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.ByteBlockPool;
-import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefBuilder;
-import org.apache.lucene.util.BytesRefHash;
-import org.apache.lucene.util.Counter;
-import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.util.UnicodeUtil;
-
-/** A "cache" that never frees memory, and stores labels in a BytesRefHash (utf-8 encoding). */
-public final class UTF8TaxonomyWriterCache implements TaxonomyWriterCache, Accountable {
-  private final ThreadLocal<BytesRefBuilder> bytes =
-      new ThreadLocal<BytesRefBuilder>() {
-        @Override
-        protected BytesRefBuilder initialValue() {
-          return new BytesRefBuilder();
-        }
-      };
-
-  private final Counter bytesUsed = Counter.newCounter();
-  private final BytesRefHash map =
-      new BytesRefHash(new ByteBlockPool(new DirectTrackingAllocator(bytesUsed)));
-
-  private static final int PAGE_BITS = 16;
-  private static final int PAGE_SIZE = 1 << PAGE_BITS;
-  private static final int PAGE_MASK = PAGE_SIZE - 1;
-
-  private volatile int[][] ordinals;
-
-  // How many labels we are storing:
-  private int count;
-
-  // How many pages in ordinals we've allocated:
-  private int pageCount;
-
-  /** Sole constructor. */
-  public UTF8TaxonomyWriterCache() {
-    ordinals = new int[1][];
-    ordinals[0] = new int[PAGE_SIZE];
-  }
-
-  @Override
-  public int get(FacetLabel label) {
-    BytesRef bytes = toBytes(label);
-    int id;
-    synchronized (this) {
-      id = map.find(bytes);
-    }
-    if (id == -1) {
-      return LabelToOrdinal.INVALID_ORDINAL;
-    }
-    int page = id >>> PAGE_BITS;
-    int offset = id & PAGE_MASK;
-    return ordinals[page][offset];
-  }
-
-  // Called only from assert
-  private boolean assertSameOrdinal(FacetLabel label, int id, int ord) {
-    id = -id - 1;
-    int page = id >>> PAGE_BITS;
-    int offset = id & PAGE_MASK;
-    int oldOrd = ordinals[page][offset];
-    if (oldOrd != ord) {
-      throw new IllegalArgumentException(
-          "label "
-              + label
-              + " was already cached, with old ord="
-              + oldOrd
-              + " versus new ord="
-              + ord);
-    }
-    return true;
-  }
-
-  @Override
-  public boolean put(FacetLabel label, int ord) {
-    BytesRef bytes = toBytes(label);
-    int id;
-    synchronized (this) {
-      id = map.add(bytes);
-      if (id < 0) {
-        assert assertSameOrdinal(label, id, ord);
-        return false;
-      }
-      assert id == count;
-      int page = id >>> PAGE_BITS;
-      int offset = id & PAGE_MASK;
-      if (page == pageCount) {
-        if (page == ordinals.length) {
-          int[][] newOrdinals =
-              new int[ArrayUtil.oversize(page + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)][];
-          System.arraycopy(ordinals, 0, newOrdinals, 0, ordinals.length);
-          ordinals = newOrdinals;
-        }
-        ordinals[page] = new int[PAGE_SIZE];
-        pageCount++;
-      }
-      ordinals[page][offset] = ord;
-      count++;
-
-      // we never prune from the cache
-      return false;
-    }
-  }
-
-  @Override
-  public boolean isFull() {
-    // we are never full
-    return false;
-  }
-
-  @Override
-  public synchronized void clear() {
-    map.clear();
-    map.reinit();
-    ordinals = new int[1][];
-    ordinals[0] = new int[PAGE_SIZE];
-    count = 0;
-    pageCount = 0;
-    assert bytesUsed.get() == 0;
-  }
-
-  /** How many labels are currently stored in the cache. */
-  @Override
-  public int size() {
-    return count;
-  }
-
-  @Override
-  public synchronized long ramBytesUsed() {
-    return bytesUsed.get() + pageCount * (long) PAGE_SIZE * Integer.BYTES;
-  }
-
-  @Override
-  public void close() {}
-
-  private static final byte DELIM_CHAR = (byte) 0x1F;
-
-  private BytesRef toBytes(FacetLabel label) {
-    BytesRefBuilder bytes = this.bytes.get();
-    bytes.clear();
-    for (int i = 0; i < label.length; i++) {
-      String part = label.components[i];
-      if (i > 0) {
-        bytes.append(DELIM_CHAR);
-      }
-      bytes.grow(bytes.length() + UnicodeUtil.maxUTF8Length(part.length()));
-      bytes.setLength(
-          UnicodeUtil.UTF16toUTF8(part, 0, part.length(), bytes.bytes(), bytes.length()));
-    }
-    return bytes.get();
-  }
-}
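For reference, the class deleted above kept one int ordinal per BytesRefHash id, stored in 64K-entry pages: with PAGE_BITS = 16, the high bits of the id select the page and the low bits the offset within it. A self-contained sketch of that arithmetic, with the constants copied from the removed file:

public class PageMathSketch {
  static final int PAGE_BITS = 16;             // constants from the removed file
  static final int PAGE_SIZE = 1 << PAGE_BITS; // 65,536 ints per page
  static final int PAGE_MASK = PAGE_SIZE - 1;

  public static void main(String[] args) {
    int id = 70_000;             // an example hash id past the first page
    int page = id >>> PAGE_BITS; // 70_000 >>> 16 == 1
    int offset = id & PAGE_MASK; // 70_000 & 0xFFFF == 4_464
    System.out.println("page=" + page + ", offset=" + offset);
    // In the deleted cache, ordinals[page][offset] held the ordinal for this id,
    // and new pages were allocated lazily as ids crossed each 64K boundary.
  }
}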
@@ -27,7 +27,6 @@ import org.apache.lucene.facet.FacetsConfig;
 import org.apache.lucene.facet.taxonomy.FacetLabel;
 import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
 import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
-import org.apache.lucene.facet.taxonomy.writercache.UTF8TaxonomyWriterCache;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig.OpenMode;
 import org.apache.lucene.store.Directory;
@@ -79,8 +78,8 @@ public class TestConcurrentFacetedIndexing extends FacetTestCase {
   static TaxonomyWriterCache newTaxoWriterCache(int ndocs) {
     final double d = random().nextDouble();
     if (d < 0.7) {
-      // this is the fastest, yet most memory consuming
-      return new UTF8TaxonomyWriterCache();
+      // same as LruTaxonomyWriterCache but with the default cache size
+      return DirectoryTaxonomyWriter.defaultTaxonomyWriterCache();
     } else if (TEST_NIGHTLY && d > 0.98) {
       // this is the slowest, but tests the writer concurrency when no caching is done.
       // only pick it during NIGHTLY tests, and even then, with very low chances.
@@ -33,7 +33,6 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
 import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
 import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
 import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
-import org.apache.lucene.facet.taxonomy.writercache.UTF8TaxonomyWriterCache;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
@@ -170,7 +169,7 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
     // Verifies that if rollback is called, DTW is closed.
     Directory dir = newDirectory();
     DirectoryTaxonomyWriter dtw = new DirectoryTaxonomyWriter(dir);
-    assertTrue(dtw.getCache() instanceof UTF8TaxonomyWriterCache);
+    assertTrue(dtw.getCache() instanceof LruTaxonomyWriterCache);
     dtw.addCategory(new FacetLabel("a"));
     dtw.rollback();
     // should not have succeeded to add a category following rollback.
@@ -300,8 +299,8 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
     final double d = random().nextDouble();
     final TaxonomyWriterCache cache;
     if (d < 0.7) {
-      // this is the fastest, yet most memory consuming
-      cache = new UTF8TaxonomyWriterCache();
+      // same as LruTaxonomyWriterCache but with the default cache size
+      cache = DirectoryTaxonomyWriter.defaultTaxonomyWriterCache();
     } else if (TEST_NIGHTLY && d > 0.98) {
       // this is the slowest, but tests the writer concurrency when no caching is done.
       // only pick it during NIGHTLY tests, and even then, with very low chances.
@@ -506,8 +505,7 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
     Directory indexDir = newDirectory(), taxoDir = newDirectory();
     IndexWriter indexWriter =
         new IndexWriter(indexDir, newIndexWriterConfig(new MockAnalyzer(random())));
-    DirectoryTaxonomyWriter taxoWriter =
-        new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE, new UTF8TaxonomyWriterCache());
+    DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE);
     FacetsConfig config = new FacetsConfig();
 
     // Add one huge label:
@@ -1,127 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.facet.taxonomy.writercache;
-
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Random;
-import java.util.Set;
-import org.apache.lucene.facet.FacetTestCase;
-import org.apache.lucene.facet.taxonomy.FacetLabel;
-import org.apache.lucene.tests.util.TestUtil;
-
-public class TestUTF8TaxonomyWriterCache extends FacetTestCase {
-  public void testPageOverflow() throws Exception {
-    UTF8TaxonomyWriterCache cache = new UTF8TaxonomyWriterCache();
-    for (int ord = 0; ord < 65536 * 2; ord++) {
-      cache.put(new FacetLabel("foo:", Integer.toString(ord)), ord);
-    }
-
-    for (int ord = 0; ord < 65536 * 2; ord++) {
-      assertEquals(ord, cache.get(new FacetLabel("foo:", Integer.toString(ord))));
-    }
-  }
-
-  public void testRandom() throws Exception {
-    LabelToOrdinal map = new LabelToOrdinalMap();
-
-    UTF8TaxonomyWriterCache cache = new UTF8TaxonomyWriterCache();
-
-    final int n = atLeast(10 * 1000);
-    final int numUniqueValues = 50 * 1000;
-
-    Random random = random();
-    Set<String> uniqueValuesSet = new HashSet<>();
-    while (uniqueValuesSet.size() < numUniqueValues) {
-      int numParts = TestUtil.nextInt(random(), 1, 5);
-      StringBuilder b = new StringBuilder();
-      for (int i = 0; i < numParts; i++) {
-        String part = null;
-        while (true) {
-          part = TestUtil.randomRealisticUnicodeString(random(), 16);
-          part = part.replace("/", "");
-          if (part.length() > 0) {
-            break;
-          }
-        }
-
-        if (i > 0) {
-          b.append('/');
-        }
-        b.append(part);
-      }
-      uniqueValuesSet.add(b.toString());
-    }
-    String[] uniqueValues = uniqueValuesSet.toArray(new String[0]);
-
-    int ordUpto = 0;
-    for (int i = 0; i < n; i++) {
-
-      int index = random.nextInt(numUniqueValues);
-      FacetLabel label;
-      String s = uniqueValues[index];
-      if (s.length() == 0) {
-        label = new FacetLabel();
-      } else {
-        label = new FacetLabel(s.split("/"));
-      }
-
-      int ord1 = map.getOrdinal(label);
-      int ord2 = cache.get(label);
-
-      assertEquals(ord1, ord2);
-
-      if (ord1 == LabelToOrdinal.INVALID_ORDINAL) {
-        ord1 = ordUpto++;
-        map.addLabel(label, ord1);
-        cache.put(label, ord1);
-      }
-    }
-
-    for (int i = 0; i < numUniqueValues; i++) {
-      FacetLabel label;
-      String s = uniqueValues[i];
-      if (s.length() == 0) {
-        label = new FacetLabel();
-      } else {
-        label = new FacetLabel(s.split("/"));
-      }
-      int ord1 = map.getOrdinal(label);
-      int ord2 = cache.get(label);
-      assertEquals(ord1, ord2);
-    }
-  }
-
-  private static class LabelToOrdinalMap extends LabelToOrdinal {
-    private Map<FacetLabel, Integer> map = new HashMap<>();
-
-    LabelToOrdinalMap() {}
-
-    @Override
-    public void addLabel(FacetLabel label, int ordinal) {
-      map.put(label, ordinal);
-    }
-
-    @Override
-    public int getOrdinal(FacetLabel label) {
-      Integer value = map.get(label);
-      return (value != null) ? value.intValue() : LabelToOrdinal.INVALID_ORDINAL;
-    }
-  }
-}