LUCENE-7975: change the default taxonomy facets cache to a faster UTF-8 cache

Mike McCandless 2017-10-03 09:58:34 -04:00
parent fd2b4f3f86
commit a9fb4ddf80
10 changed files with 208 additions and 946 deletions

lucene/CHANGES.txt

@@ -38,6 +38,9 @@ New Features
* LUCENE-7974: Add FloatPointNearestNeighbor, an N-dimensional FloatPoint
K-nearest-neighbor search implementation. (Steve Rowe)
* LUCENE-7975: Change the default taxonomy facets cache to a faster
byte[] (UTF-8) based cache. (Mike McCandless)

Optimizations
* LUCENE-7905: Optimize how OrdinalMap (used by

lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyWriter.java

@@ -39,19 +39,19 @@ import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.writercache.Cl2oTaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.UTF8TaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
import org.apache.lucene.index.CorruptIndexException; // javadocs
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.ReaderManager;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.Terms;
@@ -149,7 +149,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
* @param cache
* A {@link TaxonomyWriterCache} implementation which determines
* the in-memory caching policy. See for example
* {@link LruTaxonomyWriterCache} and {@link Cl2oTaxonomyWriterCache}.
* {@link LruTaxonomyWriterCache} and {@link UTF8TaxonomyWriterCache}.
* If null or missing, {@link #defaultTaxonomyWriterCache()} is used.
* @throws CorruptIndexException
* if the taxonomy is corrupted.
@@ -291,12 +291,11 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
* Defines the default {@link TaxonomyWriterCache} to use in constructors
* which do not specify one.
* <P>
* The current default is {@link Cl2oTaxonomyWriterCache} constructed
* with the parameters (1024, 0.15f, 3), i.e., the entire taxonomy is
* cached in memory while building it.
* The current default is {@link UTF8TaxonomyWriterCache}, i.e.,
* the entire taxonomy is cached in memory while building it.
*/
public static TaxonomyWriterCache defaultTaxonomyWriterCache() {
return new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
return new UTF8TaxonomyWriterCache();
}
/** Create this with {@code OpenMode.CREATE_OR_APPEND}. */
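With the default switched, constructors that take no cache now hand out a UTF8TaxonomyWriterCache; passing a cache explicitly is unchanged. A minimal sketch of both styles (the "taxo" path and the demo class name are illustrative, not part of the patch):

import java.nio.file.Paths;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.facet.taxonomy.writercache.UTF8TaxonomyWriterCache;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class TaxoCacheDefaultDemo {
  public static void main(String[] args) throws Exception {
    Directory taxoDir = FSDirectory.open(Paths.get("taxo")); // illustrative location
    // No cache argument: defaultTaxonomyWriterCache() now supplies the UTF-8 cache.
    DirectoryTaxonomyWriter byDefault = new DirectoryTaxonomyWriter(taxoDir);
    byDefault.close();
    // Explicit cache argument, exactly as before:
    DirectoryTaxonomyWriter explicit = new DirectoryTaxonomyWriter(
        taxoDir, OpenMode.CREATE_OR_APPEND, new UTF8TaxonomyWriterCache());
    explicit.close();
  }
}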

lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/CategoryPathUtils.java (deleted)

@@ -1,81 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.taxonomy.writercache;
import org.apache.lucene.facet.taxonomy.FacetLabel;
/** Utilities for use of {@link FacetLabel} by {@link CompactLabelToOrdinal}. */
class CategoryPathUtils {
/** Serializes the given {@link FacetLabel} to the {@link CharBlockArray}. */
public static void serialize(FacetLabel cp, CharBlockArray charBlockArray) {
charBlockArray.append((char) cp.length);
if (cp.length == 0) {
return;
}
for (int i = 0; i < cp.length; i++) {
charBlockArray.append((char) cp.components[i].length());
charBlockArray.append(cp.components[i]);
}
}
/**
* Calculates a hash function of a path that was serialized with
* {@link #serialize(FacetLabel, CharBlockArray)}.
*/
public static int hashCodeOfSerialized(CharBlockArray charBlockArray, int offset) {
int length = charBlockArray.charAt(offset++);
if (length == 0) {
return 0;
}
int hash = length;
for (int i = 0; i < length; i++) {
int len = charBlockArray.charAt(offset++);
hash = hash * 31 + charBlockArray.subSequence(offset, offset + len).hashCode();
offset += len;
}
return hash;
}
/**
* Check whether the {@link FacetLabel} is equal to the one serialized in
* {@link CharBlockArray}.
*/
public static boolean equalsToSerialized(FacetLabel cp, CharBlockArray charBlockArray, int offset) {
int n = charBlockArray.charAt(offset++);
if (cp.length != n) {
return false;
}
if (cp.length == 0) {
return true;
}
for (int i = 0; i < cp.length; i++) {
int len = charBlockArray.charAt(offset++);
if (len != cp.components[i].length()) {
return false;
}
if (!cp.components[i].equals(charBlockArray.subSequence(offset, offset + len))) {
return false;
}
offset += len;
}
return true;
}
}
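The serialized layout above is length-prefixed throughout: one char holding the component count, then for each component one char holding its length followed by its chars. A self-contained sketch of the same layout, substituting a StringBuilder for CharBlockArray (the demo class name is illustrative):

import org.apache.lucene.facet.taxonomy.FacetLabel;

public class SerializeLayoutDemo {
  // Mirrors CategoryPathUtils.serialize, writing into a StringBuilder instead.
  static void serialize(FacetLabel cp, StringBuilder out) {
    out.append((char) cp.length); // component count
    for (int i = 0; i < cp.length; i++) {
      out.append((char) cp.components[i].length()); // component length prefix
      out.append(cp.components[i]);                 // component chars
    }
  }

  public static void main(String[] args) {
    StringBuilder repo = new StringBuilder();
    serialize(new FacetLabel("Author", "Bob"), repo);
    // Layout: [\u0002][\u0006]Author[\u0003]Bob
    System.out.println((int) repo.charAt(0)); // 2 components
    System.out.println((int) repo.charAt(1)); // first component length: 6
  }
}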

lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/Cl2oTaxonomyWriterCache.java (deleted)

@@ -1,98 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.taxonomy.writercache;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
/**
* {@link TaxonomyWriterCache} using {@link CompactLabelToOrdinal}. Although
* called a cache, it maintains in memory all the mappings from category to
* ordinal, relying on {@link CompactLabelToOrdinal} being an efficient
* mapping for this purpose.
*
* @lucene.experimental
*/
public class Cl2oTaxonomyWriterCache implements TaxonomyWriterCache {
private final ReadWriteLock lock = new ReentrantReadWriteLock();
private final int initialCapacity, numHashArrays;
private final float loadFactor;
private volatile CompactLabelToOrdinal cache;
/** Sole constructor. */
public Cl2oTaxonomyWriterCache(int initialCapacity, float loadFactor, int numHashArrays) {
this.cache = new CompactLabelToOrdinal(initialCapacity, loadFactor, numHashArrays);
this.initialCapacity = initialCapacity;
this.numHashArrays = numHashArrays;
this.loadFactor = loadFactor;
}
@Override
public void clear() {
lock.writeLock().lock();
try {
cache = new CompactLabelToOrdinal(initialCapacity, loadFactor, numHashArrays);
} finally {
lock.writeLock().unlock();
}
}
@Override
public synchronized void close() {
cache = null;
}
@Override
public boolean isFull() {
// This cache is never full
return false;
}
@Override
public int get(FacetLabel categoryPath) {
lock.readLock().lock();
try {
return cache.getOrdinal(categoryPath);
} finally {
lock.readLock().unlock();
}
}
@Override
public boolean put(FacetLabel categoryPath, int ordinal) {
lock.writeLock().lock();
try {
cache.addLabel(categoryPath, ordinal);
// Tell the caller we didn't clear part of the cache, so it doesn't
// have to flush its on-disk index now
return false;
} finally {
lock.writeLock().unlock();
}
}
/** Returns the number of bytes in memory used by this object. */
public int getMemoryUsage() {
return cache == null ? 0 : cache.getMemoryUsage();
}
}

lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/CollisionMap.java (deleted)

@@ -1,235 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.taxonomy.writercache;
import java.util.Iterator;
import java.util.NoSuchElementException;
import org.apache.lucene.facet.taxonomy.FacetLabel;
/**
* HashMap to store colliding labels. See {@link CompactLabelToOrdinal} for
* details.
*
* @lucene.experimental
*/
public class CollisionMap {
private int capacity;
private float loadFactor;
private int size;
private int threshold;
static class Entry {
int offset;
int cid;
Entry next;
int hash;
Entry(int offset, int cid, int h, Entry e) {
this.offset = offset;
this.cid = cid;
this.next = e;
this.hash = h;
}
}
private CharBlockArray labelRepository;
private Entry[] entries;
CollisionMap(CharBlockArray labelRepository) {
this(16 * 1024, 0.75f, labelRepository);
}
CollisionMap(int initialCapacity, CharBlockArray labelRepository) {
this(initialCapacity, 0.75f, labelRepository);
}
private CollisionMap(int initialCapacity, float loadFactor, CharBlockArray labelRepository) {
this.labelRepository = labelRepository;
this.loadFactor = loadFactor;
this.capacity = CompactLabelToOrdinal.determineCapacity(2, initialCapacity);
this.entries = new Entry[this.capacity];
this.threshold = (int) (this.capacity * this.loadFactor);
}
/** How many mappings. */
public int size() {
return this.size;
}
/** How many slots are allocated. */
public int capacity() {
return this.capacity;
}
private void grow() {
int newCapacity = this.capacity * 2;
Entry[] newEntries = new Entry[newCapacity];
Entry[] src = this.entries;
for (int j = 0; j < src.length; j++) {
Entry e = src[j];
if (e != null) {
src[j] = null;
do {
Entry next = e.next;
int hash = e.hash;
int i = indexFor(hash, newCapacity);
e.next = newEntries[i];
newEntries[i] = e;
e = next;
} while (e != null);
}
}
this.capacity = newCapacity;
this.entries = newEntries;
this.threshold = (int) (this.capacity * this.loadFactor);
}
/** Return the mapping, or {@link
* LabelToOrdinal#INVALID_ORDINAL} if the label isn't
* recognized. */
public int get(FacetLabel label, int hash) {
int bucketIndex = indexFor(hash, this.capacity);
Entry e = this.entries[bucketIndex];
while (e != null && !(hash == e.hash && CategoryPathUtils.equalsToSerialized(label, labelRepository, e.offset))) {
e = e.next;
}
if (e == null) {
return LabelToOrdinal.INVALID_ORDINAL;
}
return e.cid;
}
/** Add another mapping. */
public int addLabel(FacetLabel label, int hash, int cid) {
int bucketIndex = indexFor(hash, this.capacity);
for (Entry e = this.entries[bucketIndex]; e != null; e = e.next) {
if (e.hash == hash && CategoryPathUtils.equalsToSerialized(label, labelRepository, e.offset)) {
return e.cid;
}
}
// new string; add to label repository
int offset = labelRepository.length();
CategoryPathUtils.serialize(label, labelRepository);
addEntry(offset, cid, hash, bucketIndex);
return cid;
}
/**
* This method does not check if the same value is already in the map because
* we pass in a char-array offset, so we know that we're in resize-mode
* here.
*/
public void addLabelOffset(int hash, int offset, int cid) {
int bucketIndex = indexFor(hash, this.capacity);
addEntry(offset, cid, hash, bucketIndex);
}
private void addEntry(int offset, int cid, int hash, int bucketIndex) {
Entry e = this.entries[bucketIndex];
this.entries[bucketIndex] = new Entry(offset, cid, hash, e);
if (this.size++ >= this.threshold) {
grow();
}
}
Iterator<CollisionMap.Entry> entryIterator() {
return new EntryIterator(entries, size);
}
/**
* Returns index for hash code h.
*/
static int indexFor(int h, int length) {
return h & (length - 1);
}
/**
* Returns an estimate of the memory usage of this CollisionMap.
* @return The approximate number of bytes used by this structure.
*/
int getMemoryUsage() {
int memoryUsage = 0;
if (this.entries != null) {
for (Entry e : this.entries) {
if (e != null) {
memoryUsage += (4 * 4);
for (Entry ee = e.next; ee != null; ee = ee.next) {
memoryUsage += (4 * 4);
}
}
}
}
return memoryUsage;
}
private static class EntryIterator implements Iterator<Entry> {
Entry next; // next entry to return
int index; // current slot
Entry[] ents;
EntryIterator(Entry[] entries, int size) {
this.ents = entries;
Entry[] t = entries;
int i = t.length;
Entry n = null;
if (size != 0) { // advance to first entry
while (i > 0 && (n = t[--i]) == null) {
// advance
}
}
this.next = n;
this.index = i;
}
@Override
public boolean hasNext() {
return this.next != null;
}
@Override
public Entry next() {
Entry e = this.next;
if (e == null) throw new NoSuchElementException();
Entry n = e.next;
Entry[] t = ents;
int i = this.index;
while (n == null && i > 0) {
n = t[--i];
}
this.index = i;
this.next = n;
return e;
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
}
}
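Note that indexFor masks with length - 1 instead of taking a modulo; this is only equivalent because determineCapacity keeps every capacity a power of two. A tiny standalone check of that equivalence (illustrative only):

public class IndexForDemo {
  static int indexFor(int h, int length) {
    return h & (length - 1);
  }

  public static void main(String[] args) {
    int length = 16 * 1024; // a power of two, as determineCapacity guarantees
    for (int h : new int[] {0, 1, 12345, -7, Integer.MIN_VALUE}) {
      // floorMod gives the non-negative remainder, matching the mask for power-of-two lengths
      assert indexFor(h, length) == Math.floorMod(h, length);
    }
    System.out.println("h & (length - 1) == floorMod(h, length) for power-of-two lengths");
  }
}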

lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/CompactLabelToOrdinal.java (deleted)

@@ -1,467 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.taxonomy.writercache;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import org.apache.lucene.facet.taxonomy.FacetLabel;
/**
* This is a very efficient LabelToOrdinal implementation that uses a
* CharBlockArray to store all labels and a configurable number of HashArrays to
* reference the labels.
* <p>
* Since the HashArrays don't handle collisions, a {@link CollisionMap} is used
* to store the colliding labels.
* <p>
* This data structure grows by adding a new HashArray whenever the number of
* collisions in the {@link CollisionMap} exceeds {@code loadFactor} *
* {@link #getMaxOrdinal()}. Growing also includes reinserting all colliding
* labels into the HashArrays to possibly reduce the number of collisions.
*
* For setting the {@code loadFactor} see
* {@link #CompactLabelToOrdinal(int, float, int)}.
*
* <p>
* This data structure has a much lower memory footprint (~30%) compared to a
* Java HashMap&lt;String, Integer&gt;. It also only uses a small fraction of objects
* a HashMap would use, thus limiting the GC overhead. Ingestion speed was also
* ~50% faster compared to a HashMap for 3M unique labels.
*
* @lucene.experimental
*/
public class CompactLabelToOrdinal extends LabelToOrdinal {
/** Default maximum load factor. */
public static final float DefaultLoadFactor = 0.15f;
static final char TERMINATOR_CHAR = 0xffff;
private static final int COLLISION = -5;
private HashArray[] hashArrays;
private CollisionMap collisionMap;
private CharBlockArray labelRepository;
private int capacity;
private int threshold;
private float loadFactor;
/** How many labels. */
public int sizeOfMap() {
return this.collisionMap.size();
}
private CompactLabelToOrdinal() {
}
/** Sole constructor. */
public CompactLabelToOrdinal(int initialCapacity, float loadFactor,
int numHashArrays) {
this.hashArrays = new HashArray[numHashArrays];
this.capacity = determineCapacity((int) Math.pow(2, numHashArrays),
initialCapacity);
init();
this.collisionMap = new CollisionMap(this.labelRepository);
this.counter = 0;
this.loadFactor = loadFactor;
this.threshold = (int) (this.loadFactor * this.capacity);
}
static int determineCapacity(int minCapacity, int initialCapacity) {
int capacity = minCapacity;
while (capacity < initialCapacity) {
capacity <<= 1;
}
return capacity;
}
private void init() {
labelRepository = new CharBlockArray();
CategoryPathUtils.serialize(new FacetLabel(), labelRepository);
int c = this.capacity;
for (int i = 0; i < this.hashArrays.length; i++) {
this.hashArrays[i] = new HashArray(c);
c /= 2;
}
}
@Override
public void addLabel(FacetLabel label, int ordinal) {
if (collisionMap.size() > threshold) {
grow();
}
int hash = CompactLabelToOrdinal.stringHashCode(label);
for (int i = 0; i < this.hashArrays.length; i++) {
if (addLabel(this.hashArrays[i], label, hash, ordinal)) {
return;
}
}
int prevVal = collisionMap.addLabel(label, hash, ordinal);
if (prevVal != ordinal) {
throw new IllegalArgumentException("Label already exists: " + label + " prev ordinal " + prevVal);
}
}
@Override
public int getOrdinal(FacetLabel label) {
if (label == null) {
return LabelToOrdinal.INVALID_ORDINAL;
}
int hash = CompactLabelToOrdinal.stringHashCode(label);
for (int i = 0; i < this.hashArrays.length; i++) {
int ord = getOrdinal(this.hashArrays[i], label, hash);
if (ord != COLLISION) {
return ord;
}
}
return this.collisionMap.get(label, hash);
}
private void grow() {
HashArray temp = this.hashArrays[this.hashArrays.length - 1];
for (int i = this.hashArrays.length - 1; i > 0; i--) {
this.hashArrays[i] = this.hashArrays[i - 1];
}
this.capacity *= 2;
this.hashArrays[0] = new HashArray(this.capacity);
for (int i = 1; i < this.hashArrays.length; i++) {
int[] sourceOffsetArray = this.hashArrays[i].offsets;
int[] sourceCidsArray = this.hashArrays[i].cids;
for (int k = 0; k < sourceOffsetArray.length; k++) {
for (int j = 0; j < i && sourceOffsetArray[k] != 0; j++) {
int[] targetOffsetArray = this.hashArrays[j].offsets;
int[] targetCidsArray = this.hashArrays[j].cids;
int newIndex = indexFor(stringHashCode(
this.labelRepository, sourceOffsetArray[k]),
targetOffsetArray.length);
if (targetOffsetArray[newIndex] == 0) {
targetOffsetArray[newIndex] = sourceOffsetArray[k];
targetCidsArray[newIndex] = sourceCidsArray[k];
sourceOffsetArray[k] = 0;
}
}
}
}
for (int i = 0; i < temp.offsets.length; i++) {
int offset = temp.offsets[i];
if (offset > 0) {
int hash = stringHashCode(this.labelRepository, offset);
addLabelOffset(hash, temp.cids[i], offset);
}
}
CollisionMap oldCollisionMap = this.collisionMap;
this.collisionMap = new CollisionMap(oldCollisionMap.capacity(),
this.labelRepository);
this.threshold = (int) (this.capacity * this.loadFactor);
Iterator<CollisionMap.Entry> it = oldCollisionMap.entryIterator();
while (it.hasNext()) {
CollisionMap.Entry e = it.next();
addLabelOffset(stringHashCode(this.labelRepository, e.offset),
e.cid, e.offset);
}
}
private boolean addLabel(HashArray a, FacetLabel label, int hash, int ordinal) {
int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length);
int offset = a.offsets[index];
if (offset == 0) {
a.offsets[index] = this.labelRepository.length();
CategoryPathUtils.serialize(label, labelRepository);
a.cids[index] = ordinal;
return true;
}
return false;
}
private void addLabelOffset(int hash, int cid, int knownOffset) {
for (int i = 0; i < this.hashArrays.length; i++) {
if (addLabelOffsetToHashArray(this.hashArrays[i], hash, cid,
knownOffset)) {
return;
}
}
this.collisionMap.addLabelOffset(hash, knownOffset, cid);
if (this.collisionMap.size() > this.threshold) {
grow();
}
}
private boolean addLabelOffsetToHashArray(HashArray a, int hash, int ordinal,
int knownOffset) {
int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length);
int offset = a.offsets[index];
if (offset == 0) {
a.offsets[index] = knownOffset;
a.cids[index] = ordinal;
return true;
}
return false;
}
private int getOrdinal(HashArray a, FacetLabel label, int hash) {
if (label == null) {
return LabelToOrdinal.INVALID_ORDINAL;
}
int index = indexFor(hash, a.offsets.length);
int offset = a.offsets[index];
if (offset == 0) {
return LabelToOrdinal.INVALID_ORDINAL;
}
if (CategoryPathUtils.equalsToSerialized(label, labelRepository, offset)) {
return a.cids[index];
}
return COLLISION;
}
/** Returns index for hash code h. */
static int indexFor(int h, int length) {
return h & (length - 1);
}
// static int stringHashCode(String label) {
// int len = label.length();
// int hash = 0;
// int i;
// for (i = 0; i < len; ++i)
// hash = 33 * hash + label.charAt(i);
//
// hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
// hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
//
// return hash;
//
// }
static int stringHashCode(FacetLabel label) {
int hash = label.hashCode();
hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
return hash;
}
static int stringHashCode(CharBlockArray labelRepository, int offset) {
int hash = CategoryPathUtils.hashCodeOfSerialized(labelRepository, offset);
hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
return hash;
}
// public static boolean equals(CharSequence label, CharBlockArray array,
// int offset) {
// // CONTINUE HERE
// int len = label.length();
// int bi = array.blockIndex(offset);
// CharBlockArray.Block b = array.blocks.get(bi);
// int index = array.indexInBlock(offset);
//
// for (int i = 0; i < len; i++) {
// if (label.charAt(i) != b.chars[index]) {
// return false;
// }
// index++;
// if (index == b.length) {
// b = array.blocks.get(++bi);
// index = 0;
// }
// }
//
// return b.chars[index] == TerminatorChar;
// }
/**
* Returns an estimate of the amount of memory used by this table. Called only in
* this package. Memory is consumed mainly by three structures: the hash arrays,
* label repository and collision map.
*/
int getMemoryUsage() {
int memoryUsage = 0;
if (this.hashArrays != null) {
// HashArray capacity is instance-specific.
for (HashArray ha : this.hashArrays) {
// Each has 2 capacity-length arrays of ints.
memoryUsage += ( ha.capacity * 2 * 4 ) + 4;
}
}
if (this.labelRepository != null) {
// All blocks are the same size.
int blockSize = this.labelRepository.blockSize;
// Each block has room for blockSize UTF-16 chars.
int actualBlockSize = ( blockSize * 2 ) + 4;
memoryUsage += this.labelRepository.blocks.size() * actualBlockSize;
memoryUsage += 8; // Two int values for array as a whole.
}
if (this.collisionMap != null) {
memoryUsage += this.collisionMap.getMemoryUsage();
}
return memoryUsage;
}
/**
* Opens the file and reloads the CompactLabelToOrdinal. The file it expects
* is generated from the {@link #flush(Path)} command.
*/
static CompactLabelToOrdinal open(Path file, float loadFactor,
int numHashArrays) throws IOException {
/**
* Part of the file is the labelRepository, which needs to be rehashed
* and label offsets re-added to the object. I am unsure as to why we
* can't just store these off in the file as well, but in keeping with
* the spirit of the original code, I did it this way. (ssuppe)
*/
CompactLabelToOrdinal l2o = new CompactLabelToOrdinal();
l2o.loadFactor = loadFactor;
l2o.hashArrays = new HashArray[numHashArrays];
DataInputStream dis = null;
try {
dis = new DataInputStream(new BufferedInputStream(
Files.newInputStream(file)));
// TaxoReader needs to load the "counter" or occupancy (L2O) to know
// the next unique facet. we used to load the delimiter too, but
// never used it.
l2o.counter = dis.readInt();
l2o.capacity = determineCapacity((int) Math.pow(2,
l2o.hashArrays.length), l2o.counter);
l2o.init();
// now read the chars
l2o.labelRepository = CharBlockArray.open(dis);
l2o.collisionMap = new CollisionMap(l2o.labelRepository);
// Calculate hash on the fly based on how CategoryPath hashes
// itself. Maybe in the future we can call some static based methods
// in CategoryPath so that this doesn't break again? I don't like
// having code in two different places...
int cid = 0;
// Skip the initial offset, it's the CategoryPath(0,0), which isn't
// a hashed value.
int offset = 1;
int lastStartOffset = offset;
// This loop really relies on a well-formed input (assumes pretty blindly
// that array offsets will work). Since the initial file is machine
// generated, I think this should be OK.
while (offset < l2o.labelRepository.length()) {
// identical code to CategoryPath.hashFromSerialized. since we need to
// advance offset, we cannot call the method directly. perhaps if we
// could pass a mutable Integer or something...
int length = (short) l2o.labelRepository.charAt(offset++);
int hash = length;
if (length != 0) {
for (int i = 0; i < length; i++) {
int len = (short) l2o.labelRepository.charAt(offset++);
hash = hash * 31 + l2o.labelRepository.subSequence(offset, offset + len).hashCode();
offset += len;
}
}
// Now that we've hashed the components of the label, do the
// final part of the hash algorithm.
hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
// Add the label, and let's keep going
l2o.addLabelOffset(hash, cid, lastStartOffset);
cid++;
lastStartOffset = offset;
}
} catch (ClassNotFoundException cnfe) {
throw new IOException("Invalid file format. Cannot deserialize.");
} finally {
if (dis != null) {
dis.close();
}
}
l2o.threshold = (int) (l2o.loadFactor * l2o.capacity);
return l2o;
}
void flush(Path file) throws IOException {
OutputStream fos = Files.newOutputStream(file);
try {
BufferedOutputStream os = new BufferedOutputStream(fos);
DataOutputStream dos = new DataOutputStream(os);
dos.writeInt(this.counter);
// write the labelRepository
this.labelRepository.flush(dos);
// Closes the data output stream
dos.close();
} finally {
fos.close();
}
}
private static final class HashArray {
int[] offsets;
int[] cids;
int capacity;
HashArray(int c) {
this.capacity = c;
this.offsets = new int[this.capacity];
this.cids = new int[this.capacity];
}
}
}
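Both stringHashCode variants finish with the same two mixing lines; this is the supplemental hash that older java.util.HashMap implementations applied so that raw hashes differing only in their high bits still spread across the low bits consumed by indexFor. A sketch isolating just that step (not part of the patch):

public class HashSpreadDemo {
  // The mixing step from CompactLabelToOrdinal.stringHashCode.
  static int spread(int hash) {
    hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
    return hash ^ (hash >>> 7) ^ (hash >>> 4);
  }

  public static void main(String[] args) {
    // Inputs that differ only above bit 16 end up with distinct low bits after mixing:
    for (int h = 1; h <= 4; h++) {
      int raw = h << 16; // raw & 0xF is 0 for every one of these
      System.out.printf("raw=%#x mixed=%#x low4=%d%n", raw, spread(raw), spread(raw) & 0xF);
    }
  }
}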

lucene/facet/src/java/org/apache/lucene/facet/taxonomy/writercache/UTF8TaxonomyWriterCache.java (new)

@@ -0,0 +1,158 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.taxonomy.writercache;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator;
import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.UnicodeUtil;
/** A "cache" that never frees memory, and stores labels in a BytesRefHash (utf-8 encoding). */
public final class UTF8TaxonomyWriterCache implements TaxonomyWriterCache, Accountable {
private final ThreadLocal<BytesRefBuilder> bytes = new ThreadLocal<BytesRefBuilder>() {
@Override
protected BytesRefBuilder initialValue() {
return new BytesRefBuilder();
}
};
private final Counter bytesUsed = Counter.newCounter();
private final BytesRefHash map = new BytesRefHash(new ByteBlockPool(new DirectTrackingAllocator(bytesUsed)));
private final static int ORDINALS_PAGE_SIZE = 65536;
private final static int ORDINALS_PAGE_MASK = ORDINALS_PAGE_SIZE - 1;
private volatile int[][] ordinals;
// How many labels we are storing:
private int count;
// How many pages in ordinals we've allocated:
private int pageCount;
/** Sole constructor. */
public UTF8TaxonomyWriterCache() {
ordinals = new int[1][];
ordinals[0] = new int[ORDINALS_PAGE_SIZE];
}
@Override
public int get(FacetLabel label) {
BytesRef bytes = toBytes(label);
int id;
synchronized (this) {
id = map.find(bytes);
}
if (id == -1) {
return LabelToOrdinal.INVALID_ORDINAL;
}
int page = id / ORDINALS_PAGE_SIZE;
int offset = id & ORDINALS_PAGE_MASK; // page size is a power of two, so the mask is equivalent to id % ORDINALS_PAGE_SIZE
return ordinals[page][offset];
}
// Called only from assert
private boolean assertSameOrdinal(FacetLabel label, int id, int ord) {
id = -id - 1;
int page = id / ORDINALS_PAGE_SIZE;
int offset = id & ORDINALS_PAGE_MASK;
int oldOrd = ordinals[page][offset];
if (oldOrd != ord) {
throw new IllegalArgumentException("label " + label + " was already cached, with old ord=" + oldOrd + " versus new ord=" + ord);
}
return true;
}
@Override
public boolean put(FacetLabel label, int ord) {
BytesRef bytes = toBytes(label);
int id;
synchronized (this) {
id = map.add(bytes);
if (id < 0) {
assert assertSameOrdinal(label, id, ord);
return false;
}
assert id == count;
int page = id / ORDINALS_PAGE_SIZE;
int offset = id & ORDINALS_PAGE_MASK;
if (page == pageCount) {
if (page == ordinals.length) {
int[][] newOrdinals = new int[ArrayUtil.oversize(page+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)][];
System.arraycopy(ordinals, 0, newOrdinals, 0, ordinals.length);
ordinals = newOrdinals;
}
ordinals[page] = new int[ORDINALS_PAGE_SIZE];
pageCount++;
}
ordinals[page][offset] = ord;
count++;
// we never prune from the cache
return false;
}
}
@Override
public boolean isFull() {
// we are never full
return false;
}
@Override
public synchronized void clear() {
map.clear();
map.reinit();
ordinals = new int[1][];
ordinals[0] = new int[ORDINALS_PAGE_SIZE];
count = 0;
pageCount = 0;
assert bytesUsed.get() == 0;
}
@Override
public synchronized long ramBytesUsed() {
return bytesUsed.get() + (long) pageCount * ORDINALS_PAGE_SIZE * RamUsageEstimator.NUM_BYTES_INT;
}
@Override
public void close() {
}
private static final byte DELIM_CHAR = (byte) 0x1F;
private BytesRef toBytes(FacetLabel label) {
BytesRefBuilder bytes = this.bytes.get();
bytes.clear();
for (int i = 0; i < label.length; i++) {
String part = label.components[i];
if (i > 0) {
bytes.append(DELIM_CHAR);
}
bytes.grow(bytes.length() + UnicodeUtil.maxUTF8Length(part.length()));
bytes.setLength(UnicodeUtil.UTF16toUTF8(part, 0, part.length(), bytes.bytes(), bytes.length()));
}
return bytes.get();
}
}
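Usage follows the plain TaxonomyWriterCache contract; internally each label becomes a single 0x1F-delimited UTF-8 byte string keyed into the BytesRefHash. A short sketch (the ordinal value and demo class name are arbitrary):

import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.writercache.UTF8TaxonomyWriterCache;

public class UTF8CacheDemo {
  public static void main(String[] args) {
    UTF8TaxonomyWriterCache cache = new UTF8TaxonomyWriterCache();
    FacetLabel label = new FacetLabel("Author", "Bob");
    cache.put(label, 17); // returns false: the cache never evicts
    System.out.println(cache.get(label)); // 17
    // An unknown label reports LabelToOrdinal.INVALID_ORDINAL:
    System.out.println(cache.get(new FacetLabel("Author", "Alice")));
    cache.close();
  }
}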

lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestConcurrentFacetedIndexing.java

@@ -26,9 +26,9 @@ import org.apache.lucene.facet.FacetField;
import org.apache.lucene.facet.FacetTestCase;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.Cl2oTaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.UTF8TaxonomyWriterCache;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
@@ -66,13 +66,13 @@ public class TestConcurrentFacetedIndexing extends FacetTestCase {
final double d = random().nextDouble();
if (d < 0.7) {
// this is the fastest, yet most memory consuming
return new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
return new UTF8TaxonomyWriterCache();
} else if (TEST_NIGHTLY && d > 0.98) {
// this is the slowest, but tests the writer concurrency when no caching is done.
// only pick it during NIGHTLY tests, and even then, with very low chances.
return NO_OP_CACHE;
} else {
// this is slower than CL2O, but less memory consuming, and exercises finding categories on disk too.
// this is slower than UTF8, but less memory consuming, and exercises finding categories on disk too.
return new LruTaxonomyWriterCache(ndocs / 10);
}
}

lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyWriter.java

@@ -25,16 +25,16 @@ import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.facet.DrillDownQuery;
import org.apache.lucene.facet.FacetField;
import org.apache.lucene.facet.FacetTestCase;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.DrillDownQuery;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.Cl2oTaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.UTF8TaxonomyWriterCache;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
@@ -252,13 +252,13 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
final TaxonomyWriterCache cache;
if (d < 0.7) {
// this is the fastest, yet most memory consuming
cache = new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
cache = new UTF8TaxonomyWriterCache();
} else if (TEST_NIGHTLY && d > 0.98) {
// this is the slowest, but tests the writer concurrency when no caching is done.
// only pick it during NIGHTLY tests, and even then, with very low chances.
cache = NO_OP_CACHE;
} else {
// this is slower than CL2O, but less memory consuming, and exercises finding categories on disk too.
// this is slower than UTF8, but less memory consuming, and exercises finding categories on disk too.
cache = new LruTaxonomyWriterCache(ncats / 10);
}
if (VERBOSE) {
@@ -441,7 +441,7 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
public void testHugeLabel() throws Exception {
Directory indexDir = newDirectory(), taxoDir = newDirectory();
IndexWriter indexWriter = new IndexWriter(indexDir, newIndexWriterConfig(new MockAnalyzer(random())));
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE, new Cl2oTaxonomyWriterCache(2, 1f, 1));
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE, new UTF8TaxonomyWriterCache());
FacetsConfig config = new FacetsConfig();
// Add one huge label:

lucene/facet/src/test/org/apache/lucene/facet/taxonomy/writercache/TestCompactLabelToOrdinal.java → TestUTF8TaxonomyWriterCache.java (renamed)

@@ -17,69 +17,54 @@
package org.apache.lucene.facet.taxonomy.writercache;
import java.nio.ByteBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.apache.lucene.facet.FacetTestCase;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.junit.Test;
import org.apache.lucene.util.TestUtil;
public class TestCompactLabelToOrdinal extends FacetTestCase {
public class TestUTF8TaxonomyWriterCache extends FacetTestCase {
@Test
public void testL2O() throws Exception {
public void testRandom() throws Exception {
LabelToOrdinal map = new LabelToOrdinalMap();
CompactLabelToOrdinal compact = new CompactLabelToOrdinal(2000000, 0.15f, 3);
UTF8TaxonomyWriterCache cache = new UTF8TaxonomyWriterCache();
final int n = atLeast(10 * 1000);
final int numUniqueValues = 50 * 1000;
String[] uniqueValues = new String[numUniqueValues];
byte[] buffer = new byte[50];
Random random = random();
for (int i = 0; i < numUniqueValues;) {
random.nextBytes(buffer);
int size = 1 + random.nextInt(buffer.length);
Set<String> uniqueValuesSet = new HashSet<>();
while (uniqueValuesSet.size() < numUniqueValues) {
int numParts = TestUtil.nextInt(random(), 1, 5);
StringBuilder b = new StringBuilder();
for (int i=0;i<numParts;i++) {
String part = null;
while (true) {
part = TestUtil.randomRealisticUnicodeString(random(), 16);
part = part.replace("/", "");
if (part.length() > 0) {
break;
}
}
// This test is turning random bytes into a string,
// which is asking for trouble.
CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
.onUnmappableCharacter(CodingErrorAction.REPLACE)
.onMalformedInput(CodingErrorAction.REPLACE);
uniqueValues[i] = decoder.decode(ByteBuffer.wrap(buffer, 0, size)).toString();
// we cannot have empty path components, so eliminate all prefix as well
// as middle consecutive delimiter chars.
uniqueValues[i] = uniqueValues[i].replaceAll("/+", "/");
if (uniqueValues[i].startsWith("/")) {
uniqueValues[i] = uniqueValues[i].substring(1);
}
if (uniqueValues[i].indexOf(CompactLabelToOrdinal.TERMINATOR_CHAR) == -1) {
i++;
if (i > 0) {
b.append('/');
}
b.append(part);
}
uniqueValuesSet.add(b.toString());
}
String[] uniqueValues = uniqueValuesSet.toArray(new String[0]);
Path tmpDir = createTempDir("testLabelToOrdinal");
Path f = tmpDir.resolve("CompactLabelToOrdinalTest.tmp");
int flushInterval = 10;
int ordUpto = 0;
for (int i = 0; i < n; i++) {
if (i > 0 && i % flushInterval == 0) {
compact.flush(f);
compact = CompactLabelToOrdinal.open(f, 0.15f, 3);
Files.delete(f);
if (flushInterval < (n / 10)) {
flushInterval *= 10;
}
}
int index = random.nextInt(numUniqueValues);
FacetLabel label;
@@ -91,14 +76,14 @@ public class TestCompactLabelToOrdinal extends FacetTestCase {
}
int ord1 = map.getOrdinal(label);
int ord2 = compact.getOrdinal(label);
int ord2 = cache.get(label);
assertEquals(ord1, ord2);
if (ord1 == LabelToOrdinal.INVALID_ORDINAL) {
ord1 = compact.getNextOrdinal();
ord1 = ordUpto++;
map.addLabel(label, ord1);
compact.addLabel(label, ord1);
cache.put(label, ord1);
}
}
@@ -111,7 +96,7 @@ public class TestCompactLabelToOrdinal extends FacetTestCase {
label = new FacetLabel(s.split("/"));
}
int ord1 = map.getOrdinal(label);
int ord2 = compact.getOrdinal(label);
int ord2 = cache.get(label);
assertEquals(ord1, ord2);
}
}
@@ -131,7 +116,5 @@ public class TestCompactLabelToOrdinal extends FacetTestCase {
Integer value = map.get(label);
return (value != null) ? value.intValue() : LabelToOrdinal.INVALID_ORDINAL;
}
}
}