mirror of https://github.com/apache/lucene.git
LUCENE-7975: change the default taxonomy facets cache to a faster UTF-8 cache
This commit is contained in:
parent
fd2b4f3f86
commit
a9fb4ddf80
|
@ -38,6 +38,9 @@ New Features
|
||||||
* LUCENE-7974: Add FloatPointNearestNeighbor, an N-dimensional FloatPoint
|
* LUCENE-7974: Add FloatPointNearestNeighbor, an N-dimensional FloatPoint
|
||||||
K-nearest-neighbor search implementation. (Steve Rowe)
|
K-nearest-neighbor search implementation. (Steve Rowe)
|
||||||
|
|
||||||
|
* LUCENE-7975: Change the default taxonomy facets cache to a faster
|
||||||
|
byte[] (UTF-8) based cache.
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
* LUCENE-7905: Optimize how OrdinalMap (used by
|
* LUCENE-7905: Optimize how OrdinalMap (used by
|
||||||
|
|
|
@ -39,19 +39,19 @@ import org.apache.lucene.facet.FacetsConfig;
|
||||||
import org.apache.lucene.facet.taxonomy.FacetLabel;
|
import org.apache.lucene.facet.taxonomy.FacetLabel;
|
||||||
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
||||||
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
|
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
|
||||||
import org.apache.lucene.facet.taxonomy.writercache.Cl2oTaxonomyWriterCache;
|
import org.apache.lucene.facet.taxonomy.writercache.UTF8TaxonomyWriterCache;
|
||||||
import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
|
import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
|
||||||
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
|
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
|
||||||
import org.apache.lucene.index.CorruptIndexException; // javadocs
|
import org.apache.lucene.index.CorruptIndexException; // javadocs
|
||||||
import org.apache.lucene.index.DirectoryReader;
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
import org.apache.lucene.index.PostingsEnum;
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
import org.apache.lucene.index.IndexWriterConfig;
|
|
||||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||||
|
import org.apache.lucene.index.IndexWriterConfig;
|
||||||
import org.apache.lucene.index.LeafReader;
|
import org.apache.lucene.index.LeafReader;
|
||||||
import org.apache.lucene.index.LeafReaderContext;
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
import org.apache.lucene.index.LogByteSizeMergePolicy;
|
import org.apache.lucene.index.LogByteSizeMergePolicy;
|
||||||
|
import org.apache.lucene.index.PostingsEnum;
|
||||||
import org.apache.lucene.index.ReaderManager;
|
import org.apache.lucene.index.ReaderManager;
|
||||||
import org.apache.lucene.index.SegmentInfos;
|
import org.apache.lucene.index.SegmentInfos;
|
||||||
import org.apache.lucene.index.Terms;
|
import org.apache.lucene.index.Terms;
|
||||||
|
@ -149,7 +149,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
|
||||||
* @param cache
|
* @param cache
|
||||||
* A {@link TaxonomyWriterCache} implementation which determines
|
* A {@link TaxonomyWriterCache} implementation which determines
|
||||||
* the in-memory caching policy. See for example
|
* the in-memory caching policy. See for example
|
||||||
* {@link LruTaxonomyWriterCache} and {@link Cl2oTaxonomyWriterCache}.
|
* {@link LruTaxonomyWriterCache} and {@link UTF8TaxonomyWriterCache}.
|
||||||
* If null or missing, {@link #defaultTaxonomyWriterCache()} is used.
|
* If null or missing, {@link #defaultTaxonomyWriterCache()} is used.
|
||||||
* @throws CorruptIndexException
|
* @throws CorruptIndexException
|
||||||
* if the taxonomy is corrupted.
|
* if the taxonomy is corrupted.
|
||||||
|
@ -291,12 +291,11 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
|
||||||
* Defines the default {@link TaxonomyWriterCache} to use in constructors
|
* Defines the default {@link TaxonomyWriterCache} to use in constructors
|
||||||
* which do not specify one.
|
* which do not specify one.
|
||||||
* <P>
|
* <P>
|
||||||
* The current default is {@link Cl2oTaxonomyWriterCache} constructed
|
* The current default is {@link UTF8TaxonomyWriterCache}, i.e.,
|
||||||
* with the parameters (1024, 0.15f, 3), i.e., the entire taxonomy is
|
* the entire taxonomy is cached in memory while building it.
|
||||||
* cached in memory while building it.
|
|
||||||
*/
|
*/
|
||||||
public static TaxonomyWriterCache defaultTaxonomyWriterCache() {
|
public static TaxonomyWriterCache defaultTaxonomyWriterCache() {
|
||||||
return new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
|
return new UTF8TaxonomyWriterCache();
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Create this with {@code OpenMode.CREATE_OR_APPEND}. */
|
/** Create this with {@code OpenMode.CREATE_OR_APPEND}. */
|
||||||
|
|
|
@ -1,81 +0,0 @@
|
||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.facet.taxonomy.writercache;
|
|
||||||
|
|
||||||
import org.apache.lucene.facet.taxonomy.FacetLabel;
|
|
||||||
|
|
||||||
/** Utilities for use of {@link FacetLabel} by {@link CompactLabelToOrdinal}. */
|
|
||||||
class CategoryPathUtils {
|
|
||||||
|
|
||||||
/** Serializes the given {@link FacetLabel} to the {@link CharBlockArray}. */
|
|
||||||
public static void serialize(FacetLabel cp, CharBlockArray charBlockArray) {
|
|
||||||
charBlockArray.append((char) cp.length);
|
|
||||||
if (cp.length == 0) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
for (int i = 0; i < cp.length; i++) {
|
|
||||||
charBlockArray.append((char) cp.components[i].length());
|
|
||||||
charBlockArray.append(cp.components[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Calculates a hash function of a path that was serialized with
|
|
||||||
* {@link #serialize(FacetLabel, CharBlockArray)}.
|
|
||||||
*/
|
|
||||||
public static int hashCodeOfSerialized(CharBlockArray charBlockArray, int offset) {
|
|
||||||
int length = charBlockArray.charAt(offset++);
|
|
||||||
if (length == 0) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
int hash = length;
|
|
||||||
for (int i = 0; i < length; i++) {
|
|
||||||
int len = charBlockArray.charAt(offset++);
|
|
||||||
hash = hash * 31 + charBlockArray.subSequence(offset, offset + len).hashCode();
|
|
||||||
offset += len;
|
|
||||||
}
|
|
||||||
return hash;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Check whether the {@link FacetLabel} is equal to the one serialized in
|
|
||||||
* {@link CharBlockArray}.
|
|
||||||
*/
|
|
||||||
public static boolean equalsToSerialized(FacetLabel cp, CharBlockArray charBlockArray, int offset) {
|
|
||||||
int n = charBlockArray.charAt(offset++);
|
|
||||||
if (cp.length != n) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (cp.length == 0) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < cp.length; i++) {
|
|
||||||
int len = charBlockArray.charAt(offset++);
|
|
||||||
if (len != cp.components[i].length()) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (!cp.components[i].equals(charBlockArray.subSequence(offset, offset + len))) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
offset += len;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,98 +0,0 @@
|
||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.facet.taxonomy.writercache;
|
|
||||||
|
|
||||||
import java.util.concurrent.locks.ReadWriteLock;
|
|
||||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
|
||||||
|
|
||||||
import org.apache.lucene.facet.taxonomy.FacetLabel;
|
|
||||||
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* {@link TaxonomyWriterCache} using {@link CompactLabelToOrdinal}. Although
|
|
||||||
* called cache, it maintains in memory all the mappings from category to
|
|
||||||
* ordinal, relying on that {@link CompactLabelToOrdinal} is an efficient
|
|
||||||
* mapping for this purpose.
|
|
||||||
*
|
|
||||||
* @lucene.experimental
|
|
||||||
*/
|
|
||||||
public class Cl2oTaxonomyWriterCache implements TaxonomyWriterCache {
|
|
||||||
|
|
||||||
private final ReadWriteLock lock = new ReentrantReadWriteLock();
|
|
||||||
private final int initialCapcity, numHashArrays;
|
|
||||||
private final float loadFactor;
|
|
||||||
|
|
||||||
private volatile CompactLabelToOrdinal cache;
|
|
||||||
|
|
||||||
/** Sole constructor. */
|
|
||||||
public Cl2oTaxonomyWriterCache(int initialCapcity, float loadFactor, int numHashArrays) {
|
|
||||||
this.cache = new CompactLabelToOrdinal(initialCapcity, loadFactor, numHashArrays);
|
|
||||||
this.initialCapcity = initialCapcity;
|
|
||||||
this.numHashArrays = numHashArrays;
|
|
||||||
this.loadFactor = loadFactor;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void clear() {
|
|
||||||
lock.writeLock().lock();
|
|
||||||
try {
|
|
||||||
cache = new CompactLabelToOrdinal(initialCapcity, loadFactor, numHashArrays);
|
|
||||||
} finally {
|
|
||||||
lock.writeLock().unlock();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public synchronized void close() {
|
|
||||||
cache = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean isFull() {
|
|
||||||
// This cache is never full
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int get(FacetLabel categoryPath) {
|
|
||||||
lock.readLock().lock();
|
|
||||||
try {
|
|
||||||
return cache.getOrdinal(categoryPath);
|
|
||||||
} finally {
|
|
||||||
lock.readLock().unlock();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean put(FacetLabel categoryPath, int ordinal) {
|
|
||||||
lock.writeLock().lock();
|
|
||||||
try {
|
|
||||||
cache.addLabel(categoryPath, ordinal);
|
|
||||||
// Tell the caller we didn't clear part of the cache, so it doesn't
|
|
||||||
// have to flush its on-disk index now
|
|
||||||
return false;
|
|
||||||
} finally {
|
|
||||||
lock.writeLock().unlock();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Returns the number of bytes in memory used by this object. */
|
|
||||||
public int getMemoryUsage() {
|
|
||||||
return cache == null ? 0 : cache.getMemoryUsage();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,235 +0,0 @@
|
||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.facet.taxonomy.writercache;
|
|
||||||
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.NoSuchElementException;
|
|
||||||
|
|
||||||
import org.apache.lucene.facet.taxonomy.FacetLabel;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* HashMap to store colliding labels. See {@link CompactLabelToOrdinal} for
|
|
||||||
* details.
|
|
||||||
*
|
|
||||||
* @lucene.experimental
|
|
||||||
*/
|
|
||||||
public class CollisionMap {
|
|
||||||
|
|
||||||
private int capacity;
|
|
||||||
private float loadFactor;
|
|
||||||
private int size;
|
|
||||||
private int threshold;
|
|
||||||
|
|
||||||
static class Entry {
|
|
||||||
int offset;
|
|
||||||
int cid;
|
|
||||||
Entry next;
|
|
||||||
int hash;
|
|
||||||
|
|
||||||
Entry(int offset, int cid, int h, Entry e) {
|
|
||||||
this.offset = offset;
|
|
||||||
this.cid = cid;
|
|
||||||
this.next = e;
|
|
||||||
this.hash = h;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private CharBlockArray labelRepository;
|
|
||||||
|
|
||||||
private Entry[] entries;
|
|
||||||
|
|
||||||
CollisionMap(CharBlockArray labelRepository) {
|
|
||||||
this(16 * 1024, 0.75f, labelRepository);
|
|
||||||
}
|
|
||||||
|
|
||||||
CollisionMap(int initialCapacity, CharBlockArray labelRepository) {
|
|
||||||
this(initialCapacity, 0.75f, labelRepository);
|
|
||||||
}
|
|
||||||
|
|
||||||
private CollisionMap(int initialCapacity, float loadFactor, CharBlockArray labelRepository) {
|
|
||||||
this.labelRepository = labelRepository;
|
|
||||||
this.loadFactor = loadFactor;
|
|
||||||
this.capacity = CompactLabelToOrdinal.determineCapacity(2, initialCapacity);
|
|
||||||
|
|
||||||
this.entries = new Entry[this.capacity];
|
|
||||||
this.threshold = (int) (this.capacity * this.loadFactor);
|
|
||||||
}
|
|
||||||
|
|
||||||
/** How many mappings. */
|
|
||||||
public int size() {
|
|
||||||
return this.size;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** How many slots are allocated. */
|
|
||||||
public int capacity() {
|
|
||||||
return this.capacity;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void grow() {
|
|
||||||
int newCapacity = this.capacity * 2;
|
|
||||||
Entry[] newEntries = new Entry[newCapacity];
|
|
||||||
Entry[] src = this.entries;
|
|
||||||
|
|
||||||
for (int j = 0; j < src.length; j++) {
|
|
||||||
Entry e = src[j];
|
|
||||||
if (e != null) {
|
|
||||||
src[j] = null;
|
|
||||||
do {
|
|
||||||
Entry next = e.next;
|
|
||||||
int hash = e.hash;
|
|
||||||
int i = indexFor(hash, newCapacity);
|
|
||||||
e.next = newEntries[i];
|
|
||||||
newEntries[i] = e;
|
|
||||||
e = next;
|
|
||||||
} while (e != null);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
this.capacity = newCapacity;
|
|
||||||
this.entries = newEntries;
|
|
||||||
this.threshold = (int) (this.capacity * this.loadFactor);
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Return the mapping, or {@link
|
|
||||||
* LabelToOrdinal#INVALID_ORDINAL} if the label isn't
|
|
||||||
* recognized. */
|
|
||||||
public int get(FacetLabel label, int hash) {
|
|
||||||
int bucketIndex = indexFor(hash, this.capacity);
|
|
||||||
Entry e = this.entries[bucketIndex];
|
|
||||||
|
|
||||||
while (e != null && !(hash == e.hash && CategoryPathUtils.equalsToSerialized(label, labelRepository, e.offset))) {
|
|
||||||
e = e.next;
|
|
||||||
}
|
|
||||||
if (e == null) {
|
|
||||||
return LabelToOrdinal.INVALID_ORDINAL;
|
|
||||||
}
|
|
||||||
|
|
||||||
return e.cid;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Add another mapping. */
|
|
||||||
public int addLabel(FacetLabel label, int hash, int cid) {
|
|
||||||
int bucketIndex = indexFor(hash, this.capacity);
|
|
||||||
for (Entry e = this.entries[bucketIndex]; e != null; e = e.next) {
|
|
||||||
if (e.hash == hash && CategoryPathUtils.equalsToSerialized(label, labelRepository, e.offset)) {
|
|
||||||
return e.cid;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// new string; add to label repository
|
|
||||||
int offset = labelRepository.length();
|
|
||||||
CategoryPathUtils.serialize(label, labelRepository);
|
|
||||||
addEntry(offset, cid, hash, bucketIndex);
|
|
||||||
return cid;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This method does not check if the same value is already in the map because
|
|
||||||
* we pass in an char-array offset, so so we now that we're in resize-mode
|
|
||||||
* here.
|
|
||||||
*/
|
|
||||||
public void addLabelOffset(int hash, int offset, int cid) {
|
|
||||||
int bucketIndex = indexFor(hash, this.capacity);
|
|
||||||
addEntry(offset, cid, hash, bucketIndex);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void addEntry(int offset, int cid, int hash, int bucketIndex) {
|
|
||||||
Entry e = this.entries[bucketIndex];
|
|
||||||
this.entries[bucketIndex] = new Entry(offset, cid, hash, e);
|
|
||||||
if (this.size++ >= this.threshold) {
|
|
||||||
grow();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Iterator<CollisionMap.Entry> entryIterator() {
|
|
||||||
return new EntryIterator(entries, size);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns index for hash code h.
|
|
||||||
*/
|
|
||||||
static int indexFor(int h, int length) {
|
|
||||||
return h & (length - 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns an estimate of the memory usage of this CollisionMap.
|
|
||||||
* @return The approximate number of bytes used by this structure.
|
|
||||||
*/
|
|
||||||
int getMemoryUsage() {
|
|
||||||
int memoryUsage = 0;
|
|
||||||
if (this.entries != null) {
|
|
||||||
for (Entry e : this.entries) {
|
|
||||||
if (e != null) {
|
|
||||||
memoryUsage += (4 * 4);
|
|
||||||
for (Entry ee = e.next; ee != null; ee = ee.next) {
|
|
||||||
memoryUsage += (4 * 4);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return memoryUsage;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static class EntryIterator implements Iterator<Entry> {
|
|
||||||
Entry next; // next entry to return
|
|
||||||
int index; // current slot
|
|
||||||
Entry[] ents;
|
|
||||||
|
|
||||||
EntryIterator(Entry[] entries, int size) {
|
|
||||||
this.ents = entries;
|
|
||||||
Entry[] t = entries;
|
|
||||||
int i = t.length;
|
|
||||||
Entry n = null;
|
|
||||||
if (size != 0) { // advance to first entry
|
|
||||||
while (i > 0 && (n = t[--i]) == null) {
|
|
||||||
// advance
|
|
||||||
}
|
|
||||||
}
|
|
||||||
this.next = n;
|
|
||||||
this.index = i;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean hasNext() {
|
|
||||||
return this.next != null;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Entry next() {
|
|
||||||
Entry e = this.next;
|
|
||||||
if (e == null) throw new NoSuchElementException();
|
|
||||||
|
|
||||||
Entry n = e.next;
|
|
||||||
Entry[] t = ents;
|
|
||||||
int i = this.index;
|
|
||||||
while (n == null && i > 0) {
|
|
||||||
n = t[--i];
|
|
||||||
}
|
|
||||||
this.index = i;
|
|
||||||
this.next = n;
|
|
||||||
return e;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void remove() {
|
|
||||||
throw new UnsupportedOperationException();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,467 +0,0 @@
|
||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
package org.apache.lucene.facet.taxonomy.writercache;
|
|
||||||
|
|
||||||
import java.io.BufferedInputStream;
|
|
||||||
import java.io.BufferedOutputStream;
|
|
||||||
import java.io.DataInputStream;
|
|
||||||
import java.io.DataOutputStream;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.OutputStream;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.util.Iterator;
|
|
||||||
|
|
||||||
import org.apache.lucene.facet.taxonomy.FacetLabel;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This is a very efficient LabelToOrdinal implementation that uses a
|
|
||||||
* CharBlockArray to store all labels and a configurable number of HashArrays to
|
|
||||||
* reference the labels.
|
|
||||||
* <p>
|
|
||||||
* Since the HashArrays don't handle collisions, a {@link CollisionMap} is used
|
|
||||||
* to store the colliding labels.
|
|
||||||
* <p>
|
|
||||||
* This data structure grows by adding a new HashArray whenever the number of
|
|
||||||
* collisions in the {@link CollisionMap} exceeds {@code loadFactor} *
|
|
||||||
* {@link #getMaxOrdinal()}. Growing also includes reinserting all colliding
|
|
||||||
* labels into the HashArrays to possibly reduce the number of collisions.
|
|
||||||
*
|
|
||||||
* For setting the {@code loadFactor} see
|
|
||||||
* {@link #CompactLabelToOrdinal(int, float, int)}.
|
|
||||||
*
|
|
||||||
* <p>
|
|
||||||
* This data structure has a much lower memory footprint (~30%) compared to a
|
|
||||||
 * Java {@code HashMap<String, Integer>}. It also only uses a small fraction of objects
|
|
||||||
* a HashMap would use, thus limiting the GC overhead. Ingestion speed was also
|
|
||||||
* ~50% faster compared to a HashMap for 3M unique labels.
|
|
||||||
*
|
|
||||||
* @lucene.experimental
|
|
||||||
*/
|
|
||||||
public class CompactLabelToOrdinal extends LabelToOrdinal {
|
|
||||||
|
|
||||||
/** Default maximum load factor. */
|
|
||||||
public static final float DefaultLoadFactor = 0.15f;
|
|
||||||
|
|
||||||
static final char TERMINATOR_CHAR = 0xffff;
|
|
||||||
private static final int COLLISION = -5;
|
|
||||||
|
|
||||||
private HashArray[] hashArrays;
|
|
||||||
private CollisionMap collisionMap;
|
|
||||||
private CharBlockArray labelRepository;
|
|
||||||
|
|
||||||
private int capacity;
|
|
||||||
private int threshold;
|
|
||||||
private float loadFactor;
|
|
||||||
|
|
||||||
/** How many labels. */
|
|
||||||
public int sizeOfMap() {
|
|
||||||
return this.collisionMap.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
private CompactLabelToOrdinal() {
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Sole constructor. */
|
|
||||||
public CompactLabelToOrdinal(int initialCapacity, float loadFactor,
|
|
||||||
int numHashArrays) {
|
|
||||||
|
|
||||||
this.hashArrays = new HashArray[numHashArrays];
|
|
||||||
|
|
||||||
this.capacity = determineCapacity((int) Math.pow(2, numHashArrays),
|
|
||||||
initialCapacity);
|
|
||||||
init();
|
|
||||||
this.collisionMap = new CollisionMap(this.labelRepository);
|
|
||||||
|
|
||||||
this.counter = 0;
|
|
||||||
this.loadFactor = loadFactor;
|
|
||||||
|
|
||||||
this.threshold = (int) (this.loadFactor * this.capacity);
|
|
||||||
}
|
|
||||||
|
|
||||||
static int determineCapacity(int minCapacity, int initialCapacity) {
|
|
||||||
int capacity = minCapacity;
|
|
||||||
while (capacity < initialCapacity) {
|
|
||||||
capacity <<= 1;
|
|
||||||
}
|
|
||||||
return capacity;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void init() {
|
|
||||||
labelRepository = new CharBlockArray();
|
|
||||||
CategoryPathUtils.serialize(new FacetLabel(), labelRepository);
|
|
||||||
|
|
||||||
int c = this.capacity;
|
|
||||||
for (int i = 0; i < this.hashArrays.length; i++) {
|
|
||||||
this.hashArrays[i] = new HashArray(c);
|
|
||||||
c /= 2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void addLabel(FacetLabel label, int ordinal) {
|
|
||||||
if (collisionMap.size() > threshold) {
|
|
||||||
grow();
|
|
||||||
}
|
|
||||||
|
|
||||||
int hash = CompactLabelToOrdinal.stringHashCode(label);
|
|
||||||
for (int i = 0; i < this.hashArrays.length; i++) {
|
|
||||||
if (addLabel(this.hashArrays[i], label, hash, ordinal)) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int prevVal = collisionMap.addLabel(label, hash, ordinal);
|
|
||||||
if (prevVal != ordinal) {
|
|
||||||
throw new IllegalArgumentException("Label already exists: " + label + " prev ordinal " + prevVal);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int getOrdinal(FacetLabel label) {
|
|
||||||
if (label == null) {
|
|
||||||
return LabelToOrdinal.INVALID_ORDINAL;
|
|
||||||
}
|
|
||||||
|
|
||||||
int hash = CompactLabelToOrdinal.stringHashCode(label);
|
|
||||||
for (int i = 0; i < this.hashArrays.length; i++) {
|
|
||||||
int ord = getOrdinal(this.hashArrays[i], label, hash);
|
|
||||||
if (ord != COLLISION) {
|
|
||||||
return ord;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return this.collisionMap.get(label, hash);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void grow() {
|
|
||||||
HashArray temp = this.hashArrays[this.hashArrays.length - 1];
|
|
||||||
|
|
||||||
for (int i = this.hashArrays.length - 1; i > 0; i--) {
|
|
||||||
this.hashArrays[i] = this.hashArrays[i - 1];
|
|
||||||
}
|
|
||||||
|
|
||||||
this.capacity *= 2;
|
|
||||||
this.hashArrays[0] = new HashArray(this.capacity);
|
|
||||||
|
|
||||||
for (int i = 1; i < this.hashArrays.length; i++) {
|
|
||||||
int[] sourceOffsetArray = this.hashArrays[i].offsets;
|
|
||||||
int[] sourceCidsArray = this.hashArrays[i].cids;
|
|
||||||
|
|
||||||
for (int k = 0; k < sourceOffsetArray.length; k++) {
|
|
||||||
|
|
||||||
for (int j = 0; j < i && sourceOffsetArray[k] != 0; j++) {
|
|
||||||
int[] targetOffsetArray = this.hashArrays[j].offsets;
|
|
||||||
int[] targetCidsArray = this.hashArrays[j].cids;
|
|
||||||
|
|
||||||
int newIndex = indexFor(stringHashCode(
|
|
||||||
this.labelRepository, sourceOffsetArray[k]),
|
|
||||||
targetOffsetArray.length);
|
|
||||||
if (targetOffsetArray[newIndex] == 0) {
|
|
||||||
targetOffsetArray[newIndex] = sourceOffsetArray[k];
|
|
||||||
targetCidsArray[newIndex] = sourceCidsArray[k];
|
|
||||||
sourceOffsetArray[k] = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < temp.offsets.length; i++) {
|
|
||||||
int offset = temp.offsets[i];
|
|
||||||
if (offset > 0) {
|
|
||||||
int hash = stringHashCode(this.labelRepository, offset);
|
|
||||||
addLabelOffset(hash, temp.cids[i], offset);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
CollisionMap oldCollisionMap = this.collisionMap;
|
|
||||||
this.collisionMap = new CollisionMap(oldCollisionMap.capacity(),
|
|
||||||
this.labelRepository);
|
|
||||||
this.threshold = (int) (this.capacity * this.loadFactor);
|
|
||||||
|
|
||||||
Iterator<CollisionMap.Entry> it = oldCollisionMap.entryIterator();
|
|
||||||
while (it.hasNext()) {
|
|
||||||
CollisionMap.Entry e = it.next();
|
|
||||||
addLabelOffset(stringHashCode(this.labelRepository, e.offset),
|
|
||||||
e.cid, e.offset);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean addLabel(HashArray a, FacetLabel label, int hash, int ordinal) {
|
|
||||||
int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length);
|
|
||||||
int offset = a.offsets[index];
|
|
||||||
|
|
||||||
if (offset == 0) {
|
|
||||||
a.offsets[index] = this.labelRepository.length();
|
|
||||||
CategoryPathUtils.serialize(label, labelRepository);
|
|
||||||
a.cids[index] = ordinal;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void addLabelOffset(int hash, int cid, int knownOffset) {
|
|
||||||
for (int i = 0; i < this.hashArrays.length; i++) {
|
|
||||||
if (addLabelOffsetToHashArray(this.hashArrays[i], hash, cid,
|
|
||||||
knownOffset)) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
this.collisionMap.addLabelOffset(hash, knownOffset, cid);
|
|
||||||
|
|
||||||
if (this.collisionMap.size() > this.threshold) {
|
|
||||||
grow();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean addLabelOffsetToHashArray(HashArray a, int hash, int ordinal,
|
|
||||||
int knownOffset) {
|
|
||||||
|
|
||||||
int index = CompactLabelToOrdinal.indexFor(hash, a.offsets.length);
|
|
||||||
int offset = a.offsets[index];
|
|
||||||
|
|
||||||
if (offset == 0) {
|
|
||||||
a.offsets[index] = knownOffset;
|
|
||||||
a.cids[index] = ordinal;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
private int getOrdinal(HashArray a, FacetLabel label, int hash) {
|
|
||||||
if (label == null) {
|
|
||||||
return LabelToOrdinal.INVALID_ORDINAL;
|
|
||||||
}
|
|
||||||
|
|
||||||
int index = indexFor(hash, a.offsets.length);
|
|
||||||
int offset = a.offsets[index];
|
|
||||||
if (offset == 0) {
|
|
||||||
return LabelToOrdinal.INVALID_ORDINAL;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (CategoryPathUtils.equalsToSerialized(label, labelRepository, offset)) {
|
|
||||||
return a.cids[index];
|
|
||||||
}
|
|
||||||
|
|
||||||
return COLLISION;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Returns index for hash code h. */
|
|
||||||
static int indexFor(int h, int length) {
|
|
||||||
return h & (length - 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// static int stringHashCode(String label) {
|
|
||||||
// int len = label.length();
|
|
||||||
// int hash = 0;
|
|
||||||
// int i;
|
|
||||||
// for (i = 0; i < len; ++i)
|
|
||||||
// hash = 33 * hash + label.charAt(i);
|
|
||||||
//
|
|
||||||
// hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
|
|
||||||
// hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
|
|
||||||
//
|
|
||||||
// return hash;
|
|
||||||
//
|
|
||||||
// }
|
|
||||||
|
|
||||||
static int stringHashCode(FacetLabel label) {
|
|
||||||
int hash = label.hashCode();
|
|
||||||
|
|
||||||
hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
|
|
||||||
hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
|
|
||||||
|
|
||||||
return hash;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
static int stringHashCode(CharBlockArray labelRepository, int offset) {
|
|
||||||
int hash = CategoryPathUtils.hashCodeOfSerialized(labelRepository, offset);
|
|
||||||
hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
|
|
||||||
hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
|
|
||||||
return hash;
|
|
||||||
}
|
|
||||||
|
|
||||||
// public static boolean equals(CharSequence label, CharBlockArray array,
|
|
||||||
// int offset) {
|
|
||||||
// // CONTINUE HERE
|
|
||||||
// int len = label.length();
|
|
||||||
// int bi = array.blockIndex(offset);
|
|
||||||
// CharBlockArray.Block b = array.blocks.get(bi);
|
|
||||||
// int index = array.indexInBlock(offset);
|
|
||||||
//
|
|
||||||
// for (int i = 0; i < len; i++) {
|
|
||||||
// if (label.charAt(i) != b.chars[index]) {
|
|
||||||
// return false;
|
|
||||||
// }
|
|
||||||
// index++;
|
|
||||||
// if (index == b.length) {
|
|
||||||
// b = array.blocks.get(++bi);
|
|
||||||
// index = 0;
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// return b.chars[index] == TerminatorChar;
|
|
||||||
// }
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns an estimate of the amount of memory used by this table. Called only in
|
|
||||||
* this package. Memory is consumed mainly by three structures: the hash arrays,
|
|
||||||
* label repository and collision map.
|
|
||||||
*/
|
|
||||||
int getMemoryUsage() {
|
|
||||||
int memoryUsage = 0;
|
|
||||||
if (this.hashArrays != null) {
|
|
||||||
// HashArray capacity is instance-specific.
|
|
||||||
for (HashArray ha : this.hashArrays) {
|
|
||||||
// Each has 2 capacity-length arrays of ints.
|
|
||||||
memoryUsage += ( ha.capacity * 2 * 4 ) + 4;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (this.labelRepository != null) {
|
|
||||||
// All blocks are the same size.
|
|
||||||
int blockSize = this.labelRepository.blockSize;
|
|
||||||
// Each block has room for blockSize UTF-16 chars.
|
|
||||||
int actualBlockSize = ( blockSize * 2 ) + 4;
|
|
||||||
memoryUsage += this.labelRepository.blocks.size() * actualBlockSize;
|
|
||||||
memoryUsage += 8; // Two int values for array as a whole.
|
|
||||||
}
|
|
||||||
if (this.collisionMap != null) {
|
|
||||||
memoryUsage += this.collisionMap.getMemoryUsage();
|
|
||||||
}
|
|
||||||
return memoryUsage;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Opens the file and reloads the CompactLabelToOrdinal. The file it expects
|
|
||||||
* is generated from the {@link #flush(Path)} command.
|
|
||||||
*/
|
|
||||||
static CompactLabelToOrdinal open(Path file, float loadFactor,
|
|
||||||
int numHashArrays) throws IOException {
|
|
||||||
/**
|
|
||||||
* Part of the file is the labelRepository, which needs to be rehashed
|
|
||||||
* and label offsets re-added to the object. I am unsure as to why we
|
|
||||||
* can't just store these off in the file as well, but in keeping with
|
|
||||||
* the spirit of the original code, I did it this way. (ssuppe)
|
|
||||||
*/
|
|
||||||
CompactLabelToOrdinal l2o = new CompactLabelToOrdinal();
|
|
||||||
l2o.loadFactor = loadFactor;
|
|
||||||
l2o.hashArrays = new HashArray[numHashArrays];
|
|
||||||
|
|
||||||
DataInputStream dis = null;
|
|
||||||
try {
|
|
||||||
dis = new DataInputStream(new BufferedInputStream(
|
|
||||||
Files.newInputStream(file)));
|
|
||||||
|
|
||||||
// TaxiReader needs to load the "counter" or occupancy (L2O) to know
|
|
||||||
// the next unique facet. we used to load the delimiter too, but
|
|
||||||
// never used it.
|
|
||||||
l2o.counter = dis.readInt();
|
|
||||||
|
|
||||||
l2o.capacity = determineCapacity((int) Math.pow(2,
|
|
||||||
l2o.hashArrays.length), l2o.counter);
|
|
||||||
l2o.init();
|
|
||||||
|
|
||||||
// now read the chars
|
|
||||||
l2o.labelRepository = CharBlockArray.open(dis);
|
|
||||||
|
|
||||||
l2o.collisionMap = new CollisionMap(l2o.labelRepository);
|
|
||||||
|
|
||||||
// Calculate hash on the fly based on how CategoryPath hashes
|
|
||||||
// itself. Maybe in the future we can call some static based methods
|
|
||||||
// in CategoryPath so that this doesn't break again? I don't like
|
|
||||||
// having code in two different places...
|
|
||||||
int cid = 0;
|
|
||||||
// Skip the initial offset, it's the CategoryPath(0,0), which isn't
|
|
||||||
// a hashed value.
|
|
||||||
int offset = 1;
|
|
||||||
int lastStartOffset = offset;
|
|
||||||
// This loop really relies on a well-formed input (assumes pretty blindly
|
|
||||||
// that array offsets will work). Since the initial file is machine
|
|
||||||
// generated, I think this should be OK.
|
|
||||||
while (offset < l2o.labelRepository.length()) {
|
|
||||||
// identical code to CategoryPath.hashFromSerialized. since we need to
|
|
||||||
// advance offset, we cannot call the method directly. perhaps if we
|
|
||||||
// could pass a mutable Integer or something...
|
|
||||||
int length = (short) l2o.labelRepository.charAt(offset++);
|
|
||||||
int hash = length;
|
|
||||||
if (length != 0) {
|
|
||||||
for (int i = 0; i < length; i++) {
|
|
||||||
int len = (short) l2o.labelRepository.charAt(offset++);
|
|
||||||
hash = hash * 31 + l2o.labelRepository.subSequence(offset, offset + len).hashCode();
|
|
||||||
offset += len;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Now that we've hashed the components of the label, do the
|
|
||||||
// final part of the hash algorithm.
|
|
||||||
hash = hash ^ ((hash >>> 20) ^ (hash >>> 12));
|
|
||||||
hash = hash ^ (hash >>> 7) ^ (hash >>> 4);
|
|
||||||
// Add the label, and let's keep going
|
|
||||||
l2o.addLabelOffset(hash, cid, lastStartOffset);
|
|
||||||
cid++;
|
|
||||||
lastStartOffset = offset;
|
|
||||||
}
|
|
||||||
|
|
||||||
} catch (ClassNotFoundException cnfe) {
|
|
||||||
throw new IOException("Invalid file format. Cannot deserialize.");
|
|
||||||
} finally {
|
|
||||||
if (dis != null) {
|
|
||||||
dis.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
l2o.threshold = (int) (l2o.loadFactor * l2o.capacity);
|
|
||||||
return l2o;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
void flush(Path file) throws IOException {
|
|
||||||
OutputStream fos = Files.newOutputStream(file);
|
|
||||||
|
|
||||||
try {
|
|
||||||
BufferedOutputStream os = new BufferedOutputStream(fos);
|
|
||||||
|
|
||||||
DataOutputStream dos = new DataOutputStream(os);
|
|
||||||
dos.writeInt(this.counter);
|
|
||||||
|
|
||||||
// write the labelRepository
|
|
||||||
this.labelRepository.flush(dos);
|
|
||||||
|
|
||||||
// Closes the data output stream
|
|
||||||
dos.close();
|
|
||||||
|
|
||||||
} finally {
|
|
||||||
fos.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static final class HashArray {
|
|
||||||
int[] offsets;
|
|
||||||
int[] cids;
|
|
||||||
|
|
||||||
int capacity;
|
|
||||||
|
|
||||||
HashArray(int c) {
|
|
||||||
this.capacity = c;
|
|
||||||
this.offsets = new int[this.capacity];
|
|
||||||
this.cids = new int[this.capacity];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -0,0 +1,158 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.lucene.facet.taxonomy.writercache;
|
||||||
|
|
||||||
|
import org.apache.lucene.facet.taxonomy.FacetLabel;
|
||||||
|
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
|
||||||
|
import org.apache.lucene.util.Accountable;
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
import org.apache.lucene.util.ByteBlockPool.DirectTrackingAllocator;
|
||||||
|
import org.apache.lucene.util.ByteBlockPool;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
import org.apache.lucene.util.BytesRefBuilder;
|
||||||
|
import org.apache.lucene.util.BytesRefHash;
|
||||||
|
import org.apache.lucene.util.Counter;
|
||||||
|
import org.apache.lucene.util.RamUsageEstimator;
|
||||||
|
import org.apache.lucene.util.UnicodeUtil;
|
||||||
|
|
||||||
|
/** A "cache" that never frees memory, and stores labels in a BytesRefHash (utf-8 encoding). */
|
||||||
|
public final class UTF8TaxonomyWriterCache implements TaxonomyWriterCache, Accountable {
|
||||||
|
private final ThreadLocal<BytesRefBuilder> bytes = new ThreadLocal<BytesRefBuilder>() {
|
||||||
|
@Override
|
||||||
|
protected BytesRefBuilder initialValue() {
|
||||||
|
return new BytesRefBuilder();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
private final Counter bytesUsed = Counter.newCounter();
|
||||||
|
private final BytesRefHash map = new BytesRefHash(new ByteBlockPool(new DirectTrackingAllocator(bytesUsed)));
|
||||||
|
|
||||||
|
private final static int ORDINALS_PAGE_SIZE = 65536;
|
||||||
|
private final static int ORDINALS_PAGE_MASK = ORDINALS_PAGE_SIZE - 1;
|
||||||
|
|
||||||
|
private volatile int[][] ordinals;
|
||||||
|
|
||||||
|
// How many labels we are storing:
|
||||||
|
private int count;
|
||||||
|
|
||||||
|
// How many pages in ordinals we've allocated:
|
||||||
|
private int pageCount;
|
||||||
|
|
||||||
|
/** Sole constructor. */
|
||||||
|
public UTF8TaxonomyWriterCache() {
|
||||||
|
ordinals = new int[1][];
|
||||||
|
ordinals[0] = new int[ORDINALS_PAGE_SIZE];
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int get(FacetLabel label) {
|
||||||
|
BytesRef bytes = toBytes(label);
|
||||||
|
int id;
|
||||||
|
synchronized (this) {
|
||||||
|
id = map.find(bytes);
|
||||||
|
}
|
||||||
|
if (id == -1) {
|
||||||
|
return LabelToOrdinal.INVALID_ORDINAL;
|
||||||
|
}
|
||||||
|
int page = id / ORDINALS_PAGE_SIZE;
|
||||||
|
int offset = id % ORDINALS_PAGE_MASK;
|
||||||
|
return ordinals[page][offset];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Called only from assert
|
||||||
|
private boolean assertSameOrdinal(FacetLabel label, int id, int ord) {
|
||||||
|
id = -id - 1;
|
||||||
|
int page = id / ORDINALS_PAGE_SIZE;
|
||||||
|
int offset = id % ORDINALS_PAGE_MASK;
|
||||||
|
int oldOrd = ordinals[page][offset];
|
||||||
|
if (oldOrd != ord) {
|
||||||
|
throw new IllegalArgumentException("label " + label + " was already cached, with old ord=" + oldOrd + " versus new ord=" + ord);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean put(FacetLabel label, int ord) {
|
||||||
|
BytesRef bytes = toBytes(label);
|
||||||
|
int id;
|
||||||
|
synchronized (this) {
|
||||||
|
id = map.add(bytes);
|
||||||
|
if (id < 0) {
|
||||||
|
assert assertSameOrdinal(label, id, ord);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
assert id == count;
|
||||||
|
int page = id / ORDINALS_PAGE_SIZE;
|
||||||
|
int offset = id % ORDINALS_PAGE_MASK;
|
||||||
|
if (page == pageCount) {
|
||||||
|
if (page == ordinals.length) {
|
||||||
|
int[][] newOrdinals = new int[ArrayUtil.oversize(page+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)][];
|
||||||
|
System.arraycopy(ordinals, 0, newOrdinals, 0, ordinals.length);
|
||||||
|
ordinals = newOrdinals;
|
||||||
|
}
|
||||||
|
ordinals[page] = new int[ORDINALS_PAGE_MASK];
|
||||||
|
pageCount++;
|
||||||
|
}
|
||||||
|
ordinals[page][offset] = ord;
|
||||||
|
count++;
|
||||||
|
|
||||||
|
// we never prune from the cache
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isFull() {
|
||||||
|
// we are never full
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized void clear() {
|
||||||
|
map.clear();
|
||||||
|
map.reinit();
|
||||||
|
ordinals = new int[1][];
|
||||||
|
ordinals[0] = new int[ORDINALS_PAGE_SIZE];
|
||||||
|
count = 0;
|
||||||
|
pageCount = 0;
|
||||||
|
assert bytesUsed.get() == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized long ramBytesUsed() {
|
||||||
|
return bytesUsed.get() + pageCount * ORDINALS_PAGE_SIZE * RamUsageEstimator.NUM_BYTES_INT;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() {
|
||||||
|
}
|
||||||
|
|
||||||
|
private static final byte DELIM_CHAR = (byte) 0x1F;
|
||||||
|
|
||||||
|
private BytesRef toBytes(FacetLabel label) {
|
||||||
|
BytesRefBuilder bytes = this.bytes.get();
|
||||||
|
bytes.clear();
|
||||||
|
for (int i = 0; i < label.length; i++) {
|
||||||
|
String part = label.components[i];
|
||||||
|
if (i > 0) {
|
||||||
|
bytes.append(DELIM_CHAR);
|
||||||
|
}
|
||||||
|
bytes.grow(bytes.length() + UnicodeUtil.maxUTF8Length(part.length()));
|
||||||
|
bytes.setLength(UnicodeUtil.UTF16toUTF8(part, 0, part.length(), bytes.bytes(), bytes.length()));
|
||||||
|
}
|
||||||
|
return bytes.get();
|
||||||
|
}
|
||||||
|
}
|
|
@ -26,9 +26,9 @@ import org.apache.lucene.facet.FacetField;
|
||||||
import org.apache.lucene.facet.FacetTestCase;
|
import org.apache.lucene.facet.FacetTestCase;
|
||||||
import org.apache.lucene.facet.FacetsConfig;
|
import org.apache.lucene.facet.FacetsConfig;
|
||||||
import org.apache.lucene.facet.taxonomy.FacetLabel;
|
import org.apache.lucene.facet.taxonomy.FacetLabel;
|
||||||
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
|
|
||||||
import org.apache.lucene.facet.taxonomy.writercache.Cl2oTaxonomyWriterCache;
|
|
||||||
import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
|
import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
|
||||||
|
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
|
||||||
|
import org.apache.lucene.facet.taxonomy.writercache.UTF8TaxonomyWriterCache;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
|
@ -66,13 +66,13 @@ public class TestConcurrentFacetedIndexing extends FacetTestCase {
|
||||||
final double d = random().nextDouble();
|
final double d = random().nextDouble();
|
||||||
if (d < 0.7) {
|
if (d < 0.7) {
|
||||||
// this is the fastest, yet most memory consuming
|
// this is the fastest, yet most memory consuming
|
||||||
return new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
|
return new UTF8TaxonomyWriterCache();
|
||||||
} else if (TEST_NIGHTLY && d > 0.98) {
|
} else if (TEST_NIGHTLY && d > 0.98) {
|
||||||
// this is the slowest, but tests the writer concurrency when no caching is done.
|
// this is the slowest, but tests the writer concurrency when no caching is done.
|
||||||
// only pick it during NIGHTLY tests, and even then, with very low chances.
|
// only pick it during NIGHTLY tests, and even then, with very low chances.
|
||||||
return NO_OP_CACHE;
|
return NO_OP_CACHE;
|
||||||
} else {
|
} else {
|
||||||
// this is slower than CL2O, but less memory consuming, and exercises finding categories on disk too.
|
// this is slower than UTF8, but less memory consuming, and exercises finding categories on disk too.
|
||||||
return new LruTaxonomyWriterCache(ndocs / 10);
|
return new LruTaxonomyWriterCache(ndocs / 10);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,16 +25,16 @@ import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.MockAnalyzer;
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
|
import org.apache.lucene.facet.DrillDownQuery;
|
||||||
import org.apache.lucene.facet.FacetField;
|
import org.apache.lucene.facet.FacetField;
|
||||||
import org.apache.lucene.facet.FacetTestCase;
|
import org.apache.lucene.facet.FacetTestCase;
|
||||||
import org.apache.lucene.facet.FacetsConfig;
|
import org.apache.lucene.facet.FacetsConfig;
|
||||||
import org.apache.lucene.facet.DrillDownQuery;
|
|
||||||
import org.apache.lucene.facet.taxonomy.FacetLabel;
|
import org.apache.lucene.facet.taxonomy.FacetLabel;
|
||||||
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
||||||
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
|
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter.MemoryOrdinalMap;
|
||||||
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
|
|
||||||
import org.apache.lucene.facet.taxonomy.writercache.Cl2oTaxonomyWriterCache;
|
|
||||||
import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
|
import org.apache.lucene.facet.taxonomy.writercache.LruTaxonomyWriterCache;
|
||||||
|
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
|
||||||
|
import org.apache.lucene.facet.taxonomy.writercache.UTF8TaxonomyWriterCache;
|
||||||
import org.apache.lucene.index.DirectoryReader;
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
|
@ -252,13 +252,13 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
|
||||||
final TaxonomyWriterCache cache;
|
final TaxonomyWriterCache cache;
|
||||||
if (d < 0.7) {
|
if (d < 0.7) {
|
||||||
// this is the fastest, yet most memory consuming
|
// this is the fastest, yet most memory consuming
|
||||||
cache = new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
|
cache = new UTF8TaxonomyWriterCache();
|
||||||
} else if (TEST_NIGHTLY && d > 0.98) {
|
} else if (TEST_NIGHTLY && d > 0.98) {
|
||||||
// this is the slowest, but tests the writer concurrency when no caching is done.
|
// this is the slowest, but tests the writer concurrency when no caching is done.
|
||||||
// only pick it during NIGHTLY tests, and even then, with very low chances.
|
// only pick it during NIGHTLY tests, and even then, with very low chances.
|
||||||
cache = NO_OP_CACHE;
|
cache = NO_OP_CACHE;
|
||||||
} else {
|
} else {
|
||||||
// this is slower than CL2O, but less memory consuming, and exercises finding categories on disk too.
|
// this is slower than UTF8, but less memory consuming, and exercises finding categories on disk too.
|
||||||
cache = new LruTaxonomyWriterCache(ncats / 10);
|
cache = new LruTaxonomyWriterCache(ncats / 10);
|
||||||
}
|
}
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
|
@ -441,7 +441,7 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
|
||||||
public void testHugeLabel() throws Exception {
|
public void testHugeLabel() throws Exception {
|
||||||
Directory indexDir = newDirectory(), taxoDir = newDirectory();
|
Directory indexDir = newDirectory(), taxoDir = newDirectory();
|
||||||
IndexWriter indexWriter = new IndexWriter(indexDir, newIndexWriterConfig(new MockAnalyzer(random())));
|
IndexWriter indexWriter = new IndexWriter(indexDir, newIndexWriterConfig(new MockAnalyzer(random())));
|
||||||
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE, new Cl2oTaxonomyWriterCache(2, 1f, 1));
|
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir, OpenMode.CREATE, new UTF8TaxonomyWriterCache());
|
||||||
FacetsConfig config = new FacetsConfig();
|
FacetsConfig config = new FacetsConfig();
|
||||||
|
|
||||||
// Add one huge label:
|
// Add one huge label:
|
||||||
|
|
|
@ -17,69 +17,54 @@
|
||||||
|
|
||||||
package org.apache.lucene.facet.taxonomy.writercache;
|
package org.apache.lucene.facet.taxonomy.writercache;
|
||||||
|
|
||||||
import java.nio.ByteBuffer;
|
|
||||||
import java.nio.charset.CharsetDecoder;
|
|
||||||
import java.nio.charset.CodingErrorAction;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.HashSet;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.lucene.facet.FacetTestCase;
|
import org.apache.lucene.facet.FacetTestCase;
|
||||||
import org.apache.lucene.facet.taxonomy.FacetLabel;
|
import org.apache.lucene.facet.taxonomy.FacetLabel;
|
||||||
import org.junit.Test;
|
import org.apache.lucene.util.TestUtil;
|
||||||
|
|
||||||
public class TestCompactLabelToOrdinal extends FacetTestCase {
|
public class TestUTF8TaxonomyWriterCache extends FacetTestCase {
|
||||||
|
|
||||||
@Test
|
public void testRandom() throws Exception {
|
||||||
public void testL2O() throws Exception {
|
|
||||||
LabelToOrdinal map = new LabelToOrdinalMap();
|
LabelToOrdinal map = new LabelToOrdinalMap();
|
||||||
|
|
||||||
CompactLabelToOrdinal compact = new CompactLabelToOrdinal(2000000, 0.15f, 3);
|
UTF8TaxonomyWriterCache cache = new UTF8TaxonomyWriterCache();
|
||||||
|
|
||||||
final int n = atLeast(10 * 1000);
|
final int n = atLeast(10 * 1000);
|
||||||
final int numUniqueValues = 50 * 1000;
|
final int numUniqueValues = 50 * 1000;
|
||||||
|
|
||||||
String[] uniqueValues = new String[numUniqueValues];
|
|
||||||
byte[] buffer = new byte[50];
|
byte[] buffer = new byte[50];
|
||||||
|
|
||||||
Random random = random();
|
Random random = random();
|
||||||
for (int i = 0; i < numUniqueValues;) {
|
Set<String> uniqueValuesSet = new HashSet<>();
|
||||||
random.nextBytes(buffer);
|
while (uniqueValuesSet.size() < numUniqueValues) {
|
||||||
int size = 1 + random.nextInt(buffer.length);
|
int numParts = TestUtil.nextInt(random(), 1, 5);
|
||||||
|
StringBuilder b = new StringBuilder();
|
||||||
// This test is turning random bytes into a string,
|
for (int i=0;i<numParts;i++) {
|
||||||
// this is asking for trouble.
|
String part = null;
|
||||||
CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
|
while (true) {
|
||||||
.onUnmappableCharacter(CodingErrorAction.REPLACE)
|
part = TestUtil.randomRealisticUnicodeString(random(), 16);
|
||||||
.onMalformedInput(CodingErrorAction.REPLACE);
|
part = part.replace("/", "");
|
||||||
uniqueValues[i] = decoder.decode(ByteBuffer.wrap(buffer, 0, size)).toString();
|
if (part.length() > 0) {
|
||||||
// we cannot have empty path components, so eliminate all prefix as well
|
break;
|
||||||
// as middle consecutive delimiter chars.
|
|
||||||
uniqueValues[i] = uniqueValues[i].replaceAll("/+", "/");
|
|
||||||
if (uniqueValues[i].startsWith("/")) {
|
|
||||||
uniqueValues[i] = uniqueValues[i].substring(1);
|
|
||||||
}
|
|
||||||
if (uniqueValues[i].indexOf(CompactLabelToOrdinal.TERMINATOR_CHAR) == -1) {
|
|
||||||
i++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Path tmpDir = createTempDir("testLableToOrdinal");
|
if (i > 0) {
|
||||||
Path f = tmpDir.resolve("CompactLabelToOrdinalTest.tmp");
|
b.append('/');
|
||||||
int flushInterval = 10;
|
}
|
||||||
|
b.append(part);
|
||||||
|
}
|
||||||
|
uniqueValuesSet.add(b.toString());
|
||||||
|
}
|
||||||
|
String[] uniqueValues = uniqueValuesSet.toArray(new String[0]);
|
||||||
|
|
||||||
|
int ordUpto = 0;
|
||||||
for (int i = 0; i < n; i++) {
|
for (int i = 0; i < n; i++) {
|
||||||
if (i > 0 && i % flushInterval == 0) {
|
|
||||||
compact.flush(f);
|
|
||||||
compact = CompactLabelToOrdinal.open(f, 0.15f, 3);
|
|
||||||
Files.delete(f);
|
|
||||||
if (flushInterval < (n / 10)) {
|
|
||||||
flushInterval *= 10;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int index = random.nextInt(numUniqueValues);
|
int index = random.nextInt(numUniqueValues);
|
||||||
FacetLabel label;
|
FacetLabel label;
|
||||||
|
@ -91,14 +76,14 @@ public class TestCompactLabelToOrdinal extends FacetTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
int ord1 = map.getOrdinal(label);
|
int ord1 = map.getOrdinal(label);
|
||||||
int ord2 = compact.getOrdinal(label);
|
int ord2 = cache.get(label);
|
||||||
|
|
||||||
assertEquals(ord1, ord2);
|
assertEquals(ord1, ord2);
|
||||||
|
|
||||||
if (ord1 == LabelToOrdinal.INVALID_ORDINAL) {
|
if (ord1 == LabelToOrdinal.INVALID_ORDINAL) {
|
||||||
ord1 = compact.getNextOrdinal();
|
ord1 = ordUpto++;
|
||||||
map.addLabel(label, ord1);
|
map.addLabel(label, ord1);
|
||||||
compact.addLabel(label, ord1);
|
cache.put(label, ord1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -111,7 +96,7 @@ public class TestCompactLabelToOrdinal extends FacetTestCase {
|
||||||
label = new FacetLabel(s.split("/"));
|
label = new FacetLabel(s.split("/"));
|
||||||
}
|
}
|
||||||
int ord1 = map.getOrdinal(label);
|
int ord1 = map.getOrdinal(label);
|
||||||
int ord2 = compact.getOrdinal(label);
|
int ord2 = cache.get(label);
|
||||||
assertEquals(ord1, ord2);
|
assertEquals(ord1, ord2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -131,7 +116,5 @@ public class TestCompactLabelToOrdinal extends FacetTestCase {
|
||||||
Integer value = map.get(label);
|
Integer value = map.get(label);
|
||||||
return (value != null) ? value.intValue() : LabelToOrdinal.INVALID_ORDINAL;
|
return (value != null) ? value.intValue() : LabelToOrdinal.INVALID_ORDINAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
Loading…
Reference in New Issue