LUCENE-3441: facets NRT support

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1412149 13f79535-47bb-0310-9956-ffa450edef68
Shai Erera 2012-11-21 15:26:34 +00:00
parent 258baa7069
commit 8b5e57faee
17 changed files with 1019 additions and 933 deletions

View File

@@ -34,6 +34,26 @@ Changes in backwards compatibility policy
Override lengthNorm and/or encode/decodeNormValue to change the specifics,
like Lucene 3.x. (Robert Muir)
* LUCENE-3441: The facet module now supports NRT. As a result, the following
changes were made:
- DirectoryTaxonomyReader has a new constructor which takes a
DirectoryTaxonomyWriter. You should use that constructor in order to get
the NRT support (or the old one for non-NRT).
- TaxonomyReader.refresh() was removed in favor of the static method
  TaxonomyReader.openIfChanged. As with DirectoryReader, the method either returns null
  if no changes were made to the taxonomy, or a new TaxonomyReader instance otherwise.
Instead of calling refresh(), you should write similar code to how you reopen
a regular DirectoryReader.
- TaxonomyReader.openIfChanged (previously refresh()) no longer throws
  InconsistentTaxonomyException, and supports a recreated taxonomy. The
  InconsistentTaxonomyException class was removed.
- ChildrenArrays was pulled out of TaxonomyReader into a top-level class.
- TaxonomyReader was made an abstract class (instead of an interface), with
methods such as close() and reference counting management pulled from
DirectoryTaxonomyReader, and made final. The rest of the methods remained
  abstract.
(Shai Erera, Gilad Barkai)
New Features
* LUCENE-4226: New experimental StoredFieldsFormat that compresses chunks of

View File

@@ -8,8 +8,8 @@ import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.search.results.FacetResultNode;
import org.apache.lucene.facet.search.results.MutableFacetResultNode;
import org.apache.lucene.facet.search.results.IntermediateFacetResult;
import org.apache.lucene.facet.taxonomy.ChildrenArrays;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyReader.ChildrenArrays;
import org.apache.lucene.facet.util.ResultSortUtils;
/*
@@ -120,7 +120,7 @@ public class TopKFacetResultsHandler extends FacetResultsHandler {
* @return total number of descendants considered here by pq, excluding ordinal itself.
*/
private int heapDescendants(int ordinal, Heap<FacetResultNode> pq,
MutableFacetResultNode parentResultNode, FacetArrays facetArrays, int offset) {
MutableFacetResultNode parentResultNode, FacetArrays facetArrays, int offset) throws IOException {
int partitionSize = facetArrays.getArraysLength();
int endOffset = offset + partitionSize;
ChildrenArrays childrenArray = taxonomyReader.getChildrenArrays();

View File

@@ -12,8 +12,8 @@ import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.search.results.FacetResultNode;
import org.apache.lucene.facet.search.results.MutableFacetResultNode;
import org.apache.lucene.facet.search.results.IntermediateFacetResult;
import org.apache.lucene.facet.taxonomy.ChildrenArrays;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyReader.ChildrenArrays;
import org.apache.lucene.util.collections.IntIterator;
import org.apache.lucene.util.collections.IntToObjectMap;

View File

@@ -0,0 +1,87 @@
package org.apache.lucene.facet.taxonomy;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Equivalent representations of the taxonomy's parent info,
* used internally for efficient computation of facet results:
* "youngest child" and "oldest sibling"
*/
public class ChildrenArrays {
private final int[] youngestChild, olderSibling;
public ChildrenArrays(int[] parents) {
this(parents, null);
}
public ChildrenArrays(int[] parents, ChildrenArrays copyFrom) {
youngestChild = new int[parents.length];
olderSibling = new int[parents.length];
int first = 0;
if (copyFrom != null) {
System.arraycopy(copyFrom.getYoungestChildArray(), 0, youngestChild, 0, copyFrom.getYoungestChildArray().length);
System.arraycopy(copyFrom.getOlderSiblingArray(), 0, olderSibling, 0, copyFrom.getOlderSiblingArray().length);
first = copyFrom.getOlderSiblingArray().length;
}
computeArrays(parents, first);
}
private void computeArrays(int[] parents, int first) {
// reset the youngest child of all ordinals. while this should be done only
// for the leaves, we don't know up front which are the leaves, so we reset
// all of them.
for (int i = first; i < parents.length; i++) {
youngestChild[i] = TaxonomyReader.INVALID_ORDINAL;
}
// the root category has no parent, and therefore no siblings
if (first == 0) {
first = 1;
olderSibling[0] = TaxonomyReader.INVALID_ORDINAL;
}
for (int i = first; i < parents.length; i++) {
// note that parents[i] is always < i, so the right-hand-side of
// the following line is already set when we get here
olderSibling[i] = youngestChild[parents[i]];
youngestChild[parents[i]] = i;
}
}
/**
* Returns an {@code int[]} the size of the taxonomy listing for each category
* the ordinal of its immediate older sibling (the sibling in the taxonomy
* tree with the highest ordinal below that of the given ordinal). The value
* for a category with no older sibling is {@link TaxonomyReader#INVALID_ORDINAL}.
*/
public int[] getOlderSiblingArray() {
return olderSibling;
}
/**
* Returns an {@code int[]} the size of the taxonomy listing the ordinal of
* the youngest (highest numbered) child category of each category in the
* taxonomy. The value for a leaf category (a category without children) is
* {@link TaxonomyReader#INVALID_ORDINAL}.
*/
public int[] getYoungestChildArray() {
return youngestChild;
}
}
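A minimal usage sketch, assuming a TaxonomyReader taxoReader and a valid category ordinal: getYoungestChildArray()[ordinal] heads a linked list of children whose next-links are stored in getOlderSiblingArray().

ChildrenArrays ca = taxoReader.getChildrenArrays();
int[] youngest = ca.getYoungestChildArray();
int[] siblings = ca.getOlderSiblingArray();
// visits every child of `ordinal`, youngest (highest ordinal) first
for (int child = youngest[ordinal]; child != TaxonomyReader.INVALID_ORDINAL; child = siblings[child]) {
  // ... process child ...
}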

View File

@@ -1,40 +0,0 @@
package org.apache.lucene.facet.taxonomy;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Exception indicating that a certain operation could not be performed
* on a taxonomy related object because of an inconsistency.
* <p>
* For example, trying to refresh a taxonomy reader might fail in case
* the underlying taxonomy was meanwhile modified in a manner which
* does not allow performing such a refresh. (See {@link TaxonomyReader#refresh()}.)
*
* @lucene.experimental
*/
public class InconsistentTaxonomyException extends Exception {
public InconsistentTaxonomyException(String message) {
super(message);
}
public InconsistentTaxonomyException() {
super();
}
}

View File

@@ -3,6 +3,9 @@ package org.apache.lucene.facet.taxonomy;
import java.io.Closeable;
import java.io.IOException;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.store.AlreadyClosedException;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -60,13 +63,13 @@ import java.util.Map;
*
* @lucene.experimental
*/
public interface TaxonomyReader extends Closeable {
public abstract class TaxonomyReader implements Closeable {
/**
* The root category (the category with the empty path) always has the
* ordinal 0, to which we give a name ROOT_ORDINAL.
* getOrdinal() of an empty path will always return ROOT_ORDINAL, and
* getCategory(ROOT_ORDINAL) will return the empty path.
* The root category (the category with the empty path) always has the ordinal
* 0, to which we give a name ROOT_ORDINAL. {@link #getOrdinal(CategoryPath)}
* of an empty path will always return {@code ROOT_ORDINAL}, and
* {@link #getPath(int)} with {@code ROOT_ORDINAL} will return the empty path.
*/
public final static int ROOT_ORDINAL = 0;
@@ -77,207 +80,189 @@ public interface TaxonomyReader extends Closeable {
public final static int INVALID_ORDINAL = -1;
/**
* getOrdinal() returns the ordinal of the category given as a path.
* The ordinal is the category's serial number, an integer which starts
* with 0 and grows as more categories are added (note that once a category
* is added, it can never be deleted).
* <P>
* If the given category wasn't found in the taxonomy, INVALID_ORDINAL is
* returned.
* If the taxonomy has changed since the provided reader was opened, open and
* return a new {@link TaxonomyReader}; else, return {@code null}. The new
* reader, if not {@code null}, will be the same type of reader as the one
* given to this method.
*
* <p>
* This method is typically far less costly than opening a fully new
* {@link TaxonomyReader} as it shares resources with the provided
* {@link TaxonomyReader}, when possible.
*/
public int getOrdinal(CategoryPath categoryPath) throws IOException;
/**
* getPath() returns the path name of the category with the given
* ordinal. The path is returned as a new CategoryPath object - to
* reuse an existing object, use {@link #getPath(int, CategoryPath)}.
* <P>
* A null is returned if a category with the given ordinal does not exist.
*/
public CategoryPath getPath(int ordinal) throws IOException;
public static <T extends TaxonomyReader> T openIfChanged(T oldTaxoReader) throws IOException {
@SuppressWarnings("unchecked")
final T newTaxoReader = (T) oldTaxoReader.doOpenIfChanged();
assert newTaxoReader != oldTaxoReader;
return newTaxoReader;
}
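/**
 * Usage sketch: a hypothetical helper (not part of the API) showing the
 * reopen pattern callers are expected to follow, mirroring how a regular
 * DirectoryReader is reopened.
 */
public static TaxonomyReader exampleReopen(TaxonomyReader current) throws IOException {
  TaxonomyReader newReader = openIfChanged(current);
  if (newReader == null) {
    return current; // taxonomy unchanged; keep using the current reader
  }
  current.decRef(); // release the old reader; it is closed once unused
  return newReader;
}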
/**
* getPath() returns the path name of the category with the given
* ordinal. The path is written to the given CategoryPath object (which
* is cleared first).
* <P>
* If a category with the given ordinal does not exist, the given
* CategoryPath object is not modified, and the method returns
* <code>false</code>. Otherwise, the method returns <code>true</code>.
*/
public boolean getPath(int ordinal, CategoryPath result) throws IOException;
private volatile boolean closed = false;
/**
* refresh() re-reads the taxonomy information if there were any changes to
* the taxonomy since this instance was opened or last refreshed. Calling
* refresh() is more efficient than close()ing the old instance and opening a
* new one.
* <P>
* If there were no changes since this instance was opened or last refreshed,
* then this call does nothing. Note, however, that this is still a relatively
* slow method (as it needs to verify whether there have been any changes on
* disk to the taxonomy), so it should not be called too often needlessly. In
* faceted search, the taxonomy reader's refresh() should be called only after
* a reopen() of the main index.
* <P>
* Refreshing the taxonomy might fail in some cases, for example
* if the taxonomy was recreated since this instance was opened or last refreshed.
* In this case an {@link InconsistentTaxonomyException} is thrown,
* suggesting that in order to obtain up-to-date taxonomy data a new
* {@link TaxonomyReader} should be opened. Note: This {@link TaxonomyReader}
* instance remains unchanged and usable in this case, and the application can
* continue to use it, and should still {@link #close()} when no longer needed.
* <P>
* It should be noted that refresh() is similar in purpose to
* IndexReader.reopen(), but the two methods behave differently. refresh()
* refreshes the existing TaxonomyReader object, rather than opening a new one
* in addition to the old one as reopen() does. The reason is that in a
* taxonomy, one can only add new categories and cannot modify or delete
* existing categories; Therefore, there is no reason to keep an old snapshot
* of the taxonomy open - refreshing the taxonomy to the newest data and using
* this new snapshot in all threads (whether new or old) is fine. This saves
* us needing to keep multiple copies of the taxonomy open in memory.
* @return true if anything has changed, false otherwise.
*/
public boolean refresh() throws IOException, InconsistentTaxonomyException;
// set refCount to 1 at start
private final AtomicInteger refCount = new AtomicInteger(1);
/**
* getParent() returns the ordinal of the parent category of the category
* with the given ordinal.
* <P>
* When a category is specified as a path name, finding the path of its
* parent is as trivial as dropping the last component of the path.
* getParent() is functionally equivalent to calling getPath() on the
* given ordinal, dropping the last component of the path, and then calling
* getOrdinal() to get an ordinal back. However, implementations are
* expected to provide a much more efficient implementation:
* <P>
* getParent() should be a very quick method, as it is used during the
* facet aggregation process in faceted search. Implementations will most
* likely want to serve replies to this method from a pre-filled cache.
* <P>
* If the given ordinal is the ROOT_ORDINAL, an INVALID_ORDINAL is returned.
* If the given ordinal is a top-level category, the ROOT_ORDINAL is returned.
* If an invalid ordinal is given (negative or beyond the last available
* ordinal), an ArrayIndexOutOfBoundsException is thrown. However, it is
* expected that getParent will only be called for ordinals which are
* already known to be in the taxonomy.
* performs the actual task of closing the resources that are used by the
* taxonomy reader.
*/
public int getParent(int ordinal) throws IOException;
protected abstract void doClose() throws IOException;
/**
* getParentArray() returns an int array of size getSize() listing the
* ordinal of the parent category of each category in the taxonomy.
* <P>
* The caller can hold on to the array it got indefinitely - it is
* guaranteed that no-one else will modify it. The other side of the
* same coin is that the caller must treat the array it got as read-only
* and <B>not modify it</B>, because other callers might have gotten the
* same array too (and getParent() calls might be answered from the
* same array).
* <P>
* If you use getParentArray() instead of getParent(), remember that
* the array you got is (naturally) not modified after a refresh(),
* so you should always call getParentArray() again after a refresh().
* <P>
* This method's function is similar to allocating an array of size
* getSize() and filling it with getParent() calls, but implementations
* are encouraged to implement it much more efficiently, with O(1)
* complexity. This can be done, for example, by the implementation
* already keeping the parents in an array, and just returning this
* array (without any allocation or copying) when requested.
* Implements the actual opening of a new {@link TaxonomyReader} instance if
* the taxonomy has changed.
*
* @see #openIfChanged(TaxonomyReader)
*/
public int[] getParentArray() throws IOException;
protected abstract TaxonomyReader doOpenIfChanged() throws IOException;
/**
* Equivalent representations of the taxonomy's parent info,
* used internally for efficient computation of facet results:
* "youngest child" and "oldest sibling"
* @throws AlreadyClosedException if this TaxonomyReader is closed
*/
public static interface ChildrenArrays {
/**
* getYoungestChildArray() returns an int array of size getSize()
* listing the ordinal of the youngest (highest numbered) child
* category of each category in the taxonomy. The value for a leaf
* category (a category without children) is
* <code>INVALID_ORDINAL</code>.
*/
public int[] getYoungestChildArray();
/**
* getOlderSiblingArray() returns an int array of size getSize()
* listing for each category the ordinal of its immediate older
* sibling (the sibling in the taxonomy tree with the highest ordinal
* below that of the given ordinal). The value for a category with no
* older sibling is <code>INVALID_ORDINAL</code>.
*/
public int[] getOlderSiblingArray();
protected final void ensureOpen() throws AlreadyClosedException {
if (getRefCount() <= 0) {
throw new AlreadyClosedException("this TaxonomyReader is closed");
}
}
@Override
public final void close() throws IOException {
if (!closed) {
synchronized (this) {
if (!closed) {
decRef();
closed = true;
}
}
}
}
/**
* getChildrenArrays() returns a {@link ChildrenArrays} object which can
* be used together to efficiently enumerate the children of any category.
* <P>
* The caller can hold on to the object it got indefinitely - it is
* guaranteed that no-one else will modify it. The other side of the
* same coin is that the caller must treat the object which it got (and
* the arrays it contains) as read-only and <B>not modify it</B>, because
* other callers might have gotten the same object too.
* <P>
* Implementations should have O(getSize()) time for the first call or
* after a refresh(), but O(1) time for further calls. In neither case
* there should be a need to read new data from disk. These guarantees
* are most likely achieved by calculating this object (based on the
* getParentArray()) when first needed, and later (if the taxonomy was not
* refreshed) returning the same object (without any allocation or copying)
* when requested.
* <P>
* The reason we have one method returning one object, rather than two
* methods returning two arrays, is to avoid race conditions in a multi-
* threaded application: We want to avoid the possibility of returning one
* new array and one old array, as those could not be used together.
* Expert: decreases the refCount of this TaxonomyReader instance. If the
* refCount drops to 0 this taxonomy reader is closed.
*/
public ChildrenArrays getChildrenArrays();
public final void decRef() throws IOException {
ensureOpen();
final int rc = refCount.decrementAndGet();
if (rc == 0) {
boolean success = false;
try {
doClose();
closed = true;
success = true;
} finally {
if (!success) {
// Put reference back on failure
refCount.incrementAndGet();
}
}
} else if (rc < 0) {
throw new IllegalStateException("too many decRef calls: refCount is " + rc + " after decrement");
}
}
/**
* Returns a {@link ChildrenArrays} object which can be used together to
* efficiently enumerate the children of any category.
* <p>
* The caller can hold on to the object it got indefinitely - it is guaranteed
* that no-one else will modify it. The other side of the same coin is that
* the caller must treat the object which it got (and the arrays it contains)
* as read-only and <b>not modify it</b>, because other callers might have
* gotten the same object too.
*/
public abstract ChildrenArrays getChildrenArrays() throws IOException;
/**
* Retrieve user committed data.
*
* @see TaxonomyWriter#commit(Map)
*/
public Map<String, String> getCommitUserData() throws IOException;
/**
* Expert: increments the refCount of this TaxonomyReader instance.
* RefCounts can be used to determine when a taxonomy reader can be closed
* safely, i.e. as soon as there are no more references.
* Be sure to always call a corresponding decRef(), in a finally clause;
* otherwise the reader may never be closed.
*/
public void incRef();
/**
* Expert: decreases the refCount of this TaxonomyReader instance.
* If the refCount drops to 0, then pending changes (if any) can be
* committed to the taxonomy index and this reader can be closed.
* @throws IOException If there is a low-level I/O error.
*/
public void decRef() throws IOException;
public abstract Map<String, String> getCommitUserData() throws IOException;
/**
* Expert: returns the current refCount for this taxonomy reader
* Returns the ordinal of the category given as a path. The ordinal is the
* category's serial number, an integer which starts with 0 and grows as more
* categories are added (note that once a category is added, it can never be
* deleted).
*
* @return the category's ordinal or {@link #INVALID_ORDINAL} if the category
* wasn't found.
*/
public int getRefCount();
public abstract int getOrdinal(CategoryPath categoryPath) throws IOException;
/**
* getSize() returns the number of categories in the taxonomy.
* <P>
* Because categories are numbered consecutively starting with 0, it
* means the taxonomy contains ordinals 0 through getSize()-1.
* <P>
* Note that the number returned by getSize() is often slightly higher
* than the number of categories inserted into the taxonomy; this is
* because when a category is added to the taxonomy, its ancestors
* are also added automatically (including the root, which always gets
* ordinal 0).
* Returns the ordinal of the parent category of the category with the given
* ordinal, according to the following rules:
*
* <ul>
* <li>If the given ordinal is the {@link #ROOT_ORDINAL}, an
* {@link #INVALID_ORDINAL} is returned.
* <li>If the given ordinal is a top-level category, the {@link #ROOT_ORDINAL}
* is returned.
* <li>If the given ordinal is an existing category, returns the ordinal of
* its parent
* </ul>
*
* @throws ArrayIndexOutOfBoundsException
* if an invalid ordinal is given (negative or beyond the last
* available ordinal)
*/
public int getSize();
public abstract int getParent(int ordinal) throws IOException;
/**
* Returns an {@code int[]} the size of the taxonomy listing the ordinal of
* the parent category of each category in the taxonomy.
* <p>
* The caller can hold on to the array it got indefinitely - it is guaranteed
* that no-one else will modify it. The other side of the same coin is that
* the caller must treat the array it got as read-only and <b>not modify
* it</b>, because other callers might have gotten the same array too (and
* getParent() calls might be answered from the same array).
*/
public abstract int[] getParentArray() throws IOException;
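/**
 * Usage sketch: a hypothetical helper (not part of the API) that computes the
 * depth of a category by walking the parents array up to the root.
 */
public int exampleDepth(int ordinal) throws IOException {
  int[] parents = getParentArray();
  int depth = 0;
  for (int p = ordinal; p != ROOT_ORDINAL; p = parents[p]) {
    depth++; // each iteration moves to the immediate parent
  }
  return depth;
}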
/**
* Returns the path name of the category with the given ordinal. The path is
* returned as a new CategoryPath object - to reuse an existing object, use
* {@link #getPath(int, CategoryPath)}.
*
* @return a {@link CategoryPath} with the required path, or {@code null} if
* the given ordinal is unknown to the taxonomy.
*/
public abstract CategoryPath getPath(int ordinal) throws IOException;
/**
* Same as {@link #getPath(int)}, only reuses the given {@link CategoryPath}
* instance.
*/
public abstract boolean getPath(int ordinal, CategoryPath result) throws IOException;
/** Returns the current refCount for this taxonomy reader. */
public final int getRefCount() {
return refCount.get();
}
/**
* Returns the number of categories in the taxonomy. Note that the number of
* categories returned is often slightly higher than the number of categories
* inserted into the taxonomy; this is because when a category is added to the
* taxonomy, its ancestors are also added automatically (including the root,
* which always gets ordinal 0).
*/
public abstract int getSize();
/**
* Expert: increments the refCount of this TaxonomyReader instance. RefCounts
* can be used to determine when a taxonomy reader can be closed safely, i.e.
* as soon as there are no more references. Be sure to always call a
* corresponding decRef(), in a finally clause; otherwise the reader may never
* be closed.
*/
public final void incRef() {
ensureOpen();
refCount.incrementAndGet();
}
}
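A minimal sketch of the reference-counting discipline described above, assuming taxoReader is shared between search threads and a single refresh thread:

// search thread: pin the reader while using it
taxoReader.incRef();
try {
  // ... compute facets against taxoReader ...
} finally {
  taxoReader.decRef(); // the reader closes once the last reference is released
}

// refresh thread: swap in a new reader when the taxonomy has changed
TaxonomyReader newReader = TaxonomyReader.openIfChanged(taxoReader);
if (newReader != null) {
  TaxonomyReader old = taxoReader;
  taxoReader = newReader;
  old.decRef(); // the old reader closes after in-flight searches release it
}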

View File

@@ -2,6 +2,7 @@ package org.apache.lucene.facet.taxonomy.directory;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.util.BytesRef;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -28,6 +29,7 @@ abstract class Consts {
static final String FULL = "$full_path$";
static final String FIELD_PAYLOADS = "$payloads$";
static final String PAYLOAD_PARENT = "p";
static final BytesRef PAYLOAD_PARENT_BYTES_REF = new BytesRef(PAYLOAD_PARENT);
static final char[] PAYLOAD_PARENT_CHARS = PAYLOAD_PARENT.toCharArray();
/**

View File

@@ -1,29 +1,23 @@
package org.apache.lucene.facet.taxonomy.directory;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.InconsistentTaxonomyException;
import org.apache.lucene.facet.taxonomy.ChildrenArrays;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.Consts.LoadFullPathOnly;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.collections.LRUHashMap;
/*
@@ -55,89 +49,341 @@ import org.apache.lucene.util.collections.LRUHashMap;
*
* @lucene.experimental
*/
public class DirectoryTaxonomyReader implements TaxonomyReader {
public class DirectoryTaxonomyReader extends TaxonomyReader {
private static final Logger logger = Logger.getLogger(DirectoryTaxonomyReader.class.getName());
private static final int DEFAULT_CACHE_VALUE = 4000;
private DirectoryReader indexReader;
private final DirectoryTaxonomyWriter taxoWriter;
private final long taxoEpoch; // used in doOpenIfChanged
private final DirectoryReader indexReader;
// The following lock is used to allow multiple threads to read from the
// index concurrently, while having them block during the very short
// critical moment of refresh() (see comments below). Note, however, that
// we only read from the index when we don't have the entry in our cache,
// and the caches are locked separately.
private ReadWriteLock indexReaderLock = new ReentrantReadWriteLock();
// TODO: test DoubleBarrelLRUCache and consider using it instead
private LRUHashMap<String, Integer> ordinalCache;
private LRUHashMap<Integer, String> categoryCache;
// The following are the limited-size LRU caches used to cache the latest
// results from getOrdinal() and getLabel().
// Because LRUHashMap is not thread-safe, we need to synchronize on this
// object when using it. Unfortunately, this is not optimal under heavy
// contention because it means that while one thread is using the cache
// (reading or modifying) others are blocked from using it - or even
// starting to do benign things like calculating the hash function. A more
// efficient approach would be to use a non-locking (as much as possible)
// concurrent solution, along the lines of java.util.concurrent.ConcurrentHashMap
// but with LRU semantics.
// However, even in the current sub-optimal implementation we do not make
// the mistake of locking out readers while waiting for disk in a cache
// miss - below, we do not hold cache lock while reading missing data from
// disk.
private final LRUHashMap<String, Integer> ordinalCache;
private final LRUHashMap<Integer, String> categoryCache;
// getParent() needs to be extremely efficient, to the point that we need
// to fetch all the data in advance into memory, and answer these calls
// from memory. Currently we use a large integer array, which is
// initialized when the taxonomy is opened, and potentially enlarged
// when it is refresh()ed.
// These arrays are not synchronized. Rather, the reference to the array
// is volatile, and the only writing operation (refreshPrefetchArrays)
// simply creates a new array and replaces the reference. The volatility
// of the reference ensures the correct atomic replacement and its
// visibility properties (the content of the array is visible when the
// new reference is visible).
private ParentArray parentArray;
// TODO: consolidate these objects into one ParentInfo or something?
private volatile ParentArray parentArray;
private volatile ChildrenArrays childrenArrays;
private char delimiter = Consts.DEFAULT_DELIMITER;
private volatile boolean closed = false;
// set refCount to 1 at start
private final AtomicInteger refCount = new AtomicInteger(1);
/**
* Called only from {@link #doOpenIfChanged()}. If the taxonomy has been
* recreated, you should pass {@code null} as the caches and parent/children
* arrays.
*/
DirectoryTaxonomyReader(DirectoryReader indexReader, DirectoryTaxonomyWriter taxoWriter,
LRUHashMap<String,Integer> ordinalCache,
LRUHashMap<Integer,String> categoryCache, ParentArray parentArray,
ChildrenArrays childrenArrays) throws IOException {
this.indexReader = indexReader;
this.taxoWriter = taxoWriter;
this.taxoEpoch = taxoWriter == null ? -1 : taxoWriter.getTaxonomyEpoch();
// use the same instance of the cache, note the protective code in getOrdinal and getPath
this.ordinalCache = ordinalCache == null ? new LRUHashMap<String,Integer>(DEFAULT_CACHE_VALUE) : ordinalCache;
this.categoryCache = categoryCache == null ? new LRUHashMap<Integer,String>(DEFAULT_CACHE_VALUE) : categoryCache;
this.parentArray = null;
this.childrenArrays = null;
if (parentArray != null) {
this.parentArray = new ParentArray(indexReader, parentArray);
if (childrenArrays != null) {
this.childrenArrays = new ChildrenArrays(this.parentArray.getArray(), childrenArrays);
}
}
}
/**
* Open for reading a taxonomy stored in a given {@link Directory}.
*
* @param directory
* The {@link Directory} in which the taxonomy lives. Note that
* the taxonomy is read directly from that directory (not from a
* subdirectory of it).
* @throws CorruptIndexException if the Taxonomy is corrupted.
* @throws IOException if another error occurred.
* The {@link Directory} in which the taxonomy resides.
* @throws CorruptIndexException
* if the Taxonomy is corrupt.
* @throws IOException
* if another error occurred.
*/
public DirectoryTaxonomyReader(Directory directory) throws IOException {
this.indexReader = openIndexReader(directory);
indexReader = openIndexReader(directory);
taxoWriter = null;
taxoEpoch = -1;
// These are the default cache sizes; they can be configured after
// construction with the cache's setMaxSize() method
ordinalCache = new LRUHashMap<String, Integer>(4000);
categoryCache = new LRUHashMap<Integer, String>(4000);
ordinalCache = new LRUHashMap<String, Integer>(DEFAULT_CACHE_VALUE);
categoryCache = new LRUHashMap<Integer, String>(DEFAULT_CACHE_VALUE);
}
/**
* Opens a {@link DirectoryTaxonomyReader} over the given
* {@link DirectoryTaxonomyWriter} (for NRT).
*
* @param taxoWriter
* The {@link DirectoryTaxonomyWriter} from which to obtain newly
* added categories, in real-time.
*/
public DirectoryTaxonomyReader(DirectoryTaxonomyWriter taxoWriter) throws IOException {
this.taxoWriter = taxoWriter;
taxoEpoch = taxoWriter.getTaxonomyEpoch();
indexReader = openIndexReader(taxoWriter.getInternalIndexWriter());
// These are the default cache sizes; they can be configured after
// construction with the cache's setMaxSize() method
ordinalCache = new LRUHashMap<String, Integer>(DEFAULT_CACHE_VALUE);
categoryCache = new LRUHashMap<Integer, String>(DEFAULT_CACHE_VALUE);
}
private String getLabel(int catID) throws IOException {
ensureOpen();
// TODO (Facet): consider lazily creating the parent array when asked, not in the constructor
parentArray = new ParentArray();
parentArray.refresh(indexReader);
// Since the cache is shared with DTR instances allocated from
// doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
// instance recognizes. Therefore we do this check up front, before we hit
// the cache.
if (catID < 0 || catID >= indexReader.maxDoc()) {
return null;
}
// TODO: can we use an int-based hash impl, such as IntToObjectMap,
// wrapped as LRU?
Integer catIDInteger = Integer.valueOf(catID);
synchronized (categoryCache) {
String res = categoryCache.get(catIDInteger);
if (res != null) {
return res;
}
}
final LoadFullPathOnly loader = new LoadFullPathOnly();
indexReader.document(catID, loader);
String ret = loader.getFullPath();
synchronized (categoryCache) {
categoryCache.put(catIDInteger, ret);
}
return ret;
}
@Override
protected void doClose() throws IOException {
indexReader.close();
parentArray = null;
childrenArrays = null;
// do not clear() the caches, as they may be used by other DTR instances.
ordinalCache = null;
categoryCache = null;
}
/**
* Implements the opening of a new {@link DirectoryTaxonomyReader} instance if
* the taxonomy has changed.
*
* <p>
* <b>NOTE:</b> the returned {@link DirectoryTaxonomyReader} shares the
* ordinal and category caches with this reader. This is not expected to cause
* any issues, unless the two instances continue to live. The reader
* guarantees that the two instances cannot affect each other in terms of
* correctness of the caches; however, if the size of the cache is changed
* through {@link #setCacheSize(int)}, it will affect both reader instances.
*/
@Override
protected DirectoryTaxonomyReader doOpenIfChanged() throws IOException {
ensureOpen();
final DirectoryReader r2;
if (taxoWriter == null) {
// not NRT
r2 = DirectoryReader.openIfChanged(indexReader);
} else {
// NRT
r2 = DirectoryReader.openIfChanged(indexReader, taxoWriter.getInternalIndexWriter(), false);
}
if (r2 == null) {
return null; // no changes, nothing to do
}
// check if the taxonomy was recreated
boolean success = false;
try {
boolean recreated = false;
if (taxoWriter == null) {
// not NRT, check epoch from commit data
String t1 = indexReader.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH);
String t2 = r2.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH);
if (t1 == null) {
if (t2 != null) {
recreated = true;
}
} else if (!t1.equals(t2)) {
// t1 != null and t2 cannot be null b/c DirTaxoWriter always puts the commit data.
// it's ok to use String.equals because we require the two epoch values to be the same.
recreated = true;
}
} else {
// NRT, compare current taxoWriter.epoch() vs the one that was given at construction
if (taxoEpoch != taxoWriter.getTaxonomyEpoch()) {
recreated = true;
}
}
final DirectoryTaxonomyReader newtr;
if (recreated) {
// if recreated, do not reuse anything from this instance. The information
// will be lazily computed by the new instance when needed.
newtr = new DirectoryTaxonomyReader(r2, taxoWriter, null, null, null, null);
} else {
newtr = new DirectoryTaxonomyReader(r2, taxoWriter, ordinalCache, categoryCache, parentArray, childrenArrays);
}
success = true;
return newtr;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(r2);
}
}
}
protected DirectoryReader openIndexReader(Directory directory) throws IOException {
return DirectoryReader.open(directory);
}
protected DirectoryReader openIndexReader(IndexWriter writer) throws IOException {
return DirectoryReader.open(writer, false);
}
/**
* @throws AlreadyClosedException if this IndexReader is closed
* Expert: returns the underlying {@link DirectoryReader} instance that is
* used by this {@link TaxonomyReader}.
*/
protected final void ensureOpen() throws AlreadyClosedException {
if (getRefCount() <= 0) {
throw new AlreadyClosedException("this TaxonomyReader is closed");
DirectoryReader getInternalIndexReader() {
ensureOpen();
return indexReader;
}
@Override
public ChildrenArrays getChildrenArrays() throws IOException {
ensureOpen();
if (childrenArrays == null) {
synchronized (this) {
if (childrenArrays == null) {
childrenArrays = new ChildrenArrays(getParentArray());
}
}
}
return childrenArrays;
}
@Override
public Map<String, String> getCommitUserData() throws IOException {
ensureOpen();
return indexReader.getIndexCommit().getUserData();
}
@Override
public int getOrdinal(CategoryPath categoryPath) throws IOException {
ensureOpen();
if (categoryPath.length() == 0) {
return ROOT_ORDINAL;
}
String path = categoryPath.toString(delimiter);
// First try to find the answer in the LRU cache:
synchronized (ordinalCache) {
Integer res = ordinalCache.get(path);
if (res != null) {
if (res.intValue() < indexReader.maxDoc()) {
// Since the cache is shared with DTR instances allocated from
// doOpenIfChanged, we need to ensure that the ordinal is one that
// this DTR instance recognizes.
return res.intValue();
} else {
// if we get here, it means that the category was found in the cache,
// but is not recognized by this TR instance. Therefore there's no
// need to continue searching for the path on disk, because we won't find
// it there either.
return TaxonomyReader.INVALID_ORDINAL;
}
}
}
// If we're still here, we have a cache miss. We need to fetch the
// value from disk, and then also put it in the cache:
int ret = TaxonomyReader.INVALID_ORDINAL;
DocsEnum docs = MultiFields.getTermDocsEnum(indexReader, null, Consts.FULL, new BytesRef(path), 0);
if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
ret = docs.docID();
// we only store the fact that a category exists, not its inexistence.
// This is required because the caches are shared with new DTR instances
// that are allocated from doOpenIfChanged. Therefore, if we only store
// information about found categories, we cannot accidentally tell a new
// generation of DTR that a category does not exist.
synchronized (ordinalCache) {
ordinalCache.put(path, Integer.valueOf(ret));
}
}
return ret;
}
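/**
 * Usage sketch: a hypothetical helper (not part of the API) that resolves a
 * delimited path string to an ordinal using this reader's delimiter, as a
 * caller of getOrdinal() typically would.
 */
public boolean exampleExists(String path) throws IOException {
  return getOrdinal(new CategoryPath(path, delimiter)) != INVALID_ORDINAL;
}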
// TODO: move to a ParentInfo class? (see TODO for parentArray)
@Override
public int getParent(int ordinal) throws IOException {
ensureOpen();
return getParentArray()[ordinal];
}
@Override
public int[] getParentArray() throws IOException {
ensureOpen();
if (parentArray == null) {
synchronized (this) {
if (parentArray == null) {
parentArray = new ParentArray(indexReader);
}
}
}
return parentArray.getArray();
}
@Override
public CategoryPath getPath(int ordinal) throws IOException {
ensureOpen();
// TODO (Facet): Currently, the LRU cache we use (getCategoryCache) holds
// strings with delimiters, not CategoryPath objects, so even if
// we have a cache hit, we need to process the string and build a new
// CategoryPath object every time. What is preventing us from putting
// the actual CategoryPath object in the cache is the fact that these
// objects are mutable. So we should create an immutable (read-only)
// interface that CategoryPath implements, and this method should
// return this interface, not the writable CategoryPath.
String label = getLabel(ordinal);
if (label == null) {
return null;
}
return new CategoryPath(label, delimiter);
}
@Override
public boolean getPath(int ordinal, CategoryPath result) throws IOException {
ensureOpen();
String label = getLabel(ordinal);
if (label == null) {
return false;
}
result.clear();
result.add(label, delimiter);
return true;
}
@Override
public int getSize() {
ensureOpen();
return indexReader.numDocs();
}
/**
@@ -151,10 +397,10 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
*/
public void setCacheSize(int size) {
ensureOpen();
synchronized(categoryCache) {
synchronized (categoryCache) {
categoryCache.setMaxSize(size);
}
synchronized(ordinalCache) {
synchronized (ordinalCache) {
ordinalCache.setMaxSize(size);
}
}
@@ -173,361 +419,11 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
ensureOpen();
this.delimiter = delimiter;
}
@Override
public int getOrdinal(CategoryPath categoryPath) throws IOException {
ensureOpen();
if (categoryPath.length()==0) {
return ROOT_ORDINAL;
}
String path = categoryPath.toString(delimiter);
// First try to find the answer in the LRU cache:
synchronized(ordinalCache) {
Integer res = ordinalCache.get(path);
if (res!=null) {
return res.intValue();
}
}
// If we're still here, we have a cache miss. We need to fetch the
// value from disk, and then also put it in the cache:
int ret = TaxonomyReader.INVALID_ORDINAL;
try {
indexReaderLock.readLock().lock();
// TODO (Facet): avoid Multi*?
Bits liveDocs = MultiFields.getLiveDocs(indexReader);
DocsEnum docs = MultiFields.getTermDocsEnum(indexReader, liveDocs, Consts.FULL, new BytesRef(path), 0);
if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
ret = docs.docID();
}
} finally {
indexReaderLock.readLock().unlock();
}
// Put the new value in the cache. Note that it is possible that while
// we were doing the above fetching (without the cache locked), some
// other thread already added the same category to the cache. We do
// not care about this possibility, as LRUCache replaces previous values
// of the same keys (it doesn't store duplicates).
synchronized(ordinalCache) {
// GB: new Integer(int); creates a new object each and every time.
// Integer.valueOf(int) might not (See JavaDoc).
ordinalCache.put(path, Integer.valueOf(ret));
}
return ret;
}
@Override
public CategoryPath getPath(int ordinal) throws IOException {
ensureOpen();
// TODO (Facet): Currently, the LRU cache we use (getCategoryCache) holds
// strings with delimiters, not CategoryPath objects, so even if
// we have a cache hit, we need to process the string and build a new
// CategoryPath object every time. What is preventing us from putting
// the actual CategoryPath object in the cache is the fact that these
// objects are mutable. So we should create an immutable (read-only)
// interface that CategoryPath implements, and this method should
// return this interface, not the writable CategoryPath.
String label = getLabel(ordinal);
if (label==null) {
return null;
}
return new CategoryPath(label, delimiter);
}
@Override
public boolean getPath(int ordinal, CategoryPath result) throws IOException {
ensureOpen();
String label = getLabel(ordinal);
if (label==null) {
return false;
}
result.clear();
result.add(label, delimiter);
return true;
}
private String getLabel(int catID) throws IOException {
ensureOpen();
// First try to find the answer in the LRU cache. It is very
// unfortunate that we need to allocate an Integer object here -
// it would have been better if we used a hash table specifically
// designed for int keys...
// GB: new Integer(int); creates a new object each and every time.
// Integer.valueOf(int) might not (See JavaDoc).
Integer catIDInteger = Integer.valueOf(catID);
synchronized(categoryCache) {
String res = categoryCache.get(catIDInteger);
if (res!=null) {
return res;
}
}
// If we're still here, we have a cache miss. We need to fetch the
// value from disk, and then also put it in the cache:
String ret;
try {
indexReaderLock.readLock().lock();
// The taxonomy API dictates that if we get an invalid category
// ID, we should return null. If we don't check this here, we
// may get some sort of exception from the document() call below.
// NOTE: Currently, we *do not* cache this return value; There
// isn't much point to do so, because checking the validity of
// the docid doesn't require disk access - just comparing with
// the number indexReader.maxDoc().
if (catID<0 || catID>=indexReader.maxDoc()) {
return null;
}
final LoadFullPathOnly loader = new LoadFullPathOnly();
indexReader.document(catID, loader);
ret = loader.getFullPath();
} finally {
indexReaderLock.readLock().unlock();
}
// Put the new value in the cache. Note that it is possible that while
// we were doing the above fetching (without the cache locked), some
// other thread already added the same category to the cache. We do
// not care about this possibility, as LRUCache replaces previous
// values of the same keys (it doesn't store duplicates).
synchronized (categoryCache) {
categoryCache.put(catIDInteger, ret);
}
return ret;
}
@Override
public int getParent(int ordinal) {
ensureOpen();
// Note how we don't need to hold the read lock to do the following,
// because the array reference is volatile, ensuring the correct
// visibility and ordering: if we get the new reference, the new
// data is also visible to this thread.
return getParentArray()[ordinal];
}
/**
* getParentArray() returns an int array of size getSize() listing the
* ordinal of the parent category of each category in the taxonomy.
* <P>
* The caller can hold on to the array it got indefinitely - it is
* guaranteed that no-one else will modify it. The other side of the
* same coin is that the caller must treat the array it got as read-only
* and <B>not modify it</B>, because other callers might have gotten the
* same array too, and getParent() calls are also answered from the
* same array.
* <P>
* The getParentArray() call is extremely efficient, merely returning
* a reference to an array that already exists. For a caller that plans
* to call getParent() for many categories, using getParentArray() and
* the array it returns is a somewhat faster approach because it avoids
* the overhead of method calls and volatile dereferencing.
* <P>
* If you use getParentArray() instead of getParent(), remember that
* the array you got is (naturally) not modified after a refresh(),
* so you should always call getParentArray() again after a refresh().
*/
@Override
public int[] getParentArray() {
ensureOpen();
// Note how we don't need to hold the read lock to do the following,
// because the array reference is volatile, ensuring the correct
// visibility and ordering: if we get the new reference, the new
// data is also visible to this thread.
return parentArray.getArray();
}
// Note that refresh() is synchronized (it is the only synchronized
// method in this class) to ensure that it never gets called concurrently
// with itself.
@Override
public synchronized boolean refresh() throws IOException, InconsistentTaxonomyException {
ensureOpen();
/*
* Since refresh() can be a lengthy operation, it is very important that we
* avoid locking out all readers for its duration. This is why we don't hold
* the indexReaderLock write lock for the entire duration of this method. In
* fact, it is enough to hold it only during a single assignment! Other
* comments in this method will explain this.
*/
// note that the lengthy operation indexReader.reopen() does not
// modify the reader, so we can do it without holding a lock. We can
// safely read indexReader without holding the write lock, because
// no other thread can be writing at this time (this method is the
// only possible writer, and it is "synchronized" to avoid this case).
DirectoryReader r2 = DirectoryReader.openIfChanged(indexReader);
if (r2 == null) {
return false; // no changes, nothing to do
}
// validate that a refresh is valid at this point, i.e. that the taxonomy
// was not recreated since this reader was last opened or refreshed.
String t1 = indexReader.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH);
String t2 = r2.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH);
if (t1 == null) {
if (t2 != null) {
r2.close();
throw new InconsistentTaxonomyException("Taxonomy was recreated, epoch= " + t2);
}
} else if (!t1.equals(t2)) {
// t1 != null and t2 cannot be null b/c DirTaxoWriter always puts the commit data.
// it's ok to use String.equals because we require the two epoch values to be the same.
r2.close();
throw new InconsistentTaxonomyException("Taxonomy was recreated epoch = " + t2 + " != " + t1);
}
IndexReader oldreader = indexReader;
// we can close the old searcher, but need to synchronize this
// so that we don't close it in the middle that another routine
// is reading from it.
indexReaderLock.writeLock().lock();
indexReader = r2;
indexReaderLock.writeLock().unlock();
// We can close the old reader, but need to be certain that we
// don't close it while another method is reading from it.
// Luckily, we can be certain of that even without putting the
// oldreader.close() in the locked section. The reason is that
// after lock() succeeded above, we know that all existing readers
// had finished (this is what a read-write lock ensures). New
// readers, starting after the unlock() we just did, already got
// the new indexReader we set above. So nobody can be possibly
// using the old indexReader, and we can close it:
oldreader.close();
// We prefetch some of the arrays to make requests much faster.
// Let's refresh these prefetched arrays; this refresh is made more
// efficient by assuming that it is enough to read the values for new
// categories (old categories could not have been changed or deleted).
// Note that this is done without the write lock being held,
// which means that it is possible that during a refresh(), a
// reader will have some methods (like getOrdinal and getCategory)
// return fresh information, while getParent()
// (only to be prefetched now) still return older information.
// We consider this to be acceptable. The important thing,
// however, is that refreshPrefetchArrays() itself writes to
// the arrays in a correct manner (see discussion there)
parentArray.refresh(indexReader);
// Remove any INVALID_ORDINAL values from the ordinal cache,
// because it is possible those are now answered by the new data!
Iterator<Entry<String, Integer>> i = ordinalCache.entrySet().iterator();
while (i.hasNext()) {
Entry<String, Integer> e = i.next();
if (e.getValue().intValue() == INVALID_ORDINAL) {
i.remove();
}
}
return true;
}
@Override
public void close() throws IOException {
if (!closed) {
synchronized (this) {
if (!closed) {
decRef();
closed = true;
}
}
}
}
/** Do the actual closing, free up resources */
private void doClose() throws IOException {
indexReader.close();
closed = true;
parentArray = null;
childrenArrays = null;
categoryCache.clear();
ordinalCache.clear();
}
@Override
public int getSize() {
ensureOpen();
indexReaderLock.readLock().lock();
try {
return indexReader.numDocs();
} finally {
indexReaderLock.readLock().unlock();
}
}
@Override
public Map<String, String> getCommitUserData() throws IOException {
ensureOpen();
return indexReader.getIndexCommit().getUserData();
}
private ChildrenArrays childrenArrays;
Object childrenArraysRebuild = new Object();
@Override
public ChildrenArrays getChildrenArrays() {
ensureOpen();
// Check if the taxonomy grew since we built the array, and if it
// did, create new (and larger) arrays and fill them as required.
// We do all this under a lock, to prevent two concurrent calls from
// needlessly doing the same array building at the same time.
synchronized(childrenArraysRebuild) {
int num = getSize();
int first;
if (childrenArrays==null) {
first = 0;
} else {
first = childrenArrays.getYoungestChildArray().length;
}
// If the taxonomy hasn't grown, we can return the existing object
// immediately
if (first == num) {
return childrenArrays;
}
// Otherwise, build new arrays for a new ChildrenArray object.
// These arrays start with an enlarged copy of the previous arrays,
// and then are modified to take into account the new categories:
int[] newYoungestChildArray = new int[num];
int[] newOlderSiblingArray = new int[num];
// In Java 6, we could just do Arrays.copyOf()...
if (childrenArrays!=null) {
System.arraycopy(childrenArrays.getYoungestChildArray(), 0,
newYoungestChildArray, 0, childrenArrays.getYoungestChildArray().length);
System.arraycopy(childrenArrays.getOlderSiblingArray(), 0,
newOlderSiblingArray, 0, childrenArrays.getOlderSiblingArray().length);
}
int[] parents = getParentArray();
for (int i=first; i<num; i++) {
newYoungestChildArray[i] = INVALID_ORDINAL;
}
// In the loop below we can ignore the root category (0) because
// it has no parent
if (first==0) {
first = 1;
newOlderSiblingArray[0] = INVALID_ORDINAL;
}
for (int i=first; i<num; i++) {
// Note that parents[i] is always < i, so the right-hand-side of
// the following line is already set when we get here.
newOlderSiblingArray[i] = newYoungestChildArray[parents[i]];
newYoungestChildArray[parents[i]] = i;
}
// Finally switch to the new arrays
childrenArrays = new ChildrenArraysImpl(newYoungestChildArray,
newOlderSiblingArray);
return childrenArrays;
}
}
public String toString(int max) {
ensureOpen();
StringBuilder sb = new StringBuilder();
int upperl = Math.min(max, this.indexReader.maxDoc());
int upperl = Math.min(max, indexReader.maxDoc());
for (int i = 0; i < upperl; i++) {
try {
CategoryPath category = this.getPath(i);
@@ -548,75 +444,5 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
}
return sb.toString();
}
private static final class ChildrenArraysImpl implements ChildrenArrays {
private int[] youngestChildArray, olderSiblingArray;
public ChildrenArraysImpl(int[] youngestChildArray, int[] olderSiblingArray) {
this.youngestChildArray = youngestChildArray;
this.olderSiblingArray = olderSiblingArray;
}
@Override
public int[] getOlderSiblingArray() {
return olderSiblingArray;
}
@Override
public int[] getYoungestChildArray() {
return youngestChildArray;
}
}
/**
* Expert: This method is only for expert use.
* Note also that any call to refresh() will invalidate the returned reader,
* so the caller needs to take care of appropriate locking.
*
* @return lucene indexReader
*/
DirectoryReader getInternalIndexReader() {
ensureOpen();
return this.indexReader;
}
/**
* Expert: decreases the refCount of this TaxonomyReader instance. If the
* refCount drops to 0, then this reader is closed.
*/
@Override
public void decRef() throws IOException {
ensureOpen();
final int rc = refCount.decrementAndGet();
if (rc == 0) {
boolean success = false;
try {
doClose();
success = true;
} finally {
if (!success) {
// Put reference back on failure
refCount.incrementAndGet();
}
}
} else if (rc < 0) {
throw new IllegalStateException("too many decRef calls: refCount is " + rc + " after decrement");
}
}
/** Expert: returns the current refCount for this taxonomy reader */
@Override
public int getRefCount() {
return refCount.get();
}
/**
* Expert: increments the refCount of this TaxonomyReader instance.
* RefCounts are used to determine when a taxonomy reader can be closed
* safely, i.e. as soon as there are no more references.
* Be sure to always call a corresponding decRef(), in a finally clause;
* otherwise the reader may never be closed.
*/
@Override
public void incRef() {
ensureOpen();
refCount.incrementAndGet();
}
}
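Putting the pieces together, a minimal NRT flow might look like the sketch below; the directory path and category are illustrative, and the FSDirectory/CategoryPath usage is assumed from the rest of the module:

Directory taxoDir = FSDirectory.open(new File("/path/to/taxo")); // illustrative path
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter); // NRT reader

taxoWriter.addCategory(new CategoryPath("Author/Mark Twain", '/'));

// the new category becomes visible without committing the taxonomy index
TaxonomyReader newReader = TaxonomyReader.openIfChanged(taxoReader);
if (newReader != null) {
  taxoReader.close();
  taxoReader = newReader;
}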

View File

@@ -294,6 +294,9 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
* @param openMode see {@link OpenMode}
*/
protected IndexWriterConfig createIndexWriterConfig(OpenMode openMode) {
// TODO: should we use a more optimized Codec, e.g. Pulsing (or write custom)?
// The taxonomy has a unique structure, where each term is associated with one document
// Make sure we use a MergePolicy which always merges adjacent segments and thus
// keeps the doc IDs ordered as well (this is crucial for the taxonomy index).
return new IndexWriterConfig(Version.LUCENE_50,
@@ -583,7 +586,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
addToCache(categoryPath, length, id);
// also add to the parent array
getParentArray().add(id, parent);
parentArray = getParentArray().add(id, parent);
return id;
}
@@ -811,10 +814,9 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
synchronized (this) {
if (parentArray == null) {
initReaderManager();
parentArray = new ParentArray();
DirectoryReader reader = readerManager.acquire();
try {
parentArray.refresh(reader);
parentArray = new ParentArray(reader);
} finally {
readerManager.release(reader);
}
@@ -1035,5 +1037,21 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
public Directory getDirectory() {
return dir;
}
/**
* Used by {@link DirectoryTaxonomyReader} to support NRT.
* <p>
* <b>NOTE:</b> you should not use the obtained {@link IndexWriter} in any
* way other than opening an IndexReader on it; otherwise, the taxonomy
* index may become corrupt!
*/
final IndexWriter getInternalIndexWriter() {
return indexWriter;
}
/** Used by {@link DirectoryTaxonomyReader} to support NRT. */
final long getTaxonomyEpoch() {
return indexEpoch;
}
}

View File

@@ -2,15 +2,14 @@ package org.apache.lucene.facet.taxonomy.directory;
import java.io.IOException;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.util.ArrayUtil;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -29,55 +28,23 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
* limitations under the License.
*/
// getParent() needs to be extremely efficient, to the point that we need
// to fetch all the data in advance into memory, and answer these calls
// from memory. Currently we use a large integer array, which is
// initialized when the taxonomy is opened, and potentially enlarged
// when it is refresh()ed.
/**
* @lucene.experimental
*/
class ParentArray {
// These arrays are not synchronized. Rather, the reference to the array
// is volatile, and the only writing operation (refreshPrefetchArrays)
// simply creates a new array and replaces the reference. The volatility
// of the reference ensures the correct atomic replacement and its
// visibility properties (the content of the array is visible when the
// new reference is visible).
private volatile int prefetchParentOrdinal[] = null;
// TODO: maybe use PackedInts?
private final int[] parentOrdinals;
public int[] getArray() {
return prefetchParentOrdinal;
/** Used by {@link #add(int, int)} when the array needs to grow. */
ParentArray(int[] parentOrdinals) {
this.parentOrdinals = parentOrdinals;
}
/**
* refreshPrefetch() refreshes the parent array. Initially, it fills the
* array from the positions of an appropriate posting list. If called during
* a refresh(), when the arrays already exist, only values for new documents
* (those beyond the last one in the array) are read from the positions and
* added to the arrays (that are appropriately enlarged). We assume (and
* this is indeed a correct assumption in our case) that existing categories
* are never modified or deleted.
*/
void refresh(IndexReader indexReader) throws IOException {
// Note that it is not necessary for us to obtain the read lock.
// The reason is that we are only called from refresh() (precluding
// another concurrent writer) or from the constructor (when no method
// could be running).
// The write lock is also not held during the following code, meaning
// that reads *can* happen while this code is running. The "volatile"
// property of the prefetchParentOrdinal and prefetchDepth array
// references ensure the correct visibility property of the assignment
// but other than that, we do *not* guarantee that a reader will not
// use an old version of one of these arrays (or both) while a refresh
// is going on. But we find this acceptable - until a refresh has
// finished, the reader should not expect to see new information
// (and the old information is the same in the old and new versions).
int first;
int num = indexReader.maxDoc();
if (prefetchParentOrdinal==null) {
prefetchParentOrdinal = new int[num];
public ParentArray(IndexReader reader) throws IOException {
parentOrdinals = new int[reader.maxDoc()];
if (parentOrdinals.length > 0) {
initFromReader(reader, 0);
// Starting Lucene 2.9, following the change LUCENE-1542, we can
// no longer reliably read the parent "-1" (see comment in
// LuceneTaxonomyWriter.SinglePositionTokenStream). We have no way
@ -85,78 +52,88 @@ class ParentArray {
// with existing indexes, so what we'll do instead is just
// hard-code the parent of ordinal 0 to be -1, and assume (as is
// indeed the case) that no other parent can be -1.
if (num>0) {
prefetchParentOrdinal[0] = TaxonomyReader.INVALID_ORDINAL;
}
first = 1;
} else {
first = prefetchParentOrdinal.length;
if (first==num) {
return; // nothing to do - no category was added
}
// In Java 6, we could just do Arrays.copyOf()...
int[] newarray = new int[num];
System.arraycopy(prefetchParentOrdinal, 0, newarray, 0,
prefetchParentOrdinal.length);
prefetchParentOrdinal = newarray;
}
// Read the new part of the parents array from the positions:
// TODO (Facet): avoid Multi*?
Bits liveDocs = MultiFields.getLiveDocs(indexReader);
DocsAndPositionsEnum positions = MultiFields.getTermPositionsEnum(indexReader, liveDocs,
Consts.FIELD_PAYLOADS, new BytesRef(Consts.PAYLOAD_PARENT),
DocsAndPositionsEnum.FLAG_PAYLOADS);
if ((positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) && first < num) {
throw new CorruptIndexException("Missing parent data for category " + first);
}
for (int i=first; i<num; i++) {
// Note that we know positions.doc() >= i (this is an
// invariant kept throughout this loop)
if (positions.docID()==i) {
if (positions.freq() == 0) { // shouldn't happen
throw new CorruptIndexException(
"Missing parent data for category "+i);
}
// TODO (Facet): keep a local (non-volatile) copy of the prefetchParentOrdinal
// reference, because access to volatile reference is slower (?).
// Note: The positions we get here are one less than the position
// increment we added originally, so we get here the right numbers:
prefetchParentOrdinal[i] = positions.nextPosition();
if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
if ( i+1 < num ) {
throw new CorruptIndexException(
"Missing parent data for category "+(i+1));
}
break;
}
} else { // this shouldn't happen
throw new CorruptIndexException(
"Missing parent data for category "+i);
}
parentOrdinals[0] = TaxonomyReader.INVALID_ORDINAL;
}
}
public ParentArray(IndexReader reader, ParentArray copyFrom) throws IOException {
assert copyFrom != null;
int[] copyParents = copyFrom.getArray();
assert copyParents.length < reader.maxDoc() : "do not init a new ParentArray if the index hasn't changed";
this.parentOrdinals = new int[reader.maxDoc()];
System.arraycopy(copyParents, 0, parentOrdinals, 0, copyParents.length);
initFromReader(reader, copyParents.length);
}
/**
* add() is used in LuceneTaxonomyWriter, not in LuceneTaxonomyReader.
* It is only called from a synchronized method, so it is not reentrant,
* and also doesn't need to worry about reads happening at the same time.
*
* NOTE: add() and refresh() CANNOT be used together. If you call add(),
* this changes the arrays and refresh() can no longer be used.
*/
void add(int ordinal, int parentOrdinal) {
if (ordinal >= prefetchParentOrdinal.length) {
// grow the array, if necessary.
// In Java 6, we could just do Arrays.copyOf()...
int[] newarray = new int[ordinal*2+1];
System.arraycopy(prefetchParentOrdinal, 0, newarray, 0,
prefetchParentOrdinal.length);
prefetchParentOrdinal = newarray;
// Read the parents of the new categories
private void initFromReader(IndexReader reader, int first) throws IOException {
if (reader.maxDoc() == first) {
return;
}
prefetchParentOrdinal[ordinal] = parentOrdinal;
TermsEnum termsEnum = null;
DocsAndPositionsEnum positions = null;
int idx = 0;
for (AtomicReaderContext context : reader.leaves()) {
if (context.docBase < first) {
continue;
}
// in general we could call context.reader().termPositionsEnum(), but that
// passes the liveDocs. Since we know there are no deletions, the code
// below may save some CPU cycles.
termsEnum = context.reader().fields().terms(Consts.FIELD_PAYLOADS).iterator(termsEnum);
if (!termsEnum.seekExact(Consts.PAYLOAD_PARENT_BYTES_REF, true)) {
throw new CorruptIndexException("Missing parent stream data for segment " + context.reader());
}
positions = termsEnum.docsAndPositions(null /* no deletes in taxonomy */, positions);
if (positions == null) {
throw new CorruptIndexException("Missing parent stream data for segment " + context.reader());
}
idx = context.docBase;
int doc;
while ((doc = positions.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
doc += context.docBase;
if (doc == idx) {
if (positions.freq() == 0) { // shouldn't happen
throw new CorruptIndexException("Missing parent data for category " + idx);
}
parentOrdinals[idx++] = positions.nextPosition();
} else { // this shouldn't happen
throw new CorruptIndexException("Missing parent data for category " + idx);
}
}
if (idx + 1 < context.reader().maxDoc()) {
throw new CorruptIndexException("Missing parent data for category " + (idx + 1));
}
}
if (idx != reader.maxDoc()) {
throw new CorruptIndexException("Missing parent data for category " + idx);
}
}
public int[] getArray() {
return parentOrdinals;
}
/**
* Adds the given ordinal/parent info and returns either a new instance if the
* underlying array had to grow, or this instance otherwise.
* <p>
* <b>NOTE:</b> you should only call this method from thread-safe code.
*/
ParentArray add(int ordinal, int parentOrdinal) {
if (ordinal >= parentOrdinals.length) {
int[] newarray = ArrayUtil.grow(parentOrdinals);
newarray[ordinal] = parentOrdinal;
return new ParentArray(newarray);
}
parentOrdinals[ordinal] = parentOrdinal;
return this;
}
}
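Two points follow from the class above: getParent() lookups reduce to a plain in-memory array read, and add() may return a new instance when the array grows, so callers must keep the returned reference (as DirectoryTaxonomyWriter now does). A sketch, with parentArray a hypothetical instance:

  // growing may allocate a new ParentArray, so the result must be re-assigned:
  parentArray = parentArray.add(ordinal, parentOrdinal);

  // a parent lookup is then a single array access:
  int parent = parentArray.getArray()[ordinal];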

View File

@ -102,4 +102,10 @@ public class LRUHashMap<K,V> extends LinkedHashMap<K,V> {
return size() > maxSize;
}
@SuppressWarnings("unchecked")
@Override
public LRUHashMap<K,V> clone() {
return (LRUHashMap<K,V>) super.clone();
}
}
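The typed clone() spares callers an unchecked cast when copying a cache, e.g. to seed a reopened reader from the old reader's caches. A small sketch, assuming the map's single int constructor sets maxSize:

  LRUHashMap<String, Integer> cache = new LRUHashMap<String, Integer>(2);
  cache.put("a", 1);
  cache.put("b", 2);
  cache.put("c", 3); // size() > maxSize, so the least-recently-used entry ("a") is evicted
  LRUHashMap<String, Integer> copy = cache.clone(); // typed copy, no cast at the call site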

View File

@ -131,7 +131,7 @@ public class FacetTestUtils {
public static class IndexTaxonomyReaderPair {
public DirectoryReader indexReader;
public TaxonomyReader taxReader;
public DirectoryTaxonomyReader taxReader;
public IndexSearcher indexSearcher;
public void close() throws IOException {

View File

@ -78,11 +78,9 @@ public class TestTotalFacetCounts extends LuceneTestCase {
TestTotalFacetCountsCache.addFacets(iParams, writers[0].indexWriter, writers[0].taxWriter, "b", "c");
// Commit Changes
writers[0].commit();
writers[0].close();
IndexTaxonomyReaderPair[] readers =
FacetTestUtils.createIndexTaxonomyReaderPair(dirs);
IndexTaxonomyReaderPair[] readers = FacetTestUtils.createIndexTaxonomyReaderPair(dirs);
int[] intArray = new int[iParams.getPartitionSize()];
@ -93,8 +91,7 @@ public class TestTotalFacetCounts extends LuceneTestCase {
tfcc.load(tmpFile, readers[0].indexReader, readers[0].taxReader, iParams);
// now retrieve the one just loaded
TotalFacetCounts totalCounts =
tfcc.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null);
TotalFacetCounts totalCounts = tfcc.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null);
int partition = 0;
for (int i=0; i<expectedCounts.length; i+=partitionSize) {

View File

@ -297,23 +297,17 @@ public class TestTotalFacetCountsCache extends LuceneTestCase {
writers[0].indexWriter.close();
writers[0].taxWriter.close();
readers[0].taxReader.refresh();
DirectoryTaxonomyReader newTaxoReader = TaxonomyReader.openIfChanged(readers[0].taxReader);
assertNotNull(newTaxoReader);
assertTrue("should have received more cagtegories in updated taxonomy", newTaxoReader.getSize() > readers[0].taxReader.getSize());
readers[0].taxReader.close();
readers[0].taxReader = newTaxoReader;
DirectoryReader r2 = DirectoryReader.openIfChanged(readers[0].indexReader);
assertNotNull(r2);
// Hold on to the 'original' reader so we can do some checks with it
IndexReader origReader = null;
assertTrue("Reader must be updated!", readers[0].indexReader != r2);
// Set the 'original' reader
origReader = readers[0].indexReader;
// Set the new master index Reader
readers[0].indexReader.close();
readers[0].indexReader = r2;
// Try to get total-counts with the originalReader AGAIN, just for sanity; should be pulled from the cache, not recomputed.
assertTrue("Should be obtained from cache at 6th attempt",totalCounts ==
TFC.getTotalCounts(origReader, readers[0].taxReader, iParams, null));
// now use the new reader - should recompute
totalCounts = TFC.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null);
prevGen = assertRecomputed(totalCounts, prevGen, "after updating the index - 7th attempt!");
@ -322,9 +316,7 @@ public class TestTotalFacetCountsCache extends LuceneTestCase {
assertTrue("Should be obtained from cache at 8th attempt",totalCounts ==
TFC.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null));
origReader.close();
readers[0].close();
r2.close();
outputFile.delete();
IOUtils.close(dirs[0]);
}
@ -380,7 +372,10 @@ public class TestTotalFacetCountsCache extends LuceneTestCase {
writers[0].taxWriter.addCategory(new CategoryPath("foo", Integer.toString(i)));
}
writers[0].taxWriter.commit();
readers[0].taxReader.refresh();
DirectoryTaxonomyReader newTaxoReader = TaxonomyReader.openIfChanged(readers[0].taxReader);
assertNotNull(newTaxoReader);
readers[0].taxReader.close();
readers[0].taxReader = newTaxoReader;
initCache();

View File

@ -5,18 +5,17 @@ import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.junit.Ignore;
import org.junit.Test;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.facet.taxonomy.TaxonomyReader.ChildrenArrays;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.apache.lucene.util.SlowRAMDirectory;
import org.junit.Test;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -35,6 +34,8 @@ import org.apache.lucene.util.SlowRAMDirectory;
* limitations under the License.
*/
// TODO: remove this suppress after we fix the TaxoWriter Codec to a non-default (see todo in DirTW)
@SuppressCodecs("SimpleText")
public class TestTaxonomyCombined extends LuceneTestCase {
/** The following categories will be added to the taxonomy by
@ -725,7 +726,10 @@ public class TestTaxonomyCombined extends LuceneTestCase {
assertEquals(3, ca.getOlderSiblingArray().length);
assertEquals(3, ca.getYoungestChildArray().length);
// After the refresh, things change:
tr.refresh();
TaxonomyReader newtr = TaxonomyReader.openIfChanged(tr);
assertNotNull(newtr);
tr.close();
tr = newtr;
ca = tr.getChildrenArrays();
assertEquals(5, tr.getSize());
assertEquals(5, ca.getOlderSiblingArray().length);
@ -737,14 +741,11 @@ public class TestTaxonomyCombined extends LuceneTestCase {
indexDir.close();
}
/**
* Test that getParentArrays is valid when retrieved during refresh
*/
// Test that getParentArrays is valid when retrieved during refresh
@Test
@Ignore
public void testTaxonomyReaderRefreshRaces() throws Exception {
// compute base child arrays - after first chunk, and after the other
Directory indexDirBase = newDirectory();
Directory indexDirBase = newDirectory();
TaxonomyWriter twBase = new DirectoryTaxonomyWriter(indexDirBase);
twBase.addCategory(new CategoryPath("a", "0"));
final CategoryPath abPath = new CategoryPath("a", "b");
@ -757,56 +758,64 @@ public class TestTaxonomyCombined extends LuceneTestCase {
final int abOrd = trBase.getOrdinal(abPath);
final int abYoungChildBase1 = ca1.getYoungestChildArray()[abOrd];
for (int i=0; i < 1<<10; i++) { //1024 facets
final int numCategories = atLeast(800);
for (int i = 0; i < numCategories; i++) {
twBase.addCategory(new CategoryPath("a", "b", Integer.toString(i)));
}
twBase.commit();
twBase.close();
trBase.refresh();
TaxonomyReader newTaxoReader = TaxonomyReader.openIfChanged(trBase);
assertNotNull(newTaxoReader);
trBase.close();
trBase = newTaxoReader;
final ChildrenArrays ca2 = trBase.getChildrenArrays();
final int abYoungChildBase2 = ca2.getYoungestChildArray()[abOrd];
for (int retry=0; retry<100; retry++) {
assertConsistentYoungestChild(abPath, abOrd, abYoungChildBase1, abYoungChildBase2, retry);
int numRetries = atLeast(50);
for (int retry = 0; retry < numRetries; retry++) {
assertConsistentYoungestChild(abPath, abOrd, abYoungChildBase1, abYoungChildBase2, retry, numCategories);
}
trBase.close();
indexDirBase.close();
}
private void assertConsistentYoungestChild(final CategoryPath abPath,
final int abOrd, final int abYoungChildBase1, final int abYoungChildBase2, final int retry)
final int abOrd, final int abYoungChildBase1, final int abYoungChildBase2, final int retry, int numCategories)
throws Exception {
SlowRAMDirectory indexDir = new SlowRAMDirectory(-1,null); // no slowness for intialization
SlowRAMDirectory indexDir = new SlowRAMDirectory(-1, null); // no slowness for initialization
TaxonomyWriter tw = new DirectoryTaxonomyWriter(indexDir);
tw.addCategory(new CategoryPath("a", "0"));
tw.addCategory(abPath);
tw.commit();
final TaxonomyReader tr = new DirectoryTaxonomyReader(indexDir);
for (int i=0; i < 1<<10; i++) { //1024 facets
final DirectoryTaxonomyReader tr = new DirectoryTaxonomyReader(indexDir);
for (int i = 0; i < numCategories; i++) {
final CategoryPath cp = new CategoryPath("a", "b", Integer.toString(i));
tw.addCategory(cp);
assertEquals("Ordinal of "+cp+" must be invalid until Taxonomy Reader was refreshed", TaxonomyReader.INVALID_ORDINAL, tr.getOrdinal(cp));
}
tw.commit();
tw.close();
final boolean[] stop = new boolean[] { false };
final AtomicBoolean stop = new AtomicBoolean(false);
final Throwable[] error = new Throwable[] { null };
final int retrieval[] = { 0 };
Thread thread = new Thread("Child Arrays Verifier") {
@Override
public void run() {
setPriority(1+getPriority());
setPriority(1 + getPriority());
try {
while (!stop[0]) {
int lastOrd = tr.getParentArray().length-1;
assertNotNull("path of last-ord "+lastOrd+" is not found!",tr.getPath(lastOrd));
assertChildrenArrays(tr.getChildrenArrays(),retry,retrieval[0]++);
while (!stop.get()) {
int lastOrd = tr.getParentArray().length - 1;
assertNotNull("path of last-ord " + lastOrd + " is not found!", tr.getPath(lastOrd));
assertChildrenArrays(tr.getChildrenArrays(), retry, retrieval[0]++);
sleep(10); // don't starve the reopen's CPU; the SlowRAMDirectory sleeps 1 ms every 50 bytes
}
} catch (Throwable e) {
error[0] = e;
stop[0] = true;
stop.set(true);
}
}
@ -822,13 +831,15 @@ public class TestTaxonomyCombined extends LuceneTestCase {
thread.start();
indexDir.setSleepMillis(1); // some delay for refresh
tr.refresh();
TaxonomyReader newTaxoReader = TaxonomyReader.openIfChanged(tr);
if (newTaxoReader != null) {
newTaxoReader.close();
}
stop[0] = true;
stop.set(true);
thread.join();
assertNull("Unexpcted exception at retry "+retry+" retrieval "+retrieval[0]+": \n"+stackTraceStr(error[0]), error[0]);
tw.close();
tr.close();
}
@ -885,7 +896,7 @@ public class TestTaxonomyCombined extends LuceneTestCase {
// ok
}
assertEquals(1, tr.getSize()); // still root only...
tr.refresh(); // this is not enough, because tw.commit() hasn't been done yet
assertNull(TaxonomyReader.openIfChanged(tr)); // this is not enough, because tw.commit() hasn't been done yet
try {
tr.getParent(author);
fail("Before commit() and refresh(), getParent for "+author+" should still throw exception");
@ -901,7 +912,11 @@ public class TestTaxonomyCombined extends LuceneTestCase {
// ok
}
assertEquals(1, tr.getSize()); // still root only...
tr.refresh();
TaxonomyReader newTaxoReader = TaxonomyReader.openIfChanged(tr);
assertNotNull(newTaxoReader);
tr.close();
tr = newTaxoReader;
try {
assertEquals(TaxonomyReader.ROOT_ORDINAL, tr.getParent(author));
// ok
@ -917,7 +932,10 @@ public class TestTaxonomyCombined extends LuceneTestCase {
tw.addCategory(new CategoryPath("Author", "Richard Dawkins"));
int dawkins = 2;
tw.commit();
tr.refresh();
newTaxoReader = TaxonomyReader.openIfChanged(tr);
assertNotNull(newTaxoReader);
tr.close();
tr = newTaxoReader;
assertEquals(author, tr.getParent(dawkins));
assertEquals(TaxonomyReader.ROOT_ORDINAL, tr.getParent(author));
assertEquals(TaxonomyReader.INVALID_ORDINAL, tr.getParent(TaxonomyReader.ROOT_ORDINAL));
@ -943,16 +961,19 @@ public class TestTaxonomyCombined extends LuceneTestCase {
// before commit and refresh, no change:
assertEquals(TaxonomyReader.INVALID_ORDINAL, tr.getOrdinal(author));
assertEquals(1, tr.getSize()); // still root only...
tr.refresh(); // this is not enough, because tw.commit() hasn't been done yet
assertNull(TaxonomyReader.openIfChanged(tr)); // this is not enough, because tw.commit() hasn't been done yet
assertEquals(TaxonomyReader.INVALID_ORDINAL, tr.getOrdinal(author));
assertEquals(1, tr.getSize()); // still root only...
tw.commit();
// still not enough before refresh:
assertEquals(TaxonomyReader.INVALID_ORDINAL, tr.getOrdinal(author));
assertEquals(1, tr.getSize()); // still root only...
tr.refresh(); // finally
TaxonomyReader newTaxoReader = TaxonomyReader.openIfChanged(tr);
assertNotNull(newTaxoReader);
tr.close();
tr = newTaxoReader;
assertEquals(1, tr.getOrdinal(author));
assertEquals(2, tr.getSize()); // still root only...
assertEquals(2, tr.getSize());
tw.close();
tr.close();
indexDir.close();
@ -977,7 +998,7 @@ public class TestTaxonomyCombined extends LuceneTestCase {
// Try to open a second writer, with the first one locking the directory.
// We expect to get a LockObtainFailedException.
try {
new DirectoryTaxonomyWriter(indexDir);
assertNull(new DirectoryTaxonomyWriter(indexDir));
fail("should have failed to write in locked directory");
} catch (LockObtainFailedException e) {
// this is what we expect to happen.
@ -989,7 +1010,10 @@ public class TestTaxonomyCombined extends LuceneTestCase {
tw2.addCategory(new CategoryPath("hey"));
tw2.close();
// See that the writer indeed wrote:
tr.refresh();
TaxonomyReader newtr = TaxonomyReader.openIfChanged(tr);
assertNotNull(newtr);
tr.close();
tr = newtr;
assertEquals(3, tr.getOrdinal(new CategoryPath("hey")));
tr.close();
tw.close();
@ -1086,6 +1110,27 @@ public class TestTaxonomyCombined extends LuceneTestCase {
indexDir.close();
}
@Test
public void testNRT() throws Exception {
Directory dir = newDirectory();
DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
TaxonomyReader reader = new DirectoryTaxonomyReader(writer);
CategoryPath cp = new CategoryPath("a");
writer.addCategory(cp);
TaxonomyReader newReader = TaxonomyReader.openIfChanged(reader);
assertNotNull("expected a new instance", newReader);
assertEquals(2, newReader.getSize());
assertNotSame(TaxonomyReader.INVALID_ORDINAL, newReader.getOrdinal(cp));
reader.close();
reader = newReader;
writer.close();
reader.close();
dir.close();
}
// TODO (Facet): test multiple readers, one writer. Have the multiple readers
// using the same object (simulating threads) or different objects
// (simulating processes).

View File

@ -3,12 +3,11 @@ package org.apache.lucene.facet.taxonomy.directory;
import java.util.Random;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.InconsistentTaxonomyException;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
@ -67,11 +66,8 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {
dir.close();
}
/**
* Test the boolean returned by TR.refresh
*/
@Test
public void testReaderRefreshResult() throws Exception {
public void testOpenIfChangedResult() throws Exception {
Directory dir = null;
DirectoryTaxonomyWriter ltw = null;
DirectoryTaxonomyReader ltr = null;
@ -84,13 +80,15 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {
ltw.commit();
ltr = new DirectoryTaxonomyReader(dir);
assertFalse("Nothing has changed",ltr.refresh());
assertNull("Nothing has changed", TaxonomyReader.openIfChanged(ltr));
ltw.addCategory(new CategoryPath("b"));
ltw.commit();
assertTrue("changes were committed",ltr.refresh());
assertFalse("Nothing has changed",ltr.refresh());
DirectoryTaxonomyReader newtr = TaxonomyReader.openIfChanged(ltr);
assertNotNull("changes were committed", newtr);
assertNull("Nothing has changed", TaxonomyReader.openIfChanged(newtr));
newtr.close();
} finally {
IOUtils.close(ltw, ltr, dir);
}
@ -119,18 +117,15 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {
*/
@Test
public void testFreshReadRecreatedTaxonomy() throws Exception {
doTestReadRecreatedTaxono(random(), true);
doTestReadRecreatedTaxonomy(random(), true);
}
/**
* recreating a taxonomy should work well with a refreshed taxonomy reader
*/
@Test
public void testRefreshReadRecreatedTaxonomy() throws Exception {
doTestReadRecreatedTaxono(random(), false);
public void testOpenIfChangedReadRecreatedTaxonomy() throws Exception {
doTestReadRecreatedTaxonomy(random(), false);
}
private void doTestReadRecreatedTaxono(Random random, boolean closeReader) throws Exception {
private void doTestReadRecreatedTaxonomy(Random random, boolean closeReader) throws Exception {
Directory dir = null;
TaxonomyWriter tw = null;
TaxonomyReader tr = null;
@ -163,13 +158,10 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {
tr.close();
tr = new DirectoryTaxonomyReader(dir);
} else {
try {
tr.refresh();
fail("Expected InconsistentTaxonomyException");
} catch (InconsistentTaxonomyException e) {
tr.close();
tr = new DirectoryTaxonomyReader(dir);
}
TaxonomyReader newtr = TaxonomyReader.openIfChanged(tr);
assertNotNull(newtr);
tr.close();
tr = newtr;
}
assertEquals("Wrong #categories in taxonomy (i="+i+", k="+k+")", baseNumCategories + 1 + k, tr.getSize());
}
@ -179,14 +171,14 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {
}
@Test
public void testRefreshAndRefCount() throws Exception {
public void testOpenIfChangedAndRefCount() throws Exception {
Directory dir = new RAMDirectory(); // no need for random directories here
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(dir);
taxoWriter.addCategory(new CategoryPath("a"));
taxoWriter.commit();
DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(dir);
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(dir);
assertEquals("wrong refCount", 1, taxoReader.getRefCount());
taxoReader.incRef();
@ -194,12 +186,189 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {
taxoWriter.addCategory(new CategoryPath("a", "b"));
taxoWriter.commit();
taxoReader.refresh();
assertEquals("wrong refCount", 2, taxoReader.getRefCount());
TaxonomyReader newtr = TaxonomyReader.openIfChanged(taxoReader);
assertNotNull(newtr);
taxoReader.close();
taxoReader = newtr;
assertEquals("wrong refCount", 1, taxoReader.getRefCount());
taxoWriter.close();
taxoReader.close();
dir.close();
}
@Test
public void testOpenIfChangedManySegments() throws Exception {
// test openIfChanged() when the taxonomy contains many segments
Directory dir = newDirectory();
DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir) {
@Override
protected IndexWriterConfig createIndexWriterConfig(OpenMode openMode) {
IndexWriterConfig conf = super.createIndexWriterConfig(openMode);
LogMergePolicy lmp = (LogMergePolicy) conf.getMergePolicy();
lmp.setMergeFactor(2);
return conf;
}
};
TaxonomyReader reader = new DirectoryTaxonomyReader(writer);
int numRounds = random().nextInt(10) + 10;
int numCategories = 1; // one for root
for (int i = 0; i < numRounds; i++) {
int numCats = random().nextInt(4) + 1;
for (int j = 0; j < numCats; j++) {
writer.addCategory(new CategoryPath(Integer.toString(i), Integer.toString(j)));
}
numCategories += numCats + 1 /* one for round-parent */;
TaxonomyReader newtr = TaxonomyReader.openIfChanged(reader);
assertNotNull(newtr);
reader.close();
reader = newtr;
// assert categories
assertEquals(numCategories, reader.getSize());
int roundOrdinal = reader.getOrdinal(new CategoryPath(Integer.toString(i)));
int[] parents = reader.getParentArray();
assertEquals(0, parents[roundOrdinal]); // round's parent is root
for (int j = 0; j < numCats; j++) {
int ord = reader.getOrdinal(new CategoryPath(Integer.toString(i), Integer.toString(j)));
assertEquals(roundOrdinal, parents[ord]); // each category's parent is its round
}
}
reader.close();
writer.close();
dir.close();
}
@Test
public void testOpenIfChangedReuseAfterRecreate() throws Exception {
// tests that if the taxonomy is recreated, no data is reused from the previous taxonomy
Directory dir = newDirectory();
DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
CategoryPath cp_a = new CategoryPath("a");
writer.addCategory(cp_a);
writer.close();
DirectoryTaxonomyReader r1 = new DirectoryTaxonomyReader(dir);
// fill r1's caches
assertEquals(1, r1.getOrdinal(cp_a));
assertEquals(cp_a, r1.getPath(1));
// now recreate, add a different category
writer = new DirectoryTaxonomyWriter(dir, OpenMode.CREATE);
CategoryPath cp_b = new CategoryPath("b");
writer.addCategory(cp_b);
writer.close();
DirectoryTaxonomyReader r2 = TaxonomyReader.openIfChanged(r1);
assertNotNull(r2);
// fill r2's caches
assertEquals(1, r2.getOrdinal(cp_b));
assertEquals(cp_b, r2.getPath(1));
// check that r1 doesn't see cp_b
assertEquals(TaxonomyReader.INVALID_ORDINAL, r1.getOrdinal(cp_b));
assertEquals(cp_a, r1.getPath(1));
// check that r2 doesn't see cp_a
assertEquals(TaxonomyReader.INVALID_ORDINAL, r2.getOrdinal(cp_a));
assertEquals(cp_b, r2.getPath(1));
r2.close();
r1.close();
dir.close();
}
@Test
public void testOpenIfChangedReuse() throws Exception {
// test the reuse of data from the old DTR instance
for (boolean nrt : new boolean[] {false, true}) {
Directory dir = newDirectory();
DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
CategoryPath cp_a = new CategoryPath("a");
writer.addCategory(cp_a);
if (!nrt) writer.commit();
DirectoryTaxonomyReader r1 = nrt ? new DirectoryTaxonomyReader(writer) : new DirectoryTaxonomyReader(dir);
// fill r1's caches
assertEquals(1, r1.getOrdinal(cp_a));
assertEquals(cp_a, r1.getPath(1));
CategoryPath cp_b = new CategoryPath("b");
writer.addCategory(cp_b);
if (!nrt) writer.commit();
DirectoryTaxonomyReader r2 = TaxonomyReader.openIfChanged(r1);
assertNotNull(r2);
// add r2's categories to the caches
assertEquals(2, r2.getOrdinal(cp_b));
assertEquals(cp_b, r2.getPath(2));
// check that r1 doesn't see cp_b
assertEquals(TaxonomyReader.INVALID_ORDINAL, r1.getOrdinal(cp_b));
assertNull(r1.getPath(2));
r1.close();
r2.close();
writer.close();
dir.close();
}
}
@Test
public void testOpenIfChangedReplaceTaxonomy() throws Exception {
// test openIfChanged when replaceTaxonomy is called, which is equivalent to a recreate;
// this should work with NRT readers as well
Directory src = newDirectory();
DirectoryTaxonomyWriter w = new DirectoryTaxonomyWriter(src);
CategoryPath cp_b = new CategoryPath("b");
w.addCategory(cp_b);
w.close();
for (boolean nrt : new boolean[] {false, true}) {
Directory dir = newDirectory();
DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
CategoryPath cp_a = new CategoryPath("a");
writer.addCategory(cp_a);
if (!nrt) writer.commit();
DirectoryTaxonomyReader r1 = nrt ? new DirectoryTaxonomyReader(writer) : new DirectoryTaxonomyReader(dir);
// fill r1's caches
assertEquals(1, r1.getOrdinal(cp_a));
assertEquals(cp_a, r1.getPath(1));
// now replace taxonomy
writer.replaceTaxonomy(src);
if (!nrt) writer.commit();
DirectoryTaxonomyReader r2 = TaxonomyReader.openIfChanged(r1);
assertNotNull(r2);
// fill r2's caches
assertEquals(1, r2.getOrdinal(cp_b));
assertEquals(cp_b, r2.getPath(1));
// check that r1 doesn't see cp_b
assertEquals(TaxonomyReader.INVALID_ORDINAL, r1.getOrdinal(cp_b));
assertEquals(cp_a, r1.getPath(1));
// check that r2 doesn't see cp_a
assertEquals(TaxonomyReader.INVALID_ORDINAL, r2.getOrdinal(cp_a));
assertEquals(cp_b, r2.getPath(1));
r2.close();
r1.close();
writer.close();
dir.close();
}
src.close();
}
}

View File

@ -8,7 +8,7 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.InconsistentTaxonomyException;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.cl2o.Cl2oTaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.lru.LruTaxonomyWriterCache;
@ -178,12 +178,14 @@ public class TestDirectoryTaxonomyWriter extends LuceneTestCase {
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(dir, OpenMode.CREATE_OR_APPEND, NO_OP_CACHE);
touchTaxo(taxoWriter, new CategoryPath("a"));
DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(dir);
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(dir);
touchTaxo(taxoWriter, new CategoryPath("b"));
// this should not fail
taxoReader.refresh();
TaxonomyReader newtr = TaxonomyReader.openIfChanged(taxoReader);
taxoReader.close();
taxoReader = newtr;
assertEquals(1, Integer.parseInt(taxoReader.getCommitUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH)));
// now recreate the taxonomy, and check that the epoch is preserved after opening DirTW again.
taxoWriter.close();
@ -195,14 +197,11 @@ public class TestDirectoryTaxonomyWriter extends LuceneTestCase {
touchTaxo(taxoWriter, new CategoryPath("d"));
taxoWriter.close();
// this should fail
try {
taxoReader.refresh();
fail("IconsistentTaxonomyException should have been thrown");
} catch (InconsistentTaxonomyException e) {
// ok, expected
}
newtr = TaxonomyReader.openIfChanged(taxoReader);
taxoReader.close();
taxoReader = newtr;
assertEquals(2, Integer.parseInt(taxoReader.getCommitUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH)));
taxoReader.close();
dir.close();
}
@ -221,7 +220,7 @@ public class TestDirectoryTaxonomyWriter extends LuceneTestCase {
DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(dir);
assertEquals(1, Integer.parseInt(taxoReader.getCommitUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH)));
taxoReader.refresh();
assertNull(TaxonomyReader.openIfChanged(taxoReader));
taxoReader.close();
dir.close();