mirror of https://github.com/apache/lucene.git

LUCENE-3441: facets NRT support

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1412149 13f79535-47bb-0310-9956-ffa450edef68

parent 258baa7069
commit 8b5e57faee
@@ -34,6 +34,26 @@ Changes in backwards compatibility policy
  Override lengthNorm and/or encode/decodeNormValue to change the specifics,
  like Lucene 3.x. (Robert Muir)

+* LUCENE-3441: The facet module now supports NRT. As a result, the following
+  changes were made:
+  - DirectoryTaxonomyReader has a new constructor which takes a
+    DirectoryTaxonomyWriter. You should use that constructor in order to get
+    the NRT support (or the old one for non-NRT).
+  - TaxonomyReader.refresh() was removed in favor of the static method
+    TaxonomyReader.openIfChanged. Similar to DirectoryReader, the method either
+    returns null if no changes were made to the taxonomy, or a new
+    TaxonomyReader instance otherwise. Instead of calling refresh(), you should
+    write code similar to how you reopen a regular DirectoryReader (see the
+    sketch after this list).
+  - TaxonomyReader.openIfChanged (previously refresh()) no longer throws
+    InconsistentTaxonomyException, and supports recreate.
+    InconsistentTaxonomyException was removed.
+  - ChildrenArrays was pulled out of TaxonomyReader into a top-level class.
+  - TaxonomyReader was made an abstract class (instead of an interface), with
+    methods such as close() and reference counting management pulled from
+    DirectoryTaxonomyReader and made final. The rest of the methods remained
+    abstract.
+  (Shai Erera, Gilad Barkai)
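A minimal migration sketch (editor's illustration, not part of this commit; `taxoDir` and the category added are placeholders):

    import org.apache.lucene.facet.taxonomy.CategoryPath;
    import org.apache.lucene.facet.taxonomy.TaxonomyReader;
    import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
    import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;

    // open a writer and an NRT reader over it (the new constructor)
    DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
    DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);

    taxoWriter.addCategory(new CategoryPath("a", "b"));

    // instead of taxoReader.refresh(), reopen like a DirectoryReader:
    DirectoryTaxonomyReader newReader = TaxonomyReader.openIfChanged(taxoReader);
    if (newReader != null) { // null means nothing changed
      taxoReader.close();    // release the old instance
      taxoReader = newReader;
    }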

 New Features

 * LUCENE-4226: New experimental StoredFieldsFormat that compresses chunks of
@@ -8,8 +8,8 @@ import org.apache.lucene.facet.search.results.FacetResult;
 import org.apache.lucene.facet.search.results.FacetResultNode;
 import org.apache.lucene.facet.search.results.MutableFacetResultNode;
 import org.apache.lucene.facet.search.results.IntermediateFacetResult;
+import org.apache.lucene.facet.taxonomy.ChildrenArrays;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
-import org.apache.lucene.facet.taxonomy.TaxonomyReader.ChildrenArrays;
 import org.apache.lucene.facet.util.ResultSortUtils;

 /*
@@ -120,7 +120,7 @@ public class TopKFacetResultsHandler extends FacetResultsHandler {
    * @return total number of descendants considered here by pq, excluding ordinal itself.
    */
   private int heapDescendants(int ordinal, Heap<FacetResultNode> pq,
-      MutableFacetResultNode parentResultNode, FacetArrays facetArrays, int offset) {
+      MutableFacetResultNode parentResultNode, FacetArrays facetArrays, int offset) throws IOException {
     int partitionSize = facetArrays.getArraysLength();
     int endOffset = offset + partitionSize;
     ChildrenArrays childrenArray = taxonomyReader.getChildrenArrays();
@@ -12,8 +12,8 @@ import org.apache.lucene.facet.search.results.FacetResult;
 import org.apache.lucene.facet.search.results.FacetResultNode;
 import org.apache.lucene.facet.search.results.MutableFacetResultNode;
 import org.apache.lucene.facet.search.results.IntermediateFacetResult;
+import org.apache.lucene.facet.taxonomy.ChildrenArrays;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
-import org.apache.lucene.facet.taxonomy.TaxonomyReader.ChildrenArrays;
 import org.apache.lucene.util.collections.IntIterator;
 import org.apache.lucene.util.collections.IntToObjectMap;
@@ -0,0 +1,87 @@
+package org.apache.lucene.facet.taxonomy;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Equivalent representations of the taxonomy's parent info,
+ * used internally for efficient computation of facet results:
+ * "youngest child" and "oldest sibling"
+ */
+public class ChildrenArrays {
+
+  private final int[] youngestChild, olderSibling;
+
+  public ChildrenArrays(int[] parents) {
+    this(parents, null);
+  }
+
+  public ChildrenArrays(int[] parents, ChildrenArrays copyFrom) {
+    youngestChild = new int[parents.length];
+    olderSibling = new int[parents.length];
+    int first = 0;
+    if (copyFrom != null) {
+      System.arraycopy(copyFrom.getYoungestChildArray(), 0, youngestChild, 0, copyFrom.getYoungestChildArray().length);
+      System.arraycopy(copyFrom.getOlderSiblingArray(), 0, olderSibling, 0, copyFrom.getOlderSiblingArray().length);
+      first = copyFrom.getOlderSiblingArray().length;
+    }
+    computeArrays(parents, first);
+  }
+
+  private void computeArrays(int[] parents, int first) {
+    // reset the youngest child of all ordinals. while this should be done only
+    // for the leaves, we don't know up front which are the leaves, so we reset
+    // all of them.
+    for (int i = first; i < parents.length; i++) {
+      youngestChild[i] = TaxonomyReader.INVALID_ORDINAL;
+    }
+
+    // the root category has no parent, and therefore no siblings
+    if (first == 0) {
+      first = 1;
+      olderSibling[0] = TaxonomyReader.INVALID_ORDINAL;
+    }
+
+    for (int i = first; i < parents.length; i++) {
+      // note that parents[i] is always < i, so the right-hand-side of
+      // the following line is already set when we get here
+      olderSibling[i] = youngestChild[parents[i]];
+      youngestChild[parents[i]] = i;
+    }
+  }
+
+  /**
+   * Returns an {@code int[]} the size of the taxonomy listing for each category
+   * the ordinal of its immediate older sibling (the sibling in the taxonomy
+   * tree with the highest ordinal below that of the given ordinal). The value
+   * for a category with no older sibling is {@link TaxonomyReader#INVALID_ORDINAL}.
+   */
+  public int[] getOlderSiblingArray() {
+    return olderSibling;
+  }
+
+  /**
+   * Returns an {@code int[]} the size of the taxonomy listing the ordinal of
+   * the youngest (highest numbered) child category of each category in the
+   * taxonomy. The value for a leaf category (a category without children) is
+   * {@link TaxonomyReader#INVALID_ORDINAL}.
+   */
+  public int[] getYoungestChildArray() {
+    return youngestChild;
+  }
+
+}
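A short usage sketch (editor's illustration; `ca` and `parentOrdinal` are placeholders): the two arrays form per-category linked lists of children, so enumerating the children of a category means starting at its youngest child and following older-sibling links until INVALID_ORDINAL:

    int[] youngest = ca.getYoungestChildArray();
    int[] olderSibling = ca.getOlderSiblingArray();
    for (int child = youngest[parentOrdinal];
         child != TaxonomyReader.INVALID_ORDINAL;
         child = olderSibling[child]) {
      // visit child; ordinals arrive from youngest (highest) to oldest
    }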
@@ -1,40 +0,0 @@
-package org.apache.lucene.facet.taxonomy;
-
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Exception indicating that a certain operation could not be performed
- * on a taxonomy related object because of an inconsistency.
- * <p>
- * For example, trying to refresh a taxonomy reader might fail in case
- * the underlying taxonomy was meanwhile modified in a manner which
- * does not allow to perform such a refresh. (See {@link TaxonomyReader#refresh()}.)
- *
- * @lucene.experimental
- */
-public class InconsistentTaxonomyException extends Exception {
-
-  public InconsistentTaxonomyException(String message) {
-    super(message);
-  }
-
-  public InconsistentTaxonomyException() {
-    super();
-  }
-
-}
@@ -3,6 +3,9 @@ package org.apache.lucene.facet.taxonomy;
 import java.io.Closeable;
 import java.io.IOException;
 import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.lucene.store.AlreadyClosedException;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -60,13 +63,13 @@ import java.util.Map;
  *
  * @lucene.experimental
  */
-public interface TaxonomyReader extends Closeable {
+public abstract class TaxonomyReader implements Closeable {

   /**
-   * The root category (the category with the empty path) always has the
-   * ordinal 0, to which we give a name ROOT_ORDINAL.
-   * getOrdinal() of an empty path will always return ROOT_ORDINAL, and
-   * getCategory(ROOT_ORDINAL) will return the empty path.
+   * The root category (the category with the empty path) always has the ordinal
+   * 0, to which we give a name ROOT_ORDINAL. {@link #getOrdinal(CategoryPath)}
+   * of an empty path will always return {@code ROOT_ORDINAL}, and
+   * {@link #getPath(int)} with {@code ROOT_ORDINAL} will return the empty path.
    */
   public final static int ROOT_ORDINAL = 0;
@@ -77,207 +80,189 @@ public interface TaxonomyReader extends Closeable {
   public final static int INVALID_ORDINAL = -1;

   /**
-   * getOrdinal() returns the ordinal of the category given as a path.
-   * The ordinal is the category's serial number, an integer which starts
-   * with 0 and grows as more categories are added (note that once a category
-   * is added, it can never be deleted).
-   * <P>
-   * If the given category wasn't found in the taxonomy, INVALID_ORDINAL is
-   * returned.
+   * If the taxonomy has changed since the provided reader was opened, open and
+   * return a new {@link TaxonomyReader}; else, return {@code null}. The new
+   * reader, if not {@code null}, will be the same type of reader as the one
+   * given to this method.
+   *
+   * <p>
+   * This method is typically far less costly than opening a fully new
+   * {@link TaxonomyReader} as it shares resources with the provided
+   * {@link TaxonomyReader}, when possible.
    */
-  public int getOrdinal(CategoryPath categoryPath) throws IOException;
-
-  /**
-   * getPath() returns the path name of the category with the given
-   * ordinal. The path is returned as a new CategoryPath object - to
-   * reuse an existing object, use {@link #getPath(int, CategoryPath)}.
-   * <P>
-   * A null is returned if a category with the given ordinal does not exist.
-   */
-  public CategoryPath getPath(int ordinal) throws IOException;
+  public static <T extends TaxonomyReader> T openIfChanged(T oldTaxoReader) throws IOException {
+    @SuppressWarnings("unchecked")
+    final T newTaxoReader = (T) oldTaxoReader.doOpenIfChanged();
+    assert newTaxoReader != oldTaxoReader;
+    return newTaxoReader;
+  }

-  /**
-   * getPath() returns the path name of the category with the given
-   * ordinal. The path is written to the given CategoryPath object (which
-   * is cleared first).
-   * <P>
-   * If a category with the given ordinal does not exist, the given
-   * CategoryPath object is not modified, and the method returns
-   * <code>false</code>. Otherwise, the method returns <code>true</code>.
-   */
-  public boolean getPath(int ordinal, CategoryPath result) throws IOException;
+  private volatile boolean closed = false;

-  /**
-   * refresh() re-reads the taxonomy information if there were any changes to
-   * the taxonomy since this instance was opened or last refreshed. Calling
-   * refresh() is more efficient than close()ing the old instance and opening a
-   * new one.
-   * <P>
-   * If there were no changes since this instance was opened or last refreshed,
-   * then this call does nothing. Note, however, that this is still a relatively
-   * slow method (as it needs to verify whether there have been any changes on
-   * disk to the taxonomy), so it should not be called too often needlessly. In
-   * faceted search, the taxonomy reader's refresh() should be called only after
-   * a reopen() of the main index.
-   * <P>
-   * Refreshing the taxonomy might fail in some cases, for example
-   * if the taxonomy was recreated since this instance was opened or last refreshed.
-   * In this case an {@link InconsistentTaxonomyException} is thrown,
-   * suggesting that in order to obtain up-to-date taxonomy data a new
-   * {@link TaxonomyReader} should be opened. Note: This {@link TaxonomyReader}
-   * instance remains unchanged and usable in this case, and the application can
-   * continue to use it, and should still {@link #close()} when no longer needed.
-   * <P>
-   * It should be noted that refresh() is similar in purpose to
-   * IndexReader.reopen(), but the two methods behave differently. refresh()
-   * refreshes the existing TaxonomyReader object, rather than opening a new one
-   * in addition to the old one as reopen() does. The reason is that in a
-   * taxonomy, one can only add new categories and cannot modify or delete
-   * existing categories; Therefore, there is no reason to keep an old snapshot
-   * of the taxonomy open - refreshing the taxonomy to the newest data and using
-   * this new snapshot in all threads (whether new or old) is fine. This saves
-   * us needing to keep multiple copies of the taxonomy open in memory.
-   * @return true if anything has changed, false otherwise.
-   */
-  public boolean refresh() throws IOException, InconsistentTaxonomyException;
+  // set refCount to 1 at start
+  private final AtomicInteger refCount = new AtomicInteger(1);

   /**
-   * getParent() returns the ordinal of the parent category of the category
-   * with the given ordinal.
-   * <P>
-   * When a category is specified as a path name, finding the path of its
-   * parent is as trivial as dropping the last component of the path.
-   * getParent() is functionally equivalent to calling getPath() on the
-   * given ordinal, dropping the last component of the path, and then calling
-   * getOrdinal() to get an ordinal back. However, implementations are
-   * expected to provide a much more efficient implementation:
-   * <P>
-   * getParent() should be a very quick method, as it is used during the
-   * facet aggregation process in faceted search. Implementations will most
-   * likely want to serve replies to this method from a pre-filled cache.
-   * <P>
-   * If the given ordinal is the ROOT_ORDINAL, an INVALID_ORDINAL is returned.
-   * If the given ordinal is a top-level category, the ROOT_ORDINAL is returned.
-   * If an invalid ordinal is given (negative or beyond the last available
-   * ordinal), an ArrayIndexOutOfBoundsException is thrown. However, it is
-   * expected that getParent will only be called for ordinals which are
-   * already known to be in the taxonomy.
+   * Performs the actual task of closing the resources that are used by the
+   * taxonomy reader.
    */
-  public int getParent(int ordinal) throws IOException;
+  protected abstract void doClose() throws IOException;

   /**
-   * getParentArray() returns an int array of size getSize() listing the
-   * ordinal of the parent category of each category in the taxonomy.
-   * <P>
-   * The caller can hold on to the array it got indefinitely - it is
-   * guaranteed that no-one else will modify it. The other side of the
-   * same coin is that the caller must treat the array it got as read-only
-   * and <B>not modify it</B>, because other callers might have gotten the
-   * same array too (and getParent() calls might be answered from the
-   * same array).
-   * <P>
-   * If you use getParentArray() instead of getParent(), remember that
-   * the array you got is (naturally) not modified after a refresh(),
-   * so you should always call getParentArray() again after a refresh().
-   * <P>
-   * This method's function is similar to allocating an array of size
-   * getSize() and filling it with getParent() calls, but implementations
-   * are encouraged to implement it much more efficiently, with O(1)
-   * complexity. This can be done, for example, by the implementation
-   * already keeping the parents in an array, and just returning this
-   * array (without any allocation or copying) when requested.
+   * Implements the actual opening of a new {@link TaxonomyReader} instance if
+   * the taxonomy has changed.
+   *
+   * @see #openIfChanged(TaxonomyReader)
    */
-  public int[] getParentArray() throws IOException;
+  protected abstract TaxonomyReader doOpenIfChanged() throws IOException;

   /**
-   * Equivalent representations of the taxonomy's parent info,
-   * used internally for efficient computation of facet results:
-   * "youngest child" and "oldest sibling"
+   * @throws AlreadyClosedException if this TaxonomyReader is closed
    */
-  public static interface ChildrenArrays {
-
-    /**
-     * getYoungestChildArray() returns an int array of size getSize()
-     * listing the ordinal of the youngest (highest numbered) child
-     * category of each category in the taxonomy. The value for a leaf
-     * category (a category without children) is
-     * <code>INVALID_ORDINAL</code>.
-     */
-    public int[] getYoungestChildArray();
-
-    /**
-     * getOlderSiblingArray() returns an int array of size getSize()
-     * listing for each category the ordinal of its immediate older
-     * sibling (the sibling in the taxonomy tree with the highest ordinal
-     * below that of the given ordinal). The value for a category with no
-     * older sibling is <code>INVALID_ORDINAL</code>.
-     */
-    public int[] getOlderSiblingArray();
-  }
+  protected final void ensureOpen() throws AlreadyClosedException {
+    if (getRefCount() <= 0) {
+      throw new AlreadyClosedException("this TaxonomyReader is closed");
+    }
+  }

+  @Override
+  public final void close() throws IOException {
+    if (!closed) {
+      synchronized (this) {
+        if (!closed) {
+          decRef();
+          closed = true;
+        }
+      }
+    }
+  }

   /**
-   * getChildrenArrays() returns a {@link ChildrenArrays} object which can
-   * be used together to efficiently enumerate the children of any category.
-   * <P>
-   * The caller can hold on to the object it got indefinitely - it is
-   * guaranteed that no-one else will modify it. The other side of the
-   * same coin is that the caller must treat the object which it got (and
-   * the arrays it contains) as read-only and <B>not modify it</B>, because
-   * other callers might have gotten the same object too.
-   * <P>
-   * Implementations should have O(getSize()) time for the first call or
-   * after a refresh(), but O(1) time for further calls. In neither case
-   * should there be a need to read new data from disk. These guarantees
-   * are most likely achieved by calculating this object (based on the
-   * getParentArray()) when first needed, and later (if the taxonomy was not
-   * refreshed) returning the same object (without any allocation or copying)
-   * when requested.
-   * <P>
-   * The reason we have one method returning one object, rather than two
-   * methods returning two arrays, is to avoid race conditions in a multi-
-   * threaded application: We want to avoid the possibility of returning one
-   * new array and one old array, as those could not be used together.
+   * Expert: decreases the refCount of this TaxonomyReader instance. If the
+   * refCount drops to 0 this taxonomy reader is closed.
    */
-  public ChildrenArrays getChildrenArrays();
+  public final void decRef() throws IOException {
+    ensureOpen();
+    final int rc = refCount.decrementAndGet();
+    if (rc == 0) {
+      boolean success = false;
+      try {
+        doClose();
+        closed = true;
+        success = true;
+      } finally {
+        if (!success) {
+          // Put reference back on failure
+          refCount.incrementAndGet();
+        }
+      }
+    } else if (rc < 0) {
+      throw new IllegalStateException("too many decRef calls: refCount is " + rc + " after decrement");
+    }
+  }

+  /**
+   * Returns a {@link ChildrenArrays} object which can be used together to
+   * efficiently enumerate the children of any category.
+   * <p>
+   * The caller can hold on to the object it got indefinitely - it is guaranteed
+   * that no-one else will modify it. The other side of the same coin is that
+   * the caller must treat the object which it got (and the arrays it contains)
+   * as read-only and <b>not modify it</b>, because other callers might have
+   * gotten the same object too.
+   */
+  public abstract ChildrenArrays getChildrenArrays() throws IOException;

   /**
    * Retrieve user committed data.
    *
    * @see TaxonomyWriter#commit(Map)
    */
-  public Map<String, String> getCommitUserData() throws IOException;
-
-  /**
-   * Expert: increments the refCount of this TaxonomyReader instance.
-   * RefCounts can be used to determine when a taxonomy reader can be closed
-   * safely, i.e. as soon as there are no more references.
-   * Be sure to always call a corresponding decRef(), in a finally clause;
-   * otherwise the reader may never be closed.
-   */
-  public void incRef();
-
-  /**
-   * Expert: decreases the refCount of this TaxonomyReader instance.
-   * If the refCount drops to 0, then pending changes (if any) can be
-   * committed to the taxonomy index and this reader can be closed.
-   * @throws IOException If there is a low-level I/O error.
-   */
-  public void decRef() throws IOException;
+  public abstract Map<String, String> getCommitUserData() throws IOException;

   /**
-   * Expert: returns the current refCount for this taxonomy reader
+   * Returns the ordinal of the category given as a path. The ordinal is the
+   * category's serial number, an integer which starts with 0 and grows as more
+   * categories are added (note that once a category is added, it can never be
+   * deleted).
+   *
+   * @return the category's ordinal or {@link #INVALID_ORDINAL} if the category
+   *         wasn't found.
    */
-  public int getRefCount();
+  public abstract int getOrdinal(CategoryPath categoryPath) throws IOException;

   /**
-   * getSize() returns the number of categories in the taxonomy.
-   * <P>
-   * Because categories are numbered consecutively starting with 0, it
-   * means the taxonomy contains ordinals 0 through getSize()-1.
-   * <P>
-   * Note that the number returned by getSize() is often slightly higher
-   * than the number of categories inserted into the taxonomy; This is
-   * because when a category is added to the taxonomy, its ancestors
-   * are also added automatically (including the root, which always gets
-   * ordinal 0).
+   * Returns the ordinal of the parent category of the category with the given
+   * ordinal, according to the following rules:
+   *
+   * <ul>
+   * <li>If the given ordinal is the {@link #ROOT_ORDINAL}, an
+   * {@link #INVALID_ORDINAL} is returned.
+   * <li>If the given ordinal is a top-level category, the {@link #ROOT_ORDINAL}
+   * is returned.
+   * <li>If the given ordinal is an existing category, returns the ordinal of
+   * its parent.
+   * </ul>
+   *
+   * @throws ArrayIndexOutOfBoundsException
+   *           if an invalid ordinal is given (negative or beyond the last
+   *           available ordinal)
    */
-  public int getSize();
+  public abstract int getParent(int ordinal) throws IOException;

+  /**
+   * Returns an {@code int[]} the size of the taxonomy listing the ordinal of
+   * the parent category of each category in the taxonomy.
+   * <p>
+   * The caller can hold on to the array it got indefinitely - it is guaranteed
+   * that no-one else will modify it. The other side of the same coin is that
+   * the caller must treat the array it got as read-only and <b>not modify
+   * it</b>, because other callers might have gotten the same array too (and
+   * getParent() calls might be answered from the same array).
+   */
+  public abstract int[] getParentArray() throws IOException;

+  /**
+   * Returns the path name of the category with the given ordinal. The path is
+   * returned as a new CategoryPath object - to reuse an existing object, use
+   * {@link #getPath(int, CategoryPath)}.
+   *
+   * @return a {@link CategoryPath} with the required path, or {@code null} if
+   *         the given ordinal is unknown to the taxonomy.
+   */
+  public abstract CategoryPath getPath(int ordinal) throws IOException;

+  /**
+   * Same as {@link #getPath(int)}, only reuses the given {@link CategoryPath}
+   * instance.
+   */
+  public abstract boolean getPath(int ordinal, CategoryPath result) throws IOException;

+  /** Returns the current refCount for this taxonomy reader. */
+  public final int getRefCount() {
+    return refCount.get();
+  }

+  /**
+   * Returns the number of categories in the taxonomy. Note that the number of
+   * categories returned is often slightly higher than the number of categories
+   * inserted into the taxonomy; this is because when a category is added to the
+   * taxonomy, its ancestors are also added automatically (including the root,
+   * which always gets ordinal 0).
+   */
+  public abstract int getSize();

+  /**
+   * Expert: increments the refCount of this TaxonomyReader instance. RefCounts
+   * can be used to determine when a taxonomy reader can be closed safely, i.e.
+   * as soon as there are no more references. Be sure to always call a
+   * corresponding decRef(), in a finally clause; otherwise the reader may never
+   * be closed.
+   */
+  public final void incRef() {
+    ensureOpen();
+    refCount.incrementAndGet();
+  }

 }
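The reference-counting contract described in the incRef() javadoc can be used like this (editor's sketch; `taxoReader` is a placeholder for any TaxonomyReader instance):

    taxoReader.incRef();
    try {
      // ... use the reader: getOrdinal(), getPath(), getParentArray(), ...
    } finally {
      taxoReader.decRef(); // closes the reader if the count drops to 0
    }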
@@ -2,6 +2,7 @@ package org.apache.lucene.facet.taxonomy.directory;

 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.StoredFieldVisitor;
+import org.apache.lucene.util.BytesRef;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -28,6 +29,7 @@ abstract class Consts {
   static final String FULL = "$full_path$";
   static final String FIELD_PAYLOADS = "$payloads$";
   static final String PAYLOAD_PARENT = "p";
+  static final BytesRef PAYLOAD_PARENT_BYTES_REF = new BytesRef(PAYLOAD_PARENT);
   static final char[] PAYLOAD_PARENT_CHARS = PAYLOAD_PARENT.toCharArray();

   /**
@@ -1,29 +1,23 @@
 package org.apache.lucene.facet.taxonomy.directory;

 import java.io.IOException;
-import java.util.Iterator;
 import java.util.Map;
-import java.util.Map.Entry;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.concurrent.locks.ReadWriteLock;
-import java.util.concurrent.locks.ReentrantReadWriteLock;
 import java.util.logging.Level;
 import java.util.logging.Logger;

 import org.apache.lucene.facet.taxonomy.CategoryPath;
-import org.apache.lucene.facet.taxonomy.InconsistentTaxonomyException;
+import org.apache.lucene.facet.taxonomy.ChildrenArrays;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
 import org.apache.lucene.facet.taxonomy.directory.Consts.LoadFullPathOnly;
 import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.store.AlreadyClosedException;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.collections.LRUHashMap;

 /*
@@ -55,89 +49,341 @@ import org.apache.lucene.util.collections.LRUHashMap;
  *
  * @lucene.experimental
  */
-public class DirectoryTaxonomyReader implements TaxonomyReader {
+public class DirectoryTaxonomyReader extends TaxonomyReader {

   private static final Logger logger = Logger.getLogger(DirectoryTaxonomyReader.class.getName());

+  private static final int DEFAULT_CACHE_VALUE = 4000;

-  private DirectoryReader indexReader;
+  private final DirectoryTaxonomyWriter taxoWriter;
+  private final long taxoEpoch; // used in doOpenIfChanged
+  private final DirectoryReader indexReader;

-  // The following lock is used to allow multiple threads to read from the
-  // index concurrently, while having them block during the very short
-  // critical moment of refresh() (see comments below). Note, however, that
-  // we only read from the index when we don't have the entry in our cache,
-  // and the caches are locked separately.
-  private ReadWriteLock indexReaderLock = new ReentrantReadWriteLock();

+  // TODO: test DoubleBarrelLRUCache and consider using it instead
+  private LRUHashMap<String, Integer> ordinalCache;
+  private LRUHashMap<Integer, String> categoryCache;

-  // The following are the limited-size LRU caches used to cache the latest
-  // results from getOrdinal() and getLabel().
-  // Because LRUHashMap is not thread-safe, we need to synchronize on this
-  // object when using it. Unfortunately, this is not optimal under heavy
-  // contention because it means that while one thread is using the cache
-  // (reading or modifying) others are blocked from using it - or even
-  // starting to do benign things like calculating the hash function. A more
-  // efficient approach would be to use a non-locking (as much as possible)
-  // concurrent solution, along the lines of java.util.concurrent.ConcurrentHashMap
-  // but with LRU semantics.
-  // However, even in the current sub-optimal implementation we do not make
-  // the mistake of locking out readers while waiting for disk in a cache
-  // miss - below, we do not hold cache lock while reading missing data from
-  // disk.
-  private final LRUHashMap<String, Integer> ordinalCache;
-  private final LRUHashMap<Integer, String> categoryCache;

-  // getParent() needs to be extremely efficient, to the point that we need
-  // to fetch all the data in advance into memory, and answer these calls
-  // from memory. Currently we use a large integer array, which is
-  // initialized when the taxonomy is opened, and potentially enlarged
-  // when it is refresh()ed.
-  // These arrays are not synchronized. Rather, the reference to the array
-  // is volatile, and the only writing operation (refreshPrefetchArrays)
-  // simply creates a new array and replaces the reference. The volatility
-  // of the reference ensures the correct atomic replacement and its
-  // visibility properties (the content of the array is visible when the
-  // new reference is visible).
-  private ParentArray parentArray;
+  // TODO: consolidate these objects into one ParentInfo or something?
+  private volatile ParentArray parentArray;
+  private volatile ChildrenArrays childrenArrays;

   private char delimiter = Consts.DEFAULT_DELIMITER;

-  private volatile boolean closed = false;
-
-  // set refCount to 1 at start
-  private final AtomicInteger refCount = new AtomicInteger(1);
+  /**
+   * Called only from {@link #doOpenIfChanged()}. If the taxonomy has been
+   * recreated, you should pass {@code null} as the caches and parent/children
+   * arrays.
+   */
+  DirectoryTaxonomyReader(DirectoryReader indexReader, DirectoryTaxonomyWriter taxoWriter,
+      LRUHashMap<String,Integer> ordinalCache,
+      LRUHashMap<Integer,String> categoryCache, ParentArray parentArray,
+      ChildrenArrays childrenArrays) throws IOException {
+    this.indexReader = indexReader;
+    this.taxoWriter = taxoWriter;
+    this.taxoEpoch = taxoWriter == null ? -1 : taxoWriter.getTaxonomyEpoch();
+
+    // use the same instance of the cache, note the protective code in getOrdinal and getPath
+    this.ordinalCache = ordinalCache == null ? new LRUHashMap<String,Integer>(DEFAULT_CACHE_VALUE) : ordinalCache;
+    this.categoryCache = categoryCache == null ? new LRUHashMap<Integer,String>(DEFAULT_CACHE_VALUE) : categoryCache;
+
+    this.parentArray = null;
+    this.childrenArrays = null;
+    if (parentArray != null) {
+      this.parentArray = new ParentArray(indexReader, parentArray);
+      if (childrenArrays != null) {
+        this.childrenArrays = new ChildrenArrays(this.parentArray.getArray(), childrenArrays);
+      }
+    }
+  }

   /**
    * Open for reading a taxonomy stored in a given {@link Directory}.
    *
    * @param directory
-   *          The {@link Directory} in which the taxonomy lives. Note that
-   *          the taxonomy is read directly from that directory (not from a
-   *          subdirectory of it).
-   * @throws CorruptIndexException if the Taxonomy is corrupted.
-   * @throws IOException if another error occurred.
+   *          The {@link Directory} in which the taxonomy resides.
+   * @throws CorruptIndexException
+   *           if the Taxonomy is corrupt.
+   * @throws IOException
+   *           if another error occurred.
    */
   public DirectoryTaxonomyReader(Directory directory) throws IOException {
-    this.indexReader = openIndexReader(directory);
+    indexReader = openIndexReader(directory);
+    taxoWriter = null;
+    taxoEpoch = -1;

     // These are the default cache sizes; they can be configured after
     // construction with the cache's setMaxSize() method
-    ordinalCache = new LRUHashMap<String, Integer>(4000);
-    categoryCache = new LRUHashMap<Integer, String>(4000);
+    ordinalCache = new LRUHashMap<String, Integer>(DEFAULT_CACHE_VALUE);
+    categoryCache = new LRUHashMap<Integer, String>(DEFAULT_CACHE_VALUE);
   }

+  /**
+   * Opens a {@link DirectoryTaxonomyReader} over the given
+   * {@link DirectoryTaxonomyWriter} (for NRT).
+   *
+   * @param taxoWriter
+   *          The {@link DirectoryTaxonomyWriter} from which to obtain newly
+   *          added categories, in real-time.
+   */
+  public DirectoryTaxonomyReader(DirectoryTaxonomyWriter taxoWriter) throws IOException {
+    this.taxoWriter = taxoWriter;
+    taxoEpoch = taxoWriter.getTaxonomyEpoch();
+    indexReader = openIndexReader(taxoWriter.getInternalIndexWriter());
+
+    // These are the default cache sizes; they can be configured after
+    // construction with the cache's setMaxSize() method
+    ordinalCache = new LRUHashMap<String, Integer>(DEFAULT_CACHE_VALUE);
+    categoryCache = new LRUHashMap<Integer, String>(DEFAULT_CACHE_VALUE);
+  }

+  private String getLabel(int catID) throws IOException {
+    ensureOpen();

-    // TODO (Facet): consider lazily create parent array when asked, not in the constructor
-    parentArray = new ParentArray();
-    parentArray.refresh(indexReader);
+    // Since the cache is shared with DTR instances allocated from
+    // doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
+    // instance recognizes. Therefore we do this check up front, before we hit
+    // the cache.
+    if (catID < 0 || catID >= indexReader.maxDoc()) {
+      return null;
+    }
+
+    // TODO: can we use an int-based hash impl, such as IntToObjectMap,
+    // wrapped as LRU?
+    Integer catIDInteger = Integer.valueOf(catID);
+    synchronized (categoryCache) {
+      String res = categoryCache.get(catIDInteger);
+      if (res != null) {
+        return res;
+      }
+    }
+
+    final LoadFullPathOnly loader = new LoadFullPathOnly();
+    indexReader.document(catID, loader);
+    String ret = loader.getFullPath();
+    synchronized (categoryCache) {
+      categoryCache.put(catIDInteger, ret);
+    }
+
+    return ret;
+  }

+  @Override
+  protected void doClose() throws IOException {
+    indexReader.close();
+    parentArray = null;
+    childrenArrays = null;
+    // do not clear() the caches, as they may be used by other DTR instances.
+    ordinalCache = null;
+    categoryCache = null;
+  }

+  /**
+   * Implements the opening of a new {@link DirectoryTaxonomyReader} instance if
+   * the taxonomy has changed.
+   *
+   * <p>
+   * <b>NOTE:</b> the returned {@link DirectoryTaxonomyReader} shares the
+   * ordinal and category caches with this reader. This is not expected to cause
+   * any issues, unless the two instances continue to live. The reader
+   * guarantees that the two instances cannot affect each other in terms of
+   * correctness of the caches, however if the size of the cache is changed
+   * through {@link #setCacheSize(int)}, it will affect both reader instances.
+   */
+  @Override
+  protected DirectoryTaxonomyReader doOpenIfChanged() throws IOException {
+    ensureOpen();
+
+    final DirectoryReader r2;
+    if (taxoWriter == null) {
+      // not NRT
+      r2 = DirectoryReader.openIfChanged(indexReader);
+    } else {
+      // NRT
+      r2 = DirectoryReader.openIfChanged(indexReader, taxoWriter.getInternalIndexWriter(), false);
+    }
+    if (r2 == null) {
+      return null; // no changes, nothing to do
+    }
+
+    // check if the taxonomy was recreated
+    boolean success = false;
+    try {
+      boolean recreated = false;
+      if (taxoWriter == null) {
+        // not NRT, check epoch from commit data
+        String t1 = indexReader.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH);
+        String t2 = r2.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH);
+        if (t1 == null) {
+          if (t2 != null) {
+            recreated = true;
+          }
+        } else if (!t1.equals(t2)) {
+          // t1 != null and t2 cannot be null b/c DirTaxoWriter always puts the commit data.
+          // it's ok to use String.equals because we require the two epoch values to be the same.
+          recreated = true;
+        }
+      } else {
+        // NRT, compare current taxoWriter.epoch() vs the one that was given at construction
+        if (taxoEpoch != taxoWriter.getTaxonomyEpoch()) {
+          recreated = true;
+        }
+      }
+
+      final DirectoryTaxonomyReader newtr;
+      if (recreated) {
+        // if recreated, do not reuse anything from this instance. the information
+        // will be lazily computed by the new instance when needed.
+        newtr = new DirectoryTaxonomyReader(r2, taxoWriter, null, null, null, null);
+      } else {
+        newtr = new DirectoryTaxonomyReader(r2, taxoWriter, ordinalCache, categoryCache, parentArray, childrenArrays);
+      }
+
+      success = true;
+      return newtr;
+    } finally {
+      if (!success) {
+        IOUtils.closeWhileHandlingException(r2);
+      }
+    }
+  }

   protected DirectoryReader openIndexReader(Directory directory) throws IOException {
     return DirectoryReader.open(directory);
   }

+  protected DirectoryReader openIndexReader(IndexWriter writer) throws IOException {
+    return DirectoryReader.open(writer, false);
+  }

   /**
-   * @throws AlreadyClosedException if this TaxonomyReader is closed
+   * Expert: returns the underlying {@link DirectoryReader} instance that is
+   * used by this {@link TaxonomyReader}.
    */
-  protected final void ensureOpen() throws AlreadyClosedException {
-    if (getRefCount() <= 0) {
-      throw new AlreadyClosedException("this TaxonomyReader is closed");
-    }
-  }
+  DirectoryReader getInternalIndexReader() {
+    ensureOpen();
+    return indexReader;
+  }

+  @Override
+  public ChildrenArrays getChildrenArrays() throws IOException {
+    ensureOpen();
+    if (childrenArrays == null) {
+      synchronized (this) {
+        if (childrenArrays == null) {
+          childrenArrays = new ChildrenArrays(getParentArray());
+        }
+      }
+    }
+    return childrenArrays;
+  }

+  @Override
+  public Map<String, String> getCommitUserData() throws IOException {
+    ensureOpen();
+    return indexReader.getIndexCommit().getUserData();
+  }

+  @Override
+  public int getOrdinal(CategoryPath categoryPath) throws IOException {
+    ensureOpen();
+    if (categoryPath.length() == 0) {
+      return ROOT_ORDINAL;
+    }
+    String path = categoryPath.toString(delimiter);
+
+    // First try to find the answer in the LRU cache:
+    synchronized (ordinalCache) {
+      Integer res = ordinalCache.get(path);
+      if (res != null) {
+        if (res.intValue() < indexReader.maxDoc()) {
+          // Since the cache is shared with DTR instances allocated from
+          // doOpenIfChanged, we need to ensure that the ordinal is one that
+          // this DTR instance recognizes.
+          return res.intValue();
+        } else {
+          // if we get here, it means that the category was found in the cache,
+          // but is not recognized by this TR instance. Therefore there's no
+          // need to continue searching for the path on disk, because we won't
+          // find it there either.
+          return TaxonomyReader.INVALID_ORDINAL;
+        }
+      }
+    }
+
+    // If we're still here, we have a cache miss. We need to fetch the
+    // value from disk, and then also put it in the cache:
+    int ret = TaxonomyReader.INVALID_ORDINAL;
+    DocsEnum docs = MultiFields.getTermDocsEnum(indexReader, null, Consts.FULL, new BytesRef(path), 0);
+    if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
+      ret = docs.docID();
+
+      // we only store the fact that a category exists, not its inexistence.
+      // This is required because the caches are shared with new DTR instances
+      // that are allocated from doOpenIfChanged. Therefore, if we only store
+      // information about found categories, we cannot accidentally tell a new
+      // generation of DTR that a category does not exist.
+      synchronized (ordinalCache) {
+        ordinalCache.put(path, Integer.valueOf(ret));
+      }
+    }
+
+    return ret;
+  }

+  // TODO: move to a ParentInfo class? (see TODO for parentArray)
+  @Override
+  public int getParent(int ordinal) throws IOException {
+    ensureOpen();
+    return getParentArray()[ordinal];
+  }

+  @Override
+  public int[] getParentArray() throws IOException {
+    ensureOpen();
+    if (parentArray == null) {
+      synchronized (this) {
+        if (parentArray == null) {
+          parentArray = new ParentArray(indexReader);
+        }
+      }
+    }
+    return parentArray.getArray();
+  }

+  @Override
+  public CategoryPath getPath(int ordinal) throws IOException {
+    ensureOpen();
+    // TODO (Facet): Currently, the LRU cache we use (getCategoryCache) holds
+    // strings with delimiters, not CategoryPath objects, so even if
+    // we have a cache hit, we need to process the string and build a new
+    // CategoryPath object every time. What is preventing us from putting
+    // the actual CategoryPath object in the cache is the fact that these
+    // objects are mutable. So we should create an immutable (read-only)
+    // interface that CategoryPath implements, and this method should
+    // return this interface, not the writable CategoryPath.
+    String label = getLabel(ordinal);
+    if (label == null) {
+      return null;
+    }
+    return new CategoryPath(label, delimiter);
+  }

+  @Override
+  public boolean getPath(int ordinal, CategoryPath result) throws IOException {
+    ensureOpen();
+    String label = getLabel(ordinal);
+    if (label == null) {
+      return false;
+    }
+    result.clear();
+    result.add(label, delimiter);
+    return true;
+  }

+  @Override
+  public int getSize() {
+    ensureOpen();
+    return indexReader.numDocs();
+  }

   /**
@@ -151,10 +397,10 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
    */
   public void setCacheSize(int size) {
     ensureOpen();
-    synchronized(categoryCache) {
+    synchronized (categoryCache) {
       categoryCache.setMaxSize(size);
     }
-    synchronized(ordinalCache) {
+    synchronized (ordinalCache) {
       ordinalCache.setMaxSize(size);
     }
   }
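Since one call resizes both caches, a caller that wants every category cached can size them to the taxonomy (editor's sketch; `taxoDir` is a placeholder):

    DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
    taxoReader.setCacheSize(taxoReader.getSize()); // room for all ordinals and labels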
|
||||
|
@ -173,361 +419,11 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
|
|||
ensureOpen();
|
||||
this.delimiter = delimiter;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getOrdinal(CategoryPath categoryPath) throws IOException {
|
||||
ensureOpen();
|
||||
if (categoryPath.length()==0) {
|
||||
return ROOT_ORDINAL;
|
||||
}
|
||||
String path = categoryPath.toString(delimiter);
|
||||
|
||||
// First try to find the answer in the LRU cache:
|
||||
synchronized(ordinalCache) {
|
||||
Integer res = ordinalCache.get(path);
|
||||
if (res!=null) {
|
||||
return res.intValue();
|
||||
}
|
||||
}
|
||||
|
||||
// If we're still here, we have a cache miss. We need to fetch the
|
||||
// value from disk, and then also put it in the cache:
|
||||
int ret = TaxonomyReader.INVALID_ORDINAL;
|
||||
try {
|
||||
indexReaderLock.readLock().lock();
|
||||
// TODO (Facet): avoid Multi*?
|
||||
Bits liveDocs = MultiFields.getLiveDocs(indexReader);
|
||||
DocsEnum docs = MultiFields.getTermDocsEnum(indexReader, liveDocs, Consts.FULL, new BytesRef(path), 0);
|
||||
if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
ret = docs.docID();
|
||||
}
|
||||
} finally {
|
||||
indexReaderLock.readLock().unlock();
|
||||
}
|
||||
|
||||
// Put the new value in the cache. Note that it is possible that while
|
||||
// we were doing the above fetching (without the cache locked), some
|
||||
// other thread already added the same category to the cache. We do
|
||||
// not care about this possibilty, as LRUCache replaces previous values
|
||||
// of the same keys (it doesn't store duplicates).
|
||||
synchronized(ordinalCache) {
|
||||
// GB: new Integer(int); creates a new object each and every time.
|
||||
// Integer.valueOf(int) might not (See JavaDoc).
|
||||
ordinalCache.put(path, Integer.valueOf(ret));
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CategoryPath getPath(int ordinal) throws IOException {
|
||||
ensureOpen();
|
||||
// TODO (Facet): Currently, the LRU cache we use (getCategoryCache) holds
|
||||
// strings with delimiters, not CategoryPath objects, so even if
|
||||
// we have a cache hit, we need to process the string and build a new
|
||||
// CategoryPath object every time. What is preventing us from putting
|
||||
// the actual CategoryPath object in the cache is the fact that these
|
||||
// objects are mutable. So we should create an immutable (read-only)
|
||||
// interface that CategoryPath implements, and this method should
|
||||
// return this interface, not the writable CategoryPath.
|
||||
String label = getLabel(ordinal);
|
||||
if (label==null) {
|
||||
return null;
|
||||
}
|
||||
return new CategoryPath(label, delimiter);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean getPath(int ordinal, CategoryPath result) throws IOException {
|
||||
ensureOpen();
|
||||
String label = getLabel(ordinal);
|
||||
if (label==null) {
|
||||
return false;
|
||||
}
|
||||
result.clear();
|
||||
result.add(label, delimiter);
|
||||
return true;
|
||||
}
|
||||
|
||||
private String getLabel(int catID) throws IOException {
|
||||
ensureOpen();
|
||||
// First try to find the answer in the LRU cache. It is very
|
||||
// unfortunate that we need to allocate an Integer object here -
|
||||
// it would have been better if we used a hash table specifically
|
||||
// designed for int keys...
|
||||
// GB: new Integer(int); creates a new object each and every time.
|
||||
// Integer.valueOf(int) might not (See JavaDoc).
|
||||
Integer catIDInteger = Integer.valueOf(catID);
|
||||
|
||||
synchronized(categoryCache) {
|
||||
String res = categoryCache.get(catIDInteger);
|
||||
if (res!=null) {
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
// If we're still here, we have a cache miss. We need to fetch the
|
||||
// value from disk, and then also put it in the cache:
|
||||
String ret;
|
||||
try {
|
||||
indexReaderLock.readLock().lock();
|
||||
// The taxonomy API dictates that if we get an invalid category
|
||||
// ID, we should return null, If we don't check this here, we
|
||||
// can some sort of an exception from the document() call below.
|
||||
// NOTE: Currently, we *do not* cache this return value; There
|
||||
// isn't much point to do so, because checking the validity of
|
||||
// the docid doesn't require disk access - just comparing with
|
||||
// the number indexReader.maxDoc().
|
||||
if (catID<0 || catID>=indexReader.maxDoc()) {
|
||||
return null;
|
||||
}
|
||||
final LoadFullPathOnly loader = new LoadFullPathOnly();
|
||||
indexReader.document(catID, loader);
|
||||
ret = loader.getFullPath();
|
||||
} finally {
|
||||
indexReaderLock.readLock().unlock();
|
||||
}
|
||||
// Put the new value in the cache. Note that it is possible that while
|
||||
// we were doing the above fetching (without the cache locked), some
|
||||
// other thread already added the same category to the cache. We do
|
||||
// not care about this possibility, as LRUCache replaces previous
|
||||
// values of the same keys (it doesn't store duplicates).
|
||||
synchronized (categoryCache) {
|
||||
categoryCache.put(catIDInteger, ret);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getParent(int ordinal) {
|
||||
ensureOpen();
|
||||
// Note how we don't need to hold the read lock to do the following,
|
||||
// because the array reference is volatile, ensuring the correct
|
||||
// visibility and ordering: if we get the new reference, the new
|
||||
// data is also visible to this thread.
|
||||
return getParentArray()[ordinal];
|
||||
}
|
||||
|
||||
/**
|
||||
* getParentArray() returns an int array of size getSize() listing the
|
||||
* ordinal of the parent category of each category in the taxonomy.
|
||||
* <P>
|
||||
* The caller can hold on to the array it got indefinitely - it is
|
||||
* guaranteed that no-one else will modify it. The other side of the
|
||||
* same coin is that the caller must treat the array it got as read-only
|
||||
* and <B>not modify it</B>, because other callers might have gotten the
|
||||
* same array too, and getParent() calls are also answered from the
|
||||
* same array.
|
||||
* <P>
|
||||
* The getParentArray() call is extremely efficient, merely returning
|
||||
* a reference to an array that already exists. For a caller that plans
|
||||
* to call getParent() for many categories, using getParentArray() and
|
||||
* the array it returns is a somewhat faster approach because it avoids
|
||||
* the overhead of method calls and volatile dereferencing.
|
||||
* <P>
|
||||
* If you use getParentArray() instead of getParent(), remember that
|
||||
* the array you got is (naturally) not modified after a refresh(),
|
||||
* so you should always call getParentArray() again after a refresh().
|
||||
*/
|
||||
|
||||
@Override
|
||||
public int[] getParentArray() {
|
||||
ensureOpen();
|
||||
// Note how we don't need to hold the read lock to do the following,
|
||||
// because the array reference is volatile, ensuring the correct
|
||||
// visibility and ordering: if we get the new reference, the new
|
||||
// data is also visible to this thread.
|
||||
return parentArray.getArray();
|
||||
}
|
||||
|
||||
// Note that refresh() is synchronized (it is the only synchronized
|
||||
// method in this class) to ensure that it never gets called concurrently
|
||||
// with itself.
|
||||
@Override
|
||||
public synchronized boolean refresh() throws IOException, InconsistentTaxonomyException {
|
||||
ensureOpen();
|
||||
/*
|
||||
* Since refresh() can be a lengthy operation, it is very important that we
|
||||
* avoid locking out all readers for its duration. This is why we don't hold
|
||||
* the indexReaderLock write lock for the entire duration of this method. In
|
||||
* fact, it is enough to hold it only during a single assignment! Other
|
||||
* comments in this method will explain this.
|
||||
*/
|
||||
|
||||
// note that the lengthy operation indexReader.reopen() does not
|
||||
// modify the reader, so we can do it without holding a lock. We can
|
||||
// safely read indexReader without holding the write lock, because
|
||||
// no other thread can be writing at this time (this method is the
|
||||
// only possible writer, and it is "synchronized" to avoid this case).
|
||||
DirectoryReader r2 = DirectoryReader.openIfChanged(indexReader);
|
||||
if (r2 == null) {
|
||||
return false; // no changes, nothing to do
|
||||
}
|
||||
|
||||
// validate that a refresh is valid at this point, i.e. that the taxonomy
|
||||
// was not recreated since this reader was last opened or refresshed.
|
||||
String t1 = indexReader.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH);
|
||||
String t2 = r2.getIndexCommit().getUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH);
|
||||
if (t1 == null) {
|
||||
if (t2 != null) {
|
||||
r2.close();
|
||||
throw new InconsistentTaxonomyException("Taxonomy was recreated, epoch= " + t2);
|
||||
}
|
||||
} else if (!t1.equals(t2)) {
|
||||
// t1 != null and t2 cannot be null b/c DirTaxoWriter always puts the commit data.
|
||||
// it's ok to use String.equals because we require the two epoch values to be the same.
|
||||
r2.close();
|
||||
throw new InconsistentTaxonomyException("Taxonomy was recreated epoch = " + t2 + " != " + t1);
|
||||
}
|
||||
|
||||
IndexReader oldreader = indexReader;
|
||||
// we can close the old searcher, but need to synchronize this
|
||||
// so that we don't close it in the middle that another routine
|
||||
// is reading from it.
|
||||
indexReaderLock.writeLock().lock();
indexReader = r2;
indexReaderLock.writeLock().unlock();
// We can close the old reader, but need to be certain that we
// don't close it while another method is reading from it.
// Luckily, we can be certain of that even without putting the
// oldreader.close() in the locked section. The reason is that
// after lock() succeeded above, we know that all existing readers
// had finished (this is what a read-write lock ensures). New
// readers, starting after the unlock() we just did, already got
// the new indexReader we set above. So nobody can possibly be
// using the old indexReader, and we can close it:
oldreader.close();

// We prefetch some of the arrays to make requests much faster.
// Let's refresh these prefetched arrays; this refresh is made
// more efficient by assuming that it is enough to read the values
// for new categories (old categories could not have been changed
// or deleted).
// Note that this is done without the write lock being held,
// which means that it is possible that during a refresh(), a
// reader will have some methods (like getOrdinal and getCategory)
// return fresh information, while getParent()
// (only to be prefetched now) still returns older information.
// We consider this to be acceptable. The important thing,
// however, is that refreshPrefetchArrays() itself writes to
// the arrays in a correct manner (see discussion there).
parentArray.refresh(indexReader);

// Remove any INVALID_ORDINAL values from the ordinal cache,
// because it is possible those are now answered by the new data!
Iterator<Entry<String, Integer>> i = ordinalCache.entrySet().iterator();
while (i.hasNext()) {
Entry<String, Integer> e = i.next();
if (e.getValue().intValue() == INVALID_ORDINAL) {
i.remove();
}
}
return true;
}

@Override
public void close() throws IOException {
if (!closed) {
synchronized (this) {
if (!closed) {
decRef();
closed = true;
}
}
}
}

/** Do the actual closing, free up resources */
private void doClose() throws IOException {
indexReader.close();
closed = true;

parentArray = null;
childrenArrays = null;
categoryCache.clear();
ordinalCache.clear();
}

@Override
public int getSize() {
ensureOpen();
indexReaderLock.readLock().lock();
try {
return indexReader.numDocs();
} finally {
indexReaderLock.readLock().unlock();
}
}

@Override
public Map<String, String> getCommitUserData() throws IOException {
ensureOpen();
return indexReader.getIndexCommit().getUserData();
}

private ChildrenArrays childrenArrays;
Object childrenArraysRebuild = new Object();

@Override
public ChildrenArrays getChildrenArrays() {
ensureOpen();
// Check if the taxonomy grew since we built the array, and if it
// did, create new (and larger) arrays and fill them as required.
// We do all this under a lock, to prevent two concurrent calls from
// needlessly doing the same array building at the same time.
synchronized(childrenArraysRebuild) {
int num = getSize();
int first;
if (childrenArrays==null) {
first = 0;
} else {
first = childrenArrays.getYoungestChildArray().length;
}
// If the taxonomy hasn't grown, we can return the existing object
// immediately
if (first == num) {
return childrenArrays;
}
// Otherwise, build new arrays for a new ChildrenArrays object.
// These arrays start with an enlarged copy of the previous arrays,
// and then are modified to take into account the new categories:
int[] newYoungestChildArray = new int[num];
int[] newOlderSiblingArray = new int[num];
// In Java 6, we could just do Arrays.copyOf()...
if (childrenArrays!=null) {
System.arraycopy(childrenArrays.getYoungestChildArray(), 0,
newYoungestChildArray, 0, childrenArrays.getYoungestChildArray().length);
System.arraycopy(childrenArrays.getOlderSiblingArray(), 0,
newOlderSiblingArray, 0, childrenArrays.getOlderSiblingArray().length);
}
int[] parents = getParentArray();
for (int i=first; i<num; i++) {
newYoungestChildArray[i] = INVALID_ORDINAL;
}
// In the loop below we can ignore the root category (0) because
// it has no parent
if (first==0) {
first = 1;
newOlderSiblingArray[0] = INVALID_ORDINAL;
}
for (int i=first; i<num; i++) {
// Note that parents[i] is always < i, so the right-hand-side of
// the following line is already set when we get here.
newOlderSiblingArray[i] = newYoungestChildArray[parents[i]];
newYoungestChildArray[parents[i]] = i;
}
// Finally switch to the new arrays
childrenArrays = new ChildrenArraysImpl(newYoungestChildArray,
newOlderSiblingArray);
return childrenArrays;
}
}
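
// Editorial sketch, not part of this commit: how the two arrays built above
// are meant to be traversed. The youngest-child array maps an ordinal to its
// most recently added child, and the older-sibling array links each child to
// the previously added one, so the direct children of a (hypothetical, valid)
// "parentOrdinal" form a linked list, newest first.
private int countChildren(int parentOrdinal) {
  ChildrenArrays ca = getChildrenArrays();
  int[] youngest = ca.getYoungestChildArray();
  int[] olderSibling = ca.getOlderSiblingArray();
  int count = 0;
  for (int child = youngest[parentOrdinal]; child != INVALID_ORDINAL; child = olderSibling[child]) {
    count++; // each "child" is a direct child of parentOrdinal
  }
  return count;
}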

public String toString(int max) {
ensureOpen();
StringBuilder sb = new StringBuilder();
int upperl = Math.min(max, this.indexReader.maxDoc());
int upperl = Math.min(max, indexReader.maxDoc());
for (int i = 0; i < upperl; i++) {
try {
CategoryPath category = this.getPath(i);

@@ -548,75 +444,5 @@ public class DirectoryTaxonomyReader implements TaxonomyReader {
}
return sb.toString();
}

private static final class ChildrenArraysImpl implements ChildrenArrays {
private int[] youngestChildArray, olderSiblingArray;
public ChildrenArraysImpl(int[] youngestChildArray, int[] olderSiblingArray) {
this.youngestChildArray = youngestChildArray;
this.olderSiblingArray = olderSiblingArray;
}
@Override
public int[] getOlderSiblingArray() {
return olderSiblingArray;
}
@Override
public int[] getYoungestChildArray() {
return youngestChildArray;
}
}

/**
 * Expert: This method is only for expert use.
 * Note also that any call to refresh() will invalidate the returned reader,
 * so the caller needs to take care of appropriate locking.
 *
 * @return lucene indexReader
 */
DirectoryReader getInternalIndexReader() {
ensureOpen();
return this.indexReader;
}

/**
 * Expert: decreases the refCount of this TaxonomyReader instance. If the
 * refCount drops to 0, then this reader is closed.
 */
@Override
public void decRef() throws IOException {
ensureOpen();
final int rc = refCount.decrementAndGet();
if (rc == 0) {
boolean success = false;
try {
doClose();
success = true;
} finally {
if (!success) {
// Put reference back on failure
refCount.incrementAndGet();
}
}
} else if (rc < 0) {
throw new IllegalStateException("too many decRef calls: refCount is " + rc + " after decrement");
}
}

/** Expert: returns the current refCount for this taxonomy reader */
@Override
public int getRefCount() {
return refCount.get();
}

/**
 * Expert: increments the refCount of this TaxonomyReader instance.
 * RefCounts are used to determine when a taxonomy reader can be closed
 * safely, i.e. as soon as there are no more references.
 * Be sure to always call a corresponding decRef(), in a finally clause;
 * otherwise the reader may never be closed.
 */
@Override
public void incRef() {
ensureOpen();
refCount.incrementAndGet();
}
}
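
For reference, the reopen pattern that replaces refresh() in this commit looks
roughly like this (an illustrative sketch, not code from the commit; "dir" and
the variable names are hypothetical):

    TaxonomyReader tr = new DirectoryTaxonomyReader(dir);
    // ... later, after the taxonomy may have changed:
    TaxonomyReader newtr = TaxonomyReader.openIfChanged(tr);
    if (newtr != null) { // null means nothing was changed
      tr.close();
      tr = newtr; // unlike refresh(), this also handles a recreated taxonomy
    }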

@@ -294,6 +294,9 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
 * @param openMode see {@link OpenMode}
 */
protected IndexWriterConfig createIndexWriterConfig(OpenMode openMode) {
// TODO: should we use a more optimized Codec, e.g. Pulsing (or write custom)?
// The taxonomy has a unique structure, where each term is associated with one document

// Make sure we use a MergePolicy which always merges adjacent segments and thus
// keeps the doc IDs ordered as well (this is crucial for the taxonomy index).
return new IndexWriterConfig(Version.LUCENE_50,

@@ -583,7 +586,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
addToCache(categoryPath, length, id);

// also add to the parent array
getParentArray().add(id, parent);
parentArray = getParentArray().add(id, parent);

return id;
}

@@ -811,10 +814,9 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
synchronized (this) {
if (parentArray == null) {
initReaderManager();
parentArray = new ParentArray();
DirectoryReader reader = readerManager.acquire();
try {
parentArray.refresh(reader);
parentArray = new ParentArray(reader);
} finally {
readerManager.release(reader);
}

@@ -1035,5 +1037,21 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
public Directory getDirectory() {
return dir;
}


/**
 * Used by {@link DirectoryTaxonomyReader} to support NRT.
 * <p>
 * <b>NOTE:</b> you should not use the obtained {@link IndexWriter} in any
 * way other than opening an IndexReader on it; otherwise, the taxonomy
 * index may become corrupt!
 */
final IndexWriter getInternalIndexWriter() {
return indexWriter;
}

/** Used by {@link DirectoryTaxonomyReader} to support NRT. */
final long getTaxonomyEpoch() {
return indexEpoch;
}

}
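
The two package-private hooks above are what enable the writer-based NRT
reader. A minimal sketch of the new usage (illustrative only; it mirrors the
testNRT case further down, and the variable names are hypothetical):

    DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
    TaxonomyReader reader = new DirectoryTaxonomyReader(writer); // NRT reader
    writer.addCategory(new CategoryPath("a")); // no commit() needed
    TaxonomyReader newReader = TaxonomyReader.openIfChanged(reader);
    // newReader is non-null and already sees the uncommitted category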

@@ -2,15 +2,14 @@ package org.apache.lucene.facet.taxonomy.directory;

import java.io.IOException;

import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;

import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.util.ArrayUtil;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more

@@ -29,55 +28,23 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
 * limitations under the License.
 */

// getParent() needs to be extremely efficient, to the point that we need
// to fetch all the data in advance into memory, and answer these calls
// from memory. Currently we use a large integer array, which is
// initialized when the taxonomy is opened, and potentially enlarged
// when it is refresh()ed.
/**
 * @lucene.experimental
 */
class ParentArray {

// These arrays are not synchronized. Rather, the reference to the array
// is volatile, and the only writing operation (refreshPrefetchArrays)
// simply creates a new array and replaces the reference. The volatility
// of the reference ensures the correct atomic replacement and its
// visibility properties (the content of the array is visible when the
// new reference is visible).
private volatile int prefetchParentOrdinal[] = null;
// TODO: maybe use PackedInts?
private final int[] parentOrdinals;

public int[] getArray() {
return prefetchParentOrdinal;
/** Used by {@link #add(int, int)} when the array needs to grow. */
ParentArray(int[] parentOrdinals) {
this.parentOrdinals = parentOrdinals;
}

/**
 * refreshPrefetch() refreshes the parent array. Initially, it fills the
 * array from the positions of an appropriate posting list. If called during
 * a refresh(), when the arrays already exist, only values for new documents
 * (those beyond the last one in the array) are read from the positions and
 * added to the arrays (that are appropriately enlarged). We assume (and
 * this is indeed a correct assumption in our case) that existing categories
 * are never modified or deleted.
 */
void refresh(IndexReader indexReader) throws IOException {
// Note that it is not necessary for us to obtain the read lock.
// The reason is that we are only called from refresh() (precluding
// another concurrent writer) or from the constructor (when no method
// could be running).
// The write lock is also not held during the following code, meaning
// that reads *can* happen while this code is running. The "volatile"
// property of the prefetchParentOrdinal and prefetchDepth array
// references ensure the correct visibility property of the assignment
// but other than that, we do *not* guarantee that a reader will not
// use an old version of one of these arrays (or both) while a refresh
// is going on. But we find this acceptable - until a refresh has
// finished, the reader should not expect to see new information
// (and the old information is the same in the old and new versions).
int first;
int num = indexReader.maxDoc();
if (prefetchParentOrdinal==null) {
prefetchParentOrdinal = new int[num];
public ParentArray(IndexReader reader) throws IOException {
parentOrdinals = new int[reader.maxDoc()];
if (parentOrdinals.length > 0) {
initFromReader(reader, 0);
// Starting Lucene 2.9, following the change LUCENE-1542, we can
// no longer reliably read the parent "-1" (see comment in
// LuceneTaxonomyWriter.SinglePositionTokenStream). We have no way

@@ -85,78 +52,88 @@ class ParentArray {
// with existing indexes, so what we'll do instead is just
// hard-code the parent of ordinal 0 to be -1, and assume (as is
// indeed the case) that no other parent can be -1.
if (num>0) {
prefetchParentOrdinal[0] = TaxonomyReader.INVALID_ORDINAL;
}
first = 1;
} else {
first = prefetchParentOrdinal.length;
if (first==num) {
return; // nothing to do - no category was added
}
// In Java 6, we could just do Arrays.copyOf()...
int[] newarray = new int[num];
System.arraycopy(prefetchParentOrdinal, 0, newarray, 0,
prefetchParentOrdinal.length);
prefetchParentOrdinal = newarray;
}

// Read the new part of the parents array from the positions:
// TODO (Facet): avoid Multi*?
Bits liveDocs = MultiFields.getLiveDocs(indexReader);
DocsAndPositionsEnum positions = MultiFields.getTermPositionsEnum(indexReader, liveDocs,
Consts.FIELD_PAYLOADS, new BytesRef(Consts.PAYLOAD_PARENT),
DocsAndPositionsEnum.FLAG_PAYLOADS);
if ((positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) && first < num) {
throw new CorruptIndexException("Missing parent data for category " + first);
}
for (int i=first; i<num; i++) {
// Note that we know positions.doc() >= i (this is an
// invariant kept throughout this loop)
if (positions.docID()==i) {
if (positions.freq() == 0) { // shouldn't happen
throw new CorruptIndexException(
"Missing parent data for category "+i);
}

// TODO (Facet): keep a local (non-volatile) copy of the prefetchParentOrdinal
// reference, because access to volatile reference is slower (?).
// Note: The positions we get here are one less than the position
// increment we added originally, so we get here the right numbers:
prefetchParentOrdinal[i] = positions.nextPosition();

if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
if ( i+1 < num ) {
throw new CorruptIndexException(
"Missing parent data for category "+(i+1));
}
break;
}
} else { // this shouldn't happen
throw new CorruptIndexException(
"Missing parent data for category "+i);
}
parentOrdinals[0] = TaxonomyReader.INVALID_ORDINAL;
}
}

public ParentArray(IndexReader reader, ParentArray copyFrom) throws IOException {
assert copyFrom != null;
int[] copyParents = copyFrom.getArray();
assert copyParents.length < reader.maxDoc() : "do not init a new ParentArray if the index hasn't changed";

this.parentOrdinals = new int[reader.maxDoc()];
System.arraycopy(copyParents, 0, parentOrdinals, 0, copyParents.length);
initFromReader(reader, copyParents.length);
}

/**
 * add() is used in LuceneTaxonomyWriter, not in LuceneTaxonomyReader.
 * It is only called from a synchronized method, so it is not reentrant,
 * and also doesn't need to worry about reads happening at the same time.
 *
 * NOTE: add() and refresh() CANNOT be used together. If you call add(),
 * this changes the arrays and refresh() can no longer be used.
 */
void add(int ordinal, int parentOrdinal) {
if (ordinal >= prefetchParentOrdinal.length) {
// grow the array, if necessary.
// In Java 6, we could just do Arrays.copyOf()...
int[] newarray = new int[ordinal*2+1];
System.arraycopy(prefetchParentOrdinal, 0, newarray, 0,
prefetchParentOrdinal.length);
prefetchParentOrdinal = newarray;
// Read the parents of the new categories
private void initFromReader(IndexReader reader, int first) throws IOException {
if (reader.maxDoc() == first) {
return;
}
prefetchParentOrdinal[ordinal] = parentOrdinal;

TermsEnum termsEnum = null;
DocsAndPositionsEnum positions = null;
int idx = 0;
for (AtomicReaderContext context : reader.leaves()) {
if (context.docBase < first) {
continue;
}

// in general we could call context.reader().termPositionsEnum(), but that
// passes the liveDocs. Since we know there are no deletions, the code
// below may save some CPU cycles.
termsEnum = context.reader().fields().terms(Consts.FIELD_PAYLOADS).iterator(termsEnum);
if (!termsEnum.seekExact(Consts.PAYLOAD_PARENT_BYTES_REF, true)) {
throw new CorruptIndexException("Missing parent stream data for segment " + context.reader());
}
positions = termsEnum.docsAndPositions(null /* no deletes in taxonomy */, positions);
if (positions == null) {
throw new CorruptIndexException("Missing parent stream data for segment " + context.reader());
}

idx = context.docBase;
int doc;
while ((doc = positions.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
doc += context.docBase;
if (doc == idx) {
if (positions.freq() == 0) { // shouldn't happen
throw new CorruptIndexException("Missing parent data for category " + idx);
}

parentOrdinals[idx++] = positions.nextPosition();
} else { // this shouldn't happen
throw new CorruptIndexException("Missing parent data for category " + idx);
}
}
if (idx + 1 < context.reader().maxDoc()) {
throw new CorruptIndexException("Missing parent data for category " + (idx + 1));
}
}

if (idx != reader.maxDoc()) {
throw new CorruptIndexException("Missing parent data for category " + idx);
}
}

public int[] getArray() {
return parentOrdinals;
}

/**
 * Adds the given ordinal/parent info and returns either a new instance if the
 * underlying array had to grow, or this instance otherwise.
 * <p>
 * <b>NOTE:</b> you should call this method from thread-safe code.
 */
ParentArray add(int ordinal, int parentOrdinal) {
if (ordinal >= parentOrdinals.length) {
int[] newarray = ArrayUtil.grow(parentOrdinals);
newarray[ordinal] = parentOrdinal;
return new ParentArray(newarray);
}
parentOrdinals[ordinal] = parentOrdinal;
return this;
}

}
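
Since add() returns a new instance whenever the underlying array has to grow,
a caller must always keep the returned reference; a one-line sketch of the
writer-side contract (illustrative, mirroring the DirectoryTaxonomyWriter
change above):

    parentArray = parentArray.add(ordinal, parentOrdinal); // never discard the result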

@@ -102,4 +102,10 @@ public class LRUHashMap<K,V> extends LinkedHashMap<K,V> {
return size() > maxSize;
}

@SuppressWarnings("unchecked")
@Override
public LRUHashMap<K,V> clone() {
return (LRUHashMap<K,V>) super.clone();
}

}
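
A usage sketch for the covariant clone() above (illustrative; "cache" is a
hypothetical instance): the returned copy is shallow, so it shares the stored
keys and values but can be read independently of later evictions in the
original map.

    LRUHashMap<String, Integer> snapshot = cache.clone();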

@@ -131,7 +131,7 @@ public class FacetTestUtils {

public static class IndexTaxonomyReaderPair {
public DirectoryReader indexReader;
public TaxonomyReader taxReader;
public DirectoryTaxonomyReader taxReader;
public IndexSearcher indexSearcher;

public void close() throws IOException {

@@ -78,11 +78,9 @@ public class TestTotalFacetCounts extends LuceneTestCase {
TestTotalFacetCountsCache.addFacets(iParams, writers[0].indexWriter, writers[0].taxWriter, "b", "c");

// Commit Changes
writers[0].commit();
writers[0].close();

IndexTaxonomyReaderPair[] readers =
FacetTestUtils.createIndexTaxonomyReaderPair(dirs);
IndexTaxonomyReaderPair[] readers = FacetTestUtils.createIndexTaxonomyReaderPair(dirs);

int[] intArray = new int[iParams.getPartitionSize()];


@@ -93,8 +91,7 @@ public class TestTotalFacetCounts extends LuceneTestCase {
tfcc.load(tmpFile, readers[0].indexReader, readers[0].taxReader, iParams);

// now retrieve the one just loaded
TotalFacetCounts totalCounts =
tfcc.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null);
TotalFacetCounts totalCounts = tfcc.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null);

int partition = 0;
for (int i=0; i<expectedCounts.length; i+=partitionSize) {

@@ -297,23 +297,17 @@ public class TestTotalFacetCountsCache extends LuceneTestCase {
writers[0].indexWriter.close();
writers[0].taxWriter.close();

readers[0].taxReader.refresh();
DirectoryTaxonomyReader newTaxoReader = TaxonomyReader.openIfChanged(readers[0].taxReader);
assertNotNull(newTaxoReader);
assertTrue("should have received more cagtegories in updated taxonomy", newTaxoReader.getSize() > readers[0].taxReader.getSize());
|
||||
readers[0].taxReader.close();
readers[0].taxReader = newTaxoReader;

DirectoryReader r2 = DirectoryReader.openIfChanged(readers[0].indexReader);
assertNotNull(r2);
// Hold on to the 'original' reader so we can do some checks with it
IndexReader origReader = null;

assertTrue("Reader must be updated!", readers[0].indexReader != r2);

// Set the 'original' reader
origReader = readers[0].indexReader;
// Set the new master index Reader
readers[0].indexReader.close();
readers[0].indexReader = r2;

// Try to get total-counts with the originalReader AGAIN, just for sanity. Should pull from the cache - not recomputed.
assertTrue("Should be obtained from cache at 6th attempt",totalCounts ==
TFC.getTotalCounts(origReader, readers[0].taxReader, iParams, null));

// now use the new reader - should recompute
totalCounts = TFC.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null);
prevGen = assertRecomputed(totalCounts, prevGen, "after updating the index - 7th attempt!");

@@ -322,9 +316,7 @@ public class TestTotalFacetCountsCache extends LuceneTestCase {
assertTrue("Should be obtained from cache at 8th attempt",totalCounts ==
|
||||
TFC.getTotalCounts(readers[0].indexReader, readers[0].taxReader, iParams, null));
|
||||
|
||||
origReader.close();
|
||||
readers[0].close();
|
||||
r2.close();
|
||||
outputFile.delete();
|
||||
IOUtils.close(dirs[0]);
|
||||
}
|
||||
|

@@ -380,7 +372,10 @@ public class TestTotalFacetCountsCache extends LuceneTestCase {
writers[0].taxWriter.addCategory(new CategoryPath("foo", Integer.toString(i)));
}
writers[0].taxWriter.commit();
readers[0].taxReader.refresh();
DirectoryTaxonomyReader newTaxoReader = TaxonomyReader.openIfChanged(readers[0].taxReader);
assertNotNull(newTaxoReader);
readers[0].taxReader.close();
readers[0].taxReader = newTaxoReader;

initCache();


@@ -5,18 +5,17 @@ import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.junit.Ignore;
import org.junit.Test;

import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.facet.taxonomy.TaxonomyReader.ChildrenArrays;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
import org.apache.lucene.util.SlowRAMDirectory;
import org.junit.Test;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more

@@ -35,6 +34,8 @@ import org.apache.lucene.util.SlowRAMDirectory;
 * limitations under the License.
 */

// TODO: remove this suppress after we fix the TaxoWriter Codec to a non-default (see todo in DirTW)
@SuppressCodecs("SimpleText")
public class TestTaxonomyCombined extends LuceneTestCase {

/** The following categories will be added to the taxonomy by

@@ -725,7 +726,10 @@ public class TestTaxonomyCombined extends LuceneTestCase {
assertEquals(3, ca.getOlderSiblingArray().length);
assertEquals(3, ca.getYoungestChildArray().length);
// After the refresh, things change:
tr.refresh();
TaxonomyReader newtr = TaxonomyReader.openIfChanged(tr);
assertNotNull(newtr);
tr.close();
tr = newtr;
ca = tr.getChildrenArrays();
assertEquals(5, tr.getSize());
assertEquals(5, ca.getOlderSiblingArray().length);

@@ -737,14 +741,11 @@ public class TestTaxonomyCombined extends LuceneTestCase {
indexDir.close();
}

/**
 * Test that getParentArrays is valid when retrieved during refresh
 */
// Test that getParentArrays is valid when retrieved during refresh
@Test
@Ignore
public void testTaxonomyReaderRefreshRaces() throws Exception {
// compute base child arrays - after first chunk, and after the other
Directory indexDirBase = newDirectory();
TaxonomyWriter twBase = new DirectoryTaxonomyWriter(indexDirBase);
twBase.addCategory(new CategoryPath("a", "0"));
final CategoryPath abPath = new CategoryPath("a", "b");

@@ -757,56 +758,64 @@ public class TestTaxonomyCombined extends LuceneTestCase {
final int abOrd = trBase.getOrdinal(abPath);
final int abYoungChildBase1 = ca1.getYoungestChildArray()[abOrd];

for (int i=0; i < 1<<10; i++) { //1024 facets
final int numCategories = atLeast(800);
for (int i = 0; i < numCategories; i++) {
twBase.addCategory(new CategoryPath("a", "b", Integer.toString(i)));
}
twBase.commit();
twBase.close();

trBase.refresh();
TaxonomyReader newTaxoReader = TaxonomyReader.openIfChanged(trBase);
assertNotNull(newTaxoReader);
trBase.close();
trBase = newTaxoReader;

final ChildrenArrays ca2 = trBase.getChildrenArrays();
final int abYoungChildBase2 = ca2.getYoungestChildArray()[abOrd];

for (int retry=0; retry<100; retry++) {
assertConsistentYoungestChild(abPath, abOrd, abYoungChildBase1, abYoungChildBase2, retry);
int numRetries = atLeast(50);
for (int retry = 0; retry < numRetries; retry++) {
assertConsistentYoungestChild(abPath, abOrd, abYoungChildBase1, abYoungChildBase2, retry, numCategories);
}

trBase.close();
indexDirBase.close();
}

private void assertConsistentYoungestChild(final CategoryPath abPath,
final int abOrd, final int abYoungChildBase1, final int abYoungChildBase2, final int retry)
final int abOrd, final int abYoungChildBase1, final int abYoungChildBase2, final int retry, int numCategories)
throws Exception {
SlowRAMDirectory indexDir = new SlowRAMDirectory(-1,null); // no slowness for initialization
SlowRAMDirectory indexDir = new SlowRAMDirectory(-1, null); // no slowness for initialization
TaxonomyWriter tw = new DirectoryTaxonomyWriter(indexDir);
tw.addCategory(new CategoryPath("a", "0"));
tw.addCategory(abPath);
tw.commit();

final TaxonomyReader tr = new DirectoryTaxonomyReader(indexDir);
for (int i=0; i < 1<<10; i++) { //1024 facets
final DirectoryTaxonomyReader tr = new DirectoryTaxonomyReader(indexDir);
for (int i = 0; i < numCategories; i++) {
final CategoryPath cp = new CategoryPath("a", "b", Integer.toString(i));
tw.addCategory(cp);
assertEquals("Ordinal of "+cp+" must be invalid until Taxonomy Reader was refreshed", TaxonomyReader.INVALID_ORDINAL, tr.getOrdinal(cp));
}
tw.commit();
tw.close();

final boolean[] stop = new boolean[] { false };
final AtomicBoolean stop = new AtomicBoolean(false);
final Throwable[] error = new Throwable[] { null };
final int retrieval[] = { 0 };

Thread thread = new Thread("Child Arrays Verifier") {
@Override
public void run() {
setPriority(1+getPriority());
setPriority(1 + getPriority());
try {
while (!stop[0]) {
int lastOrd = tr.getParentArray().length-1;
assertNotNull("path of last-ord "+lastOrd+" is not found!",tr.getPath(lastOrd));
assertChildrenArrays(tr.getChildrenArrays(),retry,retrieval[0]++);
while (!stop.get()) {
int lastOrd = tr.getParentArray().length - 1;
assertNotNull("path of last-ord " + lastOrd + " is not found!", tr.getPath(lastOrd));
assertChildrenArrays(tr.getChildrenArrays(), retry, retrieval[0]++);
sleep(10); // don't starve refresh()'s CPU, which sleeps every 50 bytes for 1 ms
}
} catch (Throwable e) {
error[0] = e;
stop[0] = true;
stop.set(true);
}
}


@@ -822,13 +831,15 @@ public class TestTaxonomyCombined extends LuceneTestCase {
thread.start();

indexDir.setSleepMillis(1); // some delay for refresh
tr.refresh();
TaxonomyReader newTaxoReader = TaxonomyReader.openIfChanged(tr);
if (newTaxoReader != null) {
newTaxoReader.close();
}

stop[0] = true;
stop.set(true);
thread.join();
assertNull("Unexpected exception at retry "+retry+" retrieval "+retrieval[0]+": \n"+stackTraceStr(error[0]), error[0]);

tw.close();
tr.close();
}


@@ -885,7 +896,7 @@ public class TestTaxonomyCombined extends LuceneTestCase {
// ok
}
assertEquals(1, tr.getSize()); // still root only...
tr.refresh(); // this is not enough, because tw.commit() hasn't been done yet
assertNull(TaxonomyReader.openIfChanged(tr)); // this is not enough, because tw.commit() hasn't been done yet
try {
tr.getParent(author);
fail("Before commit() and refresh(), getParent for "+author+" should still throw exception");

@@ -901,7 +912,11 @@ public class TestTaxonomyCombined extends LuceneTestCase {
// ok
}
assertEquals(1, tr.getSize()); // still root only...
tr.refresh();
TaxonomyReader newTaxoReader = TaxonomyReader.openIfChanged(tr);
assertNotNull(newTaxoReader);
tr.close();
tr = newTaxoReader;

try {
assertEquals(TaxonomyReader.ROOT_ORDINAL, tr.getParent(author));
// ok

@@ -917,7 +932,10 @@ public class TestTaxonomyCombined extends LuceneTestCase {
tw.addCategory(new CategoryPath("Author", "Richard Dawkins"));
|
||||
int dawkins = 2;
|
||||
tw.commit();
|
||||
tr.refresh();
|
||||
newTaxoReader = TaxonomyReader.openIfChanged(tr);
|
||||
assertNotNull(newTaxoReader);
|
||||
tr.close();
|
||||
tr = newTaxoReader;
|
||||
assertEquals(author, tr.getParent(dawkins));
|
||||
assertEquals(TaxonomyReader.ROOT_ORDINAL, tr.getParent(author));
|
||||
assertEquals(TaxonomyReader.INVALID_ORDINAL, tr.getParent(TaxonomyReader.ROOT_ORDINAL));
|
||||
|

@@ -943,16 +961,19 @@ public class TestTaxonomyCombined extends LuceneTestCase {
// before commit and refresh, no change:
assertEquals(TaxonomyReader.INVALID_ORDINAL, tr.getOrdinal(author));
assertEquals(1, tr.getSize()); // still root only...
tr.refresh(); // this is not enough, because tw.commit() hasn't been done yet
assertNull(TaxonomyReader.openIfChanged(tr)); // this is not enough, because tw.commit() hasn't been done yet
assertEquals(TaxonomyReader.INVALID_ORDINAL, tr.getOrdinal(author));
assertEquals(1, tr.getSize()); // still root only...
tw.commit();
// still not enough before refresh:
assertEquals(TaxonomyReader.INVALID_ORDINAL, tr.getOrdinal(author));
assertEquals(1, tr.getSize()); // still root only...
tr.refresh(); // finally
TaxonomyReader newTaxoReader = TaxonomyReader.openIfChanged(tr);
assertNotNull(newTaxoReader);
tr.close();
tr = newTaxoReader;
assertEquals(1, tr.getOrdinal(author));
assertEquals(2, tr.getSize()); // still root only...
assertEquals(2, tr.getSize());
tw.close();
tr.close();
indexDir.close();

@@ -977,7 +998,7 @@ public class TestTaxonomyCombined extends LuceneTestCase {
// Try to open a second writer, with the first one locking the directory.
// We expect to get a LockObtainFailedException.
try {
new DirectoryTaxonomyWriter(indexDir);
assertNull(new DirectoryTaxonomyWriter(indexDir));
fail("should have failed to write in locked directory");
} catch (LockObtainFailedException e) {
// this is what we expect to happen.

@@ -989,7 +1010,10 @@ public class TestTaxonomyCombined extends LuceneTestCase {
tw2.addCategory(new CategoryPath("hey"));
tw2.close();
// See that the writer indeed wrote:
tr.refresh();
TaxonomyReader newtr = TaxonomyReader.openIfChanged(tr);
assertNotNull(newtr);
tr.close();
tr = newtr;
assertEquals(3, tr.getOrdinal(new CategoryPath("hey")));
tr.close();
tw.close();

@@ -1086,6 +1110,27 @@ public class TestTaxonomyCombined extends LuceneTestCase {
indexDir.close();
}

@Test
public void testNRT() throws Exception {
Directory dir = newDirectory();
DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
TaxonomyReader reader = new DirectoryTaxonomyReader(writer);

CategoryPath cp = new CategoryPath("a");
writer.addCategory(cp);
TaxonomyReader newReader = TaxonomyReader.openIfChanged(reader);
assertNotNull("expected a new instance", newReader);
assertEquals(2, newReader.getSize());
assertNotSame(TaxonomyReader.INVALID_ORDINAL, newReader.getOrdinal(cp));
reader.close();
reader = newReader;

writer.close();
reader.close();

dir.close();
}

// TODO (Facet): test multiple readers, one writer. Have the multiple readers
// using the same object (simulating threads) or different objects
// (simulating processes).

@@ -3,12 +3,11 @@ package org.apache.lucene.facet.taxonomy.directory;
import java.util.Random;

import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.InconsistentTaxonomyException;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

@@ -67,11 +66,8 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {
dir.close();
}

/**
 * Test the boolean returned by TR.refresh
 */
@Test
public void testReaderRefreshResult() throws Exception {
public void testOpenIfChangedResult() throws Exception {
Directory dir = null;
DirectoryTaxonomyWriter ltw = null;
DirectoryTaxonomyReader ltr = null;

@@ -84,13 +80,15 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {
ltw.commit();

ltr = new DirectoryTaxonomyReader(dir);
assertFalse("Nothing has changed",ltr.refresh());
assertNull("Nothing has changed", TaxonomyReader.openIfChanged(ltr));

ltw.addCategory(new CategoryPath("b"));
ltw.commit();

assertTrue("changes were committed",ltr.refresh());
assertFalse("Nothing has changed",ltr.refresh());
DirectoryTaxonomyReader newtr = TaxonomyReader.openIfChanged(ltr);
assertNotNull("changes were committed", newtr);
assertNull("Nothing has changed", TaxonomyReader.openIfChanged(newtr));
newtr.close();
} finally {
IOUtils.close(ltw, ltr, dir);
}

@@ -119,18 +117,15 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {
 */
@Test
public void testFreshReadRecreatedTaxonomy() throws Exception {
doTestReadRecreatedTaxono(random(), true);
doTestReadRecreatedTaxonomy(random(), true);
}

/**
 * recreating a taxonomy should work well with a refreshed taxonomy reader
 */
@Test
public void testRefreshReadRecreatedTaxonomy() throws Exception {
doTestReadRecreatedTaxono(random(), false);
public void testOpenIfChangedReadRecreatedTaxonomy() throws Exception {
doTestReadRecreatedTaxonomy(random(), false);
}

private void doTestReadRecreatedTaxono(Random random, boolean closeReader) throws Exception {
private void doTestReadRecreatedTaxonomy(Random random, boolean closeReader) throws Exception {
Directory dir = null;
TaxonomyWriter tw = null;
TaxonomyReader tr = null;

@@ -163,13 +158,10 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {
tr.close();
tr = new DirectoryTaxonomyReader(dir);
} else {
try {
tr.refresh();
fail("Expected InconsistentTaxonomyException");
} catch (InconsistentTaxonomyException e) {
tr.close();
tr = new DirectoryTaxonomyReader(dir);
}
TaxonomyReader newtr = TaxonomyReader.openIfChanged(tr);
assertNotNull(newtr);
tr.close();
tr = newtr;
}
assertEquals("Wrong #categories in taxonomy (i="+i+", k="+k+")", baseNumCategories + 1 + k, tr.getSize());
}

@@ -179,14 +171,14 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {
}

@Test
public void testRefreshAndRefCount() throws Exception {
public void testOpenIfChangedAndRefCount() throws Exception {
Directory dir = new RAMDirectory(); // no need for random directories here

DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(dir);
taxoWriter.addCategory(new CategoryPath("a"));
taxoWriter.commit();

DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(dir);
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(dir);
assertEquals("wrong refCount", 1, taxoReader.getRefCount());

taxoReader.incRef();

@@ -194,12 +186,189 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {

taxoWriter.addCategory(new CategoryPath("a", "b"));
taxoWriter.commit();
taxoReader.refresh();
assertEquals("wrong refCount", 2, taxoReader.getRefCount());
TaxonomyReader newtr = TaxonomyReader.openIfChanged(taxoReader);
assertNotNull(newtr);
taxoReader.close();
taxoReader = newtr;
assertEquals("wrong refCount", 1, taxoReader.getRefCount());

taxoWriter.close();
taxoReader.close();
dir.close();
}

@Test
public void testOpenIfChangedManySegments() throws Exception {
// test openIfChanged() when the taxonomy contains many segments
Directory dir = newDirectory();

DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir) {
@Override
protected IndexWriterConfig createIndexWriterConfig(OpenMode openMode) {
IndexWriterConfig conf = super.createIndexWriterConfig(openMode);
LogMergePolicy lmp = (LogMergePolicy) conf.getMergePolicy();
lmp.setMergeFactor(2);
return conf;
}
};
TaxonomyReader reader = new DirectoryTaxonomyReader(writer);

int numRounds = random().nextInt(10) + 10;
int numCategories = 1; // one for root
for (int i = 0; i < numRounds; i++) {
int numCats = random().nextInt(4) + 1;
for (int j = 0; j < numCats; j++) {
writer.addCategory(new CategoryPath(Integer.toString(i), Integer.toString(j)));
}
numCategories += numCats + 1 /* one for round-parent */;
TaxonomyReader newtr = TaxonomyReader.openIfChanged(reader);
assertNotNull(newtr);
reader.close();
reader = newtr;

// assert categories
assertEquals(numCategories, reader.getSize());
int roundOrdinal = reader.getOrdinal(new CategoryPath(Integer.toString(i)));
int[] parents = reader.getParentArray();
assertEquals(0, parents[roundOrdinal]); // round's parent is root
for (int j = 0; j < numCats; j++) {
int ord = reader.getOrdinal(new CategoryPath(Integer.toString(i), Integer.toString(j)));
assertEquals(roundOrdinal, parents[ord]); // each category's parent is the round's ordinal
}
}

reader.close();
writer.close();
dir.close();
}

@Test
public void testOpenIfChangedReuseAfterRecreate() throws Exception {
// tests that if the taxonomy is recreated, no data is reused from the previous taxonomy
Directory dir = newDirectory();
DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);
CategoryPath cp_a = new CategoryPath("a");
writer.addCategory(cp_a);
writer.close();

DirectoryTaxonomyReader r1 = new DirectoryTaxonomyReader(dir);
// fill r1's caches
assertEquals(1, r1.getOrdinal(cp_a));
assertEquals(cp_a, r1.getPath(1));

// now recreate, add a different category
writer = new DirectoryTaxonomyWriter(dir, OpenMode.CREATE);
CategoryPath cp_b = new CategoryPath("b");
writer.addCategory(cp_b);
writer.close();

DirectoryTaxonomyReader r2 = TaxonomyReader.openIfChanged(r1);
assertNotNull(r2);

// fill r2's caches
assertEquals(1, r2.getOrdinal(cp_b));
assertEquals(cp_b, r2.getPath(1));

// check that r1 doesn't see cp_b
assertEquals(TaxonomyReader.INVALID_ORDINAL, r1.getOrdinal(cp_b));
assertEquals(cp_a, r1.getPath(1));

// check that r2 doesn't see cp_a
assertEquals(TaxonomyReader.INVALID_ORDINAL, r2.getOrdinal(cp_a));
assertEquals(cp_b, r2.getPath(1));

r2.close();
r1.close();
dir.close();
}

@Test
public void testOpenIfChangedReuse() throws Exception {
// test the reuse of data from the old DTR instance
for (boolean nrt : new boolean[] {false, true}) {
Directory dir = newDirectory();
DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);

CategoryPath cp_a = new CategoryPath("a");
writer.addCategory(cp_a);
if (!nrt) writer.commit();

DirectoryTaxonomyReader r1 = nrt ? new DirectoryTaxonomyReader(writer) : new DirectoryTaxonomyReader(dir);
// fill r1's caches
assertEquals(1, r1.getOrdinal(cp_a));
assertEquals(cp_a, r1.getPath(1));

CategoryPath cp_b = new CategoryPath("b");
writer.addCategory(cp_b);
if (!nrt) writer.commit();

DirectoryTaxonomyReader r2 = TaxonomyReader.openIfChanged(r1);
assertNotNull(r2);

// add r2's categories to the caches
assertEquals(2, r2.getOrdinal(cp_b));
assertEquals(cp_b, r2.getPath(2));

// check that r1 doesn't see cp_b
assertEquals(TaxonomyReader.INVALID_ORDINAL, r1.getOrdinal(cp_b));
assertNull(r1.getPath(2));

r1.close();
r2.close();
writer.close();
dir.close();
}
}

@Test
public void testOpenIfChangedReplaceTaxonomy() throws Exception {
// test openIfChanged when replaceTaxonomy is called, which is equivalent
// to a recreate; should work with NRT as well
Directory src = newDirectory();
DirectoryTaxonomyWriter w = new DirectoryTaxonomyWriter(src);
CategoryPath cp_b = new CategoryPath("b");
w.addCategory(cp_b);
w.close();

for (boolean nrt : new boolean[] {false, true}) {
Directory dir = newDirectory();
DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);

CategoryPath cp_a = new CategoryPath("a");
writer.addCategory(cp_a);
if (!nrt) writer.commit();

DirectoryTaxonomyReader r1 = nrt ? new DirectoryTaxonomyReader(writer) : new DirectoryTaxonomyReader(dir);
// fill r1's caches
assertEquals(1, r1.getOrdinal(cp_a));
assertEquals(cp_a, r1.getPath(1));

// now replace taxonomy
writer.replaceTaxonomy(src);
if (!nrt) writer.commit();

DirectoryTaxonomyReader r2 = TaxonomyReader.openIfChanged(r1);
assertNotNull(r2);

// fill r2's caches
assertEquals(1, r2.getOrdinal(cp_b));
assertEquals(cp_b, r2.getPath(1));

// check that r1 doesn't see cp_b
assertEquals(TaxonomyReader.INVALID_ORDINAL, r1.getOrdinal(cp_b));
assertEquals(cp_a, r1.getPath(1));

// check that r2 doesn't see cp_a
assertEquals(TaxonomyReader.INVALID_ORDINAL, r2.getOrdinal(cp_a));
assertEquals(cp_b, r2.getPath(1));

r2.close();
r1.close();
writer.close();
dir.close();
}

src.close();
}

}

@@ -8,7 +8,7 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.InconsistentTaxonomyException;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.writercache.TaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.cl2o.Cl2oTaxonomyWriterCache;
import org.apache.lucene.facet.taxonomy.writercache.lru.LruTaxonomyWriterCache;

@@ -178,12 +178,14 @@ public class TestDirectoryTaxonomyWriter extends LuceneTestCase {
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(dir, OpenMode.CREATE_OR_APPEND, NO_OP_CACHE);
touchTaxo(taxoWriter, new CategoryPath("a"));

DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(dir);
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(dir);

touchTaxo(taxoWriter, new CategoryPath("b"));

// this should not fail
taxoReader.refresh();
TaxonomyReader newtr = TaxonomyReader.openIfChanged(taxoReader);
taxoReader.close();
taxoReader = newtr;
assertEquals(1, Integer.parseInt(taxoReader.getCommitUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH)));

// now recreate the taxonomy, and check that the epoch is preserved after opening DirTW again.
taxoWriter.close();

@@ -195,14 +197,11 @@ public class TestDirectoryTaxonomyWriter extends LuceneTestCase {
touchTaxo(taxoWriter, new CategoryPath("d"));
|
||||
taxoWriter.close();
|
||||
|
||||
// this should fail
|
||||
try {
|
||||
taxoReader.refresh();
|
||||
fail("IconsistentTaxonomyException should have been thrown");
|
||||
} catch (InconsistentTaxonomyException e) {
|
||||
// ok, expected
|
||||
}
|
||||
|
||||
newtr = TaxonomyReader.openIfChanged(taxoReader);
taxoReader.close();
taxoReader = newtr;
assertEquals(2, Integer.parseInt(taxoReader.getCommitUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH)));

taxoReader.close();
dir.close();
}

@@ -221,7 +220,7 @@ public class TestDirectoryTaxonomyWriter extends LuceneTestCase {

DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(dir);
assertEquals(1, Integer.parseInt(taxoReader.getCommitUserData().get(DirectoryTaxonomyWriter.INDEX_EPOCH)));
taxoReader.refresh();
assertNull(TaxonomyReader.openIfChanged(taxoReader));
taxoReader.close();

dir.close();