LUCENE-4565: Consolidate ParentArray and ChildrenArrays into ParallelTaxonomyArrays

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1417889 13f79535-47bb-0310-9956-ffa450edef68
Shai Erera 2012-12-06 13:55:33 +00:00
parent 72d71de227
commit 01fd342513
12 changed files with 359 additions and 365 deletions

View File: lucene/CHANGES.txt

@ -64,6 +64,11 @@ Changes in backwards compatibility policy
even if the commitData is the only thing that changes.
(Shai Erera, Michael McCandless)
* LUCENE-4565: TaxonomyReader.getParentArray and .getChildrenArrays consolidated
into one getParallelTaxonomyArrays(). You can obtain the 3 arrays that the
previous two methods returned by calling parents(), children() or siblings()
on the returned ParallelTaxonomyArrays. (Shai Erera)
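For code migrating to the new API, a minimal sketch of the change (the Lucene types and methods are those introduced by this patch; the surrounding class and method are illustrative only):

import java.io.IOException;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.ParallelTaxonomyArrays;

class TaxoArraysMigrationSketch {
  static void loadArrays(TaxonomyReader taxoReader) throws IOException {
    // one call now replaces getParentArray() and getChildrenArrays()
    ParallelTaxonomyArrays arrays = taxoReader.getParallelTaxonomyArrays();
    int[] parents  = arrays.parents();   // was taxoReader.getParentArray()
    int[] children = arrays.children();  // was getChildrenArrays().getYoungestChildArray()
    int[] siblings = arrays.siblings();  // was getChildrenArrays().getOlderSiblingArray()
  }
}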
New Features
* LUCENE-4226: New experimental StoredFieldsFormat that compresses chunks of

View File: TopKFacetResultsHandler.java

@ -6,10 +6,10 @@ import java.util.ArrayList;
import org.apache.lucene.facet.search.params.FacetRequest;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.search.results.FacetResultNode;
import org.apache.lucene.facet.search.results.MutableFacetResultNode;
import org.apache.lucene.facet.search.results.IntermediateFacetResult;
import org.apache.lucene.facet.taxonomy.ChildrenArrays;
import org.apache.lucene.facet.search.results.MutableFacetResultNode;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.ParallelTaxonomyArrays;
import org.apache.lucene.facet.util.ResultSortUtils;
/*
@ -123,9 +123,9 @@ public class TopKFacetResultsHandler extends FacetResultsHandler {
MutableFacetResultNode parentResultNode, FacetArrays facetArrays, int offset) throws IOException {
int partitionSize = facetArrays.getArraysLength();
int endOffset = offset + partitionSize;
ChildrenArrays childrenArray = taxonomyReader.getChildrenArrays();
int[] youngestChild = childrenArray.getYoungestChildArray();
int[] olderSibling = childrenArray.getOlderSiblingArray();
ParallelTaxonomyArrays childrenArray = taxonomyReader.getParallelTaxonomyArrays();
int[] children = childrenArray.children();
int[] siblings = childrenArray.siblings();
FacetResultNode reusable = null;
int localDepth = 0;
int depth = facetRequest.getDepth();
@ -134,9 +134,9 @@ public class TopKFacetResultsHandler extends FacetResultsHandler {
int tosOrdinal; // top of stack element
int yc = youngestChild[ordinal];
int yc = children[ordinal];
while (yc >= endOffset) {
yc = olderSibling[yc];
yc = siblings[yc];
}
// make use of the fact that TaxonomyReader.INVALID_ORDINAL == -1, < endOffset
// and it, too, can stop the loop.
@ -161,7 +161,7 @@ public class TopKFacetResultsHandler extends FacetResultsHandler {
// need to proceed to its sibling
localDepth--;
// change element now on top of stack to its sibling.
ordinalStack[localDepth] = olderSibling[ordinalStack[localDepth]];
ordinalStack[localDepth] = siblings[ordinalStack[localDepth]];
continue;
}
// top of stack is not invalid, this is the first time we see it on top of stack.
@ -187,9 +187,9 @@ public class TopKFacetResultsHandler extends FacetResultsHandler {
}
if (localDepth < depth) {
// push kid of current tos
yc = youngestChild[tosOrdinal];
yc = children[tosOrdinal];
while (yc >= endOffset) {
yc = olderSibling[yc];
yc = siblings[yc];
}
ordinalStack[++localDepth] = yc;
} else { // localDepth == depth; current tos exhausted its possible children, mark this by pushing INVALID_ORDINAL
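The rewritten lines above all use one idiom: children[ord] holds the youngest (highest-numbered) child of ord, and siblings[] links each child to its next-older sibling, so a short loop skips children that lie beyond the current partition. A standalone sketch of that idiom (method name illustrative, not part of the patch):

// Returns the youngest child of `ordinal` that falls inside the current
// partition (ordinal < endOffset), or TaxonomyReader.INVALID_ORDINAL if none.
// Because INVALID_ORDINAL == -1 is always < endOffset, it also ends the loop.
static int youngestChildInPartition(int ordinal, int[] children, int[] siblings, int endOffset) {
  int child = children[ordinal];
  while (child >= endOffset) {
    child = siblings[child];
  }
  return child;
}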

View File: TopKInEachNodeHandler.java

@ -4,16 +4,15 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.facet.search.params.FacetRequest;
import org.apache.lucene.facet.search.params.FacetRequest.SortOrder;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.search.results.FacetResultNode;
import org.apache.lucene.facet.search.results.MutableFacetResultNode;
import org.apache.lucene.facet.search.results.IntermediateFacetResult;
import org.apache.lucene.facet.taxonomy.ChildrenArrays;
import org.apache.lucene.facet.search.results.MutableFacetResultNode;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.ParallelTaxonomyArrays;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.collections.IntIterator;
import org.apache.lucene.util.collections.IntToObjectMap;
@ -141,9 +140,9 @@ public class TopKInEachNodeHandler extends FacetResultsHandler {
}
int endOffset = offset + partitionSize; // one past the largest ordinal in the partition
ChildrenArrays childrenArray = taxonomyReader.getChildrenArrays();
int[] youngestChild = childrenArray.getYoungestChildArray();
int[] olderSibling = childrenArray.getOlderSiblingArray();
ParallelTaxonomyArrays childrenArray = taxonomyReader.getParallelTaxonomyArrays();
int[] children = childrenArray.children();
int[] siblings = childrenArray.siblings();
int totalNumOfDescendantsConsidered = 0; // total number of facets with value != 0,
// in the tree. These include those selected as top K in each node, and all the others that
// were not. Not including rootNode
@ -217,7 +216,7 @@ public class TopKInEachNodeHandler extends FacetResultsHandler {
* we can continue to the older sibling of rootNode once the localDepth goes down, before we verify that
* it went that down)
*/
ordinalStack[++localDepth] = youngestChild[rootNode];
ordinalStack[++localDepth] = children[rootNode];
siblingExplored[localDepth] = Integer.MAX_VALUE; // we have not verified position wrt current partition
siblingExplored[0] = -1; // as if rootNode resides to the left of current position
@ -238,7 +237,7 @@ public class TopKInEachNodeHandler extends FacetResultsHandler {
// its child, now just removed, would not have been pushed on it.
// so the father is either inside the partition, or smaller ordinal
if (siblingExplored[localDepth] < 0 ) {
ordinalStack[localDepth] = olderSibling[ordinalStack[localDepth]];
ordinalStack[localDepth] = siblings[ordinalStack[localDepth]];
continue;
}
// in this point, siblingExplored[localDepth] between 0 and number of bestSiblings
@ -264,7 +263,7 @@ public class TopKInEachNodeHandler extends FacetResultsHandler {
//tosOrdinal was not examined yet for its position relative to current partition
// and the best K of current partition, among its siblings, have not been determined yet
while (tosOrdinal >= endOffset) {
tosOrdinal = olderSibling[tosOrdinal];
tosOrdinal = siblings[tosOrdinal];
}
// now it is inside. Run it and all its siblings inside the partition through a heap
// and in doing so, count them, find best K, and sum into residue
@ -297,12 +296,12 @@ public class TopKInEachNodeHandler extends FacetResultsHandler {
// update totalNumOfDescendants by the now excluded node and all its descendants
totalNumOfDescendantsConsidered--; // reduce the 1 earned when the excluded node entered the heap
// and now return it and all its descendants. These will never make it to FacetResult
totalNumOfDescendantsConsidered += countOnly (ac.ordinal, youngestChild,
olderSibling, arrays, partitionSize, offset, endOffset, localDepth, depth);
totalNumOfDescendantsConsidered += countOnly (ac.ordinal, children,
siblings, arrays, partitionSize, offset, endOffset, localDepth, depth);
reusables[++tosReuslables] = ac;
}
}
tosOrdinal = olderSibling[tosOrdinal];
tosOrdinal = siblings[tosOrdinal];
}
// now pq has best K children of ordinals that belong to the given partition.
// Populate a new AACO with them.
@ -343,7 +342,7 @@ public class TopKInEachNodeHandler extends FacetResultsHandler {
ordinalStack[++localDepth] = TaxonomyReader.INVALID_ORDINAL;
continue;
}
ordinalStack[++localDepth] = youngestChild[tosOrdinal];
ordinalStack[++localDepth] = children[tosOrdinal];
siblingExplored[localDepth] = Integer.MAX_VALUE;
} // endof loop while stack is not empty
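countOnly(...) used above walks a subtree with the same two arrays while also tracking partition bounds and depth; stripped of that bookkeeping, the walk reduces to a sketch like this (hypothetical helper, not part of the patch):

// Counts `ordinal` plus all of its descendants. children[ordinal] is the
// youngest child; siblings[] leads to the older ones, ending at
// TaxonomyReader.INVALID_ORDINAL (== -1).
static int countSubtree(int ordinal, int[] children, int[] siblings) {
  int count = 1; // this node
  for (int child = children[ordinal];
       child != TaxonomyReader.INVALID_ORDINAL;
       child = siblings[child]) {
    count += countSubtree(child, children, siblings);
  }
  return count;
}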

View File: ChildrenArrays.java (deleted)

@ -1,87 +0,0 @@
package org.apache.lucene.facet.taxonomy;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Equivalent representations of the taxonomy's parent info,
* used internally for efficient computation of facet results:
* "youngest child" and "oldest sibling"
*/
public class ChildrenArrays {
private final int[] youngestChild, olderSibling;
public ChildrenArrays(int[] parents) {
this(parents, null);
}
public ChildrenArrays(int[] parents, ChildrenArrays copyFrom) {
youngestChild = new int[parents.length];
olderSibling = new int[parents.length];
int first = 0;
if (copyFrom != null) {
System.arraycopy(copyFrom.getYoungestChildArray(), 0, youngestChild, 0, copyFrom.getYoungestChildArray().length);
System.arraycopy(copyFrom.getOlderSiblingArray(), 0, olderSibling, 0, copyFrom.getOlderSiblingArray().length);
first = copyFrom.getOlderSiblingArray().length;
}
computeArrays(parents, first);
}
private void computeArrays(int[] parents, int first) {
// reset the youngest child of all ordinals. while this should be done only
// for the leaves, we don't know up front which are the leaves, so we reset
// all of them.
for (int i = first; i < parents.length; i++) {
youngestChild[i] = TaxonomyReader.INVALID_ORDINAL;
}
// the root category has no parent, and therefore no siblings
if (first == 0) {
first = 1;
olderSibling[0] = TaxonomyReader.INVALID_ORDINAL;
}
for (int i = first; i < parents.length; i++) {
// note that parents[i] is always < i, so the right-hand-side of
// the following line is already set when we get here
olderSibling[i] = youngestChild[parents[i]];
youngestChild[parents[i]] = i;
}
}
/**
* Returns an {@code int[]} the size of the taxonomy listing for each category
* the ordinal of its immediate older sibling (the sibling in the taxonomy
* tree with the highest ordinal below that of the given ordinal). The value
* for a category with no older sibling is {@link TaxonomyReader#INVALID_ORDINAL}.
*/
public int[] getOlderSiblingArray() {
return olderSibling;
}
/**
* Returns an {@code int[]} the size of the taxonomy listing the ordinal of
* the youngest (highest numbered) child category of each category in the
* taxonomy. The value for a leaf category (a category without children) is
* {@link TaxonomyReader#INVALID_ORDINAL}.
*/
public int[] getYoungestChildArray() {
return youngestChild;
}
}

View File: TaxonomyReader.java

@ -5,6 +5,7 @@ import java.io.IOException;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.facet.taxonomy.directory.ParallelTaxonomyArrays;
import org.apache.lucene.store.AlreadyClosedException;
/*
@ -162,16 +163,10 @@ public abstract class TaxonomyReader implements Closeable {
}
/**
* Returns a {@link ChildrenArrays} object which can be used together to
* efficiently enumerate the children of any category.
* <p>
* The caller can hold on to the object it got indefinitely - it is guaranteed
* that no-one else will modify it. The other side of the same coin is that
* the caller must treat the object which it got (and the arrays it contains)
* as read-only and <b>not modify it</b>, because other callers might have
* gotten the same object too.
* Returns a {@link ParallelTaxonomyArrays} object which can be used to
* efficiently traverse the taxonomy tree.
*/
public abstract ChildrenArrays getChildrenArrays() throws IOException;
public abstract ParallelTaxonomyArrays getParallelTaxonomyArrays() throws IOException;
/**
* Retrieve user committed data.
@ -195,7 +190,6 @@ public abstract class TaxonomyReader implements Closeable {
* Returns the ordinal of the parent category of the category with the given
* ordinal, according to the following rules:
*
*
* <ul>
* <li>If the given ordinal is the {@link #ROOT_ORDINAL}, an
* {@link #INVALID_ORDINAL} is returned.
@ -210,19 +204,7 @@ public abstract class TaxonomyReader implements Closeable {
* available ordinal)
*/
public abstract int getParent(int ordinal) throws IOException;
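Those rules guarantee that ancestor walks terminate: getParent(ROOT_ORDINAL) is INVALID_ORDINAL. For example, a small (illustrative) helper that computes a category's depth:

import java.io.IOException;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;

class TaxonomyDepthSketch {
  // depth(ROOT_ORDINAL) == 0; each step follows the parent chain upward.
  static int depth(TaxonomyReader taxoReader, int ordinal) throws IOException {
    int depth = 0;
    for (int p = taxoReader.getParent(ordinal);
         p != TaxonomyReader.INVALID_ORDINAL;
         p = taxoReader.getParent(p)) {
      depth++;
    }
    return depth;
  }
}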
/**
* Returns an {@code int[]} the size of the taxonomy listing the ordinal of
* the parent category of each category in the taxonomy.
* <p>
* The caller can hold on to the array it got indefinitely - it is guaranteed
* that no-one else will modify it. The other side of the same coin is that
* the caller must treat the array it got as read-only and <b>not modify
* it</b>, because other callers might have gotten the same array too (and
* getParent() calls might be answered from the same array).
*/
public abstract int[] getParentArray() throws IOException;
/**
* Returns the path name of the category with the given ordinal. The path is
* returned as a new CategoryPath object - to reuse an existing object, use

View File: DirectoryTaxonomyReader.java

@ -6,7 +6,6 @@ import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.ChildrenArrays;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.Consts.LoadFullPathOnly;
import org.apache.lucene.index.CorruptIndexException;
@ -63,9 +62,7 @@ public class DirectoryTaxonomyReader extends TaxonomyReader {
private LRUHashMap<String, Integer> ordinalCache;
private LRUHashMap<Integer, String> categoryCache;
// TODO: consolidate these objects into one ParentInfo or something?
private volatile ParentArray parentArray;
private volatile ChildrenArrays childrenArrays;
private volatile ParallelTaxonomyArrays taxoArrays;
private char delimiter = Consts.DEFAULT_DELIMITER;
@ -75,9 +72,8 @@ public class DirectoryTaxonomyReader extends TaxonomyReader {
* arrays.
*/
DirectoryTaxonomyReader(DirectoryReader indexReader, DirectoryTaxonomyWriter taxoWriter,
LRUHashMap<String,Integer> ordinalCache,
LRUHashMap<Integer,String> categoryCache, ParentArray parentArray,
ChildrenArrays childrenArrays) throws IOException {
LRUHashMap<String,Integer> ordinalCache, LRUHashMap<Integer,String> categoryCache,
ParallelTaxonomyArrays taxoArrays) throws IOException {
this.indexReader = indexReader;
this.taxoWriter = taxoWriter;
this.taxoEpoch = taxoWriter == null ? -1 : taxoWriter.getTaxonomyEpoch();
@ -86,14 +82,7 @@ public class DirectoryTaxonomyReader extends TaxonomyReader {
this.ordinalCache = ordinalCache == null ? new LRUHashMap<String,Integer>(DEFAULT_CACHE_VALUE) : ordinalCache;
this.categoryCache = categoryCache == null ? new LRUHashMap<Integer,String>(DEFAULT_CACHE_VALUE) : categoryCache;
this.parentArray = null;
this.childrenArrays = null;
if (parentArray != null) {
this.parentArray = new ParentArray(indexReader, parentArray);
if (childrenArrays != null) {
this.childrenArrays = new ChildrenArrays(this.parentArray.getArray(), childrenArrays);
}
}
this.taxoArrays = taxoArrays != null ? new ParallelTaxonomyArrays(indexReader, taxoArrays) : null;
}
/**
@ -167,11 +156,20 @@ public class DirectoryTaxonomyReader extends TaxonomyReader {
return ret;
}
private synchronized void initTaxoArrays() throws IOException {
if (taxoArrays == null) {
// according to Java Concurrency in Practice, this might perform better on
// some JVMs, because the array initialization doesn't happen on the
// volatile member.
ParallelTaxonomyArrays tmpArrays = new ParallelTaxonomyArrays(indexReader);
taxoArrays = tmpArrays;
}
}
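initTaxoArrays, together with the null check at its call site below, forms the classic double-checked lazy-initialization idiom: test, synchronize, re-test, build into a local, and only then assign the volatile field. A generic sketch of the same pattern (no Lucene types; names illustrative):

class LazyInitSketch {
  private volatile int[] data;
  int[] getData() {
    int[] local = data;        // single volatile read on the fast path
    if (local == null) {
      synchronized (this) {
        local = data;          // re-check: another thread may have initialized it
        if (local == null) {
          local = buildData(); // heavy work done off the volatile field
          data = local;        // publish only the fully built array
        }
      }
    }
    return local;
  }
  private int[] buildData() { return new int[16]; } // stand-in for the real work
}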
@Override
protected void doClose() throws IOException {
indexReader.close();
parentArray = null;
childrenArrays = null;
taxoArrays = null;
// do not clear() the caches, as they may be used by other DTR instances.
ordinalCache = null;
categoryCache = null;
@ -233,9 +231,9 @@ public class DirectoryTaxonomyReader extends TaxonomyReader {
if (recreated) {
// if recreated, do not reuse anything from this instance. the information
// will be lazily computed by the new instance when needed.
newtr = new DirectoryTaxonomyReader(r2, taxoWriter, null, null, null, null);
newtr = new DirectoryTaxonomyReader(r2, taxoWriter, null, null, null);
} else {
newtr = new DirectoryTaxonomyReader(r2, taxoWriter, ordinalCache, categoryCache, parentArray, childrenArrays);
newtr = new DirectoryTaxonomyReader(r2, taxoWriter, ordinalCache, categoryCache, taxoArrays);
}
success = true;
@ -265,16 +263,12 @@ public class DirectoryTaxonomyReader extends TaxonomyReader {
}
@Override
public ChildrenArrays getChildrenArrays() throws IOException {
public ParallelTaxonomyArrays getParallelTaxonomyArrays() throws IOException {
ensureOpen();
if (childrenArrays == null) {
synchronized (this) {
if (childrenArrays == null) {
childrenArrays = new ChildrenArrays(getParentArray());
}
}
if (taxoArrays == null) {
initTaxoArrays();
}
return childrenArrays;
return taxoArrays;
}
@Override
@ -330,26 +324,12 @@ public class DirectoryTaxonomyReader extends TaxonomyReader {
return ret;
}
// TODO: move to a ParentInfo class? (see TODO for parentArray)
@Override
public int getParent(int ordinal) throws IOException {
ensureOpen();
return getParentArray()[ordinal];
return getParallelTaxonomyArrays().parents()[ordinal];
}
@Override
public int[] getParentArray() throws IOException {
ensureOpen();
if (parentArray == null) {
synchronized (this) {
if (parentArray == null) {
parentArray = new ParentArray(indexReader);
}
}
}
return parentArray.getArray();
}
@Override
public CategoryPath getPath(int ordinal) throws IOException {
ensureOpen();

View File: DirectoryTaxonomyWriter.java

@ -112,6 +112,12 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
private int cacheMissesUntilFill = 11;
private boolean shouldFillCache = true;
// even though lazily initialized, not volatile so that access to it is
// faster. we keep a volatile boolean init instead.
private ReaderManager readerManager;
private volatile boolean initializedReaderManager = false;
private volatile boolean shouldRefreshReaderManager;
/**
* We call the cache "complete" if we know that every category in our
* taxonomy is in the cache. When the cache is <B>not</B> complete, and
@ -123,14 +129,10 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
* that some of the cached data was cleared).
*/
private volatile boolean cacheIsComplete;
private volatile ReaderManager readerManager;
private volatile boolean shouldRefreshReaderManager;
private volatile boolean isClosed = false;
private volatile ParentArray parentArray;
private volatile ParallelTaxonomyArrays taxoArrays;
private volatile int nextID;
// private Map<String,String> commitData;
/** Reads the commit data from a Directory. */
private static Map<String, String> readCommitData(Directory dir) throws IOException {
SegmentInfos infos = new SegmentInfos();
@ -308,13 +310,14 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
/** Opens a {@link ReaderManager} from the internal {@link IndexWriter}. */
private void initReaderManager() throws IOException {
if (readerManager == null) {
if (!initializedReaderManager) {
synchronized (this) {
// verify that the taxo-writer hasn't been closed on us.
ensureOpen();
if (readerManager == null) {
if (!initializedReaderManager) {
readerManager = new ReaderManager(indexWriter, false);
shouldRefreshReaderManager = false;
initializedReaderManager = true;
}
}
}
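Note the variant adopted here: readerManager itself is deliberately not volatile (see the comment added beside the field above); a separate volatile boolean publishes it instead, so steady-state reads of the field stay cheap. A generic sketch of that pattern, assuming every read of the field is preceded by a read of the flag:

class FlagGuardedInitSketch {
  private Object manager;                       // plain field: cheap reads
  private volatile boolean initialized = false; // volatile write publishes `manager`
  void ensureInitialized() {
    if (!initialized) {
      synchronized (this) {
        if (!initialized) {
          manager = new Object(); // stand-in for the expensive construction
          initialized = true;     // happens-before any later read of the flag
        }
      }
    }
  }
  Object get() {
    ensureInitialized();
    return manager; // safe: guarded by the volatile read in ensureInitialized()
  }
}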
@ -341,8 +344,6 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
return new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
}
// convenience constructors:
public DirectoryTaxonomyWriter(Directory d) throws IOException {
this(d, OpenMode.CREATE_OR_APPEND);
}
@ -375,9 +376,10 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
* <code>super.closeResources()</code> call in your implementation.
*/
protected synchronized void closeResources() throws IOException {
if (readerManager != null) {
if (initializedReaderManager) {
readerManager.close();
readerManager = null;
initializedReaderManager = false;
}
if (cache != null) {
cache.close();
@ -467,15 +469,19 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
int doc = -1;
DirectoryReader reader = readerManager.acquire();
try {
TermsEnum termsEnum = null; // reuse
DocsEnum docs = null; // reuse
final BytesRef catTerm = new BytesRef(categoryPath.toString(delimiter, prefixLen));
for (AtomicReaderContext ctx : reader.leaves()) {
Terms terms = ctx.reader().terms(Consts.FULL);
if (terms != null) {
TermsEnum termsEnum = terms.iterator(null);
termsEnum = terms.iterator(termsEnum);
if (termsEnum.seekExact(catTerm, true)) {
// TODO: is it really ok that null is passed here as liveDocs?
DocsEnum docs = termsEnum.docs(null, null, 0);
// liveDocs=null because the taxonomy has no deletes
docs = termsEnum.docs(null, docs, 0 /* freqs not required */);
// if the term was found, we know it has exactly one document.
doc = docs.nextDoc() + ctx.docBase;
break;
}
}
}
@ -589,7 +595,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
addToCache(categoryPath, length, id);
// also add to the parent array
parentArray = getParentArray().add(id, parent);
taxoArrays = getTaxoArrays().add(id, parent);
return id;
}
@ -657,7 +663,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
// NOTE: since this method is sync'ed, it can call maybeRefresh, instead of
// maybeRefreshBlocking. If ever this is changed, make sure to change the
// call too.
if (shouldRefreshReaderManager && readerManager != null) {
if (shouldRefreshReaderManager && initializedReaderManager) {
readerManager.maybeRefresh();
shouldRefreshReaderManager = false;
}
@ -791,25 +797,30 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
// initReaderManager called in parallel.
readerManager.close();
readerManager = null;
initializedReaderManager = false;
}
}
}
private ParentArray getParentArray() throws IOException {
if (parentArray == null) {
private ParallelTaxonomyArrays getTaxoArrays() throws IOException {
if (taxoArrays == null) {
synchronized (this) {
if (parentArray == null) {
if (taxoArrays == null) {
initReaderManager();
DirectoryReader reader = readerManager.acquire();
try {
parentArray = new ParentArray(reader);
// according to Java Concurrency in Practice, this might perform better on
// some JVMs, since the object initialization doesn't happen on the
// volatile member.
ParallelTaxonomyArrays tmpArrays = new ParallelTaxonomyArrays(reader);
taxoArrays = tmpArrays;
} finally {
readerManager.release(reader);
}
}
}
}
return parentArray;
return taxoArrays;
}
@Override
@ -821,7 +832,7 @@ public class DirectoryTaxonomyWriter implements TaxonomyWriter {
if (ordinal >= nextID) {
throw new ArrayIndexOutOfBoundsException("requested ordinal is bigger than the largest ordinal in the taxonomy");
}
return getParentArray().getArray()[ordinal];
return getTaxoArrays().parents()[ordinal];
}
/**

View File: ParallelTaxonomyArrays.java (new)

@ -0,0 +1,230 @@
package org.apache.lucene.facet.taxonomy.directory;
import java.io.IOException;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.ArrayUtil;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Returns 3 arrays for traversing the taxonomy:
* <ul>
* <li>{@code parents}: {@code parents[i]} denotes the parent of category
* ordinal {@code i}.</li>
* <li>{@code children}: {@code children[i]} denotes the youngest child of
* category ordinal {@code i}. The youngest child is defined as the category
* that was added last to the taxonomy as an immediate child of {@code i}.</li>
* <li>{@code siblings}: {@code siblings[i]} denotes the sibling of category
* ordinal {@code i}. The sibling is defined as the previous youngest child of
* {@code parents[i]}.</li>
* </ul>
*
To traverse the taxonomy tree, you typically start with {@code children[0]}
(ordinal 0 is reserved for ROOT), and then, depending on whether you want to
do DFS or BFS, you call {@code children[children[0]]} or
{@code siblings[children[0]]} respectively, and so forth.
*
* <p>
* <b>NOTE:</b> you are not expected to modify the values of the arrays, since
* the arrays are shared with other threads.
*
* @lucene.experimental
*/
public class ParallelTaxonomyArrays {
private final int[] parents;
// the following two arrays are lazily initialized. note that we only keep a
// single boolean member as volatile, instead of declaring the arrays
// volatile. the code guarantees that the arrays are returned only after the
// boolean is set to true.
private volatile boolean initializedChildren = false;
private int[] children, siblings;
/** Used by {@link #add(int, int)} after the array grew. */
private ParallelTaxonomyArrays(int[] parents) {
this.parents = parents;
}
public ParallelTaxonomyArrays(IndexReader reader) throws IOException {
parents = new int[reader.maxDoc()];
if (parents.length > 0) {
initParents(reader, 0);
// Starting Lucene 2.9, following the change LUCENE-1542, we can
// no longer reliably read the parent "-1" (see comment in
// LuceneTaxonomyWriter.SinglePositionTokenStream). We have no way
// to fix this in indexing without breaking backward-compatibility
// with existing indexes, so what we'll do instead is just
// hard-code the parent of ordinal 0 to be -1, and assume (as is
// indeed the case) that no other parent can be -1.
parents[0] = TaxonomyReader.INVALID_ORDINAL;
}
}
public ParallelTaxonomyArrays(IndexReader reader, ParallelTaxonomyArrays copyFrom) throws IOException {
assert copyFrom != null;
// note that copyParents.length may be equal to reader.maxDoc(). this is not a bug;
// it may be caused if e.g. the taxonomy segments were merged, and so an updated
// NRT reader was obtained, even though nothing was changed. this is not very likely
// to happen.
int[] copyParents = copyFrom.parents();
this.parents = new int[reader.maxDoc()];
System.arraycopy(copyParents, 0, parents, 0, copyParents.length);
initParents(reader, copyParents.length);
if (copyFrom.initializedChildren) {
initChildrenSiblings(copyFrom);
}
}
private final synchronized void initChildrenSiblings(ParallelTaxonomyArrays copyFrom) {
if (!initializedChildren) { // must do this check !
children = new int[parents.length];
siblings = new int[parents.length];
if (copyFrom != null) {
// called from the ctor, after we know copyFrom has initialized children/siblings
System.arraycopy(copyFrom.children(), 0, children, 0, copyFrom.children().length);
System.arraycopy(copyFrom.siblings(), 0, siblings, 0, copyFrom.siblings().length);
}
computeChildrenSiblings(parents, 0);
initializedChildren = true;
}
}
private void computeChildrenSiblings(int[] parents, int first) {
// reset the youngest child of all ordinals. while this should be done only
// for the leaves, we don't know up front which are the leaves, so we reset
// all of them.
for (int i = first; i < parents.length; i++) {
children[i] = TaxonomyReader.INVALID_ORDINAL;
}
// the root category has no parent, and therefore no siblings
if (first == 0) {
first = 1;
siblings[0] = TaxonomyReader.INVALID_ORDINAL;
}
for (int i = first; i < parents.length; i++) {
// note that parents[i] is always < i, so the right-hand-side of
// the following line is already set when we get here
siblings[i] = children[parents[i]];
children[parents[i]] = i;
}
}
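To make computeChildrenSiblings concrete, take a four-category taxonomy: the root (ordinal 0), two children of the root added in order (ordinals 1 and 2), and one child of category 1 (ordinal 3). Running the loop above over its parents array gives (worked by hand for this assumed input):

parents  = { -1, 0, 0, 1 }
children = {  2, 3, -1, -1 }  // 2 was the root's last-added ("youngest") child
siblings = { -1, -1, 1, -1 }  // category 2's next-older sibling is 1; the rest have none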
// Read the parents of the new categories
private void initParents(IndexReader reader, int first) throws IOException {
if (reader.maxDoc() == first) {
return;
}
// it's ok to use MultiFields because we only iterate on one posting list.
// breaking it to loop over the leaves() only complicates code for no
// apparent gain.
DocsAndPositionsEnum positions = MultiFields.getTermPositionsEnum(reader, null,
Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF,
DocsAndPositionsEnum.FLAG_PAYLOADS);
// shouldn't really happen; if it does, something's wrong
if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
throw new CorruptIndexException("Missing parent data for category " + first);
}
int num = reader.maxDoc();
for (int i = first; i < num; i++) {
if (positions.docID() == i) {
if (positions.freq() == 0) { // shouldn't happen
throw new CorruptIndexException("Missing parent data for category " + i);
}
parents[i] = positions.nextPosition();
if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
if (i + 1 < num) {
throw new CorruptIndexException("Missing parent data for category "+ (i + 1));
}
break;
}
} else { // this shouldn't happen
throw new CorruptIndexException("Missing parent data for category " + i);
}
}
}
/**
* Adds the given ordinal/parent info and returns either a new instance if the
* underlying array had to grow, or this instance otherwise.
* <p>
* <b>NOTE:</b> you should call this method from thread-safe code.
*/
ParallelTaxonomyArrays add(int ordinal, int parentOrdinal) {
if (ordinal >= parents.length) {
int[] newarray = ArrayUtil.grow(parents);
newarray[ordinal] = parentOrdinal;
return new ParallelTaxonomyArrays(newarray);
}
parents[ordinal] = parentOrdinal;
return this;
}
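Since add returns a fresh instance whenever the backing array has to grow, callers must store the result back, exactly as DirectoryTaxonomyWriter does above (taxoArrays = getTaxoArrays().add(id, parent)); discarding the return value would silently drop the new parent on a growth step:

// correct: keep whichever instance add() hands back (this, or a grown copy)
taxoArrays = taxoArrays.add(ordinal, parentOrdinal);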
/**
* Returns the parents array, where {@code parents[i]} denotes the parent of
* category ordinal {@code i}.
*/
public int[] parents() {
return parents;
}
/**
* Returns the children array, where {@code children[i]} denotes the youngest
* child of category ordinal {@code i}. The youngest child is defined as the
* category that was added last to the taxonomy as an immediate child of
* {@code i}.
*/
public int[] children() {
if (!initializedChildren) {
initChildrenSiblings(null);
}
// the array is guaranteed to be populated
return children;
}
/**
* Returns the siblings array, where {@code siblings[i]} denotes the sibling
* of category ordinal {@code i}. The sibling is defined as the previous
* youngest child of {@code parents[i]}.
*/
public int[] siblings() {
if (!initializedChildren) {
initChildrenSiblings(null);
}
// the array is guaranteed to be populated
return siblings;
}
}
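A sketch of the DFS described in the class javadoc: start from the ROOT ordinal and recurse through children[]/siblings[], treating the arrays as read-only (only the Lucene types are real; the class and method names are illustrative):

import java.io.IOException;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.ParallelTaxonomyArrays;

class TaxonomyDfsSketch {
  static void dfs(TaxonomyReader taxoReader) throws IOException {
    ParallelTaxonomyArrays arrays = taxoReader.getParallelTaxonomyArrays();
    visit(TaxonomyReader.ROOT_ORDINAL, arrays.children(), arrays.siblings(), 0);
  }

  private static void visit(int ordinal, int[] children, int[] siblings, int depth) {
    // process `ordinal` here, e.g. print it indented by `depth`
    for (int child = children[ordinal];
         child != TaxonomyReader.INVALID_ORDINAL;
         child = siblings[child]) {
      visit(child, children, siblings, depth + 1);
    }
  }
}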

View File: ParentArray.java (deleted)

@ -1,130 +0,0 @@
package org.apache.lucene.facet.taxonomy.directory;
import java.io.IOException;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.ArrayUtil;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* @lucene.experimental
*/
class ParentArray {
// TODO: maybe use PackedInts?
private final int[] parentOrdinals;
/** Used by {@link #add(int, int)} when the array needs to grow. */
ParentArray(int[] parentOrdinals) {
this.parentOrdinals = parentOrdinals;
}
public ParentArray(IndexReader reader) throws IOException {
parentOrdinals = new int[reader.maxDoc()];
if (parentOrdinals.length > 0) {
initFromReader(reader, 0);
// Starting Lucene 2.9, following the change LUCENE-1542, we can
// no longer reliably read the parent "-1" (see comment in
// LuceneTaxonomyWriter.SinglePositionTokenStream). We have no way
// to fix this in indexing without breaking backward-compatibility
// with existing indexes, so what we'll do instead is just
// hard-code the parent of ordinal 0 to be -1, and assume (as is
// indeed the case) that no other parent can be -1.
parentOrdinals[0] = TaxonomyReader.INVALID_ORDINAL;
}
}
public ParentArray(IndexReader reader, ParentArray copyFrom) throws IOException {
assert copyFrom != null;
// note that copyParents.length may be equal to reader.maxDoc(). this is not a bug
// it may be caused if e.g. the taxonomy segments were merged, and so an updated
// NRT reader was obtained, even though nothing was changed. this is not very likely
// to happen.
int[] copyParents = copyFrom.getArray();
this.parentOrdinals = new int[reader.maxDoc()];
System.arraycopy(copyParents, 0, parentOrdinals, 0, copyParents.length);
initFromReader(reader, copyParents.length);
}
// Read the parents of the new categories
private void initFromReader(IndexReader reader, int first) throws IOException {
if (reader.maxDoc() == first) {
return;
}
// it's ok to use MultiFields because we only iterate on one posting list.
// breaking it to loop over the leaves() only complicates code for no
// apparent gain.
DocsAndPositionsEnum positions = MultiFields.getTermPositionsEnum(reader, null,
Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF,
DocsAndPositionsEnum.FLAG_PAYLOADS);
// shouldn't really happen, if it does, something's wrong
if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
throw new CorruptIndexException("Missing parent data for category " + first);
}
int num = reader.maxDoc();
for (int i = first; i < num; i++) {
if (positions.docID() == i) {
if (positions.freq() == 0) { // shouldn't happen
throw new CorruptIndexException("Missing parent data for category " + i);
}
parentOrdinals[i] = positions.nextPosition();
if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
if (i + 1 < num) {
throw new CorruptIndexException("Missing parent data for category "+ (i + 1));
}
break;
}
} else { // this shouldn't happen
throw new CorruptIndexException("Missing parent data for category " + i);
}
}
}
public int[] getArray() {
return parentOrdinals;
}
/**
* Adds the given ordinal/parent info and returns either a new instance if the
* underlying array had to grow, or this instance otherwise.
* <p>
* <b>NOTE:</b> you should call this method from a thread-safe code.
*/
ParentArray add(int ordinal, int parentOrdinal) {
if (ordinal >= parentOrdinals.length) {
int[] newarray = ArrayUtil.grow(parentOrdinals);
newarray[ordinal] = parentOrdinal;
return new ParentArray(newarray);
}
parentOrdinals[ordinal] = parentOrdinal;
return this;
}
}

View File: MultiIteratorsPerCLParamsTest.java

@ -232,13 +232,15 @@ public class MultiIteratorsPerCLParamsTest extends LuceneTestCase {
CategoryPath cp = new CategoryPath(requestedPath.getComponent(0));
parentOrdinal = taxo.getOrdinal(cp);
}
parentArray = taxo.getParentArray();
parentArray = taxo.getParallelTaxonomyArrays().parents();
}
@Override
public boolean init() throws IOException {
return superCLI.init();
}
@Override
public long nextCategory() throws IOException {
long next;
while ((next = superCLI.nextCategory()) <= Integer.MAX_VALUE
@ -259,6 +261,7 @@ public class MultiIteratorsPerCLParamsTest extends LuceneTestCase {
return false;
}
@Override
public boolean skipTo(int docId) throws IOException {
return superCLI.skipTo(docId);
}

View File: TestTaxonomyCombined.java

@ -9,6 +9,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.facet.taxonomy.directory.ParallelTaxonomyArrays;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
@ -545,7 +546,7 @@ public class TestTaxonomyCombined extends LuceneTestCase {
fillTaxonomy(tw);
tw.close();
TaxonomyReader tr = new DirectoryTaxonomyReader(indexDir);
int[] parents = tr.getParentArray();
int[] parents = tr.getParallelTaxonomyArrays().parents();
assertEquals(tr.getSize(), parents.length);
for (int i=0; i<tr.getSize(); i++) {
assertEquals(tr.getParent(i), parents[i]);
@ -566,10 +567,10 @@ public class TestTaxonomyCombined extends LuceneTestCase {
fillTaxonomy(tw);
tw.close();
TaxonomyReader tr = new DirectoryTaxonomyReader(indexDir);
ChildrenArrays ca = tr.getChildrenArrays();
int[] youngestChildArray = ca.getYoungestChildArray();
ParallelTaxonomyArrays ca = tr.getParallelTaxonomyArrays();
int[] youngestChildArray = ca.children();
assertEquals(tr.getSize(), youngestChildArray.length);
int[] olderSiblingArray = ca.getOlderSiblingArray();
int[] olderSiblingArray = ca.siblings();
assertEquals(tr.getSize(), olderSiblingArray.length);
for (int i=0; i<expectedCategories.length; i++) {
// find expected children by looking at all expectedCategories
@ -630,15 +631,15 @@ public class TestTaxonomyCombined extends LuceneTestCase {
fillTaxonomy(tw);
tw.close();
TaxonomyReader tr = new DirectoryTaxonomyReader(indexDir);
ChildrenArrays ca = tr.getChildrenArrays();
int[] youngestChildArray = ca.getYoungestChildArray();
assertEquals(tr.getSize(), youngestChildArray.length);
int[] olderSiblingArray = ca.getOlderSiblingArray();
ParallelTaxonomyArrays ca = tr.getParallelTaxonomyArrays();
int[] children = ca.children();
assertEquals(tr.getSize(), children.length);
int[] olderSiblingArray = ca.siblings();
assertEquals(tr.getSize(), olderSiblingArray.length);
// test that the "youngest child" of every category is indeed a child:
for (int i=0; i<tr.getSize(); i++) {
int youngestChild = youngestChildArray[i];
int youngestChild = children[i];
if (youngestChild != TaxonomyReader.INVALID_ORDINAL) {
assertEquals(i, tr.getParent(youngestChild));
}
@ -676,7 +677,7 @@ public class TestTaxonomyCombined extends LuceneTestCase {
if (j==i) { // no child found
j=TaxonomyReader.INVALID_ORDINAL;
}
assertEquals(j, youngestChildArray[i]);
assertEquals(j, children[i]);
}
// test that the "older sibling" is indeed the least oldest one - and
@ -710,32 +711,32 @@ public class TestTaxonomyCombined extends LuceneTestCase {
tw.addCategory(new CategoryPath("hi", "there"));
tw.commit();
TaxonomyReader tr = new DirectoryTaxonomyReader(indexDir);
ChildrenArrays ca = tr.getChildrenArrays();
ParallelTaxonomyArrays ca = tr.getParallelTaxonomyArrays();
assertEquals(3, tr.getSize());
assertEquals(3, ca.getOlderSiblingArray().length);
assertEquals(3, ca.getYoungestChildArray().length);
assertTrue(Arrays.equals(new int[] { 1, 2, -1 }, ca.getYoungestChildArray()));
assertTrue(Arrays.equals(new int[] { -1, -1, -1 }, ca.getOlderSiblingArray()));
assertEquals(3, ca.siblings().length);
assertEquals(3, ca.children().length);
assertTrue(Arrays.equals(new int[] { 1, 2, -1 }, ca.children()));
assertTrue(Arrays.equals(new int[] { -1, -1, -1 }, ca.siblings()));
tw.addCategory(new CategoryPath("hi", "ho"));
tw.addCategory(new CategoryPath("hello"));
tw.commit();
// Before refresh, nothing changed..
ChildrenArrays newca = tr.getChildrenArrays();
ParallelTaxonomyArrays newca = tr.getParallelTaxonomyArrays();
assertSame(newca, ca); // we got exactly the same object
assertEquals(3, tr.getSize());
assertEquals(3, ca.getOlderSiblingArray().length);
assertEquals(3, ca.getYoungestChildArray().length);
assertEquals(3, ca.siblings().length);
assertEquals(3, ca.children().length);
// After the refresh, things change:
TaxonomyReader newtr = TaxonomyReader.openIfChanged(tr);
assertNotNull(newtr);
tr.close();
tr = newtr;
ca = tr.getChildrenArrays();
ca = tr.getParallelTaxonomyArrays();
assertEquals(5, tr.getSize());
assertEquals(5, ca.getOlderSiblingArray().length);
assertEquals(5, ca.getYoungestChildArray().length);
assertTrue(Arrays.equals(new int[] { 4, 3, -1, -1, -1 }, ca.getYoungestChildArray()));
assertTrue(Arrays.equals(new int[] { -1, -1, -1, 2, 1 }, ca.getOlderSiblingArray()));
assertEquals(5, ca.siblings().length);
assertEquals(5, ca.children().length);
assertTrue(Arrays.equals(new int[] { 4, 3, -1, -1, -1 }, ca.children()));
assertTrue(Arrays.equals(new int[] { -1, -1, -1, 2, 1 }, ca.siblings()));
tw.close();
tr.close();
indexDir.close();
@ -753,10 +754,10 @@ public class TestTaxonomyCombined extends LuceneTestCase {
twBase.commit();
TaxonomyReader trBase = new DirectoryTaxonomyReader(indexDirBase);
final ChildrenArrays ca1 = trBase.getChildrenArrays();
final ParallelTaxonomyArrays ca1 = trBase.getParallelTaxonomyArrays();
final int abOrd = trBase.getOrdinal(abPath);
final int abYoungChildBase1 = ca1.getYoungestChildArray()[abOrd];
final int abYoungChildBase1 = ca1.children()[abOrd];
final int numCategories = atLeast(800);
for (int i = 0; i < numCategories; i++) {
@ -769,8 +770,8 @@ public class TestTaxonomyCombined extends LuceneTestCase {
trBase.close();
trBase = newTaxoReader;
final ChildrenArrays ca2 = trBase.getChildrenArrays();
final int abYoungChildBase2 = ca2.getYoungestChildArray()[abOrd];
final ParallelTaxonomyArrays ca2 = trBase.getParallelTaxonomyArrays();
final int abYoungChildBase2 = ca2.children()[abOrd];
int numRetries = atLeast(50);
for (int retry = 0; retry < numRetries; retry++) {
@ -808,9 +809,9 @@ public class TestTaxonomyCombined extends LuceneTestCase {
setPriority(1 + getPriority());
try {
while (!stop.get()) {
int lastOrd = tr.getParentArray().length - 1;
int lastOrd = tr.getParallelTaxonomyArrays().parents().length - 1;
assertNotNull("path of last-ord " + lastOrd + " is not found!", tr.getPath(lastOrd));
assertChildrenArrays(tr.getChildrenArrays(), retry, retrieval[0]++);
assertChildrenArrays(tr.getParallelTaxonomyArrays(), retry, retrieval[0]++);
sleep(10); // don't starve refresh()'s CPU, which sleeps every 50 bytes for 1 ms
}
} catch (Throwable e) {
@ -819,13 +820,13 @@ public class TestTaxonomyCombined extends LuceneTestCase {
}
}
private void assertChildrenArrays(ChildrenArrays ca, int retry, int retrieval) {
final int abYoungChild = ca.getYoungestChildArray()[abOrd];
private void assertChildrenArrays(ParallelTaxonomyArrays ca, int retry, int retrieval) {
final int abYoungChild = ca.children()[abOrd];
assertTrue(
"Retry "+retry+": retrieval: "+retrieval+": wrong youngest child for category "+abPath+" (ord="+abOrd+
") - must be either "+abYoungChildBase1+" or "+abYoungChildBase2+" but was: "+abYoungChild,
abYoungChildBase1==abYoungChild ||
abYoungChildBase2==ca.getYoungestChildArray()[abOrd]);
abYoungChildBase2==ca.children()[abOrd]);
}
};
thread.start();

View File: TestDirectoryTaxonomyReader.java

@ -233,7 +233,7 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {
// assert categories
assertEquals(numCategories, reader.getSize());
int roundOrdinal = reader.getOrdinal(new CategoryPath(Integer.toString(i)));
int[] parents = reader.getParentArray();
int[] parents = reader.getParallelTaxonomyArrays().parents();
assertEquals(0, parents[roundOrdinal]); // round's parent is root
for (int j = 0; j < numCats; j++) {
int ord = reader.getOrdinal(new CategoryPath(Integer.toString(i), Integer.toString(j)));
@ -268,7 +268,7 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {
TaxonomyReader reader = new DirectoryTaxonomyReader(writer);
assertEquals(1, reader.getSize());
assertEquals(1, reader.getParentArray().length);
assertEquals(1, reader.getParallelTaxonomyArrays().parents().length);
// add category and call forceMerge -- this should flush IW and merge segments down to 1
// in ParentArray.initFromReader, this used to fail assuming there are no parents.
@ -281,7 +281,7 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {
reader.close();
reader = newtr;
assertEquals(2, reader.getSize());
assertEquals(2, reader.getParentArray().length);
assertEquals(2, reader.getParallelTaxonomyArrays().parents().length);
reader.close();
writer.close();
@ -315,7 +315,7 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {
TaxonomyReader reader = new DirectoryTaxonomyReader(writer);
assertEquals(2, reader.getSize());
assertEquals(2, reader.getParentArray().length);
assertEquals(2, reader.getParallelTaxonomyArrays().parents().length);
// merge all the segments so that NRT reader thinks there's a change
iw.forceMerge(1);
@ -326,7 +326,7 @@ public class TestDirectoryTaxonomyReader extends LuceneTestCase {
reader.close();
reader = newtr;
assertEquals(2, reader.getSize());
assertEquals(2, reader.getParentArray().length);
assertEquals(2, reader.getParallelTaxonomyArrays().parents().length);
reader.close();
writer.close();