mirror of https://github.com/apache/lucene.git
LUCENE-5339: add OrdinalsReader + Cache to abstract the source of the ords
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5339@1542773 13f79535-47bb-0310-9956-ffa450edef68
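This change introduces OrdinalsReader as an abstraction over where a document's facet ordinals come from: DocValuesOrdinalsReader decodes them from a BinaryDocValues field, CachedOrdinalsReader wraps any other OrdinalsReader and caches the decoded ordinals per segment, and TaxonomyFacetCounts now counts through that abstraction while FastTaxonomyFacetCounts keeps the inlined default-encoding path. A rough usage sketch, mirroring the test changes below (searcher, taxoReader and config are assumed to exist and are not part of this commit):

  SimpleFacetsCollector fc = new SimpleFacetsCollector();
  searcher.search(new MatchAllDocsQuery(), fc);

  // Fast path: decodes the default BinaryDocValues encoding inline:
  Facets counts = new FastTaxonomyFacetCounts(taxoReader, config, fc);

  // Same result through the new abstraction, optionally caching decoded ordinals:
  OrdinalsReader ords = new CachedOrdinalsReader(new DocValuesOrdinalsReader());
  Facets cachedCounts = new TaxonomyFacetCounts(ords, taxoReader, config, fc);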
This commit is contained in:
parent 39f6c8a5ff
commit 7a0ee102d0
TODO
@@ -2,10 +2,8 @@ nocommit this!
TODO
- associations
- ords cache
- wrap an IW instead of extending one?
- re-enable ALL_BUT_DIM somehow?
- abstraction for 'ords source/decode'
- simplify ddq api
- SSDVValueSourceFacets?
- we could put more stuff into the "schema", e.g. this field is
@@ -15,7 +13,6 @@ TODO
- rename CategoryPath -> FacetLabel
- how to do avg() agg?
- test needsScores=true / valuesource associations
- drill sideways
- make FieldTypes optional (if all your dims are flat)?
- add hierarchy to ssdv facets?
- sparse faceting: allow skipping of certain dims?
@@ -0,0 +1,130 @@
package org.apache.lucene.facet.simple;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Map;
import java.util.WeakHashMap;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;

/**
 * A per-segment cache of documents' facet ordinals. Every
 * {@link CachedOrds} holds the ordinals in a raw {@code
 * int[]}, and therefore consumes as much RAM as the total
 * number of ordinals found in the segment, but saves the
 * CPU cost of decoding ordinals during facet counting.
 *
 * <p>
 * <b>NOTE:</b> every {@link CachedOrds} is limited to 2.1B
 * total ordinals. If that is a limitation for you then
 * consider limiting the segment size to fewer documents, or
 * use an alternative cache which pages through the category
 * ordinals.
 *
 * <p>
 * <b>NOTE:</b> when using this cache, it is advised to use
 * a {@link DocValuesFormat} that does not cache the data in
 * memory, at least for the category lists fields, or
 * otherwise you'll be doing double-caching.
 */
public class CachedOrdinalsReader extends OrdinalsReader {

  private final OrdinalsReader source;
  private CachedOrds current;

  // outer map is a WeakHashMap which uses reader.getCoreCacheKey() as the weak
  // reference. When it's no longer referenced, the entire inner map can be
  // evicted.
  private final Map<Object,CachedOrds> ordsCache = new WeakHashMap<Object,CachedOrds>();

  public CachedOrdinalsReader(OrdinalsReader source) {
    this.source = source;
  }

  private synchronized CachedOrds getCachedOrds(AtomicReaderContext context) throws IOException {
    Object cacheKey = context.reader().getCoreCacheKey();
    CachedOrds ords = ordsCache.get(cacheKey);
    if (ords == null) {
      ords = new CachedOrds(source.getReader(context), context.reader().maxDoc());
      ordsCache.put(cacheKey, ords);
    }

    return ords;
  }

  @Override
  public OrdinalsSegmentReader getReader(AtomicReaderContext context) throws IOException {
    final CachedOrds cachedOrds = getCachedOrds(context);
    return new OrdinalsSegmentReader() {
      @Override
      public void get(int docID, IntsRef ordinals) {
        ordinals.ints = cachedOrds.ordinals;
        ordinals.offset = cachedOrds.offsets[docID];
        ordinals.length = cachedOrds.offsets[docID+1] - ordinals.offset;
      }
    };
  }

  /** Holds the cached ordinals in two parallel {@code int[]} arrays. */
  public static final class CachedOrds {

    public final int[] offsets;
    public final int[] ordinals;

    /**
     * Creates a new {@link CachedOrds} from the {@link OrdinalsSegmentReader}.
     * Assumes that the reader is not {@code null}.
     */
    public CachedOrds(OrdinalsSegmentReader source, int maxDoc) throws IOException {
      final BytesRef buf = new BytesRef();

      offsets = new int[maxDoc + 1];
      int[] ords = new int[maxDoc]; // let's assume one ordinal per-document as an initial size

      // this aggregator is limited to Integer.MAX_VALUE total ordinals.
      long totOrds = 0;
      final IntsRef values = new IntsRef(32);
      for (int docID = 0; docID < maxDoc; docID++) {
        offsets[docID] = (int) totOrds;
        source.get(docID, values);
        long nextLength = totOrds + values.length;
        if (nextLength > ords.length) {
          if (nextLength > ArrayUtil.MAX_ARRAY_LENGTH) {
            throw new IllegalStateException("too many ordinals (>= " + nextLength + ") to cache");
          }
          ords = ArrayUtil.grow(ords, (int) nextLength);
        }
        System.arraycopy(values.ints, 0, ords, (int) totOrds, values.length);
        totOrds = nextLength;
      }
      offsets[maxDoc] = (int) totOrds;

      // if ords array is bigger by more than 10% of what we really need, shrink it
      if ((double) totOrds / ords.length < 0.9) {
        this.ordinals = new int[(int) totOrds];
        System.arraycopy(ords, 0, this.ordinals, 0, (int) totOrds);
      } else {
        this.ordinals = ords;
      }
    }
  }
}
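For reference, CachedOrds stores a CSR-style pair of parallel arrays: ordinals holds every document's ords back to back, and offsets[doc]..offsets[doc+1] delimits each document's slice, which is what the anonymous OrdinalsSegmentReader above returns without copying. A hypothetical three-document segment, purely to illustrate the layout:

  // doc 0 -> ords {5, 17}, doc 1 -> no ords, doc 2 -> ord {9}
  // offsets  = [0, 2, 2, 3]
  // ordinals = [5, 17, 9]
  // get(1, result) yields offset=2, length=0 over the shared ordinals array.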
@@ -0,0 +1,92 @@
package org.apache.lucene.facet.simple;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;

/** Decodes ordinals previously indexed into a BinaryDocValues field */

public class DocValuesOrdinalsReader extends OrdinalsReader {
  private final String field;

  public DocValuesOrdinalsReader() {
    this(FacetsConfig.DEFAULT_INDEXED_FIELD_NAME);
  }

  public DocValuesOrdinalsReader(String field) {
    this.field = field;
  }

  @Override
  public OrdinalsSegmentReader getReader(AtomicReaderContext context) throws IOException {
    BinaryDocValues values0 = context.reader().getBinaryDocValues(field);
    if (values0 == null) {
      values0 = BinaryDocValues.EMPTY;
    }

    final BinaryDocValues values = values0;

    return new OrdinalsSegmentReader() {
      private final BytesRef bytes = new BytesRef(32);

      @Override
      public void get(int docID, IntsRef ordinals) throws IOException {
        values.get(docID, bytes);
        decode(bytes, ordinals);
      }
    };
  }

  /** Subclass & override if you change the encoding. */
  protected void decode(BytesRef buf, IntsRef ordinals) {

    // grow the buffer up front, even if by a large number of values (buf.length)
    // that saves the need to check inside the loop for every decoded value if
    // the buffer needs to grow.
    if (ordinals.ints.length < buf.length) {
      ordinals.ints = ArrayUtil.grow(ordinals.ints, buf.length);
    }

    ordinals.offset = 0;
    ordinals.length = 0;

    // it is better if the decoding is inlined like so, and not e.g.
    // in a utility method
    int upto = buf.offset + buf.length;
    int value = 0;
    int offset = buf.offset;
    int prev = 0;
    while (offset < upto) {
      byte b = buf.bytes[offset++];
      if (b >= 0) {
        ordinals.ints[ordinals.length] = ((value << 7) | b) + prev;
        value = 0;
        prev = ordinals.ints[ordinals.length];
        ordinals.length++;
      } else {
        value = (value << 7) | (b & 0x7F);
      }
    }
  }
}
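decode() above reads delta-coded ordinals written as variable-length values: the 7-bit groups of each delta appear most-significant first, the high bit is set on every byte except the last one of a value, and each decoded delta is added to the previously decoded ordinal. A minimal encoder sketch that produces bytes this decode() accepts; it is hypothetical and not part of the commit (the sorted input and the helper's name are assumptions):

  // Encodes sorted ordinals into the delta/vInt-style form decode() expects.
  static BytesRef encode(int[] sortedOrds, int count) {
    byte[] bytes = new byte[count * 5]; // worst case: 5 bytes per value
    int upto = 0;
    int prev = 0;
    for (int i = 0; i < count; i++) {
      int delta = sortedOrds[i] - prev; // decode() adds the previous ordinal back
      prev = sortedOrds[i];
      // emit 7-bit groups, most significant first; the high bit marks "more bytes follow"
      if (delta >= (1 << 28)) {
        bytes[upto++] = (byte) (0x80 | ((delta >>> 28) & 0x7F));
      }
      if (delta >= (1 << 21)) {
        bytes[upto++] = (byte) (0x80 | ((delta >>> 21) & 0x7F));
      }
      if (delta >= (1 << 14)) {
        bytes[upto++] = (byte) (0x80 | ((delta >>> 14) & 0x7F));
      }
      if (delta >= (1 << 7)) {
        bytes[upto++] = (byte) (0x80 | ((delta >>> 7) & 0x7F));
      }
      bytes[upto++] = (byte) (delta & 0x7F); // final byte of this value: high bit clear
    }
    return new BytesRef(bytes, 0, upto);
  }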
@@ -0,0 +1,228 @@
package org.apache.lucene.facet.simple;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;

import org.apache.lucene.facet.simple.SimpleFacetsCollector.MatchingDocs;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;

// nocommit jdoc that this assumes/requires the default encoding
public class FastTaxonomyFacetCounts extends Facets {
  private final FacetsConfig facetsConfig;
  private final TaxonomyReader taxoReader;
  private final int[] counts;
  private final String facetsFieldName;
  private final int[] children;
  private final int[] parents;
  private final int[] siblings;

  public FastTaxonomyFacetCounts(TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc) throws IOException {
    this(FacetsConfig.DEFAULT_INDEXED_FIELD_NAME, taxoReader, facetsConfig, fc);
  }

  public FastTaxonomyFacetCounts(String facetsFieldName, TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc) throws IOException {
    this.taxoReader = taxoReader;
    this.facetsFieldName = facetsFieldName;
    this.facetsConfig = facetsConfig;
    ParallelTaxonomyArrays pta = taxoReader.getParallelTaxonomyArrays();
    children = pta.children();
    parents = pta.parents();
    siblings = pta.siblings();
    counts = new int[taxoReader.getSize()];
    count(fc.getMatchingDocs());
  }

  private final void count(List<MatchingDocs> matchingDocs) throws IOException {
    //System.out.println("count matchingDocs=" + matchingDocs + " facetsField=" + facetsFieldName);
    for(MatchingDocs hits : matchingDocs) {
      BinaryDocValues dv = hits.context.reader().getBinaryDocValues(facetsFieldName);
      if (dv == null) { // this reader does not have DocValues for the requested category list
        continue;
      }
      FixedBitSet bits = hits.bits;

      final int length = hits.bits.length();
      int doc = 0;
      BytesRef scratch = new BytesRef();
      //System.out.println("count seg=" + hits.context.reader());
      while (doc < length && (doc = bits.nextSetBit(doc)) != -1) {
        //System.out.println(" doc=" + doc);
        dv.get(doc, scratch);
        byte[] bytes = scratch.bytes;
        int end = scratch.offset + scratch.length;
        int ord = 0;
        int offset = scratch.offset;
        int prev = 0;
        while (offset < end) {
          byte b = bytes[offset++];
          if (b >= 0) {
            prev = ord = ((ord << 7) | b) + prev;
            assert ord < counts.length: "ord=" + ord + " vs maxOrd=" + counts.length;
            ++counts[ord];
            ord = 0;
          } else {
            ord = (ord << 7) | (b & 0x7F);
          }
        }
        ++doc;
      }
    }

    // nocommit we could do this lazily instead:

    // Rollup any necessary dims:
    for(Map.Entry<String,FacetsConfig.DimConfig> ent : facetsConfig.getDimConfigs().entrySet()) {
      String dim = ent.getKey();
      FacetsConfig.DimConfig ft = ent.getValue();
      if (ft.hierarchical && ft.multiValued == false) {
        int dimRootOrd = taxoReader.getOrdinal(new FacetLabel(dim));
        // It can be -1 if this field was declared in the
        // facetsConfig but never indexed:
        if (dimRootOrd > 0) {
          counts[dimRootOrd] += rollup(children[dimRootOrd]);
        }
      }
    }
  }

  private int rollup(int ord) {
    int sum = 0;
    while (ord != TaxonomyReader.INVALID_ORDINAL) {
      int childValue = counts[ord] + rollup(children[ord]);
      counts[ord] = childValue;
      sum += childValue;
      ord = siblings[ord];
    }
    return sum;
  }

  /** Return the count for a specific path. Returns -1 if
   *  this path doesn't exist, else the count. */
  @Override
  public Number getSpecificValue(String dim, String... path) throws IOException {
    int ord = taxoReader.getOrdinal(FacetLabel.create(dim, path));
    if (ord < 0) {
      return -1;
    }
    return counts[ord];
  }

  @Override
  public SimpleFacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
    FacetLabel cp = FacetLabel.create(dim, path);
    int ord = taxoReader.getOrdinal(cp);
    if (ord == -1) {
      //System.out.println("no ord for path=" + path);
      return null;
    }
    return getTopChildren(cp, ord, topN);
  }

  private SimpleFacetResult getTopChildren(FacetLabel path, int dimOrd, int topN) throws IOException {

    TopOrdCountQueue q = new TopOrdCountQueue(topN);

    int bottomCount = 0;

    int ord = children[dimOrd];
    int totCount = 0;

    TopOrdCountQueue.OrdAndCount reuse = null;
    while(ord != TaxonomyReader.INVALID_ORDINAL) {
      if (counts[ord] > 0) {
        totCount += counts[ord];
        if (counts[ord] > bottomCount) {
          if (reuse == null) {
            reuse = new TopOrdCountQueue.OrdAndCount();
          }
          reuse.ord = ord;
          reuse.count = counts[ord];
          reuse = q.insertWithOverflow(reuse);
          if (q.size() == topN) {
            bottomCount = q.top().count;
          }
        }
      }

      ord = siblings[ord];
    }

    if (totCount == 0) {
      //System.out.println("totCount=0 for path=" + path);
      return null;
    }

    FacetsConfig.DimConfig ft = facetsConfig.getDimConfig(path.components[0]);
    // nocommit shouldn't we verify the indexedFieldName
    // matches what was passed to our ctor?
    if (ft.hierarchical && ft.multiValued) {
      totCount = counts[dimOrd];
    }

    LabelAndValue[] labelValues = new LabelAndValue[q.size()];
    for(int i=labelValues.length-1;i>=0;i--) {
      TopOrdCountQueue.OrdAndCount ordAndCount = q.pop();
      FacetLabel child = taxoReader.getPath(ordAndCount.ord);
      labelValues[i] = new LabelAndValue(child.components[path.length], ordAndCount.count);
    }

    return new SimpleFacetResult(path, totCount, labelValues);
  }

  @Override
  public List<SimpleFacetResult> getAllDims(int topN) throws IOException {
    int ord = children[TaxonomyReader.ROOT_ORDINAL];
    List<SimpleFacetResult> results = new ArrayList<SimpleFacetResult>();
    while (ord != TaxonomyReader.INVALID_ORDINAL) {
      SimpleFacetResult result = getTopChildren(taxoReader.getPath(ord), ord, topN);
      if (result != null) {
        results.add(result);
      }
      ord = siblings[ord];
    }

    // Sort by highest count:
    Collections.sort(results,
                     new Comparator<SimpleFacetResult>() {
                       @Override
                       public int compare(SimpleFacetResult a, SimpleFacetResult b) {
                         if (a.value.intValue() > b.value.intValue()) {
                           return -1;
                         } else if (b.value.intValue() > a.value.intValue()) {
                           return 1;
                         } else {
                           // Tie break by dimension
                           return a.path.components[0].compareTo(b.path.components[0]);
                         }
                       }
                     });

    return results;
  }
}
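The rollup() and getTopChildren() loops above walk the taxonomy through the ParallelTaxonomyArrays, assuming the usual contract that children[ord] points at one child of ord, siblings[ord] at the next sibling, and TaxonomyReader.INVALID_ORDINAL terminates each chain. A stripped-down sketch of that walk:

  // Enumerate the immediate children of a dimension's root ordinal:
  int child = children[dimOrd];
  while (child != TaxonomyReader.INVALID_ORDINAL) {
    // counts[child] holds this child's count (plus rolled-up descendants, if any)
    child = siblings[child];
  }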
@@ -0,0 +1,37 @@
package org.apache.lucene.facet.simple;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.util.IntsRef;

/** Provides per-document ordinals. */

public abstract class OrdinalsReader {

  public static abstract class OrdinalsSegmentReader {
    /** Get the ordinals for this document. ordinals.offset
     *  must always be 0! */
    public abstract void get(int doc, IntsRef ordinals) throws IOException;
  }

  /** Returns an {@link OrdinalsSegmentReader} for the given segment. */
  public abstract OrdinalsSegmentReader getReader(AtomicReaderContext context) throws IOException;
}
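OrdinalsReader is the extension point this commit adds: any source that can hand back an int ordinal list per document can drive TaxonomyFacetCounts. A toy implementation, only to illustrate the contract (fill ordinals.ints and ordinals.length, keep ordinals.offset at 0); it is hypothetical and not part of the commit:

  // Gives every document the same single ordinal; illustrates the contract only.
  public class ConstOrdinalsReader extends OrdinalsReader {
    private final int ord;

    public ConstOrdinalsReader(int ord) {
      this.ord = ord;
    }

    @Override
    public OrdinalsSegmentReader getReader(AtomicReaderContext context) {
      return new OrdinalsSegmentReader() {
        @Override
        public void get(int doc, IntsRef ordinals) {
          if (ordinals.ints.length < 1) {
            ordinals.ints = new int[1];
          }
          ordinals.ints[0] = ord;
          ordinals.offset = 0;
          ordinals.length = 1;
        }
      };
    }
  }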
@@ -110,7 +110,7 @@ public class SimpleDrillSideways {
   *  impl. */
  protected Facets buildFacetsResult(SimpleFacetsCollector drillDowns, SimpleFacetsCollector[] drillSideways, String[] drillSidewaysDims) throws IOException {

    Facets drillDownFacets = new TaxonomyFacetCounts(taxoReader, facetsConfig, drillDowns);
    Facets drillDownFacets = new FastTaxonomyFacetCounts(taxoReader, facetsConfig, drillDowns);

    if (drillSideways == null) {
      return drillDownFacets;
@@ -118,7 +118,7 @@ public class SimpleDrillSideways {
    Map<String,Facets> drillSidewaysFacets = new HashMap<String,Facets>();
    for(int i=0;i<drillSideways.length;i++) {
      drillSidewaysFacets.put(drillSidewaysDims[i],
                              new TaxonomyFacetCounts(taxoReader, facetsConfig, drillSideways[i]));
                              new FastTaxonomyFacetCounts(taxoReader, facetsConfig, drillSideways[i]));
    }
    return new MultiFacets(drillSidewaysFacets, drillDownFacets);
  }
@@ -31,24 +31,24 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IntsRef;

/** Reads from any {@link OrdinalsReader}; use {@link
 *  FastTaxonomyFacetCounts} if you are just using the
 *  default encoding from {@link BinaryDocValues}. */

// nocommit jdoc that this assumes/requires the default encoding
public class TaxonomyFacetCounts extends Facets {
  private final OrdinalsReader ordinalsReader;
  private final FacetsConfig facetsConfig;
  private final TaxonomyReader taxoReader;
  private final int[] counts;
  private final String facetsFieldName;
  private final int[] children;
  private final int[] parents;
  private final int[] siblings;

  public TaxonomyFacetCounts(TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc) throws IOException {
    this(FacetsConfig.DEFAULT_INDEXED_FIELD_NAME, taxoReader, facetsConfig, fc);
  }

  public TaxonomyFacetCounts(String facetsFieldName, TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc) throws IOException {
  public TaxonomyFacetCounts(OrdinalsReader ordinalsReader, TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc) throws IOException {
    this.taxoReader = taxoReader;
    this.facetsFieldName = facetsFieldName;
    this.ordinalsReader = ordinalsReader;
    this.facetsConfig = facetsConfig;
    ParallelTaxonomyArrays pta = taxoReader.getParallelTaxonomyArrays();
    children = pta.children();
@@ -60,35 +60,17 @@ public class TaxonomyFacetCounts extends Facets {

  private final void count(List<MatchingDocs> matchingDocs) throws IOException {
    //System.out.println("count matchingDocs=" + matchingDocs + " facetsField=" + facetsFieldName);
    IntsRef scratch = new IntsRef();
    for(MatchingDocs hits : matchingDocs) {
      BinaryDocValues dv = hits.context.reader().getBinaryDocValues(facetsFieldName);
      if (dv == null) { // this reader does not have DocValues for the requested category list
        continue;
      }
      OrdinalsReader.OrdinalsSegmentReader ords = ordinalsReader.getReader(hits.context);
      FixedBitSet bits = hits.bits;

      final int length = hits.bits.length();
      int doc = 0;
      BytesRef scratch = new BytesRef();
      //System.out.println("count seg=" + hits.context.reader());
      while (doc < length && (doc = bits.nextSetBit(doc)) != -1) {
        //System.out.println(" doc=" + doc);
        dv.get(doc, scratch);
        byte[] bytes = scratch.bytes;
        int end = scratch.offset + scratch.length;
        int ord = 0;
        int offset = scratch.offset;
        int prev = 0;
        while (offset < end) {
          byte b = bytes[offset++];
          if (b >= 0) {
            prev = ord = ((ord << 7) | b) + prev;
            assert ord < counts.length: "ord=" + ord + " vs maxOrd=" + counts.length;
            ++counts[ord];
            ord = 0;
          } else {
            ord = (ord << 7) | (b & 0x7F);
          }
        ords.get(doc, scratch);
        for(int i=0;i<scratch.length;i++) {
          ++counts[scratch.ints[i]];
        }
        ++doc;
      }
@@ -35,6 +35,7 @@ import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IntsRef;

/** Aggregates sum of values from a {@link ValueSource}, for
 *  each facet label. */
@@ -45,18 +46,20 @@ public class TaxonomyFacetSumValueSource extends Facets {
  private final FacetsConfig facetsConfig;
  private final TaxonomyReader taxoReader;
  private final float[] values;
  private final String facetsFieldName;
  private final int[] children;
  private final int[] parents;
  private final int[] siblings;
  private final OrdinalsReader ordinalsReader;

  public TaxonomyFacetSumValueSource(TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc, ValueSource valueSource) throws IOException {
    this(FacetsConfig.DEFAULT_INDEXED_FIELD_NAME, taxoReader, facetsConfig, fc, valueSource);
  public TaxonomyFacetSumValueSource(TaxonomyReader taxoReader, FacetsConfig facetsConfig,
                                     SimpleFacetsCollector fc, ValueSource valueSource) throws IOException {
    this(new DocValuesOrdinalsReader(FacetsConfig.DEFAULT_INDEXED_FIELD_NAME), taxoReader, facetsConfig, fc, valueSource);
  }

  public TaxonomyFacetSumValueSource(String facetsFieldName, TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc, ValueSource valueSource) throws IOException {
  public TaxonomyFacetSumValueSource(OrdinalsReader ordinalsReader, TaxonomyReader taxoReader,
                                     FacetsConfig facetsConfig, SimpleFacetsCollector fc, ValueSource valueSource) throws IOException {
    this.taxoReader = taxoReader;
    this.facetsFieldName = facetsFieldName;
    this.ordinalsReader = ordinalsReader;
    this.facetsConfig = facetsConfig;
    ParallelTaxonomyArrays pta = taxoReader.getParallelTaxonomyArrays();
    children = pta.children();
@@ -82,43 +85,26 @@ public class TaxonomyFacetSumValueSource extends Facets {
    final FakeScorer scorer = new FakeScorer();
    Map<String, Scorer> context = new HashMap<String, Scorer>();
    context.put("scorer", scorer);
    IntsRef scratch = new IntsRef();
    for(MatchingDocs hits : matchingDocs) {
      BinaryDocValues dv = hits.context.reader().getBinaryDocValues(facetsFieldName);
      if (dv == null) { // this reader does not have DocValues for the requested category list
        continue;
      }
      OrdinalsReader.OrdinalsSegmentReader ords = ordinalsReader.getReader(hits.context);
      FixedBitSet bits = hits.bits;

      final int length = hits.bits.length();
      int doc = 0;
      int scoresIdx = 0;
      BytesRef scratch = new BytesRef();
      float[] scores = hits.scores;

      FunctionValues functionValues = valueSource.getValues(context, hits.context);
      while (doc < length && (doc = bits.nextSetBit(doc)) != -1) {
        dv.get(doc, scratch);
        ords.get(doc, scratch);
        if (keepScores) {
          scorer.docID = doc;
          scorer.score = scores[scoresIdx++];
        }
        byte[] bytes = scratch.bytes;
        int end = scratch.offset + scratch.length;
        int ord = 0;
        int offset = scratch.offset;
        int prev = 0;

        float value = (float) functionValues.doubleVal(doc);

        while (offset < end) {
          byte b = bytes[offset++];
          if (b >= 0) {
            prev = ord = ((ord << 7) | b) + prev;
            values[ord] += value;
            ord = 0;
          } else {
            ord = (ord << 7) | (b & 0x7F);
          }
        for(int i=0;i<scratch.length;i++) {
          values[scratch.ints[i]] += value;
        }
        ++doc;
      }
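With the constructor change, value-source aggregation goes through the same OrdinalsReader abstraction. A short example (the "price" field, the LongFieldSource choice and the surrounding setup are assumptions, not part of this commit):

  // Sum a per-document value into each facet label, reusing a cached ords source:
  OrdinalsReader ords = new CachedOrdinalsReader(new DocValuesOrdinalsReader());
  Facets sums = new TaxonomyFacetSumValueSource(ords, taxoReader, config, fc,
                                                new LongFieldSource("price"));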
@@ -107,7 +107,7 @@ public class TestTaxonomyFacets extends FacetTestCase {
    // wrap collecting the "normal" hits and also facets:
    searcher.search(new MatchAllDocsQuery(), c);

    TaxonomyFacetCounts facets = new TaxonomyFacetCounts(taxoReader, fts, c);
    Facets facets = new FastTaxonomyFacetCounts(taxoReader, fts, c);

    // Retrieve & verify results:
    assertEquals("Publish Date (5)\n 2010 (2)\n 2012 (2)\n 1999 (1)\n", facets.getTopChildren(10, "Publish Date").toString());
@@ -118,7 +118,7 @@ public class TestTaxonomyFacets extends FacetTestCase {
    q2.add(new FacetLabel("Publish Date", "2010"));
    c = new SimpleFacetsCollector();
    searcher.search(q2, c);
    facets = new TaxonomyFacetCounts(taxoReader, fts, c);
    facets = new FastTaxonomyFacetCounts(taxoReader, fts, c);
    assertEquals("Author (2)\n Bob (1)\n Lisa (1)\n", facets.getTopChildren(10, "Author").toString());

    assertEquals(1, facets.getSpecificValue("Author", "Lisa"));
@@ -185,7 +185,16 @@ public class TestTaxonomyFacets extends FacetTestCase {
    SimpleFacetsCollector c = new SimpleFacetsCollector();
    searcher.search(new MatchAllDocsQuery(), c);

    TaxonomyFacetCounts facets = new TaxonomyFacetCounts(taxoReader, new FacetsConfig(), c);
    Facets facets;
    if (random().nextBoolean()) {
      facets = new FastTaxonomyFacetCounts(taxoReader, new FacetsConfig(), c);
    } else {
      OrdinalsReader ordsReader = new DocValuesOrdinalsReader();
      if (random().nextBoolean()) {
        ordsReader = new CachedOrdinalsReader(ordsReader);
      }
      facets = new TaxonomyFacetCounts(ordsReader, taxoReader, new FacetsConfig(), c);
    }

    // Ask for top 10 labels for any dims that have counts:
    List<SimpleFacetResult> results = facets.getAllDims(10);