mirror of https://github.com/apache/lucene.git
LUCENE-5339: add OrdinalsReader + Cache to abstract the source of the ords
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5339@1542773 13f79535-47bb-0310-9956-ffa450edef68
parent 39f6c8a5ff
commit 7a0ee102d0
TODO (3 deletions)
@@ -2,10 +2,8 @@ nocommit this!
 TODO
 - associations
-- ords cache
 - wrap an IW instead of extending one?
 - re-enable ALL_BUT_DIM somehow?
-- abstraction for 'ords source/decode'
 - simplify ddq api
 - SSDVValueSourceFacets?
 - we could put more stuff into the "schema", e.g. this field is
@@ -15,7 +13,6 @@ TODO
 - rename CategoryPath -> FacetLabel
 - how to do avg() agg?
 - test needsScores=true / valuesource associations
-- drill sideways
 - make FieldTypes optional (if all your dims are flat)?
 - add hierarchy to ssdv facets?
 - sparse faceting: allow skipping of certain dims?
org/apache/lucene/facet/simple/CachedOrdinalsReader.java (new file)
@@ -0,0 +1,130 @@
package org.apache.lucene.facet.simple;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.Map;
import java.util.WeakHashMap;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRef;

/**
 * A per-segment cache of documents' facet ordinals. Every
 * {@link CachedOrds} holds the ordinals in a raw {@code
 * int[]}, and therefore consumes as much RAM as the total
 * number of ordinals found in the segment, but saves the
 * CPU cost of decoding ordinals during facet counting.
 *
 * <p>
 * <b>NOTE:</b> every {@link CachedOrds} is limited to 2.1B
 * total ordinals. If that is a limitation for you then
 * consider limiting the segment size to fewer documents, or
 * use an alternative cache which pages through the category
 * ordinals.
 *
 * <p>
 * <b>NOTE:</b> when using this cache, it is advised to use
 * a {@code DocValuesFormat} that does not cache the data in
 * memory, at least for the category lists fields, or
 * otherwise you'll be doing double-caching.
 */
public class CachedOrdinalsReader extends OrdinalsReader {

  private final OrdinalsReader source;

  // outer map is a WeakHashMap which uses reader.getCoreCacheKey() as the weak
  // reference. When it's no longer referenced, the entire inner map can be
  // evicted.
  private final Map<Object,CachedOrds> ordsCache = new WeakHashMap<Object,CachedOrds>();

  public CachedOrdinalsReader(OrdinalsReader source) {
    this.source = source;
  }

  private synchronized CachedOrds getCachedOrds(AtomicReaderContext context) throws IOException {
    Object cacheKey = context.reader().getCoreCacheKey();
    CachedOrds ords = ordsCache.get(cacheKey);
    if (ords == null) {
      ords = new CachedOrds(source.getReader(context), context.reader().maxDoc());
      ordsCache.put(cacheKey, ords);
    }

    return ords;
  }

  @Override
  public OrdinalsSegmentReader getReader(AtomicReaderContext context) throws IOException {
    final CachedOrds cachedOrds = getCachedOrds(context);
    return new OrdinalsSegmentReader() {
      @Override
      public void get(int docID, IntsRef ordinals) {
        ordinals.ints = cachedOrds.ordinals;
        ordinals.offset = cachedOrds.offsets[docID];
        ordinals.length = cachedOrds.offsets[docID+1] - ordinals.offset;
      }
    };
  }

  /** Holds the cached ordinals in two parallel {@code int[]} arrays. */
  public static final class CachedOrds {

    public final int[] offsets;
    public final int[] ordinals;

    /**
     * Creates a new {@link CachedOrds} by decoding every document's
     * ordinals from the given {@link OrdinalsSegmentReader}, which must
     * not be {@code null}.
     */
    public CachedOrds(OrdinalsSegmentReader source, int maxDoc) throws IOException {
      offsets = new int[maxDoc + 1];
      int[] ords = new int[maxDoc]; // let's assume one ordinal per-document as an initial size

      // this aggregator is limited to Integer.MAX_VALUE total ordinals.
      long totOrds = 0;
      final IntsRef values = new IntsRef(32);
      for (int docID = 0; docID < maxDoc; docID++) {
        offsets[docID] = (int) totOrds;
        source.get(docID, values);
        long nextLength = totOrds + values.length;
        if (nextLength > ords.length) {
          if (nextLength > ArrayUtil.MAX_ARRAY_LENGTH) {
            throw new IllegalStateException("too many ordinals (>= " + nextLength + ") to cache");
          }
          ords = ArrayUtil.grow(ords, (int) nextLength);
        }
        System.arraycopy(values.ints, 0, ords, (int) totOrds, values.length);
        totOrds = nextLength;
      }
      offsets[maxDoc] = (int) totOrds;

      // if ords array is bigger by more than 10% of what we really need, shrink it
      if ((double) totOrds / ords.length < 0.9) {
        this.ordinals = new int[(int) totOrds];
        System.arraycopy(ords, 0, this.ordinals, 0, (int) totOrds);
      } else {
        this.ordinals = ords;
      }
    }
  }
}
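A usage sketch, not part of this commit: wrap the DocValues-backed reader in the cache, then count through the generic path. It mirrors the TestTaxonomyFacets change at the end of this commit; fc is a SimpleFacetsCollector a search has already been run with, and taxoReader/config are an open TaxonomyReader and FacetsConfig (names illustrative):

    // Decode ords from the default BinaryDocValues encoding, caching the raw
    // int[] per segment core so later faceting skips the VInt decode entirely:
    OrdinalsReader ordsReader = new CachedOrdinalsReader(new DocValuesOrdinalsReader());
    Facets facets = new TaxonomyFacetCounts(ordsReader, taxoReader, config, fc);

Because the cache is keyed on reader.getCoreCacheKey(), entries survive reopens that share a segment core and are evicted when the core is no longer referenced.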
org/apache/lucene/facet/simple/DocValuesOrdinalsReader.java (new file)
@@ -0,0 +1,92 @@
package org.apache.lucene.facet.simple;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;

/** Decodes ordinals previously indexed into a BinaryDocValues field. */
public class DocValuesOrdinalsReader extends OrdinalsReader {
  private final String field;

  public DocValuesOrdinalsReader() {
    this(FacetsConfig.DEFAULT_INDEXED_FIELD_NAME);
  }

  public DocValuesOrdinalsReader(String field) {
    this.field = field;
  }

  @Override
  public OrdinalsSegmentReader getReader(AtomicReaderContext context) throws IOException {
    BinaryDocValues values0 = context.reader().getBinaryDocValues(field);
    if (values0 == null) {
      values0 = BinaryDocValues.EMPTY;
    }

    final BinaryDocValues values = values0;

    return new OrdinalsSegmentReader() {
      private final BytesRef bytes = new BytesRef(32);

      @Override
      public void get(int docID, IntsRef ordinals) throws IOException {
        values.get(docID, bytes);
        decode(bytes, ordinals);
      }
    };
  }

  /** Subclass & override if you change the encoding. */
  protected void decode(BytesRef buf, IntsRef ordinals) {

    // grow the buffer up front, even if by a large number of values (buf.length);
    // that saves the need to check inside the loop for every decoded value if
    // the buffer needs to grow.
    if (ordinals.ints.length < buf.length) {
      ordinals.ints = ArrayUtil.grow(ordinals.ints, buf.length);
    }

    ordinals.offset = 0;
    ordinals.length = 0;

    // it is better if the decoding is inlined like so, and not e.g.
    // in a utility method
    int upto = buf.offset + buf.length;
    int value = 0;
    int offset = buf.offset;
    int prev = 0;
    while (offset < upto) {
      byte b = buf.bytes[offset++];
      if (b >= 0) {                                       // high bit clear: final byte of this VInt
        ordinals.ints[ordinals.length] = ((value << 7) | b) + prev; // deltas: add previous ordinal
        value = 0;
        prev = ordinals.ints[ordinals.length];
        ordinals.length++;
      } else {                                            // high bit set: continuation byte
        value = (value << 7) | (b & 0x7F);
      }
    }
  }
}
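decode() above expects each document's ordinals sorted ascending and delta-encoded, with every delta written as 7-bit groups, most-significant group first, the high bit set on all but the final byte. A sketch of a matching encoder under that assumption (the index-time encoder is not part of this diff, and these helper names are illustrative):

    import java.io.ByteArrayOutputStream;
    import org.apache.lucene.util.BytesRef;

    class OrdinalsEncoder {
      static void writeVInt(ByteArrayOutputStream out, int v) {
        int shift = 28;                              // an int needs at most five 7-bit groups
        while (shift > 0 && (v >>> shift) == 0) {
          shift -= 7;                                // skip leading all-zero groups
        }
        while (shift > 0) {
          out.write(0x80 | ((v >>> shift) & 0x7F));  // continuation byte: high bit set
          shift -= 7;
        }
        out.write(v & 0x7F);                         // final byte: high bit clear
      }

      static BytesRef encode(int[] ords, int length) { // ords must be sorted ascending
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        int prev = 0;
        for (int i = 0; i < length; i++) {
          writeVInt(out, ords[i] - prev);            // delta from the previous ordinal
          prev = ords[i];
        }
        byte[] bytes = out.toByteArray();
        return new BytesRef(bytes, 0, bytes.length);
      }
    }

Feeding encode()'s output to decode() reproduces the original ordinals, since decode() adds prev back after each terminating byte.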
org/apache/lucene/facet/simple/FastTaxonomyFacetCounts.java (new file)
@@ -0,0 +1,228 @@
package org.apache.lucene.facet.simple;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;

import org.apache.lucene.facet.simple.SimpleFacetsCollector.MatchingDocs;
import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;

// nocommit jdoc that this assumes/requires the default encoding
public class FastTaxonomyFacetCounts extends Facets {
  private final FacetsConfig facetsConfig;
  private final TaxonomyReader taxoReader;
  private final int[] counts;
  private final String facetsFieldName;
  private final int[] children;
  private final int[] parents;
  private final int[] siblings;

  public FastTaxonomyFacetCounts(TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc) throws IOException {
    this(FacetsConfig.DEFAULT_INDEXED_FIELD_NAME, taxoReader, facetsConfig, fc);
  }

  public FastTaxonomyFacetCounts(String facetsFieldName, TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc) throws IOException {
    this.taxoReader = taxoReader;
    this.facetsFieldName = facetsFieldName;
    this.facetsConfig = facetsConfig;
    ParallelTaxonomyArrays pta = taxoReader.getParallelTaxonomyArrays();
    children = pta.children();
    parents = pta.parents();
    siblings = pta.siblings();
    counts = new int[taxoReader.getSize()];
    count(fc.getMatchingDocs());
  }

  private final void count(List<MatchingDocs> matchingDocs) throws IOException {
    //System.out.println("count matchingDocs=" + matchingDocs + " facetsField=" + facetsFieldName);
    for(MatchingDocs hits : matchingDocs) {
      BinaryDocValues dv = hits.context.reader().getBinaryDocValues(facetsFieldName);
      if (dv == null) { // this reader does not have DocValues for the requested category list
        continue;
      }
      FixedBitSet bits = hits.bits;

      final int length = hits.bits.length();
      int doc = 0;
      BytesRef scratch = new BytesRef();
      //System.out.println("count seg=" + hits.context.reader());
      while (doc < length && (doc = bits.nextSetBit(doc)) != -1) {
        //System.out.println("  doc=" + doc);
        dv.get(doc, scratch);
        byte[] bytes = scratch.bytes;
        int end = scratch.offset + scratch.length;
        int ord = 0;
        int offset = scratch.offset;
        int prev = 0;
        while (offset < end) {
          byte b = bytes[offset++];
          if (b >= 0) {                           // high bit clear: final byte of this VInt
            prev = ord = ((ord << 7) | b) + prev; // deltas: add previous ordinal
            assert ord < counts.length: "ord=" + ord + " vs maxOrd=" + counts.length;
            ++counts[ord];
            ord = 0;
          } else {                                // high bit set: continuation byte
            ord = (ord << 7) | (b & 0x7F);
          }
        }
        ++doc;
      }
    }

    // nocommit we could do this lazily instead:

    // Rollup any necessary dims:
    for(Map.Entry<String,FacetsConfig.DimConfig> ent : facetsConfig.getDimConfigs().entrySet()) {
      String dim = ent.getKey();
      FacetsConfig.DimConfig ft = ent.getValue();
      if (ft.hierarchical && ft.multiValued == false) {
        int dimRootOrd = taxoReader.getOrdinal(new FacetLabel(dim));
        // It can be -1 if this field was declared in the
        // facetsConfig but never indexed:
        if (dimRootOrd > 0) {
          counts[dimRootOrd] += rollup(children[dimRootOrd]);
        }
      }
    }
  }

  private int rollup(int ord) {
    int sum = 0;
    while (ord != TaxonomyReader.INVALID_ORDINAL) {
      int childValue = counts[ord] + rollup(children[ord]);
      counts[ord] = childValue;
      sum += childValue;
      ord = siblings[ord];
    }
    return sum;
  }

  /** Returns the count for a specific path, or -1 if the
   *  path doesn't exist. */
  @Override
  public Number getSpecificValue(String dim, String... path) throws IOException {
    int ord = taxoReader.getOrdinal(FacetLabel.create(dim, path));
    if (ord < 0) {
      return -1;
    }
    return counts[ord];
  }

  @Override
  public SimpleFacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
    FacetLabel cp = FacetLabel.create(dim, path);
    int ord = taxoReader.getOrdinal(cp);
    if (ord == -1) {
      //System.out.println("no ord for path=" + path);
      return null;
    }
    return getTopChildren(cp, ord, topN);
  }

  private SimpleFacetResult getTopChildren(FacetLabel path, int dimOrd, int topN) throws IOException {

    TopOrdCountQueue q = new TopOrdCountQueue(topN);

    int bottomCount = 0;

    int ord = children[dimOrd];
    int totCount = 0;

    TopOrdCountQueue.OrdAndCount reuse = null;
    while(ord != TaxonomyReader.INVALID_ORDINAL) {
      if (counts[ord] > 0) {
        totCount += counts[ord];
        if (counts[ord] > bottomCount) {
          if (reuse == null) {
            reuse = new TopOrdCountQueue.OrdAndCount();
          }
          reuse.ord = ord;
          reuse.count = counts[ord];
          reuse = q.insertWithOverflow(reuse);
          if (q.size() == topN) {
            bottomCount = q.top().count;
          }
        }
      }

      ord = siblings[ord];
    }

    if (totCount == 0) {
      //System.out.println("totCount=0 for path=" + path);
      return null;
    }

    FacetsConfig.DimConfig ft = facetsConfig.getDimConfig(path.components[0]);
    // nocommit shouldn't we verify the indexedFieldName
    // matches what was passed to our ctor?
    if (ft.hierarchical && ft.multiValued) {
      totCount = counts[dimOrd];
    }

    LabelAndValue[] labelValues = new LabelAndValue[q.size()];
    for(int i=labelValues.length-1;i>=0;i--) {
      TopOrdCountQueue.OrdAndCount ordAndCount = q.pop();
      FacetLabel child = taxoReader.getPath(ordAndCount.ord);
      labelValues[i] = new LabelAndValue(child.components[path.length], ordAndCount.count);
    }

    return new SimpleFacetResult(path, totCount, labelValues);
  }

  @Override
  public List<SimpleFacetResult> getAllDims(int topN) throws IOException {
    int ord = children[TaxonomyReader.ROOT_ORDINAL];
    List<SimpleFacetResult> results = new ArrayList<SimpleFacetResult>();
    while (ord != TaxonomyReader.INVALID_ORDINAL) {
      SimpleFacetResult result = getTopChildren(taxoReader.getPath(ord), ord, topN);
      if (result != null) {
        results.add(result);
      }
      ord = siblings[ord];
    }

    // Sort by highest count:
    Collections.sort(results,
                     new Comparator<SimpleFacetResult>() {
                       @Override
                       public int compare(SimpleFacetResult a, SimpleFacetResult b) {
                         if (a.value.intValue() > b.value.intValue()) {
                           return -1;
                         } else if (b.value.intValue() > a.value.intValue()) {
                           return 1;
                         } else {
                           // Tie break by dimension
                           return a.path.components[0].compareTo(b.path.components[0]);
                         }
                       }
                     });

    return results;
  }
}
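An end-to-end sketch of this fast path, following the TestTaxonomyFacets changes later in this commit (searcher, taxoReader and config are assumed to already exist):

    // Run the query once, collecting hits and their facet ordinals:
    SimpleFacetsCollector fc = new SimpleFacetsCollector();
    searcher.search(new MatchAllDocsQuery(), fc);

    // Count straight off the default BinaryDocValues encoding, skipping the
    // OrdinalsReader indirection that TaxonomyFacetCounts now goes through:
    Facets facets = new FastTaxonomyFacetCounts(taxoReader, config, fc);
    SimpleFacetResult result = facets.getTopChildren(10, "Publish Date");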
org/apache/lucene/facet/simple/OrdinalsReader.java (new file)
@@ -0,0 +1,37 @@
package org.apache.lucene.facet.simple;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.util.IntsRef;

/** Provides per-document ordinals. */
public abstract class OrdinalsReader {

  /** Reads ordinals for the documents of one segment. */
  public static abstract class OrdinalsSegmentReader {
    /** Get the ordinals for this document. Implementations may point
     *  {@code ordinals.ints} at a shared backing array and set a non-zero
     *  {@code ordinals.offset}, so callers must honor both. */
    public abstract void get(int doc, IntsRef ordinals) throws IOException;
  }

  /** Returns an {@link OrdinalsSegmentReader} for the given (per-segment) reader context. */
  public abstract OrdinalsSegmentReader getReader(AtomicReaderContext context) throws IOException;
}
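To make the contract concrete, the smallest possible implementation: a hypothetical reader, not part of this commit, that reports no ordinals for any document:

    public class EmptyOrdinalsReader extends OrdinalsReader {
      @Override
      public OrdinalsSegmentReader getReader(AtomicReaderContext context) {
        return new OrdinalsSegmentReader() {
          @Override
          public void get(int doc, IntsRef ordinals) {
            ordinals.offset = 0;  // no shared backing array, so offset stays 0
            ordinals.length = 0;  // no ordinals for this document
          }
        };
      }
    }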
org/apache/lucene/facet/simple/SimpleDrillSideways.java
@@ -110,7 +110,7 @@ public class SimpleDrillSideways {
    *  impl. */
   protected Facets buildFacetsResult(SimpleFacetsCollector drillDowns, SimpleFacetsCollector[] drillSideways, String[] drillSidewaysDims) throws IOException {

-    Facets drillDownFacets = new TaxonomyFacetCounts(taxoReader, facetsConfig, drillDowns);
+    Facets drillDownFacets = new FastTaxonomyFacetCounts(taxoReader, facetsConfig, drillDowns);

     if (drillSideways == null) {
       return drillDownFacets;
@@ -118,7 +118,7 @@
       Map<String,Facets> drillSidewaysFacets = new HashMap<String,Facets>();
       for(int i=0;i<drillSideways.length;i++) {
         drillSidewaysFacets.put(drillSidewaysDims[i],
-                                new TaxonomyFacetCounts(taxoReader, facetsConfig, drillSideways[i]));
+                                new FastTaxonomyFacetCounts(taxoReader, facetsConfig, drillSideways[i]));
       }
       return new MultiFacets(drillSidewaysFacets, drillDownFacets);
     }
org/apache/lucene/facet/simple/TaxonomyFacetCounts.java
@@ -31,24 +31,24 @@ import org.apache.lucene.facet.taxonomy.TaxonomyReader;
 import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IntsRef;

-// nocommit jdoc that this assumes/requires the default encoding
+/** Reads from any {@link OrdinalsReader}; use {@link
+ *  FastTaxonomyFacetCounts} if you are just using the
+ *  default encoding from {@link BinaryDocValues}. */
 public class TaxonomyFacetCounts extends Facets {
+  private final OrdinalsReader ordinalsReader;
   private final FacetsConfig facetsConfig;
   private final TaxonomyReader taxoReader;
   private final int[] counts;
-  private final String facetsFieldName;
   private final int[] children;
   private final int[] parents;
   private final int[] siblings;

-  public TaxonomyFacetCounts(TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc) throws IOException {
-    this(FacetsConfig.DEFAULT_INDEXED_FIELD_NAME, taxoReader, facetsConfig, fc);
-  }
-
-  public TaxonomyFacetCounts(String facetsFieldName, TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc) throws IOException {
+  public TaxonomyFacetCounts(OrdinalsReader ordinalsReader, TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc) throws IOException {
     this.taxoReader = taxoReader;
-    this.facetsFieldName = facetsFieldName;
+    this.ordinalsReader = ordinalsReader;
     this.facetsConfig = facetsConfig;
     ParallelTaxonomyArrays pta = taxoReader.getParallelTaxonomyArrays();
     children = pta.children();
@@ -60,35 +60,17 @@ public class TaxonomyFacetCounts extends Facets {

   private final void count(List<MatchingDocs> matchingDocs) throws IOException {
     //System.out.println("count matchingDocs=" + matchingDocs + " facetsField=" + facetsFieldName);
+    IntsRef scratch = new IntsRef();
     for(MatchingDocs hits : matchingDocs) {
-      BinaryDocValues dv = hits.context.reader().getBinaryDocValues(facetsFieldName);
-      if (dv == null) { // this reader does not have DocValues for the requested category list
-        continue;
-      }
+      OrdinalsReader.OrdinalsSegmentReader ords = ordinalsReader.getReader(hits.context);
       FixedBitSet bits = hits.bits;

       final int length = hits.bits.length();
       int doc = 0;
-      BytesRef scratch = new BytesRef();
-      //System.out.println("count seg=" + hits.context.reader());
       while (doc < length && (doc = bits.nextSetBit(doc)) != -1) {
-        //System.out.println("  doc=" + doc);
-        dv.get(doc, scratch);
-        byte[] bytes = scratch.bytes;
-        int end = scratch.offset + scratch.length;
-        int ord = 0;
-        int offset = scratch.offset;
-        int prev = 0;
-        while (offset < end) {
-          byte b = bytes[offset++];
-          if (b >= 0) {
-            prev = ord = ((ord << 7) | b) + prev;
-            assert ord < counts.length: "ord=" + ord + " vs maxOrd=" + counts.length;
-            ++counts[ord];
-            ord = 0;
-          } else {
-            ord = (ord << 7) | (b & 0x7F);
-          }
-        }
+        ords.get(doc, scratch);
+        for(int i=0;i<scratch.length;i++) {
+          ++counts[scratch.ints[scratch.offset + i]]; // honor offset: CachedOrdinalsReader returns slices of a shared array
+        }
         ++doc;
       }
     }
org/apache/lucene/facet/simple/TaxonomyFacetSumValueSource.java
@@ -35,6 +35,7 @@ import org.apache.lucene.queries.function.ValueSource;
 import org.apache.lucene.search.Scorer;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.IntsRef;

 /** Aggregates sum of values from a {@link ValueSource}, for
  *  each facet label. */
@@ -45,18 +46,20 @@ public class TaxonomyFacetSumValueSource extends Facets {
   private final FacetsConfig facetsConfig;
   private final TaxonomyReader taxoReader;
   private final float[] values;
-  private final String facetsFieldName;
   private final int[] children;
   private final int[] parents;
   private final int[] siblings;
+  private final OrdinalsReader ordinalsReader;

-  public TaxonomyFacetSumValueSource(TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc, ValueSource valueSource) throws IOException {
-    this(FacetsConfig.DEFAULT_INDEXED_FIELD_NAME, taxoReader, facetsConfig, fc, valueSource);
+  public TaxonomyFacetSumValueSource(TaxonomyReader taxoReader, FacetsConfig facetsConfig,
+                                     SimpleFacetsCollector fc, ValueSource valueSource) throws IOException {
+    this(new DocValuesOrdinalsReader(FacetsConfig.DEFAULT_INDEXED_FIELD_NAME), taxoReader, facetsConfig, fc, valueSource);
   }

-  public TaxonomyFacetSumValueSource(String facetsFieldName, TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc, ValueSource valueSource) throws IOException {
+  public TaxonomyFacetSumValueSource(OrdinalsReader ordinalsReader, TaxonomyReader taxoReader,
+                                     FacetsConfig facetsConfig, SimpleFacetsCollector fc, ValueSource valueSource) throws IOException {
     this.taxoReader = taxoReader;
-    this.facetsFieldName = facetsFieldName;
+    this.ordinalsReader = ordinalsReader;
     this.facetsConfig = facetsConfig;
     ParallelTaxonomyArrays pta = taxoReader.getParallelTaxonomyArrays();
     children = pta.children();
@@ -82,43 +85,26 @@ public class TaxonomyFacetSumValueSource extends Facets {
     final FakeScorer scorer = new FakeScorer();
     Map<String, Scorer> context = new HashMap<String, Scorer>();
     context.put("scorer", scorer);
+    IntsRef scratch = new IntsRef();
     for(MatchingDocs hits : matchingDocs) {
-      BinaryDocValues dv = hits.context.reader().getBinaryDocValues(facetsFieldName);
-      if (dv == null) { // this reader does not have DocValues for the requested category list
-        continue;
-      }
+      OrdinalsReader.OrdinalsSegmentReader ords = ordinalsReader.getReader(hits.context);
       FixedBitSet bits = hits.bits;

       final int length = hits.bits.length();
       int doc = 0;
       int scoresIdx = 0;
-      BytesRef scratch = new BytesRef();
       float[] scores = hits.scores;

       FunctionValues functionValues = valueSource.getValues(context, hits.context);
       while (doc < length && (doc = bits.nextSetBit(doc)) != -1) {
-        dv.get(doc, scratch);
+        ords.get(doc, scratch);
         if (keepScores) {
           scorer.docID = doc;
           scorer.score = scores[scoresIdx++];
         }
-        byte[] bytes = scratch.bytes;
-        int end = scratch.offset + scratch.length;
-        int ord = 0;
-        int offset = scratch.offset;
-        int prev = 0;
-
         float value = (float) functionValues.doubleVal(doc);
-        while (offset < end) {
-          byte b = bytes[offset++];
-          if (b >= 0) {
-            prev = ord = ((ord << 7) | b) + prev;
-            values[ord] += value;
-            ord = 0;
-          } else {
-            ord = (ord << 7) | (b & 0x7F);
-          }
-        }
+        for(int i=0;i<scratch.length;i++) {
+          values[scratch.ints[scratch.offset + i]] += value; // honor offset: CachedOrdinalsReader returns slices of a shared array
+        }
         ++doc;
       }
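A usage sketch for the new constructor, not part of this commit (the "popularity" field and surrounding names are illustrative): instead of counting, each matching document adds its value-source value to every label it carries:

    ValueSource valueSource = new LongFieldSource("popularity");
    Facets facets = new TaxonomyFacetSumValueSource(new DocValuesOrdinalsReader(),
                                                    taxoReader, config, fc, valueSource);
    // Labels are now ranked by summed popularity rather than by document count:
    System.out.println(facets.getTopChildren(10, "Author"));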
TestTaxonomyFacets.java
@@ -107,7 +107,7 @@ public class TestTaxonomyFacets extends FacetTestCase {
     // wrap collecting the "normal" hits and also facets:
     searcher.search(new MatchAllDocsQuery(), c);

-    TaxonomyFacetCounts facets = new TaxonomyFacetCounts(taxoReader, fts, c);
+    Facets facets = new FastTaxonomyFacetCounts(taxoReader, fts, c);

     // Retrieve & verify results:
     assertEquals("Publish Date (5)\n 2010 (2)\n 2012 (2)\n 1999 (1)\n", facets.getTopChildren(10, "Publish Date").toString());
@@ -118,7 +118,7 @@ public class TestTaxonomyFacets extends FacetTestCase {
     q2.add(new FacetLabel("Publish Date", "2010"));
     c = new SimpleFacetsCollector();
     searcher.search(q2, c);
-    facets = new TaxonomyFacetCounts(taxoReader, fts, c);
+    facets = new FastTaxonomyFacetCounts(taxoReader, fts, c);
     assertEquals("Author (2)\n Bob (1)\n Lisa (1)\n", facets.getTopChildren(10, "Author").toString());

     assertEquals(1, facets.getSpecificValue("Author", "Lisa"));
@@ -185,7 +185,16 @@ public class TestTaxonomyFacets extends FacetTestCase {
     SimpleFacetsCollector c = new SimpleFacetsCollector();
     searcher.search(new MatchAllDocsQuery(), c);

-    TaxonomyFacetCounts facets = new TaxonomyFacetCounts(taxoReader, new FacetsConfig(), c);
+    Facets facets;
+    if (random().nextBoolean()) {
+      facets = new FastTaxonomyFacetCounts(taxoReader, new FacetsConfig(), c);
+    } else {
+      OrdinalsReader ordsReader = new DocValuesOrdinalsReader();
+      if (random().nextBoolean()) {
+        ordsReader = new CachedOrdinalsReader(ordsReader);
+      }
+      facets = new TaxonomyFacetCounts(ordsReader, taxoReader, new FacetsConfig(), c);
+    }

     // Ask for top 10 labels for any dims that have counts:
     List<SimpleFacetResult> results = facets.getAllDims(10);