mirror of https://github.com/apache/lucene.git
LUCENE-5339: assocations
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5339@1543047 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
a872b82402
commit
18117c0b04
1
TODO
1
TODO
|
@ -2,6 +2,7 @@ nocommit this!
|
|||
|
||||
TODO
|
||||
- associations
|
||||
- make SumFloat too
|
||||
- cutover taxo writer/reader to pathToString/stringToPath
|
||||
- wrap an IW instead of extending one? or, FacetDocument?
|
||||
- re-enable ALL_BUT_DIM somehow?
|
||||
|
|
|
@ -554,7 +554,7 @@ public class Field implements IndexableField, StorableField {
|
|||
return analyzer.tokenStream(name(), stringValue());
|
||||
}
|
||||
|
||||
throw new IllegalArgumentException("Field must have either TokenStream, String, Reader or Number value");
|
||||
throw new IllegalArgumentException("Field must have either TokenStream, String, Reader or Number value; this=" + this);
|
||||
}
|
||||
|
||||
static final class StringTokenStream extends TokenStream {
|
||||
|
|
|
@ -0,0 +1,78 @@
|
|||
package org.apache.lucene.facet.simple;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.FieldType;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
||||
/** Associates an arbitrary byte[] with the added facet
|
||||
* path. */
|
||||
public class AssociationFacetField extends Field {
|
||||
static final FieldType TYPE = new FieldType();
|
||||
static {
|
||||
TYPE.setIndexed(true);
|
||||
TYPE.freeze();
|
||||
}
|
||||
final String dim;
|
||||
final String[] path;
|
||||
final BytesRef assoc;
|
||||
|
||||
public AssociationFacetField(BytesRef assoc, String dim, String... path) {
|
||||
super("dummy", TYPE);
|
||||
this.dim = dim;
|
||||
this.assoc = assoc;
|
||||
if (path.length == 0) {
|
||||
throw new IllegalArgumentException("path must have at least one element");
|
||||
}
|
||||
this.path = path;
|
||||
}
|
||||
|
||||
/** Utility ctor: associates an int value (translates it
|
||||
* to 4-byte BytesRef). */
|
||||
public AssociationFacetField(int assoc, String dim, String... path) {
|
||||
this(intToBytesRef(assoc), dim, path);
|
||||
}
|
||||
|
||||
/** Utility ctor: associates a float value (translates it
|
||||
* to 4-byte BytesRef). */
|
||||
public AssociationFacetField(float assoc, String dim, String... path) {
|
||||
this(floatToBytesRef(assoc), dim, path);
|
||||
}
|
||||
|
||||
private static BytesRef intToBytesRef(int v) {
|
||||
byte[] bytes = new byte[4];
|
||||
// big-endian:
|
||||
bytes[0] = (byte) (v >> 24);
|
||||
bytes[1] = (byte) (v >> 16);
|
||||
bytes[2] = (byte) (v >> 8);
|
||||
bytes[3] = (byte) v;
|
||||
return new BytesRef(bytes);
|
||||
}
|
||||
|
||||
private static BytesRef floatToBytesRef(float v) {
|
||||
return intToBytesRef(Float.floatToIntBits(v));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "FacetField(dim=" + dim + " path=" + Arrays.toString(path) + ")";
|
||||
}
|
||||
}
|
|
@ -38,6 +38,7 @@ import org.apache.lucene.index.IndexableField;
|
|||
import org.apache.lucene.index.IndexableFieldType;
|
||||
import org.apache.lucene.index.StorableField;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.IntsRef;
|
||||
|
||||
|
@ -66,6 +67,9 @@ public class FacetIndexWriter extends IndexWriter {
|
|||
// ... and also all SortedSetDocValuesFacetFields:
|
||||
Map<String,List<SortedSetDocValuesFacetField>> dvByField = new HashMap<String,List<SortedSetDocValuesFacetField>>();
|
||||
|
||||
// ... and also all AssociationFacetFields
|
||||
Map<String,List<AssociationFacetField>> assocByField = new HashMap<String,List<AssociationFacetField>>();
|
||||
|
||||
for(IndexableField field : doc.indexableFields()) {
|
||||
if (field.fieldType() == FacetField.TYPE) {
|
||||
FacetField facetField = (FacetField) field;
|
||||
|
@ -90,6 +94,20 @@ public class FacetIndexWriter extends IndexWriter {
|
|||
}
|
||||
fields.add(facetField);
|
||||
}
|
||||
|
||||
if (field.fieldType() == AssociationFacetField.TYPE) {
|
||||
AssociationFacetField facetField = (AssociationFacetField) field;
|
||||
FacetsConfig.DimConfig dimConfig = facetsConfig.getDimConfig(field.name());
|
||||
|
||||
// nocommit how to use a different default name for assocs?
|
||||
String indexedFieldName = dimConfig.indexedFieldName;
|
||||
List<AssociationFacetField> fields = assocByField.get(indexedFieldName);
|
||||
if (fields == null) {
|
||||
fields = new ArrayList<AssociationFacetField>();
|
||||
assocByField.put(indexedFieldName, fields);
|
||||
}
|
||||
fields.add(facetField);
|
||||
}
|
||||
}
|
||||
|
||||
List<Field> addedIndexedFields = new ArrayList<Field>();
|
||||
|
@ -97,13 +115,14 @@ public class FacetIndexWriter extends IndexWriter {
|
|||
|
||||
processFacetFields(byField, addedIndexedFields, addedStoredFields);
|
||||
processSSDVFacetFields(dvByField, addedIndexedFields, addedStoredFields);
|
||||
processAssocFacetFields(assocByField, addedIndexedFields, addedStoredFields);
|
||||
|
||||
//System.out.println("add stored: " + addedStoredFields);
|
||||
|
||||
final List<IndexableField> allIndexedFields = new ArrayList<IndexableField>();
|
||||
for(IndexableField field : doc.indexableFields()) {
|
||||
IndexableFieldType ft = field.fieldType();
|
||||
if (ft != FacetField.TYPE && ft != SortedSetDocValuesFacetField.TYPE) {
|
||||
if (ft != FacetField.TYPE && ft != SortedSetDocValuesFacetField.TYPE && ft != AssociationFacetField.TYPE) {
|
||||
allIndexedFields.add(field);
|
||||
}
|
||||
}
|
||||
|
@ -200,6 +219,35 @@ public class FacetIndexWriter extends IndexWriter {
|
|||
}
|
||||
}
|
||||
|
||||
private void processAssocFacetFields(Map<String,List<AssociationFacetField>> byField, List<Field> addedIndexedFields, List<Field> addedStoredFields) throws IOException {
|
||||
for(Map.Entry<String,List<AssociationFacetField>> ent : byField.entrySet()) {
|
||||
byte[] bytes = new byte[16];
|
||||
int upto = 0;
|
||||
String indexedFieldName = ent.getKey();
|
||||
for(AssociationFacetField field : ent.getValue()) {
|
||||
// NOTE: we don't add parents for associations
|
||||
// nocommit is that right? maybe we are supposed to
|
||||
// add to taxo writer, and just not index the parent
|
||||
// ords?
|
||||
int ordinal = taxoWriter.addCategory(FacetLabel.create(field.dim, field.path));
|
||||
if (upto + 4 > bytes.length) {
|
||||
bytes = ArrayUtil.grow(bytes, upto+4);
|
||||
}
|
||||
// big-endian:
|
||||
bytes[upto++] = (byte) (ordinal >> 24);
|
||||
bytes[upto++] = (byte) (ordinal >> 16);
|
||||
bytes[upto++] = (byte) (ordinal >> 8);
|
||||
bytes[upto++] = (byte) ordinal;
|
||||
if (upto + field.assoc.length > bytes.length) {
|
||||
bytes = ArrayUtil.grow(bytes, upto+field.assoc.length);
|
||||
}
|
||||
System.arraycopy(field.assoc.bytes, field.assoc.offset, bytes, upto, field.assoc.length);
|
||||
upto += field.assoc.length;
|
||||
}
|
||||
addedStoredFields.add(new BinaryDocValuesField(indexedFieldName, new BytesRef(bytes, 0, upto)));
|
||||
}
|
||||
}
|
||||
|
||||
/** We can open this up if/when we really need
|
||||
* pluggability on the encoding. */
|
||||
private final BytesRef dedupAndEncode(IntsRef ordinals) {
|
||||
|
@ -314,6 +362,7 @@ public class FacetIndexWriter extends IndexWriter {
|
|||
buffer[upto++] = ch;
|
||||
}
|
||||
}
|
||||
parts.add(new String(buffer, 0, upto));
|
||||
assert !lastEscape;
|
||||
return parts.toArray(new String[parts.size()]);
|
||||
}
|
||||
|
|
|
@ -16,7 +16,9 @@ package org.apache.lucene.facet.simple;
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
|
@ -110,7 +112,7 @@ public final class SortedSetDocValuesReaderState {
|
|||
dv.lookupOrd(ord, spare);
|
||||
String[] components = FacetIndexWriter.stringToPath(spare.utf8ToString());
|
||||
if (components.length != 2) {
|
||||
throw new IllegalArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + spare.utf8ToString());
|
||||
throw new IllegalArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.toString(components) + " " + spare.utf8ToString());
|
||||
}
|
||||
if (!components[0].equals(lastDim)) {
|
||||
if (lastDim != null) {
|
||||
|
|
|
@ -0,0 +1,235 @@
|
|||
package org.apache.lucene.facet.simple;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.facet.simple.SimpleFacetsCollector.MatchingDocs;
|
||||
import org.apache.lucene.facet.taxonomy.FacetLabel;
|
||||
import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays;
|
||||
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
||||
import org.apache.lucene.index.BinaryDocValues;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
|
||||
// nocommit jdoc that this assumes/requires the default encoding
|
||||
public class SumFloatAssociationFacets extends Facets {
|
||||
private final FacetsConfig facetsConfig;
|
||||
private final TaxonomyReader taxoReader;
|
||||
private final float[] values;
|
||||
private final String facetsFieldName;
|
||||
private final int[] children;
|
||||
private final int[] parents;
|
||||
private final int[] siblings;
|
||||
|
||||
public SumFloatAssociationFacets(TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc) throws IOException {
|
||||
this(FacetsConfig.DEFAULT_INDEXED_FIELD_NAME, taxoReader, facetsConfig, fc);
|
||||
}
|
||||
|
||||
public SumFloatAssociationFacets(String facetsFieldName, TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc) throws IOException {
|
||||
this.facetsFieldName = facetsFieldName;
|
||||
this.taxoReader = taxoReader;
|
||||
this.facetsConfig = facetsConfig;
|
||||
ParallelTaxonomyArrays pta = taxoReader.getParallelTaxonomyArrays();
|
||||
children = pta.children();
|
||||
parents = pta.parents();
|
||||
siblings = pta.siblings();
|
||||
values = new float[taxoReader.getSize()];
|
||||
sumValues(fc.getMatchingDocs());
|
||||
}
|
||||
|
||||
private final void sumValues(List<MatchingDocs> matchingDocs) throws IOException {
|
||||
//System.out.println("count matchingDocs=" + matchingDocs + " facetsField=" + facetsFieldName);
|
||||
for(MatchingDocs hits : matchingDocs) {
|
||||
BinaryDocValues dv = hits.context.reader().getBinaryDocValues(facetsFieldName);
|
||||
if (dv == null) { // this reader does not have DocValues for the requested category list
|
||||
continue;
|
||||
}
|
||||
FixedBitSet bits = hits.bits;
|
||||
|
||||
final int length = hits.bits.length();
|
||||
int doc = 0;
|
||||
BytesRef scratch = new BytesRef();
|
||||
//System.out.println("count seg=" + hits.context.reader());
|
||||
while (doc < length && (doc = bits.nextSetBit(doc)) != -1) {
|
||||
//System.out.println(" doc=" + doc);
|
||||
// nocommit use OrdinalsReader? but, add a
|
||||
// BytesRef getAssociation()?
|
||||
dv.get(doc, scratch);
|
||||
byte[] bytes = scratch.bytes;
|
||||
int end = scratch.offset + scratch.length;
|
||||
int offset = scratch.offset;
|
||||
while (offset < end) {
|
||||
int ord = ((bytes[offset]&0xFF) << 24) |
|
||||
((bytes[offset+1]&0xFF) << 16) |
|
||||
((bytes[offset+2]&0xFF) << 8) |
|
||||
(bytes[offset+3]&0xFF);
|
||||
offset += 4;
|
||||
int value = ((bytes[offset]&0xFF) << 24) |
|
||||
((bytes[offset+1]&0xFF) << 16) |
|
||||
((bytes[offset+2]&0xFF) << 8) |
|
||||
(bytes[offset+3]&0xFF);
|
||||
offset += 4;
|
||||
values[ord] += Float.intBitsToFloat(value);
|
||||
}
|
||||
++doc;
|
||||
}
|
||||
}
|
||||
|
||||
// nocommit we could do this lazily instead:
|
||||
|
||||
// Rollup any necessary dims:
|
||||
// nocommit should we rollup?
|
||||
/*
|
||||
for(Map.Entry<String,FacetsConfig.DimConfig> ent : facetsConfig.getDimConfigs().entrySet()) {
|
||||
String dim = ent.getKey();
|
||||
FacetsConfig.DimConfig ft = ent.getValue();
|
||||
if (ft.hierarchical && ft.multiValued == false) {
|
||||
int dimRootOrd = taxoReader.getOrdinal(new FacetLabel(dim));
|
||||
// It can be -1 if this field was declared in the
|
||||
// facetsConfig but never indexed:
|
||||
if (dimRootOrd > 0) {
|
||||
counts[dimRootOrd] += rollup(children[dimRootOrd]);
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
private float rollup(int ord) {
|
||||
int sum = 0;
|
||||
while (ord != TaxonomyReader.INVALID_ORDINAL) {
|
||||
float childValue = values[ord] + rollup(children[ord]);
|
||||
values[ord] = childValue;
|
||||
sum += childValue;
|
||||
ord = siblings[ord];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
/** Return the count for a specific path. Returns -1 if
|
||||
* this path doesn't exist, else the count. */
|
||||
@Override
|
||||
public Number getSpecificValue(String dim, String... path) throws IOException {
|
||||
int ord = taxoReader.getOrdinal(FacetLabel.create(dim, path));
|
||||
if (ord < 0) {
|
||||
return -1;
|
||||
}
|
||||
return values[ord];
|
||||
}
|
||||
|
||||
@Override
|
||||
public SimpleFacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
|
||||
FacetLabel cp = FacetLabel.create(dim, path);
|
||||
int ord = taxoReader.getOrdinal(cp);
|
||||
if (ord == -1) {
|
||||
//System.out.println("no ord for path=" + path);
|
||||
return null;
|
||||
}
|
||||
return getTopChildren(cp, ord, topN);
|
||||
}
|
||||
|
||||
private SimpleFacetResult getTopChildren(FacetLabel path, int dimOrd, int topN) throws IOException {
|
||||
|
||||
TopOrdValueQueue q = new TopOrdValueQueue(topN);
|
||||
|
||||
float bottomValue = 0;
|
||||
|
||||
int ord = children[dimOrd];
|
||||
float sumValue = 0;
|
||||
|
||||
TopOrdValueQueue.OrdAndValue reuse = null;
|
||||
while(ord != TaxonomyReader.INVALID_ORDINAL) {
|
||||
if (values[ord] > 0) {
|
||||
sumValue += values[ord];
|
||||
if (values[ord] > bottomValue) {
|
||||
if (reuse == null) {
|
||||
reuse = new TopOrdValueQueue.OrdAndValue();
|
||||
}
|
||||
reuse.ord = ord;
|
||||
reuse.value = values[ord];
|
||||
reuse = q.insertWithOverflow(reuse);
|
||||
if (q.size() == topN) {
|
||||
bottomValue = q.top().value;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ord = siblings[ord];
|
||||
}
|
||||
|
||||
if (sumValue == 0) {
|
||||
//System.out.println("totCount=0 for path=" + path);
|
||||
return null;
|
||||
}
|
||||
|
||||
/*
|
||||
FacetsConfig.DimConfig ft = facetsConfig.getDimConfig(path.components[0]);
|
||||
// nocommit shouldn't we verify the indexedFieldName
|
||||
// matches what was passed to our ctor?
|
||||
if (ft.hierarchical && ft.multiValued) {
|
||||
totCount = counts[dimOrd];
|
||||
}
|
||||
*/
|
||||
|
||||
LabelAndValue[] labelValues = new LabelAndValue[q.size()];
|
||||
for(int i=labelValues.length-1;i>=0;i--) {
|
||||
TopOrdValueQueue.OrdAndValue ordAndValue = q.pop();
|
||||
FacetLabel child = taxoReader.getPath(ordAndValue.ord);
|
||||
labelValues[i] = new LabelAndValue(child.components[path.length], ordAndValue.value);
|
||||
}
|
||||
|
||||
return new SimpleFacetResult(path, sumValue, labelValues);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<SimpleFacetResult> getAllDims(int topN) throws IOException {
|
||||
int ord = children[TaxonomyReader.ROOT_ORDINAL];
|
||||
List<SimpleFacetResult> results = new ArrayList<SimpleFacetResult>();
|
||||
while (ord != TaxonomyReader.INVALID_ORDINAL) {
|
||||
SimpleFacetResult result = getTopChildren(taxoReader.getPath(ord), ord, topN);
|
||||
if (result != null) {
|
||||
results.add(result);
|
||||
}
|
||||
ord = siblings[ord];
|
||||
}
|
||||
|
||||
// Sort by highest count:
|
||||
Collections.sort(results,
|
||||
new Comparator<SimpleFacetResult>() {
|
||||
@Override
|
||||
public int compare(SimpleFacetResult a, SimpleFacetResult b) {
|
||||
if (a.value.intValue() > b.value.intValue()) {
|
||||
return -1;
|
||||
} else if (b.value.intValue() > a.value.intValue()) {
|
||||
return 1;
|
||||
} else {
|
||||
// Tie break by dimension
|
||||
return a.path.components[0].compareTo(b.path.components[0]);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return results;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,235 @@
|
|||
package org.apache.lucene.facet.simple;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.facet.simple.SimpleFacetsCollector.MatchingDocs;
|
||||
import org.apache.lucene.facet.taxonomy.FacetLabel;
|
||||
import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays;
|
||||
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
||||
import org.apache.lucene.index.BinaryDocValues;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
|
||||
// nocommit jdoc that this assumes/requires the default encoding
|
||||
public class SumIntAssociationFacets extends Facets {
|
||||
private final FacetsConfig facetsConfig;
|
||||
private final TaxonomyReader taxoReader;
|
||||
private final int[] values;
|
||||
private final String facetsFieldName;
|
||||
private final int[] children;
|
||||
private final int[] parents;
|
||||
private final int[] siblings;
|
||||
|
||||
public SumIntAssociationFacets(TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc) throws IOException {
|
||||
this(FacetsConfig.DEFAULT_INDEXED_FIELD_NAME, taxoReader, facetsConfig, fc);
|
||||
}
|
||||
|
||||
public SumIntAssociationFacets(String facetsFieldName, TaxonomyReader taxoReader, FacetsConfig facetsConfig, SimpleFacetsCollector fc) throws IOException {
|
||||
this.facetsFieldName = facetsFieldName;
|
||||
this.taxoReader = taxoReader;
|
||||
this.facetsConfig = facetsConfig;
|
||||
ParallelTaxonomyArrays pta = taxoReader.getParallelTaxonomyArrays();
|
||||
children = pta.children();
|
||||
parents = pta.parents();
|
||||
siblings = pta.siblings();
|
||||
values = new int[taxoReader.getSize()];
|
||||
sumValues(fc.getMatchingDocs());
|
||||
}
|
||||
|
||||
private final void sumValues(List<MatchingDocs> matchingDocs) throws IOException {
|
||||
//System.out.println("count matchingDocs=" + matchingDocs + " facetsField=" + facetsFieldName);
|
||||
for(MatchingDocs hits : matchingDocs) {
|
||||
BinaryDocValues dv = hits.context.reader().getBinaryDocValues(facetsFieldName);
|
||||
if (dv == null) { // this reader does not have DocValues for the requested category list
|
||||
continue;
|
||||
}
|
||||
FixedBitSet bits = hits.bits;
|
||||
|
||||
final int length = hits.bits.length();
|
||||
int doc = 0;
|
||||
BytesRef scratch = new BytesRef();
|
||||
//System.out.println("count seg=" + hits.context.reader());
|
||||
while (doc < length && (doc = bits.nextSetBit(doc)) != -1) {
|
||||
//System.out.println(" doc=" + doc);
|
||||
// nocommit use OrdinalsReader? but, add a
|
||||
// BytesRef getAssociation()?
|
||||
dv.get(doc, scratch);
|
||||
byte[] bytes = scratch.bytes;
|
||||
int end = scratch.offset + scratch.length;
|
||||
int offset = scratch.offset;
|
||||
while (offset < end) {
|
||||
int ord = ((bytes[offset]&0xFF) << 24) |
|
||||
((bytes[offset+1]&0xFF) << 16) |
|
||||
((bytes[offset+2]&0xFF) << 8) |
|
||||
(bytes[offset+3]&0xFF);
|
||||
offset += 4;
|
||||
int value = ((bytes[offset]&0xFF) << 24) |
|
||||
((bytes[offset+1]&0xFF) << 16) |
|
||||
((bytes[offset+2]&0xFF) << 8) |
|
||||
(bytes[offset+3]&0xFF);
|
||||
offset += 4;
|
||||
values[ord] += value;
|
||||
}
|
||||
++doc;
|
||||
}
|
||||
}
|
||||
|
||||
// nocommit we could do this lazily instead:
|
||||
|
||||
// Rollup any necessary dims:
|
||||
// nocommit should we rollup?
|
||||
/*
|
||||
for(Map.Entry<String,FacetsConfig.DimConfig> ent : facetsConfig.getDimConfigs().entrySet()) {
|
||||
String dim = ent.getKey();
|
||||
FacetsConfig.DimConfig ft = ent.getValue();
|
||||
if (ft.hierarchical && ft.multiValued == false) {
|
||||
int dimRootOrd = taxoReader.getOrdinal(new FacetLabel(dim));
|
||||
// It can be -1 if this field was declared in the
|
||||
// facetsConfig but never indexed:
|
||||
if (dimRootOrd > 0) {
|
||||
counts[dimRootOrd] += rollup(children[dimRootOrd]);
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
private int rollup(int ord) {
|
||||
int sum = 0;
|
||||
while (ord != TaxonomyReader.INVALID_ORDINAL) {
|
||||
int childValue = values[ord] + rollup(children[ord]);
|
||||
values[ord] = childValue;
|
||||
sum += childValue;
|
||||
ord = siblings[ord];
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
/** Return the count for a specific path. Returns -1 if
|
||||
* this path doesn't exist, else the count. */
|
||||
@Override
|
||||
public Number getSpecificValue(String dim, String... path) throws IOException {
|
||||
int ord = taxoReader.getOrdinal(FacetLabel.create(dim, path));
|
||||
if (ord < 0) {
|
||||
return -1;
|
||||
}
|
||||
return values[ord];
|
||||
}
|
||||
|
||||
@Override
|
||||
public SimpleFacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
|
||||
FacetLabel cp = FacetLabel.create(dim, path);
|
||||
int ord = taxoReader.getOrdinal(cp);
|
||||
if (ord == -1) {
|
||||
//System.out.println("no ord for path=" + path);
|
||||
return null;
|
||||
}
|
||||
return getTopChildren(cp, ord, topN);
|
||||
}
|
||||
|
||||
private SimpleFacetResult getTopChildren(FacetLabel path, int dimOrd, int topN) throws IOException {
|
||||
|
||||
TopOrdCountQueue q = new TopOrdCountQueue(topN);
|
||||
|
||||
int bottomValue = 0;
|
||||
|
||||
int ord = children[dimOrd];
|
||||
long sumValue = 0;
|
||||
|
||||
TopOrdCountQueue.OrdAndCount reuse = null;
|
||||
while(ord != TaxonomyReader.INVALID_ORDINAL) {
|
||||
if (values[ord] > 0) {
|
||||
sumValue += values[ord];
|
||||
if (values[ord] > bottomValue) {
|
||||
if (reuse == null) {
|
||||
reuse = new TopOrdCountQueue.OrdAndCount();
|
||||
}
|
||||
reuse.ord = ord;
|
||||
reuse.count = values[ord];
|
||||
reuse = q.insertWithOverflow(reuse);
|
||||
if (q.size() == topN) {
|
||||
bottomValue = q.top().count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ord = siblings[ord];
|
||||
}
|
||||
|
||||
if (sumValue == 0) {
|
||||
//System.out.println("totCount=0 for path=" + path);
|
||||
return null;
|
||||
}
|
||||
|
||||
/*
|
||||
FacetsConfig.DimConfig ft = facetsConfig.getDimConfig(path.components[0]);
|
||||
// nocommit shouldn't we verify the indexedFieldName
|
||||
// matches what was passed to our ctor?
|
||||
if (ft.hierarchical && ft.multiValued) {
|
||||
totCount = counts[dimOrd];
|
||||
}
|
||||
*/
|
||||
|
||||
LabelAndValue[] labelValues = new LabelAndValue[q.size()];
|
||||
for(int i=labelValues.length-1;i>=0;i--) {
|
||||
TopOrdCountQueue.OrdAndCount ordAndCount = q.pop();
|
||||
FacetLabel child = taxoReader.getPath(ordAndCount.ord);
|
||||
labelValues[i] = new LabelAndValue(child.components[path.length], ordAndCount.count);
|
||||
}
|
||||
|
||||
return new SimpleFacetResult(path, sumValue, labelValues);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<SimpleFacetResult> getAllDims(int topN) throws IOException {
|
||||
int ord = children[TaxonomyReader.ROOT_ORDINAL];
|
||||
List<SimpleFacetResult> results = new ArrayList<SimpleFacetResult>();
|
||||
while (ord != TaxonomyReader.INVALID_ORDINAL) {
|
||||
SimpleFacetResult result = getTopChildren(taxoReader.getPath(ord), ord, topN);
|
||||
if (result != null) {
|
||||
results.add(result);
|
||||
}
|
||||
ord = siblings[ord];
|
||||
}
|
||||
|
||||
// Sort by highest count:
|
||||
Collections.sort(results,
|
||||
new Comparator<SimpleFacetResult>() {
|
||||
@Override
|
||||
public int compare(SimpleFacetResult a, SimpleFacetResult b) {
|
||||
if (a.value.intValue() > b.value.intValue()) {
|
||||
return -1;
|
||||
} else if (b.value.intValue() > a.value.intValue()) {
|
||||
return 1;
|
||||
} else {
|
||||
// Tie break by dimension
|
||||
return a.path.components[0].compareTo(b.path.components[0]);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return results;
|
||||
}
|
||||
}
|
|
@ -37,6 +37,7 @@ import org.apache.lucene.util.IntsRef;
|
|||
* FastTaxonomyFacetCounts} if you are just using the
|
||||
* default encoding from {@link BinaryDocValues}. */
|
||||
|
||||
// nocommit remove & add specialized Cached variation only?
|
||||
public class TaxonomyFacetCounts extends Facets {
|
||||
private final OrdinalsReader ordinalsReader;
|
||||
private final FacetsConfig facetsConfig;
|
||||
|
|
|
@ -19,6 +19,8 @@ package org.apache.lucene.facet.simple;
|
|||
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
// nocommit rename to TopOrdIntQueue?
|
||||
|
||||
/** Keeps highest count results. */
|
||||
class TopOrdCountQueue extends PriorityQueue<TopOrdCountQueue.OrdAndCount> {
|
||||
|
||||
|
|
|
@ -0,0 +1,159 @@
|
|||
package org.apache.lucene.facet.simple;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.facet.FacetTestCase;
|
||||
import org.apache.lucene.facet.params.FacetSearchParams;
|
||||
import org.apache.lucene.facet.search.FacetResult;
|
||||
import org.apache.lucene.facet.search.FacetsCollector;
|
||||
import org.apache.lucene.facet.taxonomy.FacetLabel;
|
||||
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
||||
import org.apache.lucene.facet.taxonomy.TaxonomyWriter;
|
||||
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
|
||||
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
|
||||
import org.apache.lucene.index.DirectoryReader;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.IndexWriterConfig;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
/** Test for associations */
|
||||
public class TestAssociationFacets extends FacetTestCase {
|
||||
|
||||
private static Directory dir;
|
||||
private static IndexReader reader;
|
||||
private static Directory taxoDir;
|
||||
|
||||
private static final FacetLabel aint = new FacetLabel("int", "a");
|
||||
private static final FacetLabel bint = new FacetLabel("int", "b");
|
||||
private static final FacetLabel afloat = new FacetLabel("float", "a");
|
||||
private static final FacetLabel bfloat = new FacetLabel("float", "b");
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
dir = newDirectory();
|
||||
taxoDir = newDirectory();
|
||||
// preparations - index, taxonomy, content
|
||||
|
||||
TaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
|
||||
|
||||
IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
|
||||
IndexWriter writer = new FacetIndexWriter(dir, iwc, taxoWriter, new FacetsConfig());
|
||||
|
||||
// index documents, 50% have only 'b' and all have 'a'
|
||||
for (int i = 0; i < 110; i++) {
|
||||
Document doc = new Document();
|
||||
// every 11th document is added empty, this used to cause the association
|
||||
// aggregators to go into an infinite loop
|
||||
if (i % 11 != 0) {
|
||||
doc.add(new AssociationFacetField(2, "int", "a"));
|
||||
doc.add(new AssociationFacetField(0.5f, "float", "a"));
|
||||
if (i % 2 == 0) { // 50
|
||||
doc.add(new AssociationFacetField(3, "int", "b"));
|
||||
doc.add(new AssociationFacetField(0.2f, "float", "b"));
|
||||
}
|
||||
}
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
taxoWriter.close();
|
||||
reader = DirectoryReader.open(writer, true);
|
||||
writer.close();
|
||||
}
|
||||
|
||||
@AfterClass
|
||||
public static void afterClass() throws Exception {
|
||||
reader.close();
|
||||
reader = null;
|
||||
dir.close();
|
||||
dir = null;
|
||||
taxoDir.close();
|
||||
taxoDir = null;
|
||||
}
|
||||
|
||||
public void testIntSumAssociation() throws Exception {
|
||||
TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
|
||||
|
||||
SimpleFacetsCollector fc = new SimpleFacetsCollector();
|
||||
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
searcher.search(new MatchAllDocsQuery(), fc);
|
||||
|
||||
SumIntAssociationFacets facets = new SumIntAssociationFacets(taxoReader, new FacetsConfig(), fc);
|
||||
|
||||
assertEquals("Wrong count for category 'a'!", 200, facets.getSpecificValue("int", "a").intValue());
|
||||
assertEquals("Wrong count for category 'b'!", 150, facets.getSpecificValue("int", "b").intValue());
|
||||
|
||||
taxoReader.close();
|
||||
}
|
||||
|
||||
public void testFloatSumAssociation() throws Exception {
|
||||
DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
|
||||
|
||||
SimpleFacetsCollector fc = new SimpleFacetsCollector();
|
||||
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
searcher.search(new MatchAllDocsQuery(), fc);
|
||||
|
||||
SumFloatAssociationFacets facets = new SumFloatAssociationFacets(taxoReader, new FacetsConfig(), fc);
|
||||
assertEquals("Wrong count for category 'a'!", 50f, facets.getSpecificValue("float", "a").floatValue(), 0.00001);
|
||||
assertEquals("Wrong count for category 'b'!", 10f, facets.getSpecificValue("float", "b").floatValue(), 0.00001);
|
||||
|
||||
taxoReader.close();
|
||||
}
|
||||
|
||||
/*
|
||||
public void testDifferentAggregatorsSameCategoryList() throws Exception {
|
||||
DirectoryTaxonomyReader taxo = new DirectoryTaxonomyReader(taxoDir);
|
||||
|
||||
// facet requests for two facets
|
||||
FacetSearchParams fsp = new FacetSearchParams(
|
||||
new SumIntAssociationFacetRequest(aint, 10),
|
||||
new SumIntAssociationFacetRequest(bint, 10),
|
||||
new SumFloatAssociationFacetRequest(afloat, 10),
|
||||
new SumFloatAssociationFacetRequest(bfloat, 10));
|
||||
|
||||
Query q = new MatchAllDocsQuery();
|
||||
|
||||
FacetsCollector fc = FacetsCollector.create(fsp, reader, taxo);
|
||||
|
||||
IndexSearcher searcher = newSearcher(reader);
|
||||
searcher.search(q, fc);
|
||||
List<FacetResult> res = fc.getFacetResults();
|
||||
|
||||
assertEquals("Wrong number of results!", 4, res.size());
|
||||
assertEquals("Wrong count for category 'a'!", 200, (int) res.get(0).getFacetResultNode().value);
|
||||
assertEquals("Wrong count for category 'b'!", 150, (int) res.get(1).getFacetResultNode().value);
|
||||
assertEquals("Wrong count for category 'a'!",50f, (float) res.get(2).getFacetResultNode().value, 0.00001);
|
||||
assertEquals("Wrong count for category 'b'!",10f, (float) res.get(3).getFacetResultNode().value, 0.00001);
|
||||
|
||||
taxo.close();
|
||||
}
|
||||
*/
|
||||
}
|
Loading…
Reference in New Issue