LUCENE-9946: Support multi-value fields in range facet counting (#127)

Greg Miller 2021-05-30 19:46:11 -07:00 committed by GitHub
parent 5fdff6eabb
commit d669ddebc5
8 changed files with 1589 additions and 362 deletions

View File: lucene/CHANGES.txt

@@ -231,6 +231,9 @@ Improvements
* LUCENE-9850: Switch to PFOR encoding for doc IDs (instead of FOR). (Greg Miller)
* LUCENE-9946: Support for multi-value fields in LongRangeFacetCounts and
DoubleRangeFacetCounts. (Greg Miller)
* LUCENE-9929: Add NorwegianNormalizationFilter, which does the same as ScandinavianNormalizationFilter except
it does not fold oo->ø and ao->å. (janhoy, Robert Muir, Adrien Grand)

View File: org/apache/lucene/facet/range/DoubleRangeFacetCounts.java

@@ -22,23 +22,19 @@ import org.apache.lucene.document.FloatDocValuesField;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsCollector.MatchingDocs;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.DoubleValues;
import org.apache.lucene.search.DoubleValuesSource;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.NumericUtils;
/**
* {@link Facets} implementation that computes counts for dynamic double ranges from a provided
* {@link DoubleValuesSource}. Use this for dimensions that change in real-time (e.g. a relative
* time based dimension like "Past day", "Past 2 days", etc.) or that change for each request (e.g.
* distance from the user's location, "< 1 km", "< 2 km", etc.).
* {@link Facets} implementation that computes counts for dynamic double ranges. Use this for
* dimensions that change in real-time (e.g. a relative time based dimension like "Past day", "Past
* 2 days", etc.) or that change for each request (e.g. distance from the user's location, "< 1
* km", "< 2 km", etc.).
*
* <p>If you have indexed your field using {@link FloatDocValuesField}, then you should use a
* DoubleValuesSource generated from {@link DoubleValuesSource#fromFloatField(String)}.
@@ -48,18 +44,29 @@ import org.apache.lucene.util.NumericUtils;
public class DoubleRangeFacetCounts extends RangeFacetCounts {
/**
* Create {@code RangeFacetCounts}, using {@link DoubleValues} from the specified field.
* Create {@code RangeFacetCounts}, using double values from the specified field. The field may be
* single-valued ({@link NumericDocValues}) or multi-valued ({@link SortedNumericDocValues}), and
* will be interpreted as containing double values.
*
* <p>N.B. This assumes that the field was indexed with {@link
* org.apache.lucene.document.DoubleDocValuesField}. For float-valued fields, use {@link
* #DoubleRangeFacetCounts(String, DoubleValuesSource, FacetsCollector, DoubleRange...)}
*
* <p>TODO: Extend multi-valued support to fields that have been indexed as float values
*/
public DoubleRangeFacetCounts(String field, FacetsCollector hits, DoubleRange... ranges)
throws IOException {
this(field, DoubleValuesSource.fromDoubleField(field), hits, ranges);
this(field, null, hits, ranges);
}
/** Create {@code RangeFacetCounts} using the provided {@link DoubleValuesSource} */
/**
* Create {@code RangeFacetCounts}, using the provided {@link DoubleValuesSource} if non-null. If
* {@code valueSource} is null, doc values from the provided {@code field} will be used.
*
* <p>N.B. If relying on the provided {@code field}, see javadoc notes associated with {@link
* #DoubleRangeFacetCounts(String, FacetsCollector, DoubleRange...)} for assumptions on how the
* field is indexed.
*/
public DoubleRangeFacetCounts(
String field, DoubleValuesSource valueSource, FacetsCollector hits, DoubleRange... ranges)
throws IOException {
@@ -67,9 +74,14 @@ public class DoubleRangeFacetCounts extends RangeFacetCounts {
}
/**
* Create {@code RangeFacetCounts}, using the provided {@link DoubleValuesSource}, and using the
* provided Query as a fastmatch: only documents matching the query are checked for the matching
* ranges.
* Create {@code RangeFacetCounts}, using the provided {@link DoubleValuesSource} if non-null. If
* {@code valueSource} is null, doc values from the provided {@code field} will be used. Use the
* provided {@code Query} as a fastmatch: only documents matching the query are checked for the
* matching ranges.
*
* <p>N.B. If relying on the provided {@code field}, see javadoc notes associated with {@link
* #DoubleRangeFacetCounts(String, FacetsCollector, DoubleRange...)} for assumptions on how the
* field is indexed.
*/
public DoubleRangeFacetCounts(
String field,
@@ -79,14 +91,53 @@ public class DoubleRangeFacetCounts extends RangeFacetCounts {
DoubleRange... ranges)
throws IOException {
super(field, ranges, fastMatchQuery);
count(valueSource, hits.getMatchingDocs());
// use the provided valueSource if non-null, otherwise use the doc values associated with the
// field
if (valueSource != null) {
count(valueSource, hits.getMatchingDocs());
} else {
count(field, hits.getMatchingDocs());
}
}
/** Counts from the provided valueSource. */
private void count(DoubleValuesSource valueSource, List<MatchingDocs> matchingDocs)
throws IOException {
DoubleRange[] ranges = (DoubleRange[]) this.ranges;
LongRange[] longRanges = getLongRanges();
LongRangeCounter counter = LongRangeCounter.create(longRanges, counts);
int missingCount = 0;
for (MatchingDocs hits : matchingDocs) {
DoubleValues fv = valueSource.getValues(hits.context, null);
totCount += hits.totalHits;
final DocIdSetIterator it = createIterator(hits);
if (it == null) {
continue;
}
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; ) {
// Skip missing docs:
if (fv.advanceExact(doc)) {
counter.addSingleValued(NumericUtils.doubleToSortableLong(fv.doubleValue()));
} else {
missingCount++;
}
doc = it.nextDoc();
}
}
missingCount += counter.finish();
totCount -= missingCount;
}
/** Create long ranges from the double ranges. */
@Override
protected LongRange[] getLongRanges() {
DoubleRange[] ranges = (DoubleRange[]) this.ranges;
LongRange[] longRanges = new LongRange[ranges.length];
for (int i = 0; i < ranges.length; i++) {
DoubleRange range = ranges[i];
@@ -99,56 +150,11 @@ public class DoubleRangeFacetCounts extends RangeFacetCounts {
true);
}
LongRangeCounter counter = new LongRangeCounter(longRanges);
return longRanges;
}
int missingCount = 0;
for (MatchingDocs hits : matchingDocs) {
DoubleValues fv = valueSource.getValues(hits.context, null);
totCount += hits.totalHits;
final DocIdSetIterator fastMatchDocs;
if (fastMatchQuery != null) {
final IndexReaderContext topLevelContext = ReaderUtil.getTopLevelContext(hits.context);
final IndexSearcher searcher = new IndexSearcher(topLevelContext);
searcher.setQueryCache(null);
final Weight fastMatchWeight =
searcher.createWeight(
searcher.rewrite(fastMatchQuery), ScoreMode.COMPLETE_NO_SCORES, 1);
Scorer s = fastMatchWeight.scorer(hits.context);
if (s == null) {
continue;
}
fastMatchDocs = s.iterator();
} else {
fastMatchDocs = null;
}
DocIdSetIterator docs = hits.bits.iterator();
for (int doc = docs.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; ) {
if (fastMatchDocs != null) {
int fastMatchDoc = fastMatchDocs.docID();
if (fastMatchDoc < doc) {
fastMatchDoc = fastMatchDocs.advance(doc);
}
if (doc != fastMatchDoc) {
doc = docs.advance(fastMatchDoc);
continue;
}
}
// Skip missing docs:
if (fv.advanceExact(doc)) {
counter.add(NumericUtils.doubleToSortableLong(fv.doubleValue()));
} else {
missingCount++;
}
doc = docs.nextDoc();
}
}
missingCount += counter.fillCounts(counts);
totCount -= missingCount;
@Override
protected long mapDocValue(long l) {
return NumericUtils.sortableDoubleBits(l);
}
}
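Taken together with the fallback above, the new two-argument path means a multi-valued double field can now be counted with no DoubleValuesSource at all. A minimal usage sketch follows (not part of this diff; the field name "price" and the in-memory directory are illustrative), mirroring the pattern in the tests added later in this commit:

import org.apache.lucene.document.Document;
import org.apache.lucene.document.SortedNumericDocValuesField;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.range.DoubleRange;
import org.apache.lucene.facet.range.DoubleRangeFacetCounts;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.NumericUtils;

public class MultiValuedDoubleRangesExample {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new ByteBuffersDirectory();
        IndexWriter w = new IndexWriter(dir, new IndexWriterConfig())) {
      // One doc carrying two double values in the same multi-valued field:
      Document doc = new Document();
      doc.add(new SortedNumericDocValuesField("price", NumericUtils.doubleToSortableLong(3.5)));
      doc.add(new SortedNumericDocValuesField("price", NumericUtils.doubleToSortableLong(12.0)));
      w.addDocument(doc);
      w.commit();
      try (DirectoryReader r = DirectoryReader.open(dir)) {
        IndexSearcher s = new IndexSearcher(r);
        FacetsCollector fc = new FacetsCollector();
        s.search(new MatchAllDocsQuery(), fc);
        // No DoubleValuesSource: counting reads the SortedNumericDocValues directly
        Facets facets =
            new DoubleRangeFacetCounts(
                "price",
                fc,
                new DoubleRange("cheap", 0.0, true, 10.0, false),
                new DoubleRange("mid", 10.0, true, 100.0, false));
        FacetResult result = facets.getTopChildren(10, "price");
        System.out.println(result); // the doc counts once in "cheap" and once in "mid"
      }
    }
  }
}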

View File: org/apache/lucene/facet/range/ExclusiveLongRangeCounter.java (new file)

@@ -0,0 +1,170 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.range;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
/**
* This implementation assumes the requested ranges <i>do not</i> overlap. With this assumption,
* we're able to take a simpler approach to accumulating range counts by just binary searching for
* the appropriate range and counting directly as each value comes in.
*/
class ExclusiveLongRangeCounter extends LongRangeCounter {
/** elementary interval boundaries used for efficient counting (bsearch to find interval) */
private final long[] boundaries;
/** original range number each elementary interval corresponds to (index into countBuffer) */
private final int[] rangeNums;
/** number of counted documents that haven't matched any requested ranges */
private int missingCount;
/** whether-or-not the multi-valued doc currently being counted has matched any ranges */
private boolean multiValuedDocMatchedRange;
ExclusiveLongRangeCounter(LongRange[] ranges, int[] countBuffer) {
super(countBuffer);
// Create a copy of the requested ranges, sorted by min, and keeping track of the original
// position:
LongRangeAndPos[] sortedRanges = new LongRangeAndPos[ranges.length];
for (int i = 0; i < ranges.length; i++) {
sortedRanges[i] = new LongRangeAndPos(ranges[i], i);
}
Arrays.sort(sortedRanges, Comparator.comparingLong(r -> r.range.min));
// Create elementary intervals, which include requested ranges and "gaps" in-between:
List<InclusiveRange> elementaryIntervals = buildElementaryIntervals(sortedRanges);
// Keep track of elementary interval boundary ends (for bsearching) along with the requested
// range they map back to (and -1 when they map to a "gap" range):
boundaries = new long[elementaryIntervals.size()];
rangeNums = new int[elementaryIntervals.size()];
Arrays.fill(rangeNums, -1);
int currRange = 0;
for (int i = 0; i < boundaries.length; i++) {
boundaries[i] = elementaryIntervals.get(i).end;
if (currRange < sortedRanges.length) {
LongRangeAndPos curr = sortedRanges[currRange];
if (boundaries[i] == curr.range.max) {
rangeNums[i] = curr.pos;
currRange++;
}
}
}
}
@Override
void startMultiValuedDoc() {
super.startMultiValuedDoc();
multiValuedDocMatchedRange = false;
}
@Override
boolean endMultiValuedDoc() {
return multiValuedDocMatchedRange;
}
@Override
void addSingleValued(long v) {
if (rangeCount() == 0) {
missingCount++;
return;
}
super.addSingleValued(v);
}
@Override
int finish() {
// Nothing much to do in this case since we're able to count directly into the requested
// ranges as we go in this implementation. Just report any missing count:
return missingCount;
}
@Override
protected long[] boundaries() {
return boundaries;
}
@Override
protected void processSingleValuedHit(int elementaryIntervalNum) {
int rangeNum = rangeNums[elementaryIntervalNum];
if (rangeNum != -1) {
// The elementary interval we matched against corresponds to a requested
// range, so increment it:
increment(rangeNum);
} else {
// The matched elementary interval is a "gap" range, so the doc isn't
// present in any requested ranges:
missingCount++;
}
}
@Override
protected void processMultiValuedHit(int elementaryIntervalNum) {
int rangeNum = rangeNums[elementaryIntervalNum];
if (rangeNum != -1) {
// The elementary interval we matched against corresponds to a requested
// range, so increment it. We can do this without fear of double-counting
// since we know the requested ranges don't overlap:
increment(rangeNum);
multiValuedDocMatchedRange = true;
}
}
/**
* Create elementary intervals, which include requested ranges and "gaps" in-between. This logic
* assumes no requested ranges overlap, and that the incoming ranges have already been sorted.
*/
private static List<InclusiveRange> buildElementaryIntervals(LongRangeAndPos[] sortedRanges) {
List<InclusiveRange> elementaryIntervals = new ArrayList<>();
long prev = Long.MIN_VALUE;
for (LongRangeAndPos range : sortedRanges) {
if (range.range.min > prev) {
// add a "gap" range preceding requested range if necessary:
elementaryIntervals.add(new InclusiveRange(prev, range.range.min - 1));
}
// add the requested range:
elementaryIntervals.add(new InclusiveRange(range.range.min, range.range.max));
prev = range.range.max + 1;
}
if (elementaryIntervals.isEmpty() == false) {
long lastEnd = elementaryIntervals.get(elementaryIntervals.size() - 1).end;
if (lastEnd < Long.MAX_VALUE) {
elementaryIntervals.add(new InclusiveRange(lastEnd + 1, Long.MAX_VALUE));
}
} else {
// If no ranges were requested, create a single entry from MIN_VALUE to MAX_VALUE:
elementaryIntervals.add(new InclusiveRange(Long.MIN_VALUE, Long.MAX_VALUE));
}
return elementaryIntervals;
}
/** Simple container for a requested range and its original position */
private static final class LongRangeAndPos {
final LongRange range;
final int pos;
LongRangeAndPos(LongRange range, int pos) {
this.range = range;
this.pos = pos;
}
}
}
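To make the exclusive-case data structures concrete, here is a small standalone sketch (plain Java, not part of this diff, and independent of the package-private classes above) of what the constructor produces for the non-overlapping requested ranges [0,10] and [20,30], together with a simplified equivalent of the per-value binary search in addSingleValued:

public class ExclusiveCounterSketch {
  public static void main(String[] args) {
    // Elementary intervals for requested ranges [0,10] and [20,30]:
    //   [MIN,-1] gap, [0,10] range 0, [11,19] gap, [20,30] range 1, [31,MAX] gap
    long[] boundaries = {-1L, 10L, 19L, 30L, Long.MAX_VALUE};
    int[] rangeNums = {-1, 0, -1, 1, -1}; // -1 marks "gap" intervals
    int[] counts = new int[2];
    int missing = 0;
    for (long v : new long[] {5, 15, 25, 40}) {
      int lo = 0, hi = boundaries.length - 1;
      while (lo < hi) { // find the first boundary >= v
        int mid = (lo + hi) >>> 1;
        if (v <= boundaries[mid]) hi = mid; else lo = mid + 1;
      }
      if (rangeNums[lo] != -1) counts[rangeNums[lo]]++; else missing++;
    }
    // counts = [1, 1], missing = 2 (values 15 and 40 fell into gap intervals)
    System.out.println(java.util.Arrays.toString(counts) + " missing=" + missing);
  }
}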

View File: org/apache/lucene/facet/range/LongRangeCounter.java

@@ -16,122 +16,65 @@
*/
package org.apache.lucene.facet.range;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Arrays;
import java.util.Comparator;
/**
* Counts how many times each range was seen; per-hit it's just a binary search ({@link #add})
* against the elementary intervals, and in the end we rollup back to the original ranges.
* Counter for numeric ranges. Works for both single- and multi-valued cases (assuming you use it
* correctly).
*
* <p>Usage notes: When counting a document field that only has a single value, callers should call
* addSingleValued() with the value. Whenever a document field has multiple values, callers should
* call startMultiValuedDoc() at the beginning of processing the document, followed by
* addMultiValued() with each value before finally calling endMultiValuedDoc() at the end of
* processing the document. The call to endMultiValuedDoc() will respond with a boolean indicating
* whether-or-not the specific document matched against at least one of the ranges being counted.
* Finally, after processing all documents, the caller should call finish(). This final call will
* ensure the contents of the user-provided {@code countBuffer} contain accurate counts (each index
* corresponding to the provided {@code LongRange} in {@code ranges}). The final call to finish()
* will also report how many additional documents did not match against any ranges. The combination
* of the endMultiValuedDoc() boolean responses and the number reported by finish() communicates the
* total number of missing documents. Note that the call to finish() will not report any documents
* already reported missing by endMultiValuedDoc().
*/
final class LongRangeCounter {
abstract class LongRangeCounter {
final LongRangeNode root;
final long[] boundaries;
final int[] leafCounts;
/** accumulated counts for all of the ranges */
private final int[] countBuffer;
// Used during rollup
private int leafUpto;
private int missingCount;
/**
* for multi-value docs, we keep track of the last elementary interval we've counted so we can use
* that as a lower-bound when counting subsequent values. this takes advantage of the fact that
* values within a given doc are sorted.
*/
protected int multiValuedDocLastSeenElementaryInterval;
public LongRangeCounter(LongRange[] ranges) {
// Maps all range inclusive endpoints to int flags; 1
// = start of interval, 2 = end of interval. We need to
// track the start vs end case separately because if a
// given point is both, then it must be its own
// elementary interval:
Map<Long, Integer> endsMap = new HashMap<>();
endsMap.put(Long.MIN_VALUE, 1);
endsMap.put(Long.MAX_VALUE, 2);
for (LongRange range : ranges) {
Integer cur = endsMap.get(range.min);
if (cur == null) {
endsMap.put(range.min, 1);
} else {
endsMap.put(range.min, cur.intValue() | 1);
}
cur = endsMap.get(range.max);
if (cur == null) {
endsMap.put(range.max, 2);
} else {
endsMap.put(range.max, cur.intValue() | 2);
}
}
List<Long> endsList = new ArrayList<>(endsMap.keySet());
Collections.sort(endsList);
// Build elementaryIntervals (a 1D Venn diagram):
List<InclusiveRange> elementaryIntervals = new ArrayList<>();
int upto0 = 1;
long v = endsList.get(0);
long prev;
if (endsMap.get(v) == 3) {
elementaryIntervals.add(new InclusiveRange(v, v));
prev = v + 1;
static LongRangeCounter create(LongRange[] ranges, int[] countBuffer) {
if (hasOverlappingRanges(ranges)) {
return new OverlappingLongRangeCounter(ranges, countBuffer);
} else {
prev = v;
return new ExclusiveLongRangeCounter(ranges, countBuffer);
}
while (upto0 < endsList.size()) {
v = endsList.get(upto0);
int flags = endsMap.get(v);
// System.out.println(" v=" + v + " flags=" + flags);
if (flags == 3) {
// This point is both an end and a start; we need to
// separate it:
if (v > prev) {
elementaryIntervals.add(new InclusiveRange(prev, v - 1));
}
elementaryIntervals.add(new InclusiveRange(v, v));
prev = v + 1;
} else if (flags == 1) {
// This point is only the start of an interval;
// attach it to next interval:
if (v > prev) {
elementaryIntervals.add(new InclusiveRange(prev, v - 1));
}
prev = v;
} else {
assert flags == 2;
// This point is only the end of an interval; attach
// it to last interval:
elementaryIntervals.add(new InclusiveRange(prev, v));
prev = v + 1;
}
// System.out.println(" ints=" + elementaryIntervals);
upto0++;
}
// Build binary tree on top of intervals:
root = split(0, elementaryIntervals.size(), elementaryIntervals);
// Set outputs, so we know which range to output for
// each node in the tree:
for (int i = 0; i < ranges.length; i++) {
root.addOutputs(i, ranges[i]);
}
// Set boundaries (ends of each elementary interval):
boundaries = new long[elementaryIntervals.size()];
for (int i = 0; i < boundaries.length; i++) {
boundaries[i] = elementaryIntervals.get(i).end;
}
leafCounts = new int[boundaries.length];
// System.out.println("ranges: " + Arrays.toString(ranges));
// System.out.println("intervals: " + elementaryIntervals);
// System.out.println("boundaries: " + Arrays.toString(boundaries));
// System.out.println("root:\n" + root);
}
public void add(long v) {
// System.out.println("add v=" + v);
protected LongRangeCounter(int[] countBuffer) {
// We'll populate the user-provided count buffer with range counts:
this.countBuffer = countBuffer;
}
/** Start processing a new doc. It's unnecessary to call this for single-value cases. */
void startMultiValuedDoc() {
multiValuedDocLastSeenElementaryInterval = -1;
}
/**
* Finish processing a new doc. Returns whether-or-not the document contributed a count to at
* least one range. It's unnecessary to call this for single-value cases.
*/
abstract boolean endMultiValuedDoc();
/** Count the value from a single-valued doc. */
void addSingleValued(long v) {
// NOTE: this works too, but it's ~6% slower on a simple
// test with a high-freq TermQuery w/ range faceting on
@@ -148,15 +91,15 @@ final class LongRangeCounter {
// are guaranteed to find a match because the last
// boundary is Long.MAX_VALUE:
long[] boundaries = boundaries();
int lo = 0;
int hi = boundaries.length - 1;
while (true) {
int mid = (lo + hi) >>> 1;
// System.out.println(" cycle lo=" + lo + " hi=" + hi + " mid=" + mid + " boundary=" +
// boundaries[mid] + " to " + boundaries[mid+1]);
if (v <= boundaries[mid]) {
if (mid == 0) {
leafCounts[0]++;
processSingleValuedHit(mid);
return;
} else {
hi = mid - 1;
@@ -164,68 +107,118 @@ final class LongRangeCounter {
} else if (v > boundaries[mid + 1]) {
lo = mid + 1;
} else {
leafCounts[mid + 1]++;
// System.out.println(" incr @ " + (mid+1) + "; now " + leafCounts[mid+1]);
processSingleValuedHit(mid + 1);
return;
}
}
}
/** Count one value from a multi-valued doc. */
void addMultiValued(long v) {
if (rangeCount() == 0) {
return; // don't bother if there aren't any requested ranges
}
long[] boundaries = boundaries();
// First check if we've "advanced" beyond the last elementary interval we counted for this doc.
// If we haven't, there's no sense doing anything else:
if (multiValuedDocLastSeenElementaryInterval != -1
&& v <= boundaries[multiValuedDocLastSeenElementaryInterval]) {
return;
}
// Also check if we've already counted the last elementary interval. If so, there's nothing
// else to count for this doc:
final int nextCandidateElementaryInterval = multiValuedDocLastSeenElementaryInterval + 1;
if (nextCandidateElementaryInterval == boundaries.length) {
return;
}
// Binary search in the range of the next candidate interval up to the last interval:
int lo = nextCandidateElementaryInterval;
int hi = boundaries.length - 1;
while (true) {
int mid = (lo + hi) >>> 1;
if (v <= boundaries[mid]) {
if (mid == nextCandidateElementaryInterval) {
processMultiValuedHit(mid);
multiValuedDocLastSeenElementaryInterval = mid;
return;
} else {
hi = mid - 1;
}
} else if (v > boundaries[mid + 1]) {
lo = mid + 1;
} else {
int idx = mid + 1;
processMultiValuedHit(idx);
multiValuedDocLastSeenElementaryInterval = idx;
return;
}
}
}
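Since SortedNumericDocValues returns a doc's values in ascending order, the lower-bound tracking above lets later values skip work entirely. A tiny standalone sketch of just that skipping behavior (not part of this diff; the boundaries are illustrative, and the real method also records a hit on the interval it lands in):

public class MultiValuedSkipSketch {
  static final long[] boundaries = {10L, 20L, Long.MAX_VALUE};
  static int lastSeen = -1; // last elementary interval counted for the current doc
  static int searches = 0; // observe how often we actually binary search

  static void addMultiValued(long v) {
    // Values arrive sorted, so anything at-or-below the last counted boundary
    // is already covered for this doc:
    if (lastSeen != -1 && v <= boundaries[lastSeen]) {
      return;
    }
    int lo = lastSeen + 1;
    if (lo == boundaries.length) {
      return; // already counted the last interval
    }
    int hi = boundaries.length - 1;
    searches++;
    while (lo < hi) { // find the first boundary >= v
      int mid = (lo + hi) >>> 1;
      if (v <= boundaries[mid]) hi = mid; else lo = mid + 1;
    }
    lastSeen = lo; // the real implementation also counts a hit on interval lo
  }

  public static void main(String[] args) {
    for (long v : new long[] {3, 3, 7, 42}) {
      addMultiValued(v);
    }
    System.out.println("searches=" + searches); // 2: the repeated 3 and the 7 were skipped
  }
}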
/**
* Fills counts corresponding to the original input ranges, returning the missing count (how many
* hits didn't match any ranges).
* Finish processing all documents. This will return the number of docs that didn't contribute to
* any ranges (that weren't already reported when calling endMultiValuedDoc()).
*/
public int fillCounts(int[] counts) {
// System.out.println(" rollup");
missingCount = 0;
leafUpto = 0;
rollup(root, counts, false);
return missingCount;
abstract int finish();
/** Provide boundary information for elementary intervals (max inclusive value per interval) */
protected abstract long[] boundaries();
/** Process a single-value "hit" against an elementary interval. */
protected abstract void processSingleValuedHit(int elementaryIntervalNum);
/** Process a multi-value "hit" against an elementary interval. */
protected abstract void processMultiValuedHit(int elementaryIntervalNum);
/** Increment the specified range by one. */
protected final void increment(int rangeNum) {
countBuffer[rangeNum]++;
}
private int rollup(LongRangeNode node, int[] counts, boolean sawOutputs) {
int count;
sawOutputs |= node.outputs != null;
if (node.left != null) {
count = rollup(node.left, counts, sawOutputs);
count += rollup(node.right, counts, sawOutputs);
} else {
// Leaf:
count = leafCounts[leafUpto];
leafUpto++;
if (!sawOutputs) {
// This is a missing count (no output ranges were
// seen "above" us):
missingCount += count;
/** Increment the specified range by the specified count. */
protected final void increment(int rangeNum, int count) {
countBuffer[rangeNum] += count;
}
/** Number of ranges requested by the caller. */
protected final int rangeCount() {
return countBuffer.length;
}
/** Determine whether-or-not any requested ranges overlap */
private static boolean hasOverlappingRanges(LongRange[] ranges) {
if (ranges.length == 0) {
return false;
}
// Copy before sorting so we don't mess with the caller's original ranges:
LongRange[] sortedRanges = new LongRange[ranges.length];
System.arraycopy(ranges, 0, sortedRanges, 0, ranges.length);
Arrays.sort(sortedRanges, Comparator.comparingLong(r -> r.min));
long previousMax = sortedRanges[0].max;
for (int i = 1; i < sortedRanges.length; i++) {
// Ranges overlap if the next min is <= the previous max (note that LongRange models
// closed ranges, so equal limit points are considered overlapping):
if (sortedRanges[i].min <= previousMax) {
return true;
}
previousMax = sortedRanges[i].max;
}
if (node.outputs != null) {
for (int rangeIndex : node.outputs) {
counts[rangeIndex] += count;
}
}
// System.out.println(" rollup node=" + node.start + " to " + node.end + ": count=" + count);
return count;
return false;
}
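Note the closed-interval rule here: ranges that merely share an endpoint are treated as overlapping and will route to the segment-tree implementation. A standalone sketch of the same check (not part of this diff; ranges modeled as {min, max} pairs):

import java.util.Arrays;
import java.util.Comparator;

public class OverlapCheckSketch {
  /** Each range is {min, max}, both inclusive, like LongRange. */
  static boolean hasOverlap(long[][] ranges) {
    long[][] sorted = ranges.clone();
    Arrays.sort(sorted, Comparator.comparingLong(r -> r[0]));
    for (int i = 1; i < sorted.length; i++) {
      // Closed ranges: merely sharing an endpoint counts as overlap
      if (sorted[i][0] <= sorted[i - 1][1]) {
        return true;
      }
    }
    return false;
  }

  public static void main(String[] args) {
    System.out.println(hasOverlap(new long[][] {{0, 10}, {10, 20}})); // true (shared endpoint)
    System.out.println(hasOverlap(new long[][] {{0, 10}, {11, 20}})); // false (adjacent only)
  }
}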
private static LongRangeNode split(int start, int end, List<InclusiveRange> elementaryIntervals) {
if (start == end - 1) {
// leaf
InclusiveRange range = elementaryIntervals.get(start);
return new LongRangeNode(range.start, range.end, null, null, start);
} else {
int mid = (start + end) >>> 1;
LongRangeNode left = split(start, mid, elementaryIntervals);
LongRangeNode right = split(mid, end, elementaryIntervals);
return new LongRangeNode(left.start, right.end, left, right, -1);
}
}
protected static final class InclusiveRange {
final long start;
final long end;
private static final class InclusiveRange {
public final long start;
public final long end;
public InclusiveRange(long start, long end) {
InclusiveRange(long start, long end) {
assert end >= start;
this.start = start;
this.end = end;
@@ -236,82 +229,4 @@ final class LongRangeCounter {
return start + " to " + end;
}
}
/** Holds one node of the segment tree. */
public static final class LongRangeNode {
final LongRangeNode left;
final LongRangeNode right;
// Our range, inclusive:
final long start;
final long end;
// If we are a leaf, the index into elementary ranges that
// we point to:
final int leafIndex;
// Which range indices to output when a query goes
// through this node:
List<Integer> outputs;
public LongRangeNode(
long start, long end, LongRangeNode left, LongRangeNode right, int leafIndex) {
this.start = start;
this.end = end;
this.left = left;
this.right = right;
this.leafIndex = leafIndex;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
toString(sb, 0);
return sb.toString();
}
static void indent(StringBuilder sb, int depth) {
for (int i = 0; i < depth; i++) {
sb.append(" ");
}
}
/** Recursively assigns range outputs to each node. */
void addOutputs(int index, LongRange range) {
if (start >= range.min && end <= range.max) {
// Our range is fully included in the incoming
// range; add to our output list:
if (outputs == null) {
outputs = new ArrayList<>();
}
outputs.add(index);
} else if (left != null) {
assert right != null;
// Recurse:
left.addOutputs(index, range);
right.addOutputs(index, range);
}
}
void toString(StringBuilder sb, int depth) {
indent(sb, depth);
if (left == null) {
assert right == null;
sb.append("leaf: ").append(start).append(" to ").append(end);
} else {
sb.append("node: ").append(start).append(" to ").append(end);
}
if (outputs != null) {
sb.append(" outputs=");
sb.append(outputs);
}
sb.append('\n');
if (left != null) {
assert right != null;
left.toString(sb, depth + 1);
right.toString(sb, depth + 1);
}
}
}
}
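Putting the protocol from the class javadoc into code: a sketch of a caller's counting loop (a hypothetical helper, not part of this diff, and necessarily living in the org.apache.lucene.facet.range package since the counter is package-private):

package org.apache.lucene.facet.range;

class LongRangeCounterProtocolSketch {
  /** Returns how many docs contributed to at least one range; counts land in countBuffer. */
  static int countDocs(LongRange[] ranges, int[] countBuffer, long[][] docValues) {
    LongRangeCounter counter = LongRangeCounter.create(ranges, countBuffer);
    int missing = 0;
    for (long[] values : docValues) {
      if (values.length == 1) {
        counter.addSingleValued(values[0]); // single-valued fast path
      } else {
        counter.startMultiValuedDoc();
        // values must be in sorted order, as SortedNumericDocValues guarantees:
        for (long v : values) {
          counter.addMultiValued(v);
        }
        if (counter.endMultiValuedDoc() == false) {
          missing++; // this doc matched no requested range
        }
      }
    }
    missing += counter.finish(); // flushes pending rollups, reports remaining misses
    return docValues.length - missing;
  }
}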

View File: org/apache/lucene/facet/range/LongRangeFacetCounts.java

@@ -21,37 +21,37 @@ import java.util.List;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsCollector.MatchingDocs;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LongValues;
import org.apache.lucene.search.LongValuesSource;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
/**
* {@link Facets} implementation that computes counts for dynamic long ranges from a provided {@link
* LongValuesSource}. Use this for dimensions that change in real-time (e.g. a relative time based
* dimension like "Past day", "Past 2 days", etc.) or that change for each request (e.g. distance
* from the user's location, "&lt; 1 km", "&lt; 2 km", etc.).
* {@link Facets} implementation that computes counts for dynamic long ranges. Use this for
* dimensions that change in real-time (e.g. a relative time based dimension like "Past day", "Past
* 2 days", etc.) or that change for each request (e.g. distance from the user's location, "&lt; 1
* km", "&lt; 2 km", etc.).
*
* @lucene.experimental
*/
public class LongRangeFacetCounts extends RangeFacetCounts {
/**
* Create {@code LongRangeFacetCounts}, using {@link LongValuesSource} from the specified field.
* Create {@code LongRangeFacetCounts} using long values from the specified field. The field may
* be single-valued ({@link NumericDocValues}) or multi-valued ({@link SortedNumericDocValues}),
* and will be interpreted as containing long values.
*/
public LongRangeFacetCounts(String field, FacetsCollector hits, LongRange... ranges)
throws IOException {
this(field, LongValuesSource.fromLongField(field), hits, ranges);
this(field, null, hits, ranges);
}
/** Create {@code LongRangeFacetCounts}, using the provided {@link LongValuesSource}. */
/**
* Create {@code LongRangeFacetCounts}, using the provided {@link LongValuesSource} if non-null.
* If {@code valueSource} is null, doc values from the provided {@code field} will be used.
*/
public LongRangeFacetCounts(
String field, LongValuesSource valueSource, FacetsCollector hits, LongRange... ranges)
throws IOException {
@@ -59,10 +59,11 @@ public class LongRangeFacetCounts extends RangeFacetCounts {
}
/**
* Create {@code LongRangeFacetCounts}, using the provided {@link LongValuesSource}, and using the
* provided Filter as a fastmatch: only documents passing the filter are checked for the matching
* ranges, which is helpful when the provided {@link LongValuesSource} is costly per-document,
* such as a geo distance. The filter must be random access (implement {@link DocIdSet#bits}).
* Create {@code LongRangeFacetCounts}, using the provided {@link LongValuesSource} if non-null.
* If {@code valueSource} is null, doc values from the provided {@code field} will be used. Use
the provided {@code Query} as a fastmatch: only documents matching the query are checked for
* the matching ranges, which is helpful when the provided {@link LongValuesSource} is costly
* per-document, such as a geo distance.
*/
public LongRangeFacetCounts(
String field,
@@ -72,67 +73,58 @@ public class LongRangeFacetCounts extends RangeFacetCounts {
LongRange... ranges)
throws IOException {
super(field, ranges, fastMatchQuery);
count(valueSource, hits.getMatchingDocs());
// use the provided valueSource if non-null, otherwise use the doc values associated with the
// field
if (valueSource != null) {
count(valueSource, hits.getMatchingDocs());
} else {
count(field, hits.getMatchingDocs());
}
}
/**
* Counts from the provided valueSource.
*
* <p>TODO: Seems like we could extract this into RangeFacetCounts and make the logic common
* between this class and DoubleRangeFacetCounts somehow. The blocker right now is that this
implementation expects LongValuesSource and DoubleRangeFacetCounts expects DoubleValuesSource.
*/
private void count(LongValuesSource valueSource, List<MatchingDocs> matchingDocs)
throws IOException {
LongRange[] ranges = (LongRange[]) this.ranges;
LongRange[] ranges = getLongRanges();
LongRangeCounter counter = new LongRangeCounter(ranges);
LongRangeCounter counter = LongRangeCounter.create(ranges, counts);
int missingCount = 0;
for (MatchingDocs hits : matchingDocs) {
LongValues fv = valueSource.getValues(hits.context, null);
totCount += hits.totalHits;
final DocIdSetIterator fastMatchDocs;
if (fastMatchQuery != null) {
final IndexReaderContext topLevelContext = ReaderUtil.getTopLevelContext(hits.context);
final IndexSearcher searcher = new IndexSearcher(topLevelContext);
searcher.setQueryCache(null);
final Weight fastMatchWeight =
searcher.createWeight(
searcher.rewrite(fastMatchQuery), ScoreMode.COMPLETE_NO_SCORES, 1);
Scorer s = fastMatchWeight.scorer(hits.context);
if (s == null) {
continue;
}
fastMatchDocs = s.iterator();
} else {
fastMatchDocs = null;
final DocIdSetIterator it = createIterator(hits);
if (it == null) {
continue;
}
DocIdSetIterator docs = hits.bits.iterator();
for (int doc = docs.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; ) {
if (fastMatchDocs != null) {
int fastMatchDoc = fastMatchDocs.docID();
if (fastMatchDoc < doc) {
fastMatchDoc = fastMatchDocs.advance(doc);
}
if (doc != fastMatchDoc) {
doc = docs.advance(fastMatchDoc);
continue;
}
}
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; ) {
// Skip missing docs:
if (fv.advanceExact(doc)) {
counter.add(fv.longValue());
counter.addSingleValued(fv.longValue());
} else {
missingCount++;
}
doc = docs.nextDoc();
doc = it.nextDoc();
}
}
int x = counter.fillCounts(counts);
missingCount += x;
// System.out.println("totCount " + totCount + " x " + x + " missingCount " + missingCount);
missingCount += counter.finish();
totCount -= missingCount;
}
@Override
protected LongRange[] getLongRanges() {
return (LongRange[]) this.ranges;
}
}
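A sketch combining the two behaviors this constructor now supports: a null valueSource (fall back to doc values for the field) plus a fastMatchQuery. Not part of this diff; the "timestamp" field is assumed to be indexed both as a LongPoint and as a SortedNumericDocValuesField:

import java.io.IOException;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.range.LongRange;
import org.apache.lucene.facet.range.LongRangeFacetCounts;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;

public class TimestampRangeFacetsSketch {
  static FacetResult recentCounts(IndexSearcher searcher, long nowSec) throws IOException {
    FacetsCollector fc = new FacetsCollector();
    searcher.search(new MatchAllDocsQuery(), fc);
    Facets facets =
        new LongRangeFacetCounts(
            "timestamp",
            null, // null valueSource: the new code path reads doc values for "timestamp"
            fc,
            // fastmatch: only docs from the past week are even checked against the ranges
            LongPoint.newRangeQuery("timestamp", nowSec - 7 * 86400, Long.MAX_VALUE),
            new LongRange("past hour", nowSec - 3600, true, nowSec, true),
            new LongRange("past day", nowSec - 86400, true, nowSec, true));
    return facets.getTopChildren(10, "timestamp");
  }
}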

View File: org/apache/lucene/facet/range/OverlappingLongRangeCounter.java (new file)

@@ -0,0 +1,383 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.range;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.util.FixedBitSet;
/**
* This implementation supports requested ranges that overlap. Because of this, we use a
* segment-tree to more efficiently aggregate counts into ranges at the end of processing. We also
* need to worry about double-counting issues since it's possible that multiple elementary
* intervals, although mutually exclusive, can roll up to the same requested range. This creates
* some complexity with how we need to handle multi-valued documents.
*/
class OverlappingLongRangeCounter extends LongRangeCounter {
/** segment tree root node */
private final LongRangeNode root;
/** elementary interval boundaries used for efficient counting (bsearch to find interval) */
private final long[] boundaries;
/**
* whether-or-not there are elementary interval counts that still need to be rolled up at the end
*/
private boolean hasUnflushedCounts;
// Needed only for counting single-valued docs:
/** counts seen in each elementary interval */
private int[] singleValuedElementaryIntervalCounts;
// Needed only for counting multi-valued docs:
/** whether-or-not an elementary interval has seen at least one match for a single doc */
private FixedBitSet multiValuedDocElementaryIntervalHits;
/** whether-or-not a requested range has seen at least one match for a single doc */
private FixedBitSet multiValuedDocRangeHits;
// Used during rollup
private int elementaryIntervalUpto;
/** number of counted documents that haven't matched any requested ranges */
private int missingCount;
OverlappingLongRangeCounter(LongRange[] ranges, int[] countBuffer) {
super(countBuffer);
// Build elementary intervals:
List<InclusiveRange> elementaryIntervals = buildElementaryIntervals(ranges);
// Build binary tree on top of intervals:
root = split(0, elementaryIntervals.size(), elementaryIntervals);
// Set outputs, so we know which range to output for each node in the tree:
for (int i = 0; i < ranges.length; i++) {
root.addOutputs(i, ranges[i]);
}
// Keep track of elementary interval max boundaries for bsearch:
boundaries = new long[elementaryIntervals.size()];
for (int i = 0; i < boundaries.length; i++) {
boundaries[i] = elementaryIntervals.get(i).end;
}
}
@Override
void startMultiValuedDoc() {
super.startMultiValuedDoc();
// Lazy init a bitset to track the elementary intervals we see of a multi-valued doc:
if (multiValuedDocElementaryIntervalHits == null) {
multiValuedDocElementaryIntervalHits = new FixedBitSet(boundaries.length);
} else {
multiValuedDocElementaryIntervalHits.clear(0, multiValuedDocElementaryIntervalHits.length());
}
}
@Override
boolean endMultiValuedDoc() {
assert multiValuedDocElementaryIntervalHits != null : "must call startMultiValuedDoc() first";
// Short-circuit if the caller didn't specify any ranges to count:
if (rangeCount() == 0) {
return false;
}
// Do the rollup for this doc:
// Lazy init a bitset to track the requested ranges seen for this multi-valued doc:
if (multiValuedDocRangeHits == null) {
multiValuedDocRangeHits = new FixedBitSet(rangeCount());
} else {
multiValuedDocRangeHits.clear(0, multiValuedDocRangeHits.length());
}
elementaryIntervalUpto = 0;
rollupMultiValued(root);
// Actually increment the count for each matching range, and see if the doc contributed to
// at least one:
boolean docContributedToAtLeastOneRange = false;
for (int i = multiValuedDocRangeHits.nextSetBit(0); i < multiValuedDocRangeHits.length(); ) {
increment(i);
docContributedToAtLeastOneRange = true;
if (++i < multiValuedDocRangeHits.length()) {
i = multiValuedDocRangeHits.nextSetBit(i);
}
}
return docContributedToAtLeastOneRange;
}
@Override
int finish() {
if (hasUnflushedCounts) {
// Rollup any outstanding counts from single-valued cases:
missingCount = 0;
elementaryIntervalUpto = 0;
rollupSingleValued(root, false);
return missingCount;
} else {
return 0;
}
}
@Override
protected long[] boundaries() {
return boundaries;
}
@Override
protected void processSingleValuedHit(int elementaryIntervalNum) {
// Lazy init:
if (singleValuedElementaryIntervalCounts == null) {
singleValuedElementaryIntervalCounts = new int[boundaries.length];
}
singleValuedElementaryIntervalCounts[elementaryIntervalNum]++;
hasUnflushedCounts = true;
}
@Override
protected void processMultiValuedHit(int elementaryIntervalNum) {
assert multiValuedDocElementaryIntervalHits != null : "must call startMultiValuedDoc() first";
multiValuedDocElementaryIntervalHits.set(elementaryIntervalNum);
}
private static LongRangeNode split(int start, int end, List<InclusiveRange> elementaryIntervals) {
if (start == end - 1) {
// leaf
InclusiveRange range = elementaryIntervals.get(start);
return new LongRangeNode(range.start, range.end, null, null, start);
} else {
int mid = (start + end) >>> 1;
LongRangeNode left = split(start, mid, elementaryIntervals);
LongRangeNode right = split(mid, end, elementaryIntervals);
return new LongRangeNode(left.start, right.end, left, right, -1);
}
}
/**
* Rolls up all the single-valued doc counts. Note that this is done once at the end of processing
* all documents (as part of {@link #finish()}). This is done in bulk at the end for efficiency
* purposes (vs. after every document). This works only for cases where documents have a
* single value. Multi-valued docs need to get rolled up after each document to ensure there's no
* double-counting (see {@link #rollupMultiValued(LongRangeNode)})
*/
private int rollupSingleValued(LongRangeNode node, boolean sawOutputs) {
int count;
sawOutputs |= node.outputs != null;
if (node.left != null) {
count = rollupSingleValued(node.left, sawOutputs);
count += rollupSingleValued(node.right, sawOutputs);
} else {
// Leaf:
count = singleValuedElementaryIntervalCounts[elementaryIntervalUpto];
elementaryIntervalUpto++;
if (sawOutputs == false) {
// This is a missing count (no output ranges were seen "above" us):
missingCount += count;
}
}
if (node.outputs != null) {
for (int rangeIndex : node.outputs) {
increment(rangeIndex, count);
}
}
return count;
}
/**
* Rolls up all the multi-valued doc counts. Note that this is done at the end of each document
* (as part of {@link #endMultiValuedDoc()}). All of the counts contributed by a single document
* get rolled up into the appropriate ranges in this step. It must be done after each document so
* that counts don't get double-counted, and so we know whether-or-not an individual doc actually
* contributed to any of the user-requested ranges.
*/
private boolean rollupMultiValued(LongRangeNode node) {
boolean containedHit;
if (node.left != null) {
containedHit = rollupMultiValued(node.left);
containedHit |= rollupMultiValued(node.right);
} else {
// Leaf:
containedHit = multiValuedDocElementaryIntervalHits.get(elementaryIntervalUpto);
elementaryIntervalUpto++;
}
if (containedHit && node.outputs != null) {
for (int rangeIndex : node.outputs) {
multiValuedDocRangeHits.set(rangeIndex);
}
}
return containedHit;
}
private static List<InclusiveRange> buildElementaryIntervals(LongRange[] ranges) {
// Maps all range inclusive endpoints to int flags; 1
// = start of interval, 2 = end of interval. We need to
// track the start vs end case separately because if a
// given point is both, then it must be its own
// elementary interval:
Map<Long, Integer> endsMap = new HashMap<>();
endsMap.put(Long.MIN_VALUE, 1);
endsMap.put(Long.MAX_VALUE, 2);
for (LongRange range : ranges) {
Integer cur = endsMap.get(range.min);
if (cur == null) {
endsMap.put(range.min, 1);
} else {
endsMap.put(range.min, cur | 1);
}
cur = endsMap.get(range.max);
if (cur == null) {
endsMap.put(range.max, 2);
} else {
endsMap.put(range.max, cur | 2);
}
}
List<Long> endsList = new ArrayList<>(endsMap.keySet());
Collections.sort(endsList);
// Build elementaryIntervals (a 1D Venn diagram):
List<InclusiveRange> elementaryIntervals = new ArrayList<>();
int upto = 1;
long v = endsList.get(0);
long prev;
if (endsMap.get(v) == 3) {
elementaryIntervals.add(new InclusiveRange(v, v));
prev = v + 1;
} else {
prev = v;
}
while (upto < endsList.size()) {
v = endsList.get(upto);
int flags = endsMap.get(v);
if (flags == 3) {
// This point is both an end and a start; we need to
// separate it:
if (v > prev) {
elementaryIntervals.add(new InclusiveRange(prev, v - 1));
}
elementaryIntervals.add(new InclusiveRange(v, v));
prev = v + 1;
} else if (flags == 1) {
// This point is only the start of an interval;
// attach it to next interval:
if (v > prev) {
elementaryIntervals.add(new InclusiveRange(prev, v - 1));
}
prev = v;
} else {
assert flags == 2;
// This point is only the end of an interval; attach
// it to last interval:
elementaryIntervals.add(new InclusiveRange(prev, v));
prev = v + 1;
}
upto++;
}
return elementaryIntervals;
}
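A worked example of the interval construction above, assuming two overlapping requested ranges A=[0,100] and B=[50,150] (a standalone sketch, not part of this diff):

public class ElementaryIntervalsSketch {
  public static void main(String[] args) {
    // Requested (overlapping) ranges: A=[0,100], B=[50,150]. The endpoint flags
    // produce five elementary intervals (a 1D Venn diagram):
    //   [MIN_VALUE, -1]  outside both
    //   [0, 49]          A only
    //   [50, 100]        A and B
    //   [101, 150]       B only
    //   [151, MAX_VALUE] outside both
    // The segment tree is then built over these five leaves; addOutputs records
    // that A covers leaves {1,2} and B covers leaves {2,3}, so a hit in leaf 2
    // rolls up into both ranges.
    long[] boundaries = {-1L, 49L, 100L, 150L, Long.MAX_VALUE};
    System.out.println(java.util.Arrays.toString(boundaries));
  }
}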
/** Holds one node of the segment tree. */
public static final class LongRangeNode {
final LongRangeNode left;
final LongRangeNode right;
// Our range, inclusive:
final long start;
final long end;
// If we are a leaf, the index into elementary ranges that we point to:
final int elementaryIntervalIndex;
// Which range indices to output when a query goes
// through this node:
List<Integer> outputs;
public LongRangeNode(
long start,
long end,
LongRangeNode left,
LongRangeNode right,
int elementaryIntervalIndex) {
this.start = start;
this.end = end;
this.left = left;
this.right = right;
this.elementaryIntervalIndex = elementaryIntervalIndex;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
toString(sb, 0);
return sb.toString();
}
static void indent(StringBuilder sb, int depth) {
for (int i = 0; i < depth; i++) {
sb.append(" ");
}
}
/** Recursively assigns range outputs to each node. */
void addOutputs(int index, LongRange range) {
if (start >= range.min && end <= range.max) {
// Our range is fully included in the incoming
// range; add to our output list:
if (outputs == null) {
outputs = new ArrayList<>();
}
outputs.add(index);
} else if (left != null) {
assert right != null;
// Recurse:
left.addOutputs(index, range);
right.addOutputs(index, range);
}
}
void toString(StringBuilder sb, int depth) {
indent(sb, depth);
if (left == null) {
assert right == null;
sb.append("leaf: ").append(start).append(" to ").append(end);
} else {
sb.append("node: ").append(start).append(" to ").append(end);
}
if (outputs != null) {
sb.append(" outputs=");
sb.append(outputs);
}
sb.append('\n');
if (left != null) {
assert right != null;
left.toString(sb, depth + 1);
right.toString(sb, depth + 1);
}
}
}
}
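Continuing that example, this is why multi-valued docs must be rolled up per document: a doc with values {60, 70} hits the "A and B" leaf twice, but the FixedBitSet collapses that to a single mark, and the per-doc rollup then sets each covering range at most once. A standalone sketch with the tree coverage hardcoded (the real class walks the segment tree instead):

import org.apache.lucene.util.FixedBitSet;

public class PerDocRollupSketch {
  public static void main(String[] args) {
    // Five elementary-interval leaves from the previous sketch; a doc with
    // values {60, 70} lands in leaf 2 ("A and B") twice:
    FixedBitSet leafHits = new FixedBitSet(5);
    leafHits.set(2);
    leafHits.set(2); // second value: same leaf, still just one mark
    // Rollup: A covers leaves {1,2}, B covers leaves {2,3}; each range is
    // marked at most once per doc, so A and B each get +1, never +2:
    FixedBitSet rangeHits = new FixedBitSet(2);
    if (leafHits.get(1) || leafHits.get(2)) rangeHits.set(0); // range A
    if (leafHits.get(2) || leafHits.get(3)) rangeHits.set(1); // range B
    int[] counts = new int[2];
    for (int r = 0; r < 2; r++) {
      if (rangeHits.get(r)) counts[r]++;
    }
    System.out.println(java.util.Arrays.toString(counts)); // [1, 1]
  }
}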

View File: org/apache/lucene/facet/range/RangeFacetCounts.java

@@ -17,12 +17,25 @@
package org.apache.lucene.facet.range;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.ConjunctionUtils;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
/**
* Base class for range faceting.
@@ -57,6 +70,147 @@ abstract class RangeFacetCounts extends Facets {
counts = new int[ranges.length];
}
/**
* Create a {@link org.apache.lucene.search.DocIdSetIterator} from the provided {@code hits} that
* relies on {@code fastMatchQuery} if available for first-pass filtering. A null response
* indicates no documents will match.
*/
protected DocIdSetIterator createIterator(FacetsCollector.MatchingDocs hits) throws IOException {
if (fastMatchQuery != null) {
final IndexReaderContext topLevelContext = ReaderUtil.getTopLevelContext(hits.context);
final IndexSearcher searcher = new IndexSearcher(topLevelContext);
searcher.setQueryCache(null);
final Weight fastMatchWeight =
searcher.createWeight(searcher.rewrite(fastMatchQuery), ScoreMode.COMPLETE_NO_SCORES, 1);
final Scorer s = fastMatchWeight.scorer(hits.context);
if (s == null) {
return null; // no hits from the fastMatchQuery; return null
} else {
DocIdSetIterator fastMatchDocs = s.iterator();
return ConjunctionUtils.intersectIterators(
Arrays.asList(hits.bits.iterator(), fastMatchDocs));
}
} else {
return hits.bits.iterator();
}
}
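This conjunction replaces the hand-rolled leapfrog loops deleted from both facet-count classes above. A tiny standalone demonstration of the same utility over two ad-hoc iterators (not part of this diff):

import java.util.Arrays;
import org.apache.lucene.search.ConjunctionUtils;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.FixedBitSet;

public class IntersectIteratorsSketch {
  public static void main(String[] args) throws Exception {
    // "hits" contains docs 0..9; the fastmatch side only matched docs 3 and 7:
    DocIdSetIterator hits = DocIdSetIterator.all(10);
    FixedBitSet fastMatch = new FixedBitSet(10);
    fastMatch.set(3);
    fastMatch.set(7);
    DocIdSetIterator conj =
        ConjunctionUtils.intersectIterators(
            Arrays.asList(hits, new BitSetIterator(fastMatch, 2)));
    for (int doc = conj.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = conj.nextDoc()) {
      System.out.println(doc); // prints 3, then 7
    }
  }
}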
protected abstract LongRange[] getLongRanges();
protected long mapDocValue(long l) {
return l;
}
/** Counts from the provided field. */
protected void count(String field, List<FacetsCollector.MatchingDocs> matchingDocs)
throws IOException {
// load doc values for all segments up front and keep track of whether-or-not we found any that
// were actually multi-valued. this allows us to optimize the case where all segments contain
// single values.
SortedNumericDocValues[] multiValuedDocVals = new SortedNumericDocValues[matchingDocs.size()];
NumericDocValues[] singleValuedDocVals = null;
boolean foundMultiValued = false;
for (int i = 0; i < matchingDocs.size(); i++) {
FacetsCollector.MatchingDocs hits = matchingDocs.get(i);
SortedNumericDocValues multiValues = DocValues.getSortedNumeric(hits.context.reader(), field);
multiValuedDocVals[i] = multiValues;
// only bother trying to unwrap a singleton if we haven't yet seen any true multi-valued cases
if (foundMultiValued == false) {
NumericDocValues singleValues = DocValues.unwrapSingleton(multiValues);
if (singleValues != null) {
if (singleValuedDocVals == null) {
singleValuedDocVals = new NumericDocValues[matchingDocs.size()];
}
singleValuedDocVals[i] = singleValues;
} else {
foundMultiValued = true;
}
}
}
// we only need to keep around one or the other at this point
if (foundMultiValued) {
singleValuedDocVals = null;
} else {
multiValuedDocVals = null;
}
LongRangeCounter counter = LongRangeCounter.create(getLongRanges(), counts);
int missingCount = 0;
// if we didn't find any multi-valued cases, we can run a more optimal counting algorithm
if (foundMultiValued == false) {
for (int i = 0; i < matchingDocs.size(); i++) {
FacetsCollector.MatchingDocs hits = matchingDocs.get(i);
final DocIdSetIterator it = createIterator(hits);
if (it == null) {
continue;
}
assert singleValuedDocVals != null;
NumericDocValues singleValues = singleValuedDocVals[i];
totCount += hits.totalHits;
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; ) {
if (singleValues.advanceExact(doc)) {
counter.addSingleValued(mapDocValue(singleValues.longValue()));
} else {
missingCount++;
}
doc = it.nextDoc();
}
}
} else {
for (int i = 0; i < matchingDocs.size(); i++) {
final DocIdSetIterator it = createIterator(matchingDocs.get(i));
if (it == null) {
continue;
}
SortedNumericDocValues multiValues = multiValuedDocVals[i];
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; ) {
if (multiValues.advanceExact(doc)) {
int limit = multiValues.docValueCount();
// optimize single-value case
if (limit == 1) {
counter.addSingleValued(mapDocValue(multiValues.nextValue()));
totCount++;
} else {
counter.startMultiValuedDoc();
for (int j = 0; j < limit; j++) {
counter.addMultiValued(mapDocValue(multiValues.nextValue()));
}
if (counter.endMultiValuedDoc()) {
totCount++;
}
}
}
doc = it.nextDoc();
}
}
}
missingCount += counter.finish();
totCount -= missingCount;
}
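The up-front singleton probe is what the single-valued fast path hinges on: DocValues.unwrapSingleton returns the backing NumericDocValues when a SortedNumericDocValues instance was encoded from single-valued data, and null otherwise. A sketch of the per-segment check (not part of this diff; assumes a LeafReader in hand):

import java.io.IOException;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedNumericDocValues;

public class SingletonProbeSketch {
  /** Returns true if this segment's field can use the single-valued fast path. */
  static boolean canUseSingleValuedPath(LeafReader reader, String field) throws IOException {
    SortedNumericDocValues multi = DocValues.getSortedNumeric(reader, field);
    NumericDocValues single = DocValues.unwrapSingleton(multi);
    return single != null; // null: this segment's values were not encoded as single-valued
  }
}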
@Override
public FacetResult getTopChildren(int topN, String dim, String... path) {
if (dim.equals(field) == false) {

View File: org/apache/lucene/facet/range/TestRangeFacetCounts.java

@@ -16,6 +16,7 @@
*/
package org.apache.lucene.facet.range;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
@@ -26,6 +27,7 @@ import org.apache.lucene.document.DoublePoint;
import org.apache.lucene.document.DoublePoint;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedNumericDocValuesField;
import org.apache.lucene.facet.DrillDownQuery;
import org.apache.lucene.facet.DrillSideways;
import org.apache.lucene.facet.DrillSideways.DrillSidewaysResult;
@@ -104,6 +106,103 @@ public class TestRangeFacetCounts extends FacetTestCase {
d.close();
}
public void testBasicLongMultiValued() throws Exception {
Directory d = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), d);
Document doc = new Document();
// just index the same value twice each time and make sure we don't double count
SortedNumericDocValuesField field1 = new SortedNumericDocValuesField("field", 0L);
SortedNumericDocValuesField field2 = new SortedNumericDocValuesField("field", 0L);
doc.add(field1);
doc.add(field2);
for (long l = 0; l < 100; l++) {
field1.setLongValue(l);
field2.setLongValue(l);
w.addDocument(doc);
}
// Also add Long.MAX_VALUE
field1.setLongValue(Long.MAX_VALUE);
field2.setLongValue(Long.MAX_VALUE);
w.addDocument(doc);
IndexReader r = w.getReader();
w.close();
FacetsCollector fc = new FacetsCollector();
IndexSearcher s = newSearcher(r);
s.search(new MatchAllDocsQuery(), fc);
Facets facets =
new LongRangeFacetCounts(
"field",
fc,
new LongRange("less than 10", 0L, true, 10L, false),
new LongRange("less than or equal to 10", 0L, true, 10L, true),
new LongRange("over 90", 90L, false, 100L, false),
new LongRange("90 or above", 90L, true, 100L, false),
new LongRange("over 1000", 1000L, false, Long.MAX_VALUE, true));
FacetResult result = facets.getTopChildren(10, "field");
assertEquals(
"dim=field path=[] value=22 childCount=5\n less than 10 (10)\n less than or equal to 10 (11)\n over 90 (9)\n 90 or above (10)\n over 1000 (1)\n",
result.toString());
r.close();
d.close();
}
public void testBasicLongMultiValuedMixedSegmentTypes() throws Exception {
Directory d = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), d);
SortedNumericDocValuesField field1 = new SortedNumericDocValuesField("field", 0L);
SortedNumericDocValuesField field2 = new SortedNumericDocValuesField("field", 0L);
// write docs as two segments (50 in each). the first segment will contain a mix of single- and
// multi-value cases, while the second segment will be all single values.
for (int l = 0; l < 100; l++) {
field1.setLongValue(l);
field2.setLongValue(l);
Document doc = new Document();
doc.add(field1);
if (l == 0) {
doc.add(field2);
} else if (l < 50) {
if (random().nextBoolean()) {
doc.add(field2);
}
}
w.addDocument(doc);
if (l == 50) {
w.commit();
}
}
IndexReader r = w.getReader();
w.close();
FacetsCollector fc = new FacetsCollector();
IndexSearcher s = newSearcher(r);
s.search(new MatchAllDocsQuery(), fc);
Facets facets =
new LongRangeFacetCounts(
"field",
fc,
new LongRange("less than 10", 0L, true, 10L, false),
new LongRange("less than or equal to 10", 0L, true, 10L, true),
new LongRange("over 90", 90L, false, 100L, false),
new LongRange("90 or above", 90L, true, 100L, false),
new LongRange("over 1000", 1000L, false, Long.MAX_VALUE, true));
FacetResult result = facets.getTopChildren(10, "field");
assertEquals(
"dim=field path=[] value=21 childCount=5\n less than 10 (10)\n less than or equal to 10 (11)\n over 90 (9)\n 90 or above (10)\n over 1000 (0)\n",
result.toString());
r.close();
d.close();
}
public void testLongGetAllDims() throws Exception {
Directory d = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), d);
@@ -248,6 +347,63 @@ public class TestRangeFacetCounts extends FacetTestCase {
d.close();
}
public void testEmptyRangesSingleValued() throws Exception {
Directory d = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), d);
Document doc = new Document();
NumericDocValuesField field = new NumericDocValuesField("field", 0L);
doc.add(field);
for (long l = 0; l < 100; l++) {
field.setLongValue(l);
w.addDocument(doc);
}
IndexReader r = w.getReader();
w.close();
FacetsCollector fc = new FacetsCollector();
IndexSearcher s = newSearcher(r);
s.search(new MatchAllDocsQuery(), fc);
Facets facets = new LongRangeFacetCounts("field", fc);
FacetResult result = facets.getTopChildren(10, "field");
assertEquals("dim=field path=[] value=0 childCount=0\n", result.toString());
r.close();
d.close();
}
public void testEmptyRangesMultiValued() throws Exception {
Directory d = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), d);
Document doc = new Document();
SortedNumericDocValuesField field1 = new SortedNumericDocValuesField("field", 0L);
SortedNumericDocValuesField field2 = new SortedNumericDocValuesField("field", 0L);
doc.add(field1);
doc.add(field2);
for (long l = 0; l < 100; l++) {
field1.setLongValue(l);
field2.setLongValue(l);
w.addDocument(doc);
}
IndexReader r = w.getReader();
w.close();
FacetsCollector fc = new FacetsCollector();
IndexSearcher s = newSearcher(r);
s.search(new MatchAllDocsQuery(), fc);
Facets facets = new LongRangeFacetCounts("field", fc);
FacetResult result = facets.getTopChildren(10, "field");
assertEquals("dim=field path=[] value=0 childCount=0\n", result.toString());
r.close();
d.close();
}
/**
* Tests single request that mixes Range and non-Range faceting, with DrillSideways and taxonomy.
*/
@@ -402,7 +558,96 @@
IOUtils.close(r, d);
}
public void testRandomLongs() throws Exception {
public void testBasicDoubleMultiValued() throws Exception {
Directory d = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), d);
Document doc = new Document();
// index the same value twice and make sure we don't double count
SortedNumericDocValuesField field1 = new SortedNumericDocValuesField("field", 0);
SortedNumericDocValuesField field2 = new SortedNumericDocValuesField("field", 0);
doc.add(field1);
doc.add(field2);
for (long l = 0; l < 100; l++) {
field1.setLongValue(NumericUtils.doubleToSortableLong(l));
field2.setLongValue(NumericUtils.doubleToSortableLong(l));
w.addDocument(doc);
}
IndexReader r = w.getReader();
FacetsCollector fc = new FacetsCollector();
IndexSearcher s = newSearcher(r);
s.search(new MatchAllDocsQuery(), fc);
Facets facets =
new DoubleRangeFacetCounts(
"field",
fc,
new DoubleRange("less than 10", 0.0, true, 10.0, false),
new DoubleRange("less than or equal to 10", 0.0, true, 10.0, true),
new DoubleRange("over 90", 90.0, false, 100.0, false),
new DoubleRange("90 or above", 90.0, true, 100.0, false),
new DoubleRange("over 1000", 1000.0, false, Double.POSITIVE_INFINITY, false));
assertEquals(
"dim=field path=[] value=21 childCount=5\n less than 10 (10)\n less than or equal to 10 (11)\n over 90 (9)\n 90 or above (10)\n over 1000 (0)\n",
facets.getTopChildren(10, "field").toString());
w.close();
IOUtils.close(r, d);
}
public void testBasicDoubleMultiValuedMixedSegmentTypes() throws Exception {
Directory d = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), d);
SortedNumericDocValuesField field1 = new SortedNumericDocValuesField("field", 0L);
SortedNumericDocValuesField field2 = new SortedNumericDocValuesField("field", 0L);
// write docs in two segments: the first (docs 0-50) contains a mix of single- and
// multi-valued cases, while the second (docs 51-99) is entirely single-valued.
for (int l = 0; l < 100; l++) {
field1.setLongValue(NumericUtils.doubleToSortableLong(l));
field2.setLongValue(NumericUtils.doubleToSortableLong(l));
Document doc = new Document();
doc.add(field1);
if (l == 0) {
doc.add(field2);
} else if (l < 50) {
if (random().nextBoolean()) {
doc.add(field2);
}
}
w.addDocument(doc);
if (l == 50) {
w.commit();
}
}
IndexReader r = w.getReader();
w.close();
FacetsCollector fc = new FacetsCollector();
IndexSearcher s = newSearcher(r);
s.search(new MatchAllDocsQuery(), fc);
Facets facets =
new DoubleRangeFacetCounts(
"field",
fc,
new DoubleRange("less than 10", 0.0, true, 10.0, false),
new DoubleRange("less than or equal to 10", 0.0, true, 10.0, true),
new DoubleRange("over 90", 90.0, false, 100.0, false),
new DoubleRange("90 or above", 90.0, true, 100.0, false),
new DoubleRange("over 1000", 1000.0, false, Double.POSITIVE_INFINITY, false));
FacetResult result = facets.getTopChildren(10, "field");
assertEquals(
"dim=field path=[] value=21 childCount=5\n less than 10 (10)\n less than or equal to 10 (11)\n over 90 (9)\n 90 or above (10)\n over 1000 (0)\n",
result.toString());
r.close();
d.close();
}
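// A sketch of the per-segment specialization this test exercises: a segment whose
// docs all happen to be single-valued can be unwrapped to the cheaper
// NumericDocValues view via the standard DocValues helper (whether the counting
// code takes exactly this path is an implementation detail):
//
//   SortedNumericDocValues multi = DocValues.getSortedNumeric(leafReader, "field");
//   NumericDocValues single = DocValues.unwrapSingleton(multi); // null if truly multi-valued
//   if (single != null) {
//     // every doc in this segment has at most one value; iterate single directly
//   } else {
//     // genuine multi-valued iteration over docValueCount()/nextValue()
//   }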
public void testRandomLongsSingleValued() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
@@ -550,7 +795,166 @@ public class TestRangeFacetCounts extends FacetTestCase {
IOUtils.close(r, dir);
}
public void testRandomDoubles() throws Exception {
public void testRandomLongsMultiValued() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
int numDocs = atLeast(1000);
if (VERBOSE) {
System.out.println("TEST: numDocs=" + numDocs);
}
long[][] values = new long[numDocs][];
long minValue = Long.MAX_VALUE;
long maxValue = Long.MIN_VALUE;
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
int numVals = RandomNumbers.randomIntBetween(random(), 1, 50);
if (random().nextInt(10) == 0) {
numVals = 1; // make sure we have ample testing of single-value cases
}
values[i] = new long[numVals];
for (int j = 0; j < numVals; j++) {
long v = random().nextLong();
values[i][j] = v;
doc.add(new SortedNumericDocValuesField("field", v));
doc.add(new LongPoint("field", v));
minValue = Math.min(minValue, v);
maxValue = Math.max(maxValue, v);
}
w.addDocument(doc);
}
IndexReader r = w.getReader();
IndexSearcher s = newSearcher(r, false);
FacetsConfig config = new FacetsConfig();
int numIters = atLeast(10);
for (int iter = 0; iter < numIters; iter++) {
if (VERBOSE) {
System.out.println("TEST: iter=" + iter);
}
int numRange = TestUtil.nextInt(random(), 1, 100);
LongRange[] ranges = new LongRange[numRange];
int[] expectedCounts = new int[numRange];
long minAcceptedValue = Long.MAX_VALUE;
long maxAcceptedValue = Long.MIN_VALUE;
for (int rangeID = 0; rangeID < numRange; rangeID++) {
long min;
if (rangeID > 0 && random().nextInt(10) == 7) {
// Use an existing boundary:
LongRange prevRange = ranges[random().nextInt(rangeID)];
if (random().nextBoolean()) {
min = prevRange.min;
} else {
min = prevRange.max;
}
} else {
min = random().nextLong();
}
long max;
if (rangeID > 0 && random().nextInt(10) == 7) {
// Use an existing boundary:
LongRange prevRange = ranges[random().nextInt(rangeID)];
if (random().nextBoolean()) {
max = prevRange.min;
} else {
max = prevRange.max;
}
} else {
max = random().nextLong();
}
if (min > max) {
long x = min;
min = max;
max = x;
}
boolean minIncl;
boolean maxIncl;
// NOTE: max - min >= 0 is here to handle the common overflow case!
if (max - min >= 0 && max - min < 2) {
// If max == min or max == min+1, we always use inclusive bounds; otherwise we might
// build an empty range and hit an exception from LongRange's ctor:
minIncl = true;
maxIncl = true;
} else {
minIncl = random().nextBoolean();
maxIncl = random().nextBoolean();
}
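// Worked example of the overflow guard above: with min = Long.MIN_VALUE and
// max = Long.MAX_VALUE, max - min wraps around to -1, so "max - min < 2" alone
// would wrongly take the always-inclusive branch; requiring max - min >= 0 first
// rules out that wrap-around.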
ranges[rangeID] = new LongRange("r" + rangeID, min, minIncl, max, maxIncl);
if (VERBOSE) {
System.out.println(" range " + rangeID + ": " + ranges[rangeID]);
}
// Do "slow but hopefully correct" computation of
// expected count:
for (int i = 0; i < numDocs; i++) {
for (int j = 0; j < values[i].length; j++) {
boolean accept = true;
if (minIncl) {
accept &= values[i][j] >= min;
} else {
accept &= values[i][j] > min;
}
if (maxIncl) {
accept &= values[i][j] <= max;
} else {
accept &= values[i][j] < max;
}
if (accept) {
expectedCounts[rangeID]++;
minAcceptedValue = Math.min(minAcceptedValue, values[i][j]);
maxAcceptedValue = Math.max(maxAcceptedValue, values[i][j]);
break; // ensure each doc can contribute at most 1 count to each range
}
}
}
}
FacetsCollector sfc = new FacetsCollector();
s.search(new MatchAllDocsQuery(), sfc);
Query fastMatchQuery;
if (random().nextBoolean()) {
if (random().nextBoolean()) {
fastMatchQuery = LongPoint.newRangeQuery("field", minValue, maxValue);
} else {
fastMatchQuery = LongPoint.newRangeQuery("field", minAcceptedValue, maxAcceptedValue);
}
} else {
fastMatchQuery = null;
}
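// When non-null, the fast-match query only pre-filters which hits get checked
// against the ranges, so both the loose [minValue, maxValue] bounds and the tight
// [minAcceptedValue, maxAcceptedValue] bounds must leave the counts unchanged.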
Facets facets = new LongRangeFacetCounts("field", null, sfc, fastMatchQuery, ranges);
FacetResult result = facets.getTopChildren(10, "field");
assertEquals(numRange, result.labelValues.length);
for (int rangeID = 0; rangeID < numRange; rangeID++) {
if (VERBOSE) {
System.out.println(" range " + rangeID + " expectedCount=" + expectedCounts[rangeID]);
}
LabelAndValue subNode = result.labelValues[rangeID];
assertEquals("r" + rangeID, subNode.label);
assertEquals(expectedCounts[rangeID], subNode.value.intValue());
LongRange range = ranges[rangeID];
// Test drill-down:
DrillDownQuery ddq = new DrillDownQuery(config);
if (random().nextBoolean()) {
ddq.add("field", LongPoint.newRangeQuery("field", range.min, range.max));
} else {
ddq.add(
"field",
SortedNumericDocValuesField.newSlowRangeQuery("field", range.min, range.max));
}
assertEquals(expectedCounts[rangeID], s.count(ddq));
}
}
w.close();
IOUtils.close(r, dir);
}
public void testRandomDoublesSingleValued() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
@@ -694,6 +1098,162 @@ public class TestRangeFacetCounts extends FacetTestCase {
IOUtils.close(r, dir);
}
public void testRandomDoublesMultiValued() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), dir);
int numDocs = atLeast(1000);
double[][] values = new double[numDocs][];
double minValue = Double.POSITIVE_INFINITY;
double maxValue = Double.NEGATIVE_INFINITY;
for (int i = 0; i < numDocs; i++) {
Document doc = new Document();
int numVals = RandomNumbers.randomIntBetween(random(), 1, 50);
if (random().nextInt(10) == 0) {
numVals = 1; // make sure we have ample testing of single-value cases
}
values[i] = new double[numVals];
for (int j = 0; j < numVals; j++) {
double v = random().nextDouble();
values[i][j] = v;
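// For the non-negative values nextDouble() produces, Double.doubleToLongBits(v)
// equals NumericUtils.doubleToSortableLong(v) (the sign bit is clear, so the
// sortable transform is a no-op), keeping this consistent with the sortable
// encoding used in testBasicDoubleMultiValued.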
doc.add(new SortedNumericDocValuesField("field", Double.doubleToLongBits(v)));
doc.add(new DoublePoint("field", v));
minValue = Math.min(minValue, v);
maxValue = Math.max(maxValue, v);
}
w.addDocument(doc);
}
IndexReader r = w.getReader();
IndexSearcher s = newSearcher(r, false);
FacetsConfig config = new FacetsConfig();
int numIters = atLeast(10);
for (int iter = 0; iter < numIters; iter++) {
if (VERBOSE) {
System.out.println("TEST: iter=" + iter);
}
int numRange = TestUtil.nextInt(random(), 1, 5);
DoubleRange[] ranges = new DoubleRange[numRange];
int[] expectedCounts = new int[numRange];
double minAcceptedValue = Double.POSITIVE_INFINITY;
double maxAcceptedValue = Double.NEGATIVE_INFINITY;
for (int rangeID = 0; rangeID < numRange; rangeID++) {
double min;
if (rangeID > 0 && random().nextInt(10) == 7) {
// Use an existing boundary:
DoubleRange prevRange = ranges[random().nextInt(rangeID)];
if (random().nextBoolean()) {
min = prevRange.min;
} else {
min = prevRange.max;
}
} else {
min = random().nextDouble();
}
double max;
if (rangeID > 0 && random().nextInt(10) == 7) {
// Use an existing boundary:
DoubleRange prevRange = ranges[random().nextInt(rangeID)];
if (random().nextBoolean()) {
max = prevRange.min;
} else {
max = prevRange.max;
}
} else {
max = random().nextDouble();
}
if (min > max) {
double x = min;
min = max;
max = x;
}
boolean minIncl;
boolean maxIncl;
long minAsLong = NumericUtils.doubleToSortableLong(min);
long maxAsLong = NumericUtils.doubleToSortableLong(max);
// NOTE: maxAsLong - minAsLong >= 0 is here to handle the common overflow case!
if (maxAsLong - minAsLong >= 0 && maxAsLong - minAsLong < 2) {
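// If max == min or max == min+1 in sortable-long space, always use inclusive
// bounds; otherwise we might build an empty range and hit an exception from
// DoubleRange's ctor: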
minIncl = true;
maxIncl = true;
} else {
minIncl = random().nextBoolean();
maxIncl = random().nextBoolean();
}
ranges[rangeID] = new DoubleRange("r" + rangeID, min, minIncl, max, maxIncl);
// Do "slow but hopefully correct" computation of
// expected count:
for (int i = 0; i < numDocs; i++) {
for (int j = 0; j < values[i].length; j++) {
boolean accept = true;
if (minIncl) {
accept &= values[i][j] >= min;
} else {
accept &= values[i][j] > min;
}
if (maxIncl) {
accept &= values[i][j] <= max;
} else {
accept &= values[i][j] < max;
}
if (accept) {
expectedCounts[rangeID]++;
minAcceptedValue = Math.min(minAcceptedValue, values[i][j]);
maxAcceptedValue = Math.max(maxAcceptedValue, values[i][j]);
break; // ensure each doc can contribute at most 1 count to each range
}
}
}
}
FacetsCollector sfc = new FacetsCollector();
s.search(new MatchAllDocsQuery(), sfc);
Query fastMatchFilter;
if (random().nextBoolean()) {
if (random().nextBoolean()) {
fastMatchFilter = DoublePoint.newRangeQuery("field", minValue, maxValue);
} else {
fastMatchFilter = DoublePoint.newRangeQuery("field", minAcceptedValue, maxAcceptedValue);
}
} else {
fastMatchFilter = null;
}
Facets facets = new DoubleRangeFacetCounts("field", null, sfc, fastMatchFilter, ranges);
FacetResult result = facets.getTopChildren(10, "field");
assertEquals(numRange, result.labelValues.length);
for (int rangeID = 0; rangeID < numRange; rangeID++) {
if (VERBOSE) {
System.out.println(" range " + rangeID + " expectedCount=" + expectedCounts[rangeID]);
}
LabelAndValue subNode = result.labelValues[rangeID];
assertEquals("r" + rangeID, subNode.label);
assertEquals(expectedCounts[rangeID], subNode.value.intValue());
DoubleRange range = ranges[rangeID];
// Test drill-down:
DrillDownQuery ddq = new DrillDownQuery(config);
if (random().nextBoolean()) {
ddq.add("field", DoublePoint.newRangeQuery("field", range.min, range.max));
} else {
ddq.add(
"field",
SortedNumericDocValuesField.newSlowRangeQuery(
"field", Double.doubleToLongBits(range.min), Double.doubleToLongBits(range.max)));
}
assertEquals(expectedCounts[rangeID], s.count(ddq));
}
}
w.close();
IOUtils.close(r, dir);
}
// LUCENE-5178
public void testMissingValues() throws Exception {
Directory d = newDirectory();
@@ -735,6 +1295,50 @@ public class TestRangeFacetCounts extends FacetTestCase {
IOUtils.close(r, d);
}
public void testMissingValuesMultiValued() throws Exception {
Directory d = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random(), d);
Document doc = new Document();
// index the same field twice to test multi-valued logic
SortedNumericDocValuesField field1 = new SortedNumericDocValuesField("field", 0L);
SortedNumericDocValuesField field2 = new SortedNumericDocValuesField("field", 0L);
doc.add(field1);
doc.add(field2);
for (long l = 0; l < 100; l++) {
if (l % 5 == 0) {
// Every 5th doc is missing the value:
w.addDocument(new Document());
continue;
}
field1.setLongValue(l);
field2.setLongValue(l);
w.addDocument(doc);
}
IndexReader r = w.getReader();
FacetsCollector fc = new FacetsCollector();
IndexSearcher s = newSearcher(r);
s.search(new MatchAllDocsQuery(), fc);
Facets facets =
new LongRangeFacetCounts(
"field",
fc,
new LongRange("less than 10", 0L, true, 10L, false),
new LongRange("less than or equal to 10", 0L, true, 10L, true),
new LongRange("over 90", 90L, false, 100L, false),
new LongRange("90 or above", 90L, true, 100L, false),
new LongRange("over 1000", 1000L, false, Long.MAX_VALUE, false));
assertEquals(
"dim=field path=[] value=16 childCount=5\n less than 10 (8)\n less than or equal to 10 (8)\n over 90 (8)\n 90 or above (8)\n over 1000 (0)\n",
facets.getTopChildren(10, "field").toString());
w.close();
IOUtils.close(r, d);
}
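// A sketch of why the docs written without the field stay out of every count:
// SortedNumericDocValues is a per-document iterator, and advanceExact reports
// whether the current doc has any values at all (leafReader and docId are assumed
// to be in scope):
//
//   SortedNumericDocValues dv = DocValues.getSortedNumeric(leafReader, "field");
//   if (dv.advanceExact(docId)) {
//     for (int i = 0; i < dv.docValueCount(); i++) {
//       long v = dv.nextValue(); // only docs with values contribute
//     }
//   }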
private static class UsedQuery extends Query {
private final AtomicBoolean used;