Github#11869: Add RangeOnRangeFacetCounts (#11901)

This commit is contained in:
Marc D'Mello 2022-12-30 07:38:13 -08:00 committed by GitHub
parent 6f477e5831
commit cbfed77fd3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 2350 additions and 6 deletions

View File

@ -21,14 +21,22 @@ import java.io.IOException;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.util.BytesRef;
class BinaryRangeDocValues extends BinaryDocValues {
/** A binary representation of a range that wraps a BinaryDocValues field */
public class BinaryRangeDocValues extends BinaryDocValues {
private final BinaryDocValues in;
private byte[] packedValue;
private final int numDims;
private final int numBytesPerDimension;
private int docID = -1;
BinaryRangeDocValues(BinaryDocValues in, int numDims, int numBytesPerDimension) {
/**
* Constructor for BinaryRangeDocValues
*
* @param in the binary doc values source field
* @param numDims the number of dimensions in each doc values field
* @param numBytesPerDimension size of each dimension (2 * encoded value size)
*/
public BinaryRangeDocValues(BinaryDocValues in, int numDims, int numBytesPerDimension) {
assert in != null;
this.in = in;
this.numBytesPerDimension = numBytesPerDimension;
@ -82,6 +90,11 @@ class BinaryRangeDocValues extends BinaryDocValues {
return in.binaryValue();
}
/**
* Gets the packed value that represents this range
*
* @return the packed value that represents this range
*/
public byte[] getPackedValue() {
return packedValue;
}

View File

@ -131,7 +131,7 @@ public class DoubleRange extends Field {
*
* <p>example for 4 dimensions (8 bytes per dimension value): minD1 ... minD4 | maxD1 ... maxD4
*/
static void verifyAndEncode(double[] min, double[] max, byte[] bytes) {
public static void verifyAndEncode(double[] min, double[] max, byte[] bytes) {
for (int d = 0, i = 0, j = min.length * BYTES; d < min.length; ++d, i += BYTES, j += BYTES) {
if (Double.isNaN(min[d])) {
throw new IllegalArgumentException(

View File

@ -131,7 +131,7 @@ public class LongRange extends Field {
*
* <p>example for 4 dimensions (8 bytes per dimension value): minD1 ... minD4 | maxD1 ... maxD4
*/
static void verifyAndEncode(long[] min, long[] max, byte[] bytes) {
public static void verifyAndEncode(long[] min, long[] max, byte[] bytes) {
for (int d = 0, i = 0, j = min.length * BYTES; d < min.length; ++d, i += BYTES, j += BYTES) {
if (min[d] > max[d]) {
throw new IllegalArgumentException(

View File

@ -251,7 +251,7 @@ public abstract class RangeFieldQuery extends Query {
}
@Override
boolean matches(
public boolean matches(
byte[] queryPackedValue,
byte[] packedValue,
int numDims,
@ -307,7 +307,11 @@ public abstract class RangeFieldQuery extends Query {
int dim,
ByteArrayComparator comparator);
boolean matches(
/**
* Compares every dim for 2 encoded ranges and returns true if all dims match. Matching
* implementation is based on the QueryType.
*/
public boolean matches(
byte[] queryPackedValue,
byte[] packedValue,
int numDims,

View File

@ -28,4 +28,5 @@ module org.apache.lucene.facet {
exports org.apache.lucene.facet.taxonomy.directory;
exports org.apache.lucene.facet.taxonomy.writercache;
exports org.apache.lucene.facet.facetset;
exports org.apache.lucene.facet.rangeonrange;
}

View File

@ -0,0 +1,122 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.rangeonrange;
import java.util.Arrays;
import java.util.Objects;
/**
 * Represents a (possibly multidimensional) double range for RangeOnRange faceting. Both bounds
 * are stored as inclusive values, one array entry per dimension.
 */
public class DoubleRange extends Range {
  /** Minimum (inclusive), one entry per dimension. */
  public final double[] min;

  /** Maximum (inclusive), one entry per dimension. */
  public final double[] max;

  /**
   * Represents a single dimensional double range for RangeOnRange faceting
   *
   * @param label the name of the range
   * @param minIn the minimum
   * @param minInclusive if the minimum is inclusive
   * @param maxIn the maximum
   * @param maxInclusive if the maximum is inclusive
   * @throws IllegalArgumentException if either bound is NaN, or if the range is empty once
   *     exclusive bounds have been converted to inclusive ones
   */
  public DoubleRange(
      String label, double minIn, boolean minInclusive, double maxIn, boolean maxInclusive) {
    super(label, 1);

    if (Double.isNaN(minIn)) {
      throw new IllegalArgumentException("min cannot be NaN");
    }
    if (minInclusive == false) {
      // An exclusive lower bound becomes the next representable double above it.
      minIn = Math.nextUp(minIn);
    }

    if (Double.isNaN(maxIn)) {
      throw new IllegalArgumentException("max cannot be NaN");
    }
    if (maxInclusive == false) {
      // An exclusive upper bound becomes the next representable double below it.
      maxIn = Math.nextDown(maxIn);
    }

    if (minIn > maxIn) {
      failNoMatch();
    }

    this.min = new double[] {minIn};
    this.max = new double[] {maxIn};
  }

  /**
   * Represents a multidimensional double range for RangeOnRange faceting
   *
   * @param label the name of the range
   * @param min the minimum, inclusive; one entry per dimension
   * @param max the maximum, inclusive; one entry per dimension
   * @throws IllegalArgumentException if either array is null or empty, if the arrays differ in
   *     length, if any bound is NaN, or if a minimum exceeds its maximum
   */
  public DoubleRange(String label, double[] min, double[] max) {
    // Guard the dereference so a null array reaches checkArgs and is rejected there with an
    // IllegalArgumentException rather than failing here with a NullPointerException.
    super(label, min == null ? 0 : min.length);
    checkArgs(min, max);
    this.min = min;
    this.max = max;
  }

  @Override
  public String toString() {
    return "DoubleRange(label: "
        + label
        + ", min: "
        + Arrays.toString(min)
        + ", max: "
        + Arrays.toString(max)
        + ")";
  }

  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (o == null || getClass() != o.getClass()) return false;
    DoubleRange doubleRange = (DoubleRange) o;
    return Arrays.equals(min, doubleRange.min)
        && Arrays.equals(max, doubleRange.max)
        && label.equals(doubleRange.label)
        && dims == doubleRange.dims;
  }

  @Override
  public int hashCode() {
    return Objects.hash(label, Arrays.hashCode(min), Arrays.hashCode(max), dims);
  }

  /**
   * Validates the bounds of a multidimensional range.
   *
   * @throws IllegalArgumentException if the arrays are null, empty or of different lengths, if
   *     any bound is NaN, or if a minimum exceeds its maximum
   */
  private void checkArgs(final double[] min, final double[] max) {
    if (min == null || max == null || min.length == 0 || max.length == 0) {
      failNoMatch();
    }
    if (min.length != max.length) {
      failNoMatch();
    }

    for (int i = 0; i < min.length; i++) {
      // NaN compares false against everything, so it has to be rejected explicitly.
      if (Double.isNaN(min[i])) {
        throw new IllegalArgumentException("min cannot be NaN (dimension " + i + ")");
      }
      if (Double.isNaN(max[i])) {
        throw new IllegalArgumentException("max cannot be NaN (dimension " + i + ")");
      }
      if (min[i] > max[i]) {
        failNoMatch();
      }
    }
  }
}

View File

@ -0,0 +1,94 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.rangeonrange;
import static org.apache.lucene.document.DoubleRange.verifyAndEncode;
import java.io.IOException;
import org.apache.lucene.document.RangeFieldQuery;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.search.Query;
/**
 * Represents counts for double range on range faceting. To be more specific, this means that given
 * a range (or list of ranges), this class will count all the documents in the {@link
 * FacetsCollector} (or that match a fast match query) that contain ranges that "match" the provided
 * ranges. These ranges are specified by the field parameter and expected to be of type {@link
 * org.apache.lucene.document.DoubleRangeDocValuesField}. Matching is defined by the queryType
 * param, you can see the type of matching supported by looking at {@link
 * org.apache.lucene.document.RangeFieldQuery.QueryType}. In addition, this class supports
 * multidimensional ranges. A multidimensional range will be counted as a match if every dimension
 * matches the corresponding indexed range's dimension.
 */
public class DoubleRangeOnRangeFacetCounts extends RangeOnRangeFacetCounts {

  /**
   * Constructor without the fast match query, see other constructor description for more details.
   *
   * @throws IllegalArgumentException if no range is given or the ranges do not all have the same
   *     number of dimensions
   */
  public DoubleRangeOnRangeFacetCounts(
      String field,
      FacetsCollector hits,
      RangeFieldQuery.QueryType queryType,
      DoubleRange... ranges)
      throws IOException {
    super(
        field,
        hits,
        queryType,
        null,
        Double.BYTES,
        getEncodedRanges(ranges),
        Range.getLabelsFromRanges(ranges));
  }

  /**
   * Represents counts for double range on range faceting. See class javadoc for more details.
   *
   * @param field specifies a {@link org.apache.lucene.document.DoubleRangeDocValuesField} that will
   *     define the indexed ranges
   * @param hits hits we want to count against
   * @param queryType type of intersection we want to count (IE: range intersection, range contains,
   *     etc.)
   * @param fastMatchQuery query to quickly discard hits using some heuristic
   * @param ranges ranges we want the counts of
   * @throws IOException low level exception
   * @throws IllegalArgumentException if no range is given or the ranges do not all have the same
   *     number of dimensions
   */
  public DoubleRangeOnRangeFacetCounts(
      String field,
      FacetsCollector hits,
      RangeFieldQuery.QueryType queryType,
      Query fastMatchQuery,
      DoubleRange... ranges)
      throws IOException {
    super(
        field,
        hits,
        queryType,
        fastMatchQuery,
        Double.BYTES,
        getEncodedRanges(ranges),
        Range.getLabelsFromRanges(ranges));
  }

  /**
   * Encodes every range into its packed binary form (all minimums followed by all maximums).
   *
   * @throws IllegalArgumentException if {@code ranges} is empty or the ranges differ in their
   *     number of dimensions
   */
  private static byte[][] getEncodedRanges(DoubleRange... ranges) {
    if (ranges.length == 0) {
      throw new IllegalArgumentException("at least one range must be provided");
    }
    // Every encoded range must have the same length, so all ranges must share the dimension count
    // of the first one.
    final int dims = ranges[0].dims;
    byte[][] result = new byte[ranges.length][2 * Double.BYTES * dims];
    for (int i = 0; i < ranges.length; i++) {
      if (ranges[i].dims != dims) {
        throw new IllegalArgumentException(
            "all ranges must have the same number of dimensions; expected "
                + dims
                + " but range \""
                + ranges[i].label
                + "\" has "
                + ranges[i].dims);
      }
      verifyAndEncode(ranges[i].min, ranges[i].max, result[i]);
    }
    return result;
  }
}

View File

@ -0,0 +1,122 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.rangeonrange;
import java.util.Arrays;
import java.util.Objects;
/**
 * Represents a (possibly multidimensional) long range for RangeOnRange faceting. Both bounds are
 * stored as inclusive values, one array entry per dimension.
 */
public class LongRange extends Range {
  /** Minimum (inclusive), one entry per dimension. */
  public final long[] min;

  /** Maximum (inclusive), one entry per dimension. */
  public final long[] max;

  /**
   * Represents a single dimensional long range for RangeOnRange faceting
   *
   * @param label the name of the range
   * @param minIn the minimum
   * @param minInclusive if the minimum is inclusive
   * @param maxIn the maximum
   * @param maxInclusive if the maximum is inclusive
   * @throws IllegalArgumentException if the range is empty once exclusive bounds have been
   *     converted to inclusive ones
   */
  public LongRange(
      String label, long minIn, boolean minInclusive, long maxIn, boolean maxInclusive) {
    super(label, 1);

    if (minInclusive == false) {
      // Nothing is strictly greater than Long.MAX_VALUE, so such a range cannot match.
      if (minIn != Long.MAX_VALUE) {
        minIn++;
      } else {
        failNoMatch();
      }
    }

    if (maxInclusive == false) {
      // Nothing is strictly smaller than Long.MIN_VALUE, so such a range cannot match.
      if (maxIn != Long.MIN_VALUE) {
        maxIn--;
      } else {
        failNoMatch();
      }
    }

    if (minIn > maxIn) {
      failNoMatch();
    }

    this.min = new long[] {minIn};
    this.max = new long[] {maxIn};
  }

  /**
   * Represents a multidimensional long range for RangeOnRange faceting
   *
   * @param label the name of the range
   * @param min the minimum, inclusive; one entry per dimension
   * @param max the maximum, inclusive; one entry per dimension
   * @throws IllegalArgumentException if either array is null or empty, if the arrays differ in
   *     length, or if a minimum exceeds its maximum
   */
  public LongRange(String label, long[] min, long[] max) {
    // Guard the dereference so a null array reaches checkArgs and is rejected there with an
    // IllegalArgumentException rather than failing here with a NullPointerException.
    super(label, min == null ? 0 : min.length);
    checkArgs(min, max);
    this.min = min;
    this.max = max;
  }

  @Override
  public String toString() {
    return "LongRange(label: "
        + label
        + ", min: "
        + Arrays.toString(min)
        + ", max: "
        + Arrays.toString(max)
        + ")";
  }

  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (o == null || getClass() != o.getClass()) return false;
    LongRange longRange = (LongRange) o;
    return Arrays.equals(min, longRange.min)
        && Arrays.equals(max, longRange.max)
        && label.equals(longRange.label)
        && dims == longRange.dims;
  }

  @Override
  public int hashCode() {
    return Objects.hash(label, Arrays.hashCode(min), Arrays.hashCode(max), dims);
  }

  /**
   * Validates the bounds of a multidimensional range.
   *
   * @throws IllegalArgumentException if the arrays are null, empty or of different lengths, or if
   *     a minimum exceeds its maximum
   */
  private void checkArgs(final long[] min, final long[] max) {
    if (min == null || max == null || min.length == 0 || max.length == 0) {
      failNoMatch();
    }
    if (min.length != max.length) {
      failNoMatch();
    }

    for (int i = 0; i < min.length; i++) {
      if (min[i] > max[i]) {
        failNoMatch();
      }
    }
  }
}

View File

@ -0,0 +1,91 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.rangeonrange;
import static org.apache.lucene.document.LongRange.verifyAndEncode;
import java.io.IOException;
import org.apache.lucene.document.RangeFieldQuery;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.search.Query;
/**
 * Represents counts for long range on range faceting. To be more specific, this means that given a
 * range (or list of ranges), this class will count all the documents in the {@link FacetsCollector}
 * (or that match a fast match query) that contain ranges that "match" the provided ranges. These
 * ranges are specified by the field parameter and expected to be of type {@link
 * org.apache.lucene.document.LongRangeDocValuesField}. Matching is defined by the queryType param,
 * you can see the type of matching supported by looking at {@link
 * org.apache.lucene.document.RangeFieldQuery.QueryType}. In addition, this class supports
 * multidimensional ranges. A multidimensional range will be counted as a match if every dimension
 * matches the corresponding indexed range's dimension.
 */
public class LongRangeOnRangeFacetCounts extends RangeOnRangeFacetCounts {

  /**
   * Constructor without the fast match query, see other constructor description for more details.
   *
   * @throws IllegalArgumentException if no range is given or the ranges do not all have the same
   *     number of dimensions
   */
  public LongRangeOnRangeFacetCounts(
      String field, FacetsCollector hits, RangeFieldQuery.QueryType queryType, LongRange... ranges)
      throws IOException {
    super(
        field,
        hits,
        queryType,
        null,
        Long.BYTES,
        getEncodedRanges(ranges),
        Range.getLabelsFromRanges(ranges));
  }

  /**
   * Represents counts for long range on range faceting. See class javadoc for more details.
   *
   * @param field specifies a {@link org.apache.lucene.document.LongRangeDocValuesField} that will
   *     define the indexed ranges
   * @param hits hits we want to count against
   * @param queryType type of intersection we want to count (IE: range intersection, range contains,
   *     etc.)
   * @param fastMatchQuery query to quickly discard hits using some heuristic
   * @param ranges ranges we want the counts of
   * @throws IOException low level exception
   * @throws IllegalArgumentException if no range is given or the ranges do not all have the same
   *     number of dimensions
   */
  public LongRangeOnRangeFacetCounts(
      String field,
      FacetsCollector hits,
      RangeFieldQuery.QueryType queryType,
      Query fastMatchQuery,
      LongRange... ranges)
      throws IOException {
    super(
        field,
        hits,
        queryType,
        fastMatchQuery,
        Long.BYTES,
        getEncodedRanges(ranges),
        Range.getLabelsFromRanges(ranges));
  }

  /**
   * Encodes every range into its packed binary form (all minimums followed by all maximums).
   *
   * @throws IllegalArgumentException if {@code ranges} is empty or the ranges differ in their
   *     number of dimensions
   */
  private static byte[][] getEncodedRanges(LongRange... ranges) {
    if (ranges.length == 0) {
      throw new IllegalArgumentException("at least one range must be provided");
    }
    // Every encoded range must have the same length, so all ranges must share the dimension count
    // of the first one.
    final int dims = ranges[0].dims;
    byte[][] result = new byte[ranges.length][2 * Long.BYTES * dims];
    for (int i = 0; i < ranges.length; i++) {
      if (ranges[i].dims != dims) {
        throw new IllegalArgumentException(
            "all ranges must have the same number of dimensions; expected "
                + dims
                + " but range \""
                + ranges[i].label
                + "\" has "
                + ranges[i].dims);
      }
      verifyAndEncode(ranges[i].min, ranges[i].max, result[i]);
    }
    return result;
  }
}

View File

@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.rangeonrange;
import java.util.Arrays;
/**
 * Common parent of the labeled ranges used for range-on-range faceting. It carries the label and
 * the number of dimensions; concrete subclasses add the actual bounds.
 *
 * @lucene.experimental
 */
public abstract class Range {

  /** The label under which this range is reported. */
  public final String label;

  /** How many dimensions this range spans. */
  public final int dims;

  /**
   * Sole constructor.
   *
   * @param label the label of the range, must not be null
   * @param dims the number of dimensions of the range
   * @throws NullPointerException if {@code label} is null
   */
  protected Range(String label, int dims) {
    if (null == label) {
      throw new NullPointerException("label must not be null");
    }

    this.dims = dims;
    this.label = label;
  }

  /**
   * Rejects a range that cannot match anything.
   *
   * @throws IllegalArgumentException always; the message names this range's label
   */
  protected void failNoMatch() {
    final String message = "range \"" + label + "\" matches nothing";
    throw new IllegalArgumentException(message);
  }

  /**
   * Extracts the labels of the given ranges.
   *
   * @param ranges the ranges to read the labels from
   * @return a new array holding the label of each range, in the same order
   */
  protected static String[] getLabelsFromRanges(Range[] ranges) {
    final String[] rangeLabels = new String[ranges.length];
    Arrays.setAll(rangeLabels, idx -> ranges[idx].label);
    return rangeLabels;
  }
}

View File

@ -0,0 +1,206 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.rangeonrange;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.document.BinaryRangeDocValues;
import org.apache.lucene.document.RangeFieldQuery;
import org.apache.lucene.facet.FacetCountsWithFilterQuery;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.PriorityQueue;
/**
 * Base class for range on range facet counting. Given a set of already encoded query ranges, it
 * counts, for every query range, the hits whose indexed range (read from a binary doc values
 * field) matches that query range according to a {@link RangeFieldQuery.QueryType}.
 */
abstract class RangeOnRangeFacetCounts extends FacetCountsWithFilterQuery {

  /** Labels of the requested ranges, parallel to {@link #counts}. */
  private final String[] labels;

  /** Per-range counts, allocated by the constructor and filled in by {@link #count}. */
  private final int[] counts;

  /** Our field name. */
  private final String field;

  /** Number of visited hits that matched at least one of the requested ranges. */
  private int totCount;

  /**
   * Counts the given hits against the given encoded ranges.
   *
   * @param field name of the binary doc values field holding the indexed ranges
   * @param hits hits we want to count against
   * @param queryType type of matching to apply between a requested range and an indexed range
   * @param fastMatchQuery optional query used to narrow down the hits that are visited, may be
   *     null
   * @param numEncodedValueBytes number of bytes of a single encoded bound
   * @param encodedRanges the requested ranges in packed binary form, all of the same length
   * @param labels labels of the requested ranges, parallel to {@code encodedRanges}
   * @throws IllegalArgumentException if {@code encodedRanges} is empty
   * @throws IOException low level exception
   */
  protected RangeOnRangeFacetCounts(
      String field,
      FacetsCollector hits,
      RangeFieldQuery.QueryType queryType,
      Query fastMatchQuery,
      int numEncodedValueBytes,
      byte[][] encodedRanges,
      String[] labels)
      throws IOException {
    super(fastMatchQuery);

    // The number of dimensions is derived from the first range, so at least one is required.
    if (encodedRanges.length == 0) {
      throw new IllegalArgumentException("at least one range must be provided");
    }
    assert encodedRanges.length == labels.length;
    assert encodedRanges[0].length % (2 * numEncodedValueBytes) == 0;

    this.field = field;
    this.labels = labels;
    this.counts = new int[encodedRanges.length];

    count(field, hits.getMatchingDocs(), encodedRanges, numEncodedValueBytes, queryType);
  }

  /**
   * Counts from the provided field. For every visited hit that has a value, each requested range
   * that matches the indexed range has its count incremented; a hit that matches at least one
   * range also contributes once to the total count.
   */
  protected void count(
      String field,
      List<FacetsCollector.MatchingDocs> matchingDocs,
      byte[][] encodedRanges,
      int numEncodedValueBytes,
      RangeFieldQuery.QueryType queryType)
      throws IOException {
    // TODO: We currently just exhaustively check the ranges in each document with every range in
    // the ranges array.
    // We might be able to do something more efficient here by grouping the ranges array into a
    // space partitioning
    // data structure of some sort.

    int dims = encodedRanges[0].length / (2 * numEncodedValueBytes);
    ArrayUtil.ByteArrayComparator comparator =
        ArrayUtil.getUnsignedComparator(numEncodedValueBytes);

    for (FacetsCollector.MatchingDocs hits : matchingDocs) {
      BinaryRangeDocValues binaryRangeDocValues =
          new BinaryRangeDocValues(
              DocValues.getBinary(hits.context.reader(), field), dims, numEncodedValueBytes);

      final DocIdSetIterator it = createIterator(hits);
      if (it == null) {
        continue;
      }

      for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
        if (binaryRangeDocValues.advanceExact(doc) == false) {
          // No indexed range for this hit, nothing to count.
          continue;
        }

        // The packed value only changes when the doc values advance, so read it once per doc.
        byte[] packedRange = binaryRangeDocValues.getPackedValue();
        boolean hasValidRange = false;
        for (int range = 0; range < encodedRanges.length; range++) {
          byte[] encodedRange = encodedRanges[range];
          assert encodedRange.length == packedRange.length;
          if (queryType.matches(
              encodedRange, packedRange, dims, numEncodedValueBytes, comparator)) {
            counts[range]++;
            hasValidRange = true;
          }
        }

        // Count matching hits directly instead of subtracting misses from hits.totalHits: the
        // iterator may visit only a subset of the hits, and hits it never visits would otherwise
        // be wrongly included in the total.
        if (hasValidRange) {
          totCount++;
        }
      }
    }
  }

  /**
   * {@inheritDoc}
   *
   * <p>NOTE: This implementation guarantees that ranges will be returned in the order specified by
   * the user when calling the constructor.
   */
  @Override
  public FacetResult getAllChildren(String dim, String... path) throws IOException {
    validateDimAndPathForGetChildren(dim, path);
    LabelAndValue[] labelValues = new LabelAndValue[counts.length];
    for (int i = 0; i < counts.length; i++) {
      labelValues[i] = new LabelAndValue(labels[i], counts[i]);
    }
    return new FacetResult(dim, path, totCount, labelValues, labelValues.length);
  }

  /**
   * {@inheritDoc}
   *
   * <p>Only ranges with a non-zero count are returned, ordered by descending count; ranges with
   * the same count are ordered by ascending label.
   */
  @Override
  public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
    validateTopN(topN);
    validateDimAndPathForGetChildren(dim, path);

    PriorityQueue<Entry> pq =
        new PriorityQueue<>(Math.min(topN, counts.length)) {
          @Override
          protected boolean lessThan(Entry a, Entry b) {
            int cmp = Integer.compare(a.count, b.count);
            if (cmp == 0) {
              // Reverse label order so that, on equal counts, the smaller label ranks higher.
              cmp = b.label.compareTo(a.label);
            }
            return cmp < 0;
          }
        };

    int childCount = 0;
    // Reuse the entry evicted by the queue (if any) instead of allocating one per range.
    Entry e = null;
    for (int i = 0; i < counts.length; i++) {
      if (counts[i] != 0) {
        childCount++;
        if (e == null) {
          e = new Entry();
        }
        e.label = labels[i];
        e.count = counts[i];
        e = pq.insertWithOverflow(e);
      }
    }

    // The queue pops its least entry first, so fill the result array from the back.
    LabelAndValue[] results = new LabelAndValue[pq.size()];
    while (pq.size() != 0) {
      Entry entry = pq.pop();
      assert entry != null;
      results[pq.size()] = new LabelAndValue(entry.label, entry.count);
    }
    return new FacetResult(dim, path, totCount, results, childCount);
  }

  /** Not supported by this implementation. */
  @Override
  public Number getSpecificValue(String dim, String... path) throws IOException {
    throw new UnsupportedOperationException();
  }

  /** Returns a single result: the top children of the one field this instance counts. */
  @Override
  public List<FacetResult> getAllDims(int topN) throws IOException {
    validateTopN(topN);
    return Collections.singletonList(getTopChildren(topN, field));
  }

  /**
   * Checks that the requested dim is our field and that no path was given.
   *
   * @throws IllegalArgumentException if {@code dim} differs from the field or {@code path} is not
   *     empty
   */
  private void validateDimAndPathForGetChildren(String dim, String... path) {
    if (dim.equals(field) == false) {
      throw new IllegalArgumentException(
          "invalid dim \"" + dim + "\"; should be \"" + field + "\"");
    }
    if (path.length != 0) {
      throw new IllegalArgumentException("path.length should be 0");
    }
  }

  /** Mutable (label, count) pair used as priority queue element in {@link #getTopChildren}. */
  private static final class Entry {
    int count;
    String label;
  }
}

View File

@ -0,0 +1,19 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/** Provides range on range faceting capabilities. */
package org.apache.lucene.facet.rangeonrange;