LUCENE-5339: small opto for range facets, and factor out base class; put longHashCode back

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5339@1547511 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2013-12-03 18:22:14 +00:00
parent 034595f5e9
commit 8da43e722e
10 changed files with 166 additions and 96 deletions

View File

@ -31,8 +31,8 @@ import org.apache.lucene.util.Bits;
/** Represents a range over double values. */
public final class DoubleRange extends Range {
private final double minIncl;
private final double maxIncl;
final double minIncl;
final double maxIncl;
public final double min;
public final double max;

View File

@ -45,11 +45,7 @@ import org.apache.lucene.queries.function.valuesource.FloatFieldSource; // javad
* pass just a the field name).
*
* @lucene.experimental */
public class DoubleRangeFacetCounts extends Facets {
private final DoubleRange[] ranges;
private final int[] counts;
private final String field;
private int totCount;
public class DoubleRangeFacetCounts extends RangeFacetCounts {
/** Create {@code RangeFacetCounts}, using {@link
* DoubleFieldSource} from the specified field. */
@ -60,14 +56,22 @@ public class DoubleRangeFacetCounts extends Facets {
/** Create {@code RangeFacetCounts}, using the provided
* {@link ValueSource}. */
public DoubleRangeFacetCounts(String field, ValueSource valueSource, FacetsCollector hits, DoubleRange... ranges) throws IOException {
this.ranges = ranges;
this.field = field;
counts = new int[ranges.length];
super(field, ranges);
count(valueSource, hits.getMatchingDocs());
}
private void count(ValueSource valueSource, List<MatchingDocs> matchingDocs) throws IOException {
DoubleRange[] ranges = (DoubleRange[]) this.ranges;
// Compute min & max over all ranges:
double minIncl = Double.POSITIVE_INFINITY;
double maxIncl = Double.NEGATIVE_INFINITY;
for(DoubleRange range : ranges) {
minIncl = Math.min(minIncl, range.minIncl);
maxIncl = Math.max(maxIncl, range.maxIncl);
}
// TODO: test if this is faster (in the past it was
// faster to do MatchingDocs on the inside) ... see
// patches on LUCENE-4965):
@ -81,6 +85,10 @@ public class DoubleRangeFacetCounts extends Facets {
if (fv.exists(doc)) {
double v = fv.doubleVal(doc);
if (v < minIncl || v > maxIncl) {
doc++;
continue;
}
// TODO: if all ranges are non-overlapping, we
// should instead do a bin-search up front
@ -98,35 +106,4 @@ public class DoubleRangeFacetCounts extends Facets {
}
}
}
// nocommit all args are ... unused ... this doesn't "fit"
// very well:
@Override
public FacetResult getTopChildren(int topN, String dim, String... path) {
if (dim.equals(field) == false) {
throw new IllegalArgumentException("invalid dim \"" + dim + "\"; should be \"" + field + "\"");
}
if (path.length != 0) {
throw new IllegalArgumentException("path.length should be 0");
}
LabelAndValue[] labelValues = new LabelAndValue[counts.length];
for(int i=0;i<counts.length;i++) {
// nocommit can we add the range into this?
labelValues[i] = new LabelAndValue(ranges[i].label, counts[i]);
}
return new FacetResult(dim, path, totCount, labelValues, labelValues.length);
}
@Override
public Number getSpecificValue(String dim, String... path) throws IOException {
// nocommit we could impl this?
throw new UnsupportedOperationException();
}
@Override
public List<FacetResult> getAllDims(int topN) throws IOException {
return Collections.singletonList(getTopChildren(topN, null));
}
}

View File

@ -31,8 +31,8 @@ import org.apache.lucene.util.Bits;
/** Represents a range over long values. */
public final class LongRange extends Range {
private final long minIncl;
private final long maxIncl;
final long minIncl;
final long maxIncl;
public final long min;
public final long max;

View File

@ -35,13 +35,9 @@ import org.apache.lucene.queries.function.valuesource.LongFieldSource;
* distance dimension like "< 1 km", "< 2 km", etc.).
*
* @lucene.experimental */
public class LongRangeFacetCounts extends Facets {
private final LongRange[] ranges;
private final int[] counts;
private final String field;
private int totCount;
public class LongRangeFacetCounts extends RangeFacetCounts {
/** Create {@code RangeFacetCounts}, using {@link
/** Create {@code LongRangeFacetCounts}, using {@link
* LongFieldSource} from the specified field. */
public LongRangeFacetCounts(String field, FacetsCollector hits, LongRange... ranges) throws IOException {
this(field, new LongFieldSource(field), hits, ranges);
@ -50,14 +46,22 @@ public class LongRangeFacetCounts extends Facets {
/** Create {@code RangeFacetCounts}, using the provided
* {@link ValueSource}. */
public LongRangeFacetCounts(String field, ValueSource valueSource, FacetsCollector hits, LongRange... ranges) throws IOException {
this.ranges = ranges;
this.field = field;
counts = new int[ranges.length];
super(field, ranges);
count(valueSource, hits.getMatchingDocs());
}
private void count(ValueSource valueSource, List<MatchingDocs> matchingDocs) throws IOException {
LongRange[] ranges = (LongRange[]) this.ranges;
// Compute min & max over all ranges:
long minIncl = Long.MAX_VALUE;
long maxIncl = Long.MIN_VALUE;
for(LongRange range : ranges) {
minIncl = Math.min(minIncl, range.minIncl);
maxIncl = Math.max(maxIncl, range.maxIncl);
}
// TODO: test if this is faster (in the past it was
// faster to do MatchingDocs on the inside) ... see
// patches on LUCENE-4965):
@ -71,6 +75,10 @@ public class LongRangeFacetCounts extends Facets {
if (fv.exists(doc)) {
long v = fv.longVal(doc);
if (v < minIncl || v > maxIncl) {
doc++;
continue;
}
// TODO: if all ranges are non-overlapping, we
// should instead do a bin-search up front
@ -88,35 +96,4 @@ public class LongRangeFacetCounts extends Facets {
}
}
}
// nocommit all args are ... unused ... this doesn't "fit"
// very well:
@Override
public FacetResult getTopChildren(int topN, String dim, String... path) {
if (dim.equals(field) == false) {
throw new IllegalArgumentException("invalid dim \"" + dim + "\"; should be \"" + field + "\"");
}
if (path.length != 0) {
throw new IllegalArgumentException("path.length should be 0");
}
LabelAndValue[] labelValues = new LabelAndValue[counts.length];
for(int i=0;i<counts.length;i++) {
// nocommit can we add the range into this?
labelValues[i] = new LabelAndValue(ranges[i].label, counts[i]);
}
return new FacetResult(dim, path, totCount, labelValues, labelValues.length);
}
@Override
public Number getSpecificValue(String dim, String... path) throws IOException {
// nocommit we could impl this?
throw new UnsupportedOperationException();
}
@Override
public List<FacetResult> getAllDims(int topN) throws IOException {
return Collections.singletonList(getTopChildren(topN, null));
}
}

View File

@ -0,0 +1,70 @@
package org.apache.lucene.facet;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Collections;
import java.util.List;
/** Base class for range faceting.
*
* @lucene.experimental */
abstract class RangeFacetCounts extends Facets {
protected final Range[] ranges;
protected final int[] counts;
protected final String field;
protected int totCount;
/** Create {@code RangeFacetCounts}, using {@link
* LongFieldSource} from the specified field. */
protected RangeFacetCounts(String field, Range[] ranges) throws IOException {
this.field = field;
this.ranges = ranges;
counts = new int[ranges.length];
}
// nocommit all args are ... unused ... this doesn't "fit"
// very well:
@Override
public FacetResult getTopChildren(int topN, String dim, String... path) {
if (dim.equals(field) == false) {
throw new IllegalArgumentException("invalid dim \"" + dim + "\"; should be \"" + field + "\"");
}
if (path.length != 0) {
throw new IllegalArgumentException("path.length should be 0");
}
LabelAndValue[] labelValues = new LabelAndValue[counts.length];
for(int i=0;i<counts.length;i++) {
labelValues[i] = new LabelAndValue(ranges[i].label, counts[i]);
}
return new FacetResult(dim, path, totCount, labelValues, labelValues.length);
}
@Override
public Number getSpecificValue(String dim, String... path) throws IOException {
// TODO: should we impl this?
throw new UnsupportedOperationException();
}
@Override
public List<FacetResult> getAllDims(int topN) throws IOException {
return Collections.singletonList(getTopChildren(topN, null));
}
}

View File

@ -41,9 +41,6 @@ public class FacetLabel implements Comparable<FacetLabel> {
*/
public final static int MAX_CATEGORY_PATH_LENGTH = (BYTE_BLOCK_SIZE - 2) / 4;
/** An empty {@link FacetLabel} */
//public static final FacetLabel EMPTY = new FacetLabel();
/**
* The components of this {@link FacetLabel}. Note that this array may be
* shared with other {@link FacetLabel} instances, e.g. as a result of
@ -114,8 +111,12 @@ public class FacetLabel implements Comparable<FacetLabel> {
final int len = length < other.length ? length : other.length;
for (int i = 0, j = 0; i < len; i++, j++) {
int cmp = components[i].compareTo(other.components[j]);
if (cmp < 0) return -1; // this is 'before'
if (cmp > 0) return 1; // this is 'after'
if (cmp < 0) {
return -1; // this is 'before'
}
if (cmp > 0) {
return 1; // this is 'after'
}
}
// one is a prefix of the other
@ -156,6 +157,23 @@ public class FacetLabel implements Comparable<FacetLabel> {
return hash;
}
/** Calculate a 64-bit hash function for this path. This
* is necessary for {@link NameHashIntCacheLRU} (the
* default cache impl for {@link
* LruTaxonomyWriterCache}) to reduce the chance of
* "silent but deadly" collisions. */
public long longHashCode() {
if (length == 0) {
return 0;
}
long hash = length;
for (int i = 0; i < length; i++) {
hash = hash * 65599 + components[i].hashCode();
}
return hash;
}
/** Returns a sub-path of this path up to {@code length} components. */
public FacetLabel subpath(final int length) {
if (length >= this.length || length < 0) {

View File

@ -36,12 +36,12 @@ public class NameHashIntCacheLRU extends NameIntCacheLRU {
@Override
Object key(FacetLabel name) {
return new Integer(name.hashCode());
return new Long(name.longHashCode());
}
@Override
Object key(FacetLabel name, int prefixLen) {
return new Integer(name.subpath(prefixLen).hashCode());
return new Long(name.subpath(prefixLen).longHashCode());
}
}

View File

@ -107,6 +107,13 @@ public class TestFacetLabel extends FacetTestCase {
assertEquals(new FacetLabel("hello", "world").hashCode(), new FacetLabel("hello", "world").hashCode());
}
@Test
public void testLongHashCode() {
assertEquals(new FacetLabel().longHashCode(), new FacetLabel().longHashCode());
assertFalse(new FacetLabel().longHashCode() == new FacetLabel("hi").longHashCode());
assertEquals(new FacetLabel("hello", "world").longHashCode(), new FacetLabel("hello", "world").longHashCode());
}
@Test
public void testArrayConstructor() {
FacetLabel p = new FacetLabel("hello", "world", "yo");

View File

@ -128,14 +128,23 @@ public class TestConcurrentFacetedIndexing extends FacetTestCase {
for (Thread t : indexThreads) t.join();
DirectoryTaxonomyReader tr = new DirectoryTaxonomyReader(tw);
assertEquals("mismatch number of categories", values.size() + 1, tr.getSize()); // +1 for root category
// +1 for root category
if (values.size() + 1 != tr.getSize()) {
for(String value : values.keySet()) {
FacetLabel label = new FacetLabel(FacetsConfig.stringToPath(value));
if (tr.getOrdinal(label) == -1) {
System.out.println("FAIL: path=" + label + " not recognized");
}
}
fail("mismatch number of categories");
}
int[] parents = tr.getParallelTaxonomyArrays().parents();
for (String cat : values.keySet()) {
FacetLabel cp = new FacetLabel(FacetsConfig.stringToPath(cat));
assertTrue("category not found " + cp, tr.getOrdinal(cp) > 0);
int level = cp.length;
int parentOrd = 0; // for root, parent is always virtual ROOT (ord=0)
FacetLabel path = new FacetLabel();
FacetLabel path = null;
for (int i = 0; i < level; i++) {
path = cp.subpath(i + 1);
int ord = tr.getOrdinal(path);

View File

@ -256,6 +256,9 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
// this is slower than CL2O, but less memory consuming, and exercises finding categories on disk too.
cache = new LruTaxonomyWriterCache(ncats / 10);
}
if (VERBOSE) {
System.out.println("TEST: use cache=" + cache);
}
final DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(dir, OpenMode.CREATE, cache);
Thread[] addThreads = new Thread[atLeast(4)];
for (int z = 0; z < addThreads.length; z++) {
@ -291,7 +294,17 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
tw.close();
DirectoryTaxonomyReader dtr = new DirectoryTaxonomyReader(dir);
assertEquals("mismatch number of categories", values.size() + 1, dtr.getSize()); // +1 for root category
// +1 for root category
if (values.size() + 1 != dtr.getSize()) {
for(String value : values.keySet()) {
FacetLabel label = new FacetLabel(FacetsConfig.stringToPath(value));
if (dtr.getOrdinal(label) == -1) {
System.out.println("FAIL: path=" + label + " not recognized");
}
}
fail("mismatch number of categories");
}
int[] parents = dtr.getParallelTaxonomyArrays().parents();
for (String cat : values.keySet()) {
FacetLabel cp = new FacetLabel(FacetsConfig.stringToPath(cat));
@ -306,9 +319,8 @@ public class TestDirectoryTaxonomyWriter extends FacetTestCase {
parentOrd = ord; // next level should have this parent
}
}
dtr.close();
dir.close();
IOUtils.close(dtr, dir);
}
private long getEpoch(Directory taxoDir) throws IOException {