LUCENE-5735: remove NumberRangePrefixTreeStrategy.calcFacets from 6x for now

This commit is contained in:
David Smiley 2016-03-04 10:50:53 -05:00
parent 1cbf22569a
commit 0b15fd8636
3 changed files with 2 additions and 418 deletions

View File

@ -25,10 +25,6 @@ New Features
input tokens. Useful for normalizing short text in clustering/linking
tasks. (Mark Harwood, Adrien Grand)
* LUCENE-5735: NumberRangePrefixTreeStrategy now includes interval/range faceting
for counting ranges that align with the underlying terms as defined by the
NumberRangePrefixTree (e.g. familiar date units like days). (David Smiley)
* LUCENE-6711: Use CollectionStatistics.docCount() for IDF and average field
length computations, to avoid skew from documents that don't have the field.
(Ahmet Arslan via Robert Muir)

View File

@ -16,22 +16,13 @@
*/
package org.apache.lucene.spatial.prefix;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import org.locationtech.spatial4j.shape.Point;
import org.locationtech.spatial4j.shape.Shape;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.spatial.prefix.tree.Cell;
import org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree;
import org.apache.lucene.util.Bits;
import static org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree.UnitNRShape;
import org.locationtech.spatial4j.shape.Point;
import org.locationtech.spatial4j.shape.Shape;
/** A PrefixTree based on Number/Date ranges. This isn't very "spatial" on the surface (to the user) but
* it's implemented using spatial so that's why it's here extending a SpatialStrategy. When using this class, you will
@ -68,132 +59,4 @@ public class NumberRangePrefixTreeStrategy extends RecursivePrefixTreeStrategy {
throw new UnsupportedOperationException();
}
/** Calculates facets between {@code start} and {@code end} to a detail level one greater than that provided by the
* arguments. For example providing March to October of 2014 would return facets to the day level of those months.
* This is just a convenience method.
* @see #calcFacets(IndexReaderContext, Bits, Shape, int)
*/
public Facets calcFacets(IndexReaderContext context, Bits topAcceptDocs, UnitNRShape start, UnitNRShape end)
throws IOException {
Shape facetRange = getGrid().toRangeShape(start, end);
int detailLevel = Math.max(start.getLevel(), end.getLevel()) + 1;
return calcFacets(context, topAcceptDocs, facetRange, detailLevel);
}
/**
* Calculates facets (aggregated counts) given a range shape (start-end span) and a level, which specifies the detail.
* To get the level of an existing shape, say a Calendar, call
* {@link org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree#toUnitShape(Object)} then call
* {@link org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree.UnitNRShape#getLevel()}.
* Facet computation is implemented by navigating the underlying indexed terms efficiently.
*/
public Facets calcFacets(IndexReaderContext context, Bits topAcceptDocs, Shape facetRange, final int level)
throws IOException {
final Facets facets = new Facets(level);
PrefixTreeFacetCounter.compute(this, context, topAcceptDocs, facetRange, level,
new PrefixTreeFacetCounter.FacetVisitor() {
Facets.FacetParentVal parentFacet;
UnitNRShape parentShape;
@Override
public void visit(Cell cell, int count) {
if (cell.getLevel() < level - 1) {//some ancestor of parent facet level, direct or distant
parentFacet = null;//reset
parentShape = null;//reset
facets.topLeaves += count;
} else if (cell.getLevel() == level - 1) {//parent
//set up FacetParentVal
setupParent((UnitNRShape) cell.getShape());
parentFacet.parentLeaves += count;
} else {//at facet level
UnitNRShape unitShape = (UnitNRShape) cell.getShape();
UnitNRShape unitShapeParent = unitShape.getShapeAtLevel(unitShape.getLevel() - 1);
if (parentFacet == null || !parentShape.equals(unitShapeParent)) {
setupParent(unitShapeParent);
}
//lazy init childCounts
if (parentFacet.childCounts == null) {
parentFacet.childCounts = new int[parentFacet.childCountsLen];
}
parentFacet.childCounts[unitShape.getValAtLevel(cell.getLevel())] += count;
}
}
private void setupParent(UnitNRShape unitShape) {
parentShape = unitShape.clone();
//Look for existing parentFacet (from previous segment), or create anew if needed
parentFacet = facets.parents.get(parentShape);
if (parentFacet == null) {//didn't find one; make a new one
parentFacet = new Facets.FacetParentVal();
parentFacet.childCountsLen = getGrid().getNumSubCells(parentShape);
facets.parents.put(parentShape, parentFacet);
}
}
});
return facets;
}
/** Facet response information */
public static class Facets {
//TODO consider a variable-level structure -- more general purpose.
public Facets(int detailLevel) {
this.detailLevel = detailLevel;
}
/** The bottom-most detail-level counted, as requested. */
public final int detailLevel;
/**
* The count of documents with ranges that completely spanned the parents of the detail level. In more technical
* terms, this is the count of leaf cells 2 up and higher from the bottom. Usually you only care about counts at
* detailLevel, and so you will add this number to all other counts below, including to omitted/implied children
* counts of 0. If there are no indexed ranges (just instances, i.e. fully specified dates) then this value will
* always be 0.
*/
public int topLeaves;
/** Holds all the {@link FacetParentVal} instances in order of the key. This is sparse; there won't be an
* instance if it's count and children are all 0. The keys are {@link org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree.UnitNRShape} shapes, which can be
* converted back to the original Object (i.e. a Calendar) via
* {@link NumberRangePrefixTree#toObject(org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree.UnitNRShape)}. */
public final SortedMap<UnitNRShape,FacetParentVal> parents = new TreeMap<>();
/** Holds a block of detailLevel counts aggregated to their parent level. */
public static class FacetParentVal {
/** The count of ranges that span all of the childCounts. In more technical terms, this is the number of leaf
* cells found at this parent. Treat this like {@link Facets#topLeaves}. */
public int parentLeaves;
/** The length of {@link #childCounts}. If childCounts is not null then this is childCounts.length, otherwise it
* says how long it would have been if it weren't null. */
public int childCountsLen;
/** The detail level counts. It will be null if there are none, and thus they are assumed 0. Most apps, when
* presenting the information, will add {@link #topLeaves} and {@link #parentLeaves} to each count. */
public int[] childCounts;
//assert childCountsLen == childCounts.length
}
@Override
public String toString() {
StringBuilder buf = new StringBuilder(2048);
buf.append("Facets: level=" + detailLevel + " topLeaves=" + topLeaves + " parentCount=" + parents.size());
for (Map.Entry<UnitNRShape, FacetParentVal> entry : parents.entrySet()) {
buf.append('\n');
if (buf.length() > 1000) {
buf.append("...");
break;
}
final FacetParentVal pVal = entry.getValue();
buf.append(' ').append(entry.getKey()+" leafCount=" + pVal.parentLeaves);
if (pVal.childCounts != null) {
buf.append(' ').append(Arrays.toString(pVal.childCounts));
}
}
return buf.toString();
}
}
}

View File

@ -1,275 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.spatial.prefix;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.List;
import com.carrotsearch.randomizedtesting.annotations.Repeat;
import org.locationtech.spatial4j.shape.Shape;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.TermsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.spatial.StrategyTestCase;
import org.apache.lucene.spatial.prefix.NumberRangePrefixTreeStrategy.Facets;
import org.apache.lucene.spatial.prefix.tree.Cell;
import org.apache.lucene.spatial.prefix.tree.CellIterator;
import org.apache.lucene.spatial.prefix.tree.DateRangePrefixTree;
import org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree;
import org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree.UnitNRShape;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import org.junit.Before;
import org.junit.Test;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomInt;
import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween;
public class NumberRangeFacetsTest extends StrategyTestCase {
DateRangePrefixTree tree;
int randomCalWindowField;
long randomCalWindowMs;
@Before
public void setUp() throws Exception {
super.setUp();
tree = DateRangePrefixTree.INSTANCE;
strategy = new NumberRangePrefixTreeStrategy(tree, "dateRange");
Calendar tmpCal = tree.newCal();
randomCalWindowField = randomIntBetween(1, Calendar.ZONE_OFFSET - 1);//we're not allowed to add zone offset
tmpCal.add(randomCalWindowField, 2_000);
randomCalWindowMs = Math.max(2000L, tmpCal.getTimeInMillis());
}
@Repeat(iterations = 20)
@Test
public void test() throws IOException {
//generate test data
List<Shape> indexedShapes = new ArrayList<>();
final int numIndexedShapes = random().nextInt(15);
for (int i = 0; i < numIndexedShapes; i++) {
indexedShapes.add(randomShape());
}
//Main index loop:
for (int i = 0; i < indexedShapes.size(); i++) {
Shape shape = indexedShapes.get(i);
adoc(""+i, shape);
if (random().nextInt(10) == 0)
commit();//intermediate commit, produces extra segments
}
//delete some documents randomly
for (int id = 0; id < indexedShapes.size(); id++) {
if (random().nextInt(10) == 0) {
deleteDoc(""+id);
indexedShapes.set(id, null);
}
}
commit();
//Main query loop:
for (int queryIdx = 0; queryIdx < 10; queryIdx++) {
preQueryHavoc();
// We need to have a facet range window to do the facets between (a start time & end time). We randomly
// pick a date, decide the level we want to facet on, and then pick a right end time that is up to 2 thousand
// values later.
int calFieldFacet = randomCalWindowField - 1;
if (calFieldFacet > 1 && rarely()) {
calFieldFacet--;
}
final Calendar leftCal = randomCalendar();
leftCal.add(calFieldFacet, -1 * randomInt(1000));
Calendar rightCal = (Calendar) leftCal.clone();
rightCal.add(calFieldFacet, randomInt(2000));
// Pick facet detail level based on cal field.
int detailLevel = tree.getTreeLevelForCalendarField(calFieldFacet);
if (detailLevel < 0) {//no exact match
detailLevel = -1 * detailLevel;
}
//Randomly pick a filter/acceptDocs
Bits topAcceptDocs = null;
List<Integer> acceptFieldIds = new ArrayList<>();
if (usually()) {
//get all possible IDs into a list, random shuffle it, then randomly choose how many of the first we use to
// replace the list.
for (int i = 0; i < indexedShapes.size(); i++) {
if (indexedShapes.get(i) == null) { // we deleted this one
continue;
}
acceptFieldIds.add(i);
}
Collections.shuffle(acceptFieldIds, random());
acceptFieldIds = acceptFieldIds.subList(0, randomInt(acceptFieldIds.size()));
if (!acceptFieldIds.isEmpty()) {
List<Term> terms = new ArrayList<>();
for (Integer acceptDocId : acceptFieldIds) {
terms.add(new Term("id", acceptDocId.toString()));
}
topAcceptDocs = searchForDocBits(new TermsQuery(terms));
}
}
//Lets do it!
NumberRangePrefixTree.NRShape facetRange = tree.toRangeShape(tree.toShape(leftCal), tree.toShape(rightCal));
Facets facets = ((NumberRangePrefixTreeStrategy) strategy)
.calcFacets(indexSearcher.getTopReaderContext(), topAcceptDocs, facetRange, detailLevel);
//System.out.println("Q: " + queryIdx + " " + facets);
//Verify results. We do it by looping over indexed shapes and reducing the facet counts.
Shape facetShapeRounded = facetRange.roundToLevel(detailLevel);
for (int indexedShapeId = 0; indexedShapeId < indexedShapes.size(); indexedShapeId++) {
if (topAcceptDocs != null && !acceptFieldIds.contains(indexedShapeId)) {
continue;// this doc was filtered out via acceptDocs
}
Shape indexedShape = indexedShapes.get(indexedShapeId);
if (indexedShape == null) {//was deleted
continue;
}
Shape indexedShapeRounded = ((NumberRangePrefixTree.NRShape) indexedShape).roundToLevel(detailLevel);
if (!indexedShapeRounded.relate(facetShapeRounded).intersects()) { // no intersection at all
continue;
}
// walk the cells
final CellIterator cellIterator = tree.getTreeCellIterator(indexedShape, detailLevel);
while (cellIterator.hasNext()) {
Cell cell = cellIterator.next();
if (!cell.getShape().relate(facetShapeRounded).intersects()) {
cellIterator.remove();//no intersection; prune
continue;
}
assert cell.getLevel() <= detailLevel;
if (cell.getLevel() == detailLevel) {
//count it
UnitNRShape shape = (UnitNRShape) cell.getShape();
final UnitNRShape parentShape = shape.getShapeAtLevel(detailLevel - 1);//get parent
final Facets.FacetParentVal facetParentVal = facets.parents.get(parentShape);
assertNotNull(facetParentVal);
int index = shape.getValAtLevel(shape.getLevel());
assertNotNull(facetParentVal.childCounts);
assert facetParentVal.childCounts[index] > 0;
facetParentVal.childCounts[index]--;
} else if (cell.isLeaf()) {
//count it, and remove/prune.
if (cell.getLevel() < detailLevel - 1) {
assert facets.topLeaves > 0;
facets.topLeaves--;
} else {
UnitNRShape shape = (UnitNRShape) cell.getShape();
final UnitNRShape parentShape = shape.getShapeAtLevel(detailLevel - 1);//get parent
final Facets.FacetParentVal facetParentVal = facets.parents.get(parentShape);
assertNotNull(facetParentVal);
assert facetParentVal.parentLeaves > 0;
facetParentVal.parentLeaves--;
}
cellIterator.remove();
}
}
}
// At this point; all counts should be down to zero.
assertTrue(facets.topLeaves == 0);
for (Facets.FacetParentVal facetParentVal : facets.parents.values()) {
assertTrue(facetParentVal.parentLeaves == 0);
if (facetParentVal.childCounts != null) {
for (int childCount : facetParentVal.childCounts) {
assertTrue(childCount == 0);
}
}
}
}
}
private Bits searchForDocBits(Query query) throws IOException {
FixedBitSet bitSet = new FixedBitSet(indexSearcher.getIndexReader().maxDoc());
indexSearcher.search(query,
new SimpleCollector() {
int leafDocBase;
@Override
public void collect(int doc) throws IOException {
bitSet.set(leafDocBase + doc);
}
@Override
protected void doSetNextReader(LeafReaderContext context) throws IOException {
leafDocBase = context.docBase;
}
@Override
public boolean needsScores() {
return false;
}
});
return bitSet;
}
private void preQueryHavoc() {
if (strategy instanceof RecursivePrefixTreeStrategy) {
RecursivePrefixTreeStrategy rpts = (RecursivePrefixTreeStrategy) strategy;
int scanLevel = randomInt(rpts.getGrid().getMaxLevels());
rpts.setPrefixGridScanLevel(scanLevel);
}
}
protected Shape randomShape() {
Calendar cal1 = randomCalendar();
UnitNRShape s1 = tree.toShape(cal1);
if (rarely()) {
return s1;
}
try {
Calendar cal2 = randomCalendar();
UnitNRShape s2 = tree.toShape(cal2);
if (cal1.compareTo(cal2) < 0) {
return tree.toRangeShape(s1, s2);
} else {
return tree.toRangeShape(s2, s1);
}
} catch (IllegalArgumentException e) {
assert e.getMessage().startsWith("Differing precision");
return s1;
}
}
private Calendar randomCalendar() {
Calendar cal = tree.newCal();
cal.setTimeInMillis(random().nextLong() % randomCalWindowMs);
try {
tree.clearFieldsAfter(cal, random().nextInt(Calendar.FIELD_COUNT+1)-1);
} catch (AssertionError e) {
if (!e.getMessage().equals("Calendar underflow"))
throw e;
}
return cal;
}
}