diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index fd01a922866..71a715b80f3 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -25,10 +25,6 @@ New Features input tokens. Useful for normalizing short text in clustering/linking tasks. (Mark Harwood, Adrien Grand) -* LUCENE-5735: NumberRangePrefixTreeStrategy now includes interval/range faceting - for counting ranges that align with the underlying terms as defined by the - NumberRangePrefixTree (e.g. familiar date units like days). (David Smiley) - * LUCENE-6711: Use CollectionStatistics.docCount() for IDF and average field length computations, to avoid skew from documents that don't have the field. (Ahmet Arslan via Robert Muir) diff --git a/lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/NumberRangePrefixTreeStrategy.java b/lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/NumberRangePrefixTreeStrategy.java index c727c0da075..b310de39f69 100644 --- a/lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/NumberRangePrefixTreeStrategy.java +++ b/lucene/spatial-extras/src/java/org/apache/lucene/spatial/prefix/NumberRangePrefixTreeStrategy.java @@ -16,22 +16,13 @@ */ package org.apache.lucene.spatial.prefix; -import java.io.IOException; -import java.util.Arrays; import java.util.Iterator; -import java.util.Map; -import java.util.SortedMap; -import java.util.TreeMap; -import org.locationtech.spatial4j.shape.Point; -import org.locationtech.spatial4j.shape.Shape; -import org.apache.lucene.index.IndexReaderContext; import org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.spatial.prefix.tree.Cell; import org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree; -import org.apache.lucene.util.Bits; - -import static org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree.UnitNRShape; +import org.locationtech.spatial4j.shape.Point; +import org.locationtech.spatial4j.shape.Shape; /** A PrefixTree based on Number/Date ranges. This isn't very "spatial" on the surface (to the user) but * it's implemented using spatial so that's why it's here extending a SpatialStrategy. When using this class, you will @@ -68,132 +59,4 @@ public class NumberRangePrefixTreeStrategy extends RecursivePrefixTreeStrategy { throw new UnsupportedOperationException(); } - /** Calculates facets between {@code start} and {@code end} to a detail level one greater than that provided by the - * arguments. For example providing March to October of 2014 would return facets to the day level of those months. - * This is just a convenience method. - * @see #calcFacets(IndexReaderContext, Bits, Shape, int) - */ - public Facets calcFacets(IndexReaderContext context, Bits topAcceptDocs, UnitNRShape start, UnitNRShape end) - throws IOException { - Shape facetRange = getGrid().toRangeShape(start, end); - int detailLevel = Math.max(start.getLevel(), end.getLevel()) + 1; - return calcFacets(context, topAcceptDocs, facetRange, detailLevel); - } - - /** - * Calculates facets (aggregated counts) given a range shape (start-end span) and a level, which specifies the detail. - * To get the level of an existing shape, say a Calendar, call - * {@link org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree#toUnitShape(Object)} then call - * {@link org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree.UnitNRShape#getLevel()}. - * Facet computation is implemented by navigating the underlying indexed terms efficiently. - */ - public Facets calcFacets(IndexReaderContext context, Bits topAcceptDocs, Shape facetRange, final int level) - throws IOException { - final Facets facets = new Facets(level); - PrefixTreeFacetCounter.compute(this, context, topAcceptDocs, facetRange, level, - new PrefixTreeFacetCounter.FacetVisitor() { - Facets.FacetParentVal parentFacet; - UnitNRShape parentShape; - - @Override - public void visit(Cell cell, int count) { - if (cell.getLevel() < level - 1) {//some ancestor of parent facet level, direct or distant - parentFacet = null;//reset - parentShape = null;//reset - facets.topLeaves += count; - } else if (cell.getLevel() == level - 1) {//parent - //set up FacetParentVal - setupParent((UnitNRShape) cell.getShape()); - parentFacet.parentLeaves += count; - } else {//at facet level - UnitNRShape unitShape = (UnitNRShape) cell.getShape(); - UnitNRShape unitShapeParent = unitShape.getShapeAtLevel(unitShape.getLevel() - 1); - if (parentFacet == null || !parentShape.equals(unitShapeParent)) { - setupParent(unitShapeParent); - } - //lazy init childCounts - if (parentFacet.childCounts == null) { - parentFacet.childCounts = new int[parentFacet.childCountsLen]; - } - parentFacet.childCounts[unitShape.getValAtLevel(cell.getLevel())] += count; - } - } - - private void setupParent(UnitNRShape unitShape) { - parentShape = unitShape.clone(); - //Look for existing parentFacet (from previous segment), or create anew if needed - parentFacet = facets.parents.get(parentShape); - if (parentFacet == null) {//didn't find one; make a new one - parentFacet = new Facets.FacetParentVal(); - parentFacet.childCountsLen = getGrid().getNumSubCells(parentShape); - facets.parents.put(parentShape, parentFacet); - } - } - }); - return facets; - } - - /** Facet response information */ - public static class Facets { - //TODO consider a variable-level structure -- more general purpose. - - public Facets(int detailLevel) { - this.detailLevel = detailLevel; - } - - /** The bottom-most detail-level counted, as requested. */ - public final int detailLevel; - - /** - * The count of documents with ranges that completely spanned the parents of the detail level. In more technical - * terms, this is the count of leaf cells 2 up and higher from the bottom. Usually you only care about counts at - * detailLevel, and so you will add this number to all other counts below, including to omitted/implied children - * counts of 0. If there are no indexed ranges (just instances, i.e. fully specified dates) then this value will - * always be 0. - */ - public int topLeaves; - - /** Holds all the {@link FacetParentVal} instances in order of the key. This is sparse; there won't be an - * instance if it's count and children are all 0. The keys are {@link org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree.UnitNRShape} shapes, which can be - * converted back to the original Object (i.e. a Calendar) via - * {@link NumberRangePrefixTree#toObject(org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree.UnitNRShape)}. */ - public final SortedMap parents = new TreeMap<>(); - - /** Holds a block of detailLevel counts aggregated to their parent level. */ - public static class FacetParentVal { - - /** The count of ranges that span all of the childCounts. In more technical terms, this is the number of leaf - * cells found at this parent. Treat this like {@link Facets#topLeaves}. */ - public int parentLeaves; - - /** The length of {@link #childCounts}. If childCounts is not null then this is childCounts.length, otherwise it - * says how long it would have been if it weren't null. */ - public int childCountsLen; - - /** The detail level counts. It will be null if there are none, and thus they are assumed 0. Most apps, when - * presenting the information, will add {@link #topLeaves} and {@link #parentLeaves} to each count. */ - public int[] childCounts; - //assert childCountsLen == childCounts.length - } - - @Override - public String toString() { - StringBuilder buf = new StringBuilder(2048); - buf.append("Facets: level=" + detailLevel + " topLeaves=" + topLeaves + " parentCount=" + parents.size()); - for (Map.Entry entry : parents.entrySet()) { - buf.append('\n'); - if (buf.length() > 1000) { - buf.append("..."); - break; - } - final FacetParentVal pVal = entry.getValue(); - buf.append(' ').append(entry.getKey()+" leafCount=" + pVal.parentLeaves); - if (pVal.childCounts != null) { - buf.append(' ').append(Arrays.toString(pVal.childCounts)); - } - } - return buf.toString(); - } - } - } diff --git a/lucene/spatial-extras/src/test/org/apache/lucene/spatial/prefix/NumberRangeFacetsTest.java b/lucene/spatial-extras/src/test/org/apache/lucene/spatial/prefix/NumberRangeFacetsTest.java deleted file mode 100644 index 514c18e078b..00000000000 --- a/lucene/spatial-extras/src/test/org/apache/lucene/spatial/prefix/NumberRangeFacetsTest.java +++ /dev/null @@ -1,275 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.spatial.prefix; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Calendar; -import java.util.Collections; -import java.util.List; - -import com.carrotsearch.randomizedtesting.annotations.Repeat; -import org.locationtech.spatial4j.shape.Shape; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.Term; -import org.apache.lucene.queries.TermsQuery; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.SimpleCollector; -import org.apache.lucene.spatial.StrategyTestCase; -import org.apache.lucene.spatial.prefix.NumberRangePrefixTreeStrategy.Facets; -import org.apache.lucene.spatial.prefix.tree.Cell; -import org.apache.lucene.spatial.prefix.tree.CellIterator; -import org.apache.lucene.spatial.prefix.tree.DateRangePrefixTree; -import org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree; -import org.apache.lucene.spatial.prefix.tree.NumberRangePrefixTree.UnitNRShape; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.FixedBitSet; -import org.junit.Before; -import org.junit.Test; - -import static com.carrotsearch.randomizedtesting.RandomizedTest.randomInt; -import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween; - -public class NumberRangeFacetsTest extends StrategyTestCase { - - DateRangePrefixTree tree; - - int randomCalWindowField; - long randomCalWindowMs; - - @Before - public void setUp() throws Exception { - super.setUp(); - tree = DateRangePrefixTree.INSTANCE; - strategy = new NumberRangePrefixTreeStrategy(tree, "dateRange"); - Calendar tmpCal = tree.newCal(); - randomCalWindowField = randomIntBetween(1, Calendar.ZONE_OFFSET - 1);//we're not allowed to add zone offset - tmpCal.add(randomCalWindowField, 2_000); - randomCalWindowMs = Math.max(2000L, tmpCal.getTimeInMillis()); - } - - @Repeat(iterations = 20) - @Test - public void test() throws IOException { - //generate test data - List indexedShapes = new ArrayList<>(); - final int numIndexedShapes = random().nextInt(15); - for (int i = 0; i < numIndexedShapes; i++) { - indexedShapes.add(randomShape()); - } - - //Main index loop: - for (int i = 0; i < indexedShapes.size(); i++) { - Shape shape = indexedShapes.get(i); - adoc(""+i, shape); - - if (random().nextInt(10) == 0) - commit();//intermediate commit, produces extra segments - } - - //delete some documents randomly - for (int id = 0; id < indexedShapes.size(); id++) { - if (random().nextInt(10) == 0) { - deleteDoc(""+id); - indexedShapes.set(id, null); - } - } - - commit(); - - //Main query loop: - for (int queryIdx = 0; queryIdx < 10; queryIdx++) { - preQueryHavoc(); - - // We need to have a facet range window to do the facets between (a start time & end time). We randomly - // pick a date, decide the level we want to facet on, and then pick a right end time that is up to 2 thousand - // values later. - int calFieldFacet = randomCalWindowField - 1; - if (calFieldFacet > 1 && rarely()) { - calFieldFacet--; - } - final Calendar leftCal = randomCalendar(); - leftCal.add(calFieldFacet, -1 * randomInt(1000)); - Calendar rightCal = (Calendar) leftCal.clone(); - rightCal.add(calFieldFacet, randomInt(2000)); - // Pick facet detail level based on cal field. - int detailLevel = tree.getTreeLevelForCalendarField(calFieldFacet); - if (detailLevel < 0) {//no exact match - detailLevel = -1 * detailLevel; - } - - //Randomly pick a filter/acceptDocs - Bits topAcceptDocs = null; - List acceptFieldIds = new ArrayList<>(); - if (usually()) { - //get all possible IDs into a list, random shuffle it, then randomly choose how many of the first we use to - // replace the list. - for (int i = 0; i < indexedShapes.size(); i++) { - if (indexedShapes.get(i) == null) { // we deleted this one - continue; - } - acceptFieldIds.add(i); - } - Collections.shuffle(acceptFieldIds, random()); - acceptFieldIds = acceptFieldIds.subList(0, randomInt(acceptFieldIds.size())); - if (!acceptFieldIds.isEmpty()) { - List terms = new ArrayList<>(); - for (Integer acceptDocId : acceptFieldIds) { - terms.add(new Term("id", acceptDocId.toString())); - } - - topAcceptDocs = searchForDocBits(new TermsQuery(terms)); - } - } - - //Lets do it! - NumberRangePrefixTree.NRShape facetRange = tree.toRangeShape(tree.toShape(leftCal), tree.toShape(rightCal)); - Facets facets = ((NumberRangePrefixTreeStrategy) strategy) - .calcFacets(indexSearcher.getTopReaderContext(), topAcceptDocs, facetRange, detailLevel); - - //System.out.println("Q: " + queryIdx + " " + facets); - - //Verify results. We do it by looping over indexed shapes and reducing the facet counts. - Shape facetShapeRounded = facetRange.roundToLevel(detailLevel); - for (int indexedShapeId = 0; indexedShapeId < indexedShapes.size(); indexedShapeId++) { - if (topAcceptDocs != null && !acceptFieldIds.contains(indexedShapeId)) { - continue;// this doc was filtered out via acceptDocs - } - Shape indexedShape = indexedShapes.get(indexedShapeId); - if (indexedShape == null) {//was deleted - continue; - } - Shape indexedShapeRounded = ((NumberRangePrefixTree.NRShape) indexedShape).roundToLevel(detailLevel); - if (!indexedShapeRounded.relate(facetShapeRounded).intersects()) { // no intersection at all - continue; - } - // walk the cells - final CellIterator cellIterator = tree.getTreeCellIterator(indexedShape, detailLevel); - while (cellIterator.hasNext()) { - Cell cell = cellIterator.next(); - if (!cell.getShape().relate(facetShapeRounded).intersects()) { - cellIterator.remove();//no intersection; prune - continue; - } - assert cell.getLevel() <= detailLevel; - - if (cell.getLevel() == detailLevel) { - //count it - UnitNRShape shape = (UnitNRShape) cell.getShape(); - final UnitNRShape parentShape = shape.getShapeAtLevel(detailLevel - 1);//get parent - final Facets.FacetParentVal facetParentVal = facets.parents.get(parentShape); - assertNotNull(facetParentVal); - int index = shape.getValAtLevel(shape.getLevel()); - assertNotNull(facetParentVal.childCounts); - assert facetParentVal.childCounts[index] > 0; - facetParentVal.childCounts[index]--; - - } else if (cell.isLeaf()) { - //count it, and remove/prune. - if (cell.getLevel() < detailLevel - 1) { - assert facets.topLeaves > 0; - facets.topLeaves--; - } else { - UnitNRShape shape = (UnitNRShape) cell.getShape(); - final UnitNRShape parentShape = shape.getShapeAtLevel(detailLevel - 1);//get parent - final Facets.FacetParentVal facetParentVal = facets.parents.get(parentShape); - assertNotNull(facetParentVal); - assert facetParentVal.parentLeaves > 0; - facetParentVal.parentLeaves--; - } - - cellIterator.remove(); - } - } - } - // At this point; all counts should be down to zero. - assertTrue(facets.topLeaves == 0); - for (Facets.FacetParentVal facetParentVal : facets.parents.values()) { - assertTrue(facetParentVal.parentLeaves == 0); - if (facetParentVal.childCounts != null) { - for (int childCount : facetParentVal.childCounts) { - assertTrue(childCount == 0); - } - } - } - - } - } - - private Bits searchForDocBits(Query query) throws IOException { - FixedBitSet bitSet = new FixedBitSet(indexSearcher.getIndexReader().maxDoc()); - indexSearcher.search(query, - new SimpleCollector() { - int leafDocBase; - @Override - public void collect(int doc) throws IOException { - bitSet.set(leafDocBase + doc); - } - - @Override - protected void doSetNextReader(LeafReaderContext context) throws IOException { - leafDocBase = context.docBase; - } - - @Override - public boolean needsScores() { - return false; - } - }); - return bitSet; - } - - private void preQueryHavoc() { - if (strategy instanceof RecursivePrefixTreeStrategy) { - RecursivePrefixTreeStrategy rpts = (RecursivePrefixTreeStrategy) strategy; - int scanLevel = randomInt(rpts.getGrid().getMaxLevels()); - rpts.setPrefixGridScanLevel(scanLevel); - } - } - - protected Shape randomShape() { - Calendar cal1 = randomCalendar(); - UnitNRShape s1 = tree.toShape(cal1); - if (rarely()) { - return s1; - } - try { - Calendar cal2 = randomCalendar(); - UnitNRShape s2 = tree.toShape(cal2); - if (cal1.compareTo(cal2) < 0) { - return tree.toRangeShape(s1, s2); - } else { - return tree.toRangeShape(s2, s1); - } - } catch (IllegalArgumentException e) { - assert e.getMessage().startsWith("Differing precision"); - return s1; - } - } - - private Calendar randomCalendar() { - Calendar cal = tree.newCal(); - cal.setTimeInMillis(random().nextLong() % randomCalWindowMs); - try { - tree.clearFieldsAfter(cal, random().nextInt(Calendar.FIELD_COUNT+1)-1); - } catch (AssertionError e) { - if (!e.getMessage().equals("Calendar underflow")) - throw e; - } - return cal; - } -}