LUCENE-4585: PrefixTree bugs

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1418006 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
David Wayne Smiley 2012-12-06 17:17:53 +00:00
parent ffb1558e91
commit 4eb1c502c9
7 changed files with 61 additions and 45 deletions

View File

@ -69,6 +69,12 @@ Changes in backwards compatibility policy
previous two methods returned by calling parents(), children() or siblings() previous two methods returned by calling parents(), children() or siblings()
on the returned ParallelTaxonomyArrays. (Shai Erera) on the returned ParallelTaxonomyArrays. (Shai Erera)
* LUCENE-4585: Spatial PrefixTree based Strategies (either TermQuery or
RecursivePrefix based) MAY want to re-index if used for point data. If a
re-index is not done, then an indexed point is ~1/2 the smallest grid cell
larger and as such is slightly more likely to match a query shape.
(David Smiley)
New Features New Features
* LUCENE-4226: New experimental StoredFieldsFormat that compresses chunks of * LUCENE-4226: New experimental StoredFieldsFormat that compresses chunks of
@ -207,16 +213,6 @@ Bug Fixes
* LUCENE-4009: Improve TermsFilter.toString (Tim Costermans via Chris * LUCENE-4009: Improve TermsFilter.toString (Tim Costermans via Chris
Male, Mike McCandless) Male, Mike McCandless)
* LUCENE-4588: Benchmark's EnwikiContentSource was discarding last wiki
document and had leaking threads in 'forever' mode. (Doron Cohen)
Changes in Runtime Behavior
* LUCENE-4586: Change default ResultMode of FacetRequest to PER_NODE_IN_TREE.
This only affects requests with depth>1. If you execute such requests and
rely on the facet results being returned flat (i.e. no hierarchy), you should
set the ResultMode to GLOBAL_FLAT. (Shai Erera, Gilad Barkai)
Optimizations Optimizations
* LUCENE-2221: oal.util.BitUtil was modified to use Long.bitCount and * LUCENE-2221: oal.util.BitUtil was modified to use Long.bitCount and
@ -269,9 +265,6 @@ Optimizations
Users of this API can now simply obtain an instance via DocValues#getDirectSource per thread. Users of this API can now simply obtain an instance via DocValues#getDirectSource per thread.
(Simon Willnauer) (Simon Willnauer)
* LUCENE-4580: DrillDown.query variants return a ConstantScoreQuery with boost set to 0.0f
so that documents scores are not affected by running a drill-down query. (Shai Erera)
Documentation Documentation
* LUCENE-4483: Refer to BytesRef.deepCopyOf in Term's constructor that takes BytesRef. * LUCENE-4483: Refer to BytesRef.deepCopyOf in Term's constructor that takes BytesRef.
@ -287,10 +280,6 @@ Build
RandomizedContext.contexts static map. Upgrade randomized testing RandomizedContext.contexts static map. Upgrade randomized testing
to version 2.0.2 (Mike McCandless, Dawid Weiss) to version 2.0.2 (Mike McCandless, Dawid Weiss)
* LUCENE-4589: Upgraded benchmark module's Nekohtml dependency to version
1.9.17, removing the workaround in Lucene's HTML parser for the
Turkish locale. (Uwe Schindler)
======================= Lucene 4.0.0 ======================= ======================= Lucene 4.0.0 =======================

View File

@ -45,6 +45,8 @@ public class PointPrefixTreeFieldCacheProvider extends ShapeFieldCacheProvider<P
@Override @Override
protected Point readShape(BytesRef term) { protected Point readShape(BytesRef term) {
scanCell = grid.getNode(term.bytes, term.offset, term.length, scanCell); scanCell = grid.getNode(term.bytes, term.offset, term.length, scanCell);
return scanCell.isLeaf() ? scanCell.getShape().getCenter() : null; if (scanCell.getLevel() == grid.getMaxLevels() && !scanCell.isLeaf())
return scanCell.getCenter();
return null;
} }
} }

View File

@ -19,7 +19,11 @@ package org.apache.lucene.spatial.prefix;
import com.spatial4j.core.shape.Shape; import com.spatial4j.core.shape.Shape;
import com.spatial4j.core.shape.SpatialRelation; import com.spatial4j.core.shape.SpatialRelation;
import org.apache.lucene.index.*; import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter; import org.apache.lucene.search.Filter;
@ -110,23 +114,28 @@ RE "scan" threshold:
while(!cells.isEmpty()) { while(!cells.isEmpty()) {
final Node cell = cells.removeFirst(); final Node cell = cells.removeFirst();
final BytesRef cellTerm = new BytesRef(cell.getTokenBytes()); final BytesRef cellTerm = new BytesRef(cell.getTokenBytes());
TermsEnum.SeekStatus seekStat = termsEnum.seekCeil(cellTerm); if (!termsEnum.seekExact(cellTerm, true))
if (seekStat == TermsEnum.SeekStatus.END)
break;
if (seekStat == TermsEnum.SeekStatus.NOT_FOUND)
continue; continue;
if (cell.getLevel() == detailLevel || cell.isLeaf()) { if (cell.getLevel() == detailLevel || cell.isLeaf()) {
docsEnum = termsEnum.docs(acceptDocs, docsEnum, 0); docsEnum = termsEnum.docs(acceptDocs, docsEnum, 0);
addDocs(docsEnum,bits); addDocs(docsEnum,bits);
} else {//any other intersection } else {//any other intersection
//If the next indexed term is the leaf marker, then add all of them assert cell.getLevel() < detailLevel; //assertions help clarify logic
assert !cell.isLeaf();
//If the next indexed term just adds a leaf marker ('+') to cell,
// then add all of those docs
BytesRef nextCellTerm = termsEnum.next(); BytesRef nextCellTerm = termsEnum.next();
if (nextCellTerm == null)
break;
assert StringHelper.startsWith(nextCellTerm, cellTerm); assert StringHelper.startsWith(nextCellTerm, cellTerm);
scanCell = grid.getNode(nextCellTerm.bytes, nextCellTerm.offset, nextCellTerm.length, scanCell); scanCell = grid.getNode(nextCellTerm.bytes, nextCellTerm.offset, nextCellTerm.length, scanCell);
if (scanCell.isLeaf()) { if (scanCell.getLevel() == cell.getLevel() && scanCell.isLeaf()) {
docsEnum = termsEnum.docs(acceptDocs, docsEnum, 0); docsEnum = termsEnum.docs(acceptDocs, docsEnum, 0);
addDocs(docsEnum,bits); addDocs(docsEnum,bits);
termsEnum.next();//move pointer to avoid potential redundant addDocs() below //increment pointer to avoid potential redundant addDocs() below
nextCellTerm = termsEnum.next();
if (nextCellTerm == null)
break;
} }
//Decide whether to continue to divide & conquer, or whether it's time to scan through terms beneath this cell. //Decide whether to continue to divide & conquer, or whether it's time to scan through terms beneath this cell.
@ -144,8 +153,13 @@ RE "scan" threshold:
if (termLevel > detailLevel) if (termLevel > detailLevel)
continue; continue;
if (termLevel == detailLevel || scanCell.isLeaf()) { if (termLevel == detailLevel || scanCell.isLeaf()) {
//TODO should put more thought into implications of box vs point Shape cShape;
Shape cShape = termLevel == grid.getMaxLevels() ? scanCell.getCenter() : scanCell.getShape(); //if this cell represents a point, use the cell center vs the box
// (points never have isLeaf())
if (termLevel == grid.getMaxLevels() && !scanCell.isLeaf())
cShape = scanCell.getCenter();
else
cShape = scanCell.getShape();
if(queryShape.relate(cShape) == SpatialRelation.DISJOINT) if(queryShape.relate(cShape) == SpatialRelation.DISJOINT)
continue; continue;

View File

@ -101,11 +101,11 @@ public class GeohashPrefixTree extends SpatialPrefixTree {
class GhCell extends Node { class GhCell extends Node {
GhCell(String token) { GhCell(String token) {
super(GeohashPrefixTree.this, token); super(token);
} }
GhCell(byte[] bytes, int off, int len) { GhCell(byte[] bytes, int off, int len) {
super(GeohashPrefixTree.this, bytes, off, len); super(bytes, off, len);
} }
@Override @Override

View File

@ -44,11 +44,14 @@ public abstract class Node implements Comparable<Node> {
private String token;//this is the only part of equality private String token;//this is the only part of equality
protected SpatialRelation shapeRel;//set in getSubCells(filter), and via setLeaf(). /** When set via getSubCells(filter), it is the relationship between this
private SpatialPrefixTree spatialPrefixTree; * cell and the given shape filter. If set via setLeaf() (to WITHIN), it is
* meant to indicate no further sub-cells are going to be provided because
* maxLevels or a detailLevel is hit. It's always null for points.
*/
protected SpatialRelation shapeRel;
protected Node(SpatialPrefixTree spatialPrefixTree, String token) { protected Node(String token) {
this.spatialPrefixTree = spatialPrefixTree;
this.token = token; this.token = token;
if (token.length() > 0 && token.charAt(token.length() - 1) == (char) LEAF_BYTE) { if (token.length() > 0 && token.charAt(token.length() - 1) == (char) LEAF_BYTE) {
this.token = token.substring(0, token.length() - 1); this.token = token.substring(0, token.length() - 1);
@ -59,8 +62,7 @@ public abstract class Node implements Comparable<Node> {
getShape();//ensure any lazy instantiation completes to make this threadsafe getShape();//ensure any lazy instantiation completes to make this threadsafe
} }
protected Node(SpatialPrefixTree spatialPrefixTree, byte[] bytes, int off, int len) { protected Node(byte[] bytes, int off, int len) {
this.spatialPrefixTree = spatialPrefixTree;
this.bytes = bytes; this.bytes = bytes;
this.b_off = off; this.b_off = off;
this.b_len = len; this.b_len = len;
@ -78,11 +80,10 @@ public abstract class Node implements Comparable<Node> {
} }
private void b_fixLeaf() { private void b_fixLeaf() {
//note that non-point shapes always have the maxLevels cell set with setLeaf
if (bytes[b_off + b_len - 1] == LEAF_BYTE) { if (bytes[b_off + b_len - 1] == LEAF_BYTE) {
b_len--; b_len--;
setLeaf(); setLeaf();
} else if (getLevel() == spatialPrefixTree.getMaxLevels()) {
setLeaf();
} }
} }
@ -90,6 +91,10 @@ public abstract class Node implements Comparable<Node> {
return shapeRel; return shapeRel;
} }
/**
* For points, this is always false. Otherwise this is true if there are no
* further cells with this prefix for the shape (always true at maxLevels).
*/
public boolean isLeaf() { public boolean isLeaf() {
return shapeRel == SpatialRelation.WITHIN; return shapeRel == SpatialRelation.WITHIN;
} }
@ -133,8 +138,14 @@ public abstract class Node implements Comparable<Node> {
//public Cell getParent(); //public Cell getParent();
/** /**
* Like {@link #getSubCells()} but with the results filtered by a shape. If that shape is a {@link com.spatial4j.core.shape.Point} then it * Like {@link #getSubCells()} but with the results filtered by a shape. If
* must call {@link #getSubCell(com.spatial4j.core.shape.Point)}; * that shape is a {@link com.spatial4j.core.shape.Point} then it
* must call {@link #getSubCell(com.spatial4j.core.shape.Point)}.
* The returned cells should have their {@link Node#shapeRel} set to their
* relation with {@code shapeFilter} for non-point. As such,
* {@link org.apache.lucene.spatial.prefix.tree.Node#isLeaf()} should be
* accurate.
* <p/>
* Precondition: Never called when getLevel() == maxLevel. * Precondition: Never called when getLevel() == maxLevel.
* *
* @param shapeFilter an optional filter for the returned cells. * @param shapeFilter an optional filter for the returned cells.

View File

@ -228,16 +228,16 @@ public class QuadPrefixTree extends SpatialPrefixTree {
class QuadCell extends Node { class QuadCell extends Node {
public QuadCell(String token) { public QuadCell(String token) {
super(QuadPrefixTree.this, token); super(token);
} }
public QuadCell(String token, SpatialRelation shapeRel) { public QuadCell(String token, SpatialRelation shapeRel) {
super(QuadPrefixTree.this, token); super(token);
this.shapeRel = shapeRel; this.shapeRel = shapeRel;
} }
QuadCell(byte[] bytes, int off, int len) { QuadCell(byte[] bytes, int off, int len) {
super(QuadPrefixTree.this, bytes, off, len); super(bytes, off, len);
} }
@Override @Override

View File

@ -156,7 +156,7 @@ public abstract class SpatialPrefixTree {
} }
final Collection<Node> subCells = node.getSubCells(shape); final Collection<Node> subCells = node.getSubCells(shape);
if (node.getLevel() == detailLevel - 1) { if (node.getLevel() == detailLevel - 1) {
if (subCells.size() < node.getSubCellsSize()) { if (subCells.size() < node.getSubCellsSize() || node.getLevel() == 0) {
if (inclParents) if (inclParents)
result.add(node); result.add(node);
for (Node subCell : subCells) { for (Node subCell : subCells) {
@ -164,7 +164,7 @@ public abstract class SpatialPrefixTree {
} }
result.addAll(subCells); result.addAll(subCells);
} else {//a bottom level (i.e. detail level) optimization where all boxes intersect, so use parent cell. } else {//a bottom level (i.e. detail level) optimization where all boxes intersect, so use parent cell.
node.setLeaf(); node.setLeaf();//the cell may not be strictly within but its close
result.add(node); result.add(node);
} }
} else { } else {