LUCENE-7871: fix false positive match in BlockJoinSelector when children have no value.

- introducing  BlockJoinSelector.wrap methods accepting children as DISI.
- extracting ToParentDocValues
This commit is contained in:
Mikhail Khludnev 2017-06-15 10:50:46 +03:00
parent d070ca6051
commit 706d201815
4 changed files with 401 additions and 302 deletions

View File

@ -16,18 +16,17 @@
*/
package org.apache.lucene.search.join;
import java.io.IOException;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedNumericSelector;
import org.apache.lucene.search.SortedSetSelector;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BitSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
/** Select a value from a block of documents.
* @lucene.internal */
@ -78,7 +77,15 @@ public class BlockJoinSelector {
/** Wraps the provided {@link SortedSetDocValues} in order to only select
* one value per parent among its {@code children} using the configured
* {@code selection} type. */
@Deprecated
public static SortedDocValues wrap(SortedSetDocValues sortedSet, Type selection, BitSet parents, BitSet children) {
return wrap(sortedSet, selection, parents, toIter(children));
}
/** Wraps the provided {@link SortedSetDocValues} in order to only select
* one value per parent among its {@code children} using the configured
* {@code selection} type. */
public static SortedDocValues wrap(SortedSetDocValues sortedSet, Type selection, BitSet parents, DocIdSetIterator children) {
SortedDocValues values;
switch (selection) {
case MIN:
@ -96,158 +103,38 @@ public class BlockJoinSelector {
/** Wraps the provided {@link SortedDocValues} in order to only select
* one value per parent among its {@code children} using the configured
* {@code selection} type. */
@Deprecated
public static SortedDocValues wrap(final SortedDocValues values, Type selection, BitSet parents, BitSet children) {
return wrap(values, selection, parents, toIter(children));
}
/** Wraps the provided {@link SortedDocValues} in order to only select
* one value per parent among its {@code children} using the configured
* {@code selection} type. */
public static SortedDocValues wrap(final SortedDocValues values, Type selection, BitSet parents, DocIdSetIterator children) {
if (values.docID() != -1) {
throw new IllegalArgumentException("values iterator was already consumed: values.docID=" + values.docID());
}
return new SortedDocValues() {
private int ord = -1;
private int docID = -1;
@Override
public int docID() {
return docID;
}
@Override
public int nextDoc() throws IOException {
assert docID != NO_MORE_DOCS;
if (values.docID() == -1) {
if (values.nextDoc() == NO_MORE_DOCS) {
docID = NO_MORE_DOCS;
return docID;
}
}
if (values.docID() == NO_MORE_DOCS) {
docID = NO_MORE_DOCS;
return docID;
}
int nextParentDocID = parents.nextSetBit(values.docID());
ord = values.ordValue();
while (true) {
int childDocID = values.nextDoc();
assert childDocID != nextParentDocID;
if (childDocID > nextParentDocID) {
break;
}
if (children.get(childDocID) == false) {
continue;
}
if (selection == Type.MIN) {
ord = Math.min(ord, values.ordValue());
} else if (selection == Type.MAX) {
ord = Math.max(ord, values.ordValue());
} else {
throw new AssertionError();
}
}
docID = nextParentDocID;
return docID;
}
@Override
public int advance(int target) throws IOException {
if (target >= parents.length()) {
docID = NO_MORE_DOCS;
return docID;
}
if (target == 0) {
assert docID() == -1;
return nextDoc();
}
int prevParentDocID = parents.prevSetBit(target-1);
if (values.docID() <= prevParentDocID) {
values.advance(prevParentDocID+1);
}
return nextDoc();
}
@Override
public boolean advanceExact(int targetParentDocID) throws IOException {
if (targetParentDocID < docID) {
throw new IllegalArgumentException("target must be after the current document: current=" + docID + " target=" + targetParentDocID);
}
int previousDocId = docID;
docID = targetParentDocID;
if (targetParentDocID == previousDocId) {
return ord != -1;
}
docID = targetParentDocID;
ord = -1;
if (parents.get(targetParentDocID) == false) {
return false;
}
int prevParentDocId = docID == 0 ? -1 : parents.prevSetBit(docID - 1);
int childDoc = values.docID();
if (childDoc <= prevParentDocId) {
childDoc = values.advance(prevParentDocId + 1);
}
if (childDoc >= docID) {
return false;
}
boolean hasValue = false;
for (int doc = values.docID(); doc < docID; doc = values.nextDoc()) {
if (children.get(doc)) {
ord = values.ordValue();
hasValue = true;
values.nextDoc();
break;
}
}
if (hasValue == false) {
return false;
}
for (int doc = values.docID(); doc < docID; doc = values.nextDoc()) {
if (children.get(doc)) {
switch (selection) {
case MIN:
ord = Math.min(ord, values.ordValue());
break;
case MAX:
ord = Math.max(ord, values.ordValue());
break;
default:
throw new AssertionError();
}
}
}
return true;
}
@Override
public int ordValue() {
return ord;
}
@Override
public BytesRef lookupOrd(int ord) throws IOException {
return values.lookupOrd(ord);
}
@Override
public int getValueCount() {
return values.getValueCount();
}
@Override
public long cost() {
return values.cost();
}
};
return ToParentDocValues.wrap(values, selection, parents, children);
}
/** Wraps the provided {@link SortedNumericDocValues} in order to only select
* one value per parent among its {@code children} using the configured
* {@code selection} type. */
@Deprecated
public static NumericDocValues wrap(SortedNumericDocValues sortedNumerics, Type selection, BitSet parents, BitSet children) {
return wrap(sortedNumerics, selection, parents, toIter(children));
}
/** creates an interator for the given bitset */
protected static BitSetIterator toIter(BitSet children) {
return new BitSetIterator(children, 0);
}
/** Wraps the provided {@link SortedNumericDocValues} in order to only select
* one value per parent among its {@code children} using the configured
* {@code selection} type. */
public static NumericDocValues wrap(SortedNumericDocValues sortedNumerics, Type selection, BitSet parents, DocIdSetIterator children) {
NumericDocValues values;
switch (selection) {
case MIN:
@ -265,145 +152,18 @@ public class BlockJoinSelector {
/** Wraps the provided {@link NumericDocValues}, iterating over only
* child documents, in order to only select one value per parent among
* its {@code children} using the configured {@code selection} type. */
@Deprecated
public static NumericDocValues wrap(final NumericDocValues values, Type selection, BitSet parents, BitSet children) {
return new NumericDocValues() {
private int parentDocID = -1;
private long value;
@Override
public int nextDoc() throws IOException {
if (parentDocID == -1) {
values.nextDoc();
}
while (true) {
// TODO: make this crazy loop more efficient
int childDocID = values.docID();
if (childDocID == NO_MORE_DOCS) {
parentDocID = NO_MORE_DOCS;
return parentDocID;
}
if (children.get(childDocID) == false) {
values.nextDoc();
continue;
}
assert parents.get(childDocID) == false;
parentDocID = parents.nextSetBit(childDocID);
value = values.longValue();
while (true) {
childDocID = values.nextDoc();
assert childDocID != parentDocID;
if (childDocID > parentDocID) {
break;
}
switch (selection) {
case MIN:
value = Math.min(value, values.longValue());
break;
case MAX:
value = Math.max(value, values.longValue());
break;
default:
throw new AssertionError();
}
}
break;
}
return parentDocID;
}
@Override
public int advance(int targetParentDocID) throws IOException {
if (targetParentDocID <= parentDocID) {
throw new IllegalArgumentException("target must be after the current document: current=" + parentDocID + " target=" + targetParentDocID);
}
if (targetParentDocID == 0) {
return nextDoc();
}
int firstChild = parents.prevSetBit(targetParentDocID - 1) + 1;
if (values.advance(firstChild) == NO_MORE_DOCS) {
parentDocID = NO_MORE_DOCS;
return parentDocID;
} else {
return nextDoc();
}
}
@Override
public boolean advanceExact(int targetParentDocID) throws IOException {
if (targetParentDocID <= parentDocID) {
throw new IllegalArgumentException("target must be after the current document: current=" + parentDocID + " target=" + targetParentDocID);
}
parentDocID = targetParentDocID;
if (parents.get(targetParentDocID) == false) {
return false;
}
int prevParentDocId = parentDocID == 0 ? -1 : parents.prevSetBit(parentDocID - 1);
int childDoc = values.docID();
if (childDoc <= prevParentDocId) {
childDoc = values.advance(prevParentDocId + 1);
}
if (childDoc >= parentDocID) {
return false;
}
boolean hasValue = false;
for (int doc = values.docID(); doc < parentDocID; doc = values.nextDoc()) {
if (children.get(doc)) {
value = values.longValue();
hasValue = true;
values.nextDoc();
break;
}
}
if (hasValue == false) {
return false;
}
for (int doc = values.docID(); doc < parentDocID; doc = values.nextDoc()) {
if (children.get(doc)) {
switch (selection) {
case MIN:
value = Math.min(value, values.longValue());
break;
case MAX:
value = Math.max(value, values.longValue());
break;
default:
throw new AssertionError();
}
}
}
return true;
}
@Override
public long longValue() {
return value;
}
@Override
public int docID() {
return parentDocID;
}
@Override
public long cost() {
return values.cost();
}
};
return wrap(values,selection, parents, toIter(children));
}
/** Wraps the provided {@link NumericDocValues}, iterating over only
* child documents, in order to only select one value per parent among
* its {@code children} using the configured {@code selection} type. */
public static NumericDocValues wrap(final NumericDocValues values, Type selection, BitSet parents, DocIdSetIterator children) {
if (values.docID() != -1) {
throw new IllegalArgumentException("values iterator was already consumed: values.docID=" + values.docID());
}
return ToParentDocValues.wrap(values,selection, parents, children);
}
}

View File

@ -0,0 +1,296 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.join;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.search.ConjunctionDISI;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.join.BlockJoinSelector.Type;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BytesRef;
class ToParentDocValues extends DocIdSetIterator {
interface Accumulator{
void reset() throws IOException;
void increment() throws IOException;
}
private static final class SortedDVs extends SortedDocValues implements Accumulator{
private final SortedDocValues values;
private final BlockJoinSelector.Type selection;
private int ord = -1;
private final ToParentDocValues iter;
private SortedDVs(SortedDocValues values, BlockJoinSelector.Type selection, BitSet parents, DocIdSetIterator children) {
this.values = values;
this.selection = selection;
this.iter = new ToParentDocValues(values,parents, children, this);
}
@Override
public int docID() {
return iter.docID();
}
@Override
public
void reset() throws IOException {
ord = values.ordValue();
}
@Override
public
void increment() throws IOException {
if (selection == BlockJoinSelector.Type.MIN) {
ord = Math.min(ord, values.ordValue());
} else if (selection == BlockJoinSelector.Type.MAX) {
ord = Math.max(ord, values.ordValue());
} else {
throw new AssertionError();
}
}
@Override
public int nextDoc() throws IOException {
return iter.nextDoc();
}
@Override
public int advance(int target) throws IOException {
return iter.advance(target);
}
@Override
public boolean advanceExact(int targetParentDocID) throws IOException {
return iter.advanceExact(targetParentDocID);
}
@Override
public int ordValue() {
return ord;
}
@Override
public BytesRef lookupOrd(int ord) throws IOException {
return values.lookupOrd(ord);
}
@Override
public int getValueCount() {
return values.getValueCount();
}
@Override
public long cost() {
return values.cost();
}
}
static private final class NumDV extends NumericDocValues implements Accumulator{
private final NumericDocValues values;
private long value;
private final BlockJoinSelector.Type selection;
private final ToParentDocValues iter;
private NumDV(NumericDocValues values, BlockJoinSelector.Type selection, BitSet parents, DocIdSetIterator children) {
this.values = values;
this.selection = selection;
iter = new ToParentDocValues(values, parents, children, this);
}
@Override
public void reset() throws IOException {
value = values.longValue();
}
@Override
public void increment() throws IOException {
switch (selection) {
case MIN:
value = Math.min(value, values.longValue());
break;
case MAX:
value = Math.max(value, values.longValue());
break;
default:
throw new AssertionError();
}
}
@Override
public int nextDoc() throws IOException {
return iter.nextDoc();
}
@Override
public int advance(int targetParentDocID) throws IOException {
return iter.advance(targetParentDocID);
}
@Override
public boolean advanceExact(int targetParentDocID) throws IOException {
return iter.advanceExact(targetParentDocID);
}
@Override
public long longValue() {
return value;
}
@Override
public int docID() {
return iter.docID();
}
@Override
public long cost() {
return values.cost();
}
}
private ToParentDocValues(DocIdSetIterator values, BitSet parents, DocIdSetIterator children, Accumulator collect) {
this.parents = parents;
childWithValues = ConjunctionDISI.intersectIterators(Arrays.asList(children, values));
this.collector = collect;
}
final private BitSet parents;
private int docID = -1;
final private Accumulator collector;
boolean seen=false;
private DocIdSetIterator childWithValues;
@Override
public int docID() {
return docID;
}
@Override
public int nextDoc() throws IOException {
assert docID != NO_MORE_DOCS;
assert childWithValues.docID()!=docID || docID==-1;
if (childWithValues.docID()<docID || docID==-1) {
childWithValues.nextDoc();
}
if (childWithValues.docID() == NO_MORE_DOCS) {
docID = NO_MORE_DOCS;
return docID;
}
assert parents.get(childWithValues.docID()) == false;
int nextParentDocID = parents.nextSetBit(childWithValues.docID());
collector.reset();
seen=true;
while (true) {
int childDocID = childWithValues.nextDoc();
assert childDocID != nextParentDocID;
if (childDocID > nextParentDocID) {
break;
}
collector.increment();
}
docID = nextParentDocID;
return docID;
}
@Override
public int advance(int target) throws IOException {
if (target >= parents.length()) {
docID = NO_MORE_DOCS;
return docID;
}
if (target == 0) {
assert docID() == -1;
return nextDoc();
}
int prevParentDocID = parents.prevSetBit(target-1);
if (childWithValues.docID() <= prevParentDocID) {
childWithValues.advance(prevParentDocID+1);
}
return nextDoc();
}
//@Override
public boolean advanceExact(int targetParentDocID) throws IOException {
if (targetParentDocID < docID) {
throw new IllegalArgumentException("target must be after the current document: current=" + docID + " target=" + targetParentDocID);
}
int previousDocId = docID;
docID = targetParentDocID;
if (targetParentDocID == previousDocId) {
return seen;//ord != -1; rlly???
}
docID = targetParentDocID;
seen =false;
//ord = -1;
if (parents.get(targetParentDocID) == false) {
return false;
}
int prevParentDocId = docID == 0 ? -1 : parents.prevSetBit(docID - 1);
int childDoc = childWithValues.docID();
if (childDoc <= prevParentDocId) {
childDoc = childWithValues.advance(prevParentDocId + 1);
}
if (childDoc >= docID) {
return false;
}
if (childWithValues.docID() < docID) {
collector.reset();
seen=true;
childWithValues.nextDoc();
}
if (seen == false) {
return false;
}
for (int doc = childWithValues.docID(); doc < docID; doc = childWithValues.nextDoc()) {
collector.increment();
}
return true;
}
@Override
public long cost() {
return 0;
}
static NumericDocValues wrap(NumericDocValues values, Type selection, BitSet parents2,
DocIdSetIterator children) {
return new ToParentDocValues.NumDV(values,selection, parents2, children);
}
static SortedDocValues wrap(SortedDocValues values, Type selection, BitSet parents2,
DocIdSetIterator children) {
return new ToParentDocValues.SortedDVs(values, selection, parents2, children);
}
}

View File

@ -16,20 +16,22 @@
*/
package org.apache.lucene.search.join;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
import java.io.IOException;
import java.util.Arrays;
import java.util.Random;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
public class TestBlockJoinSelector extends LuceneTestCase {
public void testDocsWithValue() {
@ -64,6 +66,48 @@ public class TestBlockJoinSelector extends LuceneTestCase {
assertFalse(docsWithValue.get(19));
}
static void assertNoMoreDoc(DocIdSetIterator sdv, int maxDoc) throws IOException{
Random r = random();
if(r.nextBoolean()){
assertEquals(NO_MORE_DOCS, sdv.nextDoc());
} else {
if (r.nextBoolean()) {
assertEquals(NO_MORE_DOCS, sdv.advance(sdv.docID()+random().nextInt(maxDoc-sdv.docID())));
} else {
final int noMatchDoc = sdv.docID()+random().nextInt(maxDoc-sdv.docID()-1)+1;
assertFalse(advanceExact(sdv,noMatchDoc));
assertEquals(noMatchDoc, sdv.docID());
if (r.nextBoolean()){
assertEquals(NO_MORE_DOCS, sdv.nextDoc());
}
}
}
}
static int nextDoc(DocIdSetIterator sdv, int docId) throws IOException{
Random r = random();
if(r.nextBoolean()){
return sdv.nextDoc();
} else {
if (r.nextBoolean()) {
return sdv.advance(sdv.docID()+random().nextInt(docId-sdv.docID()-1)+1);
} else {
if (r.nextBoolean()){
final int noMatchDoc = sdv.docID()+random().nextInt(docId-sdv.docID()-1)+1;
assertFalse(advanceExact(sdv,noMatchDoc));
assertEquals(noMatchDoc, sdv.docID());
}
assertTrue(advanceExact(sdv,docId));
return sdv.docID();
}
}
}
private static boolean advanceExact(DocIdSetIterator sdv, int target) throws IOException {
return sdv instanceof SortedDocValues ? ((SortedDocValues) sdv).advanceExact(target)
: ((NumericDocValues) sdv).advanceExact(target);
}
public void testSortedSelector() throws IOException {
final BitSet parents = new FixedBitSet(20);
parents.set(0);
@ -89,22 +133,18 @@ public class TestBlockJoinSelector extends LuceneTestCase {
ords[18] = 10;
final SortedDocValues mins = BlockJoinSelector.wrap(DocValues.singleton(new CannedSortedDocValues(ords)), BlockJoinSelector.Type.MIN, parents, children);
assertEquals(5, mins.nextDoc());
assertEquals(5, nextDoc(mins,5));
assertEquals(3, mins.ordValue());
assertEquals(15, mins.nextDoc());
assertEquals(15, nextDoc(mins,15));
assertEquals(10, mins.ordValue());
assertEquals(19, mins.nextDoc());
assertEquals(10, mins.ordValue());
assertEquals(NO_MORE_DOCS, mins.nextDoc());
assertNoMoreDoc(mins, 20);
final SortedDocValues maxs = BlockJoinSelector.wrap(DocValues.singleton(new CannedSortedDocValues(ords)), BlockJoinSelector.Type.MAX, parents, children);
assertEquals(5, maxs.nextDoc());
assertEquals(5, nextDoc(maxs,5));
assertEquals(7, maxs.ordValue());
assertEquals(15, maxs.nextDoc());
assertEquals(15, nextDoc(maxs,15));
assertEquals(10, maxs.ordValue());
assertEquals(19, maxs.nextDoc());
assertEquals(10, maxs.ordValue());
assertEquals(NO_MORE_DOCS, maxs.nextDoc());
assertNoMoreDoc( maxs,20);
}
private static class CannedSortedDocValues extends SortedDocValues {
@ -207,18 +247,18 @@ public class TestBlockJoinSelector extends LuceneTestCase {
longs[18] = 10;
final NumericDocValues mins = BlockJoinSelector.wrap(DocValues.singleton(new CannedNumericDocValues(longs, docsWithValue)), BlockJoinSelector.Type.MIN, parents, children);
assertEquals(5, mins.nextDoc());
assertEquals(5, nextDoc(mins,5));
assertEquals(3, mins.longValue());
assertEquals(15, mins.nextDoc());
assertEquals(15, nextDoc(mins,15));
assertEquals(10, mins.longValue());
assertEquals(NO_MORE_DOCS, mins.nextDoc());
assertNoMoreDoc(mins, 20);
final NumericDocValues maxs = BlockJoinSelector.wrap(DocValues.singleton(new CannedNumericDocValues(longs, docsWithValue)), BlockJoinSelector.Type.MAX, parents, children);
assertEquals(5, maxs.nextDoc());
assertEquals(5, nextDoc(maxs, 5));
assertEquals(7, maxs.longValue());
assertEquals(15, maxs.nextDoc());
assertEquals(15, nextDoc(maxs, 15));
assertEquals(10, maxs.longValue());
assertEquals(NO_MORE_DOCS, maxs.nextDoc());
assertNoMoreDoc(maxs, 20);
}
private static class CannedNumericDocValues extends NumericDocValues {

View File

@ -501,6 +501,9 @@ when using one of Exact*StatsCache (Mikhail Khludnev)
* SOLR-10910: Clean up a few details left over from pluggable transient core and untangling
CoreDescriptor/CoreContainer references (Erick Erickson)
* LUCENE-7871: fix false positive match in BlockJoinSelector when children have no value, introducing
wrap methods accepting children as DISI. Extracting ToParentDocValues (Mikhail Khludnev)
Optimizations
----------------------
* SOLR-10634: JSON Facet API: When a field/terms facet will retrieve all buckets (i.e. limit:-1)