add and/not/or docid sets, not just docsets, and improve caching behavior

This commit is contained in:
kimchy 2010-10-27 18:16:15 +02:00
parent 8fef3df16f
commit ca67c12de5
9 changed files with 492 additions and 22 deletions

View File

@ -0,0 +1,135 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.common.lucene.docset;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import java.io.IOException;
import java.util.List;
/**
 * A {@link DocIdSet} that matches only doc ids present in <b>every</b> one of
 * the provided sets (set intersection / AND semantics).
 *
 * @author kimchy (shay.banon)
 */
public class AndDocIdSet extends DocIdSet {

    private final List<DocIdSet> sets;

    public AndDocIdSet(List<DocIdSet> sets) {
        this.sets = sets;
    }

    @Override public boolean isCacheable() {
        // not cacheable, the reason is that by default, when constructing the filter, it is not cacheable,
        // so if someone wants it to be cacheable, we might as well construct a cached version of the result
        return false;
        // for (DocIdSet set : sets) {
        //     if (!set.isCacheable()) {
        //         return false;
        //     }
        // }
        // return true;
    }

    @Override public DocIdSetIterator iterator() throws IOException {
        return new AndDocIdSetIterator();
    }

    /**
     * Conjunction iterator: returns only doc ids on which all underlying
     * iterators agree. Null input sets are ignored.
     */
    class AndDocIdSetIterator extends DocIdSetIterator {

        // last doc id returned, or NO_MORE_DOCS once exhausted
        int lastReturn = -1;

        private DocIdSetIterator[] iterators = null;

        AndDocIdSetIterator() throws IOException {
            // BUGFIX: the original sized the array to sets.size() while skipping
            // null sets, leaving null trailing slots that would NPE inside
            // nextDoc()/advance(). Count the non-null sets first so the array is
            // sized exactly.
            int count = 0;
            for (DocIdSet set : sets) {
                if (set != null) {
                    count++;
                }
            }
            iterators = new DocIdSetIterator[count];
            int j = 0;
            for (DocIdSet set : sets) {
                if (set != null) {
                    iterators[j++] = set.iterator();
                }
            }
            // with no iterators there is nothing to intersect -> start exhausted
            lastReturn = (iterators.length > 0 ? -1 : DocIdSetIterator.NO_MORE_DOCS);
        }

        @Override
        public final int docID() {
            return lastReturn;
        }

        @Override
        public final int nextDoc() throws IOException {
            if (lastReturn == DocIdSetIterator.NO_MORE_DOCS) return DocIdSetIterator.NO_MORE_DOCS;

            // "target" is the candidate doc: the next doc of iterator 0, then every
            // other iterator is advanced to it. Whenever one overshoots, the
            // candidate is bumped and the scan restarts from 0, skipping the
            // iterator that produced the new candidate ("skip").
            DocIdSetIterator dcit = iterators[0];
            int target = dcit.nextDoc();
            int size = iterators.length;
            int skip = 0;
            int i = 1;
            while (i < size) {
                if (i != skip) {
                    dcit = iterators[i];
                    int docid = dcit.advance(target);
                    if (docid > target) {
                        target = docid;
                        if (i != 0) {
                            skip = i;
                            i = 0;
                            continue;
                        } else
                            skip = 0;
                    }
                }
                i++;
            }
            // all iterators agreed on target (NO_MORE_DOCS propagates naturally)
            return (lastReturn = target);
        }

        @Override
        public final int advance(int target) throws IOException {
            if (lastReturn == DocIdSetIterator.NO_MORE_DOCS) return DocIdSetIterator.NO_MORE_DOCS;

            // same agreement loop as nextDoc(), seeded by advancing iterator 0 to
            // the requested target
            DocIdSetIterator dcit = iterators[0];
            target = dcit.advance(target);
            int size = iterators.length;
            int skip = 0;
            int i = 1;
            while (i < size) {
                if (i != skip) {
                    dcit = iterators[i];
                    int docid = dcit.advance(target);
                    if (docid > target) {
                        target = docid;
                        if (i != 0) {
                            skip = i;
                            i = 0;
                            continue;
                        } else {
                            skip = 0;
                        }
                    }
                }
                i++;
            }
            return (lastReturn = target);
        }
    }
}

View File

@ -44,12 +44,15 @@ public class AndDocSet extends DocSet {
}
@Override public boolean isCacheable() {
for (DocSet set : sets) {
if (!set.isCacheable()) {
return false;
}
}
return true;
// not cacheable, the reason is that by default, when constructing the filter, it is not cacheable,
// so if someone wants it to be cacheable, we might as well construct a cached version of the result
return false;
// for (DocSet set : sets) {
// if (!set.isCacheable()) {
// return false;
// }
// }
// return true;
}
@Override public DocIdSetIterator iterator() throws IOException {

View File

@ -0,0 +1,110 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.common.lucene.docset;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import java.io.IOException;
/**
 * A {@link DocIdSet} matching the complement of the wrapped set over the doc
 * id range {@code [0, max)}; {@code max} is exclusive (the caller in NotFilter
 * passes {@code reader.maxDoc()}).
 *
 * @author kimchy (shay.banon)
 */
public class NotDocIdSet extends DocIdSet {

    // the set whose documents are excluded
    private final DocIdSet set;

    // exclusive upper bound for doc ids produced by this set
    private final int max;

    public NotDocIdSet(DocIdSet set, int max) {
        this.max = max;
        this.set = set;
    }

    @Override public boolean isCacheable() {
        // not cacheable, the reason is that by default, when constructing the filter, it is not cacheable,
        // so if someone wants it to be cacheable, we might as well construct a cached version of the result
        return false;
        // return set.isCacheable();
    }

    @Override public DocIdSetIterator iterator() throws IOException {
        return new NotDocIdSetIterator();
    }

    /**
     * Walks candidate ids upward from 0, emitting every id below max that is
     * NOT present in the inner set. The inner iterator ({@code it1}) is kept
     * positioned at the first inner doc >= the current candidate; once it is
     * exhausted it is nulled out and every remaining id below max matches.
     */
    class NotDocIdSetIterator extends DocIdSetIterator {
        // last doc id returned, or NO_MORE_DOCS once exhausted
        int lastReturn = -1;
        // iterator over the negated set; null once that set is exhausted
        private DocIdSetIterator it1 = null;
        // current position of it1 (doc id it last returned)
        private int innerDocid = -1;

        NotDocIdSetIterator() throws IOException {
            initialize();
        }

        private void initialize() throws IOException {
            it1 = set.iterator();
            // an empty inner set means every id in [0, max) matches
            if ((innerDocid = it1.nextDoc()) == DocIdSetIterator.NO_MORE_DOCS) it1 = null;
        }

        @Override
        public int docID() {
            return lastReturn;
        }

        @Override
        public int nextDoc() throws IOException {
            // advance(0) resolves to lastReturn + 1 via the
            // "target <= lastReturn" adjustment below
            return advance(0);
        }

        @Override
        public int advance(int target) throws IOException {
            if (lastReturn == DocIdSetIterator.NO_MORE_DOCS) {
                return DocIdSetIterator.NO_MORE_DOCS;
            }
            // never move backwards; a too-small target means "next after last"
            if (target <= lastReturn) target = lastReturn + 1;
            // catch the inner iterator up to the candidate
            if (it1 != null && innerDocid < target) {
                if ((innerDocid = it1.advance(target)) == DocIdSetIterator.NO_MORE_DOCS) {
                    it1 = null;
                }
            }
            // skip over candidates that ARE in the inner set
            while (it1 != null && innerDocid == target) {
                target++;
                if (target >= max) {
                    return (lastReturn = DocIdSetIterator.NO_MORE_DOCS);
                }
                if ((innerDocid = it1.advance(target)) == DocIdSetIterator.NO_MORE_DOCS) {
                    it1 = null;
                }
            }
            // ADDED THIS, bug in original code: once the inner iterator is
            // exhausted the loop above no longer runs, so the upper bound must
            // be re-checked here before returning the candidate
            if (target >= max) {
                return (lastReturn = DocIdSetIterator.NO_MORE_DOCS);
            }
            return (lastReturn = target);
        }
    }
}

View File

@ -34,7 +34,10 @@ public class NotDocSet extends GetDocSet {
}
@Override public boolean isCacheable() {
return set.isCacheable();
// not cacheable, the reason is that by default, when constructing the filter, it is not cacheable,
// so if someone wants it to be cacheable, we might as well construct a cached version of the result
return false;
// return set.isCacheable();
}
@Override public boolean get(int doc) throws IOException {

View File

@ -0,0 +1,195 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.common.lucene.docset;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import java.io.IOException;
import java.util.List;
/**
 * A {@link DocIdSet} matching doc ids present in <b>any</b> of the provided
 * sets (set union / OR semantics). Iteration merges the underlying iterators
 * through a min-heap keyed on each iterator's current doc id, which also
 * de-duplicates ids present in several sets.
 *
 * @author kimchy (shay.banon)
 */
public class OrDocIdSet extends DocIdSet {

    private final List<DocIdSet> sets;

    public OrDocIdSet(List<DocIdSet> sets) {
        this.sets = sets;
    }

    @Override public boolean isCacheable() {
        // not cacheable, the reason is that by default, when constructing the filter, it is not cacheable,
        // so if someone wants it to be cacheable, we might as well construct a cached version of the result
        return false;
        // for (DocIdSet set : sets) {
        //     if (!set.isCacheable()) {
        //         return false;
        //     }
        // }
        // return true;
    }

    @Override public DocIdSetIterator iterator() throws IOException {
        return new OrDocIdSetIterator();
    }

    public class OrDocIdSetIterator extends DocIdSetIterator {

        // heap entry: one underlying iterator together with its current doc id
        private final class Item {
            public final DocIdSetIterator iter;
            public int doc;

            public Item(DocIdSetIterator iter) {
                this.iter = iter;
                this.doc = -1;
            }
        }

        // current doc id of the union, or NO_MORE_DOCS once exhausted
        private int _curDoc;
        // min-heap on Item.doc; only the first _size entries are live
        private final Item[] _heap;
        private int _size;

        OrDocIdSetIterator() throws IOException {
            _curDoc = -1;
            _heap = new Item[sets.size()];
            _size = 0;
            for (DocIdSet set : sets) {
                _heap[_size++] = new Item(set.iterator());
            }
            // no explicit heapify needed here: every item starts at doc == -1,
            // which is trivially a valid min-heap
            if (_size == 0) _curDoc = DocIdSetIterator.NO_MORE_DOCS;
        }

        @Override
        public final int docID() {
            return _curDoc;
        }

        @Override
        public final int nextDoc() throws IOException {
            if (_curDoc == DocIdSetIterator.NO_MORE_DOCS) return DocIdSetIterator.NO_MORE_DOCS;

            Item top = _heap[0];
            while (true) {
                DocIdSetIterator topIter = top.iter;
                int docid;
                if ((docid = topIter.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    top.doc = docid;
                    heapAdjust();
                } else {
                    // this iterator is done; shrink the heap
                    heapRemoveRoot();
                    if (_size == 0) return (_curDoc = DocIdSetIterator.NO_MORE_DOCS);
                }
                top = _heap[0];
                int topDoc = top.doc;
                // keep pulling until the smallest current doc moves past _curDoc;
                // this naturally skips duplicates emitted by several iterators
                if (topDoc > _curDoc) {
                    return (_curDoc = topDoc);
                }
            }
        }

        @Override
        public final int advance(int target) throws IOException {
            if (_curDoc == DocIdSetIterator.NO_MORE_DOCS) return DocIdSetIterator.NO_MORE_DOCS;
            // never move backwards; a too-small target means "next after current"
            if (target <= _curDoc) target = _curDoc + 1;
            Item top = _heap[0];
            while (true) {
                DocIdSetIterator topIter = top.iter;
                int docid;
                if ((docid = topIter.advance(target)) != DocIdSetIterator.NO_MORE_DOCS) {
                    top.doc = docid;
                    heapAdjust();
                } else {
                    heapRemoveRoot();
                    if (_size == 0) return (_curDoc = DocIdSetIterator.NO_MORE_DOCS);
                }
                top = _heap[0];
                int topDoc = top.doc;
                if (topDoc >= target) {
                    return (_curDoc = topDoc);
                }
            }
        }

        // Organize subScorers into a min heap with scorers generating the earliest document on top.
        /*
        private final void heapify() {
            int size = _size;
            for (int i=(size>>1)-1; i>=0; i--)
                heapAdjust(i);
        }
        */

        /* The subtree of subScorers at root is a min heap except possibly for its root element.
         * Bubble the root down as required to make the subtree a heap.
         */
        private final void heapAdjust() {
            final Item[] heap = _heap;
            final Item top = heap[0];
            final int doc = top.doc;
            final int size = _size;
            int i = 0;
            while (true) {
                int lchild = (i << 1) + 1;
                if (lchild >= size) break;
                Item left = heap[lchild];
                int ldoc = left.doc;
                int rchild = lchild + 1;
                if (rchild < size) {
                    Item right = heap[rchild];
                    int rdoc = right.doc;
                    // sift towards the smaller of the two children
                    if (rdoc <= ldoc) {
                        if (doc <= rdoc) break;
                        heap[i] = right;
                        i = rchild;
                        continue;
                    }
                }
                if (doc <= ldoc) break;
                heap[i] = left;
                i = lchild;
            }
            heap[i] = top;
        }

        // Remove the root Scorer from subScorers and re-establish it as a heap
        private void heapRemoveRoot() {
            _size--;
            if (_size > 0) {
                Item tmp = _heap[0];
                _heap[0] = _heap[_size];
                _heap[_size] = tmp; // keep the finished iterator at the end for debugging
                heapAdjust();
            }
        }
    }
}

View File

@ -37,10 +37,13 @@ public class OrDocSet extends DocSet {
}
@Override public boolean get(int doc) throws IOException {
for (DocSet s : sets) {
if (s.get(doc)) return true;
}
// not cacheable, the reason is that by default, when constructing the filter, it is not cacheable,
// so if someone wants it to be cacheable, we might as well construct a cached version of the result
return false;
// for (DocSet s : sets) {
// if (s.get(doc)) return true;
// }
// return false;
}
@Override public boolean isCacheable() {

View File

@ -23,9 +23,9 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.elasticsearch.common.collect.Lists;
import org.elasticsearch.common.lucene.docset.AndDocIdSet;
import org.elasticsearch.common.lucene.docset.AndDocSet;
import org.elasticsearch.common.lucene.docset.DocSet;
import org.elasticsearch.common.lucene.docset.DocSets;
import java.io.IOException;
import java.util.List;
@ -47,13 +47,21 @@ public class AndFilter extends Filter {
@Override public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
if (filters.size() == 1) {
return DocSets.convert(reader, filters.get(0).getDocIdSet(reader));
return filters.get(0).getDocIdSet(reader);
}
List<DocSet> sets = Lists.newArrayListWithExpectedSize(filters.size());
List sets = Lists.newArrayListWithExpectedSize(filters.size());
boolean allAreDocSet = true;
for (Filter filter : filters) {
sets.add(DocSets.convert(reader, filter.getDocIdSet(reader)));
DocIdSet set = filter.getDocIdSet(reader);
if (!(set instanceof DocSet)) {
allAreDocSet = false;
}
sets.add(set);
}
return new AndDocSet(sets);
if (allAreDocSet) {
return new AndDocSet(sets);
}
return new AndDocIdSet(sets);
}
@Override public int hashCode() {

View File

@ -22,7 +22,8 @@ package org.elasticsearch.common.lucene.search;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.elasticsearch.common.lucene.docset.DocSets;
import org.elasticsearch.common.lucene.docset.DocSet;
import org.elasticsearch.common.lucene.docset.NotDocIdSet;
import org.elasticsearch.common.lucene.docset.NotDocSet;
import java.io.IOException;
@ -43,7 +44,11 @@ public class NotFilter extends Filter {
}
@Override public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
return new NotDocSet(DocSets.convert(reader, filter.getDocIdSet(reader)), reader.maxDoc());
DocIdSet set = filter.getDocIdSet(reader);
if (set instanceof DocSet) {
return new NotDocSet((DocSet) set, reader.maxDoc());
}
return new NotDocIdSet(set, reader.maxDoc());
}
@Override

View File

@ -24,7 +24,7 @@ import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.elasticsearch.common.collect.Lists;
import org.elasticsearch.common.lucene.docset.DocSet;
import org.elasticsearch.common.lucene.docset.DocSets;
import org.elasticsearch.common.lucene.docset.OrDocIdSet;
import org.elasticsearch.common.lucene.docset.OrDocSet;
import java.io.IOException;
@ -47,13 +47,21 @@ public class OrFilter extends Filter {
@Override public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
if (filters.size() == 1) {
return DocSets.convert(reader, filters.get(0).getDocIdSet(reader));
return filters.get(0).getDocIdSet(reader);
}
List<DocSet> sets = Lists.newArrayListWithExpectedSize(filters.size());
List sets = Lists.newArrayListWithExpectedSize(filters.size());
boolean allAreDocSet = true;
for (Filter filter : filters) {
sets.add(DocSets.convert(reader, filter.getDocIdSet(reader)));
DocIdSet set = filter.getDocIdSet(reader);
if (!(set instanceof DocSet)) {
allAreDocSet = false;
}
sets.add(set);
}
return new OrDocSet(sets);
if (allAreDocSet) {
return new OrDocSet(sets);
}
return new OrDocIdSet(sets);
}
@Override public int hashCode() {