add and/not/or docid sets, not just docsets, and improve caching behavior

This commit is contained in:
kimchy 2010-10-27 18:16:15 +02:00
parent 8fef3df16f
commit ca67c12de5
9 changed files with 492 additions and 22 deletions

View File

@ -0,0 +1,135 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.common.lucene.docset;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import java.io.IOException;
import java.util.List;
/**
 * A {@link DocIdSet} that matches only doc ids present in <b>every</b> one of
 * the provided sets (set intersection / AND semantics).
 *
 * @author kimchy (shay.banon)
 */
public class AndDocIdSet extends DocIdSet {

    private final List<DocIdSet> sets;

    public AndDocIdSet(List<DocIdSet> sets) {
        this.sets = sets;
    }

    @Override public boolean isCacheable() {
        // not cacheable, the reason is that by default, when constructing the filter, it is not cacheable,
        // so if someone wants it to be cacheable, we might as well construct a cached version of the result
        return false;
        // for (DocIdSet set : sets) {
        //     if (!set.isCacheable()) {
        //         return false;
        //     }
        // }
        // return true;
    }

    @Override public DocIdSetIterator iterator() throws IOException {
        return new AndDocIdSetIterator();
    }

    /**
     * Conjunction iterator: returns only doc ids on which all underlying
     * iterators agree. Null input sets are ignored.
     */
    class AndDocIdSetIterator extends DocIdSetIterator {

        // last doc id returned, or NO_MORE_DOCS once exhausted
        int lastReturn = -1;

        private DocIdSetIterator[] iterators = null;

        AndDocIdSetIterator() throws IOException {
            // BUGFIX: the original sized the array to sets.size() while skipping
            // null sets, leaving null trailing slots that would NPE inside
            // nextDoc()/advance(). Count the non-null sets first so the array is
            // sized exactly.
            int count = 0;
            for (DocIdSet set : sets) {
                if (set != null) {
                    count++;
                }
            }
            iterators = new DocIdSetIterator[count];
            int j = 0;
            for (DocIdSet set : sets) {
                if (set != null) {
                    iterators[j++] = set.iterator();
                }
            }
            // with no iterators there is nothing to intersect -> start exhausted
            lastReturn = (iterators.length > 0 ? -1 : DocIdSetIterator.NO_MORE_DOCS);
        }

        @Override
        public final int docID() {
            return lastReturn;
        }

        @Override
        public final int nextDoc() throws IOException {
            if (lastReturn == DocIdSetIterator.NO_MORE_DOCS) return DocIdSetIterator.NO_MORE_DOCS;

            // "target" is the candidate doc: the next doc of iterator 0, then every
            // other iterator is advanced to it. Whenever one overshoots, the
            // candidate is bumped and the scan restarts from 0, skipping the
            // iterator that produced the new candidate ("skip").
            DocIdSetIterator dcit = iterators[0];
            int target = dcit.nextDoc();
            int size = iterators.length;
            int skip = 0;
            int i = 1;
            while (i < size) {
                if (i != skip) {
                    dcit = iterators[i];
                    int docid = dcit.advance(target);
                    if (docid > target) {
                        target = docid;
                        if (i != 0) {
                            skip = i;
                            i = 0;
                            continue;
                        } else
                            skip = 0;
                    }
                }
                i++;
            }
            // all iterators agreed on target (NO_MORE_DOCS propagates naturally)
            return (lastReturn = target);
        }

        @Override
        public final int advance(int target) throws IOException {
            if (lastReturn == DocIdSetIterator.NO_MORE_DOCS) return DocIdSetIterator.NO_MORE_DOCS;

            // same agreement loop as nextDoc(), seeded by advancing iterator 0 to
            // the requested target
            DocIdSetIterator dcit = iterators[0];
            target = dcit.advance(target);
            int size = iterators.length;
            int skip = 0;
            int i = 1;
            while (i < size) {
                if (i != skip) {
                    dcit = iterators[i];
                    int docid = dcit.advance(target);
                    if (docid > target) {
                        target = docid;
                        if (i != 0) {
                            skip = i;
                            i = 0;
                            continue;
                        } else {
                            skip = 0;
                        }
                    }
                }
                i++;
            }
            return (lastReturn = target);
        }
    }
}

View File

@ -44,12 +44,15 @@ public class AndDocSet extends DocSet {
}
@Override public boolean isCacheable() {
for (DocSet set : sets) {
if (!set.isCacheable()) {
return false;
}
}
return true;
// not cacheable, the reason is that by default, when constructing the filter, it is not cacheable,
// so if someone wants it to be cacheable, we might as well construct a cached version of the result
return false;
// for (DocSet set : sets) {
// if (!set.isCacheable()) {
// return false;
// }
// }
// return true;
}
@Override public DocIdSetIterator iterator() throws IOException {

View File

@ -0,0 +1,110 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.common.lucene.docset;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import java.io.IOException;
/**
 * A {@link DocIdSet} matching the complement of the wrapped set over the doc
 * id range {@code [0, max)}; {@code max} is exclusive (the caller in NotFilter
 * passes {@code reader.maxDoc()}).
 *
 * @author kimchy (shay.banon)
 */
public class NotDocIdSet extends DocIdSet {

    // the set whose documents are excluded
    private final DocIdSet set;

    // exclusive upper bound for doc ids produced by this set
    private final int max;

    public NotDocIdSet(DocIdSet set, int max) {
        this.max = max;
        this.set = set;
    }

    @Override public boolean isCacheable() {
        // not cacheable, the reason is that by default, when constructing the filter, it is not cacheable,
        // so if someone wants it to be cacheable, we might as well construct a cached version of the result
        return false;
        // return set.isCacheable();
    }

    @Override public DocIdSetIterator iterator() throws IOException {
        return new NotDocIdSetIterator();
    }

    /**
     * Walks candidate ids upward from 0, emitting every id below max that is
     * NOT present in the inner set. The inner iterator ({@code it1}) is kept
     * positioned at the first inner doc >= the current candidate; once it is
     * exhausted it is nulled out and every remaining id below max matches.
     */
    class NotDocIdSetIterator extends DocIdSetIterator {
        // last doc id returned, or NO_MORE_DOCS once exhausted
        int lastReturn = -1;
        // iterator over the negated set; null once that set is exhausted
        private DocIdSetIterator it1 = null;
        // current position of it1 (doc id it last returned)
        private int innerDocid = -1;

        NotDocIdSetIterator() throws IOException {
            initialize();
        }

        private void initialize() throws IOException {
            it1 = set.iterator();
            // an empty inner set means every id in [0, max) matches
            if ((innerDocid = it1.nextDoc()) == DocIdSetIterator.NO_MORE_DOCS) it1 = null;
        }

        @Override
        public int docID() {
            return lastReturn;
        }

        @Override
        public int nextDoc() throws IOException {
            // advance(0) resolves to lastReturn + 1 via the
            // "target <= lastReturn" adjustment below
            return advance(0);
        }

        @Override
        public int advance(int target) throws IOException {
            if (lastReturn == DocIdSetIterator.NO_MORE_DOCS) {
                return DocIdSetIterator.NO_MORE_DOCS;
            }
            // never move backwards; a too-small target means "next after last"
            if (target <= lastReturn) target = lastReturn + 1;
            // catch the inner iterator up to the candidate
            if (it1 != null && innerDocid < target) {
                if ((innerDocid = it1.advance(target)) == DocIdSetIterator.NO_MORE_DOCS) {
                    it1 = null;
                }
            }
            // skip over candidates that ARE in the inner set
            while (it1 != null && innerDocid == target) {
                target++;
                if (target >= max) {
                    return (lastReturn = DocIdSetIterator.NO_MORE_DOCS);
                }
                if ((innerDocid = it1.advance(target)) == DocIdSetIterator.NO_MORE_DOCS) {
                    it1 = null;
                }
            }
            // ADDED THIS, bug in original code: once the inner iterator is
            // exhausted the loop above no longer runs, so the upper bound must
            // be re-checked here before returning the candidate
            if (target >= max) {
                return (lastReturn = DocIdSetIterator.NO_MORE_DOCS);
            }
            return (lastReturn = target);
        }
    }
}

View File

@ -34,7 +34,10 @@ public class NotDocSet extends GetDocSet {
}
@Override public boolean isCacheable() {
return set.isCacheable();
// not cacheable, the reason is that by default, when constructing the filter, it is not cacheable,
// so if someone wants it to be cacheable, we might as well construct a cached version of the result
return false;
// return set.isCacheable();
}
@Override public boolean get(int doc) throws IOException {

View File

@ -0,0 +1,195 @@
/*
* Licensed to Elastic Search and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Elastic Search licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.common.lucene.docset;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import java.io.IOException;
import java.util.List;
/**
 * A {@link DocIdSet} matching doc ids present in <b>any</b> of the provided
 * sets (set union / OR semantics). Iteration merges the underlying iterators
 * through a min-heap keyed on each iterator's current doc id, which also
 * de-duplicates ids present in several sets.
 *
 * @author kimchy (shay.banon)
 */
public class OrDocIdSet extends DocIdSet {

    private final List<DocIdSet> sets;

    public OrDocIdSet(List<DocIdSet> sets) {
        this.sets = sets;
    }

    @Override public boolean isCacheable() {
        // not cacheable, the reason is that by default, when constructing the filter, it is not cacheable,
        // so if someone wants it to be cacheable, we might as well construct a cached version of the result
        return false;
        // for (DocIdSet set : sets) {
        //     if (!set.isCacheable()) {
        //         return false;
        //     }
        // }
        // return true;
    }

    @Override public DocIdSetIterator iterator() throws IOException {
        return new OrDocIdSetIterator();
    }

    public class OrDocIdSetIterator extends DocIdSetIterator {

        // heap entry: one underlying iterator together with its current doc id
        private final class Item {
            public final DocIdSetIterator iter;
            public int doc;

            public Item(DocIdSetIterator iter) {
                this.iter = iter;
                this.doc = -1;
            }
        }

        // current doc id of the union, or NO_MORE_DOCS once exhausted
        private int _curDoc;
        // min-heap on Item.doc; only the first _size entries are live
        private final Item[] _heap;
        private int _size;

        OrDocIdSetIterator() throws IOException {
            _curDoc = -1;
            _heap = new Item[sets.size()];
            _size = 0;
            for (DocIdSet set : sets) {
                _heap[_size++] = new Item(set.iterator());
            }
            // no explicit heapify needed here: every item starts at doc == -1,
            // which is trivially a valid min-heap
            if (_size == 0) _curDoc = DocIdSetIterator.NO_MORE_DOCS;
        }

        @Override
        public final int docID() {
            return _curDoc;
        }

        @Override
        public final int nextDoc() throws IOException {
            if (_curDoc == DocIdSetIterator.NO_MORE_DOCS) return DocIdSetIterator.NO_MORE_DOCS;

            Item top = _heap[0];
            while (true) {
                DocIdSetIterator topIter = top.iter;
                int docid;
                if ((docid = topIter.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    top.doc = docid;
                    heapAdjust();
                } else {
                    // this iterator is done; shrink the heap
                    heapRemoveRoot();
                    if (_size == 0) return (_curDoc = DocIdSetIterator.NO_MORE_DOCS);
                }
                top = _heap[0];
                int topDoc = top.doc;
                // keep pulling until the smallest current doc moves past _curDoc;
                // this naturally skips duplicates emitted by several iterators
                if (topDoc > _curDoc) {
                    return (_curDoc = topDoc);
                }
            }
        }

        @Override
        public final int advance(int target) throws IOException {
            if (_curDoc == DocIdSetIterator.NO_MORE_DOCS) return DocIdSetIterator.NO_MORE_DOCS;
            // never move backwards; a too-small target means "next after current"
            if (target <= _curDoc) target = _curDoc + 1;
            Item top = _heap[0];
            while (true) {
                DocIdSetIterator topIter = top.iter;
                int docid;
                if ((docid = topIter.advance(target)) != DocIdSetIterator.NO_MORE_DOCS) {
                    top.doc = docid;
                    heapAdjust();
                } else {
                    heapRemoveRoot();
                    if (_size == 0) return (_curDoc = DocIdSetIterator.NO_MORE_DOCS);
                }
                top = _heap[0];
                int topDoc = top.doc;
                if (topDoc >= target) {
                    return (_curDoc = topDoc);
                }
            }
        }

        // Organize subScorers into a min heap with scorers generating the earliest document on top.
        /*
        private final void heapify() {
            int size = _size;
            for (int i=(size>>1)-1; i>=0; i--)
                heapAdjust(i);
        }
        */

        /* The subtree of subScorers at root is a min heap except possibly for its root element.
         * Bubble the root down as required to make the subtree a heap.
         */
        private final void heapAdjust() {
            final Item[] heap = _heap;
            final Item top = heap[0];
            final int doc = top.doc;
            final int size = _size;
            int i = 0;
            while (true) {
                int lchild = (i << 1) + 1;
                if (lchild >= size) break;
                Item left = heap[lchild];
                int ldoc = left.doc;
                int rchild = lchild + 1;
                if (rchild < size) {
                    Item right = heap[rchild];
                    int rdoc = right.doc;
                    // sift towards the smaller of the two children
                    if (rdoc <= ldoc) {
                        if (doc <= rdoc) break;
                        heap[i] = right;
                        i = rchild;
                        continue;
                    }
                }
                if (doc <= ldoc) break;
                heap[i] = left;
                i = lchild;
            }
            heap[i] = top;
        }

        // Remove the root Scorer from subScorers and re-establish it as a heap
        private void heapRemoveRoot() {
            _size--;
            if (_size > 0) {
                Item tmp = _heap[0];
                _heap[0] = _heap[_size];
                _heap[_size] = tmp; // keep the finished iterator at the end for debugging
                heapAdjust();
            }
        }
    }
}

View File

@ -37,10 +37,13 @@ public class OrDocSet extends DocSet {
}
@Override public boolean get(int doc) throws IOException {
for (DocSet s : sets) {
if (s.get(doc)) return true;
}
// not cacheable, the reason is that by default, when constructing the filter, it is not cacheable,
// so if someone wants it to be cacheable, we might as well construct a cached version of the result
return false;
// for (DocSet s : sets) {
// if (s.get(doc)) return true;
// }
// return false;
}
@Override public boolean isCacheable() {

View File

@ -23,9 +23,9 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.elasticsearch.common.collect.Lists;
import org.elasticsearch.common.lucene.docset.AndDocIdSet;
import org.elasticsearch.common.lucene.docset.AndDocSet;
import org.elasticsearch.common.lucene.docset.DocSet;
import org.elasticsearch.common.lucene.docset.DocSets;
import java.io.IOException;
import java.util.List;
@ -47,13 +47,21 @@ public class AndFilter extends Filter {
@Override public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
if (filters.size() == 1) {
return DocSets.convert(reader, filters.get(0).getDocIdSet(reader));
return filters.get(0).getDocIdSet(reader);
}
List<DocSet> sets = Lists.newArrayListWithExpectedSize(filters.size());
List sets = Lists.newArrayListWithExpectedSize(filters.size());
boolean allAreDocSet = true;
for (Filter filter : filters) {
sets.add(DocSets.convert(reader, filter.getDocIdSet(reader)));
DocIdSet set = filter.getDocIdSet(reader);
if (!(set instanceof DocSet)) {
allAreDocSet = false;
}
sets.add(set);
}
return new AndDocSet(sets);
if (allAreDocSet) {
return new AndDocSet(sets);
}
return new AndDocIdSet(sets);
}
@Override public int hashCode() {

View File

@ -22,7 +22,8 @@ package org.elasticsearch.common.lucene.search;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.elasticsearch.common.lucene.docset.DocSets;
import org.elasticsearch.common.lucene.docset.DocSet;
import org.elasticsearch.common.lucene.docset.NotDocIdSet;
import org.elasticsearch.common.lucene.docset.NotDocSet;
import java.io.IOException;
@ -43,7 +44,11 @@ public class NotFilter extends Filter {
}
@Override public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
return new NotDocSet(DocSets.convert(reader, filter.getDocIdSet(reader)), reader.maxDoc());
DocIdSet set = filter.getDocIdSet(reader);
if (set instanceof DocSet) {
return new NotDocSet((DocSet) set, reader.maxDoc());
}
return new NotDocIdSet(set, reader.maxDoc());
}
@Override

View File

@ -24,7 +24,7 @@ import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.Filter;
import org.elasticsearch.common.collect.Lists;
import org.elasticsearch.common.lucene.docset.DocSet;
import org.elasticsearch.common.lucene.docset.DocSets;
import org.elasticsearch.common.lucene.docset.OrDocIdSet;
import org.elasticsearch.common.lucene.docset.OrDocSet;
import java.io.IOException;
@ -47,13 +47,21 @@ public class OrFilter extends Filter {
@Override public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
if (filters.size() == 1) {
return DocSets.convert(reader, filters.get(0).getDocIdSet(reader));
return filters.get(0).getDocIdSet(reader);
}
List<DocSet> sets = Lists.newArrayListWithExpectedSize(filters.size());
List sets = Lists.newArrayListWithExpectedSize(filters.size());
boolean allAreDocSet = true;
for (Filter filter : filters) {
sets.add(DocSets.convert(reader, filter.getDocIdSet(reader)));
DocIdSet set = filter.getDocIdSet(reader);
if (!(set instanceof DocSet)) {
allAreDocSet = false;
}
sets.add(set);
}
return new OrDocSet(sets);
if (allAreDocSet) {
return new OrDocSet(sets);
}
return new OrDocIdSet(sets);
}
@Override public int hashCode() {