From 3dd0b7f2786b820b854bed33f7e5cb736f74decf Mon Sep 17 00:00:00 2001 From: Yonik Seeley Date: Wed, 27 Apr 2011 01:42:05 +0000 Subject: [PATCH] SOLR-2272: Pseudo-join git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1096978 13f79535-47bb-0310-9956-ffa450edef68 --- .../org/apache/lucene/index/DocTermOrds.java | 4 +- solr/CHANGES.txt | 7 + .../handler/component/ResponseBuilder.java | 19 + .../org/apache/solr/request/SimpleFacets.java | 8 +- .../apache/solr/request/SolrRequestInfo.java | 31 +- .../apache/solr/request/UnInvertedField.java | 17 +- .../org/apache/solr/search/BitDocSet.java | 20 + .../java/org/apache/solr/search/DocSet.java | 30 + .../java/org/apache/solr/search/DocSlice.java | 20 + .../org/apache/solr/search/HashDocSet.java | 35 ++ .../apache/solr/search/JoinQParserPlugin.java | 572 ++++++++++++++++++ .../org/apache/solr/search/QParserPlugin.java | 1 + .../apache/solr/search/SolrIndexSearcher.java | 119 ++-- .../apache/solr/search/SortedIntDocSet.java | 107 ++++ .../org/apache/solr/SolrTestCaseJ4.java | 24 + solr/src/test/org/apache/solr/TestJoin.java | 216 +++++++ .../solrj/MultiCoreExampleTestBase.java | 13 +- 17 files changed, 1177 insertions(+), 66 deletions(-) create mode 100644 solr/src/java/org/apache/solr/search/JoinQParserPlugin.java create mode 100644 solr/src/test/org/apache/solr/TestJoin.java diff --git a/lucene/src/java/org/apache/lucene/index/DocTermOrds.java b/lucene/src/java/org/apache/lucene/index/DocTermOrds.java index 9c5361f6dde..7bf10a8b06f 100644 --- a/lucene/src/java/org/apache/lucene/index/DocTermOrds.java +++ b/lucene/src/java/org/apache/lucene/index/DocTermOrds.java @@ -129,6 +129,8 @@ public class DocTermOrds { protected BytesRef prefix; protected int ordBase; + protected DocsEnum docsEnum; //used while uninverting + public long ramUsedInBytes() { // can cache the mem size since it shouldn't change if (memsz!=0) return memsz; @@ -270,7 +272,7 @@ public class DocTermOrds { // frequent terms ahead of time. int termNum = 0; - DocsEnum docsEnum = null; + docsEnum = null; // Loop begins with te positioned to first term (we call // seek above): diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 20237e40a96..0ed4698c902 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -137,6 +137,13 @@ New Features * SOLR-2383: /browse improvements: generalize range and date facet display (Jan Høydahl via yonik) +* SOLR-2272: Pseudo-join queries / filters. Examples: + To restrict to the set of parents with at least one blue-eyed child: + fq={!join from=parent to=name}eyes:blue + To restrict to the set of children with at least one blue-eyed parent: + fq={!join from=name to=parent}eyes:blue + (yonik) + Optimizations diff --git a/solr/src/java/org/apache/solr/handler/component/ResponseBuilder.java b/solr/src/java/org/apache/solr/handler/component/ResponseBuilder.java index ea86f88a36d..5bcb5a967d2 100644 --- a/solr/src/java/org/apache/solr/handler/component/ResponseBuilder.java +++ b/solr/src/java/org/apache/solr/handler/component/ResponseBuilder.java @@ -163,6 +163,25 @@ public class ResponseBuilder debugInfo.add( name, val ); } + public void addDebug(Object val, String... 
path) {
+ if( debugInfo == null ) {
+ debugInfo = new SimpleOrderedMap();
+ }
+
+ NamedList target = debugInfo;
+ for (int i=0; i<path.length-1; i++) {
+ String elem = path[i];
+ NamedList newTarget = (NamedList)debugInfo.get(elem);
+ if (newTarget == null) {
+ newTarget = new SimpleOrderedMap();
+ target.add(elem, newTarget);
+ }
+ target = newTarget;
+ }
+
+ target.add(path[path.length-1], val);
+ }
+
//-------------------------------------------------------------------------
//-------------------------------------------------------------------------
diff --git a/solr/src/java/org/apache/solr/request/SimpleFacets.java b/solr/src/java/org/apache/solr/request/SimpleFacets.java
index 1173082764b..37080c06461 100644
--- a/solr/src/java/org/apache/solr/request/SimpleFacets.java
+++ b/solr/src/java/org/apache/solr/request/SimpleFacets.java
@@ -21,6 +21,7 @@ import org.apache.lucene.index.*;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.*;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.packed.Direct16;
import org.apache.lucene.util.packed.Direct32;
import org.apache.lucene.util.packed.Direct8;
@@ -682,14 +683,15 @@ public class SimpleFacets {
if (deState==null) {
deState = new SolrIndexSearcher.DocsEnumState();
+ deState.fieldName = StringHelper.intern(field);
deState.deletedDocs = MultiFields.getDeletedDocs(r);
deState.termsEnum = termsEnum;
- deState.reuse = docsEnum;
+ deState.docsEnum = docsEnum;
}
- c = searcher.numDocs(new TermQuery(t), docs, deState);
+ c = searcher.numDocs(docs, deState);
- docsEnum = deState.reuse;
+ docsEnum = deState.docsEnum;
} else {
// iterate over TermDocs to calculate the intersection
diff --git a/solr/src/java/org/apache/solr/request/SolrRequestInfo.java b/solr/src/java/org/apache/solr/request/SolrRequestInfo.java
index e95de6796ac..8f926f3d50b 100755
--- a/solr/src/java/org/apache/solr/request/SolrRequestInfo.java
+++ b/solr/src/java/org/apache/solr/request/SolrRequestInfo.java
@@ -17,11 +17,15 @@ package org.apache.solr.request;
+import org.apache.solr.common.SolrException;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.component.ResponseBuilder;
import org.apache.solr.response.SolrQueryResponse;
+import java.io.Closeable;
import java.util.Date;
+import java.util.LinkedList;
+import java.util.List;
public class SolrRequestInfo {
@@ -31,6 +35,8 @@ public class SolrRequestInfo {
protected SolrQueryResponse rsp;
protected Date now;
protected ResponseBuilder rb;
+ protected List<Closeable> closeHooks;
+
public static SolrRequestInfo getRequestInfo() {
return threadLocal.get();
}
@@ -48,7 +54,20 @@ public class SolrRequestInfo {
}
public static void clearRequestInfo() {
- threadLocal.remove();
+ try {
+ SolrRequestInfo info = threadLocal.get();
+ if (info != null && info.closeHooks != null) {
+ for (Closeable hook : info.closeHooks) {
+ try {
+ hook.close();
+ } catch (Throwable throwable) {
+ SolrException.log(SolrCore.log, "Exception during close hook", throwable);
+ }
+ }
+ }
+ } finally {
+ threadLocal.remove();
+ }
}
public SolrRequestInfo(SolrQueryRequest req, SolrQueryResponse rsp) {
@@ -88,4 +107,14 @@ public class SolrRequestInfo {
public void setResponseBuilder(ResponseBuilder rb) {
this.rb = rb;
}
+
+ public void addCloseHook(Closeable hook) {
+ // is this better here, or on SolrQueryRequest?
+ synchronized (this) {
+ if (closeHooks == null) {
+ closeHooks = new LinkedList<Closeable>();
+ }
+ closeHooks.add(hook);
+ }
+ }
}
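The close hooks registered through addCloseHook() above run when clearRequestInfo() fires at the end of the request, even if the request failed. A minimal sketch of how calling code might use this API — the component and resource type here are hypothetical; only getRequestInfo() and addCloseHook() come from the patch:

import java.io.Closeable;
import java.io.IOException;

import org.apache.solr.request.SolrRequestInfo;

public class CloseHookExample {
  // Hypothetical resource that should live only for the current request.
  interface RequestScopedResource { void release(); }

  static void borrowForRequest(final RequestScopedResource resource) {
    SolrRequestInfo info = SolrRequestInfo.getRequestInfo();
    if (info == null) return; // not inside a request; caller must clean up itself
    info.addCloseHook(new Closeable() {
      @Override
      public void close() throws IOException {
        resource.release(); // invoked from clearRequestInfo(), even on error paths
      }
    });
  }
}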
diff --git a/solr/src/java/org/apache/solr/request/UnInvertedField.java b/solr/src/java/org/apache/solr/request/UnInvertedField.java
index 6596f2d073d..da7fd273187 100755
--- a/solr/src/java/org/apache/solr/request/UnInvertedField.java
+++ b/solr/src/java/org/apache/solr/request/UnInvertedField.java
@@ -23,6 +23,7 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
+import org.apache.lucene.util.StringHelper;
import org.apache.noggit.CharArr;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.util.NamedList;
@@ -122,11 +123,15 @@ public class UnInvertedField extends DocTermOrds {
if (deState == null) {
deState = new SolrIndexSearcher.DocsEnumState();
- deState.termsEnum = te;
+ deState.fieldName = StringHelper.intern(field);
+ // deState.termsEnum = te.tenum;
+ deState.termsEnum = te; // TODO: check for MultiTermsEnum in SolrIndexSearcher could now fail?
+ deState.docsEnum = docsEnum;
+ deState.minSetSizeCached = maxTermDocFreq;
}
-
- maxTermCounts[termNum] = searcher.getDocSet(new TermQuery(new Term(field, topTerm.term)), deState).size();
- //System.out.println(" big term termNum=" + termNum + " term=" + topTerm.term.utf8ToString() + " size=" + maxTermCounts[termNum] + " dF=" + te.docFreq());
+ docsEnum = deState.docsEnum;
+ DocSet set = searcher.getDocSet(deState);
+ maxTermCounts[termNum] = set.size();
}
}
@@ -158,10 +163,10 @@ public class UnInvertedField extends DocTermOrds {
super(field,
// threshold, over which we use set intersections instead of counting
// to (1) save memory, and (2) speed up faceting.
- // Add 1 for testing purposes so that there will always be some terms under
+ // Add 2 for testing purposes so that there will always be some terms under
// the threshold even when the index is very
// small.
- searcher.maxDoc()/20 + 1,
+ searcher.maxDoc()/20 + 2,
DEFAULT_INDEX_INTERVAL_BITS);
//System.out.println("maxTermDocFreq=" + maxTermDocFreq + " maxDoc=" + searcher.maxDoc());
diff --git a/solr/src/java/org/apache/solr/search/BitDocSet.java b/solr/src/java/org/apache/solr/search/BitDocSet.java
index 617986cb8f9..d5cd85d6d2e 100644
--- a/solr/src/java/org/apache/solr/search/BitDocSet.java
+++ b/solr/src/java/org/apache/solr/search/BitDocSet.java
@@ -160,6 +160,16 @@ public class BitDocSet extends DocSetBase {
}
}
+ @Override
+ public boolean intersects(DocSet other) {
+ if (other instanceof BitDocSet) {
+ return bits.intersects(((BitDocSet)other).bits);
+ } else {
+ // they had better not call us back!
+ return other.intersects(this); + } + } + @Override public int unionSize(DocSet other) { if (other instanceof BitDocSet) { @@ -183,6 +193,11 @@ public class BitDocSet extends DocSetBase { } } + @Override + public void setBitsOn(OpenBitSet target) { + target.union(bits); + } + @Override public DocSet andNot(DocSet other) { OpenBitSet newbits = (OpenBitSet)(bits.clone()); @@ -211,4 +226,9 @@ public class BitDocSet extends DocSetBase { public long memSize() { return (bits.getBits().length << 3) + 16; } + + @Override + protected BitDocSet clone() { + return new BitDocSet((OpenBitSet)bits.clone(), size); + } } diff --git a/solr/src/java/org/apache/solr/search/DocSet.java b/solr/src/java/org/apache/solr/search/DocSet.java index ddfc3cde67f..fcf8785cce4 100644 --- a/solr/src/java/org/apache/solr/search/DocSet.java +++ b/solr/src/java/org/apache/solr/search/DocSet.java @@ -115,6 +115,9 @@ public interface DocSet /* extends Collection */ { */ public int intersectionSize(DocSet other); + /** Returns true if these sets have any elements in common */ + public boolean intersects(DocSet other); + /** * Returns the union of this set with another set. Neither set is modified - a new DocSet is * created and returned. @@ -146,6 +149,14 @@ public interface DocSet /* extends Collection */ { * methods will be invoked with. */ public Filter getTopFilter(); + + /** + * Takes the docs from this set and sets those bits on the target OpenBitSet. + * The target should be sized large enough to accommodate all of the documents before calling this method. + */ + public void setBitsOn(OpenBitSet target); + + public static DocSet EMPTY = new SortedIntDocSet(new int[0], 0); } /** A base class that may be usefull for implementing DocSets */ @@ -213,6 +224,17 @@ abstract class DocSetBase implements DocSet { return new BitDocSet(newbits); } + public boolean intersects(DocSet other) { + // intersection is overloaded in the smaller DocSets to be more + // efficient, so dispatch off of it instead. + if (!(other instanceof BitDocSet)) { + return other.intersects(this); + } + // less efficient way: get the intersection size + return intersectionSize(other) > 0; + } + + public DocSet union(DocSet other) { OpenBitSet newbits = (OpenBitSet)(this.getBits().clone()); newbits.or(other.getBits()); @@ -295,6 +317,14 @@ abstract class DocSetBase implements DocSet { } }; } + + public void setBitsOn(OpenBitSet target) { + DocIterator iter = iterator(); + while (iter.hasNext()) { + target.fastSet(iter.nextDoc()); + } + } + } diff --git a/solr/src/java/org/apache/solr/search/DocSlice.java b/solr/src/java/org/apache/solr/search/DocSlice.java index 0f8887c7943..51fc6f12c3a 100644 --- a/solr/src/java/org/apache/solr/search/DocSlice.java +++ b/solr/src/java/org/apache/solr/search/DocSlice.java @@ -17,6 +17,8 @@ package org.apache.solr.search; +import java.util.Arrays; + /** * DocSlice implements DocList as an array of docids and optional scores. 
 *
 *
@@ -141,4 +143,22 @@ public class DocSlice extends DocSetBase implements DocList {
HashDocSet h = new HashDocSet(docs,offset,len);
return h.intersectionSize(other);
}
+
+ @Override
+ public boolean intersects(DocSet other) {
+ if (other instanceof SortedIntDocSet || other instanceof HashDocSet) {
+ return other.intersects(this);
+ }
+ HashDocSet h = new HashDocSet(docs,offset,len);
+ return h.intersects(other);
+ }
+
+ @Override
+ protected DocSlice clone() {
+ try {
+ // DocSlice is not currently mutable
+ return (DocSlice) super.clone();
+ } catch (CloneNotSupportedException e) {}
+ return null;
+ }
}
diff --git a/solr/src/java/org/apache/solr/search/HashDocSet.java b/solr/src/java/org/apache/solr/search/HashDocSet.java
index 1c610841b5d..d5dd5023261 100644
--- a/solr/src/java/org/apache/solr/search/HashDocSet.java
+++ b/solr/src/java/org/apache/solr/search/HashDocSet.java
@@ -48,6 +48,12 @@ public final class HashDocSet extends DocSetBase {
private final int mask;
+ public HashDocSet(HashDocSet set) {
+ this.table = set.table.clone();
+ this.size = set.size;
+ this.mask = set.mask;
+ }
+
/** Create a HashDocSet from a list of *unique* ids */
public HashDocSet(int[] docs, int offset, int len) {
this(docs, offset, len, DEFAULT_INVERSE_LOAD_FACTOR);
}
@@ -207,6 +213,31 @@ public final class HashDocSet extends DocSetBase {
}
+ @Override
+ public boolean intersects(DocSet other) {
+ if (other instanceof HashDocSet) {
+ // set "a" to the smallest doc set for the most efficient
+ // intersection.
+ final HashDocSet a = size()<=other.size() ? this : (HashDocSet)other;
+ final HashDocSet b = size()<=other.size() ? (HashDocSet)other : this;
+
+ for (int i=0; i<a.table.length; i++) {
+ int id = a.table[i];
+ if (id >= 0 && b.exists(id)) {
+ return true;
+ }
+ }
+ return false;
+ } else {
+ for (int i=0; i<table.length; i++) {
+ int id = table[i];
+ if (id >= 0 && other.exists(id)) {
+ return true;
+ }
+ }
+ return false;
+ }
+ }
@Override
public DocSet andNot(DocSet other) {
@@ -249,6 +280,10 @@ public final class HashDocSet extends DocSetBase {
}
}
+ @Override
+ protected HashDocSet clone() {
+ return new HashDocSet(this);
+ }
// don't implement andNotSize() and unionSize() on purpose... they are implemented
// in BaseDocSet in terms of intersectionSize().
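Each intersects() implementation above follows the same pattern: iterate the set that is cheap to enumerate and probe the set that is cheap to test membership in, short-circuiting on the first common document. A standalone sketch of the idea, with plain collections standing in for the DocSet implementations:

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class IntersectsSketch {
  /** True if any id in the (easily iterated) array is present in the
   *  (random-access friendly) set -- the first hit short-circuits. */
  static boolean intersects(int[] iterable, Set<Integer> randomAccess) {
    for (int id : iterable) {
      if (randomAccess.contains(id)) return true;
    }
    return false;
  }

  public static void main(String[] args) {
    Set<Integer> b = new HashSet<Integer>(Arrays.asList(5, 17, 99));
    System.out.println(intersects(new int[]{3, 17, 42}, b)); // true: both contain 17
    System.out.println(intersects(new int[]{1, 2}, b));      // false
  }
}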
+ */ +package org.apache.solr.search; + +import org.apache.lucene.index.*; +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.search.*; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.OpenBitSet; +import org.apache.lucene.util.StringHelper; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.util.SimpleOrderedMap; +import org.apache.solr.core.CoreContainer; +import org.apache.solr.core.SolrCore; +import org.apache.solr.handler.component.ResponseBuilder; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.request.SolrRequestInfo; +import org.apache.solr.schema.TrieField; +import org.apache.solr.util.RefCounted; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Set; + + +public class JoinQParserPlugin extends QParserPlugin { + public static String NAME = "join"; + + public void init(NamedList args) { + } + + public QParser createParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) { + return new QParser(qstr, localParams, params, req) { + public Query parse() throws ParseException { + String fromField = getParam("from"); + String fromIndex = getParam("fromIndex"); + String toField = getParam("to"); + String v = localParams.get("v"); + QParser fromQueryParser = subQuery(v, "lucene"); + Query fromQuery = fromQueryParser.getQuery(); + JoinQuery jq = new JoinQuery(fromField, toField, fromIndex, fromQuery); + return jq; + } + }; + } +} + + +class JoinQuery extends Query { + String fromField; + String toField; + String fromIndex; + Query q; + + public JoinQuery(String fromField, String toField, String fromIndex, Query subQuery) { + this.fromField = fromField; + this.toField = toField; + this.fromIndex = fromIndex; + this.q = subQuery; + } + + public Query getQuery() { return q; } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + Query newQ = q.rewrite(reader); + if (newQ == q) return this; + JoinQuery nq = (JoinQuery)this.clone(); + nq.q = newQ; + return nq; + } + + @Override + public void extractTerms(Set terms) { + q.extractTerms(terms); + } + + public Weight createWeight(IndexSearcher searcher) throws IOException { + return new JoinQueryWeight((SolrIndexSearcher)searcher); + } + + private class JoinQueryWeight extends Weight { + SolrIndexSearcher fromSearcher; + RefCounted fromRef; + SolrIndexSearcher toSearcher; + private Similarity similarity; + private float queryNorm; + private float queryWeight; + ResponseBuilder rb; + + public JoinQueryWeight(SolrIndexSearcher searcher) throws IOException { + this.fromSearcher = searcher; + SolrRequestInfo info = SolrRequestInfo.getRequestInfo(); + if (info != null) { + rb = info.getResponseBuilder(); + } + + if (fromIndex == null) { + this.fromSearcher = searcher; + } else { + if (info == null) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Cross-core join must have SolrRequestInfo"); + } + + CoreContainer container = searcher.getCore().getCoreDescriptor().getCoreContainer(); + final SolrCore fromCore = container.getCore(fromIndex); + + if (fromCore == null) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Cross-core join: no such core "); + } + + if (info.getReq().getCore() == fromCore) { + // if this is the same core, use 
+ // if this is the same core, use the searcher passed in... otherwise we could be warming and
+ // get an older searcher from the core.
+ fromSearcher = searcher;
+ } else {
+ // This could block if there is a static warming query with a join in it, and if useColdSearcher is true.
+ // Deadlock could result if two cores both had useColdSearcher and had joins that used each other.
+ // This would be very predictable though (should happen every time if misconfigured)
+ fromRef = fromCore.getSearcher(false, true, null);
+
+ // be careful not to do anything with this searcher that requires the thread local
+ // SolrRequestInfo in a manner that requires the core in the request to match
+ fromSearcher = fromRef.get();
+ }
+
+ if (fromRef != null) {
+ final RefCounted<SolrIndexSearcher> ref = fromRef;
+ info.addCloseHook(new Closeable() {
+ @Override
+ public void close() throws IOException {
+ ref.decref();
+ }
+ });
+ }
+
+ info.addCloseHook(new Closeable() {
+ @Override
+ public void close() throws IOException {
+ fromCore.close();
+ }
+ });
+
+ }
+ this.toSearcher = searcher;
+ }
+
+ public Query getQuery() {
+ return JoinQuery.this;
+ }
+
+ public float getValue() {
+ return getBoost();
+ }
+
+ @Override
+ public float sumOfSquaredWeights() throws IOException {
+ queryWeight = getBoost();
+ return queryWeight * queryWeight;
+ }
+
+ @Override
+ public void normalize(float norm) {
+ this.queryNorm = norm;
+ queryWeight *= this.queryNorm;
+ }
+
+ DocSet resultSet;
+ Filter filter;
+
+
+
+ @Override
+ public Scorer scorer(IndexReader.AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
+ if (filter == null) {
+ boolean debug = rb != null && rb.isDebug();
+ long start = debug ? System.currentTimeMillis() : 0;
+ resultSet = getDocSet();
+ long end = debug ? System.currentTimeMillis() : 0;
+
+ if (debug) {
+ SimpleOrderedMap dbg = new SimpleOrderedMap();
+ dbg.add("time", (end-start));
+ dbg.add("fromSetSize", fromSetSize); // the input
+ dbg.add("toSetSize", resultSet.size()); // the output
+
+ dbg.add("fromTermCount", fromTermCount);
+ dbg.add("fromTermTotalDf", fromTermTotalDf);
+ dbg.add("fromTermDirectCount", fromTermDirectCount);
+ dbg.add("fromTermHits", fromTermHits);
+ dbg.add("fromTermHitsTotalDf", fromTermHitsTotalDf);
+ dbg.add("toTermHits", toTermHits);
+ dbg.add("toTermHitsTotalDf", toTermHitsTotalDf);
+ dbg.add("toTermDirectCount", toTermDirectCount);
+ dbg.add("smallSetsDeferred", smallSetsDeferred);
+ dbg.add("toSetDocsAdded", resultListDocs);
+
+ // TODO: perhaps synchronize addDebug in the future...
+ rb.addDebug(dbg, "join", JoinQuery.this.toString());
+ }
+
+ filter = resultSet.getTopFilter();
+ }
+
+ DocIdSet readerSet = filter.getDocIdSet(context);
+ if (readerSet == null) readerSet=DocIdSet.EMPTY_DOCIDSET;
+ return new JoinScorer(this, readerSet.iterator());
+ }
+
+
+ int fromSetSize; // number of docs in the fromSet (that match the from query)
+ long resultListDocs; // total number of docs collected
+ int fromTermCount;
+ long fromTermTotalDf;
+ int fromTermDirectCount; // number of fromTerms that were too small to use the filter cache
+ int fromTermHits; // number of fromTerms that intersected the from query
+ long fromTermHitsTotalDf; // sum of the df of the matching terms
+ int toTermHits; // number of intersecting from terms that match a term in the to field
+ long toTermHitsTotalDf; // sum of the df for the toTermHits
+ int toTermDirectCount; // number of toTerms that we set directly on a bitset rather than doing set intersections
+ int smallSetsDeferred; // number of small sets collected to be used later to intersect w/ bitset or create another small set
+
+
+ public DocSet getDocSet() throws IOException {
+ OpenBitSet resultBits = null;
+
+ // minimum docFreq to use the cache
+ int minDocFreqFrom = Math.max(5, fromSearcher.maxDoc() >> 13);
+ int minDocFreqTo = Math.max(5, toSearcher.maxDoc() >> 13);
+
+ // use a smaller size than normal since we will need to sort and dedup the results
+ int maxSortedIntSize = Math.max(10, toSearcher.maxDoc() >> 10);
+
+ DocSet fromSet = fromSearcher.getDocSet(q);
+ fromSetSize = fromSet.size();
+
+ List<DocSet> resultList = new ArrayList<DocSet>(10);
+
+ // make sure we have a set that is fast for random access, if we will use it for that
+ DocSet fastForRandomSet = fromSet;
+ if (minDocFreqFrom>0 && fromSet instanceof SortedIntDocSet) {
+ SortedIntDocSet sset = (SortedIntDocSet)fromSet;
+ fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
+ }
+
+ Fields fromFields = MultiFields.getFields(fromSearcher.getIndexReader());
+ Fields toFields = fromSearcher==toSearcher ? fromFields : MultiFields.getFields(toSearcher.getIndexReader());
+ if (fromFields == null) return DocSet.EMPTY;
+ Terms terms = fromFields.terms(fromField);
+ Terms toTerms = toFields.terms(toField);
+ if (terms == null || toTerms==null) return DocSet.EMPTY;
+ String prefixStr = TrieField.getMainValuePrefix(fromSearcher.getSchema().getFieldType(fromField));
+ BytesRef prefix = prefixStr == null ? null : new BytesRef(prefixStr);
+
+ BytesRef term = null;
+ TermsEnum termsEnum = terms.iterator();
+ TermsEnum toTermsEnum = toTerms.iterator();
+ SolrIndexSearcher.DocsEnumState fromDeState = null;
+ SolrIndexSearcher.DocsEnumState toDeState = null;
+
+ if (prefix == null) {
+ term = termsEnum.next();
+ } else {
+ if (termsEnum.seek(prefix, true) != TermsEnum.SeekStatus.END) {
+ term = termsEnum.term();
+ }
+ }
+
+ Bits fromDeletedDocs = MultiFields.getDeletedDocs(fromSearcher.getIndexReader());
+ Bits toDeletedDocs = fromSearcher == toSearcher ?
fromDeletedDocs : MultiFields.getDeletedDocs(toSearcher.getIndexReader()); + + fromDeState = new SolrIndexSearcher.DocsEnumState(); + fromDeState.fieldName = StringHelper.intern(fromField); + fromDeState.deletedDocs = fromDeletedDocs; + fromDeState.termsEnum = termsEnum; + fromDeState.docsEnum = null; + fromDeState.minSetSizeCached = minDocFreqFrom; + + toDeState = new SolrIndexSearcher.DocsEnumState(); + toDeState.fieldName = StringHelper.intern(toField); + toDeState.deletedDocs = toDeletedDocs; + toDeState.termsEnum = toTermsEnum; + toDeState.docsEnum = null; + toDeState.minSetSizeCached = minDocFreqTo; + + while (term != null) { + if (prefix != null && !term.startsWith(prefix)) + break; + + fromTermCount++; + + boolean intersects = false; + int freq = termsEnum.docFreq(); + fromTermTotalDf++; + + if (freq < minDocFreqFrom) { + fromTermDirectCount++; + // OK to skip deletedDocs, since we check for intersection with docs matching query + fromDeState.docsEnum = fromDeState.termsEnum.docs(null, fromDeState.docsEnum); + DocsEnum docsEnum = fromDeState.docsEnum; + + if (docsEnum instanceof MultiDocsEnum) { + MultiDocsEnum.EnumWithSlice[] subs = ((MultiDocsEnum)docsEnum).getSubs(); + int numSubs = ((MultiDocsEnum)docsEnum).getNumSubs(); + outer: for (int subindex = 0; subindex maxSortedIntSize && resultList.size() > 0) { + resultBits = new OpenBitSet(toSearcher.maxDoc()); + } + + // if we don't have a bitset yet, or if the resulting set will be too large + // use the filterCache to get a DocSet + if (toTermsEnum.docFreq() >= minDocFreqTo || resultBits == null) { + // use filter cache + DocSet toTermSet = toSearcher.getDocSet(toDeState); + resultListDocs += toTermSet.size(); + if (resultBits != null) { + toTermSet.setBitsOn(resultBits); + } else { + if (toTermSet instanceof BitDocSet) { + resultBits = (OpenBitSet)((BitDocSet)toTermSet).bits.clone(); + } else { + resultList.add(toTermSet); + } + } + } else { + toTermDirectCount++; + + // need to use deletedDocs here so we don't map to any deleted ones + toDeState.docsEnum = toDeState.termsEnum.docs(toDeState.deletedDocs, toDeState.docsEnum); + DocsEnum docsEnum = toDeState.docsEnum; + + if (docsEnum instanceof MultiDocsEnum) { + MultiDocsEnum.EnumWithSlice[] subs = ((MultiDocsEnum)docsEnum).getSubs(); + int numSubs = ((MultiDocsEnum)docsEnum).getNumSubs(); + for (int subindex = 0; subindex= deState.minSetSizeCached; + TermQuery key = null; + + if (useCache) { + key = new TermQuery(new Term(deState.fieldName, new BytesRef(deState.termsEnum.term()), false)); + DocSet result = filterCache.get(key); + if (result != null) return result; + } int smallSetSize = maxDoc()>>6; - int largestPossible = deState.termsEnum.docFreq(); + int scratchSize = Math.min(smallSetSize, largestPossible); + if (deState.scratch == null || deState.scratch.length < scratchSize) + deState.scratch = new int[scratchSize]; - int[] docs = new int[Math.min(smallSetSize, largestPossible)]; + final int[] docs = deState.scratch; int upto = 0; int bitsSet = 0; OpenBitSet obs = null; - DocsEnum docsEnum = deState.termsEnum.docs(deState.deletedDocs, deState.reuse); - if (deState.reuse == null) { - deState.reuse = docsEnum; + DocsEnum docsEnum = deState.termsEnum.docs(deState.deletedDocs, deState.docsEnum); + if (deState.docsEnum == null) { + deState.docsEnum = docsEnum; } if (docsEnum instanceof MultiDocsEnum) { @@ -822,15 +803,22 @@ public class SolrIndexSearcher extends IndexSearcher implements SolrInfoMBean { } } + DocSet result; if (obs != null) { for (int i=0; i=doca) { + 
high=probe;
+ } else {
+ low=probe+1;
+ probe = low + step;
+ if (probe<high) {
+ if (b[probe]>=doca) {
+ high=probe;
+ } else {
+ low=probe+1;
+ }
+ }
+ }
+
+ while (low <= high) {
+ int mid = (low+high) >>> 1;
+ int docb = b[mid];
+
+ if (docb < doca) {
+ low = mid+1;
+ }
+ else if (docb > doca) {
+ high = mid-1;
+ }
+ else {
+ return true;
+ }
+ }
+ }
+
+ return false;
+ }
+
public int intersectionSize(DocSet other) {
if (!(other instanceof SortedIntDocSet)) {
// assume other implementations are better at random access than we are,
@@ -215,6 +268,49 @@ public class SortedIntDocSet extends DocSetBase {
return icount;
}
+ @Override
+ public boolean intersects(DocSet other) {
+ if (!(other instanceof SortedIntDocSet)) {
+ // assume other implementations are better at random access than we are,
+ // true of BitDocSet and HashDocSet.
+ for (int i=0; i<docs.length; i++) {
+ if (other.exists(docs[i])) return true;
+ }
+ return false;
+ }
+
+ // make "a" the smaller set.
+ int[] otherDocs = ((SortedIntDocSet)other).docs;
+ final int[] a = docs.length < otherDocs.length ? docs : otherDocs;
+ final int[] b = docs.length < otherDocs.length ? otherDocs : docs;
+
+ if (a.length==0) return false;
+
+ // if b is 8 times bigger than a, use the modified binary search.
+ if ((b.length>>3) >= a.length) {
+ return intersects(a,b);
+ }
+
+ // if they are close in size, just do a linear walk of both.
+ int i=0,j=0;
+ int doca=a[i],docb=b[j];
+ for(;;) {
+ // switch on the sign bit somehow? Hopefully the JVM is smart enough to just test once.
+
+ // Since set a is less dense than set b, doca is likely to be greater than docb so
+ // check that case first. This resulted in a 13% speedup.
+ if (doca > docb) {
+ if (++j >= b.length) break;
+ docb=b[j];
+ } else if (doca < docb) {
+ if (++i >= a.length) break;
+ doca=a[i];
+ } else {
+ return true;
+ }
+ }
+ return false;
+ }
/** puts the intersection of a and b into the target array and returns the size */
public static int intersection(int a[], int lena, int b[], int lenb, int[] target) {
@@ -463,6 +559,13 @@ public class SortedIntDocSet extends DocSetBase {
return new SortedIntDocSet(arr,sz);
}
+ @Override
+ public void setBitsOn(OpenBitSet target) {
+ for (int doc : docs) {
+ target.fastSet(doc);
+ }
+ }
+
public boolean exists(int doc) {
// this could be faster by estimating where in the list the doc is likely to appear,
@@ -653,4 +756,8 @@ public class SortedIntDocSet extends DocSetBase {
};
}
+ @Override
+ protected SortedIntDocSet clone() {
+ return new SortedIntDocSet(docs.clone());
+ }
}
diff --git a/solr/src/test-framework/org/apache/solr/SolrTestCaseJ4.java b/solr/src/test-framework/org/apache/solr/SolrTestCaseJ4.java
index da5d837af74..113329286ae 100755
--- a/solr/src/test-framework/org/apache/solr/SolrTestCaseJ4.java
+++ b/solr/src/test-framework/org/apache/solr/SolrTestCaseJ4.java
@@ -722,6 +722,7 @@ public abstract class SolrTestCaseJ4 extends LuceneTestCase {
}
public static final IRange ZERO_ONE = new IRange(0,1);
+ public static final IRange ZERO_TWO = new IRange(0,2);
public static final IRange ONE_ONE = new IRange(1,1);
public static class Doc implements Comparable{
@@ -1040,6 +1041,29 @@ public abstract class SolrTestCaseJ4 extends LuceneTestCase {
return out.toString();
}
+ /** Return a Map from field value to a list of document ids */
+ Map<Comparable, List<Comparable>> invertField(Map<Comparable, Doc> model, String field) {
+ Map<Comparable, List<Comparable>> value_to_id = new HashMap<Comparable, List<Comparable>>();
+
+ // invert field
+ for (Comparable key : model.keySet()) {
+ Doc doc = model.get(key);
+ List<Comparable> vals = doc.getValues(field);
+ if (vals == null) continue;
+ for (Comparable val : vals) {
+ List<Comparable> ids = value_to_id.get(val);
+ if (ids == null) {
+ ids = new ArrayList<Comparable>(2);
+ value_to_id.put(val, ids);
+ }
+ ids.add(key);
+ }
+ }
+
+ return value_to_id;
+ }
+
+
/** Gets a resource from the context classloader as {@link File}. This method should only be used,
 * if a real file is needed.
To get a stream, code should prefer * {@link Class#getResourceAsStream} using {@code this.getClass()}. diff --git a/solr/src/test/org/apache/solr/TestJoin.java b/solr/src/test/org/apache/solr/TestJoin.java new file mode 100644 index 00000000000..533d94d003f --- /dev/null +++ b/solr/src/test/org/apache/solr/TestJoin.java @@ -0,0 +1,216 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr; + +import org.apache.lucene.search.FieldCache; +import org.apache.noggit.JSONUtil; +import org.apache.noggit.ObjectBuilder; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.core.SolrCore; +import org.apache.solr.handler.JsonUpdateRequestHandler; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.request.SolrRequestHandler; +import org.apache.solr.schema.IndexSchema; +import org.apache.solr.servlet.DirectSolrConnection; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.util.*; + +public class TestJoin extends SolrTestCaseJ4 { + + @BeforeClass + public static void beforeTests() throws Exception { + initCore("solrconfig.xml","schema12.xml"); + } + + @Test + public void testJoin() throws Exception { + assertU(add(doc("id", "1","name", "john", "title", "Director", "dept_s","Engineering"))); + assertU(add(doc("id", "2","name", "mark", "title", "VP", "dept_s","Marketing"))); + assertU(add(doc("id", "3","name", "nancy", "title", "MTS", "dept_s","Sales"))); + assertU(add(doc("id", "4","name", "dave", "title", "MTS", "dept_s","Support", "dept_s","Engineering"))); + assertU(add(doc("id", "5","name", "tina", "title", "VP", "dept_s","Engineering"))); + + assertU(add(doc("id","10", "dept_id_s", "Engineering", "text","These guys develop stuff"))); + assertU(add(doc("id","11", "dept_id_s", "Marketing", "text","These guys make you look good"))); + assertU(add(doc("id","12", "dept_id_s", "Sales", "text","These guys sell stuff"))); + assertU(add(doc("id","13", "dept_id_s", "Support", "text","These guys help customers"))); + + assertU(commit()); + + // test debugging + assertJQ(req("q","{!join from=dept_s to=dept_id_s}title:MTS", "fl","id", "debugQuery","true") + ,"/debug/join/{!join from=dept_s to=dept_id_s}title:MTS=={'_MATCH_':'fromSetSize,toSetSize', 'fromSetSize':2, 'toSetSize':3}" + ); + + assertJQ(req("q","{!join from=dept_s to=dept_id_s}title:MTS", "fl","id") + ,"/response=={'numFound':3,'start':0,'docs':[{'id':'10'},{'id':'12'},{'id':'13'}]}" + ); + + // empty from + assertJQ(req("q","{!join from=noexist_s to=dept_id_s}*:*", "fl","id") + ,"/response=={'numFound':0,'start':0,'docs':[]}" + ); + + // empty to + assertJQ(req("q","{!join from=dept_s to=noexist_s}*:*", "fl","id") + ,"/response=={'numFound':0,'start':0,'docs':[]}" + ); + + // self 
join... return everyone with the same title as Dave
+ assertJQ(req("q","{!join from=title to=title}name:dave", "fl","id")
+ ,"/response=={'numFound':2,'start':0,'docs':[{'id':'3'},{'id':'4'}]}"
+ );
+
+ // find people that develop stuff
+ assertJQ(req("q","{!join from=dept_id_s to=dept_s}text:develop", "fl","id")
+ ,"/response=={'numFound':3,'start':0,'docs':[{'id':'1'},{'id':'4'},{'id':'5'}]}"
+ );
+
+ // self join on multivalued text field
+ assertJQ(req("q","{!join from=title to=title}name:dave", "fl","id")
+ ,"/response=={'numFound':2,'start':0,'docs':[{'id':'3'},{'id':'4'}]}"
+ );
+
+ assertJQ(req("q","{!join from=dept_s to=dept_id_s}title:MTS", "fl","id", "debugQuery","true")
+ ,"/response=={'numFound':3,'start':0,'docs':[{'id':'10'},{'id':'12'},{'id':'13'}]}"
+ );
+
+ }
+
+
+ @Test
+ public void testRandomJoin() throws Exception {
+ int indexIter=50 * RANDOM_MULTIPLIER;
+ int queryIter=50 * RANDOM_MULTIPLIER;
+
+ while (--indexIter >= 0) {
+ int indexSize = random.nextInt(20 * RANDOM_MULTIPLIER);
+
+ List<FldType> types = new ArrayList<FldType>();
+ types.add(new FldType("id",ONE_ONE, new SVal('A','Z',4,4)));
+ types.add(new FldType("score_f",ONE_ONE, new FVal(1,100))); // field used to score
+ types.add(new FldType("small_s",ZERO_ONE, new SVal('a',(char)('c'+indexSize/3),1,1)));
+ types.add(new FldType("small2_s",ZERO_ONE, new SVal('a',(char)('c'+indexSize/3),1,1)));
+ types.add(new FldType("small2_ss",ZERO_TWO, new SVal('a',(char)('c'+indexSize/3),1,1)));
+ types.add(new FldType("small3_ss",new IRange(0,25), new SVal('A','z',1,1)));
+ types.add(new FldType("small_i",ZERO_ONE, new IRange(0,5+indexSize/3)));
+ types.add(new FldType("small2_i",ZERO_ONE, new IRange(0,5+indexSize/3)));
+ types.add(new FldType("small2_is",ZERO_TWO, new IRange(0,5+indexSize/3)));
+ types.add(new FldType("small3_is",new IRange(0,25), new IRange(0,100)));
+
+ clearIndex();
+ Map<Comparable, Doc> model = indexDocs(types, null, indexSize);
+ Map<String, Map<Comparable, Set<Comparable>>> pivots = new HashMap<String, Map<Comparable, Set<Comparable>>>();
+
+ for (int qiter=0; qiter<queryIter; qiter++) {
+ String fromField = types.get(random.nextInt(types.size())).fname;
+ String toField = types.get(random.nextInt(types.size())).fname;
+
+ Map<Comparable, Set<Comparable>> pivot = pivots.get(fromField+"/"+toField);
+ if (pivot == null) {
+ pivot = createJoinMap(model, fromField, toField);
+ pivots.put(fromField+"/"+toField, pivot);
+ }
+
+ Collection<Doc> fromDocs = model.values();
+ Set<Comparable> docs = join(fromDocs, pivot);
+ List<Doc> docList = new ArrayList<Doc>(docs.size());
+ for (Comparable id : docs) docList.add(model.get(id));
+ Collections.sort(docList, createComparator("_docid_",true,false,false,false));
+ List<Object> sortedDocs = new ArrayList<Object>();
+ for (Doc doc : docList) {
+ if (sortedDocs.size() >= 10) break;
+ sortedDocs.add(doc.toObject(h.getCore().getSchema()));
+ }
+
+ Map<String,Object> resultSet = new LinkedHashMap<String,Object>();
+ resultSet.put("numFound", docList.size());
+ resultSet.put("start", 0);
+ resultSet.put("docs", sortedDocs);
+
+ // todo: use filters
+
+ SolrQueryRequest req = req("wt","json","indent","true", "echoParams","all",
+ "q","{!join from="+fromField+" to="+toField
+ + (random.nextInt(4)==0 ? " fromIndex=collection1" : "")
+ +"}*:*"
+ );
+
+ String strResponse = h.query(req);
+
+ Object realResponse = ObjectBuilder.fromJSON(strResponse);
+ String err = JSONTestUtil.matchObj("/response", realResponse, resultSet);
+ if (err != null) {
+ log.error("JOIN MISMATCH: " + err
+ + "\n\trequest="+req
+ + "\n\tresult="+strResponse
+ + "\n\texpected="+ JSONUtil.toJSON(resultSet)
+ + "\n\tmodel="+ JSONUtil.toJSON(model)
+ );
+
+ // re-execute the request... good for putting a breakpoint here for debugging
+ String rsp = h.query(req);
+
+ fail(err);
+ }
+
+ }
+ }
+ }
+
+
+ Map<Comparable, Set<Comparable>> createJoinMap(Map<Comparable, Doc> model, String fromField, String toField) {
+ Map<Comparable, Set<Comparable>> id_to_id = new HashMap<Comparable, Set<Comparable>>();
+
+ Map<Comparable, List<Comparable>> value_to_id = invertField(model, toField);
+
+ for (Comparable fromId : model.keySet()) {
+ Doc doc = model.get(fromId);
+ List<Comparable> vals = doc.getValues(fromField);
+ if (vals == null) continue;
+ for (Comparable val : vals) {
+ List<Comparable> toIds = value_to_id.get(val);
+ if (toIds == null) continue;
+ Set<Comparable> ids = id_to_id.get(fromId);
+ if (ids == null) {
+ ids = new HashSet<Comparable>();
+ id_to_id.put(fromId, ids);
+ }
+ for (Comparable toId : toIds)
+ ids.add(toId);
+ }
+ }
+
+ return id_to_id;
+ }
+
+
+ Set<Comparable> join(Collection<Doc> input, Map<Comparable, Set<Comparable>> joinMap) {
+ Set<Comparable> ids = new HashSet<Comparable>();
+ for (Doc doc : input) {
+ Collection<Comparable> output = joinMap.get(doc.id);
+ if (output == null) continue;
+ ids.addAll(output);
+ }
+ return ids;
+ }
+
+}
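createJoinMap() and join() above define the reference semantics the random test checks the query parser against: invert the "to" field, then take the union of the ids reachable from each matching document's "from" values. A self-contained sketch of the same computation on plain maps — the field values and ids are made up, loosely echoing the testJoin() fixtures:

import java.util.*;

public class JoinSemanticsSketch {
  public static void main(String[] args) {
    // "to" side: field value -> ids of docs holding that value in the to field
    Map<String, List<String>> valueToIds = new HashMap<String, List<String>>();
    valueToIds.put("Engineering", Arrays.asList("10"));
    valueToIds.put("Sales", Arrays.asList("12"));

    // "from" side: the from-field values of each doc matching the query
    Map<String, List<String>> matchedFromDocs = new HashMap<String, List<String>>();
    matchedFromDocs.put("3", Arrays.asList("Sales"));
    matchedFromDocs.put("4", Arrays.asList("Support", "Engineering"));

    // join result = union of to-docs reachable through any shared value
    Set<String> result = new HashSet<String>();
    for (List<String> fromVals : matchedFromDocs.values()) {
      for (String v : fromVals) {
        List<String> ids = valueToIds.get(v);
        if (ids != null) result.addAll(ids);
      }
    }
    System.out.println(result); // [10, 12]
  }
}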
diff --git a/solr/src/test/org/apache/solr/client/solrj/MultiCoreExampleTestBase.java b/solr/src/test/org/apache/solr/client/solrj/MultiCoreExampleTestBase.java
index 39fae68318b..5a435d75acc 100644
--- a/solr/src/test/org/apache/solr/client/solrj/MultiCoreExampleTestBase.java
+++ b/solr/src/test/org/apache/solr/client/solrj/MultiCoreExampleTestBase.java
@@ -26,6 +26,7 @@ import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.SolrCore;
import org.apache.solr.util.ExternalPaths;
+import org.junit.Test;
/**
@@ -65,8 +66,8 @@ public abstract class MultiCoreExampleTestBase extends SolrExampleTestBase
protected abstract SolrServer getSolrCore1();
protected abstract SolrServer getSolrAdmin();
protected abstract SolrServer getSolrCore(String name);
-
+
+ @Test
public void testMultiCore() throws Exception {
UpdateRequest up = new UpdateRequest();
@@ -79,6 +80,8 @@ public abstract class MultiCoreExampleTestBase extends SolrExampleTestBase
// Add something to each core
SolrInputDocument doc = new SolrInputDocument();
doc.setField( "id", "AAA" );
+ doc.setField( "name", "AAA1" );
+ doc.setField( "type", "BBB1" );
doc.setField( "core0", "yup" );
// Add to core0
@@ -96,6 +99,8 @@ public abstract class MultiCoreExampleTestBase extends SolrExampleTestBase
// Add to core1
doc.setField( "id", "BBB" );
+ doc.setField( "name", "BBB1" );
+ doc.setField( "type", "AAA1" );
doc.setField( "core1", "yup" );
doc.removeField( "core0" );
up.add( doc );
@@ -124,6 +129,12 @@ public abstract class MultiCoreExampleTestBase extends SolrExampleTestBase
assertEquals( 0, getSolrCore1().query( new SolrQuery( "id:AAA" ) ).getResults().size() );
assertEquals( 1, getSolrCore1().query( new SolrQuery( "id:BBB" ) ).getResults().size() );
+ // cross-core join
+ assertEquals( 0, getSolrCore0().query( new SolrQuery( "{!join from=type to=name}*:*" ) ).getResults().size() ); // normal join
+ assertEquals( 1, getSolrCore0().query( new SolrQuery( "{!join from=type to=name fromIndex=core1}id:BBB" ) ).getResults().size() );
+ assertEquals( 1, getSolrCore1().query( new SolrQuery( "{!join from=type to=name fromIndex=core0}id:AAA" ) ).getResults().size() );
+
+
// Now test reloading it should have a newer open time
String name = "core0";
SolrServer coreadmin = getSolrAdmin();
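For completeness, a hedged SolrJ sketch of issuing the new join queries end to end, mirroring the requests the tests above exercise. The server URL is an assumption, and CommonsHttpSolrServer is simply the SolrJ client class of this era:

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;

public class JoinQueryDemo {
  public static void main(String[] args) throws Exception {
    SolrServer solr = new CommonsHttpSolrServer("http://localhost:8983/solr"); // assumed URL

    // departments that have at least one MTS employee (same-core join as main query)
    QueryResponse rsp = solr.query(new SolrQuery("{!join from=dept_s to=dept_id_s}title:MTS"));
    System.out.println("hits: " + rsp.getResults().getNumFound());

    // the same join applied as a filter, matching the fq usage shown in CHANGES.txt
    SolrQuery q = new SolrQuery("*:*");
    q.addFilterQuery("{!join from=dept_s to=dept_id_s}title:MTS");
    System.out.println("hits: " + solr.query(q).getResults().getNumFound());

    // cross-core join, as exercised by MultiCoreExampleTestBase
    rsp = solr.query(new SolrQuery("{!join from=type to=name fromIndex=core1}id:BBB"));
    System.out.println("hits: " + rsp.getResults().getNumFound());
  }
}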