SOLR-7543: basic graph traversal query

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1707818 13f79535-47bb-0310-9956-ffa450edef68
Yonik Seeley 2015-10-09 21:27:03 +00:00
parent 99c2515d99
commit 0a4b0833a2
11 changed files with 1022 additions and 1 deletion

View File

@@ -33,7 +33,15 @@ import org.apache.lucene.util.UnicodeUtil;
* @see #build(Collection)
* @see Automata#makeStringUnion(Collection)
*/
final class DaciukMihovAutomatonBuilder {
public final class DaciukMihovAutomatonBuilder {
/**
* The default constructor is private. Use static methods directly.
*/
private DaciukMihovAutomatonBuilder() {
super();
}
/**
* DFSA state with <code>char</code> labels on transitions.
*/

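The builder is opened up (made public) so that GraphQuery, added later in this commit, can compile the edge terms collected at each hop into a DFA. A condensed sketch of that call, mirroring GraphQuery.buildAutomaton below; build(Collection) requires sorted input, hence the TreeSet:

// Condensed from GraphQuery.buildAutomaton (below): compile collected terms into a DFA.
final TreeSet<BytesRef> terms = new TreeSet<>();
for (int i = 0; i < termBytesHash.size(); i++) {
  BytesRef ref = new BytesRef();
  termBytesHash.get(i, ref);  // copy the i-th hashed term into ref
  terms.add(ref);
}
Automaton a = DaciukMihovAutomatonBuilder.build(terms);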
View File

@@ -71,6 +71,11 @@ New Features
* SOLR-8038: Add the StatsStream to the Streaming API and wire it into the SQLHandler (Joel Bernstein)
* SOLR-7543: Basic graph traversal query
Example: {!graph from="node_id" to="edge_id"}id:doc_1
(Kevin Watters, yonik)
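An illustrative request combining the optional local params (all parsed by GraphQueryParser):
{!graph from="node_id" to="edge_id" traversalFilter="text:foo" maxDepth=2 returnRoot=false returnOnlyLeaf=true useAutn=false}id:doc_1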
Optimizations
----------------------
* SOLR-7876: Speed up queries and operations that use many terms when timeAllowed has not been

View File

@@ -22,6 +22,7 @@ import org.apache.solr.core.SolrInfoMBean;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.join.BlockJoinChildQParserPlugin;
import org.apache.solr.search.join.BlockJoinParentQParserPlugin;
import org.apache.solr.search.join.GraphQParserPlugin;
import org.apache.solr.search.mlt.MLTQParserPlugin;
import org.apache.solr.util.plugin.NamedListInitializedPlugin;
@@ -73,6 +74,7 @@ public abstract class QParserPlugin implements NamedListInitializedPlugin, SolrI
map.put(ExportQParserPlugin.NAME, ExportQParserPlugin.class);
map.put(MLTQParserPlugin.NAME, MLTQParserPlugin.class);
map.put(HashQParserPlugin.NAME, HashQParserPlugin.class);
map.put(GraphQParserPlugin.NAME, GraphQParserPlugin.class);
standardPlugins = Collections.unmodifiableMap(map);
}

View File

@@ -0,0 +1,49 @@
package org.apache.solr.search.join;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.Query;
/**
* FrontierQuery represents the next hop of a graph traversal.
* It contains the query to execute and the number of edges to traverse.
* @lucene.internal
*/
class FrontierQuery {
private final Query query;
private final Integer frontierSize;
public FrontierQuery(Query query, Integer frontierSize) {
super();
this.query = query;
this.frontierSize = frontierSize;
}
/**
* Return the query that represents the frontier at the current level.
*/
public Query getQuery() {
return query;
}
/**
* Return the number of edges in the frontier query.
*/
public Integer getFrontierSize() {
return frontierSize;
}
}
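For context, the breadth-first loop in GraphQuery.getDocSet (later in this commit) consumes a FrontierQuery like this; the snippet is condensed from that method, and a frontier size of zero ends the traversal:

// Condensed from GraphQuery.getDocSet: advance to the next hop.
FrontierQuery fq = buildFrontierQuery(collectorTerms, frontierSize);
if (fq == null) {
  // no edges to traverse; record an empty frontier so the loop can exit
  fq = new FrontierQuery(null, 0);
}
frontierQuery = fq.getQuery();
frontierSize = fq.getFrontierSize(); // zero terminates the do/while loop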

View File

@@ -0,0 +1,44 @@
package org.apache.solr.search.join;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.QParser;
import org.apache.solr.search.QParserPlugin;
/**
* Query parser plugin for Solr that wraps the graph query parser.
*/
public class GraphQParserPlugin extends QParserPlugin {
// the graph query parser name
public static final String NAME = "graph";
@Override
public void init(NamedList args) {
}
@Override
public QParser createParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
// return the graph query parser for this request.
return new GraphQueryParser(qstr, localParams, params, req);
}
}
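Once registered under the name graph (see the QParserPlugin change above), the parser is reachable through the standard local-params syntax; illustrative examples, with field names borrowed from the tests below:

q={!graph from=node_id to=edge_id}id:doc_1
fq={!graph from=node_id to=edge_id maxDepth=1}node_id:1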

View File

@@ -0,0 +1,506 @@
package org.apache.solr.search.join;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.TermsQuery;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.solr.handler.component.ResponseBuilder;
import org.apache.solr.search.BitDocSet;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.SolrIndexSearcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* GraphQuery - search for nodes and traverse edges in an index.
*
* Params:
* fromField = the field that contains the node id
* toField = the field that contains the edge ids
* traversalFilter = a query that can be applied for each hop in the graph.
* maxDepth = the max depth to traverse. (start nodes are at depth=1)
* onlyLeafNodes = only return documents that have no edge id values.
* returnRoot = if false, the documents matching the initial query will not be returned.
*
* @lucene.experimental
*/
public class GraphQuery extends Query {
/** The initial node matching query */
private Query q;
/** the field with the node id */
private String fromField;
/** the field containing the edge ids */
private String toField;
/** A query to apply while traversing the graph to filter out edges */
private Query traversalFilter;
/** The max depth to traverse the graph, -1 means no limit. */
private int maxDepth = -1;
/** Use automaton compilation for graph query traversal (experimental + expert use only) */
private boolean useAutn = true;
/** If this is true, the graph traversal result will only return documents that
* do not have a value in the edge field. (Only leaf nodes returned from the graph) */
private boolean onlyLeafNodes = false;
/** If false, documents matching the start query for the graph are excluded from the final result set. */
private boolean returnRoot = true;
/**
* Create a graph query
* q - the starting node query
* fromField - the field containing the node id
* toField - the field containing the edge ids
*/
public GraphQuery(Query q, String fromField, String toField) {
this(q, fromField, toField, null);
}
/**
* Create a graph query with a traversal filter applied while traversing the frontier.
* q - the starting node query
* fromField - the field containing the node id
* toField - the field containing the edge ids
* traversalFilter - the filter to be applied on each iteration of the frontier.
*/
public GraphQuery(Query q, String fromField, String toField, Query traversalFilter) {
this.q = q;
this.fromField = fromField;
this.toField = toField;
this.traversalFilter = traversalFilter;
}
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
Weight graphWeight = new GraphQueryWeight((SolrIndexSearcher)searcher);
return graphWeight;
}
@Override
public String toString(String field) {
StringBuilder sb = new StringBuilder();
sb.append("[[" + q.toString() + "]," + fromField + "=" + toField + "]");
if (traversalFilter != null) {
sb.append(" [TraversalFilter: " + traversalFilter.toString() + "]");
}
sb.append("[maxDepth=" + maxDepth + "]");
sb.append("[returnRoot=" + returnRoot + "]");
sb.append("[onlyLeafNodes=" + onlyLeafNodes + "]");
sb.append("[useAutn=" + useAutn + "]");
return sb.toString();
}
protected class GraphQueryWeight extends Weight {
SolrIndexSearcher fromSearcher;
private float queryNorm = 1.0F;
private float queryWeight = 1.0F;
int frontierSize = 0;
public int currentDepth = 0;
private Filter filter;
private DocSet resultSet;
public GraphQueryWeight(SolrIndexSearcher searcher) {
// Grab the searcher so we can run additional searches.
super(null);
this.fromSearcher = searcher;
}
@Override
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
// currently no ranking for graph queries.
final Scorer cs = scorer(context);
final boolean exists = (cs != null && cs.advance(doc) == doc);
if (exists) {
List<Explanation> subs = new ArrayList<Explanation>();
return Explanation.match(1.0F, "Graph Match", subs);
} else {
List<Explanation> subs = new ArrayList<Explanation>();
return Explanation.noMatch("No Graph Match.", subs);
}
}
@Override
public float getValueForNormalization() throws IOException {
return 1F;
}
@Override
public void normalize(float norm, float topLevelBoost) {
this.queryWeight = norm * topLevelBoost;
}
/**
* This computes the matching doc set for a given graph query
*
* @return DocSet representing the documents in the graph.
* @throws IOException - if a sub search fails... maybe other cases too! :)
*/
private DocSet getDocSet() throws IOException {
DocSet fromSet = null;
FixedBitSet seedResultBits = null;
// Size that the bit set needs to be.
int capacity = fromSearcher.getRawReader().maxDoc();
// The bit set to contain the results that match the query.
FixedBitSet resultBits = new FixedBitSet(capacity);
// The measure of how deep in the graph we have gone.
currentDepth = 0;
// the frontier query for the first hop is the initial node query itself
Query frontierQuery = q;
// Find all documents in this graph that are leaf nodes to speed traversal
// TODO: speed this up in the future with HAS_FIELD type queries
BooleanQuery.Builder leafNodeQuery = new BooleanQuery.Builder();
WildcardQuery edgeQuery = new WildcardQuery(new Term(toField, "*"));
leafNodeQuery.add(edgeQuery, Occur.MUST_NOT);
DocSet leafNodes = fromSearcher.getDocSet(leafNodeQuery.build());
// Start the breadth first graph traversal.
do {
// Create the graph result collector for this level
GraphTermsCollector graphResultCollector = new GraphTermsCollector(toField,capacity, resultBits, leafNodes);
// traverse the level!
fromSearcher.search(frontierQuery, graphResultCollector);
// All edge ids on the frontier.
BytesRefHash collectorTerms = graphResultCollector.getCollectorTerms();
frontierSize = collectorTerms.size();
// The resulting doc set from the frontier.
fromSet = graphResultCollector.getDocSet();
if (seedResultBits == null) {
// grab a copy of the seed bits (these are the "rootNodes")
seedResultBits = ((BitDocSet)fromSet).getBits().clone();
}
FrontierQuery fq = buildFrontierQuery(collectorTerms, frontierSize);
if (fq == null) {
// in case we get null back, make sure we know we're done at this level.
fq = new FrontierQuery(null, 0);
}
frontierQuery = fq.getQuery();
frontierSize = fq.getFrontierSize();
// Add the bits from this level to the result set.
resultBits.or(((BitDocSet)fromSet).getBits());
// Increment how far we have gone in the frontier.
currentDepth++;
// Break out if we have reached our max depth
if (currentDepth >= maxDepth && maxDepth != -1) {
break;
}
// Test whether we discovered any new edges; if not, we're done.
} while (frontierSize > 0);
// Final bit set operations: optionally drop the root (seed) documents.
if (!returnRoot) {
resultBits.andNot(seedResultBits);
}
BitDocSet resultSet = new BitDocSet(resultBits);
// If we only want to return leaf nodes do that here.
if (onlyLeafNodes) {
return resultSet.intersection(leafNodes);
} else {
// create a doc set off the bits that we found.
return resultSet;
}
}
/** Build an automaton to represent the frontier query */
private Automaton buildAutomaton(BytesRefHash termBytesHash) {
// need to pass a sorted set of terms to the automaton builder (maybe a better way to avoid this?)
final TreeSet<BytesRef> terms = new TreeSet<BytesRef>();
for (int i = 0 ; i < termBytesHash.size(); i++) {
BytesRef ref = new BytesRef();
termBytesHash.get(i, ref);
terms.add(ref);
}
final Automaton a = DaciukMihovAutomatonBuilder.build(terms);
return a;
}
/**
* Returns a query that represents the documents matching the next hop in the traversal.
*
* collectorTerms - the terms that represent the edge ids for the current frontier.
* frontierSize - the size of the frontier query (number of unique edges)
*
*/
public FrontierQuery buildFrontierQuery(BytesRefHash collectorTerms, Integer frontierSize) {
if (collectorTerms == null || collectorTerms.size() == 0) {
// return null if there are no terms (edges) to traverse.
return null;
} else {
// Create a query
Query q = null;
// TODO: see if we should dynamically select this based on the frontier size.
if (useAutn) {
// build an automaton based query for the frontier.
Automaton autn = buildAutomaton(collectorTerms);
AutomatonQuery autnQuery = new AutomatonQuery(new Term(fromField), autn);
q = autnQuery;
} else {
List<BytesRef> termList = new ArrayList<>(collectorTerms.size());
for (int i = 0 ; i < collectorTerms.size(); i++) {
BytesRef ref = new BytesRef();
collectorTerms.get(i, ref);
termList.add(ref);
}
q = new TermsQuery(fromField, termList);
}
// If there is a filter to be used while crawling the graph, add that.
if (traversalFilter != null) {
BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.add(q, Occur.MUST);
builder.add(traversalFilter, Occur.MUST);
q = builder.build();
}
// return the new query.
FrontierQuery frontier = new FrontierQuery(q, frontierSize);
return frontier;
}
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
if (filter == null) {
resultSet = getDocSet();
filter = resultSet.getTopFilter();
}
DocIdSet readerSet = filter.getDocIdSet(context,context.reader().getLiveDocs());
// Create a scorer over the result set; if this segment has no matching docs, use an empty iterator.
return new GraphScorer(this, readerSet == null ? DocIdSetIterator.empty() : readerSet.iterator(), 1);
}
@Override
public void extractTerms(Set<Term> terms) {
// No-op for now; not used or supported.
}
}
private class GraphScorer extends Scorer {
final DocIdSetIterator iter;
final float score;
// graph query scorer constructor with iterator
public GraphScorer(Weight w, DocIdSetIterator iter, float score) throws IOException {
super(w);
this.iter = iter==null ? DocIdSet.EMPTY.iterator() : iter;
this.score = score;
}
@Override
public float score() throws IOException {
// no dynamic scoring now.
return score;
}
@Override
public int nextDoc() throws IOException {
return iter.nextDoc();
}
@Override
public int docID() {
// current position of the doc iterator.
return iter.docID();
}
@Override
public int advance(int target) throws IOException {
return iter.advance(target);
}
@Override
public int freq() throws IOException {
return 1;
}
@Override
public long cost() {
// TODO: potentially very expensive! what's a good value for this?
return 0;
}
}
/**
* @return The query to be used as a filter for each hop in the graph.
*/
public Query getTraversalFilter() {
return traversalFilter;
}
public void setTraversalFilter(Query traversalFilter) {
this.traversalFilter = traversalFilter;
}
public Query getQ() {
return q;
}
public void setQ(Query q) {
this.q = q;
}
/**
* @return The field that contains the node id
*/
public String getFromField() {
return fromField;
}
public void setFromField(String fromField) {
this.fromField = fromField;
}
/**
* @return the field that contains the edge id(s)
*/
public String getToField() {
return toField;
}
public void setToField(String toField) {
this.toField = toField;
}
/**
* @return Max depth for traversal, -1 for infinite!
*/
public int getMaxDepth() {
return maxDepth;
}
public void setMaxDepth(int maxDepth) {
this.maxDepth = maxDepth;
}
/**
* @return If true, an automaton query will be compiled for each new frontier traversal;
* this helps to avoid max boolean clause errors.
*/
public boolean isUseAutn() {
return useAutn;
}
public void setUseAutn(boolean useAutn) {
this.useAutn = useAutn;
}
/**
* @return If true, only documents that do not have a value in the edge id field will be returned.
*/
public boolean isOnlyLeafNodes() {
return onlyLeafNodes;
}
public void setOnlyLeafNodes(boolean onlyLeafNodes) {
this.onlyLeafNodes = onlyLeafNodes;
}
/**
* @return If true, the documents that matched the root nodes query will be returned; otherwise they are removed from the result set.
*/
public boolean isReturnRoot() {
return returnRoot;
}
public void setReturnRoot(boolean returnRoot) {
this.returnRoot = returnRoot;
}
@Override
public int hashCode() {
final int prime = 31;
int result = super.hashCode();
result = prime * result + ((fromField == null) ? 0 : fromField.hashCode());
result = prime * result + maxDepth;
result = prime * result + (onlyLeafNodes ? 1231 : 1237);
result = prime * result + ((q == null) ? 0 : q.hashCode());
result = prime * result + (returnRoot ? 1231 : 1237);
result = prime * result + ((toField == null) ? 0 : toField.hashCode());
result = prime * result + ((traversalFilter == null) ? 0 : traversalFilter.hashCode());
result = prime * result + (useAutn ? 1231 : 1237);
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (!super.equals(obj))
return false;
if (getClass() != obj.getClass())
return false;
GraphQuery other = (GraphQuery) obj;
if (fromField == null) {
if (other.fromField != null)
return false;
} else if (!fromField.equals(other.fromField))
return false;
if (maxDepth != other.maxDepth)
return false;
if (onlyLeafNodes != other.onlyLeafNodes)
return false;
if (q == null) {
if (other.q != null)
return false;
} else if (!q.equals(other.q))
return false;
if (returnRoot != other.returnRoot)
return false;
if (toField == null) {
if (other.toField != null)
return false;
} else if (!toField.equals(other.toField))
return false;
if (traversalFilter == null) {
if (other.traversalFilter != null)
return false;
} else if (!traversalFilter.equals(other.traversalFilter))
return false;
if (useAutn != other.useAutn)
return false;
return true;
}
}
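A minimal programmatic sketch of the public API above, mirroring the setter calls GraphQueryParser (next file) makes after parsing local params; the seed query and field names here are illustrative assumptions:

// Illustrative only: the seed query and field names are assumed for this sketch.
Query seed = new TermQuery(new Term("id", "doc_1"));
GraphQuery gq = new GraphQuery(seed, "node_id", "edge_id");
gq.setMaxDepth(2);          // stop after two hops; -1 (the default) means no limit
gq.setReturnRoot(false);    // drop the seed documents from the final result set
gq.setOnlyLeafNodes(true);  // keep only documents with no values in the edge field
gq.setUseAutn(true);        // compile each frontier into an automaton query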

View File

@@ -0,0 +1,70 @@
package org.apache.solr.search.join;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.Query;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.QParser;
import org.apache.solr.search.QueryParsing;
import org.apache.solr.search.SyntaxError;
/**
* Solr query parser that handles graph query requests.
*/
public class GraphQueryParser extends QParser {
public GraphQueryParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
super(qstr, localParams, params, req);
}
@Override
public Query parse() throws SyntaxError {
// grab query params and defaults
SolrParams localParams = getLocalParams();
Query rootNodeQuery = subQuery(localParams.get(QueryParsing.V), null).getQuery();
String traversalFilterS = localParams.get("traversalFilter");
Query traversalFilter = traversalFilterS == null ? null : subQuery(traversalFilterS, null).getQuery();
String fromField = localParams.get("from", "node_id");
String toField = localParams.get("to", "edge_ids");
// if true, return only documents that have no values in the edge id field.
boolean onlyLeafNodes = localParams.getBool("returnOnlyLeaf", false);
// choose if you want to return documents that match the initial query or not.
boolean returnRootNodes = localParams.getBool("returnRoot", true);
// the maximum depth to traverse in the graph; -1 means no limit
int maxDepth = localParams.getInt("maxDepth", -1);
// if true, an automaton will be compiled to issue the next graph hop
// this avoids having a large number of boolean clauses. (and it's faster too!)
boolean useAutn = localParams.getBool("useAutn", false);
// Construct a graph query object based on parameters passed in.
GraphQuery gq = new GraphQuery(rootNodeQuery, fromField, toField, traversalFilter);
// set additional parameters that are not in the constructor.
gq.setMaxDepth(maxDepth);
gq.setOnlyLeafNodes(onlyLeafNodes);
gq.setReturnRoot(returnRootNodes);
gq.setUseAutn(useAutn);
// return the parsed graph query.
return gq;
}
}
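Note the defaults above: with from and to omitted, a bare {!graph} query is equivalent to spelling out the default field names:

q={!graph}id:doc_1        is shorthand for        q={!graph from=node_id to=edge_ids}id:doc_1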

View File

@@ -0,0 +1,136 @@
package org.apache.solr.search.join;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.FixedBitSet;
import org.apache.solr.search.BitDocSet;
import org.apache.solr.search.DocSet;
/**
* A graph hit collector. This accumulates the edges for a given graph traversal.
* On each call to collect, the collector skips edge extraction for nodes that it has
* already traversed.
* @lucene.internal
*/
class GraphTermsCollector extends SimpleCollector implements Collector {
// the field to collect edge ids from
private String field;
// all the collected terms
private BytesRefHash collectorTerms;
private SortedSetDocValues docTermOrds;
// the result set that is being collected.
private Bits currentResult;
// known leaf nodes
private DocSet leafNodes;
// number of hits discovered at this level.
int numHits=0;
BitSet bits;
final int maxDoc;
int base;
int baseInParent;
// if we care to track this.
boolean hasCycles = false;
GraphTermsCollector(String field,int maxDoc, Bits currentResult, DocSet leafNodes) {
this.field = field;
this.maxDoc = maxDoc;
this.collectorTerms = new BytesRefHash();
this.currentResult = currentResult;
this.leafNodes = leafNodes;
if (bits==null) {
// create a bitset at the start that will hold the graph traversal result set
bits = new FixedBitSet(maxDoc);
}
}
public void collect(int doc) throws IOException {
doc += base;
if (currentResult.get(doc)) {
// cycle detected / already been here.
// knowing if your graph had a cycle might be useful and it's lightweight to implement here.
hasCycles = true;
return;
}
// collect the docs
addDocToResult(doc);
// Optimization to not look up edges for a document that is a leaf node
if (!leafNodes.exists(doc)) {
addEdgeIdsToResult(doc-base);
}
// Note: tracking links in for each result would be a huge memory hog... so not implementing at this time.
}
private void addEdgeIdsToResult(int doc) throws IOException {
// set the doc to pull the edge ids for.
docTermOrds.setDocument(doc);
BytesRef edgeValue = new BytesRef();
long ord;
while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
// TODO: handle non string type fields.
edgeValue = docTermOrds.lookupOrd(ord);
// add the edge id to the collector terms.
collectorTerms.add(edgeValue);
}
}
private void addDocToResult(int docWithBase) {
// this document is part of the traversal. mark it in our bitmap.
bits.set(docWithBase);
// increment the hit count so we know how many docs we traversed this time.
numHits++;
}
public DocSet getDocSet() {
if (bits == null) {
// TODO: this shouldn't happen
bits = new FixedBitSet(maxDoc);
}
return new BitDocSet((FixedBitSet)bits,numHits);
}
@Override
public void doSetNextReader(LeafReaderContext context) throws IOException {
// Grab the updated doc values.
docTermOrds = DocValues.getSortedSet(context.reader(), field);
base = context.docBase;
baseInParent = context.docBaseInParent;
}
public BytesRefHash getCollectorTerms() {
return collectorTerms;
}
@Override
public boolean needsScores() {
return false;
}
}
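Condensed from GraphQuery.getDocSet above, one traversal level drives the collector like this:

// One level of the traversal (condensed from GraphQuery.getDocSet).
GraphTermsCollector collector = new GraphTermsCollector(toField, capacity, resultBits, leafNodes);
fromSearcher.search(frontierQuery, collector);
BytesRefHash edgeTerms = collector.getCollectorTerms(); // edge ids discovered at this level
DocSet visited = collector.getDocSet();                 // documents visited at this level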

View File

@@ -0,0 +1,71 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
license agreements. See the NOTICE file distributed with this work for additional
information regarding copyright ownership. The ASF licenses this file to
You under the Apache License, Version 2.0 (the "License"); you may not use
this file except in compliance with the License. You may obtain a copy of
the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
by applicable law or agreed to in writing, software distributed under the
License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
OF ANY KIND, either express or implied. See the License for the specific
language governing permissions and limitations under the License. -->
<!-- This is a stripped down schema that includes the node_id and edge_id
fields to test graph queries -->
<schema name="graphexample" version="1.5">
<!-- field names should consist of alphanumeric or underscore characters
only and not start with a digit. This is not currently strictly enforced,
but other field names will not have first class support from all components
and back compatibility is not guaranteed. Names with both leading and trailing
underscores (e.g. _version_) are reserved. -->
<!-- unique id for all records in the index. -->
<field name="id" type="string" indexed="true" stored="true"
required="true" multiValued="false" />
<!-- If you remove this field, you must _also_ disable the update log in
solrconfig.xml or Solr won't start. _version_ and update log are required
for SolrCloud -->
<field name="_version_" type="long" indexed="true" stored="true" />
<!-- points to the root document of a block of nested documents. Required
for nested document support, may be removed otherwise (not used in graph
query test) -->
<field name="_root_" type="string" indexed="true" stored="false" />
<!-- the field that contains the "node_id" for graph traversal -->
<field name="node_id" type="string" indexed="true" stored="true"
multiValued="false" omitNorms="true" termVectors="true" />
<!-- multi-valued field that contains the edge ids for graph traversal -->
<field name="edge_id" type="string" indexed="true" stored="true"
multiValued="true" omitNorms="true" omitPositions="true" termVectors="true" />
<!-- typical title/text fields -->
<field name="title" type="text_general" indexed="true" stored="true"
multiValued="true" omitNorms="true" omitPositions="true" termVectors="true" />
<field name="text" type="text_general" indexed="true" stored="true"
multiValued="true" omitNorms="true" omitPositions="true" termVectors="true" />
<!-- catch all field for indexing unknown fields. -->
<dynamicField name="*" type="string" indexed="true"
stored="true" multiValued="true" />
<!-- call out the explicit doc id. -->
<uniqueKey>id</uniqueKey>
<!-- Field types -->
<fieldType name="string" class="solr.StrField"
sortMissingLast="true" />
<fieldType name="long" class="solr.TrieLongField"
precisionStep="0" positionIncrementGap="0" />
<fieldType name="text_general" class="solr.TextField"
positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory" />
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory" />
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
</schema>
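For reference, an update request matching this schema might look like the following; the values are taken from the unit test below:

<add>
  <doc>
    <field name="id">doc_1</field>
    <field name="node_id">1</field>
    <field name="edge_id">2</field>
    <field name="text">foo</field>
  </doc>
</add>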

View File

@@ -412,6 +412,42 @@ public class QueryEqualityTest extends SolrTestCaseJ4 {
"{!child of=foo_s:parent}dude");
}
public void testGraphQuery() throws Exception {
SolrQueryRequest req = req("from", "node_s",
"to","edge_s",
"traversalFilter","foo",
"returnOnlyLeaf","true",
"returnRoot","false",
"maxDepth","2",
"useAutn","false"
);
// make sure param substitution works for all args to the graph query.
assertQueryEquals("graph", req,
"{!graph from=node_s to=edge_s}*:*",
"{!graph from=$from to=$to}*:*");
assertQueryEquals("graph", req,
"{!graph from=node_s to=edge_s traversalFilter=foo}*:*",
"{!graph from=$from to=$to traversalFilter=$traversalFilter}*:*");
assertQueryEquals("graph", req,
"{!graph from=node_s to=edge_s traversalFilter=foo returnOnlyLeaf=true}*:*",
"{!graph from=$from to=$to traversalFilter=$traversalFilter returnOnlyLeaf=$returnOnlyLeaf}*:*");
assertQueryEquals("graph", req,
"{!graph from=node_s to=edge_s traversalFilter=foo returnOnlyLeaf=true returnRoot=false}*:*",
"{!graph from=$from to=$to traversalFilter=$traversalFilter returnOnlyLeaf=$returnOnlyLeaf returnRoot=$returnRoot}*:*");
assertQueryEquals("graph", req,
"{!graph from=node_s to=edge_s traversalFilter=foo returnOnlyLeaf=true returnRoot=false maxDepth=2}*:*",
"{!graph from=$from to=$to traversalFilter=$traversalFilter returnOnlyLeaf=$returnOnlyLeaf returnRoot=$returnRoot maxDepth=$maxDepth}*:*");
assertQueryEquals("graph", req,
"{!graph from=node_s to=edge_s traversalFilter=foo returnOnlyLeaf=true returnRoot=false maxDepth=2 useAutn=false}*:*",
"{!graph from=$from to=$to traversalFilter=$traversalFilter returnOnlyLeaf=$returnOnlyLeaf returnRoot=$returnRoot maxDepth=$maxDepth useAutn=$useAutn}*:*");
}
public void testQuerySurround() throws Exception {
assertQueryEquals("surround", "{!surround}and(apache,solr)",
"and(apache,solr)", "apache AND solr");

View File

@@ -0,0 +1,94 @@
package org.apache.solr.search.join;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.request.SolrQueryRequest;
import org.junit.BeforeClass;
import org.junit.Test;
public class GraphQueryTest extends SolrTestCaseJ4 {
@BeforeClass
public static void beforeTests() throws Exception {
initCore("solrconfig.xml","schema-graph.xml");
}
@Test
public void testGraph() throws Exception {
// 1 -> 2 -> 3 -> ( 4 5 )
// 7 -> 1
// 8 -> ( 1 2 )
assertU(adoc("id", "doc_1", "node_id", "1", "edge_id", "2", "text", "foo", "title", "foo10"));
assertU(adoc("id", "doc_2", "node_id", "2", "edge_id", "3", "text", "foo"));
assertU(commit());
assertU(adoc("id", "doc_3", "node_id", "3", "edge_id", "4", "edge_id", "5", "table", "foo"));
assertU(adoc("id", "doc_4", "node_id", "4", "table", "foo"));
assertU(commit());
assertU(adoc("id", "doc_5", "node_id", "5", "edge_id", "7", "table", "bar"));
assertU(adoc("id", "doc_6", "node_id", "6", "edge_id", "3" ));
assertU(adoc("id", "doc_7", "node_id", "7", "edge_id", "1" ));
assertU(adoc("id", "doc_8", "node_id", "8", "edge_id", "1", "edge_id", "2" ));
assertU(adoc("id", "doc_9", "node_id", "9"));
assertU(commit());
// update docs so they're in a new segment.
assertU(adoc("id", "doc_1", "node_id", "1", "edge_id", "2", "text", "foo"));
assertU(adoc("id", "doc_2", "node_id", "2", "edge_id", "3", "edge_id", "9", "text", "foo11"));
assertU(commit());
// a graph for testing the traversal filter: 10 -> 11 -> (12 | 13)
assertU(adoc("id", "doc_10", "node_id", "10", "edge_id", "11", "title", "foo"));
assertU(adoc("id", "doc_11", "node_id", "11", "edge_id", "12", "edge_id", "13", "text", "foo11"));
assertU(adoc("id", "doc_12", "node_id", "12", "text", "foo10"));
assertU(adoc("id", "doc_13", "node_id", "13", "edge_id", "12", "text", "foo10"));
assertU(commit());
// Now we have created a simple graph
// start traversal from node id to edge id
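// Expected trace for the first assertion: starting at doc_1, the frontier expands
// 1 -> 2 -> {3, 9} -> {4, 5} -> 7 -> 1 (a cycle, so traversal stops);
// docs 1, 2, 3, 9, 4, 5 and 7 are visited, giving the 7 hits asserted below.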
String gQuery = "{!graph from=\"node_id\" to=\"edge_id\"}id:doc_1";
SolrQueryRequest qr = createRequest(gQuery);
assertQ(qr,"//*[@numFound='7']");
String g2Query = "{!graph from=\"node_id\" to=\"edge_id\" returnRoot=\"true\" returnOnlyLeaf=\"false\"}id:doc_8";
qr = createRequest(g2Query);
assertQ(qr,"//*[@numFound='8']");
String g3Query = "{!graph from=\"node_id\" to=\"edge_id\" returnRoot=\"false\" returnOnlyLeaf=\"false\"}id:doc_8";
qr = createRequest(g3Query);
assertQ(qr,"//*[@numFound='7']");
String g4Query = "{!graph from=\"node_id\" to=\"edge_id\" returnRoot=\"true\" returnOnlyLeaf=\"false\" traversalFilter=\"text:foo11\"}id:doc_8";
qr = createRequest(g4Query);
assertQ(qr,"//*[@numFound='2']");
}
private SolrQueryRequest createRequest(String query) {
SolrQueryRequest qr = req(query);
NamedList<Object> par = qr.getParams().toNamedList();
par.add("debug", "true");
par.add("rows", "10");
par.add("fl", "id,node_id,edge_id");
par.remove("qt");
SolrParams newp = SolrParams.toSolrParams(par);
qr.setParams(newp);
return qr;
}
}