SOLR-9337: Add fetch Streaming Expression

Joel Bernstein 2016-10-10 14:20:09 -04:00
parent 4fe3110e49
commit ee3f9e1e05
3 changed files with 482 additions and 0 deletions

StreamHandler.java

@@ -137,6 +137,7 @@ public class StreamHandler extends RequestHandlerBase implements SolrCoreAware,
.withFunctionName("scoreNodes", ScoreNodesStream.class)
.withFunctionName("model", ModelStream.class)
.withFunctionName("classify", ClassifyStream.class)
.withFunctionName("fetch", FetchStream.class)
// metrics
.withFunctionName("min", MinMetric.class)

FetchStream.java

@@ -0,0 +1,314 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.client.solrj.io.stream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.HashMap;
import org.apache.solr.client.solrj.io.Tuple;
import org.apache.solr.client.solrj.io.comp.StreamComparator;
import org.apache.solr.client.solrj.io.stream.expr.Explanation;
import org.apache.solr.client.solrj.io.stream.expr.Explanation.ExpressionType;
import org.apache.solr.client.solrj.io.stream.expr.Expressible;
import org.apache.solr.client.solrj.io.stream.expr.StreamExplanation;
import org.apache.solr.client.solrj.io.stream.expr.StreamExpression;
import org.apache.solr.client.solrj.io.stream.expr.StreamExpressionNamedParameter;
import org.apache.solr.client.solrj.io.stream.expr.StreamExpressionValue;
import org.apache.solr.client.solrj.io.stream.expr.StreamFactory;
import org.apache.solr.common.params.ModifiableSolrParams;
/**
 * Iterates over a stream and fetches additional fields from a specified collection.
 * Fetches are done in batches.
 *
 * Syntax:
 *
 * fetch(collection, stream, on="a=b", fl="c,d,e", batchSize="50")
 *
 * The "on" parameter maps a field in the underlying stream to the key field in the fetch
 * collection ("leftField=rightField", or a single field name shared by both sides). The "fl"
 * parameter lists the fields to fetch, and "batchSize" controls how many tuples are read from
 * the underlying stream before each fetch query is issued.
 **/
public class FetchStream extends TupleStream implements Expressible {
private static final long serialVersionUID = 1;
protected String zkHost;
private TupleStream stream;
private StreamContext streamContext;
private Iterator<Tuple> tuples;
private String leftKey;
private String rightKey;
private String fieldList;
private String[] fields;
private String collection;
private int batchSize;
private boolean appendVersion = true;
private boolean appendKey = true;
public FetchStream(String zkHost, String collection, TupleStream tupleStream, String on, String fieldList, int batchSize) throws IOException {
init(zkHost, collection, tupleStream, on, fieldList, batchSize);
}
public FetchStream(StreamExpression expression, StreamFactory factory) throws IOException {
// grab all parameters out
String collectionName = factory.getValueOperand(expression, 0);
List<StreamExpression> streamExpressions = factory.getExpressionOperandsRepresentingTypes(expression, Expressible.class, TupleStream.class);
StreamExpressionNamedParameter onParam = factory.getNamedOperand(expression, "on");
StreamExpressionNamedParameter flParam = factory.getNamedOperand(expression, "fl");
StreamExpressionNamedParameter batchSizeParam = factory.getNamedOperand(expression, "batchSize");
StreamExpressionNamedParameter zkHostExpression = factory.getNamedOperand(expression, "zkHost");
String on = null;
String fl = null;
int batchSize = 50;
if(onParam == null) {
throw new IOException("on parameter cannot be null for the fetch expression");
} else {
on = ((StreamExpressionValue)onParam.getParameter()).getValue();
}
if(flParam == null) {
throw new IOException("fl parameter cannot be null for the fetch expression");
} else {
fl = ((StreamExpressionValue)flParam.getParameter()).getValue();
}
if(batchSizeParam != null) {
batchSize = Integer.parseInt(((StreamExpressionValue)batchSizeParam.getParameter()).getValue());
}
if(1 != streamExpressions.size()){
throw new IOException(String.format(Locale.ROOT,"Invalid expression %s - expecting a single stream but found %d",expression, streamExpressions.size()));
}
TupleStream stream = factory.constructStream(streamExpressions.get(0));
String zkHost = null;
if(null == zkHostExpression){
zkHost = factory.getCollectionZkHost(collectionName);
if(zkHost == null) {
zkHost = factory.getDefaultZkHost();
}
}
else if(zkHostExpression.getParameter() instanceof StreamExpressionValue){
zkHost = ((StreamExpressionValue)zkHostExpression.getParameter()).getValue();
}
if(null == zkHost){
throw new IOException(String.format(Locale.ROOT,"invalid expression %s - zkHost not found for collection '%s'",expression,collectionName));
}
init(zkHost, collectionName, stream, on, fl, batchSize);
}
private void init(String zkHost, String collection, TupleStream tupleStream, String on, String fieldList, int batchSize) throws IOException{
this.zkHost = zkHost;
this.collection = collection;
this.stream = tupleStream;
this.batchSize = batchSize;
this.fields = fieldList.split(",");
this.fieldList = fieldList;
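// The "on" clause is either a single field name shared by both sides or a "leftField=rightField" mapping.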
if(on.indexOf("=") > -1) {
String[] leftright = on.split("=");
leftKey = leftright[0].trim();
rightKey = leftright[1].trim();
} else {
leftKey = rightKey = on;
}
for(int i=0; i<fields.length; i++) {
fields[i] = fields[i].trim();
if(fields[i].equals("_version_")) {
appendVersion = false;
}
if(fields[i].equals(rightKey)) {
appendKey = false;
}
}
}
@Override
public StreamExpression toExpression(StreamFactory factory) throws IOException {
return toExpression(factory, true);
}
private StreamExpression toExpression(StreamFactory factory, boolean includeStreams) throws IOException {
// function name
StreamExpression expression = new StreamExpression(factory.getFunctionName(this.getClass()));
expression.addParameter(collection);
expression.addParameter(new StreamExpressionNamedParameter("on", leftKey+"="+rightKey));
expression.addParameter(new StreamExpressionNamedParameter("fl", fieldList));
expression.addParameter(new StreamExpressionNamedParameter("batchSize", Integer.toString(batchSize)));
// stream
if(includeStreams) {
if (stream instanceof Expressible) {
expression.addParameter(((Expressible) stream).toExpression(factory));
} else {
throw new IOException("The FetchStream contains a non-expressible TupleStream - it cannot be converted to an expression");
}
}
return expression;
}
@Override
public Explanation toExplanation(StreamFactory factory) throws IOException {
return new StreamExplanation(getStreamNodeId().toString())
.withChildren(new Explanation[]{
stream.toExplanation(factory)
})
.withFunctionName(factory.getFunctionName(this.getClass()))
.withImplementingClass(this.getClass().getName())
.withExpressionType(ExpressionType.STREAM_DECORATOR)
.withExpression(toExpression(factory, false).toString());
}
public void setStreamContext(StreamContext streamContext) {
this.streamContext = streamContext;
this.stream.setStreamContext(streamContext);
}
public List<TupleStream> children() {
List<TupleStream> l = new ArrayList<>();
l.add(stream);
return l;
}
public void open() throws IOException {
tuples = new ArrayList<Tuple>().iterator();
stream.open();
}
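/**
 * Reads up to batchSize tuples from the underlying stream, then issues a single query to the
 * fetch collection that ORs together the join keys from the batch. Matching documents are keyed
 * by the right-hand join field and their fetched fields are merged back into the batched tuples.
 */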
private void fetchBatch() throws IOException {
Tuple EOFTuple = null;
List<Tuple> batch = new ArrayList<>();
for(int i=0; i<batchSize; i++) {
Tuple tuple = stream.read();
if(tuple.EOF) {
EOFTuple = tuple;
break;
} else {
batch.add(tuple);
}
}
if(batch.size() > 0) {
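// Build a terms query on the right-hand join key from the keys collected in this batch, e.g. rightKey:(key1 key2 key3)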
StringBuilder buf = new StringBuilder();
buf.append(rightKey);
buf.append(":(");
for (int i = 0; i < batch.size(); i++) {
if (i > 0) {
buf.append(" ");
}
Tuple tuple = batch.get(i);
String key = tuple.getString(leftKey);
buf.append(key);
}
buf.append(")");
ModifiableSolrParams params = new ModifiableSolrParams();
params.add("q", buf.toString());
params.add("fl", fieldList+appendFields());
params.add("rows", Integer.toString(batchSize));
params.add("sort", "_version_ desc");
CloudSolrStream cloudSolrStream = new CloudSolrStream(zkHost, collection, params);
StreamContext newContext = new StreamContext();
newContext.setSolrClientCache(streamContext.getSolrClientCache());
cloudSolrStream.setStreamContext(newContext);
Map<String, Tuple> fetched = new HashMap<>();
try {
cloudSolrStream.open();
while (true) {
Tuple t = cloudSolrStream.read();
if (t.EOF) {
break;
} else {
String rightValue = t.getString(rightKey);
fetched.put(rightValue, t);
}
}
} finally {
cloudSolrStream.close();
}
//Iterate the batch and add the fetched fields to the Tuples
for (Tuple batchTuple : batch) {
Tuple fetchedTuple = fetched.get(batchTuple.getString(leftKey));
if(fetchedTuple !=null) {
for (String field : fields) {
Object value = fetchedTuple.get(field);
if(value != null) {
batchTuple.put(field, value);
}
}
}
}
}
if(EOFTuple != null) {
batch.add(EOFTuple);
}
this.tuples = batch.iterator();
}
public void close() throws IOException {
stream.close();
}
public Tuple read() throws IOException {
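// Lazily pull the next batch from the underlying stream once the current batch is exhausted.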
if(!tuples.hasNext()) {
fetchBatch();
}
return tuples.next();
}
public StreamComparator getStreamSort(){
return stream.getStreamSort();
}
public int getCost() {
return 0;
}
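// Appends the join key and _version_ (used by the sort) to the fl only when they were not
// already requested, so fetched documents can be matched back to the batched tuples.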
private String appendFields() {
StringBuilder buf = new StringBuilder();
if(appendKey) {
buf.append(",");
buf.append(rightKey);
}
if(appendVersion) {
buf.append(",_version_");
}
return buf.toString();
}
}

StreamExpressionTest.java

@@ -717,6 +717,173 @@ public class StreamExpressionTest extends SolrCloudTestCase {
}
@Test
public void testFetchStream() throws Exception {
SolrClientCache solrClientCache = new SolrClientCache();
new UpdateRequest()
.add(id, "0", "a_s", "hello0", "a_i", "0", "a_f", "1", "subject", "blah blah blah 0")
.add(id, "2", "a_s", "hello0", "a_i", "2", "a_f", "2", "subject", "blah blah blah 2")
.add(id, "3", "a_s", "hello3", "a_i", "3", "a_f", "3", "subject", "blah blah blah 3")
.add(id, "4", "a_s", "hello4", "a_i", "4", "a_f", "4", "subject", "blah blah blah 4")
.add(id, "1", "a_s", "hello0", "a_i", "1", "a_f", "5", "subject", "blah blah blah 1")
.add(id, "5", "a_s", "hello3", "a_i", "5", "a_f", "6", "subject", "blah blah blah 5")
.add(id, "6", "a_s", "hello4", "a_i", "6", "a_f", "7", "subject", "blah blah blah 6")
.add(id, "7", "a_s", "hello3", "a_i", "7", "a_f", "8", "subject", "blah blah blah 7")
.add(id, "8", "a_s", "hello3", "a_i", "8", "a_f", "9", "subject", "blah blah blah 8")
.add(id, "9", "a_s", "hello0", "a_i", "9", "a_f", "10", "subject", "blah blah blah 9")
.commit(cluster.getSolrClient(), COLLECTION);
TupleStream stream;
List<Tuple> tuples;
StreamFactory factory = new StreamFactory()
.withCollectionZkHost(COLLECTION, cluster.getZkServer().getZkAddress())
.withFunctionName("search", CloudSolrStream.class)
.withFunctionName("fetch", FetchStream.class);
stream = factory.constructStream("fetch("+COLLECTION+", search(" + COLLECTION + ", q=*:*, fl=\"id,a_s,a_i,a_f\", sort=\"a_f asc\"), on=\"id=a_i\", batchSize=\"2\", fl=\"subject\")");
StreamContext context = new StreamContext();
context.setSolrClientCache(solrClientCache);
stream.setStreamContext(context);
tuples = getTuples(stream);
assert(tuples.size() == 10);
Tuple t = tuples.get(0);
assertTrue("blah blah blah 0".equals(t.getString("subject")));
t = tuples.get(1);
assertTrue("blah blah blah 2".equals(t.getString("subject")));
t = tuples.get(2);
assertTrue("blah blah blah 3".equals(t.getString("subject")));
t = tuples.get(3);
assertTrue("blah blah blah 4".equals(t.getString("subject")));
t = tuples.get(4);
assertTrue("blah blah blah 1".equals(t.getString("subject")));
t = tuples.get(5);
assertTrue("blah blah blah 5".equals(t.getString("subject")));
t = tuples.get(6);
assertTrue("blah blah blah 6".equals(t.getString("subject")));
t = tuples.get(7);
assertTrue("blah blah blah 7".equals(t.getString("subject")));
t = tuples.get(8);
assertTrue("blah blah blah 8".equals(t.getString("subject")));
t = tuples.get(9);
assertTrue("blah blah blah 9".equals(t.getString("subject")));
//Change the batch size
stream = factory.constructStream("fetch("+COLLECTION+", search(" + COLLECTION + ", q=*:*, fl=\"id,a_s,a_i,a_f\", sort=\"a_f asc\"), on=\"id=a_i\", batchSize=\"3\", fl=\"subject\")");
context = new StreamContext();
context.setSolrClientCache(solrClientCache);
stream.setStreamContext(context);
tuples = getTuples(stream);
assert(tuples.size() == 10);
t = tuples.get(0);
assertTrue("blah blah blah 0".equals(t.getString("subject")));
t = tuples.get(1);
assertTrue("blah blah blah 2".equals(t.getString("subject")));
t = tuples.get(2);
assertTrue("blah blah blah 3".equals(t.getString("subject")));
t = tuples.get(3);
assertTrue("blah blah blah 4".equals(t.getString("subject")));
t = tuples.get(4);
assertTrue("blah blah blah 1".equals(t.getString("subject")));
t = tuples.get(5);
assertTrue("blah blah blah 5".equals(t.getString("subject")));
t = tuples.get(6);
assertTrue("blah blah blah 6".equals(t.getString("subject")));
t = tuples.get(7);
assertTrue("blah blah blah 7".equals(t.getString("subject")));
t = tuples.get(8);
assertTrue("blah blah blah 8".equals(t.getString("subject")));
t = tuples.get(9);
assertTrue("blah blah blah 9".equals(t.getString("subject")));
solrClientCache.close();
}
@Test
public void testParallelFetchStream() throws Exception {
new UpdateRequest()
.add(id, "0", "a_s", "hello0", "a_i", "0", "a_f", "1", "subject", "blah blah blah 0")
.add(id, "2", "a_s", "hello0", "a_i", "2", "a_f", "2", "subject", "blah blah blah 2")
.add(id, "3", "a_s", "hello3", "a_i", "3", "a_f", "3", "subject", "blah blah blah 3")
.add(id, "4", "a_s", "hello4", "a_i", "4", "a_f", "4", "subject", "blah blah blah 4")
.add(id, "1", "a_s", "hello0", "a_i", "1", "a_f", "5", "subject", "blah blah blah 1")
.add(id, "5", "a_s", "hello3", "a_i", "5", "a_f", "6", "subject", "blah blah blah 5")
.add(id, "6", "a_s", "hello4", "a_i", "6", "a_f", "7", "subject", "blah blah blah 6")
.add(id, "7", "a_s", "hello3", "a_i", "7", "a_f", "8", "subject", "blah blah blah 7")
.add(id, "8", "a_s", "hello3", "a_i", "8", "a_f", "9", "subject", "blah blah blah 8")
.add(id, "9", "a_s", "hello0", "a_i", "9", "a_f", "10", "subject", "blah blah blah 9")
.commit(cluster.getSolrClient(), COLLECTION);
TupleStream stream;
List<Tuple> tuples;
StreamFactory factory = new StreamFactory()
.withCollectionZkHost(COLLECTION, cluster.getZkServer().getZkAddress())
.withFunctionName("search", CloudSolrStream.class)
.withFunctionName("parallel", ParallelStream.class)
.withFunctionName("fetch", FetchStream.class);
stream = factory.constructStream("parallel("+COLLECTION+", workers=2, sort=\"a_f asc\", fetch("+COLLECTION+", search(" + COLLECTION + ", q=*:*, fl=\"id,a_s,a_i,a_f\", sort=\"a_f asc\", partitionKeys=\"id\"), on=\"id=a_i\", batchSize=\"2\", fl=\"subject\"))");
tuples = getTuples(stream);
assert(tuples.size() == 10);
Tuple t = tuples.get(0);
assertTrue("blah blah blah 0".equals(t.getString("subject")));
t = tuples.get(1);
assertTrue("blah blah blah 2".equals(t.getString("subject")));
t = tuples.get(2);
assertTrue("blah blah blah 3".equals(t.getString("subject")));
t = tuples.get(3);
assertTrue("blah blah blah 4".equals(t.getString("subject")));
t = tuples.get(4);
assertTrue("blah blah blah 1".equals(t.getString("subject")));
t = tuples.get(5);
assertTrue("blah blah blah 5".equals(t.getString("subject")));
t = tuples.get(6);
assertTrue("blah blah blah 6".equals(t.getString("subject")));
t = tuples.get(7);
assertTrue("blah blah blah 7".equals(t.getString("subject")));
t = tuples.get(8);
assertTrue("blah blah blah 8".equals(t.getString("subject")));
t = tuples.get(9);
assertTrue("blah blah blah 9".equals(t.getString("subject")));
stream = factory.constructStream("parallel("+COLLECTION+", workers=2, sort=\"a_f asc\", fetch("+COLLECTION+", search(" + COLLECTION + ", q=*:*, fl=\"id,a_s,a_i,a_f\", sort=\"a_f asc\", partitionKeys=\"id\"), on=\"id=a_i\", batchSize=\"3\", fl=\"subject\"))");
tuples = getTuples(stream);
assert(tuples.size() == 10);
t = tuples.get(0);
assertTrue("blah blah blah 0".equals(t.getString("subject")));
t = tuples.get(1);
assertTrue("blah blah blah 2".equals(t.getString("subject")));
t = tuples.get(2);
assertTrue("blah blah blah 3".equals(t.getString("subject")));
t = tuples.get(3);
assertTrue("blah blah blah 4".equals(t.getString("subject")));
t = tuples.get(4);
assertTrue("blah blah blah 1".equals(t.getString("subject")));
t = tuples.get(5);
assertTrue("blah blah blah 5".equals(t.getString("subject")));
t = tuples.get(6);
assertTrue("blah blah blah 6".equals(t.getString("subject")));
t = tuples.get(7);
assertTrue("blah blah blah 7".equals(t.getString("subject")));
t = tuples.get(8);
assertTrue("blah blah blah 8".equals(t.getString("subject")));
t = tuples.get(9);
assertTrue("blah blah blah 9".equals(t.getString("subject")));
}
@Test
public void testDaemonStream() throws Exception {