mirror of https://github.com/apache/lucene.git
SOLR-10786: Add DBSCAN clustering Streaming Evaluator
This commit is contained in:
parent
1f6c06f305
commit
fa7fe3af19
|
@ -302,7 +302,7 @@ public class Lang {
|
|||
.withFunctionName("trim", TrimEvaluator.class)
|
||||
.withFunctionName("cosine", CosineDistanceEvaluator.class)
|
||||
.withFunctionName("trunc", TruncEvaluator.class)
|
||||
|
||||
.withFunctionName("dbscan", DbscanEvaluator.class)
|
||||
// Boolean Stream Evaluators
|
||||
|
||||
.withFunctionName("and", AndEvaluator.class)
|
||||
|
|
|
@ -0,0 +1,141 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.client.solrj.io.eval;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
|
||||
import org.apache.commons.math3.ml.clustering.Cluster;
|
||||
import org.apache.commons.math3.ml.distance.DistanceMeasure;
|
||||
import org.apache.commons.math3.ml.distance.EuclideanDistance;
|
||||
import org.apache.solr.client.solrj.io.Tuple;
|
||||
import org.apache.commons.math3.ml.clustering.Clusterable;
|
||||
import org.apache.commons.math3.ml.clustering.DBSCANClusterer;
|
||||
import org.apache.solr.client.solrj.io.stream.expr.StreamExpression;
|
||||
import org.apache.solr.client.solrj.io.stream.expr.StreamFactory;
|
||||
|
||||
public class DbscanEvaluator extends RecursiveObjectEvaluator implements ManyValueWorker {
|
||||
protected static final long serialVersionUID = 1L;
|
||||
|
||||
|
||||
public DbscanEvaluator(StreamExpression expression, StreamFactory factory) throws IOException{
|
||||
super(expression, factory);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object doWork(Object values[]) throws IOException {
|
||||
|
||||
Matrix matrix = null;
|
||||
double e = 0;
|
||||
int minPoints = 1;
|
||||
DistanceMeasure distanceMeasure = new EuclideanDistance();
|
||||
|
||||
if(values.length < 3 || values.length > 4) {
|
||||
throw new IOException("The dbscan scan function requires 3 or 4 parameters.");
|
||||
}
|
||||
|
||||
if(values[0] instanceof Matrix) {
|
||||
matrix = (Matrix)values[0];
|
||||
} else {
|
||||
throw new IOException("The first parameter for dbscan should be the observation matrix.");
|
||||
}
|
||||
|
||||
if(values[1] instanceof Number) {
|
||||
e = ((Number)values[1]).doubleValue();
|
||||
} else {
|
||||
throw new IOException("The second parameter for dbscan should be e.");
|
||||
}
|
||||
|
||||
if(values[2] instanceof Number) {
|
||||
minPoints = ((Number)values[2]).intValue();
|
||||
} else {
|
||||
throw new IOException("The third parameter for dbscan should be minPoints.");
|
||||
}
|
||||
|
||||
if(values.length > 3) {
|
||||
distanceMeasure = (DistanceMeasure)values[3];
|
||||
}
|
||||
|
||||
DBSCANClusterer<ClusterPoint> dbscan = new DBSCANClusterer(e, minPoints, distanceMeasure);
|
||||
List<ClusterPoint> points = new ArrayList();
|
||||
double[][] data = matrix.getData();
|
||||
List<String> ids = matrix.getRowLabels();
|
||||
|
||||
for(int i=0; i<data.length; i++) {
|
||||
double[] vec = data[i];
|
||||
if(ids != null) {
|
||||
points.add(new ClusterPoint(ids.get(i), vec));
|
||||
} else {
|
||||
points.add(new ClusterPoint(Integer.toString(i), vec));
|
||||
}
|
||||
}
|
||||
|
||||
Map fields = new HashMap();
|
||||
|
||||
fields.put("e", e);
|
||||
fields.put("minPoints", minPoints);
|
||||
fields.put("distance", distanceMeasure.toString());
|
||||
|
||||
return new ClusterTuple(fields, dbscan.cluster(points), matrix.getColumnLabels());
|
||||
}
|
||||
|
||||
public static class ClusterPoint implements Clusterable {
|
||||
|
||||
private double[] point;
|
||||
private String id;
|
||||
|
||||
public ClusterPoint(String id, double[] point) {
|
||||
this.id = id;
|
||||
this.point = point;
|
||||
}
|
||||
|
||||
public double[] getPoint() {
|
||||
return this.point;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return this.id;
|
||||
}
|
||||
}
|
||||
|
||||
public static class ClusterTuple extends Tuple {
|
||||
|
||||
private List<String> columnLabels;
|
||||
private List<Cluster<ClusterPoint>> clusters;
|
||||
|
||||
public ClusterTuple(Map fields,
|
||||
List<Cluster<ClusterPoint>> clusters,
|
||||
List<String> columnLabels) {
|
||||
super(fields);
|
||||
this.clusters = clusters;
|
||||
this.columnLabels = columnLabels;
|
||||
}
|
||||
|
||||
public List<String> getColumnLabels() {
|
||||
return this.columnLabels;
|
||||
}
|
||||
|
||||
public List<Cluster<ClusterPoint>> getClusters() {
|
||||
return this.clusters;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -28,12 +28,14 @@ import java.util.Set;
|
|||
import org.apache.commons.math3.distribution.IntegerDistribution;
|
||||
import org.apache.commons.math3.distribution.RealDistribution;
|
||||
import org.apache.commons.math3.ml.clustering.CentroidCluster;
|
||||
import org.apache.commons.math3.ml.clustering.Cluster;
|
||||
import org.apache.commons.math3.random.EmpiricalDistribution;
|
||||
import org.apache.commons.math3.stat.Frequency;
|
||||
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
|
||||
import org.apache.commons.math3.util.Precision;
|
||||
import org.apache.solr.client.solrj.io.Tuple;
|
||||
import org.apache.solr.client.solrj.io.comp.StreamComparator;
|
||||
import org.apache.solr.client.solrj.io.eval.DbscanEvaluator;
|
||||
import org.apache.solr.client.solrj.io.eval.KmeansEvaluator;
|
||||
import org.apache.solr.client.solrj.io.eval.StreamEvaluator;
|
||||
import org.apache.solr.client.solrj.io.eval.Matrix;
|
||||
|
@ -214,18 +216,35 @@ public class ZplotStream extends TupleStream implements Expressible {
|
|||
}
|
||||
} else if(clusters) {
|
||||
Object o = evaluated.get("clusters");
|
||||
KmeansEvaluator.ClusterTuple ct = (KmeansEvaluator.ClusterTuple)o;
|
||||
List<CentroidCluster<KmeansEvaluator.ClusterPoint>> cs = ct.getClusters();
|
||||
int clusterNum = 0;
|
||||
for(CentroidCluster<KmeansEvaluator.ClusterPoint> c : cs) {
|
||||
clusterNum++;
|
||||
List<KmeansEvaluator.ClusterPoint> points = c.getPoints();
|
||||
for(KmeansEvaluator.ClusterPoint p : points) {
|
||||
Tuple tuple = new Tuple(new HashMap());
|
||||
tuple.put("x", p.getPoint()[0]);
|
||||
tuple.put("y", p.getPoint()[1]);
|
||||
tuple.put("cluster", "cluster"+clusterNum);
|
||||
outTuples.add(tuple);
|
||||
if(o instanceof KmeansEvaluator.ClusterTuple) {
|
||||
KmeansEvaluator.ClusterTuple ct = (KmeansEvaluator.ClusterTuple) o;
|
||||
List<CentroidCluster<KmeansEvaluator.ClusterPoint>> cs = ct.getClusters();
|
||||
int clusterNum = 0;
|
||||
for (CentroidCluster<KmeansEvaluator.ClusterPoint> c : cs) {
|
||||
clusterNum++;
|
||||
List<KmeansEvaluator.ClusterPoint> points = c.getPoints();
|
||||
for (KmeansEvaluator.ClusterPoint p : points) {
|
||||
Tuple tuple = new Tuple(new HashMap());
|
||||
tuple.put("x", p.getPoint()[0]);
|
||||
tuple.put("y", p.getPoint()[1]);
|
||||
tuple.put("cluster", "cluster" + clusterNum);
|
||||
outTuples.add(tuple);
|
||||
}
|
||||
}
|
||||
} else if(o instanceof DbscanEvaluator.ClusterTuple) {
|
||||
DbscanEvaluator.ClusterTuple ct = (DbscanEvaluator.ClusterTuple) o;
|
||||
List<Cluster<DbscanEvaluator.ClusterPoint>> cs = ct.getClusters();
|
||||
int clusterNum = 0;
|
||||
for (Cluster<DbscanEvaluator.ClusterPoint> c : cs) {
|
||||
clusterNum++;
|
||||
List<DbscanEvaluator.ClusterPoint> points = c.getPoints();
|
||||
for (DbscanEvaluator.ClusterPoint p : points) {
|
||||
Tuple tuple = new Tuple(new HashMap());
|
||||
tuple.put("x", p.getPoint()[0]);
|
||||
tuple.put("y", p.getPoint()[1]);
|
||||
tuple.put("cluster", "cluster" + clusterNum);
|
||||
outTuples.add(tuple);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if(distribution) {
|
||||
|
|
|
@ -77,7 +77,7 @@ public class TestLang extends SolrTestCase {
|
|||
"getSupportPoints", "pairSort", "log10", "plist", "recip", "pivot", "ltrim", "rtrim", "export",
|
||||
"zplot", "natural", "repeat", "movingMAD", "hashRollup", "noop", "var", "stddev", "recNum", "isNull",
|
||||
"notNull", "matches", "projectToBorder", "double", "long", "parseCSV", "parseTSV", "dateTime",
|
||||
"split", "upper", "trim", "lower", "trunc", "cosine"};
|
||||
"split", "upper", "trim", "lower", "trunc", "cosine", "dbscan"};
|
||||
|
||||
@Test
|
||||
public void testLang() {
|
||||
|
|
|
@ -2976,6 +2976,9 @@ public class MathExpressionTest extends SolrCloudTestCase {
|
|||
assertEquals(row.get(2).doubleValue(), 18.0, 0.0);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
public void testKmeans() throws Exception {
|
||||
String cexpr = "let(echo=true," +
|
||||
|
@ -3055,6 +3058,90 @@ public class MathExpressionTest extends SolrCloudTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
public void testDbscanBasic() throws Exception {
|
||||
String cexpr = "let(echo=true," +
|
||||
" a=array(5,4,5,1,1,1)," +
|
||||
" b=array(5,5,5,1,2,1)," +
|
||||
" f=dbscan(transpose(matrix(a,b)), 2, 2)," +
|
||||
" zplot(clusters=f))";
|
||||
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
|
||||
paramsLoc.set("expr", cexpr);
|
||||
paramsLoc.set("qt", "/stream");
|
||||
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+COLLECTIONORALIAS;
|
||||
TupleStream solrStream = new SolrStream(url, paramsLoc);
|
||||
StreamContext context = new StreamContext();
|
||||
solrStream.setStreamContext(context);
|
||||
List<Tuple> tuples = getTuples(solrStream);
|
||||
assertTrue(tuples.size() == 6);
|
||||
Tuple tuple0 = tuples.get(0);
|
||||
assertEquals(tuple0.getString("cluster"), "cluster1");
|
||||
Tuple tuple1 = tuples.get(1);
|
||||
assertEquals(tuple1.getString("cluster"), "cluster1");
|
||||
|
||||
Tuple tuple2 = tuples.get(2);
|
||||
assertEquals(tuple2.getString("cluster"), "cluster1");
|
||||
Tuple tuple3 = tuples.get(3);
|
||||
assertEquals(tuple3.getString("cluster"), "cluster2");
|
||||
|
||||
Tuple tuple4 = tuples.get(4);
|
||||
assertEquals(tuple4.getString("cluster"), "cluster2");
|
||||
Tuple tuple5 = tuples.get(5);
|
||||
assertEquals(tuple5.getString("cluster"), "cluster2");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDbscanDistance() throws Exception {
|
||||
String cexpr = "let(echo=true," +
|
||||
" a=array(5,4,5,1,1,1)," +
|
||||
" b=array(5,5,5,1,2,1)," +
|
||||
" f=dbscan(transpose(matrix(a,b)), 500000, 2, haversineMeters())," +
|
||||
" zplot(clusters=f))";
|
||||
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
|
||||
paramsLoc.set("expr", cexpr);
|
||||
paramsLoc.set("qt", "/stream");
|
||||
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+COLLECTIONORALIAS;
|
||||
TupleStream solrStream = new SolrStream(url, paramsLoc);
|
||||
StreamContext context = new StreamContext();
|
||||
solrStream.setStreamContext(context);
|
||||
List<Tuple> tuples = getTuples(solrStream);
|
||||
assertTrue(tuples.size() == 6);
|
||||
Tuple tuple0 = tuples.get(0);
|
||||
assertEquals(tuple0.getString("cluster"), "cluster1");
|
||||
Tuple tuple1 = tuples.get(1);
|
||||
assertEquals(tuple1.getString("cluster"), "cluster1");
|
||||
|
||||
Tuple tuple2 = tuples.get(2);
|
||||
assertEquals(tuple2.getString("cluster"), "cluster1");
|
||||
Tuple tuple3 = tuples.get(3);
|
||||
assertEquals(tuple3.getString("cluster"), "cluster1");
|
||||
|
||||
Tuple tuple4 = tuples.get(4);
|
||||
assertEquals(tuple4.getString("cluster"), "cluster1");
|
||||
Tuple tuple5 = tuples.get(5);
|
||||
assertEquals(tuple5.getString("cluster"), "cluster1");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDbscanNoClusters() throws Exception {
|
||||
String cexpr = "let(echo=true," +
|
||||
" a=array(5,4,5,1,1,1)," +
|
||||
" b=array(5,5,5,1,2,1)," +
|
||||
" f=dbscan(transpose(matrix(a,b)), 5000, 2, haversineMeters())," +
|
||||
" zplot(clusters=f))";
|
||||
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
|
||||
paramsLoc.set("expr", cexpr);
|
||||
paramsLoc.set("qt", "/stream");
|
||||
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+COLLECTIONORALIAS;
|
||||
TupleStream solrStream = new SolrStream(url, paramsLoc);
|
||||
StreamContext context = new StreamContext();
|
||||
solrStream.setStreamContext(context);
|
||||
List<Tuple> tuples = getTuples(solrStream);
|
||||
assertTrue(tuples.size() == 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMultiKmeans() throws Exception {
|
||||
String cexpr = "let(echo=true," +
|
||||
|
|
Loading…
Reference in New Issue