SOLR-10786: Add DBSCAN clustering Streaming Evaluator

2019-11-02 14:26:49 -04:00 · 2019-11-02 14:26:49 -04:00 · c908ea2e5a
parent 5c6a299eff
commit c908ea2e5a
5 changed files with 261 additions and 14 deletions
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/io/Lang.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/Lang.java
@ -302,7 +302,7 @@ public class Lang {
        .withFunctionName("trim", TrimEvaluator.class)
        .withFunctionName("cosine", CosineDistanceEvaluator.class)
        .withFunctionName("trunc", TruncEvaluator.class)
-
+        .withFunctionName("dbscan", DbscanEvaluator.class)
        // Boolean Stream Evaluators

        .withFunctionName("and", AndEvaluator.class)
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/DbscanEvaluator.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/DbscanEvaluator.java
@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.client.solrj.io.eval;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.HashMap;
+
+import org.apache.commons.math3.ml.clustering.Cluster;
+import org.apache.commons.math3.ml.distance.DistanceMeasure;
+import org.apache.commons.math3.ml.distance.EuclideanDistance;
+import org.apache.solr.client.solrj.io.Tuple;
+import org.apache.commons.math3.ml.clustering.Clusterable;
+import org.apache.commons.math3.ml.clustering.DBSCANClusterer;
+import org.apache.solr.client.solrj.io.stream.expr.StreamExpression;
+import org.apache.solr.client.solrj.io.stream.expr.StreamFactory;
+
+/**
+ * Evaluator for dbscan(matrix, e, minPoints[, distance]): clusters matrix rows with DBSCANClusterer.
+ */
+public class DbscanEvaluator extends RecursiveObjectEvaluator implements ManyValueWorker {
+  protected static final long serialVersionUID = 1L;
+
+  public DbscanEvaluator(StreamExpression expression, StreamFactory factory) throws IOException{
+    super(expression, factory);
+  }
+
+  /**
+   * Validates the parameters, turns every matrix row into a {@link ClusterPoint} and runs DBSCAN.
+   * @param values matrix, e, minPoints and, optionally, a DistanceMeasure (Euclidean if omitted)
+   * @return a {@link ClusterTuple} with the clusters and the e, minPoints and distance settings
+   * @throws IOException if the number of parameters or the type of any parameter is wrong
+   */
+  @Override
+  public Object doWork(Object values[]) throws IOException {
+    if(values.length < 3 || values.length > 4) {
+      throw new IOException("The dbscan function requires 3 or 4 parameters.");
+    }
+
+    if(!(values[0] instanceof Matrix)) {
+      throw new IOException("The first parameter for dbscan should be the observation matrix.");
+    }
+    Matrix matrix = (Matrix)values[0];
+
+    if(!(values[1] instanceof Number)) {
+      throw new IOException("The second parameter for dbscan should be e.");
+    }
+    double e = ((Number)values[1]).doubleValue();
+
+    if(!(values[2] instanceof Number)) {
+      throw new IOException("The third parameter for dbscan should be minPoints.");
+    }
+    int minPoints = ((Number)values[2]).intValue();
+
+    // Optional distance measure: reject other types here rather than with a ClassCastException.
+    DistanceMeasure distanceMeasure = new EuclideanDistance();
+    if(values.length > 3) {
+      if(!(values[3] instanceof DistanceMeasure)) {
+        throw new IOException("The fourth parameter for dbscan should be a distance measure.");
+      }
+      distanceMeasure = (DistanceMeasure)values[3];
+    }
+
+    DBSCANClusterer<ClusterPoint> dbscan = new DBSCANClusterer<>(e, minPoints, distanceMeasure);
+    double[][] data = matrix.getData();
+    List<String> ids = matrix.getRowLabels();
+    List<ClusterPoint> points = new ArrayList<>(data.length);
+    for(int i=0; i<data.length; i++) {
+      // Fall back to the row index when the matrix carries no row labels.
+      String id = ids != null ? ids.get(i) : Integer.toString(i);
+      points.add(new ClusterPoint(id, data[i]));
+    }
+
+    Map<String, Object> fields = new HashMap<>();
+    fields.put("e", e);
+    fields.put("minPoints", minPoints);
+    fields.put("distance", distanceMeasure.toString());
+
+    return new ClusterTuple(fields, dbscan.cluster(points), matrix.getColumnLabels());
+  }
+
+  /** A matrix row paired with its id (the row label, or the row index when there is none). */
+  public static class ClusterPoint implements Clusterable {
+    private final double[] point;
+    private final String id;
+
+    public ClusterPoint(String id, double[] point) {
+      this.id = id;
+      this.point = point;
+    }
+
+    @Override
+    public double[] getPoint() {
+      return this.point;
+    }
+
+    public String getId() {
+      return this.id;
+    }
+  }
+
+  /** Tuple returned by dbscan: the settings as fields, plus the clusters and column labels. */
+  public static class ClusterTuple extends Tuple {
+    private final List<String> columnLabels;
+    private final List<Cluster<ClusterPoint>> clusters;
+
+    public ClusterTuple(Map fields,
+                        List<Cluster<ClusterPoint>> clusters,
+                        List<String> columnLabels) {
+      super(fields);
+      this.clusters = clusters;
+      this.columnLabels = columnLabels;
+    }
+
+    public List<String> getColumnLabels() {
+      return this.columnLabels;
+    }
+
+    public List<Cluster<ClusterPoint>> getClusters() {
+      return this.clusters;
+    }
+  }
+}
+
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/ZplotStream.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/stream/ZplotStream.java
@ -28,12 +28,14 @@ import java.util.Set;
 import org.apache.commons.math3.distribution.IntegerDistribution;
 import org.apache.commons.math3.distribution.RealDistribution;
 import org.apache.commons.math3.ml.clustering.CentroidCluster;
+import org.apache.commons.math3.ml.clustering.Cluster;
 import org.apache.commons.math3.random.EmpiricalDistribution;
 import org.apache.commons.math3.stat.Frequency;
 import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
 import org.apache.commons.math3.util.Precision;
 import org.apache.solr.client.solrj.io.Tuple;
 import org.apache.solr.client.solrj.io.comp.StreamComparator;
+import org.apache.solr.client.solrj.io.eval.DbscanEvaluator;
 import org.apache.solr.client.solrj.io.eval.KmeansEvaluator;
 import org.apache.solr.client.solrj.io.eval.StreamEvaluator;
 import org.apache.solr.client.solrj.io.eval.Matrix;
@ -214,18 +216,35 @@ public class ZplotStream extends TupleStream implements Expressible {
      }
    } else if(clusters) {
      Object o = evaluated.get("clusters");
-      KmeansEvaluator.ClusterTuple ct = (KmeansEvaluator.ClusterTuple)o;
-      List<CentroidCluster<KmeansEvaluator.ClusterPoint>> cs = ct.getClusters();
-      int clusterNum = 0;
-      for(CentroidCluster<KmeansEvaluator.ClusterPoint> c : cs) {
-        clusterNum++;
-        List<KmeansEvaluator.ClusterPoint> points = c.getPoints();
-        for(KmeansEvaluator.ClusterPoint p : points) {
-          Tuple tuple = new Tuple(new HashMap());
-          tuple.put("x", p.getPoint()[0]);
-          tuple.put("y", p.getPoint()[1]);
-          tuple.put("cluster", "cluster"+clusterNum);
-          outTuples.add(tuple);
+      if(o instanceof  KmeansEvaluator.ClusterTuple) {
+        KmeansEvaluator.ClusterTuple ct = (KmeansEvaluator.ClusterTuple) o;
+        List<CentroidCluster<KmeansEvaluator.ClusterPoint>> cs = ct.getClusters();
+        int clusterNum = 0;
+        for (CentroidCluster<KmeansEvaluator.ClusterPoint> c : cs) {
+          clusterNum++;
+          List<KmeansEvaluator.ClusterPoint> points = c.getPoints();
+          for (KmeansEvaluator.ClusterPoint p : points) {
+            Tuple tuple = new Tuple(new HashMap());
+            tuple.put("x", p.getPoint()[0]);
+            tuple.put("y", p.getPoint()[1]);
+            tuple.put("cluster", "cluster" + clusterNum);
+            outTuples.add(tuple);
+          }
+        }
+      } else if(o instanceof DbscanEvaluator.ClusterTuple) {
+        DbscanEvaluator.ClusterTuple ct = (DbscanEvaluator.ClusterTuple) o;
+        List<Cluster<DbscanEvaluator.ClusterPoint>> cs = ct.getClusters();
+        int clusterNum = 0;
+        for (Cluster<DbscanEvaluator.ClusterPoint> c : cs) {
+          clusterNum++;
+          List<DbscanEvaluator.ClusterPoint> points = c.getPoints();
+          for (DbscanEvaluator.ClusterPoint p : points) {
+            Tuple tuple = new Tuple(new HashMap());
+            tuple.put("x", p.getPoint()[0]);
+            tuple.put("y", p.getPoint()[1]);
+            tuple.put("cluster", "cluster" + clusterNum);
+            outTuples.add(tuple);
+          }
        }
      }
    } else if(distribution) {
--- a/solr/solrj/src/test/org/apache/solr/client/solrj/io/TestLang.java
+++ b/solr/solrj/src/test/org/apache/solr/client/solrj/io/TestLang.java
@ -77,7 +77,7 @@ public class TestLang extends SolrTestCase {
      "getSupportPoints", "pairSort", "log10", "plist", "recip", "pivot", "ltrim", "rtrim", "export",
      "zplot", "natural", "repeat", "movingMAD", "hashRollup", "noop", "var", "stddev", "recNum", "isNull",
      "notNull", "matches", "projectToBorder", "double", "long", "parseCSV", "parseTSV", "dateTime",
-       "split", "upper", "trim", "lower", "trunc", "cosine"};
+       "split", "upper", "trim", "lower", "trunc", "cosine", "dbscan"};

  @Test
  public void testLang() {
--- a/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/MathExpressionTest.java
+++ b/solr/solrj/src/test/org/apache/solr/client/solrj/io/stream/MathExpressionTest.java
@ -2976,6 +2976,9 @@ public class MathExpressionTest extends SolrCloudTestCase {
    assertEquals(row.get(2).doubleValue(), 18.0, 0.0);
  }

+
+
+
  @Test
  public void testKmeans() throws Exception {
    String cexpr = "let(echo=true," +
@ -3055,6 +3058,90 @@ public class MathExpressionTest extends SolrCloudTestCase {
    }
  }

+
+
+  /**
+   * Clusters six points with dbscan (e=2, minPoints=2, no explicit distance measure) and checks,
+   * through zplot, that the first three tuples are labelled cluster1 and the last three cluster2.
+   */
+  @Test
+  public void testDbscanBasic() throws Exception {
+    String cexpr = "let(echo=true," +
+        "               a=array(5,4,5,1,1,1)," +
+        "               b=array(5,5,5,1,2,1)," +
+        "               f=dbscan(transpose(matrix(a,b)), 2, 2)," +
+        "               zplot(clusters=f))";
+
+    ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
+    paramsLoc.set("expr", cexpr);
+    paramsLoc.set("qt", "/stream");
+    String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+COLLECTIONORALIAS;
+    TupleStream solrStream = new SolrStream(url, paramsLoc);
+    StreamContext context = new StreamContext();
+    solrStream.setStreamContext(context);
+    List<Tuple> tuples = getTuples(solrStream);
+
+    // assertEquals(expected, actual) reports both values on failure, unlike assertTrue(a == b).
+    assertEquals(6, tuples.size());
+
+    String[] expectedClusters = {
+        "cluster1", "cluster1", "cluster1",
+        "cluster2", "cluster2", "cluster2"};
+    for (int i = 0; i < expectedClusters.length; i++) {
+      assertEquals("tuple " + i, expectedClusters[i], tuples.get(i).getString("cluster"));
+    }
+  }
+
+  /**
+   * Runs dbscan with e=500000, minPoints=2 and an explicit haversineMeters() distance measure
+   * on six points.
+   *
+   * <p>Checks, through zplot, that all six points are reported in a single cluster.
+   */
+  @Test
+  public void testDbscanDistance() throws Exception {
+    String cexpr = "let(echo=true," +
+        "               a=array(5,4,5,1,1,1)," +
+        "               b=array(5,5,5,1,2,1)," +
+        "               f=dbscan(transpose(matrix(a,b)), 500000, 2, haversineMeters())," +
+        "               zplot(clusters=f))";
+
+    ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
+    paramsLoc.set("expr", cexpr);
+    paramsLoc.set("qt", "/stream");
+    String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+COLLECTIONORALIAS;
+    TupleStream solrStream = new SolrStream(url, paramsLoc);
+    StreamContext context = new StreamContext();
+    solrStream.setStreamContext(context);
+    List<Tuple> tuples = getTuples(solrStream);
+
+    // assertEquals(expected, actual) reports both values on failure, unlike assertTrue(a == b).
+    assertEquals(6, tuples.size());
+
+    // Every tuple zplot emits must carry the same cluster label.
+    for (int i = 0; i < tuples.size(); i++) {
+      assertEquals("tuple " + i, "cluster1", tuples.get(i).getString("cluster"));
+    }
+  }
+
+  /** Expects no clusters (and so no zplot tuples) for e=5000, minPoints=2 with haversineMeters. */
+  @Test
+  public void testDbscanNoClusters() throws Exception {
+    String cexpr = "let(echo=true," +
+        "               a=array(5,4,5,1,1,1)," +
+        "               b=array(5,5,5,1,2,1)," +
+        "               f=dbscan(transpose(matrix(a,b)), 5000, 2, haversineMeters())," +
+        "               zplot(clusters=f))";
+    ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
+    paramsLoc.set("expr", cexpr);
+    paramsLoc.set("qt", "/stream");
+    String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+COLLECTIONORALIAS;
+    TupleStream solrStream = new SolrStream(url, paramsLoc);
+    solrStream.setStreamContext(new StreamContext());
+    List<Tuple> tuples = getTuples(solrStream);
+    assertEquals(0, tuples.size());
+  }
+
  @Test
  public void testMultiKmeans() throws Exception {
    String cexpr = "let(echo=true," +