mirror of https://github.com/apache/lucene.git
SOLR-11890: Add multiKmeans Stream Evaluator
This commit is contained in:
parent
50a6cb1b05
commit
a664b63d42
|
@ -312,6 +312,7 @@ public class StreamHandler extends RequestHandlerBase implements SolrCoreAware,
|
|||
.withFunctionName("rowCount", RowCountEvaluator.class)
|
||||
.withFunctionName("fuzzyKmeans", FuzzyKmeansEvaluator.class)
|
||||
.withFunctionName("getMembershipMatrix", GetMembershipMatrixEvaluator.class)
|
||||
.withFunctionName("multiKmeans", MultiKmeansEvaluator.class)
|
||||
|
||||
// Boolean Stream Evaluators
|
||||
|
||||
|
|
|
@ -0,0 +1,108 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.client.solrj.io.eval;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
|
||||
import org.apache.commons.math3.ml.clustering.KMeansPlusPlusClusterer;
|
||||
import org.apache.commons.math3.ml.clustering.MultiKMeansPlusPlusClusterer;
|
||||
import org.apache.solr.client.solrj.io.stream.expr.StreamExpression;
|
||||
import org.apache.solr.client.solrj.io.stream.expr.StreamExpressionNamedParameter;
|
||||
import org.apache.solr.client.solrj.io.stream.expr.StreamFactory;
|
||||
|
||||
public class MultiKmeansEvaluator extends RecursiveObjectEvaluator implements ManyValueWorker {
|
||||
protected static final long serialVersionUID = 1L;
|
||||
|
||||
private int maxIterations = 1000;
|
||||
|
||||
public MultiKmeansEvaluator(StreamExpression expression, StreamFactory factory) throws IOException{
|
||||
super(expression, factory);
|
||||
|
||||
List<StreamExpressionNamedParameter> namedParams = factory.getNamedOperands(expression);
|
||||
|
||||
for(StreamExpressionNamedParameter namedParam : namedParams){
|
||||
if(namedParam.getName().equals("maxIterations")) {
|
||||
this.maxIterations = Integer.parseInt(namedParam.getParameter().toString().trim());
|
||||
} else {
|
||||
throw new IOException("Unexpected named parameter:"+namedParam.getName());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object doWork(Object... values) throws IOException {
|
||||
|
||||
if(values.length != 3) {
|
||||
throw new IOException("The multiKmeans function expects three parameters; a matrix to cluster, k and number of trials.");
|
||||
}
|
||||
|
||||
Object value1 = values[0];
|
||||
Object value2 = values[1];
|
||||
Object value3 = values[2];
|
||||
|
||||
Matrix matrix = null;
|
||||
int k = 0;
|
||||
int trials=0;
|
||||
|
||||
if(value1 instanceof Matrix) {
|
||||
matrix = (Matrix)value1;
|
||||
} else {
|
||||
throw new IOException("The first parameter for multiKmeans should be the observation matrix.");
|
||||
}
|
||||
|
||||
if(value2 instanceof Number) {
|
||||
k = ((Number)value2).intValue();
|
||||
} else {
|
||||
throw new IOException("The second parameter for multiKmeans should be k.");
|
||||
}
|
||||
|
||||
if(value3 instanceof Number) {
|
||||
trials= ((Number)value3).intValue();
|
||||
} else {
|
||||
throw new IOException("The third parameter for multiKmeans should be trials.");
|
||||
}
|
||||
|
||||
KMeansPlusPlusClusterer<KmeansEvaluator.ClusterPoint> kmeans = new KMeansPlusPlusClusterer(k, maxIterations);
|
||||
MultiKMeansPlusPlusClusterer multiKmeans = new MultiKMeansPlusPlusClusterer(kmeans, trials);
|
||||
|
||||
List<KmeansEvaluator.ClusterPoint> points = new ArrayList();
|
||||
double[][] data = matrix.getData();
|
||||
|
||||
List<String> ids = matrix.getRowLabels();
|
||||
|
||||
for(int i=0; i<data.length; i++) {
|
||||
double[] vec = data[i];
|
||||
points.add(new KmeansEvaluator.ClusterPoint(ids.get(i), vec));
|
||||
}
|
||||
|
||||
Map fields = new HashMap();
|
||||
|
||||
fields.put("k", k);
|
||||
fields.put("trials", trials);
|
||||
fields.put("distance", "euclidean");
|
||||
fields.put("maxIterations", maxIterations);
|
||||
|
||||
return new KmeansEvaluator.ClusterTuple(fields, multiKmeans.cluster(points), matrix.getColumnLabels());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -7076,6 +7076,87 @@ public class StreamExpressionTest extends SolrCloudTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMultiKmeans() throws Exception {
|
||||
String cexpr = "let(echo=true," +
|
||||
" a=array(1,1,1,0,0,0)," +
|
||||
" b=array(1,1,1,0,0,0)," +
|
||||
" c=array(0,0,0,1,1,1)," +
|
||||
" d=array(0,0,0,1,1,1)," +
|
||||
" e=setRowLabels(matrix(a,b,c,d), " +
|
||||
" array(doc1, doc2, doc3, doc4))," +
|
||||
" f=multiKmeans(e, 2, 5)," +
|
||||
" g=getCluster(f, 0)," +
|
||||
" h=getCluster(f, 1)," +
|
||||
" i=getCentroids(f)," +
|
||||
" j=getRowLabels(g)," +
|
||||
" k=getRowLabels(h))";
|
||||
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
|
||||
paramsLoc.set("expr", cexpr);
|
||||
paramsLoc.set("qt", "/stream");
|
||||
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+COLLECTIONORALIAS;
|
||||
TupleStream solrStream = new SolrStream(url, paramsLoc);
|
||||
StreamContext context = new StreamContext();
|
||||
solrStream.setStreamContext(context);
|
||||
List<Tuple> tuples = getTuples(solrStream);
|
||||
assertTrue(tuples.size() == 1);
|
||||
List<List<Number>> cluster1 = (List<List<Number>>)tuples.get(0).get("g");
|
||||
List<List<Number>> cluster2 = (List<List<Number>>)tuples.get(0).get("h");
|
||||
List<List<Number>> centroids = (List<List<Number>>)tuples.get(0).get("i");
|
||||
List<String> labels1 = (List<String>)tuples.get(0).get("j");
|
||||
List<String> labels2 = (List<String>)tuples.get(0).get("k");
|
||||
|
||||
assertEquals(cluster1.size(), 2);
|
||||
assertEquals(cluster2.size(), 2);
|
||||
assertEquals(centroids.size(), 2);
|
||||
|
||||
//Assert that the docs are not in both clusters
|
||||
assertTrue(!(labels1.contains("doc1") && labels2.contains("doc1")));
|
||||
assertTrue(!(labels1.contains("doc2") && labels2.contains("doc2")));
|
||||
assertTrue(!(labels1.contains("doc3") && labels2.contains("doc3")));
|
||||
assertTrue(!(labels1.contains("doc4") && labels2.contains("doc4")));
|
||||
|
||||
//Assert that (doc1 and doc2) or (doc3 and doc4) are in labels1
|
||||
assertTrue((labels1.contains("doc1") && labels1.contains("doc2")) ||
|
||||
((labels1.contains("doc3") && labels1.contains("doc4"))));
|
||||
|
||||
//Assert that (doc1 and doc2) or (doc3 and doc4) are in labels2
|
||||
assertTrue((labels2.contains("doc1") && labels2.contains("doc2")) ||
|
||||
((labels2.contains("doc3") && labels2.contains("doc4"))));
|
||||
|
||||
if(labels1.contains("doc1")) {
|
||||
assertEquals(centroids.get(0).get(0).doubleValue(), 1.0, 0.0);
|
||||
assertEquals(centroids.get(0).get(1).doubleValue(), 1.0, 0.0);
|
||||
assertEquals(centroids.get(0).get(2).doubleValue(), 1.0, 0.0);
|
||||
assertEquals(centroids.get(0).get(3).doubleValue(), 0.0, 0.0);
|
||||
assertEquals(centroids.get(0).get(4).doubleValue(), 0.0, 0.0);
|
||||
assertEquals(centroids.get(0).get(5).doubleValue(), 0.0, 0.0);
|
||||
|
||||
assertEquals(centroids.get(1).get(0).doubleValue(), 0.0, 0.0);
|
||||
assertEquals(centroids.get(1).get(1).doubleValue(), 0.0, 0.0);
|
||||
assertEquals(centroids.get(1).get(2).doubleValue(), 0.0, 0.0);
|
||||
assertEquals(centroids.get(1).get(3).doubleValue(), 1.0, 0.0);
|
||||
assertEquals(centroids.get(1).get(4).doubleValue(), 1.0, 0.0);
|
||||
assertEquals(centroids.get(1).get(5).doubleValue(), 1.0, 0.0);
|
||||
} else {
|
||||
assertEquals(centroids.get(0).get(0).doubleValue(), 0.0, 0.0);
|
||||
assertEquals(centroids.get(0).get(1).doubleValue(), 0.0, 0.0);
|
||||
assertEquals(centroids.get(0).get(2).doubleValue(), 0.0, 0.0);
|
||||
assertEquals(centroids.get(0).get(3).doubleValue(), 1.0, 0.0);
|
||||
assertEquals(centroids.get(0).get(4).doubleValue(), 1.0, 0.0);
|
||||
assertEquals(centroids.get(0).get(5).doubleValue(), 1.0, 0.0);
|
||||
|
||||
assertEquals(centroids.get(1).get(0).doubleValue(), 1.0, 0.0);
|
||||
assertEquals(centroids.get(1).get(1).doubleValue(), 1.0, 0.0);
|
||||
assertEquals(centroids.get(1).get(2).doubleValue(), 1.0, 0.0);
|
||||
assertEquals(centroids.get(1).get(3).doubleValue(), 0.0, 0.0);
|
||||
assertEquals(centroids.get(1).get(4).doubleValue(), 0.0, 0.0);
|
||||
assertEquals(centroids.get(1).get(5).doubleValue(), 0.0, 0.0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@Test
|
||||
public void testFuzzyKmeans() throws Exception {
|
||||
|
|
Loading…
Reference in New Issue