mirror of https://github.com/apache/lucene.git
SOLR-12660: Add outliers Stream Evaluator to support outlier detection
This commit is contained in:
parent
8d3f59a47f
commit
9d57963f41
|
@ -248,6 +248,7 @@ public class Lang {
|
|||
.withFunctionName("setValue", SetValueEvaluator.class)
|
||||
.withFunctionName("knnRegress", KnnRegressionEvaluator.class)
|
||||
.withFunctionName("gaussfit", GaussFitEvaluator.class)
|
||||
.withFunctionName("outliers", OutliersEvaluator.class)
|
||||
|
||||
// Boolean Stream Evaluators
|
||||
|
||||
|
|
|
@ -0,0 +1,142 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.client.solrj.io.eval;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.math3.distribution.IntegerDistribution;
|
||||
import org.apache.commons.math3.distribution.AbstractRealDistribution;
|
||||
import org.apache.solr.client.solrj.io.stream.expr.StreamExpression;
|
||||
import org.apache.solr.client.solrj.io.stream.expr.StreamFactory;
|
||||
import org.apache.solr.client.solrj.io.Tuple;
|
||||
|
||||
public class OutliersEvaluator extends RecursiveObjectEvaluator implements ManyValueWorker {
|
||||
protected static final long serialVersionUID = 1L;
|
||||
|
||||
public OutliersEvaluator(StreamExpression expression, StreamFactory factory) throws IOException{
|
||||
super(expression, factory);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object doWork(Object... values) throws IOException{
|
||||
|
||||
if(values.length < 4) {
|
||||
throw new IOException("The outliers function requires 4 parameters");
|
||||
}
|
||||
|
||||
Object dist = values[0];
|
||||
List<Number> vec = null;
|
||||
if(values[1] instanceof List) {
|
||||
vec = (List<Number>)values[1];
|
||||
} else {
|
||||
throw new IOException("The second parameter of the outliers function is the numeric array to be tested for outliers.");
|
||||
}
|
||||
|
||||
double low = 0.0;
|
||||
|
||||
if(values[2] instanceof Number) {
|
||||
low = ((Number)values[2]).doubleValue();
|
||||
} else {
|
||||
throw new IOException("The third parameter of the outliers function is a number for the low outlier threshold.");
|
||||
}
|
||||
|
||||
double hi = 0.0;
|
||||
|
||||
if(values[3] instanceof Number) {
|
||||
hi = ((Number)values[3]).doubleValue();
|
||||
} else {
|
||||
throw new IOException("The fourth parameter of the outliers function is a number for the high outlier threshold");
|
||||
}
|
||||
|
||||
List<Tuple> tuples = null;
|
||||
|
||||
if(values.length ==5) {
|
||||
if(values[4] instanceof List) {
|
||||
tuples = (List<Tuple>) values[4];
|
||||
} else {
|
||||
throw new IOException("The optional fifth parameter of the outliers function is an array of Tuples that are paired with the numeric array of values to be tested.");
|
||||
}
|
||||
} else {
|
||||
tuples = new ArrayList();
|
||||
for(int i=0; i<vec.size(); i++) {
|
||||
tuples.add(new Tuple(new HashMap()));
|
||||
}
|
||||
}
|
||||
|
||||
List<Tuple> outliers = new ArrayList();
|
||||
|
||||
if(dist instanceof IntegerDistribution) {
|
||||
|
||||
IntegerDistribution d = (IntegerDistribution) dist;
|
||||
|
||||
for(int i=0; i<vec.size(); i++) {
|
||||
|
||||
Number n = vec.get(i);
|
||||
Tuple t = tuples.get(i);
|
||||
|
||||
double cumProb = d.cumulativeProbability(n.intValue());
|
||||
if(low >= 0 && cumProb <= low) {
|
||||
t.put("lowOutlier", true);
|
||||
t.put("lowOutlierValue", n);
|
||||
t.put("cumulativeProbablity", cumProb);
|
||||
outliers.add(t);
|
||||
}
|
||||
|
||||
if(hi >= 0 && cumProb >= hi) {
|
||||
t.put("highOutlier", true);
|
||||
t.put("highOutlierValue", n);
|
||||
t.put("cumulativeProbablity", cumProb);
|
||||
outliers.add(t);
|
||||
}
|
||||
}
|
||||
|
||||
return outliers;
|
||||
|
||||
} else if(dist instanceof AbstractRealDistribution) {
|
||||
|
||||
AbstractRealDistribution d = (AbstractRealDistribution)dist;
|
||||
for(int i=0; i<vec.size(); i++) {
|
||||
|
||||
Number n = vec.get(i);
|
||||
Tuple t = tuples.get(i);
|
||||
|
||||
double cumProb = d.cumulativeProbability(n.doubleValue());
|
||||
if(low >= 0 && cumProb <= low) {
|
||||
t.put("lowOutlier", true);
|
||||
t.put("lowOutlierValue", n);
|
||||
t.put("cumulativeProbablity", cumProb);
|
||||
outliers.add(t);
|
||||
|
||||
}
|
||||
|
||||
if(hi >= 0 && cumProb >= hi) {
|
||||
t.put("highOutlier", true);
|
||||
t.put("highOutlierValue", n);
|
||||
t.put("cumulativeProbablity", cumProb);
|
||||
outliers.add(t);
|
||||
}
|
||||
}
|
||||
|
||||
return outliers;
|
||||
} else {
|
||||
throw new IOException("The first parameter of the outliers function must be a real or integer probability distribution");
|
||||
}
|
||||
}
|
||||
}
|
|
@ -3272,6 +3272,59 @@ public class MathExpressionTest extends SolrCloudTestCase {
|
|||
assertEquals(out1.get(7).doubleValue(), 61.5, 0.0001);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testOutliers() throws Exception {
|
||||
String cexpr = "let(echo=true," +
|
||||
" a=list(tuple(id=0.0), tuple(id=1), tuple(id=2), tuple(id=3)), " +
|
||||
" b=normalDistribution(100, 5)," +
|
||||
" d=array(100, 110, 90, 99), " +
|
||||
" e=outliers(b, d, .05, .95, a)," +
|
||||
" f=outliers(b, d, .05, .95))";
|
||||
|
||||
ModifiableSolrParams paramsLoc = new ModifiableSolrParams();
|
||||
paramsLoc.set("expr", cexpr);
|
||||
paramsLoc.set("qt", "/stream");
|
||||
String url = cluster.getJettySolrRunners().get(0).getBaseUrl().toString()+"/"+COLLECTIONORALIAS;
|
||||
TupleStream solrStream = new SolrStream(url, paramsLoc);
|
||||
StreamContext context = new StreamContext();
|
||||
solrStream.setStreamContext(context);
|
||||
List<Tuple> tuples = getTuples(solrStream);
|
||||
assertTrue(tuples.size() == 1);
|
||||
List<Map> out = (List<Map>)tuples.get(0).get("e");
|
||||
assertEquals(out.size(), 2);
|
||||
Map high = out.get(0);
|
||||
assertEquals(((String)high.get("id")), "1");
|
||||
|
||||
assertEquals(((Number)high.get("cumulativeProbablity")).doubleValue(), 0.9772498680518208, 0.0 );
|
||||
assertEquals(((Number)high.get("highOutlierValue")).doubleValue(), 110.0, 0.0);
|
||||
assertEquals(((Boolean)high.get("highOutlier")).booleanValue(), true);
|
||||
|
||||
|
||||
Map low = out.get(1);
|
||||
assertEquals(((String)low.get("id")), "2");
|
||||
assertEquals(((Number)low.get("cumulativeProbablity")).doubleValue(), 0.022750131948179167, 0.0 );
|
||||
assertEquals(((Number)low.get("lowOutlierValue")).doubleValue(), 90, 0.0);
|
||||
assertEquals(((Boolean)low.get("lowOutlier")).booleanValue(), true);
|
||||
|
||||
|
||||
List<Map> out1 = (List<Map>)tuples.get(0).get("f");
|
||||
assertEquals(out1.size(), 2);
|
||||
Map high1 = out1.get(0);
|
||||
assert(high1.get("id") == null);
|
||||
assertEquals(((Number)high1.get("cumulativeProbablity")).doubleValue(), 0.9772498680518208, 0.0 );
|
||||
assertEquals(((Number)high1.get("highOutlierValue")).doubleValue(), 110.0, 0.0);
|
||||
assertEquals(((Boolean)high1.get("highOutlier")).booleanValue(), true);
|
||||
|
||||
|
||||
Map low1 = out1.get(1);
|
||||
assert(low1.get("id") == null);
|
||||
assertEquals(((Number)low1.get("cumulativeProbablity")).doubleValue(), 0.022750131948179167, 0.0 );
|
||||
assertEquals(((Number)low1.get("lowOutlierValue")).doubleValue(), 90, 0.0);
|
||||
assertEquals(((Boolean)low1.get("lowOutlier")).booleanValue(), true);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLerp() throws Exception {
|
||||
String cexpr = "let(echo=true," +
|
||||
|
|
Loading…
Reference in New Issue