SOLR-11947: Squashed commit of the following ref guide changes:

commit 61053f2fe373bff0b451f549e063550f08ecdac1
Author: Joel Bernstein <jbernste@apache.org>
Date:   Mon Mar 26 12:44:12 2018 -0400

    SOLR-11947: Fix orphaned files

commit 42302073bf61fde134caeff71b6db3978e113b4d
Author: Joel Bernstein <jbernste@apache.org>
Date:   Mon Mar 26 12:27:26 2018 -0400

    SOLR-11947: small change

commit b16b1453c2e7d5083f588b4b874c918d521e9fe5
Author: Joel Bernstein <jbernste@apache.org>
Date:   Mon Mar 26 12:23:17 2018 -0400

    SOLR-11947: proofing

commit 57265ce4659a427c179e206b79d8fe05b01a5f93
Author: Joel Bernstein <jbernste@apache.org>
Date:   Sat Mar 24 14:41:48 2018 -0400

    SOLR-11947: monte carlo WIP

commit 04e8381f6b5b329c5fa17c1f31c2d848fe9cec2a
Author: Joel Bernstein <jbernste@apache.org>
Date:   Fri Mar 23 16:24:10 2018 -0400

    SOLR-11947: probabiity WIP

commit 4298a6d514e7e431e322a4f62c22c336430a89f1
Author: Joel Bernstein <jbernste@apache.org>
Date:   Fri Mar 23 13:07:05 2018 -0400

    SOLR-11947: time series WIP

commit 1a7654f9225948cd4adb3056bc2192cc0d24b3ee
Author: Joel Bernstein <jbernste@apache.org>
Date:   Fri Mar 23 11:32:53 2018 -0400

    SOLR-11947: machine learning WIP

commit fae0c3aa46e6f26fecb59077207982b2f584ec86
Author: Joel Bernstein <jbernste@apache.org>
Date:   Thu Mar 22 22:14:15 2018 -0400

    SOLR-11947: machine learning WIP

commit fb6a96b2bdc4bbc4c2b5b62b6e69cd561ef9e31b
Author: Joel Bernstein <jbernste@apache.org>
Date:   Thu Mar 22 14:36:08 2018 -0400

    SOLR-11947: numerical analysis WIP

commit a648ba939c90caf5db2a5b88023bd580d4d1e8af
Author: Joel Bernstein <jbernste@apache.org>
Date:   Thu Mar 22 12:27:33 2018 -0400

    SOLR-11947: numerical analysis WIP

commit ce8f1b710d414d8e3ff3c8676f64fc3017316a15
Author: Joel Bernstein <jbernste@apache.org>
Date:   Wed Mar 21 19:56:10 2018 -0400

    SOLR-11947: numerical analysis WIP

commit 5e25a4884341cdd84988e13250f255eb23d7fd50
Author: Joel Bernstein <jbernste@apache.org>
Date:   Tue Mar 20 22:01:59 2018 -0400

    SOLR-11947: Curve fitting WIP

commit f381414dc44ecfa781988c5ca75bfb1c80de6674
Author: Joel Bernstein <jbernste@apache.org>
Date:   Tue Mar 20 21:49:39 2018 -0400

    SOLR-11947: Curve fitting WIP

commit 4be725132215ed44cc84587bb0d11be216360b74
Author: Joel Bernstein <jbernste@apache.org>
Date:   Mon Mar 19 19:55:10 2018 -0400

    SOLR-11947: Monte Carlo WIP

commit d330b412e46be0ebf8d75e99295e3fe9f978c02c
Author: Joel Bernstein <jbernste@apache.org>
Date:   Sun Mar 18 22:00:55 2018 -0400

    SOLR-11947: Probability WIP

commit e3d6160c1fa650e054b9694c57d34b3950c80175
Author: Joel Bernstein <jbernste@apache.org>
Date:   Sat Mar 17 21:18:43 2018 -0400

    SOLR-11947: More WIP

commit 8484b0283f79825dee8eaee82604120d04511de4
Author: Joel Bernstein <jbernste@apache.org>
Date:   Fri Mar 16 15:03:06 2018 -0400

    SOLR-11947: machine learning WIP

commit 77ecfdc71d79ca8eded0355669310c6025c70d96
Author: Joel Bernstein <jbernste@apache.org>
Date:   Thu Mar 15 21:33:09 2018 -0400

    SOLR-11947: machine learning WIP

commit 7488caf5e54436a0e5fe85c0dda4ea31d8357600
Author: Joel Bernstein <jbernste@apache.org>
Date:   Thu Mar 15 19:08:50 2018 -0400

    SOLR-11947: machine learning WIP

commit 102ee2e1857e7d7f45d7f3195a0a4e91eacb766d
Author: Joel Bernstein <jbernste@apache.org>
Date:   Thu Mar 15 15:18:31 2018 -0400

    SOLR-11947: machine learning WIP

commit 0d5cd2b4a4fd012fe6d640a86733280702cf8673
Author: Joel Bernstein <jbernste@apache.org>
Date:   Wed Mar 14 21:49:15 2018 -0400

    SOLR-11947: numerical analysis WIP

commit 31eec30576479a9023c7b0e6ccb2d9f685e128a1
Author: Joel Bernstein <jbernste@apache.org>
Date:   Wed Mar 14 14:41:06 2018 -0400

    SOLR-11947: numerical analysis WIP

commit c6e324ac56ca6e9f229d6acb39fdcf60c3356230
Author: Joel Bernstein <jbernste@apache.org>
Date:   Tue Mar 13 15:16:26 2018 -0400

    SOLR-11947: term vectors WIP

commit 8c843999eabdb82665641caa9c21f07e95b70a86
Author: Joel Bernstein <jbernste@apache.org>
Date:   Mon Mar 12 18:03:53 2018 -0400

    SOLR-11947: Add curve fitting to TOC

commit 09be026f6ad400d965fd373403d7a2eb2fae0c90
Author: Joel Bernstein <jbernste@apache.org>
Date:   Mon Mar 12 15:36:05 2018 -0400

    SOLR-11947: Text analysis WIP

commit e48b4d69abadb603a90c052aa1e36dd60ae7fd33
Author: Joel Bernstein <jbernste@apache.org>
Date:   Sun Mar 11 18:29:20 2018 -0400

    SOLR-11947: TOC changes

commit f71ebc079713e16492ba45cedafc3b9512f6bae2
Author: Joel Bernstein <jbernste@apache.org>
Date:   Sat Mar 10 17:54:04 2018 -0500

    SOLR-11947: WIP term vectors

commit ebc6b3943a27454adaf1a2309b6720bb2ba63c8c
Author: Joel Bernstein <jbernste@apache.org>
Date:   Sat Mar 10 13:34:19 2018 -0500

    SOLR-11947: WIP regression

commit 44752b2d34f46bc7f5693839e42ab3cef9edc47c
Author: Joel Bernstein <jbernste@apache.org>
Date:   Fri Mar 9 22:40:40 2018 -0500

    SOLR-11947: WIP for vectorization.adoc

commit 43254fcb05386264a6d591b1fa2c2573dcc2d2a3
Author: Joel Bernstein <jbernste@apache.org>
Date:   Fri Mar 9 19:42:26 2018 -0500

    SOLR-11947: Test local links

commit b60df2000978f70720eb0a36543752fd3bf07d2c
Author: Joel Bernstein <jbernste@apache.org>
Date:   Thu Mar 8 21:41:17 2018 -0500

    SOLR-11947: Update math-expressions TOC

commit de068c3af8557d60de37cb29f3ed7da3f5442772
Author: Joel Bernstein <jbernste@apache.org>
Date:   Thu Mar 8 21:24:46 2018 -0500

    SOLR-11947: Continued work on math expressions documentation.

commit fe445f2c997ea825d1ae9b9912406521249befc0
Author: Joel Bernstein <jbernste@apache.org>
Date:   Sun Mar 4 20:22:33 2018 -0500

    SOLR-12054: ebeAdd and ebeSubtract should support matrix operations

commit 1f3ae745cc26453a34a64a4327ceac7cc91d23f5
Author: Joel Bernstein <jbernste@apache.org>
Date:   Sun Mar 4 13:24:54 2018 -0500

    SOLR-11947: Initial commit for new math expression docs WIP
This commit is contained in:
Joel Bernstein 2018-03-26 12:48:33 -04:00
parent dc2ad7022c
commit 1ed4e226ac
17 changed files with 4982 additions and 6 deletions

View File

@ -0,0 +1,182 @@
= Curve Fitting
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
== Polynomial Curve Fitting
The `polyfit` function is a general purpose curve fitter used to model
the *non-linear* relationship between two random variables.
The `polyfit` function is passed *x* and *y* axes and fits a smooth curve to the data.
If only a single array is provided it is treated as the *y* axis and a sequence is generated
for the *x* axis.
The `polyfit` function also has a parameter that specifies the degree of the polynomial. The higher
the degree, the more complex the curves that can be modeled.
The example below uses the `polyfit` function to fit a curve to an array using
a 3rd degree polynomial. The fitted curve is then subtracted from the original curve. The output
shows the error between the fitted curve and the original curve, known as the residuals.
The output also includes the sum-of-squares of the residuals which provides a measure
of how large the error is.
[source,text]
----
let(echo="residuals, sumSqError",
y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 6, 5, 5, 3, 2, 1, 0),
curve=polyfit(y, 3),
residuals=ebeSubtract(y, curve),
sumSqError=sumSq(residuals))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"residuals": [
0.5886274509803899,
-0.0746078431372561,
-0.49492135315664765,
-0.6689571213100631,
-0.5933591898297781,
0.4352283990519288,
0.32016160310277897,
1.1647963800904968,
0.272488687782805,
-0.3534055160525744,
0.2904697263520779,
-0.7925296272355089,
-0.5990476190476182,
-0.12572829131652274,
0.6307843137254909
],
"sumSqError": 4.7294282482223595
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
In the next example the curve is fit using a 5th degree polynomial. Notice that the fit is
closer, shown by the smaller residuals and the lower value for the sum-of-squares of the
residuals. This is because the higher degree polynomial produces a closer fit.
[source,text]
----
let(echo="residuals, sumSqError",
y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 6, 5, 5, 3, 2, 1, 0),
curve=polyfit(y, 5),
residuals=ebeSubtract(y, curve),
sumSqError=sumSq(residuals))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"residuals": [
-0.12337461300309674,
0.22708978328173413,
0.12266015718028167,
-0.16502738747320755,
-0.41142804563857105,
0.2603044014808713,
-0.12128970101106162,
0.6234168308471704,
-0.1754692675745293,
-0.5379689969473249,
0.4651616185671843,
-0.288175756132409,
0.027970945463215102,
0.18699690402476687,
-0.09086687306501587
],
"sumSqError": 1.413089480179252
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
== Prediction, Derivatives and Integrals
The `polyfit` function returns an array which contains the *y* value data points
of the fitted curve.
In order to predict values along the curve an interpolation function must be created
for the curve. Once an interpolation function has been created the `predict`,
`derivative` and `integrate` functions can be applied to the curve.
In the example below the *x* axis is included for clarity.
The `polyfit` function returns an array with the fitted curve.
A linear interpolation function is then created for the curve with the `lerp` function.
The `predict` function is then used to predict a value along the curve. In this
case the prediction is made for the *x* value of .5.
[source,text]
----
let(x=array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14),
y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 6, 5, 5, 3, 2, 1, 0),
curve=polyfit(x, y, 5),
interp=lerp(x, curve),
p=predict(interp, .5))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"p": 0.4481424148606813
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
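Below is a sketch of applying the `derivative` and `integrate` functions to the same interpolation model, reusing the data from the example above. The calls follow the forms described in the link:numerical-analysis.adoc[Interpolation, Derivatives and Integrals] section; the variable names are illustrative only.
[source,text]
----
let(echo="d, i",
    x=array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14),
    y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 6, 5, 5, 3, 2, 1, 0),
    curve=polyfit(x, y, 5),
    interp=lerp(x, curve),
    d=derivative(interp),
    i=integrate(interp, 0, 14))
----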

View File

@ -0,0 +1,680 @@
= Machine Learning
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
This section of the math expressions user guide covers machine learning
functions.
== Feature Scaling
Before performing machine learning operations it's often necessary to
scale the feature vectors so they can be compared at the same scale.
All of the scaling functions operate on vectors and matrices.
When operating on a matrix the *rows* of the matrix are scaled.
=== Min/Max Scaling
The `minMaxScale` function scales a vector or matrix between a min and
max value. By default it will scale between 0 and 1 if min/max values
are not provided.
Below is a simple example of min/max scaling between 0 and 1.
Notice that once brought into the same scale the vectors are the same.
[source,text]
----
let(a=array(20, 30, 40, 50),
b=array(200, 300, 400, 500),
c=matrix(a, b),
d=minMaxScale(c))
----
This expression returns the following response:
[source,json]
----
{
"result-set": {
"docs": [
{
"d": [
[
0,
0.3333333333333333,
0.6666666666666666,
1
],
[
0,
0.3333333333333333,
0.6666666666666666,
1
]
]
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
=== Standardization
The `standardize` function scales a vector so that it has a
mean of 0 and a standard deviation of 1. Standardization can be
used with machine learning algorithms, such as SVM, that
perform better when the data has a normal distribution.
[source,text]
----
let(a=array(20, 30, 40, 50),
b=array(200, 300, 400, 500),
c=matrix(a, b),
d=standardize(c))
----
This expression returns the following response:
[source,json]
----
{
"result-set": {
"docs": [
{
"d": [
[
-1.161895003862225,
-0.3872983346207417,
0.3872983346207417,
1.161895003862225
],
[
-1.1618950038622249,
-0.38729833462074165,
0.38729833462074165,
1.1618950038622249
]
]
},
{
"EOF": true,
"RESPONSE_TIME": 17
}
]
}
}
----
=== Unitize
The `unitize` function scales vectors to a magnitude of 1. A vector with a
magnitude of 1 is known as a unit vector. Unit vectors are
preferred when the vector math deals
with vector direction rather than magnitude.
[source,text]
----
let(a=array(20, 30, 40, 50),
b=array(200, 300, 400, 500),
c=matrix(a, b),
d=unitize(c))
----
This expression returns the following response:
[source,json]
----
{
"result-set": {
"docs": [
{
"d": [
[
0.2721655269759087,
0.40824829046386296,
0.5443310539518174,
0.6804138174397716
],
[
0.2721655269759087,
0.4082482904638631,
0.5443310539518174,
0.6804138174397717
]
]
},
{
"EOF": true,
"RESPONSE_TIME": 6
}
]
}
}
----
== Distance
The `distance` function computes a distance measure for two
numeric arrays or a *distance matrix* for the columns of a matrix.
There are four distance measures currently supported:
* euclidean (default)
* manhattan
* canberra
* earthMovers
Below is an example of computing euclidean distance for
two numeric arrays:
[source,text]
----
let(a=array(20, 30, 40, 50),
b=array(21, 29, 41, 49),
c=distance(a, b))
----
This expression returns the following response:
[source,json]
----
{
"result-set": {
"docs": [
{
"c": 2
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
Below is an example of computing a distance matrix for the columns
of a matrix:
[source,text]
----
let(a=array(20, 30, 40),
b=array(21, 29, 41),
c=array(31, 40, 50),
d=matrix(a, b, c),
e=distance(d))
----
This expression returns the following response:
[source,json]
----
{
"result-set": {
"docs": [
{
"e": [
[
0,
15.652475842498529,
34.07345007480164
],
[
15.652475842498529,
0,
18.547236990991408
],
[
34.07345007480164,
18.547236990991408,
0
]
]
},
{
"EOF": true,
"RESPONSE_TIME": 24
}
]
}
}
----
== K-means Clustering
The `kmeans` function performs k-means clustering of the rows of a matrix.
Once the clustering has been completed there are a number of useful functions available
for examining the *clusters* and *centroids*.
The examples below are clustering *term vectors*.
The chapter on link:term-vectors.adoc[Text Analysis and Term Vectors] should be
consulted for a full explanation of these features.
=== Centroid Features
In the example below the `kmeans` function is used to cluster a result set from the Enron email data-set
and then the top features are extracted from the cluster centroids.
Let's look at what data is assigned to each variable:
* *a*: The `random` function returns a sample of 500 documents from the *enron*
collection that match the query *body:oil*. The `select` function selects the *id* and
annotates each tuple with the analyzed bigram terms from the body field.
* *b*: The `termVectors` function creates a TF-IDF term vector matrix from the
tuples stored in variable *a*. Each row in the matrix represents a document. The columns of the matrix
are the bigram terms that were attached to each tuple.
* *c*: The `kmeans` function clusters the rows of the matrix into 5 clusters. The k-means clustering is performed using the
*Euclidean distance* measure.
* *d*: The `getCentroids` function returns a matrix of cluster centroids. Each row in the matrix is a centroid
from one of the 5 clusters. The columns of the matrix are the same bigram terms as the term vector matrix.
* *e*: The `topFeatures` function returns the column labels for the top 5 features of each centroid in the matrix.
This returns the top 5 bigram terms for each centroid.
[source,text]
----
let(a=select(random(enron, q="body:oil", rows="500", fl="id, body"),
id,
analyze(body, body_bigram) as terms),
b=termVectors(a, maxDocFreq=.10, minDocFreq=.05, minTermLength=14, exclude="_,copyright"),
c=kmeans(b, 5),
d=getCentroids(c),
e=topFeatures(d, 5))
----
This expression returns the following response:
[source,json]
----
{
"result-set": {
"docs": [
{
"e": [
[
"enron enronxgate",
"north american",
"energy services",
"conference call",
"power generation"
],
[
"financial times",
"chief financial",
"financial officer",
"exchange commission",
"houston chronicle"
],
[
"southern california",
"california edison",
"public utilities",
"utilities commission",
"rate increases"
],
[
"rolling blackouts",
"public utilities",
"electricity prices",
"federal energy",
"price controls"
],
[
"california edison",
"regulatory commission",
"southern california",
"federal energy",
"power generators"
]
]
},
{
"EOF": true,
"RESPONSE_TIME": 982
}
]
}
}
----
=== Cluster Features
The example below examines the top features of a specific cluster. This example uses the same techniques
as the centroids example, but the top features are extracted from a cluster rather than the centroids.
The `getCluster` function returns a cluster by its index. Each cluster is a matrix containing term vectors
that have been clustered together based on their features.
In the example below the `topFeatures` function is used to extract the top 4 features from each term vector
in the cluster.
[source,text]
----
let(a=select(random(collection3, q="body:oil", rows="500", fl="id, body"),
id,
analyze(body, body_bigram) as terms),
b=termVectors(a, maxDocFreq=.09, minDocFreq=.03, minTermLength=14, exclude="_,copyright"),
c=kmeans(b, 25),
d=getCluster(c, 0),
e=topFeatures(d, 4))
----
This expression returns the following response:
[source,json]
----
{
"result-set": {
"docs": [
{
"e": [
[
"electricity board",
"maharashtra state",
"power purchase",
"state electricity",
"reserved enron"
],
[
"electricity board",
"maharashtra state",
"state electricity",
"purchase agreement",
"independent power"
],
[
"maharashtra state",
"reserved enron",
"federal government",
"state government",
"dabhol project"
],
[
"purchase agreement",
"power purchase",
"electricity board",
"maharashtra state",
"state government"
],
[
"investment grade",
"portland general",
"general electric",
"holding company",
"transmission lines"
],
[
"state government",
"state electricity",
"purchase agreement",
"electricity board",
"maharashtra state"
],
[
"electricity board",
"state electricity",
"energy management",
"maharashtra state",
"energy markets"
],
[
"electricity board",
"maharashtra state",
"state electricity",
"state government",
"second quarter"
]
]
},
{
"EOF": true,
"RESPONSE_TIME": 978
}
]
}
}
----
== Multi K-means Clustering
K-means clustering will produce different results depending on
the initial placement of the centroids. K-means is fast enough
that multiple trials can be performed and the best outcome selected.
The `multiKmeans` function runs the K-means
clustering algorithm for a given number of trials and selects the
best result based on which trial produces the lowest intra-cluster
variance.
The example below is identical to the centroids example except that
it uses `multiKmeans` with 100 trials, rather than a single
trial of the `kmeans` function.
[source,text]
----
let(a=select(random(collection3, q="body:oil", rows="500", fl="id, body"),
id,
analyze(body, body_bigram) as terms),
b=termVectors(a, maxDocFreq=.09, minDocFreq=.03, minTermLength=14, exclude="_,copyright"),
c=multiKmeans(b, 5, 100),
d=getCentroids(c),
e=topFeatures(d, 5))
----
This expression returns the following response:
[source,json]
----
{
"result-set": {
"docs": [
{
"e": [
[
"enron enronxgate",
"energy trading",
"energy markets",
"energy services",
"unleaded gasoline"
],
[
"maharashtra state",
"electricity board",
"state electricity",
"energy trading",
"chief financial"
],
[
"price controls",
"electricity prices",
"francisco chronicle",
"wholesale electricity",
"power generators"
],
[
"southern california",
"california edison",
"public utilities",
"francisco chronicle",
"utilities commission"
],
[
"california edison",
"power purchases",
"system operator",
"term contracts",
"independent system"
]
]
},
{
"EOF": true,
"RESPONSE_TIME": 1182
}
]
}
}
----
== Fuzzy K-means Clustering
The `fuzzyKmeans` function is a soft clustering algorithm which
allows vectors to be assigned to more than one cluster. The *fuzziness* parameter
is a value between 1 and 2 that determines how fuzzy to make the cluster assignment.
After the clustering has been performed the `getMembershipMatrix` function can be called
on the clustering result to return a matrix describing which clusters each vector belongs to.
There is a row in the matrix for each vector that was clustered. There is a column in the matrix
for each cluster. The values in the columns are the probability that the vector belongs to that specific
cluster.
A simple example will make this more clear. In the example below 300 documents are analyzed and
then turned into a term vector matrix. Then the `fuzzyKmeans` function clusters the
term vectors into 12 clusters with a fuzziness factor of 1.25.
The `getMembershipMatrix` function is used to return the membership matrix and the first row
of membership matrix is retrieved with the `rowAt` function. The `precision` function is then applied to the first row
of the matrix to make it easier to read.
The output shows a single vector representing the cluster membership probabilities for the first
term vector. Notice that the term vector has the highest association with the 12th cluster,
but also has significant associations with the 3rd, 5th, 6th and 7th clusters.
[source,text]
----
let(a=select(random(collection3, q="body:oil", rows="300", fl="id, body"),
id,
analyze(body, body_bigram) as terms),
b=termVectors(a, maxDocFreq=.09, minDocFreq=.03, minTermLength=14, exclude="_,copyright"),
c=fuzzyKmeans(b, 12, fuzziness=1.25),
d=getMembershipMatrix(c),
e=rowAt(d, 0),
f=precision(e, 5))
----
This expression returns the following response:
[source,json]
----
{
"result-set": {
"docs": [
{
"f": [
0,
0,
0.178,
0,
0.17707,
0.17775,
0.16214,
0,
0,
0,
0,
0.30504
]
},
{
"EOF": true,
"RESPONSE_TIME": 2157
}
]
}
}
----
== K-nearest Neighbor
The `knn` function searches the rows of a matrix for the
K-nearest neighbors of a search vector. The `knn` function
returns a *matrix* of the K-nearest neighbors. The `knn` function
has a *named parameter* called *distance* which specifies the distance measure.
There are four distance measures currently supported:
* euclidean (default)
* manhattan
* canberra
* earthMovers
The example below builds on the clustering examples to demonstrate
the `knn` function.
In the example, the centroids matrix is set to variable *d*. The first
centroid vector is selected from the matrix with the `rowAt` function.
Then the `knn` function is used to find the 3 nearest neighbors
to the centroid vector in the term vector matrix (variable b).
The `knn` function returns a matrix with the 3 nearest neighbors based on the
default distance measure which is euclidean. Finally, the top 4 features
of the term vectors in the nearest neighbor matrix are returned.
[source,text]
----
let(a=select(random(collection3, q="body:oil", rows="500", fl="id, body"),
id,
analyze(body, body_bigram) as terms),
b=termVectors(a, maxDocFreq=.09, minDocFreq=.03, minTermLength=14, exclude="_,copyright"),
c=multiKmeans(b, 5, 100),
d=getCentroids(c),
e=rowAt(d, 0),
g=knn(b, e, 3),
h=topFeatures(g, 4))
----
This expression returns the following response:
[source,json]
----
{
"result-set": {
"docs": [
{
"h": [
[
"california power",
"electricity supply",
"concerned about",
"companies like"
],
[
"maharashtra state",
"california power",
"electricity board",
"alternative energy"
],
[
"electricity board",
"maharashtra state",
"state electricity",
"houston chronicle"
]
]
},
{
"EOF": true,
"RESPONSE_TIME": 1243
}
]
}
}
----

View File

@ -0,0 +1,59 @@
= Math Expressions
:page-children: scalar-math, vector-math, variables, matrix-math, vectorization, term-vectors, statistics, probability, montecarlo, time-series, regression, numerical-analysis, curve-fitting, machine-learning
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
The Streaming Expression library includes a powerful
mathematical programming syntax with many of the features of a
functional programming language. The syntax includes variables,
data structures and a growing set of mathematical functions.
This user guide provides an overview of the different areas of
mathematical coverage starting with basic scalar math and
ending with machine learning. Along the way the guide covers variables
and data structures and techniques for combining Solr's
powerful streams with mathematical functions to make every
record in your SolrCloud cluster computable.
== link:scalar-math.adoc[Scalar Math]
== link:vector-math.adoc[Vector Math]
== link:variables.adoc[Variables]
== link:matrix-math.adoc[Matrix Math]
== link:vectorization.adoc[Streams and Vectorization]
== link:term-vectors.adoc[Text Analysis and Term Vectors]
== link:statistics.adoc[Statistics]
== link:probability.adoc[Probability]
== link:montecarlo.adoc[Monte Carlo Simulations]
== link:time-series.adoc[Time Series]
== link:regression.adoc[Linear Regression]
== link:numerical-analysis.adoc[Interpolation, Derivatives and Integrals]
== link:curve-fitting.adoc[Curve Fitting]
== link:machine-learning.adoc[Machine Learning]

View File

@ -0,0 +1,443 @@
= Matrices and Matrix Math
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
This section of the user guide covers the
basics of matrix creation, manipulation and matrix math. Other sections
of the user guide demonstrate how matrices are used by the statistics,
probability and machine learning functions.
== Matrix Creation
A matrix can be created with the `matrix` function.
The `matrix` function is passed a list of arrays, with
each array representing a *row* in the matrix.
The example below creates a two-by-two matrix.
[source,text]
----
matrix(array(1, 2),
array(4, 5))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"return-value": [
[
1,
2
],
[
4,
5
]
]
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
----
== Accessing Rows and Columns
The rows and columns of a matrix can be accessed using the `rowAt`
and `colAt` functions.
The example below creates a 2 by 2 matrix and returns the second column of the matrix.
Notice that in this example the matrix is passed variables rather than
a list of arrays directly.
[source,text]
----
let(a=array(1, 2),
b=array(4, 5),
c=matrix(a, b),
d=colAt(c, 1))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"d": [
2,
5
]
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
== Row and Column Labels
A matrix can have row and column labels. The functions
`setRowLabels`, `setColumnLabels`, `getRowLabels` and `getColumnLabels`
can be used to set and get the labels. The label values
are set using string arrays.
In other sections of the
user guide examples are shown where functions return matrices
with the labels already set.
Below is a simple example of setting and
getting row and column labels
on a matrix.
[source,text]
----
let(echo="d, e",
a=matrix(array(1, 2),
array(4, 5)),
b=setRowLabels(a, array("row0", "row1")),
c=setColumnLabels(b, array("col0", "col1")),
d=getRowLabels(c),
e=getColumnLabels(c))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"d": [
"row0",
"row1"
],
"e": [
"col0",
"col1"
]
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
== Matrix Attributes
A matrix can also have an arbitrary set of named attributes associated
with it. Certain functions, such as the `termVectors` function,
return matrices that contain attributes that describe data in the matrix.
Attributes can be retrieved by name using the `getAttribute` function and
the entire attribute map can be returned using the `getAttributes`
function.
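Below is a sketch of retrieving the full attribute map from a matrix returned by the `termVectors` function. It reuses the term vector expression shown elsewhere in this guide; which attribute keys are present depends on the function that created the matrix, so the names here are illustrative only.
[source,text]
----
let(a=select(random(enron, q="body:oil", rows="500", fl="id, body"),
    id,
    analyze(body, body_bigram) as terms),
    b=termVectors(a, maxDocFreq=.10, minDocFreq=.05, minTermLength=14, exclude="_,copyright"),
    c=getAttributes(b))
----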
== Matrix Dimensions
The dimensions of a matrix can be determined using the
`rowCount` and `columnCount` functions.
The example below retrieves the dimensions of a matrix.
[source,text]
----
let(echo="b,c",
a=matrix(array(1, 2, 3),
array(4, 5, 6)),
b=rowCount(a),
c=columnCount(a))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"b": 2,
"c": 3
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
== Matrix Transposition
A matrix can be https://en.wikipedia.org/wiki/Transpose[transposed]
using the `transpose` function.
An example of matrix transposition is shown below:
[source,text]
----
let(a=matrix(array(1, 2),
array(4, 5)),
b=transpose(a))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"b": [
[
1,
4
],
[
2,
5
]
]
},
{
"EOF": true,
"RESPONSE_TIME": 24
}
]
}
}
----
== Matrix Summations
The rows and columns of a matrix can be summed with the `sumRows` and `sumColumns` functions.
Below is an example of the `sumRows` function which returns an
array with the sum of each row.
[source,text]
----
let(a=matrix(array(1, 2, 3),
array(4, 5, 6)),
b=sumRows(a))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"b": [
6,
15
]
},
{
"EOF": true,
"RESPONSE_TIME": 2
}
]
}
}
----
The `grandSum` function returns the sum of all values in the matrix.
Below is an example of the `grandSum` function:
[source,text]
----
let(a=matrix(array(1, 2, 3),
array(4, 5, 6)),
b=grandSum(a))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"b": 21
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
== Scalar Matrix Math
The same scalar math functions that apply to vectors can also be applied to matrices: `scalarAdd`, `scalarSubtract`,
`scalarMultiply`, `scalarDivide`. Below is an example of the `scalarAdd` function
which adds a scalar value to each element in a matrix.
[source,text]
----
let(a=matrix(array(1, 2),
array(4, 5)),
b=scalarAdd(10, a))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"b": [
[
11,
12
],
[
14,
15
]
]
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
== Matrix Addition and Subtraction
Two matrices can be added and subtracted using the `ebeAdd` and `ebeSubtract` functions,
which perform element-by-element addition
and subtraction of matrices.
Below is a simple example of an element-by-element addition of a matrix by itself:
[source,text]
----
let(a=matrix(array(1, 2),
array(4, 5)),
b=ebeAdd(a, a))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"b": [
[
2,
4
],
[
8,
10
]
]
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
== Matrix Multiplication
Matrix multiplication can be accomplished using the `matrixMult` function. Below is a simple
example of matrix multiplication:
[source,text]
----
let(a=matrix(array(1, 2),
array(4, 5)),
b=matrix(array(11, 12),
array(14, 15)),
c=matrixMult(a, b))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"c": [
[
39,
42
],
[
114,
123
]
]
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----

View File

@ -0,0 +1,213 @@
= Monte Carlo Simulations
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
Monte Carlo simulations are commonly used to model the behavior of
stochastic systems. This section of the user guide describes
how to perform both *uncorrelated* and *correlated* Monte Carlo simulations
using the *sampling* capabilities of the probability distribution framework.
== Uncorrelated Simulations
Uncorrelated Monte Carlo simulations model stochastic systems with the assumption
that the underlying random variables move independently of each other.
A simple example of a Monte Carlo simulation using two independently changing random variables
is described below.
In this example a Monte Carlo simulation is used to determine the probability that a simple hinge assembly will
fall within a required length specification.
The hinge has two components *A* and *B*. The combined length of the two components must be less than 5 centimeters
to fall within specification.
A random sampling of lengths for component *A* has shown that its length conforms to a
normal distribution with a mean of 2.2 centimeters and a standard deviation of .0195
centimeters.
A random sampling of lengths for component *B* has shown that its length conforms
to a normal distribution with a mean of 2.71 centimeters and a standard deviation of .0198 centimeters.
The Monte Carlo simulation below performs the following steps:
* A normal distribution with a mean of 2.2 and a standard deviation of .0195 is created to model the length of componentA.
* A normal distribution with a mean of 2.71 and a standard deviation of .0198 is created to model the length of componentB.
* The `monteCarlo` function is used to simulate component pairs. The `monteCarlo` function
calls the *add(sample(componentA), sample(componentB))* function 100000 times and collects the results in an array. Each
time the function is called a random sample is drawn from the componentA
and componentB length distributions. The `add` function adds the two samples to calculate the combined length.
The result of each function run is collected in an array and assigned to the *simresults* variable.
* An `empiricalDistribution` function is then created from the *simresults* array to model the distribution of the
simulation results.
* Finally, the `cumulativeProbability` function is called on the *simmodel* to determine the cumulative probability
that the combined length of the components is 5 or less.
* Based on the simulation there is a .9994371944629039 probability that the combined length of a component pair will
be 5 or less.
[source,text]
----
let(componentA=normalDistribution(2.2, .0195),
componentB=normalDistribution(2.71, .0198),
simresults=monteCarlo(add(sample(componentA), sample(componentB)), 100000),
simmodel=empiricalDistribution(simresults),
prob=cumulativeProbability(simmodel, 5))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"prob": 0.9994371944629039
},
{
"EOF": true,
"RESPONSE_TIME": 660
}
]
}
}
----
== Correlated Simulations
The simulation above assumes that the lengths of *componentA* and *componentB* vary independently.
What would happen to the probability model if there were a correlation between the lengths of
*componentA* and *componentB*?
In the example below a database containing assembled pairs of components is used to determine
if there is a correlation between the lengths of the components, and how the correlation affects the model.
Before performing a simulation of the effects of correlation on the probability model it's
useful to understand what the correlation is between the lengths of *componentA* and *componentB*.
In the example below 5000 random samples are selected from a collection
of assembled hinges. Each sample contains
lengths of the components in the fields *componentA_d* and *componentB_d*.
Both fields are then vectorized. The *componentA_d* vector is stored in
variable *b* and the *componentB_d* vector is stored in variable *c*.
Then the correlation of the two vectors is calculated using the `corr` function. Note that the outcome
from `corr` is 0.9996931313216989. This means that *componentA_d* and *componentB_d* are almost
perfectly correlated.
[source,text]
----
let(a=random(collection5, q="*:*", rows="5000", fl="componentA_d, componentB_d"),
b=col(a, componentA_d),
c=col(a, componentB_d),
d=corr(b, c))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"d": 0.9996931313216989
},
{
"EOF": true,
"RESPONSE_TIME": 309
}
]
}
}
----
How does correlation affect the probability model?
The example below explores how to use a *multivariate normal distribution* function
to model how correlation effects the probability of hinge defects.
In this example 5000 random samples are selected from a collection
containing length data for assembled hinges. Each sample contains
the fields *componentA_d* and *componentB_d*.
Both fields are then vectorized. The *componentA_d* vector is stored in
variable *b* and the *componentB_d* vector is stored in variable *c*.
An array is created that contains the *means* of the two vectorized fields.
Then both vectors are added to a matrix which is transposed. This creates
an *observation* matrix where each row contains one observation of
*componentA_d* and *componentB_d*. A covariance matrix is then created from the columns of
the observation matrix with the
`cov` function. The covariance matrix describes the covariance between
*componentA_d* and *componentB_d*.
The `multiVariateNormalDistribution` function is then called with the
array of means for the two fields and the covariance matrix. The model
for the multivariate normal distribution is stored in variable *g*.
The `monteCarlo` function then calls the function *add(sample(g))* 50000 times
and collects the results in a vector. Each time the function is called a single sample
is drawn from the multivariate normal distribution. Each sample is a vector containing
one *componentA* and *componentB* pair. The `add` function adds the values in the vector to
calculate the combined length of the pair.
multivariate normal distribution will conform to the covariance matrix used to construct it.
Just as in the non-correlated example an empirical distribution is used to model probabilities
of the simulation vector and the `cumulativeProbability` function is used to compute the cumulative
probability that the combined component length will be 5 centimeters or less.
Notice that the probability of a hinge meeting specification has dropped to 0.9889517439980468.
This is because the strong correlation
between the lengths of the components means that their lengths rise together, causing more hinges to
fall outside the 5 centimeter specification.
[source,text]
----
let(a=random(hinges, q="*:*", rows="5000", fl="componentA_d, componentB_d"),
b=col(a, componentA_d),
c=col(a, componentB_d),
cor=corr(b,c),
d=array(mean(b), mean(c)),
e=transpose(matrix(b, c)),
f=cov(e),
g=multiVariateNormalDistribution(d, f),
h=monteCarlo(add(sample(g)), 50000),
i=empiricalDistribution(h),
j=cumulativeProbability(i, 5))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"j": 0.9889517439980468
},
{
"EOF": true,
"RESPONSE_TIME": 599
}
]
}
}
----

View File

@ -0,0 +1,430 @@
= Interpolation, Derivatives and Integrals
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
This section of the math expression user guide covers *interpolation*, *derivatives* and *integrals*.
These three interrelated topics are part of the field of mathematics called *numerical analysis*.
== Interpolation
Interpolation is used to construct new data points between a set of known control points.
The ability to *predict* new data points allows for *sampling* along the curve defined by the
control points.
The interpolation functions described below all return an *interpolation model*
that can be passed to other functions which make use of the sampling capability.
If returned directly the interpolation model returns an array containing predictions for each of the
control points. This is useful in the case of `loess` interpolation which first smooths the control points
and then interpolates the smoothed points. All other interpolation function simply return the original
control points because interpolation predicts a curve that passes through the original control points.
There are different algorithms for interpolation that will result in different predictions
along the curve. The math expressions library currently supports the following
interpolation functions:
* `lerp`: Linear interpolation predicts points that pass through each control point and
form straight lines between control points.
* `spline`: Spline interpolation predicts points that pass through each control point
and form a smooth curve between control points.
* `akima`: Akima spline interpolation is similar to spline interpolation but is stable to outliers.
* `loess`: Loess interpolation first performs a non-linear local regression to smooth the original
control points. Then a spline is used to interpolate the smoothed control points.
=== Upsampling
Interpolation can be used to increase the sampling rate along a curve. One example
of this would be to take a time series with samples every minute and create a data set with
samples every second. In order to do this the data points between the minutes must be created.
The `predict` function can be used to predict values anywhere within the bounds of the interpolation
range. The example below shows a very simple case of upsampling.
In the example linear interpolation is performed on the arrays in variables *x* and *y*. The *x* variable,
which is the x axis, is a sequence from 0 to 20 with a stride of 2. The *y* variable defines the curve
along the x axis.
The `lerp` function performs the interpolation and returns the interpolation model.
The *u* variable is an array from 0 to 20 with a stride of 1. This fills in the gaps of the original *x* axis.
The `predict` function then uses the interpolation function in variable *l* to predict values for
every point in the array assigned to variable *u*.
The variable *p* is the array of predictions, which is the upsampled set of y values.
[source,text]
----
let(x=array(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20),
y=array(5, 10, 60, 190, 100, 130, 100, 20, 30, 10, 5),
l=lerp(x, y),
u=array(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20),
p=predict(l, u))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"g": [
5,
7.5,
10,
35,
60,
125,
190,
145,
100,
115,
130,
115,
100,
60,
20,
25,
30,
20,
10,
7.5,
5
]
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
=== Smoothing Interpolation
The `loess` function is a smoothing interpolator which means it doesn't derive
a function that passes through the original control points. Instead the `loess` function
returns a function that smooths the original control points.
A technique known as local regression is used to compute the smoothed curve. The size of the
neighborhood of the local regression can be adjusted
to control how closely the new curve conforms to the original control points.
The `loess` function is passed *x* and *y* axes and fits a smooth curve to the data.
If only a single array is provided it is treated as the *y* axis and a sequence is generated
for the *x* axis.
The example below uses the `loess` function to fit a curve to a set of *y* values in an array.
The bandwidth parameter defines the percent of data to use for the local
regression. The lower the percent the smaller the neighborhood used for the local
regression and the closer the curve will be to the original data.
In the example the fitted curve is subtracted from the original curve using the
`ebeSubtract` function. The output shows the error between the
fitted curve and the original curve, known as the residuals. The output also includes
the sum-of-squares of the residuals which provides a measure
of how large the error is.
[source,text]
----
let(echo="residuals, sumSqError",
y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 7, 7,6, 7, 7, 7, 6, 5, 5, 3, 2, 1, 0),
curve=loess(y, bandwidth=.3),
residuals=ebeSubtract(y, curve),
sumSqError=sumSq(residuals))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"residuals": [
0,
0,
0,
-0.040524802275866634,
-0.10531988096456502,
0.5906115002526198,
0.004215074334896762,
0.4201374330912433,
0.09618315578013803,
0.012107948556718817,
-0.9892939034492398,
0.012014364143757561,
0.1093830927709325,
0.523166271893805,
0.09658362075164639,
-0.011433819306139625,
0.9899403519886416,
-0.011707983372932773,
-0.004223284004140737,
-0.00021462867928434548,
0.0018723112875456138
],
"sumSqError": 2.8016013870800616
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
In the next example the curve is fit using a bandwidth of .25. Notice that the curve
is a closer fit, shown by the smaller residuals and lower value for the sum-of-squares of the
residuals.
[source,text]
----
let(echo="residuals, sumSqError",
y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 7, 7, 6, 7, 7, 7, 6, 5, 5, 3, 2, 1, 0),
curve=loess(y, .25),
residuals=ebeSubtract(y, curve),
sumSqError=sumSq(residuals))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"residuals": [
0,
0,
0,
0,
-0.19117650587715396,
0.442863451538809,
-0.18553845993358564,
0.29990769020356645,
0,
0.23761890236245709,
-0.7344358765888117,
0.2376189023624491,
0,
0.30373119215254984,
-3.552713678800501e-15,
-0.23761890236245264,
0.7344358765888046,
-0.2376189023625095,
0,
2.842170943040401e-14,
-2.4868995751603507e-14
],
"sumSqError": 1.7539413576337557
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
== Derivatives
The derivative of a function measures the rate of change of the *y* value with respect to the
rate of change of the *x* value.
The `derivative` function can compute the derivative of any *interpolation* function.
The `derivative` function can also compute the derivative of a derivative.
The example below computes the derivative for a `loess` interpolation function.
[source,text]
----
let(x=array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20),
y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 7, 7,6, 7, 7, 7, 6, 5, 5, 3, 2, 1, 0),
curve=loess(x, y, bandwidth=.3),
derivative=derivative(curve))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"derivative": [
1.0022002675659012,
0.9955994648681976,
1.0154018729613081,
1.0643674501141696,
1.0430879694757085,
0.9698717643975381,
0.7488201070357539,
0.44627000894357516,
0.19019561285422165,
0.01703599324311178,
-0.001908408138535126,
-0.009121607450087499,
-0.2576361507216319,
-0.49378951291352746,
-0.7288073815664,
-0.9871806872210384,
-1.0025400632604322,
-1.001836567536853,
-1.0076227586138085,
-1.0021524620888589,
-1.0020541789058157
]
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
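As noted above, the `derivative` function can also compute the derivative of a derivative. Below is a sketch of computing a second derivative by applying `derivative` to the result of the first `derivative` call, reusing the data from the example above; the variable names are illustrative only.
[source,text]
----
let(x=array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20),
    y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 7, 7, 6, 7, 7, 7, 6, 5, 5, 3, 2, 1, 0),
    curve=loess(x, y, bandwidth=.3),
    derivative1=derivative(curve),
    derivative2=derivative(derivative1))
----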
== Integrals
An integral is a measure of the area underneath a curve.
The `integrate` function computes an integral for a specific
range of an interpolated curve.
In the example below the `integrate` function computes an
integral for the entire range of the curve, 0 through 20.
[source,text]
----
let(x=array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20),
y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 7, 7,6, 7, 7, 7, 6, 5, 5, 3, 2, 1, 0),
curve=loess(x, y, bandwidth=.3),
integral=integrate(curve, 0, 20))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"integral": 90.17446104846645
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
In the next example an integral is computed for the range of 0 through 10.
[source,text]
----
let(x=array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20),
y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 7, 7,6, 7, 7, 7, 6, 5, 5, 3, 2, 1, 0),
curve=loess(x, y, bandwidth=.3),
integral=integrate(curve, 0, 10))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"integral": 45.300912584519914
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
== Bicubic Spline
The `bicubicSpline` function can be used to interpolate and predict values
anywhere within a grid of data.
A simple example will make this more clear.
In the example below a bicubic spline is used to interpolate a matrix of real estate data.
Each row of the matrix represents a specific *year*. Each column of the matrix
represents a *floor* of the building. The grid of numbers is the average selling price of
an apartment for each year and floor. For example in 2002 the average selling price for
the 9th floor was 415000 (row 3, column 3).
The `bicubicSpline` function is then used to
interpolate the grid, and the `predict` function is used to predict a value for year 2003, floor 8.
Notice that the matrix does not include a data point for year 2003, floor 8. The `bicubicSpline`
function creates that data point based on the surrounding data in the matrix.
[source,text]
----
let(years=array(1998, 2000, 2002, 2004, 2006),
floors=array(1, 5, 9, 13, 17, 19),
prices = matrix(array(300000, 320000, 330000, 350000, 360000, 370000),
array(320000, 330000, 340000, 350000, 365000, 380000),
array(400000, 410000, 415000, 425000, 430000, 440000),
array(410000, 420000, 425000, 435000, 445000, 450000),
array(420000, 430000, 435000, 445000, 450000, 470000)),
bspline=bicubicSpline(years, floors, prices),
prediction=predict(bspline, 2003, 8))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"prediction": 418279.5009328358
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----

View File

@ -0,0 +1,415 @@
= Probability Distributions
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
This section of the user guide covers the
*probability distribution
framework* included in the math expressions library.
== Probability Distributions
The probability distribution framework includes
many commonly used *real* and *discrete* probability
distributions, including support for *empirical* and
*enumerated* distributions that model real world data.
The probability distribution framework also includes a set
of functions that use the probability distributions
to support probability calculations and sampling.
=== Real Distributions
The probability distribution framework has the following functions
which support well known real probability distributions:
* `normalDistribution`: Creates a normal distribution function.
* `logNormalDistribution`: Creates a log normal distribution function.
* `gammaDistribution`: Creates a gamma distribution function.
* `betaDistribution`: Creates a beta distribution function.
* `uniformDistribution`: Creates a uniform real distribution function.
* `weibullDistribution`: Creates a Weibull distribution function.
* `triangularDistribution`: Creates a triangular distribution function.
* `constantDistribution`: Creates a constant real distribution function.
=== Empirical Distribution
The `empiricalDistribution` function creates a real probability
distribution from actual data. An empirical distribution
can be used interchangeably with any of the theoretical
real distributions.
=== Discrete Distributions
The probability distribution framework has the following functions
which support well known discrete probability distributions:
* `poissonDistribution`: Creates a Poisson distribution function.
* `binomialDistribution`: Creates a binomial distribution function.
* `uniformIntegerDistribution`: Creates a uniform integer distribution function.
* `geometricDistribution`: Creates a geometric distribution function.
* `zipFDistribution`: Creates a Zipf distribution function.
=== Enumerated Distributions
The `enumeratedDistribution` function creates a discrete
distribution function from a data set of discrete values,
or from an enumerated list of values and probabilities.
Enumerated distribution functions can be used interchangeably
with any of the theoretical discrete distributions.
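Below is a sketch of creating an enumerated distribution from an explicit list of values and probabilities, assuming the `enumeratedDistribution` function accepts a value array followed by a probability array; the values and probabilities shown are illustrative only.
[source,text]
----
let(a=enumeratedDistribution(array(1, 2, 3, 4),
                             array(.40, .30, .20, .10)),
    b=probability(a, 2))
----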
=== Cumulative Probability
The `cumulativeProbability` function can be used with all
probability distributions to calculate the
cumulative probability of encountering a specific
random variable within a specific distribution.
Below is an example of calculating the cumulative probability
of a random variable within a normal distribution.
In the example a normal distribution function is created
with a mean of 10 and a standard deviation of 5. Then
the cumulative probability of the value 12 is calculated for this
specific distribution.
[source,text]
----
let(a=normalDistribution(10, 5),
b=cumulativeProbability(a, 12))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"b": 0.6554217416103242
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
Below is an example of a cumulative probability calculation
using an empirical distribution.
In the example an empirical distribution is created from a random
sample taken from the *price_f* field.
The cumulative probability of the value .75 is then calculated.
The *price_f* field in this example was generated using a
uniform real distribution between 0 and 1, so the output of the
`cumulativeProbability` function is very close to .75.
[source,text]
----
let(a=random(collection1, q="*:*", rows="30000", fl="price_f"),
b=col(a, price_f),
c=empiricalDistribution(b),
d=cumulativeProbability(c, .75))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"b": 0.7554217416103242
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
=== Probability
The `probability` function can be used with any discrete
distribution function to compute the probability of a
discrete value.
Below is an example which calculates the probability
of a discrete value within a Poisson distribution.
In the example a Poisson distribution function is created
with a mean of 100. Then the
probability of encountering a sample of the discrete value 101 is calculated for this
specific distribution.
[source,text]
----
let(a=poissonDistribution(100),
b=probability(a, 101))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"b": 0.039466333474403106
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
Below is an example of a probability calculation
using an enumerated distribution.
In the example an enumerated distribution is created from a random
sample taken from the *day_i* field, which was created
using a uniform integer distribution between 0 and 30.
The probability of the discrete value 10 is then calculated.
[source,text]
----
let(a=random(collection1, q="*:*", rows="30000", fl="day_i"),
b=col(a, day_i),
c=enumeratedDistribution(b),
d=probability(c, 10))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"d": 0.03356666666666666
},
{
"EOF": true,
"RESPONSE_TIME": 488
}
]
}
}
----
=== Sampling
All probability distributions support sampling. The `sample`
function returns 1 or more random samples from a probability
distribution.
Below is an example drawing a single sample from
a normal distribution.
[source,text]
----
let(a=normalDistribution(10, 5),
b=sample(a))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"b": 11.24578055004963
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
Below is an example drawing 10 samples from a normal
distribution.
[source,text]
----
let(a=normalDistribution(10, 5),
b=sample(a, 10))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"b": [
10.18444709339441,
9.466947971749377,
1.2420697166234458,
11.074501226984806,
7.659629052136225,
0.4440887839190708,
13.710925254778786,
2.089566359480239,
0.7907293097654424,
2.8184587681006734
]
},
{
"EOF": true,
"RESPONSE_TIME": 3
}
]
}
}
----
=== Multivariate Normal Distribution
The multivariate normal distribution is a generalization of the
univariate normal distribution to higher dimensions.
The multivariate normal distribution models two or more random
variables that are normally distributed. The relationship between
the variables is defined by a covariance matrix.
==== Sampling
The `sample` function can be used to draw samples
from a multivariate normal distribution in much the same
way as a univariate normal distribution.
The difference is that each sample will be an array containing a sample
drawn from each of the underlying normal distributions.
If multiple samples are drawn, the `sample` function returns a matrix with a
sample in each row. Over the long term the columns of the sample
matrix will conform to the covariance matrix used to parametrize the
multivariate normal distribution.
The example below demonstrates how to initialize and draw samples
from a multivariate normal distribution.
In this example 5000 random samples are selected from a collection
of log records. Each sample contains
the fields *filesize_d* and *response_d*. The values of both fields conform
to a normal distribution.
Both fields are then vectorized. The *filesize_d* vector is stored in
variable *b* and the *response_d* vector is stored in variable *c*.
An array is created that contains the *means* of the two vectorized fields.
Then both vectors are added to a matrix which is transposed. This creates
an *observation* matrix where each row contains one observation of
*filesize_d* and *response_d*. A covariance matrix is then created from the columns of
the observation matrix with the
`cov` function. The covariance matrix describes the covariance between
*filesize_d* and *response_d*.
The `multiVariateNormalDistribution` function is then called with the
array of means for the two fields and the covariance matrix. The model for the
multivariate normal distribution is assigned to variable *g*.
Finally five samples are drawn from the multivariate normal distribution. The samples
are returned as a matrix, with each row representing one sample. There are two
columns in the matrix. The first column contains samples for *filesize_d* and the second
column contains samples for *response_d*. Over the long term the covariance between
the columns will conform to the covariance matrix used to instantiate the
multivariate normal distribution.
[source,text]
----
let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, response_d"),
b=col(a, filesize_d),
c=col(a, response_d),
d=array(mean(b), mean(c)),
e=transpose(matrix(b, c)),
f=cov(e),
g=multiVariateNormalDistribution(d, f),
h=sample(g, 5))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"h": [
[
41974.85669321393,
779.4097049705296
],
[
42869.19876441414,
834.2599296790783
],
[
38556.30444839889,
720.3683470060988
],
[
37689.31290928216,
686.5549428100018
],
[
40564.74398214547,
769.9328090774
]
]
},
{
"EOF": true,
"RESPONSE_TIME": 162
}
]
}
}
----



@ -0,0 +1,439 @@
= Linear Regression
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
This section of the math expressions user guide covers simple and multivariate linear regression.
== Simple Linear Regression
The `regress` function is used to build a linear regression model
between two random variables. Sample observations are provided with two
numeric arrays. The first numeric array is the *independent variable* and
the second array is the *dependent variable*.
In the example below the `random` function selects 5000 random samples each containing
the fields *filesize_d* and *response_d*. The two fields are vectorized
and stored in variables *b* and *c*. Then the `regress` function performs a regression
analysis on the two numeric arrays.
The `regress` function returns a single tuple with the results of the regression
analysis.
Note that in this regression analysis the value of *RSquared* is *.75*. This means that changes in
*filesize_d* explain 75% of the variability of the *response_d* variable.
[source,text]
----
let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, response_d"),
b=col(a, filesize_d),
c=col(a, response_d),
d=regress(b, c))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"d": {
"significance": 0,
"totalSumSquares": 10564812.895147054,
"R": 0.8674822407146515,
"RSquared": 0.7525254379553127,
"meanSquareError": 523.1137343558588,
"intercept": -49.528134913099095,
"slopeConfidenceInterval": 0.0003171801710329995,
"regressionSumSquares": 7950290.450836472,
"slope": 0.019945557923159506,
"interceptStdErr": 6.489732340389941,
"N": 5000
}
},
{
"EOF": true,
"RESPONSE_TIME": 98
}
]
}
}
----
=== Prediction
The `predict` function uses the regression model to make predictions.
Using the example above the regression model can be used to predict the value
of *response_d* given a value for *filesize_d*.
In the example below the `predict` function uses the regression analysis to predict
the value of *response_d* for the *filesize_d* value of 40000.
[source,text]
----
let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, response_d"),
b=col(a, filesize_d),
c=col(a, response_d),
d=regress(b, c),
e=predict(d, 40000))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"e": 748.079241022975
},
{
"EOF": true,
"RESPONSE_TIME": 95
}
]
}
}
----
The `predict` function can also make predictions for an array of values. In this
case it returns an array of predictions.
In the example below the `predict` function uses the regression analysis to
predict values for each of the 5000 samples of `filesize_d` used to generate the model.
In this case 5000 predictions are returned.
[source,text]
----
let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, response_d"),
b=col(a, filesize_d),
c=col(a, response_d),
d=regress(b, c),
e=predict(d, b))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"e": [
742.2525322514165,
709.6972488729955,
687.8382568904871,
820.2511324266264,
720.4006432289061,
761.1578181053039,
759.1304101159126,
699.5597256337142,
742.4738911248204,
769.0342605881644,
746.6740473150268,
...
]
},
{
"EOF": true,
"RESPONSE_TIME": 113
}
]
}
}
----
=== Residuals
The difference between the observed value and the predicted value is known as the
residual. There isn't a specific function to calculate the residuals but vector
math can be used to perform the calculation.
In the example below the predictions are stored in variable *e*. The `ebeSubtract`
function is then used to subtract the predictions
from the actual *response_d* values stored in variable *c*. Variable *f* contains
the array of residuals.
[source,text]
----
let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, response_d"),
b=col(a, filesize_d),
c=col(a, response_d),
d=regress(b, c),
e=predict(d, b),
f=ebeSubtract(c, e))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"e": [
31.30678554491226,
-30.292830927953446,
-30.49508862647258,
-30.499884780783532,
-9.696458959319784,
-30.521563961535094,
-30.28380938033081,
-9.890289849359306,
30.819723560583157,
-30.213178859683012,
-30.609943619066826,
10.527700442607625,
10.68046928406568,
...
]
},
{
"EOF": true,
"RESPONSE_TIME": 113
}
]
}
}
----
== Multivariate Linear Regression
The `olsRegress` function performs a multivariate linear regression analysis. Multivariate linear
regression models the linear relationship between two or more *independent* variables and a *dependent* variable.
The example below extends the simple linear regression example by introducing a new independent variable
called *service_d*. The *service_d* variable is the service level of the request and it can range from 1 to 4
in the data-set. The higher the service level, the higher the bandwidth available for the request.
Notice that the two independent variables *filesize_d* and *service_d* are vectorized and stored
in the variables *b* and *c*. The variables *b* and *c* are then added as rows to a `matrix`. The matrix is
then transposed so that each row in the matrix represents one observation with *filesize_d* and *service_d*.
The `olsRegress` function then performs the multivariate regression analysis using the observation matrix as the
independent variables and the *response_d* values, stored in variable *d*, as the dependent variable.
Notice that the RSquared of the regression analysis is 1. This means that the linear relationship between
*filesize_d* and *service_d* describes 100% of the variability of the *response_d* variable.
[source,text]
----
let(a=random(collection2, q="*:*", rows="30000", fl="filesize_d, service_d, response_d"),
b=col(a, filesize_d),
c=col(a, service_d),
d=col(a, response_d),
e=transpose(matrix(b, c)),
f=olsRegress(e, d))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"f": {
"regressionParametersStandardErrors": [
2.0660690430026933e-13,
5.1212982077663434e-18,
9.10920932555875e-15
],
"RSquared": 1,
"regressionParameters": [
6.553210695971329e-12,
0.019999999999999858,
-20.49999999999968
],
"regressandVariance": 2124.130825172683,
"regressionParametersVariance": [
[
0.013660174897582315,
-3.361258014840509e-7,
-0.00006893737578369605
],
[
-3.361258014840509e-7,
8.393183709503206e-12,
6.430253229589981e-11
],
[
-0.00006893737578369605,
6.430253229589981e-11,
0.000026553878455570856
]
],
"adjustedRSquared": 1,
"residualSumSquares": 9.373703759269822e-20
}
},
{
"EOF": true,
"RESPONSE_TIME": 690
}
]
}
}
----
=== Prediction
The `predict` function can also be used to make predictions for multivariate linear regression. Below is an example
of a single prediction using the multivariate linear regression model and a single observation. The observation
is an array that matches the structure of the observation matrix used to build the model. In this case
the first value represents a *filesize_d* of 40000 and the second value represents a *service_d* of 4.
[source,text]
----
let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, service_d, response_d"),
b=col(a, filesize_d),
c=col(a, service_d),
d=col(a, response_d),
e=transpose(matrix(b, c)),
f=olsRegress(e, d),
g=predict(f, array(40000, 4)))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"g": 718.0000000000005
},
{
"EOF": true,
"RESPONSE_TIME": 117
}
]
}
}
----
The `predict` function can also make predictions for more than one multivariate observation. In this scenario
an observation matrix is used. In the example below the observation matrix used to build the multivariate regression model
is passed to the `predict` function and it returns an array of predictions.
[source,text]
----
let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, service_d, response_d"),
b=col(a, filesize_d),
c=col(a, service_d),
d=col(a, response_d),
e=transpose(matrix(b, c)),
f=olsRegress(e, d),
g=predict(f, e))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"e": [
685.498283591961,
801.2175699959365,
776.7638245911025,
610.3559852681935,
751.0925865965207,
787.2914663381897,
744.3632053810668,
688.3729301599697,
765.367783417171,
724.9309687628346,
834.4350712384264,
...
]
},
{
"EOF": true,
"RESPONSE_TIME": 113
}
]
}
}
----
=== Residuals
Once the predictions are generated the residuals can be calculated using the same approach used with
simple linear regression.
Below is an example of the residuals calculation following a multivariate linear regression. In the example
the predictions stored in variable *g* are subtracted from the observed values stored in variable *d*.
[source,text]
----
let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, service_d, response_d"),
b=col(a, filesize_d),
c=col(a, service_d),
d=col(a, response_d),
e=transpose(matrix(b, c)),
f=olsRegress(e, d),
g=predict(f, e),
h=ebeSubtract(d, g))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"e": [
1.1368683772161603e-13,
1.1368683772161603e-13,
0,
1.1368683772161603e-13,
0,
1.1368683772161603e-13,
0,
2.2737367544323206e-13,
1.1368683772161603e-13,
2.2737367544323206e-13,
1.1368683772161603e-13,
...
]
},
{
"EOF": true,
"RESPONSE_TIME": 113
}
]
}
}
----


@ -0,0 +1,137 @@
= Scalar Math
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
The most basic math expressions are scalar expressions. Scalar expressions
perform mathematical operations on numbers.
For example the expression below adds two numbers together:
[source,text]
----
add(1, 1)
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"return-value": 2
},
{
"EOF": true,
"RESPONSE_TIME": 2
}
]
}
}
----
Math expressions can be nested. For example in the expression
below the output of the `add` function is the second parameter
of the `pow` function:
[source,text]
----
pow(10, add(1,1))
----
This expression returns the following response:
[source,json]
----
{
"result-set": {
"docs": [
{
"return-value": 100
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
== Streaming Scalar Math
Scalar math expressions can also be applied to each tuple in a stream
through use of the `select` stream decorator. The `select` function wraps a
stream of tuples and selects fields to include in each tuple.
The `select` function can also use math expressions to compute
new values and add them to the outgoing tuples.
In the example below the `select` expression is wrapping a search
expression. The `select` function is selecting the *price_f* field
and computing a new field called *newPrice* using the `mult` math
expression.
The first parameter of the `mult` expression is the *price_f* field.
The second parameter is the scalar value 10. This multiplies the value
of the *price_f* field in each tuple by 10.
[source,text]
----
select(search(collection2, q="*:*", fl="price_f", sort="price_f desc", rows="3"),
price_f,
mult(price_f, 10) as newPrice)
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"price_f": 0.99999994,
"newPrice": 9.9999994
},
{
"price_f": 0.99999994,
"newPrice": 9.9999994
},
{
"price_f": 0.9999992,
"newPrice": 9.999992
},
{
"EOF": true,
"RESPONSE_TIME": 3
}
]
}
}
----
== More Scalar Math Functions
The following scalar math functions are available in the math expressions library:
`abs`, `add`, `div`, `mult`, `sub`, `log`,
`pow`, `mod`, `ceil`, `floor`, `sin`, `asin`,
`sinh`, `cos`, `acos`, `cosh`, `tan`, `atan`,
`tanh`, `round`, `precision`, `sqrt`, `cbrt`
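Because each of these functions returns a number, they can be nested freely in the same way as the
earlier examples. The small sketch below composes several of the listed functions, computing the
square root of the sum of two squares and rounding the result, which evaluates to 5.
[source,text]
----
round(sqrt(add(pow(3, 2), pow(4, 2))))
----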


@ -0,0 +1,575 @@
= Statistics
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
This section of the user guide covers the core statistical functions
available in math expressions.
== Descriptive Statistics
The `describe` function can be used to return descriptive statistics about a
numeric array. The `describe` function returns a single *tuple* with name/value
pairs containing descriptive statistics.
Below is a simple example that selects a random sample of documents,
vectorizes the *price_f* field in the result set and uses the `describe` function to
return descriptive statistics about the vector:
[source,text]
----
let(a=random(collection1, q="*:*", rows="15000", fl="price_f"),
b=col(a, price_f),
c=describe(b))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"c": {
"sumsq": 4999.041975263254,
"max": 0.99995726,
"var": 0.08344429493940454,
"geometricMean": 0.36696588922559575,
"sum": 7497.460565552007,
"kurtosis": -1.2000739963006035,
"N": 15000,
"min": 0.00012338161,
"mean": 0.49983070437013266,
"popVar": 0.08343873198640858,
"skewness": -0.001735537500095477,
"stdev": 0.28886726179926403
}
},
{
"EOF": true,
"RESPONSE_TIME": 305
}
]
}
}
----
== Histograms and Frequency Tables
Histograms and frequency tables are tools for understanding the distribution
of a random variable.
The `hist` function creates a histogram designed for use with continuous data. The
`freqTable` function creates a frequency table for use with discrete data.
=== Histograms
Below is an example that selects a random sample, creates a vector from the
result set and uses the `hist` function to return a histogram with 5 bins.
The `hist` function returns a list of tuples with summary statistics for each bin.
[source,text]
----
let(a=random(collection1, q="*:*", rows="15000", fl="price_f"),
b=col(a, price_f),
c=hist(b, 5))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"c": [
{
"prob": 0.2057939717603699,
"min": 0.000010371208,
"max": 0.19996578,
"mean": 0.10010319358402578,
"var": 0.003366805016271609,
"cumProb": 0.10293732468049072,
"sum": 309.0185585938884,
"stdev": 0.058024176136086666,
"N": 3087
},
{
"prob": 0.19381868629885585,
"min": 0.20007741,
"max": 0.3999073,
"mean": 0.2993590803885827,
"var": 0.003401644034068929,
"cumProb": 0.3025295802728267,
"sum": 870.5362057700005,
"stdev": 0.0583236147205309,
"N": 2908
},
{
"prob": 0.20565789836690007,
"min": 0.39995712,
"max": 0.5999038,
"mean": 0.4993620963792545,
"var": 0.0033158364923609046,
"cumProb": 0.5023006239697967,
"sum": 1540.5320673300018,
"stdev": 0.05758330046429177,
"N": 3085
},
{
"prob": 0.19437108496008693,
"min": 0.6000449,
"max": 0.79973197,
"mean": 0.7001752711861512,
"var": 0.0033895105082360185,
"cumProb": 0.7026537198687285,
"sum": 2042.4112660500066,
"stdev": 0.058219502816805456,
"N": 2917
},
{
"prob": 0.20019582213899467,
"min": 0.7999126,
"max": 0.99987316,
"mean": 0.8985428275824184,
"var": 0.003312360017780078,
"cumProb": 0.899450457219298,
"sum": 2698.3241112299997,
"stdev": 0.05755310606544253,
"N": 3003
}
]
},
{
"EOF": true,
"RESPONSE_TIME": 322
}
]
}
}
----
The `col` function can be used to *vectorize* a column of data from the list of tuples
returned by the `hist` function.
In the example below, the *N* field,
which is the number of observations in each bin, is returned as a vector.
[source,text]
----
let(a=random(collection1, q="*:*", rows="15000", fl="price_f"),
b=col(a, price_f),
c=hist(b, 11),
d=col(c, N))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"d": [
1387,
1396,
1391,
1357,
1384,
1360,
1367,
1375,
1307,
1310,
1366
]
},
{
"EOF": true,
"RESPONSE_TIME": 307
}
]
}
}
----
=== Frequency Tables
The `freqTable` function returns a frequency distribution for a discrete data set.
The `freqTable` function doesn't create bins like the histogram. Instead it counts
the occurrence of each discrete data value and returns a list of tuples with the
frequency statistics for each value. Fields from a frequency table can be vectorized
using the `col` function in the same manner as a histogram (an example follows the frequency table output below).
Below is a simple example of a frequency table built from a random sample of
a discrete variable.
[source,text]
----
let(a=random(collection1, q="*:*", rows="15000", fl="day_i"),
b=col(a, day_i),
c=freqTable(b))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
"result-set": {
"docs": [
{
"c": [
{
"pct": 0.0318,
"count": 477,
"cumFreq": 477,
"cumPct": 0.0318,
"value": 0
},
{
"pct": 0.033133333333333334,
"count": 497,
"cumFreq": 974,
"cumPct": 0.06493333333333333,
"value": 1
},
{
"pct": 0.03426666666666667,
"count": 514,
"cumFreq": 1488,
"cumPct": 0.0992,
"value": 2
},
{
"pct": 0.0346,
"count": 519,
"cumFreq": 2007,
"cumPct": 0.1338,
"value": 3
},
{
"pct": 0.03133333333333333,
"count": 470,
"cumFreq": 2477,
"cumPct": 0.16513333333333333,
"value": 4
},
{
"pct": 0.03333333333333333,
"count": 500,
"cumFreq": 2977,
"cumPct": 0.19846666666666668,
"value": 5
}
]
},
{
"EOF": true,
"RESPONSE_TIME": 281
}
]
}
}
----
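As mentioned above, fields from the frequency table can be vectorized with the `col` function in
the same manner as the histogram example. The sketch below follows that pattern, copying the
*count* field from the frequency table tuples into a vector; the field name *count* is taken from
the sample output above.
[source,text]
----
let(a=random(collection1, q="*:*", rows="15000", fl="day_i"),
    b=col(a, day_i),
    c=freqTable(b),
    d=col(c, count))
----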
== Percentiles
The `percentile` function returns the estimated value for a specific percentile in
a sample set. The example below returns the estimation for the 95th percentile
of the *price_f* field.
[source,text]
----
let(a=random(collection1, q="*:*", rows="15000", fl="price_f"),
b=col(a, price_f),
c=percentile(b, 95))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"c": 312.94
},
{
"EOF": true,
"RESPONSE_TIME": 286
}
]
}
}
----
== Covariance and Correlation
Covariance and Correlation measure how random variables move
together.
=== Covariance and Covariance Matrices
The `cov` function calculates the covariance of two sample sets of data.
In the example below covariance is calculated for two numeric
arrays.
The example below uses arrays created by the `array` function. It's important to note that
vectorized data from Solr Cloud collections can be used with any function that
operates on arrays.
[source,text]
----
let(a=array(1, 2, 3, 4, 5),
b=array(100, 200, 300, 400, 500),
c=cov(a, b))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"c": 0.9484775349999998
},
{
"EOF": true,
"RESPONSE_TIME": 286
}
]
}
}
----
If a matrix is passed to the `cov` function it will automatically compute a covariance
matrix for the columns of the matrix.
Notice in the example three numeric arrays are added as rows
in a matrix. The matrix is then transposed to turn the rows into
columns, and the covariance matrix is computed for the columns of the
matrix.
[source,text]
----
let(a=array(1, 2, 3, 4, 5),
b=array(100, 200, 300, 400, 500),
c=array(30, 40, 80, 90, 110),
d=transpose(matrix(a, b, c)),
e=cov(d))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"e": [
[
2.5,
250,
52.5
],
[
250,
25000,
5250
],
[
52.5,
5250,
1150
]
]
},
{
"EOF": true,
"RESPONSE_TIME": 2
}
]
}
}
----
=== Correlation and Correlation Matrices
Correlation is a measure of covariance that has been scaled between
-1 and 1.
Three correlation types are supported:
* *pearsons* (default)
* *kendalls*
* *spearmans*
The type of correlation is specified by adding the *type* named parameter in the
function call. The example below demonstrates the use of the *type*
named parameter.
[source,text]
----
let(a=array(1, 2, 3, 4, 5),
b=array(100, 200, 300, 400, 5000),
c=corr(a, b, type=spearmans))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"c": 0.7432941462471664
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
Like the `cov` function, the `corr` function automatically builds a correlation matrix
if a matrix is passed as a parameter. The correlation matrix is built by correlating the columns
of the matrix passed in.
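The sketch below mirrors the covariance matrix example above: three numeric arrays are added as
rows to a matrix, the matrix is transposed so the arrays become columns, and the `corr` function
computes the correlation matrix for those columns using the default *pearsons* correlation.
[source,text]
----
let(a=array(1, 2, 3, 4, 5),
    b=array(100, 200, 300, 400, 500),
    c=array(30, 40, 80, 90, 110),
    d=transpose(matrix(a, b, c)),
    e=corr(d))
----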
== Statistical Inference Tests
Statistical inference tests test a hypothesis on *random samples* and return p-values which
can be used to infer the reliability of the test for the entire population.
The following statistical inference tests are available:
* `anova`: One-Way-Anova tests if there is a statistically significant difference in the
means of two or more random samples.
* `ttest`: The T-test tests if there is a statistically significant difference in the means of two
random samples.
* `pairedTtest`: The paired t-test tests if there is a statistically significant difference
in the means of two random samples with paired data.
* `gTestDataSet`: The G-test tests if two samples of binned discrete data were drawn
from the same population.
* `chiSquareDataSet`: The Chi-Squared test tests if two samples of binned discrete data were
drawn from the same population.
* `mannWhitney`: The Mann-Whitney test is a non-parametric test that tests if two
samples of continuous data were pulled from the same population. The Mann-Whitney test is
often used instead of the T-test when the underlying assumptions of the T-test are not met.
* `ks`: The Kolmogorov-Smirnov test tests if two samples of continuous data were drawn from
the same distribution.
Below is a simple example of a T-test performed on two random samples.
The returned p-value of .93 means we can accept the null hypothesis
that the two samples do not have statistically significant differences in their means.
[source,text]
----
let(a=random(collection1, q="*:*", rows="1500", fl="price_f"),
b=random(collection1, q="*:*", rows="1500", fl="price_f"),
c=col(a, price_f),
d=col(b, price_f),
e=ttest(c, d))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"e": {
"p-value": 0.9350135639249795,
"t-statistic": 0.081545541074817
}
},
{
"EOF": true,
"RESPONSE_TIME": 48
}
]
}
}
----
== Transformations
In statistical analysis it's often useful to transform data sets before performing
statistical calculations. The statistical function library includes the following
commonly used transformations:
* `rank`: Returns a numeric array with the rank-transformed value of each element of the original
array.
* `log`: Returns a numeric array with the natural log of each element of the original array.
* `sqrt`: Returns a numeric array with the square root of each element of the original array.
* `cbrt`: Returns a numeric array with the cube root of each element of the original array.
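Below is a small sketch of the `rank` transformation applied directly to a numeric array. Each
value in the array is replaced by its rank within the array, so the smallest value maps to the
lowest rank.
[source,text]
----
rank(array(0.2, 40, 3.5, 7))
----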
Below is an example of a ttest performed on log transformed data sets:
[source,text]
----
let(a=random(collection1, q="*:*", rows="1500", fl="price_f"),
b=random(collection1, q="*:*", rows="1500", fl="price_f"),
c=log(col(a, price_f)),
d=log(col(b, price_f)),
e=ttest(c, d))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"e": {
"p-value": 0.9655110070265056,
"t-statistic": -0.04324265449471238
}
},
{
"EOF": true,
"RESPONSE_TIME": 58
}
]
}
}
----


@ -1,5 +1,5 @@
= Streaming Expressions
:page-children: stream-source-reference, stream-decorator-reference, stream-evaluator-reference, statistical-programming, math-expressions, graph-traversal
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information


@ -0,0 +1,237 @@
= Text Analysis and Term Vectors
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
TF-IDF term vectors are often used to represent text documents when performing text mining
and machine learning operations. This section of the user guide describes how to
use math expressions to perform text analysis and create TF-IDF term vectors.
== Text Analysis
The `analyze` function applies a Solr analyzer to a text field and returns the tokens
emitted by the analyzer in an array. Any analyzer chain that is attached to a field in Solr's
schema can be used with the `analyze` function.
In the example below, the text "hello world" is analyzed using the analyzer chain attached to the *subject* field in
the schema. The *subject* field is defined as the field type *text_general* and the text is analyzed using the
analysis chain configured for the *text_general* field type.
[source,text]
----
analyze("hello world", subject)
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"return-value": [
"hello",
"world"
]
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
=== Annotating Documents
The `analyze` function can be used inside of a `select` function to annotate documents with the tokens
generated by the analysis.
The example below is performing a `search` in collection1. Each tuple returned by the `search`
contains an *id* and *subject*. For each tuple, the
`select` function is selecting the *id* field and calling the `analyze` function on the *subject* field.
The analyzer chain specified by the *subject_bigram* field is configured to perform a bigram analysis.
The tokens generated by the `analyze` function are added to each tuple in a field called `terms`.
Notice in the output that an array of bigram terms has been added to the tuples.
[source,text]
----
select(search(collection1, q="*:*", fl="id, subject", sort="id asc"),
id,
analyze(subject, subject_bigram) as terms)
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"terms": [
"text analysis",
"analysis example"
],
"id": "1"
},
{
"terms": [
"example number",
"number two"
],
"id": "2"
},
{
"EOF": true,
"RESPONSE_TIME": 4
}
]
}
}
----
== Term Vectors
The `termVectors` function can be used to build *TF-IDF*
term vectors from the terms generated by the `analyze` function.
The `termVectors` function operates over a list of tuples that contain a field
called *id* and a field called *terms*. Notice
that this is the exact output structure of the *document annotation* example above.
The `termVectors` function builds a *matrix* from the list of tuples. There is a *row* in the
matrix for each tuple in the list. There is a *column* in the matrix for each term in the *terms*
field.
The example below builds on the *document annotation* example.
The list of tuples is stored in variable *a*. The `termVectors` function
operates over variable *a* and builds a matrix with *2 rows* and *4 columns*.
The `termVectors` function also sets the *row* and *column* labels of the term vectors matrix.
The row labels are the document ids and the
column labels are the terms.
In the example below, the `getRowLabels` and `getColumnLabels` functions return
the row and column labels which are then stored in variables *c* and *d*.
The *echo* parameter is echoing variables *c* and *d*, so the output includes
the row and column labels.
[source,text]
----
let(echo="c, d",
a=select(search(collection3, q="*:*", fl="id, subject", sort="id asc"),
id,
analyze(subject, subject_bigram) as terms),
b=termVectors(a, minTermLength=4, minDocFreq=0, maxDocFreq=1),
c=getRowLabels(b),
d=getColumnLabels(b))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"c": [
"1",
"2"
],
"d": [
"analysis example",
"example number",
"number two",
"text analysis"
]
},
{
"EOF": true,
"RESPONSE_TIME": 5
}
]
}
}
----
=== TF-IDF Values
The values within the term vectors matrix are the TF-IDF values for each term in each document. The
example below shows the values of the matrix.
[source,text]
----
let(a=select(search(collection3, q="*:*", fl="id, subject", sort="id asc"),
id,
analyze(subject, subject_bigram) as terms),
b=termVectors(a, minTermLength=4, minDocFreq=0, maxDocFreq=1))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"b": [
[
1.4054651081081644,
0,
0,
1.4054651081081644
],
[
0,
1.4054651081081644,
1.4054651081081644,
0
]
]
},
{
"EOF": true,
"RESPONSE_TIME": 5
}
]
}
}
----
=== Limiting the Noise
One of the key challenges when working with term vectors is that text often has a significant amount of noise
which can obscure the important terms in the data. The `termVectors` function has several parameters
designed to filter out the less meaningful terms. This is also important because eliminating
the noisy terms helps keep the term vector matrix small enough to fit comfortably in memory.
There are four parameters designed to filter noisy terms from the term vector matrix:
* *minTermLength*: The minimum term length required to include the term in the matrix.
* *minDocFreq*: The minimum *percentage* (0 to 1) of documents the term must appear in to be included in the matrix.
* *maxDocFreq*: The maximum *percentage* (0 to 1) of documents the term can appear in to be included in the matrix.
* *exclude*: A comma delimited list of strings used to exclude terms. If a term contains any of the exclude strings that
term will be excluded from the term vector.
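Below is a sketch of the *exclude* parameter, building on the earlier document annotation example.
Any term containing the string "number" is dropped from the term vector matrix; the exclude string
used here is only illustrative, and the remaining parameters follow the earlier examples.
[source,text]
----
let(a=select(search(collection3, q="*:*", fl="id, subject", sort="id asc"),
             id,
             analyze(subject, subject_bigram) as terms),
    b=termVectors(a, exclude="number", minTermLength=4, minDocFreq=0, maxDocFreq=1))
----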


@ -0,0 +1,431 @@
= Time Series
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
This section of the user guide provides an overview of time series *aggregation*,
*smoothing* and *differencing*.
== Time Series Aggregation
The `timeseries` function performs fast, distributed time
series aggregation leveraging Solr's built-in faceting and date math capabilities.
The example below performs a monthly time series aggregation:
[source,text]
----
timeseries(collection1,
q=*:*,
field="recdate_dt",
start="2012-01-20T17:33:18Z",
end="2012-12-20T17:33:18Z",
gap="+1MONTH",
format="YYYY-MM",
count(*))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"recdate_dt": "2012-01",
"count(*)": 8703
},
{
"recdate_dt": "2012-02",
"count(*)": 8648
},
{
"recdate_dt": "2012-03",
"count(*)": 8621
},
{
"recdate_dt": "2012-04",
"count(*)": 8533
},
{
"recdate_dt": "2012-05",
"count(*)": 8792
},
{
"recdate_dt": "2012-06",
"count(*)": 8598
},
{
"recdate_dt": "2012-07",
"count(*)": 8679
},
{
"recdate_dt": "2012-08",
"count(*)": 8469
},
{
"recdate_dt": "2012-09",
"count(*)": 8637
},
{
"recdate_dt": "2012-10",
"count(*)": 8536
},
{
"recdate_dt": "2012-11",
"count(*)": 8785
},
{
"EOF": true,
"RESPONSE_TIME": 16
}
]
}
}
----
== Vectorizing the Time Series
Before a time series result can be operated on by math expressions
the data will need to be vectorized. Specifically
in the example above, the aggregation field count(*) will need to be moved into an array.
As described in the Streams and Vectorization section of the user guide, the `col` function can be used
to copy a numeric column from a list of tuples into an array.
The expression below demonstrates the vectorization of the count(*) field.
[source,text]
----
let(a=timeseries(collection1,
q=*:*,
field="test_dt",
start="2012-01-20T17:33:18Z",
end="2012-12-20T17:33:18Z",
gap="+1MONTH",
format="YYYY-MM",
count(*)),
b=col(a, count(*)))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"b": [
8703,
8648,
8621,
8533,
8792,
8598,
8679,
8469,
8637,
8536,
8785
]
},
{
"EOF": true,
"RESPONSE_TIME": 5
}
]
}
}
----
== Smoothing
Time series smoothing is often used to remove the noise from a time series and help
spot the underlying trends.
The math expressions library has three *sliding window* approaches
for time series smoothing. The *sliding window* approaches use a summary value
from a sliding window of the data to calculate a new set of smoothed data points.
The three *sliding window* functions are lagging indicators, which means
they don't start to move in the direction of the trend until the trend affects
the summary value of the sliding window. Because of this lagging quality these smoothing
functions are often used to confirm the direction of the trend.
=== Moving Average
The `movingAvg` function computes a simple moving average over a sliding window of data.
The example below generates a time series, vectorizes the count(*) field and computes the
moving average with a window size of 3.
The moving average function returns an array that is shorter
than the original data set. This is because results are generated only when a full window of data
is available for computing the average. With a window size of three the moving average will
begin generating results at the 3rd value. The prior values are not included in the result.
This is true for all the sliding window functions.
[source,text]
----
let(a=timeseries(collection1,
q=*:*,
field="test_dt",
start="2012-01-20T17:33:18Z",
end="2012-12-20T17:33:18Z",
gap="+1MONTH",
format="YYYY-MM",
count(*)),
b=col(a, count(*)),
c=movingAvg(b, 3))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"c": [
8657.333333333334,
8600.666666666666,
8648.666666666666,
8641,
8689.666666666666,
8582,
8595,
8547.333333333334,
8652.666666666666
]
},
{
"EOF": true,
"RESPONSE_TIME": 7
}
]
}
}
----
=== Exponential Moving Average
The `expMovingAvg` function uses a different formula for computing the moving average that
responds faster to changes in the underlying data. This means that it is
less of a lagging indicator than the simple moving average.
Below is an example that computes an exponential moving average:
[source,text]
----
let(a=timeseries(collection1, q=*:*,
field="test_dt",
start="2012-01-20T17:33:18Z",
end="2012-12-20T17:33:18Z",
gap="+1MONTH",
format="YYYY-MM",
count(*)),
b=col(a, count(*)),
c=expMovingAvg(b, 3))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"c": [
8657.333333333334,
8595.166666666668,
8693.583333333334,
8645.791666666668,
8662.395833333334,
8565.697916666668,
8601.348958333334,
8568.674479166668,
8676.837239583334
]
},
{
"EOF": true,
"RESPONSE_TIME": 5
}
]
}
}
----
=== Moving Median
The `movingMedian` function uses the median of the sliding window rather than the average.
In many cases the moving median will be more *robust* to outliers than moving averages.
Below is an example computing the moving median:
[source,text]
----
let(a=timeseries(collection1,
q=*:*,
field="test_dt",
start="2012-01-20T17:33:18Z",
end="2012-12-20T17:33:18Z",
gap="+1MONTH",
format="YYYY-MM",
count(*)),
b=col(a, count(*)),
c=movingMedian(b, 3))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"c": [
8648,
8621,
8621,
8598,
8679,
8598,
8637,
8536,
8637
]
},
{
"EOF": true,
"RESPONSE_TIME": 7
}
]
}
}
----
== Differencing
Differencing is often used to remove the
trend or seasonality from a time series. This is known as making a time series
*stationary*.
=== First Difference
The actual technique of differencing is to use the difference between values rather than the
original values. The *first difference* takes the difference between a value and the value
that came directly before it. The first difference is often used to remove the trend
from a time series.
In the example below, the `diff` function computes the first difference of a time series.
The result array is one value shorter than the original array.
This is because the `diff` function only returns a result for values
that have a prior value to subtract.
[source,text]
----
let(a=timeseries(collection1,
q=*:*,
field="test_dt",
start="2012-01-20T17:33:18Z",
end="2012-12-20T17:33:18Z",
gap="+1MONTH",
format="YYYY-MM",
count(*)),
b=col(a, count(*)),
c=diff(b))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"c": [
-55,
-27,
-88,
259,
-194,
81,
-210,
168,
-101,
249
]
},
{
"EOF": true,
"RESPONSE_TIME": 11
}
]
}
}
----
=== Lagged Differences
The `diff` function has an optional second parameter to specify a lag in the difference.
If a lag is specified the difference is taken between a value and the value at a specified
lag in the past. Lagged differences are often used to remove seasonality from a time series.
The simple example below demonstrates how lagged differencing works.
Notice that the array in the example follows a simple repeated pattern. This type of pattern
is often associated with seasonality. In this example we can remove this pattern using
the `diff` function with a lag of 4. This will subtract the value lagging four indexes
behind the current index. Notice that the result set size is the original array size minus the lag.
This is because the `diff` function only returns results for values where a lag of 4
is possible to compute.
[source,text]
----
let(a=array(1,2,5,2,1,2,5,2,1,2,5),
b=diff(a, 4))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"b": [
0,
0,
0,
0,
0,
0,
0
]
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----


@ -0,0 +1,147 @@
= Variables
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
== The Let Expression
The `let` expression sets variables and returns
the value of the last variable by default. The output of any streaming expression
or math expression can be set to a variable.
Below is a simple example setting three variables *a*, *b*
and *c*. Variables *a* and *b* are set to arrays. The variable *c* is set
to the output of the `ebeAdd` function which performs element-by-element
addition of the two arrays.
Notice that the last variable, *c*, is returned.
[source,text]
----
let(a=array(1, 2, 3),
b=array(10, 20, 30),
c=ebeAdd(a, b))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"c": [
11,
22,
33
]
},
{
"EOF": true,
"RESPONSE_TIME": 4
}
]
}
}
----
== Echoing Variables
All variables can be output by setting the *echo* parameter to *true*.
[source,text]
----
let(echo=true,
a=array(1, 2, 3),
b=array(10, 20, 30),
c=ebeAdd(a, b))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"a": [
1,
2,
3
],
"b": [
10,
20,
30
],
"c": [
11,
22,
33
]
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
A specific set of variables can be echoed by providing a comma delimited
list of variables to the echo parameter.
[source,text]
----
let(echo="a,b",
a=array(1, 2, 3),
b=array(10, 20, 30),
c=ebeAdd(a, b))
----
When this expression is sent to the /stream handler it
responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"a": [
1,
2,
3
],
"b": [
10,
20,
30
]
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----


@ -0,0 +1,343 @@
= Vector Math
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
This section of the user guide covers vector math and
vector manipulation functions.
== Arrays
Arrays can be created with the `array` function.
For example the expression below creates a numeric array with
three elements:
[source,text]
----
array(1, 2, 3)
----
When this expression is sent to the /stream handler it responds with
a JSON array:
[source,json]
----
{
"result-set": {
"docs": [
{
"return-value": [
1,
2,
3
]
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
== Array Operations
Arrays can be passed as parameters to functions that operate on arrays.
For example, an array can be reversed with the `rev` function:
[source,text]
----
rev(array(1, 2, 3))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"return-value": [
3,
2,
1
]
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
Another example is the `length` function,
which returns the length of an array:
[source,text]
----
length(array(1, 2, 3))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"return-value": 3
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
A slice of an array can be taken with the `copyOfRange` function, which
copies elements of an array from a start and end range.
[source,text]
----
copyOfRange(array(1,2,3,4,5,6), 1, 4)
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"return-value": [
2,
3,
4
]
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
== Vector Summarizations and Norms
There is a set of functions that perform
summarizations and return norms of arrays. These functions
operate over an array and return a single
value. The following vector summarizations and norm functions are available:
`mult`, `add`, `sumSq`, `mean`, `l1norm`, `l2norm`, `linfnorm`.
The example below is using the `mult` function,
which multiplies all the values of an array.
[source,text]
----
mult(array(2,4,8))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"return-value": 64
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
The vector norm functions provide different formulas for calculating vector magnitude.
The example below calculates the *l2norm* of an array.
[source,text]
----
l2norm(array(2,4,8))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"return-value": 9.16515138991168
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
== Scalar Vector Math
Scalar vector math functions add, subtract, multiply or divide a scalar value with every value in a vector.
The following functions perform these operations: `scalarAdd`, `scalarSubtract`, `scalarMultiply`
and `scalarDivide`.
Below is an example of the `scalarMultiply` function, which multiplies the scalar value 3 with
every value of an array.
[source,text]
----
scalarMultiply(3, array(1,2,3))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"return-value": [
3,
6,
9
]
},
{
"EOF": true,
"RESPONSE_TIME": 0
}
]
}
}
----
== Element-By-Element Vector Math
Two vectors can be added, subtracted, multiplied and divided using element-by-element
vector math functions. The following element-by-element vector math functions are:
`ebeAdd`, `ebeSubtract`, `ebeMultiply`, `ebeDivide`.
The expression below performs the element-by-element subtraction of two arrays.
[source,text]
----
ebeSubtract(array(10, 15, 20), array(1,2,3))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"return-value": [
9,
13,
17
]
},
{
"EOF": true,
"RESPONSE_TIME": 5
}
]
}
}
----
== Dot Product and Cosine Similarity
The `dotProduct` and `cosineSimilarity` functions are often used as similarity measures between two
sparse vectors. The `dotProduct` is a measure of both angle and magnitude while `cosineSimilarity`
is a measure only of angle.
Below is an example of the `dotProduct` function:
[source,text]
----
dotProduct(array(2,3,0,0,0,1), array(2,0,1,0,0,3))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"return-value": 7
},
{
"EOF": true,
"RESPONSE_TIME": 15
}
]
}
}
----
Below is an example of the `cosineSimilarity` function:
[source,text]
----
cosineSimilarity(array(2,3,0,0,0,1), array(2,0,1,0,0,3))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"return-value": 0.5
},
{
"EOF": true,
"RESPONSE_TIME": 7
}
]
}
}
----


@ -0,0 +1,243 @@
= Streams and Vectorization
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
This section of the user guide explores techniques
for retrieving streams of data from Solr and vectorizing the
*numeric* fields.
The next chapter of the user guide covers
Text Analysis and Term Vectors which describes how to
vectorize *text* fields.
== Streams
Streaming Expressions has a wide range of stream sources that can be used to
retrieve data from Solr Cloud collections. Math expressions can be used
to vectorize and analyze the result sets.
Below are some of the key stream sources:
* *random*: Random sampling is widely used in statistics, probability and machine learning.
The `random` function returns a random sample of search results that match a
query. The random samples can be vectorized and operated on by math expressions and the results
can be used to describe and make inferences about the entire population.
* *timeseries*: The `timeseries`
expression provides fast distributed time series aggregations, which can be
vectorized and analyzed with math expressions.
* *knnSearch*: K-nearest neighbor is a core machine learning algorithm. The `knnSearch`
function is a specialized knn algorithm optimized to find the k-nearest neighbors of a document in
a distributed index. Once the nearest neighbors are retrieved they can be vectorized
and operated on by machine learning and text mining algorithms.
* *sql*: SQL is the primary query language used by data scientists. The `sql` function supports
data retrieval using a subset of SQL which includes both full text search and
fast distributed aggregations. The result sets can then be vectorized and operated
on by math expressions.
* *jdbc*: The `jdbc` function allows data from any JDBC compliant data source to be combined with
streams originating from Solr. Result sets from outside data sources can be vectorized and operated
on by math expressions in the same manner as result sets originating from Solr.
* *topic*: Messaging is an important foundational technology for large scale computing. The `topic`
function provides publish/subscribe messaging capabilities by treating
Solr Cloud as a distributed message queue. Topics are extremely powerful
because they allow subscription by query. Topics can be used to support a broad set of
use cases including bulk text mining operations and AI alerting.
* *nodes*: Graph queries are frequently used by recommendation engines and are an important
machine learning tool. The `nodes` function provides fast, distributed, breadth-first
graph traversal over documents in a Solr Cloud collection. The node sets collected
by the `nodes` function can be operated on by statistical and machine learning expressions to
gain more insight into the graph.
* *search*: Ranked search results are a powerful tool for finding the most relevant
documents from a large document corpus. The `search` expression
returns the top N ranked search results that match any
Solr query, including geo-spatial queries. The smaller set of relevant
documents can then be explored with statistical, machine learning and
text mining expressions to gather insights about the data set.
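As a quick illustration of one of these sources, the `search` expression below is a minimal sketch
(the collection, fields and parameter values are illustrative) that returns the top three documents
for a query, sorted on the *price_f* field. The tuples it emits can be assigned to a variable and
vectorized in the same way as the `random` results shown in the next section:
[source,text]
----
search(collection1,
       q="*:*",
       fl="id, price_f",
       sort="price_f desc",
       qt="/select",
       rows="3")
----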
== Assigning Streams to Variables
The output of any streaming expression can be set to a variable.
Below is a very simple example using the `random` function to fetch
three random samples from collection1. The random samples are returned
as *tuples*, which contain name/value pairs.
[source,text]
----
let(a=random(collection1, q="*:*", rows="3", fl="price_f"))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"a": [
{
"price_f": 0.7927976
},
{
"price_f": 0.060795486
},
{
"price_f": 0.55128294
}
]
},
{
"EOF": true,
"RESPONSE_TIME": 11
}
]
}
}
----
== Creating a Vector with the *col* Function
The `col` function iterates over a list of tuples and copies the values
from a specific column into an *array*.
The output of the `col` function is a numeric array that can be set to a
variable and operated on by math expressions.
Below is an example of the `col` function:
[source,text]
----
let(a=random(collection1, q="*:*", rows="3", fl="price_f"),
b=col(a, price_f))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"b": [
0.42105234,
0.85237443,
0.7566981
]
},
{
"EOF": true,
"RESPONSE_TIME": 9
}
]
}
}
----
== Applying Math Expressions to the Vector
Once a vector has been created, any math expression that operates on vectors
can be applied. In the example below the `mean` function is applied to
the vector assigned to variable *b*.
[source,text]
----
let(a=random(collection1, q="*:*", rows="15000", fl="price_f"),
b=col(a, price_f),
c=mean(b))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"c": 0.5016035594638814
},
{
"EOF": true,
"RESPONSE_TIME": 306
}
]
}
}
----
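Other vector functions can be substituted for `mean` in the same pattern. As a sketch, assuming the
`describe` function (which returns a set of descriptive statistics for a vector) is available in your
version of Solr, the expression below summarizes the same vector:
[source,text]
----
let(a=random(collection1, q="*:*", rows="15000", fl="price_f"),
    b=col(a, price_f),
    c=describe(b))
----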
== Creating Matrices
Matrices can be created by vectorizing multiple numeric fields
and adding them to a matrix. The matrices can then be operated on by
any math expression that operates on matrices.
Note that this section deals with the creation of matrices
from numeric data. The next chapter of the user guide, Text Analysis and Term Vectors,
describes how to build TF-IDF term vector matrices from text fields.
Below is a simple example where four random samples are taken
from different sub-populations in the data. The *price_f* field of
each random sample is
vectorized and the vectors are added as rows to a matrix.
Then the `sumRows`
function is applied to the matrix to return a vector containing
the sum of each row.
[source,text]
----
let(a=random(collection1, q="market:A", rows="5000", fl="price_f"),
b=random(collection1, q="market:B", rows="5000", fl="price_f"),
c=random(collection1, q="market:C", rows="5000", fl="price_f"),
d=random(collection1, q="market:D", rows="5000", fl="price_f"),
e=col(a, price_f),
f=col(b, price_f),
g=col(c, price_f),
h=col(d, price_f),
i=matrix(e, f, g, h),
j=sumRows(i))
----
When this expression is sent to the /stream handler it responds with:
[source,json]
----
{
"result-set": {
"docs": [
{
"j": [
154390.1293375,
167434.89453,
159293.258493,
149773.42769,
]
},
{
"EOF": true,
"RESPONSE_TIME": 9
}
]
}
}
----
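Other matrix functions can be applied in the same way. As a sketch, assuming the `transpose` function is
available, the expression below builds a matrix from two of the sub-populations and transposes it so that
each market becomes a column rather than a row:
[source,text]
----
let(a=random(collection1, q="market:A", rows="5000", fl="price_f"),
    b=random(collection1, q="market:B", rows="5000", fl="price_f"),
    c=col(a, price_f),
    d=col(b, price_f),
    e=matrix(c, d),
    f=transpose(e))
----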


@@ -31,10 +31,12 @@ public class FieldValueEvaluator extends SourceEvaluator {
   private static final long serialVersionUID = 1L;
   private String fieldName;
+  private boolean literal;

   public FieldValueEvaluator(String fieldName) {
-    if(fieldName.startsWith("'") && fieldName.endsWith("'") && fieldName.length() > 1){
+    if(fieldName.startsWith("\"") && fieldName.endsWith("\"") && fieldName.length() > 1){
       fieldName = fieldName.substring(1, fieldName.length() - 1);
+      literal = true;
     }

     this.fieldName = fieldName;
@@ -42,6 +44,10 @@ public class FieldValueEvaluator extends SourceEvaluator {
   @Override
   public Object evaluate(Tuple tuple) throws IOException {
+    if(literal) {
+      return fieldName;
+    }
+
     Object value = tuple.get(fieldName);

     // This is somewhat radical.
@@ -84,10 +90,6 @@ public class FieldValueEvaluator extends SourceEvaluator {
       }
     }

-    if(value == null) {
-      return fieldName;
-    }
-
     return value;
   }