mirror of https://github.com/apache/lucene.git
SOLR-11947: Squashed commit of the following ref guide changes:
All of the squashed commits were authored by Joel Bernstein <jbernste@apache.org>:

* 61053f2fe373bff0b451f549e063550f08ecdac1 (Mon Mar 26 12:44:12 2018 -0400) SOLR-11947: Fix orphaned files
* 42302073bf61fde134caeff71b6db3978e113b4d (Mon Mar 26 12:27:26 2018 -0400) SOLR-11947: small change
* b16b1453c2e7d5083f588b4b874c918d521e9fe5 (Mon Mar 26 12:23:17 2018 -0400) SOLR-11947: proofing
* 57265ce4659a427c179e206b79d8fe05b01a5f93 (Sat Mar 24 14:41:48 2018 -0400) SOLR-11947: monte carlo WIP
* 04e8381f6b5b329c5fa17c1f31c2d848fe9cec2a (Fri Mar 23 16:24:10 2018 -0400) SOLR-11947: probabiity WIP
* 4298a6d514e7e431e322a4f62c22c336430a89f1 (Fri Mar 23 13:07:05 2018 -0400) SOLR-11947: time series WIP
* 1a7654f9225948cd4adb3056bc2192cc0d24b3ee (Fri Mar 23 11:32:53 2018 -0400) SOLR-11947: machine learning WIP
* fae0c3aa46e6f26fecb59077207982b2f584ec86 (Thu Mar 22 22:14:15 2018 -0400) SOLR-11947: machine learning WIP
* fb6a96b2bdc4bbc4c2b5b62b6e69cd561ef9e31b (Thu Mar 22 14:36:08 2018 -0400) SOLR-11947: numerical analysis WIP
* a648ba939c90caf5db2a5b88023bd580d4d1e8af (Thu Mar 22 12:27:33 2018 -0400) SOLR-11947: numerical analysis WIP
* ce8f1b710d414d8e3ff3c8676f64fc3017316a15 (Wed Mar 21 19:56:10 2018 -0400) SOLR-11947: numerical analysis WIP
* 5e25a4884341cdd84988e13250f255eb23d7fd50 (Tue Mar 20 22:01:59 2018 -0400) SOLR-11947: Curve fitting WIP
* f381414dc44ecfa781988c5ca75bfb1c80de6674 (Tue Mar 20 21:49:39 2018 -0400) SOLR-11947: Curve fitting WIP
* 4be725132215ed44cc84587bb0d11be216360b74 (Mon Mar 19 19:55:10 2018 -0400) SOLR-11947: Monte Carlo WIP
* d330b412e46be0ebf8d75e99295e3fe9f978c02c (Sun Mar 18 22:00:55 2018 -0400) SOLR-11947: Probability WIP
* e3d6160c1fa650e054b9694c57d34b3950c80175 (Sat Mar 17 21:18:43 2018 -0400) SOLR-11947: More WIP
* 8484b0283f79825dee8eaee82604120d04511de4 (Fri Mar 16 15:03:06 2018 -0400) SOLR-11947: machine learning WIP
* 77ecfdc71d79ca8eded0355669310c6025c70d96 (Thu Mar 15 21:33:09 2018 -0400) SOLR-11947: machine learning WIP
* 7488caf5e54436a0e5fe85c0dda4ea31d8357600 (Thu Mar 15 19:08:50 2018 -0400) SOLR-11947: machine learning WIP
* 102ee2e1857e7d7f45d7f3195a0a4e91eacb766d (Thu Mar 15 15:18:31 2018 -0400) SOLR-11947: machine learning WIP
* 0d5cd2b4a4fd012fe6d640a86733280702cf8673 (Wed Mar 14 21:49:15 2018 -0400) SOLR-11947: numerical analysis WIP
* 31eec30576479a9023c7b0e6ccb2d9f685e128a1 (Wed Mar 14 14:41:06 2018 -0400) SOLR-11947: numerical analysis WIP
* c6e324ac56ca6e9f229d6acb39fdcf60c3356230 (Tue Mar 13 15:16:26 2018 -0400) SOLR-11947: term vectors WIP
* 8c843999eabdb82665641caa9c21f07e95b70a86 (Mon Mar 12 18:03:53 2018 -0400) SOLR-11947: Add curve fitting to TOC
* 09be026f6ad400d965fd373403d7a2eb2fae0c90 (Mon Mar 12 15:36:05 2018 -0400) SOLR-11947: Text analysis WIP
* e48b4d69abadb603a90c052aa1e36dd60ae7fd33 (Sun Mar 11 18:29:20 2018 -0400) SOLR-11947: TOC changes
* f71ebc079713e16492ba45cedafc3b9512f6bae2 (Sat Mar 10 17:54:04 2018 -0500) SOLR-11947: WIP term vectors
* ebc6b3943a27454adaf1a2309b6720bb2ba63c8c (Sat Mar 10 13:34:19 2018 -0500) SOLR-11947: WIP regression
* 44752b2d34f46bc7f5693839e42ab3cef9edc47c (Fri Mar 9 22:40:40 2018 -0500) SOLR-11947: WIP for vectorization.adoc
* 43254fcb05386264a6d591b1fa2c2573dcc2d2a3 (Fri Mar 9 19:42:26 2018 -0500) SOLR-11947: Test local links
* b60df2000978f70720eb0a36543752fd3bf07d2c (Thu Mar 8 21:41:17 2018 -0500) SOLR-11947: Update math-expressions TOC
* de068c3af8557d60de37cb29f3ed7da3f5442772 (Thu Mar 8 21:24:46 2018 -0500) SOLR-11947: Continued work on math expressions documentation.
* fe445f2c997ea825d1ae9b9912406521249befc0 (Sun Mar 4 20:22:33 2018 -0500) SOLR-12054: ebeAdd and ebeSubtract should support matrix operations
* 1f3ae745cc26453a34a64a4327ceac7cc91d23f5 (Sun Mar 4 13:24:54 2018 -0500) SOLR-11947: Initial commit for new math expression docs WIP
This commit is contained in:
parent dc2ad7022c, commit 1ed4e226ac

@ -0,0 +1,182 @@

= Curve Fitting
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

== Polynomial Curve Fitting

The `polyfit` function is a general purpose curve fitter used to model
the *non-linear* relationship between two random variables.

The `polyfit` function is passed *x* and *y* axes and fits a smooth curve to the data.
If only a single array is provided it is treated as the *y* axis and a sequence is generated
for the *x* axis.

The `polyfit` function also has a parameter that specifies the degree of the polynomial. The higher
the degree, the more curves that can be modeled.

The example below uses the `polyfit` function to fit a curve to an array using
a 3 degree polynomial. The fitted curve is then subtracted from the original curve. The output
shows the error between the fitted curve and the original curve, known as the residuals.
The output also includes the sum-of-squares of the residuals, which provides a measure
of how large the error is.

[source,text]
----
let(echo="residuals, sumSqError",
    y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 6, 5, 5, 3, 2, 1, 0),
    curve=polyfit(y, 3),
    residuals=ebeSubtract(y, curve),
    sumSqError=sumSq(residuals))
----

When this expression is sent to the /stream handler it
responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "residuals": [
          0.5886274509803899,
          -0.0746078431372561,
          -0.49492135315664765,
          -0.6689571213100631,
          -0.5933591898297781,
          0.4352283990519288,
          0.32016160310277897,
          1.1647963800904968,
          0.272488687782805,
          -0.3534055160525744,
          0.2904697263520779,
          -0.7925296272355089,
          -0.5990476190476182,
          -0.12572829131652274,
          0.6307843137254909
        ],
        "sumSqError": 4.7294282482223595
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

In the next example the curve is fit using a 5 degree polynomial. Notice that the higher
degree polynomial produces a closer fit, shown by the smaller residuals and the lower value
for the sum-of-squares of the residuals.

[source,text]
----
let(echo="residuals, sumSqError",
    y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 6, 5, 5, 3, 2, 1, 0),
    curve=polyfit(y, 5),
    residuals=ebeSubtract(y, curve),
    sumSqError=sumSq(residuals))
----

When this expression is sent to the /stream handler it
responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "residuals": [
          -0.12337461300309674,
          0.22708978328173413,
          0.12266015718028167,
          -0.16502738747320755,
          -0.41142804563857105,
          0.2603044014808713,
          -0.12128970101106162,
          0.6234168308471704,
          -0.1754692675745293,
          -0.5379689969473249,
          0.4651616185671843,
          -0.288175756132409,
          0.027970945463215102,
          0.18699690402476687,
          -0.09086687306501587
        ],
        "sumSqError": 1.413089480179252
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

== Prediction, Derivatives and Integrals

The `polyfit` function returns an array which contains the *y* value data points
of the fitted curve.

In order to predict values along the curve an interpolation function must be created
for the curve. Once an interpolation function has been created the `predict`,
`derivative` and `integral` functions can be applied to the curve.

In the example below the *x* axis is included for clarity.
The `polyfit` function returns an array with the fitted curve.
A linear interpolation function is then created for the curve with the `lerp` function.
The `predict` function is then used to predict a value along the curve, in this
case the prediction is made for the *x* value of .5.

[source,text]
----
let(x=array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14),
    y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 6, 5, 5, 3, 2, 1, 0),
    curve=polyfit(x, y, 5),
    interp=lerp(x, curve),
    p=predict(interp, .5))
----

When this expression is sent to the /stream handler it
responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "p": 0.4481424148606813
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----
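
The `derivative` and `integral` functions work from the same interpolation model. Below is a
minimal sketch that computes the derivative of the fitted curve, assuming the `derivative`
function accepts the interpolation model just as `predict` does:

[source,text]
----
let(x=array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14),
    y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 6, 5, 5, 3, 2, 1, 0),
    curve=polyfit(x, y, 5),
    interp=lerp(x, curve),
    der=derivative(interp))
----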

@ -0,0 +1,680 @@

= Machine Learning
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

This section of the math expressions user guide covers machine learning
functions.

== Feature Scaling

Before performing machine learning operations it's often necessary to
scale the feature vectors so they can be compared at the same scale.

All the scaling functions operate on vectors and matrices.
When operating on a matrix the *rows* of the matrix are scaled.

=== Min/Max Scaling

The `minMaxScale` function scales a vector or matrix between a min and
max value. By default it will scale between 0 and 1 if min/max values
are not provided.

Below is a simple example of min/max scaling between 0 and 1.
Notice that once brought into the same scale the vectors are the same.

[source,text]
----
let(a=array(20, 30, 40, 50),
    b=array(200, 300, 400, 500),
    c=matrix(a, b),
    d=minMaxScale(c))
----

This expression returns the following response:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "d": [
          [
            0,
            0.3333333333333333,
            0.6666666666666666,
            1
          ],
          [
            0,
            0.3333333333333333,
            0.6666666666666666,
            1
          ]
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----
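
Because the scaling functions operate on vectors as well as matrices, a single array can be
scaled directly. Below is a minimal sketch that scales a vector between explicit bounds of
0 and 100, assuming the min and max values are passed as the second and third parameters of
`minMaxScale`:

[source,text]
----
let(a=array(20, 30, 40, 50),
    b=minMaxScale(a, 0, 100))
----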

=== Standardization

The `standardize` function scales a vector so that it has a
mean of 0 and a standard deviation of 1. Standardization can be
used with machine learning algorithms, such as SVM, that
perform better when the data has a normal distribution.

[source,text]
----
let(a=array(20, 30, 40, 50),
    b=array(200, 300, 400, 500),
    c=matrix(a, b),
    d=standardize(c))
----

This expression returns the following response:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "d": [
          [
            -1.161895003862225,
            -0.3872983346207417,
            0.3872983346207417,
            1.161895003862225
          ],
          [
            -1.1618950038622249,
            -0.38729833462074165,
            0.38729833462074165,
            1.1618950038622249
          ]
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 17
      }
    ]
  }
}
----

=== Unitize

The `unitize` function scales vectors to a magnitude of 1. A vector with a
magnitude of 1 is known as a unit vector. Unit vectors are
preferred when the vector math deals
with vector direction rather than magnitude.

[source,text]
----
let(a=array(20, 30, 40, 50),
    b=array(200, 300, 400, 500),
    c=matrix(a, b),
    d=unitize(c))
----

This expression returns the following response:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "d": [
          [
            0.2721655269759087,
            0.40824829046386296,
            0.5443310539518174,
            0.6804138174397716
          ],
          [
            0.2721655269759087,
            0.4082482904638631,
            0.5443310539518174,
            0.6804138174397717
          ]
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 6
      }
    ]
  }
}
----

== Distance

The `distance` function computes a distance measure for two
numeric arrays or a *distance matrix* for the columns of a matrix.

There are four distance measures currently supported:

* euclidean (default)
* manhattan
* canberra
* earthMovers

Below is an example for computing euclidean distance for
two numeric arrays:

[source,text]
----
let(a=array(20, 30, 40, 50),
    b=array(21, 29, 41, 49),
    c=distance(a, b))
----

This expression returns the following response:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "c": 2
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

Below is an example for computing a distance matrix for the columns
of a matrix:

[source,text]
----
let(a=array(20, 30, 40),
    b=array(21, 29, 41),
    c=array(31, 40, 50),
    d=matrix(a, b, c),
    e=distance(d))
----

This expression returns the following response:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "e": [
          [
            0,
            15.652475842498529,
            34.07345007480164
          ],
          [
            15.652475842498529,
            0,
            18.547236990991408
          ],
          [
            34.07345007480164,
            18.547236990991408,
            0
          ]
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 24
      }
    ]
  }
}
----

== K-means Clustering

The `kmeans` function performs k-means clustering of the rows of a matrix.
Once the clustering has been completed there are a number of useful functions available
for examining the *clusters* and *centroids*.

The examples below cluster *term vectors*.
The chapter on link:term-vectors.adoc[Text Analysis and Term Vectors] should be
consulted for a full explanation of these features.

=== Centroid Features

In the example below the `kmeans` function is used to cluster a result set from the Enron email dataset
and then the top features are extracted from the cluster centroids.

Let's look at what data is assigned to each variable:

* *a*: The `random` function returns a sample of 500 documents from the *enron*
collection that match the query *body:oil*. The `select` function selects the *id* field and
annotates each tuple with the analyzed bigram terms from the body field.
* *b*: The `termVectors` function creates a TF-IDF term vector matrix from the
tuples stored in variable *a*. Each row in the matrix represents a document. The columns of the matrix
are the bigram terms that were attached to each tuple.
* *c*: The `kmeans` function clusters the rows of the matrix into 5 clusters. The k-means clustering is performed using the
*Euclidean distance* measure.
* *d*: The `getCentroids` function returns a matrix of cluster centroids. Each row in the matrix is a centroid
from one of the 5 clusters. The columns of the matrix are the same bigram terms as the term vector matrix.
* *e*: The `topFeatures` function returns the column labels for the top 5 features of each centroid in the matrix.
This returns the top 5 bigram terms for each centroid.

[source,text]
----
let(a=select(random(enron, q="body:oil", rows="500", fl="id, body"),
             id,
             analyze(body, body_bigram) as terms),
    b=termVectors(a, maxDocFreq=.10, minDocFreq=.05, minTermLength=14, exclude="_,copyright"),
    c=kmeans(b, 5),
    d=getCentroids(c),
    e=topFeatures(d, 5))
----

This expression returns the following response:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "e": [
          [
            "enron enronxgate",
            "north american",
            "energy services",
            "conference call",
            "power generation"
          ],
          [
            "financial times",
            "chief financial",
            "financial officer",
            "exchange commission",
            "houston chronicle"
          ],
          [
            "southern california",
            "california edison",
            "public utilities",
            "utilities commission",
            "rate increases"
          ],
          [
            "rolling blackouts",
            "public utilities",
            "electricity prices",
            "federal energy",
            "price controls"
          ],
          [
            "california edison",
            "regulatory commission",
            "southern california",
            "federal energy",
            "power generators"
          ]
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 982
      }
    ]
  }
}
----

=== Cluster Features

The example below examines the top features of a specific cluster. This example uses the same techniques
as the centroids example but the top features are extracted from a cluster rather than the centroids.

The `getCluster` function returns a cluster by its index. Each cluster is a matrix containing term vectors
that have been clustered together based on their features.

In the example below the `topFeatures` function is used to extract the top 5 features from each term vector
in the cluster.

[source,text]
----
let(a=select(random(collection3, q="body:oil", rows="500", fl="id, body"),
             id,
             analyze(body, body_bigram) as terms),
    b=termVectors(a, maxDocFreq=.09, minDocFreq=.03, minTermLength=14, exclude="_,copyright"),
    c=kmeans(b, 25),
    d=getCluster(c, 0),
    e=topFeatures(d, 5))
----

This expression returns the following response:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "e": [
          [
            "electricity board",
            "maharashtra state",
            "power purchase",
            "state electricity",
            "reserved enron"
          ],
          [
            "electricity board",
            "maharashtra state",
            "state electricity",
            "purchase agreement",
            "independent power"
          ],
          [
            "maharashtra state",
            "reserved enron",
            "federal government",
            "state government",
            "dabhol project"
          ],
          [
            "purchase agreement",
            "power purchase",
            "electricity board",
            "maharashtra state",
            "state government"
          ],
          [
            "investment grade",
            "portland general",
            "general electric",
            "holding company",
            "transmission lines"
          ],
          [
            "state government",
            "state electricity",
            "purchase agreement",
            "electricity board",
            "maharashtra state"
          ],
          [
            "electricity board",
            "state electricity",
            "energy management",
            "maharashtra state",
            "energy markets"
          ],
          [
            "electricity board",
            "maharashtra state",
            "state electricity",
            "state government",
            "second quarter"
          ]
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 978
      }
    ]
  }
}
----

== Multi K-means Clustering

K-means clustering will produce different results depending on
the initial placement of the centroids. K-means is fast enough
that multiple trials can be performed and the best outcome selected.
The `multiKmeans` function runs the k-means
clustering algorithm for a given number of trials and selects the
best result based on which trial produces the lowest intra-cluster
variance.

The example below is identical to the centroids example except that
it uses `multiKmeans` with 100 trials, rather than a single
trial of the `kmeans` function.

[source,text]
----
let(a=select(random(collection3, q="body:oil", rows="500", fl="id, body"),
             id,
             analyze(body, body_bigram) as terms),
    b=termVectors(a, maxDocFreq=.09, minDocFreq=.03, minTermLength=14, exclude="_,copyright"),
    c=multiKmeans(b, 5, 100),
    d=getCentroids(c),
    e=topFeatures(d, 5))
----

This expression returns the following response:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "e": [
          [
            "enron enronxgate",
            "energy trading",
            "energy markets",
            "energy services",
            "unleaded gasoline"
          ],
          [
            "maharashtra state",
            "electricity board",
            "state electricity",
            "energy trading",
            "chief financial"
          ],
          [
            "price controls",
            "electricity prices",
            "francisco chronicle",
            "wholesale electricity",
            "power generators"
          ],
          [
            "southern california",
            "california edison",
            "public utilities",
            "francisco chronicle",
            "utilities commission"
          ],
          [
            "california edison",
            "power purchases",
            "system operator",
            "term contracts",
            "independent system"
          ]
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 1182
      }
    ]
  }
}
----

== Fuzzy K-means Clustering

The `fuzzyKmeans` function is a soft clustering algorithm which
allows vectors to be assigned to more than one cluster. The *fuzziness* parameter
is a value between 1 and 2 that determines how fuzzy to make the cluster assignment.

After the clustering has been performed the `getMembershipMatrix` function can be called
on the clustering result to return a matrix describing which clusters each vector belongs to.
There is a row in the matrix for each vector that was clustered. There is a column in the matrix
for each cluster. The values in the columns are the probability that the vector belongs to the specific
cluster.

A simple example will make this clearer. In the example below 300 documents are analyzed and
then turned into a term vector matrix. Then the `fuzzyKmeans` function clusters the
term vectors into 12 clusters with a fuzziness factor of 1.25.

The `getMembershipMatrix` function is used to return the membership matrix and the first row
of the membership matrix is retrieved with the `rowAt` function. The `precision` function is then applied to the first row
of the matrix to make it easier to read.

The output shows a single vector representing the cluster membership probabilities for the first
term vector. Notice that the term vector has the highest association with the 12th cluster,
but also has significant associations with the 3rd, 5th, 6th and 7th clusters.

[source,text]
----
let(a=select(random(collection3, q="body:oil", rows="300", fl="id, body"),
             id,
             analyze(body, body_bigram) as terms),
    b=termVectors(a, maxDocFreq=.09, minDocFreq=.03, minTermLength=14, exclude="_,copyright"),
    c=fuzzyKmeans(b, 12, fuzziness=1.25),
    d=getMembershipMatrix(c),
    e=rowAt(d, 0),
    f=precision(e, 5))
----

This expression returns the following response:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "f": [
          0,
          0,
          0.178,
          0,
          0.17707,
          0.17775,
          0.16214,
          0,
          0,
          0,
          0,
          0.30504
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 2157
      }
    ]
  }
}
----

== K-nearest Neighbor

The `knn` function searches the rows of a matrix for the
k-nearest neighbors of a search vector. The `knn` function
returns a *matrix* of the k-nearest neighbors. The `knn` function
has a *named parameter* called *distance* which specifies the distance measure.
There are four distance measures currently supported:

* euclidean (default)
* manhattan
* canberra
* earthMovers

The example below builds on the clustering examples to demonstrate
the `knn` function.

In the example, the centroids matrix is set to variable *d*. The first
centroid vector is selected from the matrix with the `rowAt` function.
Then the `knn` function is used to find the 3 nearest neighbors
to the centroid vector in the term vector matrix (variable *b*).

The `knn` function returns a matrix with the 3 nearest neighbors based on the
default distance measure, which is euclidean. Finally, the top 4 features
of the term vectors in the nearest neighbor matrix are returned.

[source,text]
----
let(a=select(random(collection3, q="body:oil", rows="500", fl="id, body"),
             id,
             analyze(body, body_bigram) as terms),
    b=termVectors(a, maxDocFreq=.09, minDocFreq=.03, minTermLength=14, exclude="_,copyright"),
    c=multiKmeans(b, 5, 100),
    d=getCentroids(c),
    e=rowAt(d, 0),
    g=knn(b, e, 3),
    h=topFeatures(g, 4))
----

This expression returns the following response:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "h": [
          [
            "california power",
            "electricity supply",
            "concerned about",
            "companies like"
          ],
          [
            "maharashtra state",
            "california power",
            "electricity board",
            "alternative energy"
          ],
          [
            "electricity board",
            "maharashtra state",
            "state electricity",
            "houston chronicle"
          ]
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 1243
      }
    ]
  }
}
----

@ -0,0 +1,59 @@

= Math Expressions
:page-children: scalar-math, vector-math, variables, matrix-math, vectorization, term-vectors, statistics, probability, montecarlo, time-series, regression, numerical-analysis, curve-fitting, machine-learning

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

The Streaming Expression library includes a powerful
mathematical programming syntax with many of the features of a
functional programming language. The syntax includes variables,
data structures and a growing set of mathematical functions.

This user guide provides an overview of the different areas of
mathematical coverage, starting with basic scalar math and
ending with machine learning. Along the way the guide covers variables
and data structures and techniques for combining Solr's
powerful streams with mathematical functions to make every
record in your SolrCloud cluster computable.

== link:scalar-math.adoc[Scalar Math]

== link:vector-math.adoc[Vector Math]

== link:variables.adoc[Variables]

== link:matrix-math.adoc[Matrix Math]

== link:vectorization.adoc[Streams and Vectorization]

== link:term-vectors.adoc[Text Analysis and Term Vectors]

== link:statistics.adoc[Statistics]

== link:probability.adoc[Probability]

== link:montecarlo.adoc[Monte Carlo Simulations]

== link:time-series.adoc[Time Series]

== link:regression.adoc[Linear Regression]

== link:numerical-analysis.adoc[Interpolation, Derivatives and Integrals]

== link:curve-fitting.adoc[Curve Fitting]

== link:machine-learning.adoc[Machine Learning]

@ -0,0 +1,443 @@

= Matrices and Matrix Math
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

This section of the user guide covers the
basics of matrix creation, manipulation and matrix math. Other sections
of the user guide demonstrate how matrices are used by the statistics,
probability and machine learning functions.

== Matrix Creation

A matrix can be created with the `matrix` function.
The `matrix` function is passed a list of arrays, with
each array representing a *row* in the matrix.

The example below creates a two-by-two matrix.

[source,text]
----
matrix(array(1, 2),
       array(4, 5))
----

When this expression is sent to the /stream handler it
responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "return-value": [
          [
            1,
            2
          ],
          [
            4,
            5
          ]
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

== Accessing Rows and Columns

The rows and columns of a matrix can be accessed using the `rowAt`
and `colAt` functions.

The example below creates a 2 by 2 matrix and returns the second column of the matrix.
Notice that the matrix is created from variables in this example rather than
directly from a list of arrays.

[source,text]
----
let(a=array(1, 2),
    b=array(4, 5),
    c=matrix(a, b),
    d=colAt(c, 1))
----

When this expression is sent to the /stream handler it
responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "d": [
          2,
          5
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----
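
Below is a companion sketch for row access. The `rowAt` function is used the same way to
return the second *row* of the matrix:

[source,text]
----
let(a=array(1, 2),
    b=array(4, 5),
    c=matrix(a, b),
    d=rowAt(c, 1))
----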

== Row and Column Labels

A matrix can have row and column labels. The functions
`setRowLabels`, `setColumnLabels`, `getRowLabels` and `getColumnLabels`
can be used to set and get the labels. The label values
are set using string arrays.

The example below sets the row and column labels. In other sections of the
user guide examples are shown where functions return matrices
with the labels already set.

Below is a simple example of setting and
getting row and column labels
on a matrix.

[source,text]
----
let(echo="d, e",
    a=matrix(array(1, 2),
             array(4, 5)),
    b=setRowLabels(a, array("row0", "row1")),
    c=setColumnLabels(b, array("col0", "col1")),
    d=getRowLabels(c),
    e=getColumnLabels(c))
----

When this expression is sent to the /stream handler it
responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "d": [
          "row0",
          "row1"
        ],
        "e": [
          "col0",
          "col1"
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

== Matrix Attributes

A matrix can also have an arbitrary set of named attributes associated
with it. Certain functions, such as the `termVectors` function,
return matrices that contain attributes that describe data in the matrix.

Attributes can be retrieved by name using the `getAttribute` function and
the entire attribute map can be returned using the `getAttributes`
function.
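
Below is a minimal sketch that builds a term vector matrix, as in the machine learning
examples, and returns its attribute map with the `getAttributes` function. The attributes
returned depend on the function that produced the matrix:

[source,text]
----
let(a=select(random(collection3, q="body:oil", rows="500", fl="id, body"),
             id,
             analyze(body, body_bigram) as terms),
    b=termVectors(a, maxDocFreq=.10, minDocFreq=.05, minTermLength=14, exclude="_,copyright"),
    c=getAttributes(b))
----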

== Matrix Dimensions

The dimensions of a matrix can be determined using the
`rowCount` and `columnCount` functions.

The example below retrieves the dimensions of a matrix.

[source,text]
----
let(echo="b,c",
    a=matrix(array(1, 2, 3),
             array(4, 5, 6)),
    b=rowCount(a),
    c=columnCount(a))
----

When this expression is sent to the /stream handler it
responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "b": 2,
        "c": 3
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

== Matrix Transposition

A matrix can be https://en.wikipedia.org/wiki/Transpose[transposed]
using the `transpose` function.

An example of matrix transposition is shown below:

[source,text]
----
let(a=matrix(array(1, 2),
             array(4, 5)),
    b=transpose(a))
----

When this expression is sent to the /stream handler it
responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "b": [
          [
            1,
            4
          ],
          [
            2,
            5
          ]
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 24
      }
    ]
  }
}
----

== Matrix Summations

The rows and columns of a matrix can be summed with the `sumRows` and `sumColumns` functions.
Below is an example of the `sumRows` function which returns an
array with the sum of each row.

[source,text]
----
let(a=matrix(array(1, 2, 3),
             array(4, 5, 6)),
    b=sumRows(a))
----

When this expression is sent to the /stream handler it
responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "b": [
          6,
          15
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 2
      }
    ]
  }
}
----
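
The `sumColumns` function is used in the same way. Below is a minimal sketch that should
return an array with the sum of each column:

[source,text]
----
let(a=matrix(array(1, 2, 3),
             array(4, 5, 6)),
    b=sumColumns(a))
----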

The `grandSum` function returns the sum of all values in the matrix.
Below is an example of the `grandSum` function:

[source,text]
----
let(a=matrix(array(1, 2, 3),
             array(4, 5, 6)),
    b=grandSum(a))
----

When this expression is sent to the /stream handler it
responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "b": 21
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

== Scalar Matrix Math

The same scalar math functions that apply to vectors can also be applied to matrices: `scalarAdd`, `scalarSubtract`,
`scalarMultiply`, `scalarDivide`. Below is an example of the `scalarAdd` function,
which adds a scalar value to each element in a matrix.

[source,text]
----
let(a=matrix(array(1, 2),
             array(4, 5)),
    b=scalarAdd(10, a))
----

When this expression is sent to the /stream handler it
responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "b": [
          [
            11,
            12
          ],
          [
            14,
            15
          ]
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----
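
The other scalar functions follow the same pattern. Below is a sketch that multiplies each
element of the matrix by 10 with the `scalarMultiply` function, assuming the same argument
order as `scalarAdd`:

[source,text]
----
let(a=matrix(array(1, 2),
             array(4, 5)),
    b=scalarMultiply(10, a))
----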

== Matrix Addition and Subtraction

Two matrices can be added and subtracted using the `ebeAdd` and `ebeSubtract` functions,
which perform element-by-element addition
and subtraction of matrices.

Below is a simple example of an element-by-element addition of a matrix by itself:

[source,text]
----
let(a=matrix(array(1, 2),
             array(4, 5)),
    b=ebeAdd(a, a))
----

When this expression is sent to the /stream handler it
responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "b": [
          [
            2,
            4
          ],
          [
            8,
            10
          ]
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

== Matrix Multiplication

Matrix multiplication can be accomplished using the `matrixMult` function. Below is a simple
example of matrix multiplication:

[source,text]
----
let(a=matrix(array(1, 2),
             array(4, 5)),
    b=matrix(array(11, 12),
             array(14, 15)),
    c=matrixMult(a, b))
----

When this expression is sent to the /stream handler it
responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "c": [
          [
            39,
            42
          ],
          [
            114,
            123
          ]
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

@ -0,0 +1,213 @@

= Monte Carlo Simulations
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

Monte Carlo simulations are commonly used to model the behavior of
stochastic systems. This section of the user guide describes
how to perform both *uncorrelated* and *correlated* Monte Carlo simulations
using the *sampling* capabilities of the probability distribution framework.

=== Uncorrelated Simulations

Uncorrelated Monte Carlo simulations model stochastic systems with the assumption
that the underlying random variables move independently of each other.
A simple example of a Monte Carlo simulation using two independently changing random variables
is described below.

In this example a Monte Carlo simulation is used to determine the probability that a simple hinge assembly will
fall within a required length specification.

The hinge has two components, *A* and *B*. The combined length of the two components must be less than 5 centimeters
to fall within specification.

A random sampling of lengths for component *A* has shown that its length conforms to a
normal distribution with a mean of 2.2 centimeters and a standard deviation of .0195
centimeters.

A random sampling of lengths for component *B* has shown that its length conforms
to a normal distribution with a mean of 2.71 centimeters and a standard deviation of .0198 centimeters.

The Monte Carlo simulation below performs the following steps:

* A normal distribution with a mean of 2.2 and a standard deviation of .0195 is created to model the length of componentA.
* A normal distribution with a mean of 2.71 and a standard deviation of .0198 is created to model the length of componentB.
* The `monteCarlo` function is used to simulate component pairs. The `monteCarlo` function
calls the *add(sample(componentA), sample(componentB))* function 100000 times and collects the results in an array. Each
time the function is called a random sample is drawn from the componentA
and componentB length distributions. The `add` function adds the two samples to calculate the combined length.
The result of each function run is collected in an array and assigned to the *simresults* variable.
* An `empiricalDistribution` function is then created from the *simresults* array to model the distribution of the
simulation results.
* Finally, the `cumulativeProbability` function is called on the *simmodel* to determine the cumulative probability
that the combined length of the components is 5 or less.
* Based on the simulation there is a .9994371944629039 probability that the combined length of a component pair will
be 5 or less.

[source,text]
----
let(componentA=normalDistribution(2.2, .0195),
    componentB=normalDistribution(2.71, .0198),
    simresults=monteCarlo(add(sample(componentA), sample(componentB)), 100000),
    simmodel=empiricalDistribution(simresults),
    prob=cumulativeProbability(simmodel, 5))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "prob": 0.9994371944629039
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 660
      }
    ]
  }
}
----

=== Correlated Simulations

The simulation above assumes that the lengths of *componentA* and *componentB* vary independently.
What would happen to the probability model if there were a correlation between the lengths of
*componentA* and *componentB*?

In the example below a database containing assembled pairs of components is used to determine
if there is a correlation between the lengths of the components, and how the correlation affects the model.

Before performing a simulation of the effects of correlation on the probability model it's
useful to understand what the correlation is between the lengths of *componentA* and *componentB*.

In the example below 5000 random samples are selected from a collection
of assembled hinges. Each sample contains the
lengths of the components in the fields *componentA_d* and *componentB_d*.

Both fields are then vectorized. The *componentA_d* vector is stored in
variable *b* and the *componentB_d* vector is stored in variable *c*.

Then the correlation of the two vectors is calculated using the `corr` function. Note that the outcome
from `corr` is 0.9996931313216989. This means that *componentA_d* and *componentB_d* are almost
perfectly correlated.

[source,text]
----
let(a=random(collection5, q="*:*", rows="5000", fl="componentA_d, componentB_d"),
    b=col(a, componentA_d),
    c=col(a, componentB_d),
    d=corr(b, c))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "d": 0.9996931313216989
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 309
      }
    ]
  }
}
----

How does correlation affect the probability model?

The example below explores how to use a *multivariate normal distribution* function
to model how correlation affects the probability of hinge defects.

In this example 5000 random samples are selected from a collection
containing length data for assembled hinges. Each sample contains
the fields *componentA_d* and *componentB_d*.

Both fields are then vectorized. The *componentA_d* vector is stored in
variable *b* and the *componentB_d* vector is stored in variable *c*.

An array is created that contains the *means* of the two vectorized fields.

Then both vectors are added to a matrix which is transposed. This creates
an *observation* matrix where each row contains one observation of
*componentA_d* and *componentB_d*. A covariance matrix is then created from the columns of
the observation matrix with the
`cov` function. The covariance matrix describes the covariance between
*componentA_d* and *componentB_d*.

The `multiVariateNormalDistribution` function is then called with the
array of means for the two fields and the covariance matrix. The model
for the multivariate normal distribution is stored in variable *g*.

The `monteCarlo` function then calls the function *add(sample(g))* 50000 times
and collects the results in a vector. Each time the function is called a single sample
is drawn from the multivariate normal distribution. Each sample is a vector containing
one *componentA* and *componentB* pair. The `add` function adds the values in the vector to
calculate the combined length of the pair. Over the long term the samples drawn from the
multivariate normal distribution will conform to the covariance matrix used to construct it.

Just as in the non-correlated example an empirical distribution is used to model the probabilities
of the simulation vector, and the `cumulativeProbability` function is used to compute the cumulative
probability that the combined component length will be 5 centimeters or less.

Notice that the probability of a hinge meeting specification has dropped to 0.9889517439980468.
This is because the strong correlation
between the lengths of the components means that their lengths rise together, causing more hinges to
fall outside the 5 centimeter specification.

[source,text]
----
let(a=random(hinges, q="*:*", rows="5000", fl="componentA_d, componentB_d"),
    b=col(a, componentA_d),
    c=col(a, componentB_d),
    cor=corr(b, c),
    d=array(mean(b), mean(c)),
    e=transpose(matrix(b, c)),
    f=cov(e),
    g=multiVariateNormalDistribution(d, f),
    h=monteCarlo(add(sample(g)), 50000),
    i=empiricalDistribution(h),
    j=cumulativeProbability(i, 5))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "j": 0.9889517439980468
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 599
      }
    ]
  }
}
----

@ -0,0 +1,430 @@

= Interpolation, Derivatives and Integrals
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

This section of the math expressions user guide covers *interpolation*, *derivatives* and *integrals*.
These three interrelated topics are part of the field of mathematics called *numerical analysis*.

== Interpolation

Interpolation is used to construct new data points between a set of known control points.
The ability to *predict* new data points allows for *sampling* along the curve defined by the
control points.

The interpolation functions described below all return an *interpolation model*
that can be passed to other functions which make use of the sampling capability.

If returned directly the interpolation model returns an array containing predictions for each of the
control points. This is useful in the case of `loess` interpolation, which first smooths the control points
and then interpolates the smoothed points. All other interpolation functions simply return the original
control points, because interpolation predicts a curve that passes through the original control points.

There are different algorithms for interpolation that will result in different predictions
along the curve. The math expressions library currently supports the following
interpolation functions (a sketch of creating each model follows the list):

* `lerp`: Linear interpolation predicts points that pass through each control point and
form straight lines between control points.
* `spline`: Spline interpolation predicts points that pass through each control point
and form a smooth curve between control points.
* `akima`: Akima spline interpolation is similar to spline interpolation but is stable to outliers.
* `loess`: Loess interpolation first performs a non-linear local regression to smooth the original
control points. Then a spline is used to interpolate the smoothed control points.
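
Below is a minimal sketch of creating each type of interpolation model, assuming `spline`
and `akima` are passed the same *x* and *y* arrays as `lerp`:

[source,text]
----
let(x=array(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20),
    y=array(5, 10, 60, 190, 100, 130, 100, 20, 30, 10, 5),
    linear=lerp(x, y),
    smooth=spline(x, y),
    stable=akima(x, y))
----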

=== Upsampling

Interpolation can be used to increase the sampling rate along a curve. One example
of this would be to take a time series with samples every minute and create a data set with
samples every second. In order to do this the data points between the minutes must be created.

The `predict` function can be used to predict values anywhere within the bounds of the interpolation
range. The example below shows a simple case of upsampling.

In the example linear interpolation is performed on the arrays in variables *x* and *y*. The *x* variable,
which is the x axis, is a sequence from 0 to 20 with a stride of 2. The *y* variable defines the curve
along the x axis.

The `lerp` function performs the interpolation and returns the interpolation model.

The *u* variable is an array from 0 to 20 with a stride of 1. This fills in the gaps of the original x axis.
The `predict` function then uses the interpolation model in variable *l* to predict values for
every point in the array assigned to variable *u*.

The variable *p* is the array of predictions, which is the upsampled set of *y* values.

[source,text]
----
let(x=array(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20),
    y=array(5, 10, 60, 190, 100, 130, 100, 20, 30, 10, 5),
    l=lerp(x, y),
    u=array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20),
    p=predict(l, u))
----
|
||||
|
||||
When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "p": [
          5,
          7.5,
          10,
          35,
          60,
          125,
          190,
          145,
          100,
          115,
          130,
          115,
          100,
          60,
          20,
          25,
          30,
          20,
          10,
          7.5,
          5
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

=== Smoothing Interpolation

The `loess` function is a smoothing interpolator, which means it doesn't derive
a function that passes through the original control points. Instead the `loess` function
returns a function that smooths the original control points.

A technique known as local regression is used to compute the smoothed curve. The size of the
neighborhood of the local regression can be adjusted
to control how closely the new curve conforms to the original control points.

The `loess` function is passed *x* and *y* axes and fits a smooth curve to the data.
If only a single array is provided it is treated as the *y* axis and a sequence is generated
for the *x* axis.

The example below uses the `loess` function to fit a curve to a set of *y* values in an array.
The bandwidth parameter defines the percent of data to use for the local
regression. The lower the percent, the smaller the neighborhood used for the local
regression and the closer the curve will be to the original data.

In the example the fitted curve is subtracted from the original curve using the
`ebeSubtract` function. The output shows the error between the
fitted curve and the original curve, known as the residuals. The output also includes
the sum-of-squares of the residuals, which provides a measure
of how large the error is.

[source,text]
----
let(echo="residuals, sumSqError",
    y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 7, 7, 6, 7, 7, 7, 6, 5, 5, 3, 2, 1, 0),
    curve=loess(y, bandwidth=.3),
    residuals=ebeSubtract(y, curve),
    sumSqError=sumSq(residuals))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "residuals": [
          0,
          0,
          0,
          -0.040524802275866634,
          -0.10531988096456502,
          0.5906115002526198,
          0.004215074334896762,
          0.4201374330912433,
          0.09618315578013803,
          0.012107948556718817,
          -0.9892939034492398,
          0.012014364143757561,
          0.1093830927709325,
          0.523166271893805,
          0.09658362075164639,
          -0.011433819306139625,
          0.9899403519886416,
          -0.011707983372932773,
          -0.004223284004140737,
          -0.00021462867928434548,
          0.0018723112875456138
        ],
        "sumSqError": 2.8016013870800616
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

In the next example the curve is fit using a bandwidth of .25. Notice that the curve
is a closer fit, shown by the smaller residuals and lower value for the sum-of-squares of the
residuals.

[source,text]
----
let(echo="residuals, sumSqError",
    y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 7, 7, 6, 7, 7, 7, 6, 5, 5, 3, 2, 1, 0),
    curve=loess(y, bandwidth=.25),
    residuals=ebeSubtract(y, curve),
    sumSqError=sumSq(residuals))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "residuals": [
          0,
          0,
          0,
          0,
          -0.19117650587715396,
          0.442863451538809,
          -0.18553845993358564,
          0.29990769020356645,
          0,
          0.23761890236245709,
          -0.7344358765888117,
          0.2376189023624491,
          0,
          0.30373119215254984,
          -3.552713678800501e-15,
          -0.23761890236245264,
          0.7344358765888046,
          -0.2376189023625095,
          0,
          2.842170943040401e-14,
          -2.4868995751603507e-14
        ],
        "sumSqError": 1.7539413576337557
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

== Derivatives

The derivative of a function measures the rate of change of the *y* value with respect to the
rate of change of the *x* value.

The `derivative` function can compute the derivative of any *interpolation* function.
The `derivative` function can also compute the derivative of a derivative
(a sketch of this follows the example output below).

The example below computes the derivative for a `loess` interpolation function.

[source,text]
----
let(x=array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20),
    y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 7, 7, 6, 7, 7, 7, 6, 5, 5, 3, 2, 1, 0),
    curve=loess(x, y, bandwidth=.3),
    derivative=derivative(curve))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "derivative": [
          1.0022002675659012,
          0.9955994648681976,
          1.0154018729613081,
          1.0643674501141696,
          1.0430879694757085,
          0.9698717643975381,
          0.7488201070357539,
          0.44627000894357516,
          0.19019561285422165,
          0.01703599324311178,
          -0.001908408138535126,
          -0.009121607450087499,
          -0.2576361507216319,
          -0.49378951291352746,
          -0.7288073815664,
          -0.9871806872210384,
          -1.0025400632604322,
          -1.001836567536853,
          -1.0076227586138085,
          -1.0021524620888589,
          -1.0020541789058157
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

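Because the output of the `derivative` function is itself a function, it can be passed back
into `derivative` to compute the second derivative. Below is a minimal sketch of this,
extending the example above:

[source,text]
----
let(x=array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20),
    y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 7, 7, 6, 7, 7, 7, 6, 5, 5, 3, 2, 1, 0),
    curve=loess(x, y, bandwidth=.3),
    firstDerivative=derivative(curve),
    secondDerivative=derivative(firstDerivative))
----
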
== Integrals

An integral is a measure of the area under a curve.
The `integrate` function computes an integral for a specific
range of an interpolated curve.

In the example below the `integrate` function computes an
integral for the entire range of the curve, 0 through 20.

[source,text]
----
let(x=array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20),
    y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 7, 7, 6, 7, 7, 7, 6, 5, 5, 3, 2, 1, 0),
    curve=loess(x, y, bandwidth=.3),
    integral=integrate(curve, 0, 20))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "integral": 90.17446104846645
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

In the next example an integral is computed for the range of 0 through 10.

[source,text]
----
let(x=array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20),
    y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 7, 7, 6, 7, 7, 7, 6, 5, 5, 3, 2, 1, 0),
    curve=loess(x, y, bandwidth=.3),
    integral=integrate(curve, 0, 10))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "integral": 45.300912584519914
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

== Bicubic Spline

The `bicubicSpline` function can be used to interpolate and predict values
anywhere within a grid of data.

A simple example will make this more clear.

In the example below a bicubic spline is used to interpolate a matrix of real estate data.
Each row of the matrix represents a specific *year*. Each column of the matrix
represents a *floor* of the building. The grid of numbers is the average selling price of
an apartment for each year and floor. For example, in 2002 the average selling price for
the 9th floor was 415000 (row 3, column 3).

The `bicubicSpline` function is then used to
interpolate the grid, and the `predict` function is used to predict a value for year 2003, floor 8.
Notice that the matrix does not include a data point for year 2003, floor 8. The `bicubicSpline`
function creates that data point based on the surrounding data in the matrix.

[source,text]
----
let(years=array(1998, 2000, 2002, 2004, 2006),
    floors=array(1, 5, 9, 13, 17, 19),
    prices=matrix(array(300000, 320000, 330000, 350000, 360000, 370000),
                  array(320000, 330000, 340000, 350000, 365000, 380000),
                  array(400000, 410000, 415000, 425000, 430000, 440000),
                  array(410000, 420000, 425000, 435000, 445000, 450000),
                  array(420000, 430000, 435000, 445000, 450000, 470000)),
    bspline=bicubicSpline(years, floors, prices),
    prediction=predict(bspline, 2003, 8))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "prediction": 418279.5009328358
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

= Probability Distributions
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

This section of the user guide covers the *probability distribution
framework* included in the math expressions library.

== Probability Distributions

The probability distribution framework includes
many commonly used *real* and *discrete* probability
distributions, including support for *empirical* and
*enumerated* distributions that model real world data.

The probability distribution framework also includes a set
of functions that use the probability distributions
to support probability calculations and sampling.

=== Real Distributions

The probability distribution framework has the following functions
which support well known real probability distributions:

* `normalDistribution`: Creates a normal distribution function.

* `logNormalDistribution`: Creates a log normal distribution function.

* `gammaDistribution`: Creates a gamma distribution function.

* `betaDistribution`: Creates a beta distribution function.

* `uniformDistribution`: Creates a uniform real distribution function.

* `weibullDistribution`: Creates a Weibull distribution function.

* `triangularDistribution`: Creates a triangular distribution function.

* `constantDistribution`: Creates a constant real distribution function.

=== Empirical Distribution

The `empiricalDistribution` function creates a real probability
distribution from actual data. An empirical distribution
can be used interchangeably with any of the theoretical
real distributions.

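For example, below is a minimal sketch that builds an empirical distribution from the *price_f*
field and draws ten samples from it with the `sample` function, which is described later in this
section (the collection and field names are illustrative):

[source,text]
----
let(a=random(collection1, q="*:*", rows="30000", fl="price_f"),
    b=col(a, price_f),
    c=empiricalDistribution(b),
    d=sample(c, 10))
----
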
=== Discrete Distributions

The probability distribution framework has the following functions
which support well known discrete probability distributions:

* `poissonDistribution`: Creates a Poisson distribution function.

* `binomialDistribution`: Creates a binomial distribution function.

* `uniformIntegerDistribution`: Creates a uniform integer distribution function (see the sketch after this list).

* `geometricDistribution`: Creates a geometric distribution function.

* `zipFDistribution`: Creates a Zipf distribution function.

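As an illustrative sketch, and assuming `uniformIntegerDistribution` takes inclusive lower and
upper bounds, the expression below models a fair six-sided die and uses the `probability`
function, described later in this section, to ask for the probability of rolling a 3:

[source,text]
----
let(a=uniformIntegerDistribution(1, 6),
    b=probability(a, 3))
----
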
=== Enumerated Distributions

The `enumeratedDistribution` function creates a discrete
distribution function from a data set of discrete values,
or from an enumerated list of values and probabilities.

Enumerated distribution functions can be used interchangeably
with any of the theoretical discrete distributions.

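Below is a sketch of the enumerated form, assuming the function accepts an array of values and a
matching array of probabilities (the values and probabilities shown are illustrative):

[source,text]
----
let(a=enumeratedDistribution(array(1, 2, 3), array(.2, .3, .5)),
    b=probability(a, 3))
----
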
=== Cumulative Probability

The `cumulativeProbability` function can be used with all
probability distributions to calculate the
cumulative probability of encountering a specific
random variable within a specific distribution.

Below is an example of calculating the cumulative probability
of a random variable within a normal distribution.

In the example a normal distribution function is created
with a mean of 10 and a standard deviation of 5. Then
the cumulative probability of the value 12 is calculated for this
specific distribution.

[source,text]
----
let(a=normalDistribution(10, 5),
    b=cumulativeProbability(a, 12))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "b": 0.6554217416103242
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

Below is an example of a cumulative probability calculation
using an empirical distribution.

In the example an empirical distribution is created from a random
sample taken from the *price_f* field.

The cumulative probability of the value .75 is then calculated.
The *price_f* field in this example was generated using a
uniform real distribution between 0 and 1, so the output of the
`cumulativeProbability` function is very close to .75.

[source,text]
----
let(a=random(collection1, q="*:*", rows="30000", fl="price_f"),
    b=col(a, price_f),
    c=empiricalDistribution(b),
    d=cumulativeProbability(c, .75))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "d": 0.7554217416103242
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

=== Probability

The `probability` function can be used with any discrete
distribution function to compute the probability of a
discrete value.

Below is an example which calculates the probability
of a discrete value within a Poisson distribution.

In the example a Poisson distribution function is created
with a mean of 100. Then the
probability of encountering a sample of the discrete value 101 is calculated for this
specific distribution.

[source,text]
----
let(a=poissonDistribution(100),
    b=probability(a, 101))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "b": 0.039466333474403106
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

Below is an example of a probability calculation
using an enumerated distribution.

In the example an enumerated distribution is created from a random
sample taken from the *day_i* field, which was created
using a uniform integer distribution between 0 and 30.

The probability of the discrete value 10 is then calculated.

[source,text]
----
let(a=random(collection1, q="*:*", rows="30000", fl="day_i"),
    b=col(a, day_i),
    c=enumeratedDistribution(b),
    d=probability(c, 10))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "d": 0.03356666666666666
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 488
      }
    ]
  }
}
----

=== Sampling

All probability distributions support sampling. The `sample`
function returns 1 or more random samples from a probability
distribution.

Below is an example drawing a single sample from
a normal distribution.

[source,text]
----
let(a=normalDistribution(10, 5),
    b=sample(a))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "b": 11.24578055004963
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

Below is an example drawing 10 samples from a normal
distribution.

[source,text]
----
let(a=normalDistribution(10, 5),
    b=sample(a, 10))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "b": [
          10.18444709339441,
          9.466947971749377,
          1.2420697166234458,
          11.074501226984806,
          7.659629052136225,
          0.4440887839190708,
          13.710925254778786,
          2.089566359480239,
          0.7907293097654424,
          2.8184587681006734
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 3
      }
    ]
  }
}
----

=== Multivariate Normal Distribution

The multivariate normal distribution is a generalization of the
univariate normal distribution to higher dimensions.

The multivariate normal distribution models two or more random
variables that are normally distributed. The relationship between
the variables is defined by a covariance matrix.

==== Sampling

The `sample` function can be used to draw samples
from a multivariate normal distribution in much the same
way as a univariate normal distribution.
The difference is that each sample will be an array containing a sample
drawn from each of the underlying normal distributions.
If multiple samples are drawn, the `sample` function returns a matrix with a
sample in each row. Over the long term the columns of the sample
matrix will conform to the covariance matrix used to parametrize the
multivariate normal distribution.

The example below demonstrates how to initialize and draw samples
from a multivariate normal distribution.

In this example 5000 random samples are selected from a collection
of log records. Each sample contains
the fields *filesize_d* and *response_d*. The values of both fields conform
to a normal distribution.

Both fields are then vectorized. The *filesize_d* vector is stored in
variable *b* and the *response_d* vector is stored in variable *c*.

An array is created that contains the *means* of the two vectorized fields.

Then both vectors are added to a matrix which is transposed. This creates
an *observation* matrix where each row contains one observation of
*filesize_d* and *response_d*. A covariance matrix is then created from the columns of
the observation matrix with the
`cov` function. The covariance matrix describes the covariance between
*filesize_d* and *response_d*.

The `multiVariateNormalDistribution` function is then called with the
array of means for the two fields and the covariance matrix. The model for the
multivariate normal distribution is assigned to variable *g*.

Finally five samples are drawn from the multivariate normal distribution. The samples
are returned as a matrix, with each row representing one sample. There are two
columns in the matrix. The first column contains samples for *filesize_d* and the second
column contains samples for *response_d*. Over the long term the covariance between
the columns will conform to the covariance matrix used to instantiate the
multivariate normal distribution.

[source,text]
----
let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, response_d"),
    b=col(a, filesize_d),
    c=col(a, response_d),
    d=array(mean(b), mean(c)),
    e=transpose(matrix(b, c)),
    f=cov(e),
    g=multiVariateNormalDistribution(d, f),
    h=sample(g, 5))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "h": [
          [
            41974.85669321393,
            779.4097049705296
          ],
          [
            42869.19876441414,
            834.2599296790783
          ],
          [
            38556.30444839889,
            720.3683470060988
          ],
          [
            37689.31290928216,
            686.5549428100018
          ],
          [
            40564.74398214547,
            769.9328090774
          ]
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 162
      }
    ]
  }
}
----

= Linear Regression
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

This section of the math expressions user guide covers simple and multivariate linear regression.

== Simple Linear Regression

The `regress` function is used to build a linear regression model
between two random variables. Sample observations are provided with two
numeric arrays. The first numeric array is the *independent variable* and
the second array is the *dependent variable*.

In the example below the `random` function selects 5000 random samples each containing
the fields *filesize_d* and *response_d*. The two fields are vectorized
and stored in variables *b* and *c*. Then the `regress` function performs a regression
analysis on the two numeric arrays.

The `regress` function returns a single tuple with the results of the regression
analysis.

Note that in this regression analysis the value of *RSquared* is *.75*. This means that changes in
*filesize_d* explain 75% of the variability of the *response_d* variable.

[source,text]
----
let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, response_d"),
    b=col(a, filesize_d),
    c=col(a, response_d),
    d=regress(b, c))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "d": {
          "significance": 0,
          "totalSumSquares": 10564812.895147054,
          "R": 0.8674822407146515,
          "RSquared": 0.7525254379553127,
          "meanSquareError": 523.1137343558588,
          "intercept": -49.528134913099095,
          "slopeConfidenceInterval": 0.0003171801710329995,
          "regressionSumSquares": 7950290.450836472,
          "slope": 0.019945557923159506,
          "interceptStdErr": 6.489732340389941,
          "N": 5000
        }
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 98
      }
    ]
  }
}
----

=== Prediction

The `predict` function uses the regression model to make predictions.
Using the example above the regression model can be used to predict the value
of *response_d* given a value for *filesize_d*.

In the example below the `predict` function uses the regression analysis to predict
the value of *response_d* for the *filesize_d* value of 40000.

[source,text]
----
let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, response_d"),
    b=col(a, filesize_d),
    c=col(a, response_d),
    d=regress(b, c),
    e=predict(d, 40000))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "e": 748.079241022975
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 95
      }
    ]
  }
}
----

The `predict` function can also make predictions for an array of values. In this
case it returns an array of predictions.

In the example below the `predict` function uses the regression analysis to
predict values for each of the 5000 samples of `filesize_d` used to generate the model.
In this case 5000 predictions are returned.

[source,text]
----
let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, response_d"),
    b=col(a, filesize_d),
    c=col(a, response_d),
    d=regress(b, c),
    e=predict(d, b))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "e": [
          742.2525322514165,
          709.6972488729955,
          687.8382568904871,
          820.2511324266264,
          720.4006432289061,
          761.1578181053039,
          759.1304101159126,
          699.5597256337142,
          742.4738911248204,
          769.0342605881644,
          746.6740473150268,
          ...
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 113
      }
    ]
  }
}
----

=== Residuals

The difference between the observed value and the predicted value is known as the
residual. There isn't a specific function to calculate the residuals, but vector
math can be used to perform the calculation.

In the example below the predictions are stored in variable *e*. The `ebeSubtract`
function is then used to subtract the predictions
from the actual *response_d* values stored in variable *c*. Variable *f* contains
the array of residuals.

[source,text]
----
let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, response_d"),
    b=col(a, filesize_d),
    c=col(a, response_d),
    d=regress(b, c),
    e=predict(d, b),
    f=ebeSubtract(c, e))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "f": [
          31.30678554491226,
          -30.292830927953446,
          -30.49508862647258,
          -30.499884780783532,
          -9.696458959319784,
          -30.521563961535094,
          -30.28380938033081,
          -9.890289849359306,
          30.819723560583157,
          -30.213178859683012,
          -30.609943619066826,
          10.527700442607625,
          10.68046928406568,
          ...
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 113
      }
    ]
  }
}
----

== Multivariate Linear Regression

The `olsRegress` function performs a multivariate linear regression analysis. Multivariate linear
regression models the linear relationship between two or more *independent* variables and a *dependent* variable.

The example below extends the simple linear regression example by introducing a new independent variable
called *service_d*. The *service_d* variable is the service level of the request and it can range from 1 to 4
in the data set. The higher the service level, the higher the bandwidth available for the request.

Notice that the two independent variables *filesize_d* and *service_d* are vectorized and stored
in the variables *b* and *c*. The variables *b* and *c* are then added as rows to a `matrix`. The matrix is
then transposed so that each row in the matrix represents one observation with *filesize_d* and *service_d*.
The `olsRegress` function then performs the multivariate regression analysis using the observation matrix as the
independent variables and the *response_d* values, stored in variable *d*, as the dependent variable.

Notice that the RSquared of the regression analysis is 1. This means that the linear relationship between
*filesize_d* and *service_d* describes 100% of the variability of the *response_d* variable.

[source,text]
----
let(a=random(collection2, q="*:*", rows="30000", fl="filesize_d, service_d, response_d"),
    b=col(a, filesize_d),
    c=col(a, service_d),
    d=col(a, response_d),
    e=transpose(matrix(b, c)),
    f=olsRegress(e, d))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "f": {
          "regressionParametersStandardErrors": [
            2.0660690430026933e-13,
            5.1212982077663434e-18,
            9.10920932555875e-15
          ],
          "RSquared": 1,
          "regressionParameters": [
            6.553210695971329e-12,
            0.019999999999999858,
            -20.49999999999968
          ],
          "regressandVariance": 2124.130825172683,
          "regressionParametersVariance": [
            [
              0.013660174897582315,
              -3.361258014840509e-7,
              -0.00006893737578369605
            ],
            [
              -3.361258014840509e-7,
              8.393183709503206e-12,
              6.430253229589981e-11
            ],
            [
              -0.00006893737578369605,
              6.430253229589981e-11,
              0.000026553878455570856
            ]
          ],
          "adjustedRSquared": 1,
          "residualSumSquares": 9.373703759269822e-20
        }
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 690
      }
    ]
  }
}
----

=== Prediction

The `predict` function can also be used to make predictions for multivariate linear regression. Below is an example
of a single prediction using the multivariate linear regression model and a single observation. The observation
is an array that matches the structure of the observation matrix used to build the model. In this case
the first value represents a *filesize_d* of 40000 and the second value represents a *service_d* of 4.

[source,text]
----
let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, service_d, response_d"),
    b=col(a, filesize_d),
    c=col(a, service_d),
    d=col(a, response_d),
    e=transpose(matrix(b, c)),
    f=olsRegress(e, d),
    g=predict(f, array(40000, 4)))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "g": 718.0000000000005
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 117
      }
    ]
  }
}
----

The `predict` function can also make predictions for more than one multivariate observation. In this scenario
an observation matrix is used. In the example below the observation matrix used to build the multivariate regression model
is passed to the `predict` function and it returns an array of predictions.

[source,text]
----
let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, service_d, response_d"),
    b=col(a, filesize_d),
    c=col(a, service_d),
    d=col(a, response_d),
    e=transpose(matrix(b, c)),
    f=olsRegress(e, d),
    g=predict(f, e))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "g": [
          685.498283591961,
          801.2175699959365,
          776.7638245911025,
          610.3559852681935,
          751.0925865965207,
          787.2914663381897,
          744.3632053810668,
          688.3729301599697,
          765.367783417171,
          724.9309687628346,
          834.4350712384264,
          ...
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 113
      }
    ]
  }
}
----

=== Residuals

Once the predictions are generated the residuals can be calculated using the same approach used with
simple linear regression.

Below is an example of the residuals calculation following a multivariate linear regression. In the example
the predictions stored in variable *g* are subtracted from observed values stored in variable *d*.

[source,text]
----
let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, service_d, response_d"),
    b=col(a, filesize_d),
    c=col(a, service_d),
    d=col(a, response_d),
    e=transpose(matrix(b, c)),
    f=olsRegress(e, d),
    g=predict(f, e),
    h=ebeSubtract(d, g))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "h": [
          1.1368683772161603e-13,
          1.1368683772161603e-13,
          0,
          1.1368683772161603e-13,
          0,
          1.1368683772161603e-13,
          0,
          2.2737367544323206e-13,
          1.1368683772161603e-13,
          2.2737367544323206e-13,
          1.1368683772161603e-13,
          ...
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 113
      }
    ]
  }
}
----

= Scalar Math
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

The most basic math expressions are scalar expressions. Scalar expressions
perform mathematical operations on numbers.

For example the expression below adds two numbers together:

[source,text]
----
add(1, 1)
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "return-value": 2
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 2
      }
    ]
  }
}
----

Math expressions can be nested. For example in the expression
below the output of the `add` function is the second parameter
of the `pow` function:

[source,text]
----
pow(10, add(1, 1))
----

This expression returns the following response:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "return-value": 100
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

== Streaming Scalar Math

Scalar math expressions can also be applied to each tuple in a stream
through use of the `select` stream decorator. The `select` function wraps a
stream of tuples and selects fields to include in each tuple.
The `select` function can also use math expressions to compute
new values and add them to the outgoing tuples.

In the example below the `select` expression is wrapping a search
expression. The `select` function is selecting the *price_f* field
and computing a new field called *newPrice* using the `mult` math
expression.

The first parameter of the `mult` expression is the *price_f* field.
The second parameter is the scalar value 10. This multiplies the value
of the *price_f* field in each tuple by 10.

[source,text]
----
select(search(collection2, q="*:*", fl="price_f", sort="price_f desc", rows="3"),
       price_f,
       mult(price_f, 10) as newPrice)
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "price_f": 0.99999994,
        "newPrice": 9.9999994
      },
      {
        "price_f": 0.99999994,
        "newPrice": 9.9999994
      },
      {
        "price_f": 0.9999992,
        "newPrice": 9.999992
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 3
      }
    ]
  }
}
----

== More Scalar Math Functions

The following scalar math functions are available in the math expressions library:

`abs`, `add`, `div`, `mult`, `sub`, `log`,
`pow`, `mod`, `ceil`, `floor`, `sin`, `asin`,
`sinh`, `cos`, `acos`, `cosh`, `tan`, `atan`,
`tanh`, `round`, `precision`, `sqrt`, `cbrt`

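These scalar functions can be nested in the same way as the earlier examples. As a small
illustrative sketch, the expression below combines `round`, `sqrt`, `add` and `pow` from the list
above to compute the hypotenuse of a 3-4-5 right triangle:

[source,text]
----
round(sqrt(add(pow(3, 2), pow(4, 2))))
----

Sent to the /stream handler, this evaluates to a *return-value* of 5.
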
= Statistics
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

This section of the user guide covers the core statistical functions
available in math expressions.

== Descriptive Statistics

The `describe` function can be used to return descriptive statistics about a
numeric array. The `describe` function returns a single *tuple* with name/value
pairs containing descriptive statistics.

Below is a simple example that selects a random sample of documents,
vectorizes the *price_f* field in the result set and uses the `describe` function to
return descriptive statistics about the vector:

[source,text]
----
let(a=random(collection1, q="*:*", rows="15000", fl="price_f"),
    b=col(a, price_f),
    c=describe(b))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "c": {
          "sumsq": 4999.041975263254,
          "max": 0.99995726,
          "var": 0.08344429493940454,
          "geometricMean": 0.36696588922559575,
          "sum": 7497.460565552007,
          "kurtosis": -1.2000739963006035,
          "N": 15000,
          "min": 0.00012338161,
          "mean": 0.49983070437013266,
          "popVar": 0.08343873198640858,
          "skewness": -0.001735537500095477,
          "stdev": 0.28886726179926403
        }
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 305
      }
    ]
  }
}
----

== Histograms and Frequency Tables

Histograms and frequency tables are tools for understanding the distribution
of a random variable.

The `hist` function creates a histogram designed for usage with continuous data. The
`freqTable` function creates a frequency table for use with discrete data.

=== Histograms

Below is an example that selects a random sample, creates a vector from the
result set and uses the `hist` function to return a histogram with 5 bins.
The `hist` function returns a list of tuples with summary statistics for each bin.

[source,text]
----
let(a=random(collection1, q="*:*", rows="15000", fl="price_f"),
    b=col(a, price_f),
    c=hist(b, 5))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "c": [
          {
            "prob": 0.2057939717603699,
            "min": 0.000010371208,
            "max": 0.19996578,
            "mean": 0.10010319358402578,
            "var": 0.003366805016271609,
            "cumProb": 0.10293732468049072,
            "sum": 309.0185585938884,
            "stdev": 0.058024176136086666,
            "N": 3087
          },
          {
            "prob": 0.19381868629885585,
            "min": 0.20007741,
            "max": 0.3999073,
            "mean": 0.2993590803885827,
            "var": 0.003401644034068929,
            "cumProb": 0.3025295802728267,
            "sum": 870.5362057700005,
            "stdev": 0.0583236147205309,
            "N": 2908
          },
          {
            "prob": 0.20565789836690007,
            "min": 0.39995712,
            "max": 0.5999038,
            "mean": 0.4993620963792545,
            "var": 0.0033158364923609046,
            "cumProb": 0.5023006239697967,
            "sum": 1540.5320673300018,
            "stdev": 0.05758330046429177,
            "N": 3085
          },
          {
            "prob": 0.19437108496008693,
            "min": 0.6000449,
            "max": 0.79973197,
            "mean": 0.7001752711861512,
            "var": 0.0033895105082360185,
            "cumProb": 0.7026537198687285,
            "sum": 2042.4112660500066,
            "stdev": 0.058219502816805456,
            "N": 2917
          },
          {
            "prob": 0.20019582213899467,
            "min": 0.7999126,
            "max": 0.99987316,
            "mean": 0.8985428275824184,
            "var": 0.003312360017780078,
            "cumProb": 0.899450457219298,
            "sum": 2698.3241112299997,
            "stdev": 0.05755310606544253,
            "N": 3003
          }
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 322
      }
    ]
  }
}
----

The `col` function can be used to *vectorize* a column of data from the list of tuples
returned by the `hist` function.

In the example below, the *N* field,
which is the number of observations in each bin, is returned as a vector.

[source,text]
----
let(a=random(collection1, q="*:*", rows="15000", fl="price_f"),
    b=col(a, price_f),
    c=hist(b, 11),
    d=col(c, N))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "d": [
          1387,
          1396,
          1391,
          1357,
          1384,
          1360,
          1367,
          1375,
          1307,
          1310,
          1366
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 307
      }
    ]
  }
}
----

=== Frequency Tables

The `freqTable` function returns a frequency distribution for a discrete data set.
The `freqTable` function doesn't create bins like the histogram. Instead it counts
the occurrence of each discrete data value and returns a list of tuples with the
frequency statistics for each value. Fields from a frequency table can be vectorized
using the `col` function in the same manner as a histogram
(a sketch of this follows the example output below).

Below is a simple example of a frequency table built from a random sample of
a discrete variable.

[source,text]
----
let(a=random(collection1, q="*:*", rows="15000", fl="day_i"),
    b=col(a, day_i),
    c=freqTable(b))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "c": [
          {
            "pct": 0.0318,
            "count": 477,
            "cumFreq": 477,
            "cumPct": 0.0318,
            "value": 0
          },
          {
            "pct": 0.033133333333333334,
            "count": 497,
            "cumFreq": 974,
            "cumPct": 0.06493333333333333,
            "value": 1
          },
          {
            "pct": 0.03426666666666667,
            "count": 514,
            "cumFreq": 1488,
            "cumPct": 0.0992,
            "value": 2
          },
          {
            "pct": 0.0346,
            "count": 519,
            "cumFreq": 2007,
            "cumPct": 0.1338,
            "value": 3
          },
          {
            "pct": 0.03133333333333333,
            "count": 470,
            "cumFreq": 2477,
            "cumPct": 0.16513333333333333,
            "value": 4
          },
          {
            "pct": 0.03333333333333333,
            "count": 500,
            "cumFreq": 2977,
            "cumPct": 0.19846666666666668,
            "value": 5
          }
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 281
      }
    ]
  }
}
----

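As with histograms, a field can be vectorized from the list of tuples returned by `freqTable`.
The sketch below pulls the *count* field from the frequency table as a vector:

[source,text]
----
let(a=random(collection1, q="*:*", rows="15000", fl="day_i"),
    b=col(a, day_i),
    c=freqTable(b),
    d=col(c, count))
----
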
== Percentiles

The `percentile` function returns the estimated value for a specific percentile in
a sample set. The example below returns the estimation for the 95th percentile
of the *price_f* field.

[source,text]
----
let(a=random(collection1, q="*:*", rows="15000", fl="price_f"),
    b=col(a, price_f),
    c=percentile(b, 95))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "c": 312.94
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 286
      }
    ]
  }
}
----

== Covariance and Correlation

Covariance and correlation measure how random variables move
together.

=== Covariance and Covariance Matrices

The `cov` function calculates the covariance of two sample sets of data.

In the example below covariance is calculated for two numeric
arrays.

The example below uses arrays created by the `array` function. It's important to note that
vectorized data from SolrCloud collections can be used with any function that
operates on arrays.

[source,text]
----
let(a=array(1, 2, 3, 4, 5),
    b=array(100, 200, 300, 400, 500),
    c=cov(a, b))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "c": 250
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 286
      }
    ]
  }
}
----

If a matrix is passed to the `cov` function it will automatically compute a covariance
matrix for the columns of the matrix.

Notice in the example three numeric arrays are added as rows
in a matrix. The matrix is then transposed to turn the rows into
columns, and the covariance matrix is computed for the columns of the
matrix.

[source,text]
----
let(a=array(1, 2, 3, 4, 5),
    b=array(100, 200, 300, 400, 500),
    c=array(30, 40, 80, 90, 110),
    d=transpose(matrix(a, b, c)),
    e=cov(d))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "e": [
          [
            2.5,
            250,
            52.5
          ],
          [
            250,
            25000,
            5250
          ],
          [
            52.5,
            5250,
            1150
          ]
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 2
      }
    ]
  }
}
----

=== Correlation and Correlation Matrices

Correlation is a measure of covariance that has been scaled between
-1 and 1.

Three correlation types are supported:

* *pearsons* (default)
* *kendalls*
* *spearmans*

The type of correlation is specified by adding the *type* named parameter in the
function call. The example below demonstrates the use of the *type*
named parameter.

[source,text]
----
let(a=array(1, 2, 3, 4, 5),
    b=array(100, 200, 300, 400, 5000),
    c=corr(a, b, type=spearmans))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "c": 1
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

Like the `cov` function, the `corr` function automatically builds a correlation matrix
if a matrix is passed as a parameter. The correlation matrix is built by correlating the columns
of the matrix passed in.

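For example, the sketch below mirrors the covariance matrix example above, computing a
correlation matrix for the columns of the transposed matrix:

[source,text]
----
let(a=array(1, 2, 3, 4, 5),
    b=array(100, 200, 300, 400, 500),
    c=array(30, 40, 80, 90, 110),
    d=transpose(matrix(a, b, c)),
    e=corr(d))
----
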
== Statistical Inference Tests

Statistical inference tests test a hypothesis on *random samples* and return p-values which
can be used to infer the reliability of the test for the entire population.

The following statistical inference tests are available:

* `anova`: One-Way-Anova tests if there is a statistically significant difference in the
means of two or more random samples.

* `ttest`: The T-test tests if there is a statistically significant difference in the means of two
random samples.

* `pairedTtest`: The paired t-test tests if there is a statistically significant difference
in the means of two random samples with paired data (a sketch follows the T-test example below).

* `gTestDataSet`: The G-test tests if two samples of binned discrete data were drawn
from the same population.

* `chiSquareDataSet`: The Chi-Squared test tests if two samples of binned discrete data were
drawn from the same population.

* `mannWhitney`: The Mann-Whitney test is a non-parametric test that tests if two
samples of continuous data were drawn
from the same population. The Mann-Whitney test is often used instead of the T-test when the
underlying assumptions of the T-test are not
met.

* `ks`: The Kolmogorov-Smirnov test tests if two samples of continuous data were drawn from
the same distribution.

Below is a simple example of a T-test performed on two random samples.
The returned p-value of .93 means we can accept the null hypothesis
that the two samples do not have statistically significant differences in the means.

[source,text]
----
let(a=random(collection1, q="*:*", rows="1500", fl="price_f"),
    b=random(collection1, q="*:*", rows="1500", fl="price_f"),
    c=col(a, price_f),
    d=col(b, price_f),
    e=ttest(c, d))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "e": {
          "p-value": 0.9350135639249795,
          "t-statistic": 0.081545541074817
        }
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 48
      }
    ]
  }
}
----

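Below is a sketch of the paired form, assuming `pairedTtest` takes the two sample vectors in the
same way as `ttest`; the samples must be the same length so the observations can be paired:

[source,text]
----
let(a=random(collection1, q="*:*", rows="1500", fl="price_f"),
    b=random(collection1, q="*:*", rows="1500", fl="price_f"),
    c=col(a, price_f),
    d=col(b, price_f),
    e=pairedTtest(c, d))
----
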
== Transformations

In statistical analysis it's often useful to transform data sets before performing
statistical calculations. The statistical function library includes the following
commonly used transformations (a `rank` sketch follows this list):

* `rank`: Returns a numeric array with the rank-transformed value of each element of the original
array.

* `log`: Returns a numeric array with the natural log of each element of the original array.

* `sqrt`: Returns a numeric array with the square root of each element of the original array.

* `cbrt`: Returns a numeric array with the cube root of each element of the original array.

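For example, the small illustrative sketch below rank-transforms an array. Each value is replaced
by its rank within the array, so 10 maps to 1, 50 maps to 2 and 100 maps to 3:

[source,text]
----
rank(array(100, 10, 50))
----
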
Below is an example of a t-test performed on log transformed data sets:

[source,text]
----
let(a=random(collection1, q="*:*", rows="1500", fl="price_f"),
    b=random(collection1, q="*:*", rows="1500", fl="price_f"),
    c=log(col(a, price_f)),
    d=log(col(b, price_f)),
    e=ttest(c, d))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "e": {
          "p-value": 0.9655110070265056,
          "t-statistic": -0.04324265449471238
        }
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 58
      }
    ]
  }
}
----

@ -1,5 +1,5 @@
|
|||
= Streaming Expressions
|
||||
:page-children: stream-source-reference, stream-decorator-reference, stream-evaluator-reference, statistical-programming, graph-traversal
|
||||
:page-children: stream-source-reference, stream-decorator-reference, stream-evaluator-reference, statistical-programming, math-expressions, graph-traversal
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
|
|
|

@@ -0,0 +1,237 @@
= Text Analysis and Term Vectors
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

TF-IDF term vectors are often used to represent text documents when performing text mining
and machine learning operations. This section of the user guide describes how to
use math expressions to perform text analysis and create TF-IDF term vectors.

== Text Analysis

The `analyze` function applies a Solr analyzer to a text field and returns the tokens
emitted by the analyzer in an array. Any analyzer chain that is attached to a field in Solr's
schema can be used with the `analyze` function.

In the example below, the text "hello world" is analyzed using the analyzer chain attached to the *subject* field in
the schema. The *subject* field is defined as the field type *text_general* and the text is analyzed using the
analysis chain configured for the *text_general* field type.

[source,text]
----
analyze("hello world", subject)
----

When this expression is sent to the /stream handler it
responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "return-value": [
          "hello",
          "world"
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

=== Annotating Documents

The `analyze` function can be used inside of a `select` function to annotate documents with the tokens
generated by the analysis.

The example below is performing a `search` in collection1. Each tuple returned by the `search`
contains an *id* and *subject*. For each tuple, the
`select` function is selecting the *id* field and calling the `analyze` function on the *subject* field.
The analyzer chain specified by the *subject_bigram* field is configured to perform a bigram analysis.
The tokens generated by the `analyze` function are added to each tuple in a field called `terms`.

Notice in the output that an array of bigram terms has been added to the tuples.

[source,text]
----
select(search(collection1, q="*:*", fl="id, subject", sort="id asc"),
       id,
       analyze(subject, subject_bigram) as terms)
----

When this expression is sent to the /stream handler it
responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "terms": [
          "text analysis",
          "analysis example"
        ],
        "id": "1"
      },
      {
        "terms": [
          "example number",
          "number two"
        ],
        "id": "2"
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 4
      }
    ]
  }
}
----

== Term Vectors

The `termVectors` function can be used to build *TF-IDF*
term vectors from the terms generated by the `analyze` function.

The `termVectors` function operates over a list of tuples that contain a field
called *id* and a field called *terms*. Notice
that this is the exact output structure of the *document annotation* example above.

The `termVectors` function builds a *matrix* from the list of tuples. There is a *row* in the
matrix for each tuple in the list. There is a *column* in the matrix for each term in the *terms*
field.

The example below builds on the *document annotation* example.
The list of tuples is stored in variable *a*. The `termVectors` function
operates over variable *a* and builds a matrix with *2 rows* and *4 columns*.

The `termVectors` function also sets the *row* and *column* labels of the term vectors matrix.
The row labels are the document ids and the
column labels are the terms.

In the example below, the `getRowLabels` and `getColumnLabels` functions return
the row and column labels which are then stored in variables *c* and *d*.
The *echo* parameter is echoing variables *c* and *d*, so the output includes
the row and column labels.

[source,text]
----
let(echo="c, d",
    a=select(search(collection3, q="*:*", fl="id, subject", sort="id asc"),
             id,
             analyze(subject, subject_bigram) as terms),
    b=termVectors(a, minTermLength=4, minDocFreq=0, maxDocFreq=1),
    c=getRowLabels(b),
    d=getColumnLabels(b))
----

When this expression is sent to the /stream handler it
responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "c": [
          "1",
          "2"
        ],
        "d": [
          "analysis example",
          "example number",
          "number two",
          "text analysis"
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 5
      }
    ]
  }
}
----

=== TF-IDF Values

The values within the term vectors matrix are the TF-IDF values for each term in each document. The
example below shows the values of the matrix.

[source,text]
----
let(a=select(search(collection3, q="*:*", fl="id, subject", sort="id asc"),
             id,
             analyze(subject, subject_bigram) as terms),
    b=termVectors(a, minTermLength=4, minDocFreq=0, maxDocFreq=1))
----

When this expression is sent to the /stream handler it
responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "b": [
          [
            1.4054651081081644,
            0,
            0,
            1.4054651081081644
          ],
          [
            0,
            1.4054651081081644,
            1.4054651081081644,
            0
          ]
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 5
      }
    ]
  }
}
----

=== Limiting the Noise

One of the key challenges when working with term vectors is that text often has a significant amount of noise
which can obscure the important terms in the data. The `termVectors` function has several parameters
designed to filter out the less meaningful terms. This is also important because eliminating
the noisy terms helps keep the term vector matrix small enough to fit comfortably in memory.

There are four parameters designed to filter noisy terms from the term vector matrix:

* *minTermLength*: The minimum term length required to include the term in the matrix.
* *minDocFreq*: The minimum *percentage* (0 to 1) of documents the term must appear in to be included in the matrix.
* *maxDocFreq*: The maximum *percentage* (0 to 1) of documents the term can appear in to be included in the matrix.
* *exclude*: A comma delimited list of strings used to exclude terms. If a term contains any of the exclude strings that
term will be excluded from the term vector, as shown in the sketch below.
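
Below is a minimal sketch combining these parameters, building on the term vector
example above. The specific threshold values and the exclude string "number" are
assumptions chosen for illustration, not recommended settings:

[source,text]
----
let(a=select(search(collection3, q="*:*", fl="id, subject", sort="id asc"),
             id,
             analyze(subject, subject_bigram) as terms),
    b=termVectors(a, minTermLength=4, minDocFreq=0.05, maxDocFreq=0.95, exclude="number"))
----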

@@ -0,0 +1,431 @@
= Time Series
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

This section of the user guide provides an overview of time series *aggregation*,
*smoothing* and *differencing*.

== Time Series Aggregation

The `timeseries` function performs fast, distributed time
series aggregation leveraging Solr's built-in faceting and date math capabilities.

The example below performs a monthly time series aggregation:

[source,text]
----
timeseries(collection1,
           q=*:*,
           field="recdate_dt",
           start="2012-01-20T17:33:18Z",
           end="2012-12-20T17:33:18Z",
           gap="+1MONTH",
           format="YYYY-MM",
           count(*))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "recdate_dt": "2012-01",
        "count(*)": 8703
      },
      {
        "recdate_dt": "2012-02",
        "count(*)": 8648
      },
      {
        "recdate_dt": "2012-03",
        "count(*)": 8621
      },
      {
        "recdate_dt": "2012-04",
        "count(*)": 8533
      },
      {
        "recdate_dt": "2012-05",
        "count(*)": 8792
      },
      {
        "recdate_dt": "2012-06",
        "count(*)": 8598
      },
      {
        "recdate_dt": "2012-07",
        "count(*)": 8679
      },
      {
        "recdate_dt": "2012-08",
        "count(*)": 8469
      },
      {
        "recdate_dt": "2012-09",
        "count(*)": 8637
      },
      {
        "recdate_dt": "2012-10",
        "count(*)": 8536
      },
      {
        "recdate_dt": "2012-11",
        "count(*)": 8785
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 16
      }
    ]
  }
}
----

== Vectorizing the Time Series

Before a time series result can be operated on by math expressions
the data will need to be vectorized. Specifically,
in the example above, the aggregation field count(*) will need to be moved into an array.
As described in the Streams and Vectorization section of the user guide, the `col` function can be used
to copy a numeric column from a list of tuples into an array.

The expression below demonstrates the vectorization of the count(*) field.

[source,text]
----
let(a=timeseries(collection1,
                 q=*:*,
                 field="test_dt",
                 start="2012-01-20T17:33:18Z",
                 end="2012-12-20T17:33:18Z",
                 gap="+1MONTH",
                 format="YYYY-MM",
                 count(*)),
    b=col(a, count(*)))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "b": [
          8703,
          8648,
          8621,
          8533,
          8792,
          8598,
          8679,
          8469,
          8637,
          8536,
          8785
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 5
      }
    ]
  }
}
----

== Smoothing

Time series smoothing is often used to remove the noise from a time series and help
spot the underlying trends.
The math expressions library has three *sliding window* approaches
for time series smoothing. The *sliding window* approaches use a summary value
from a sliding window of the data to calculate a new set of smoothed data points.

The three *sliding window* functions are lagging indicators, which means
they don't start to move in the direction of the trend until the trend affects
the summary value of the sliding window. Because of this lagging quality these smoothing
functions are often used to confirm the direction of the trend.

=== Moving Average

The `movingAvg` function computes a simple moving average over a sliding window of data.
The example below generates a time series, vectorizes the count(*) field and computes the
moving average with a window size of 3.

The moving average function returns an array that is of shorter length
than the original data set. This is because results are generated only when a full window of data
is available for computing the average. With a window size of three the moving average will
begin generating results at the 3rd value. The prior values are not included in the result.

This is true for all the sliding window functions.
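
As a quick illustration of the window behavior, the sketch below applies a window
size of 3 to a five element array. Only three averages are produced, one per full
window: the expected return value of *b* is the array [2, 3, 4].

[source,text]
----
let(a=array(1, 2, 3, 4, 5),
    b=movingAvg(a, 3))
----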

[source,text]
----
let(a=timeseries(collection1,
                 q=*:*,
                 field="test_dt",
                 start="2012-01-20T17:33:18Z",
                 end="2012-12-20T17:33:18Z",
                 gap="+1MONTH",
                 format="YYYY-MM",
                 count(*)),
    b=col(a, count(*)),
    c=movingAvg(b, 3))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "c": [
          8657.333333333334,
          8600.666666666666,
          8648.666666666666,
          8641,
          8689.666666666666,
          8582,
          8595,
          8547.333333333334,
          8652.666666666666
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 7
      }
    ]
  }
}
----

=== Exponential Moving Average

The `expMovingAvg` function uses a different formula for computing the moving average that
responds faster to changes in the underlying data. This means that it is
less of a lagging indicator than the simple moving average.
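
In the example that follows, the smoothed values are consistent with the standard
exponential weighting scheme (an observation based on the sample output, not a
statement of the exact implementation): the first smoothed value is seeded with the
simple average of the first window, and each later value is computed as
prev + alpha * (value - prev), with alpha = 2 / (window size + 1).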

Below is an example that computes an exponential moving average:

[source,text]
----
let(a=timeseries(collection1, q=*:*,
                 field="test_dt",
                 start="2012-01-20T17:33:18Z",
                 end="2012-12-20T17:33:18Z",
                 gap="+1MONTH",
                 format="YYYY-MM",
                 count(*)),
    b=col(a, count(*)),
    c=expMovingAvg(b, 3))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "c": [
          8657.333333333334,
          8595.166666666668,
          8693.583333333334,
          8645.791666666668,
          8662.395833333334,
          8565.697916666668,
          8601.348958333334,
          8568.674479166668,
          8676.837239583334
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 5
      }
    ]
  }
}
----

=== Moving Median

The `movingMedian` function uses the median of the sliding window rather than the average.
In many cases the moving median will be more *robust* to outliers than the moving average.

Below is an example computing the moving median:

[source,text]
----
let(a=timeseries(collection1,
                 q=*:*,
                 field="test_dt",
                 start="2012-01-20T17:33:18Z",
                 end="2012-12-20T17:33:18Z",
                 gap="+1MONTH",
                 format="YYYY-MM",
                 count(*)),
    b=col(a, count(*)),
    c=movingMedian(b, 3))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "c": [
          8648,
          8621,
          8621,
          8598,
          8679,
          8598,
          8637,
          8536,
          8637
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 7
      }
    ]
  }
}
----

== Differencing

Differencing is often used to remove the
trend or seasonality from a time series. This is known as making a time series
*stationary*.

=== First Difference

The actual technique of differencing is to use the difference between values rather than the
original values. The *first difference* takes the difference between a value and the value
that came directly before it. The first difference is often used to remove the trend
from a time series.
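
As a minimal illustration (a sketch with values chosen for clarity), the first
difference of the array [10, 12, 15, 19] is [2, 3, 4]:

[source,text]
----
let(a=array(10, 12, 15, 19),
    b=diff(a))
----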

In the example below, the `diff` function computes the first difference of a time series.
The result array is one value shorter than the original array.
This is because the `diff` function only returns a result for values
where the prior value has been subtracted.

[source,text]
----
let(a=timeseries(collection1,
                 q=*:*,
                 field="test_dt",
                 start="2012-01-20T17:33:18Z",
                 end="2012-12-20T17:33:18Z",
                 gap="+1MONTH",
                 format="YYYY-MM",
                 count(*)),
    b=col(a, count(*)),
    c=diff(b))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "c": [
          -55,
          -27,
          -88,
          259,
          -194,
          81,
          -210,
          168,
          -101,
          249
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 11
      }
    ]
  }
}
----

=== Lagged Differences

The `diff` function has an optional second parameter to specify a lag in the difference.
If a lag is specified the difference is taken between a value and the value at a specified
lag in the past. Lagged differences are often used to remove seasonality from a time series.

The simple example below demonstrates how lagged differencing works.
Notice that the array in the example follows a simple repeated pattern. This type of pattern
is often seen in data with seasonality. In this example we can remove this pattern using
the `diff` function with a lag of 4. This will subtract the value lagging four indexes
behind the current index. Notice that the result array size is the original array size minus the lag.
This is because the `diff` function only returns results for values where the lag of 4
is possible to compute.

[source,text]
----
let(a=array(1,2,5,2,1,2,5,2,1,2,5),
    b=diff(a, 4))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "b": [
          0,
          0,
          0,
          0,
          0,
          0,
          0
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

@@ -0,0 +1,147 @@
= Variables
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

== The Let Expression

The `let` expression sets variables and returns
the value of the last variable by default. The output of any streaming expression
or math expression can be set to a variable.

Below is a simple example setting three variables *a*, *b*
and *c*. Variables *a* and *b* are set to arrays. The variable *c* is set
to the output of the `ebeAdd` function, which performs element-by-element
addition of the two arrays.

Notice that the last variable, *c*, is returned.

[source,text]
----
let(a=array(1, 2, 3),
    b=array(10, 20, 30),
    c=ebeAdd(a, b))
----

When this expression is sent to the /stream handler it
responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "c": [
          11,
          22,
          33
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 4
      }
    ]
  }
}
----

== Echoing Variables

All variables can be output by setting the *echo* parameter to *true*.

[source,text]
----
let(echo=true,
    a=array(1, 2, 3),
    b=array(10, 20, 30),
    c=ebeAdd(a, b))
----

When this expression is sent to the /stream handler it
responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "a": [
          1,
          2,
          3
        ],
        "b": [
          10,
          20,
          30
        ],
        "c": [
          11,
          22,
          33
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

A specific set of variables can be echoed by providing a comma delimited
list of variables to the echo parameter.

[source,text]
----
let(echo="a,b",
    a=array(1, 2, 3),
    b=array(10, 20, 30),
    c=ebeAdd(a, b))
----

When this expression is sent to the /stream handler it
responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "a": [
          1,
          2,
          3
        ],
        "b": [
          10,
          20,
          30
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

@@ -0,0 +1,343 @@
= Vector Math
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

This section of the user guide covers vector math and
vector manipulation functions.

== Arrays

Arrays can be created with the `array` function.

For example, the expression below creates a numeric array with
three elements:

[source,text]
----
array(1, 2, 3)
----

When this expression is sent to the /stream handler it responds with
a JSON array:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "return-value": [
          1,
          2,
          3
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

== Array Operations

Arrays can be passed as parameters to functions that operate on arrays.

For example, an array can be reversed with the `rev` function:

[source,text]
----
rev(array(1, 2, 3))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "return-value": [
          3,
          2,
          1
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

Another example is the `length` function,
which returns the length of an array:

[source,text]
----
length(array(1, 2, 3))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "return-value": 3
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

A slice of an array can be taken with the `copyOfRange` function, which
copies elements of an array from a start index (inclusive) to an end index (exclusive).

[source,text]
----
copyOfRange(array(1,2,3,4,5,6), 1, 4)
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "return-value": [
          2,
          3,
          4
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

== Vector Summarizations and Norms

There is a set of functions that perform
summarizations and return norms of arrays. These functions
operate over an array and return a single
value. The following vector summarization and norm functions are available:
`mult`, `add`, `sumSq`, `mean`, `l1norm`, `l2norm`, `linfnorm`.

The example below uses the `mult` function,
which multiplies all the values of an array together (in this case 2 x 4 x 8 = 64).

[source,text]
----
mult(array(2,4,8))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "return-value": 64
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----

The vector norm functions provide different formulas for calculating vector magnitude.

The example below calculates the *l2norm* of an array.

[source,text]
----
l2norm(array(2,4,8))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "return-value": 9.16515138991168
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----
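
As a check, the *l2norm* value above is the square root of the sum of the squared
elements: sqrt(4 + 16 + 64) = sqrt(84), which is approximately 9.16515.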

== Scalar Vector Math

Scalar vector math functions add, subtract, multiply or divide a scalar value with every value in a vector.
The following functions perform these operations: `scalarAdd`, `scalarSubtract`, `scalarMultiply`
and `scalarDivide`.

Below is an example of the `scalarMultiply` function, which multiplies the scalar value 3 with
every value of an array.

[source,text]
----
scalarMultiply(3, array(1,2,3))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "return-value": [
          3,
          6,
          9
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 0
      }
    ]
  }
}
----
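
The other scalar functions follow the same pattern. For example, a minimal sketch
(assuming `scalarAdd` takes the scalar first, matching `scalarMultiply` above) that
adds 10 to every element, which would return the array [11, 12, 13]:

[source,text]
----
scalarAdd(10, array(1, 2, 3))
----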

== Element-By-Element Vector Math

Two vectors can be added, subtracted, multiplied and divided using element-by-element
vector math functions. The element-by-element vector math functions are:
`ebeAdd`, `ebeSubtract`, `ebeMultiply`, `ebeDivide`.

The expression below performs the element-by-element subtraction of two arrays.

[source,text]
----
ebeSubtract(array(10, 15, 20), array(1,2,3))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "return-value": [
          9,
          13,
          17
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 5
      }
    ]
  }
}
----

== Dot Product and Cosine Similarity

The `dotProduct` and `cosineSimilarity` functions are often used as similarity measures between two
sparse vectors. The `dotProduct` is a measure of both angle and magnitude while `cosineSimilarity`
is a measure only of angle.

Below is an example of the `dotProduct` function:

[source,text]
----
dotProduct(array(2,3,0,0,0,1), array(2,0,1,0,0,3))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "return-value": 7
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 15
      }
    ]
  }
}
----

Below is an example of the `cosineSimilarity` function:

[source,text]
----
cosineSimilarity(array(2,3,0,0,0,1), array(2,0,1,0,0,3))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "return-value": 0.5
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 7
      }
    ]
  }
}
----
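
As a check on the results above: the dot product is
(2 x 2) + (3 x 0) + (0 x 1) + (0 x 0) + (0 x 0) + (1 x 3) = 7, and the cosine
similarity divides that dot product by the product of the two vector magnitudes,
sqrt(14) x sqrt(14) = 14, giving 7 / 14 = 0.5.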

@@ -0,0 +1,243 @@
= Streams and Vectorization
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

This section of the user guide explores techniques
for retrieving streams of data from Solr and vectorizing the
*numeric* fields.

The next chapter of the user guide covers
Text Analysis and Term Vectors, which describes how to
vectorize *text* fields.

== Streams

Streaming Expressions has a wide range of stream sources that can be used to
retrieve data from Solr Cloud collections. Math expressions can be used
to vectorize and analyze the result sets.

Below are some of the key stream sources:

* *random*: Random sampling is widely used in statistics, probability and machine learning.
The `random` function returns a random sample of search results that match a
query. The random samples can be vectorized and operated on by math expressions and the results
can be used to describe and make inferences about the entire population.

* *timeseries*: The `timeseries`
expression provides fast distributed time series aggregations, which can be
vectorized and analyzed with math expressions.

* *knnSearch*: K-nearest neighbor is a core machine learning algorithm. The `knnSearch`
function is a specialized knn algorithm optimized to find the k-nearest neighbors of a document in
a distributed index. Once the nearest neighbors are retrieved they can be vectorized
and operated on by machine learning and text mining algorithms.

* *sql*: SQL is the primary query language used by data scientists. The `sql` function supports
data retrieval using a subset of SQL which includes both full text search and
fast distributed aggregations. The result sets can then be vectorized and operated
on by math expressions.

* *jdbc*: The `jdbc` function allows data from any JDBC compliant data source to be combined with
streams originating from Solr. Result sets from outside data sources can be vectorized and operated
on by math expressions in the same manner as result sets originating from Solr.

* *topic*: Messaging is an important foundational technology for large scale computing. The `topic`
function provides publish/subscribe messaging capabilities by treating
Solr Cloud as a distributed message queue. Topics are extremely powerful
because they allow subscription by query. Topics can be used to support a broad set of
use cases including bulk text mining operations and AI alerting.

* *nodes*: Graph queries are frequently used by recommendation engines and are an important
machine learning tool. The `nodes` function provides fast, distributed, breadth
first graph traversal over documents in a Solr Cloud collection. The node sets collected
by the `nodes` function can be operated on by statistical and machine learning expressions to
gain more insight into the graph.

* *search*: Ranked search results are a powerful tool for finding the most relevant
documents from a large document corpus. The `search` expression
returns the top N ranked search results that match any
Solr query, including geo-spatial queries. The smaller set of relevant
documents can then be explored with statistical, machine learning and
text mining expressions to gather insights about the data set.

== Assigning Streams to Variables

The output of any streaming expression can be set to a variable.
Below is a very simple example using the `random` function to fetch
three random samples from collection1. The random samples are returned
as *tuples*, which contain name/value pairs.

[source,text]
----
let(a=random(collection1, q="*:*", rows="3", fl="price_f"))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "a": [
          {
            "price_f": 0.7927976
          },
          {
            "price_f": 0.060795486
          },
          {
            "price_f": 0.55128294
          }
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 11
      }
    ]
  }
}
----

== Creating a Vector with the *col* Function

The `col` function iterates over a list of tuples and copies the values
from a specific column into an *array*.

The output of the `col` function is a numeric array that can be set to a
variable and operated on by math expressions.

Below is an example of the `col` function:

[source,text]
----
let(a=random(collection1, q="*:*", rows="3", fl="price_f"),
    b=col(a, price_f))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "b": [
          0.42105234,
          0.85237443,
          0.7566981
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 9
      }
    ]
  }
}
----

== Applying Math Expressions to the Vector

Once a vector has been created any math expression that operates on vectors
can be applied. In the example below the `mean` function is applied to
the vector assigned to variable *b*.

[source,text]
----
let(a=random(collection1, q="*:*", rows="15000", fl="price_f"),
    b=col(a, price_f),
    c=mean(b))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "c": 0.5016035594638814
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 306
      }
    ]
  }
}
----

== Creating Matrices

Matrices can be created by vectorizing multiple numeric fields
and adding them to a matrix. The matrices can then be operated on by
any math expression that operates on matrices.

Note that this section deals with the creation of matrices
from numeric data. The next chapter of the user guide covers
Text Analysis and Term Vectors, which describes how to build TF-IDF
term vector matrices from text fields.

Below is a simple example where four random samples are taken
from different sub-populations in the data. The *price_f* field of
each random sample is
vectorized and the vectors are added as rows to a matrix.
Then the `sumRows`
function is applied to the matrix to return a vector containing
the sum of each row.

[source,text]
----
let(a=random(collection1, q="market:A", rows="5000", fl="price_f"),
    b=random(collection1, q="market:B", rows="5000", fl="price_f"),
    c=random(collection1, q="market:C", rows="5000", fl="price_f"),
    d=random(collection1, q="market:D", rows="5000", fl="price_f"),
    e=col(a, price_f),
    f=col(b, price_f),
    g=col(c, price_f),
    h=col(d, price_f),
    i=matrix(e, f, g, h),
    j=sumRows(i))
----

When this expression is sent to the /stream handler it responds with:

[source,json]
----
{
  "result-set": {
    "docs": [
      {
        "j": [
          154390.1293375,
          167434.89453,
          159293.258493,
          149773.42769
        ]
      },
      {
        "EOF": true,
        "RESPONSE_TIME": 9
      }
    ]
  }
}
----

@@ -31,10 +31,12 @@ public class FieldValueEvaluator extends SourceEvaluator {
   private static final long serialVersionUID = 1L;

   private String fieldName;
+  private boolean literal;

   public FieldValueEvaluator(String fieldName) {
-    if(fieldName.startsWith("'") && fieldName.endsWith("'") && fieldName.length() > 1){
+    if(fieldName.startsWith("\"") && fieldName.endsWith("\"") && fieldName.length() > 1){
       fieldName = fieldName.substring(1, fieldName.length() - 1);
+      literal = true;
     }

     this.fieldName = fieldName;

@@ -42,6 +44,10 @@ public class FieldValueEvaluator extends SourceEvaluator {

   @Override
   public Object evaluate(Tuple tuple) throws IOException {
+    if(literal) {
+      return fieldName;
+    }
+
     Object value = tuple.get(fieldName);

     // This is somewhat radical.

@@ -84,10 +90,6 @@ public class FieldValueEvaluator extends SourceEvaluator {
       }
     }

-    if(value == null) {
-      return fieldName;
-    }
-
     return value;
   }