From 1ed4e226ac66078a775c869a375c8c816220edec Mon Sep 17 00:00:00 2001 From: Joel Bernstein Date: Mon, 26 Mar 2018 12:48:33 -0400 Subject: [PATCH] SOLR-11947: Squashed commit of the following ref guide changes: commit 61053f2fe373bff0b451f549e063550f08ecdac1 Author: Joel Bernstein Date: Mon Mar 26 12:44:12 2018 -0400 SOLR-11947: Fix orphaned files commit 42302073bf61fde134caeff71b6db3978e113b4d Author: Joel Bernstein Date: Mon Mar 26 12:27:26 2018 -0400 SOLR-11947: small change commit b16b1453c2e7d5083f588b4b874c918d521e9fe5 Author: Joel Bernstein Date: Mon Mar 26 12:23:17 2018 -0400 SOLR-11947: proofing commit 57265ce4659a427c179e206b79d8fe05b01a5f93 Author: Joel Bernstein Date: Sat Mar 24 14:41:48 2018 -0400 SOLR-11947: monte carlo WIP commit 04e8381f6b5b329c5fa17c1f31c2d848fe9cec2a Author: Joel Bernstein Date: Fri Mar 23 16:24:10 2018 -0400 SOLR-11947: probabiity WIP commit 4298a6d514e7e431e322a4f62c22c336430a89f1 Author: Joel Bernstein Date: Fri Mar 23 13:07:05 2018 -0400 SOLR-11947: time series WIP commit 1a7654f9225948cd4adb3056bc2192cc0d24b3ee Author: Joel Bernstein Date: Fri Mar 23 11:32:53 2018 -0400 SOLR-11947: machine learning WIP commit fae0c3aa46e6f26fecb59077207982b2f584ec86 Author: Joel Bernstein Date: Thu Mar 22 22:14:15 2018 -0400 SOLR-11947: machine learning WIP commit fb6a96b2bdc4bbc4c2b5b62b6e69cd561ef9e31b Author: Joel Bernstein Date: Thu Mar 22 14:36:08 2018 -0400 SOLR-11947: numerical analysis WIP commit a648ba939c90caf5db2a5b88023bd580d4d1e8af Author: Joel Bernstein Date: Thu Mar 22 12:27:33 2018 -0400 SOLR-11947: numerical analysis WIP commit ce8f1b710d414d8e3ff3c8676f64fc3017316a15 Author: Joel Bernstein Date: Wed Mar 21 19:56:10 2018 -0400 SOLR-11947: numerical analysis WIP commit 5e25a4884341cdd84988e13250f255eb23d7fd50 Author: Joel Bernstein Date: Tue Mar 20 22:01:59 2018 -0400 SOLR-11947: Curve fitting WIP commit f381414dc44ecfa781988c5ca75bfb1c80de6674 Author: Joel Bernstein Date: Tue Mar 20 21:49:39 2018 -0400 SOLR-11947: Curve fitting WIP commit 4be725132215ed44cc84587bb0d11be216360b74 Author: Joel Bernstein Date: Mon Mar 19 19:55:10 2018 -0400 SOLR-11947: Monte Carlo WIP commit d330b412e46be0ebf8d75e99295e3fe9f978c02c Author: Joel Bernstein Date: Sun Mar 18 22:00:55 2018 -0400 SOLR-11947: Probability WIP commit e3d6160c1fa650e054b9694c57d34b3950c80175 Author: Joel Bernstein Date: Sat Mar 17 21:18:43 2018 -0400 SOLR-11947: More WIP commit 8484b0283f79825dee8eaee82604120d04511de4 Author: Joel Bernstein Date: Fri Mar 16 15:03:06 2018 -0400 SOLR-11947: machine learning WIP commit 77ecfdc71d79ca8eded0355669310c6025c70d96 Author: Joel Bernstein Date: Thu Mar 15 21:33:09 2018 -0400 SOLR-11947: machine learning WIP commit 7488caf5e54436a0e5fe85c0dda4ea31d8357600 Author: Joel Bernstein Date: Thu Mar 15 19:08:50 2018 -0400 SOLR-11947: machine learning WIP commit 102ee2e1857e7d7f45d7f3195a0a4e91eacb766d Author: Joel Bernstein Date: Thu Mar 15 15:18:31 2018 -0400 SOLR-11947: machine learning WIP commit 0d5cd2b4a4fd012fe6d640a86733280702cf8673 Author: Joel Bernstein Date: Wed Mar 14 21:49:15 2018 -0400 SOLR-11947: numerical analysis WIP commit 31eec30576479a9023c7b0e6ccb2d9f685e128a1 Author: Joel Bernstein Date: Wed Mar 14 14:41:06 2018 -0400 SOLR-11947: numerical analysis WIP commit c6e324ac56ca6e9f229d6acb39fdcf60c3356230 Author: Joel Bernstein Date: Tue Mar 13 15:16:26 2018 -0400 SOLR-11947: term vectors WIP commit 8c843999eabdb82665641caa9c21f07e95b70a86 Author: Joel Bernstein Date: Mon Mar 12 18:03:53 2018 -0400 SOLR-11947: Add curve fitting to TOC commit 
09be026f6ad400d965fd373403d7a2eb2fae0c90 Author: Joel Bernstein Date: Mon Mar 12 15:36:05 2018 -0400 SOLR-11947: Text analysis WIP commit e48b4d69abadb603a90c052aa1e36dd60ae7fd33 Author: Joel Bernstein Date: Sun Mar 11 18:29:20 2018 -0400 SOLR-11947: TOC changes commit f71ebc079713e16492ba45cedafc3b9512f6bae2 Author: Joel Bernstein Date: Sat Mar 10 17:54:04 2018 -0500 SOLR-11947: WIP term vectors commit ebc6b3943a27454adaf1a2309b6720bb2ba63c8c Author: Joel Bernstein Date: Sat Mar 10 13:34:19 2018 -0500 SOLR-11947: WIP regression commit 44752b2d34f46bc7f5693839e42ab3cef9edc47c Author: Joel Bernstein Date: Fri Mar 9 22:40:40 2018 -0500 SOLR-11947: WIP for vectorization.adoc commit 43254fcb05386264a6d591b1fa2c2573dcc2d2a3 Author: Joel Bernstein Date: Fri Mar 9 19:42:26 2018 -0500 SOLR-11947: Test local links commit b60df2000978f70720eb0a36543752fd3bf07d2c Author: Joel Bernstein Date: Thu Mar 8 21:41:17 2018 -0500 SOLR-11947: Update math-expressions TOC commit de068c3af8557d60de37cb29f3ed7da3f5442772 Author: Joel Bernstein Date: Thu Mar 8 21:24:46 2018 -0500 SOLR-11947: Continued work on math expressions documentation. commit fe445f2c997ea825d1ae9b9912406521249befc0 Author: Joel Bernstein Date: Sun Mar 4 20:22:33 2018 -0500 SOLR-12054: ebeAdd and ebeSubtract should support matrix operations commit 1f3ae745cc26453a34a64a4327ceac7cc91d23f5 Author: Joel Bernstein Date: Sun Mar 4 13:24:54 2018 -0500 SOLR-11947: Initial commit for new math expression docs WIP --- solr/solr-ref-guide/src/curve-fitting.adoc | 182 +++++ solr/solr-ref-guide/src/machine-learning.adoc | 680 ++++++++++++++++++ solr/solr-ref-guide/src/math-expressions.adoc | 59 ++ solr/solr-ref-guide/src/matrix-math.adoc | 443 ++++++++++++ solr/solr-ref-guide/src/montecarlo.adoc | 213 ++++++ .../src/numerical-analysis.adoc | 430 +++++++++++ solr/solr-ref-guide/src/probability.adoc | 415 +++++++++++ solr/solr-ref-guide/src/regression.adoc | 439 +++++++++++ solr/solr-ref-guide/src/scalar-math.adoc | 137 ++++ solr/solr-ref-guide/src/statistics.adoc | 575 +++++++++++++++ .../src/streaming-expressions.adoc | 2 +- solr/solr-ref-guide/src/term-vectors.adoc | 237 ++++++ solr/solr-ref-guide/src/time-series.adoc | 431 +++++++++++ solr/solr-ref-guide/src/variables.adoc | 147 ++++ solr/solr-ref-guide/src/vector-math.adoc | 343 +++++++++ solr/solr-ref-guide/src/vectorization.adoc | 243 +++++++ .../solrj/io/eval/FieldValueEvaluator.java | 12 +- 17 files changed, 4982 insertions(+), 6 deletions(-) create mode 100644 solr/solr-ref-guide/src/curve-fitting.adoc create mode 100644 solr/solr-ref-guide/src/machine-learning.adoc create mode 100644 solr/solr-ref-guide/src/math-expressions.adoc create mode 100644 solr/solr-ref-guide/src/matrix-math.adoc create mode 100644 solr/solr-ref-guide/src/montecarlo.adoc create mode 100644 solr/solr-ref-guide/src/numerical-analysis.adoc create mode 100644 solr/solr-ref-guide/src/probability.adoc create mode 100644 solr/solr-ref-guide/src/regression.adoc create mode 100644 solr/solr-ref-guide/src/scalar-math.adoc create mode 100644 solr/solr-ref-guide/src/statistics.adoc create mode 100644 solr/solr-ref-guide/src/term-vectors.adoc create mode 100644 solr/solr-ref-guide/src/time-series.adoc create mode 100644 solr/solr-ref-guide/src/variables.adoc create mode 100644 solr/solr-ref-guide/src/vector-math.adoc create mode 100644 solr/solr-ref-guide/src/vectorization.adoc diff --git a/solr/solr-ref-guide/src/curve-fitting.adoc b/solr/solr-ref-guide/src/curve-fitting.adoc new file mode 100644 index 00000000000..057cc23ab61 --- 
/dev/null +++ b/solr/solr-ref-guide/src/curve-fitting.adoc @@ -0,0 +1,182 @@ += Curve Fitting +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +== Polynomial Curve Fitting + + +The `polyfit` function is a general purpose curve fitter used to model +the *non-linear* relationship between two random variables. + +The `polyfit` function is passed *x* and *y* axes and fits a smooth curve to the data. +If only a single array is provided it is treated as the *y* axis and a sequence is generated +for the *x* axis. + +The `polyfit` function also has a parameter that specifies the degree of the polynomial. The higher +the degree, the more curvature that can be modeled. + +The example below uses the `polyfit` function to fit a curve to an array using +a 3rd degree polynomial. The fitted curve is then subtracted from the original curve. The output +shows the error between the fitted curve and the original curve, known as the residuals. +The output also includes the sum-of-squares of the residuals, which provides a measure +of how large the error is. + +[source,text] +---- +let(echo="residuals, sumSqError", + y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 6, 5, 5, 3, 2, 1, 0), + curve=polyfit(y, 3), + residuals=ebeSubtract(y, curve), + sumSqError=sumSq(residuals)) +---- + +When this expression is sent to the /stream handler it +responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "residuals": [ + 0.5886274509803899, + -0.0746078431372561, + -0.49492135315664765, + -0.6689571213100631, + -0.5933591898297781, + 0.4352283990519288, + 0.32016160310277897, + 1.1647963800904968, + 0.272488687782805, + -0.3534055160525744, + 0.2904697263520779, + -0.7925296272355089, + -0.5990476190476182, + -0.12572829131652274, + 0.6307843137254909 + ], + "sumSqError": 4.7294282482223595 + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +In the next example the curve is fit using a 5th degree polynomial. Notice that the curve +fits more closely, as shown by the smaller residuals and the lower value for the sum-of-squares of the +residuals. This is because the higher degree polynomial has more flexibility to fit the data.
+ +[source,text] +---- +let(echo="residuals, sumSqError", + y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 6, 5, 5, 3, 2, 1, 0), + curve=polyfit(y, 5), + residuals=ebeSubtract(y, curve), + sumSqError=sumSq(residuals)) +---- + +When this expression is sent to the /stream handler it +responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "residuals": [ + -0.12337461300309674, + 0.22708978328173413, + 0.12266015718028167, + -0.16502738747320755, + -0.41142804563857105, + 0.2603044014808713, + -0.12128970101106162, + 0.6234168308471704, + -0.1754692675745293, + -0.5379689969473249, + 0.4651616185671843, + -0.288175756132409, + 0.027970945463215102, + 0.18699690402476687, + -0.09086687306501587 + ], + "sumSqError": 1.413089480179252 + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + + +== Prediction, Derivatives and Integrals + +The `polyfit` function returns an array which contains the *y* value data points +of the fitted curve. + +In order to predict values along the curve an interpolation function must be created +for the curve. Once an interpolation function has been created the `predict`, +`derivative` and `integral` functions can be applied to the curve. + +In the example below the x axis is included for clarity. +The `polyfit` function returns an array with the fitted curve. +A linear interpolation function is then created for the curve with the `lerp` function. +The `predict` function is then used to predict a value along the curve; in this +case the prediction is made for the *x* value of .5. + +[source,text] +---- +let(x=array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14), + y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 6, 5, 5, 3, 2, 1, 0), + curve=polyfit(x, y, 5), + interp=lerp(x, curve), + p=predict(interp, .5)) +---- + +When this expression is sent to the /stream handler it +responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "p": 0.4481424148606813 + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + + + + diff --git a/solr/solr-ref-guide/src/machine-learning.adoc b/solr/solr-ref-guide/src/machine-learning.adoc new file mode 100644 index 00000000000..cbb3e05fb72 --- /dev/null +++ b/solr/solr-ref-guide/src/machine-learning.adoc @@ -0,0 +1,680 @@ += Machine Learning +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +This section of the math expressions user guide covers machine learning +functions. + +== Feature Scaling + +Before performing machine learning operations it's often necessary to +scale the feature vectors so they can be compared at the same scale. + +All the scaling functions operate on vectors and matrices. +When operating on a matrix the *rows* of the matrix are scaled.
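+ +Each of the scaling functions described in the sections below can also be applied directly to a single vector. As a minimal sketch, assuming the default 0 to 1 range, the `minMaxScale` function (covered in the next section) can scale one numeric array: + +[source,text] +---- +let(a=array(20, 30, 40, 50), + b=minMaxScale(a)) +----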
+ +=== Min/Max Scaling + +The `minMaxScale` function scales a vector or matrix between a min and +max value. By default it will scale between 0 and 1 if min/max values +are not provided. + +Below is a simple example of min/max scaling between 0 and 1. +Notice that once brought into the same scale the vectors are the same. + +[source,text] +---- +let(a=array(20, 30, 40, 50), + b=array(200, 300, 400, 500), + c=matrix(a, b), + d=minMaxScale(c)) +---- + +This expression returns the following response: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "d": [ + [ + 0, + 0.3333333333333333, + 0.6666666666666666, + 1 + ], + [ + 0, + 0.3333333333333333, + 0.6666666666666666, + 1 + ] + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +=== Standardization + +The `standardize` function scales a vector so that it has a +mean of 0 and a standard deviation of 1. Standardization can be +used with machine learning algorithms, such as SVM, that +perform better when the data has a normal distribution. + +[source,text] +---- +let(a=array(20, 30, 40, 50), + b=array(200, 300, 400, 500), + c=matrix(a, b), + d=standardize(c)) +---- + +This expression returns the following response: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "d": [ + [ + -1.161895003862225, + -0.3872983346207417, + 0.3872983346207417, + 1.161895003862225 + ], + [ + -1.1618950038622249, + -0.38729833462074165, + 0.38729833462074165, + 1.1618950038622249 + ] + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 17 + } + ] + } +} +---- + +=== Unitize + +The `unitize` function scales vectors to a magnitude of 1. A vector with a +magnitude of 1 is known as a unit vector. Unit vectors are +preferred when the vector math deals +with vector direction rather than magnitude. + +[source,text] +---- +let(a=array(20, 30, 40, 50), + b=array(200, 300, 400, 500), + c=matrix(a, b), + d=unitize(c)) +---- + +This expression returns the following response: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "d": [ + [ + 0.2721655269759087, + 0.40824829046386296, + 0.5443310539518174, + 0.6804138174397716 + ], + [ + 0.2721655269759087, + 0.4082482904638631, + 0.5443310539518174, + 0.6804138174397717 + ] + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 6 + } + ] + } +} +---- + +== Distance + +The `distance` function computes a distance measure for two +numeric arrays or a *distance matrix* for the columns of a matrix. 
+ +There are four distance measures currently supported: + +* euclidean (default) +* manhattan +* canberra +* earthMovers + +Below is an example for computing euclidean distance for +two numeric arrays: + + +[source,text] +---- +let(a=array(20, 30, 40, 50), + b=array(21, 29, 41, 49), + c=distance(a, b)) +---- + +This expression returns the following response: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "c": 2 + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +Below is an example for computing a distance matrix for columns +of a matrix: + +[source,text] +---- +let(a=array(20, 30, 40), + b=array(21, 29, 41), + c=array(31, 40, 50), + d=matrix(a, b, c), + e=distance(d)) +---- + +This expression returns the following response: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "e": [ + [ + 0, + 15.652475842498529, + 34.07345007480164 + ], + [ + 15.652475842498529, + 0, + 18.547236990991408 + ], + [ + 34.07345007480164, + 18.547236990991408, + 0 + ] + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 24 + } + ] + } +} +---- + +== K-means Clustering + +The `kmeans` function performs k-means clustering of the rows of a matrix. +Once the clustering has been completed there are a number of useful functions available +for examining the *clusters* and *centroids*. + +The examples below are clustering *term vectors*. +The chapter on link:term-vectors.adoc[Text Analysis and Term Vectors] should be +consulted for a full explanation of these features. + +=== Centroid Features + +In the example below the `kmeans` function is used to cluster a result set from the Enron email data set +and then the top features are extracted from the cluster centroids. + +Let's look at what data is assigned to each variable: + +* *a*: The `random` function returns a sample of 500 documents from the *enron* +collection that match the query *body:oil*. The `select` function selects the *id* +and annotates each tuple with the analyzed bigram terms from the body field. + +* *b*: The `termVectors` function creates a TF-IDF term vector matrix from the +tuples stored in variable *a*. Each row in the matrix represents a document. The columns of the matrix +are the bigram terms that were attached to each tuple. +* *c*: The `kmeans` function clusters the rows of the matrix into 5 clusters. The k-means clustering is performed using the +*Euclidean distance* measure. +* *d*: The `getCentroids` function returns a matrix of cluster centroids. Each row in the matrix is a centroid +from one of the 5 clusters. The columns of the matrix are the same bigram terms as the term vector matrix. +* *e*: The `topFeatures` function returns the column labels for the top 5 features of each centroid in the matrix. +This returns the top 5 bigram terms for each centroid.
+ +[source,text] +---- +let(a=select(random(enron, q="body:oil", rows="500", fl="id, body"), + id, + analyze(body, body_bigram) as terms), + b=termVectors(a, maxDocFreq=.10, minDocFreq=.05, minTermLength=14, exclude="_,copyright"), + c=kmeans(b, 5), + d=getCentroids(c), + e=topFeatures(d, 5)) +---- + +This expression returns the following response: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "e": [ + [ + "enron enronxgate", + "north american", + "energy services", + "conference call", + "power generation" + ], + [ + "financial times", + "chief financial", + "financial officer", + "exchange commission", + "houston chronicle" + ], + [ + "southern california", + "california edison", + "public utilities", + "utilities commission", + "rate increases" + ], + [ + "rolling blackouts", + "public utilities", + "electricity prices", + "federal energy", + "price controls" + ], + [ + "california edison", + "regulatory commission", + "southern california", + "federal energy", + "power generators" + ] + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 982 + } + ] + } +} +---- + +=== Cluster Features + +The example below examines the top features of a specific cluster. This example uses the same techniques +as the centroids example but the top features are extracted from a cluster rather than the centroids. + +The `getCluster` function returns a cluster by its index. Each cluster is a matrix containing term vectors +that have been clustered together based on their features. + +In the example below the `topFeatures` function is used to extract the top 5 features from each term vector +in the cluster. + +[source,text] +---- +let(a=select(random(collection3, q="body:oil", rows="500", fl="id, body"), + id, + analyze(body, body_bigram) as terms), + b=termVectors(a, maxDocFreq=.09, minDocFreq=.03, minTermLength=14, exclude="_,copyright"), + c=kmeans(b, 25), + d=getCluster(c, 0), + e=topFeatures(d, 5)) +---- + +This expression returns the following response: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "e": [ + [ + "electricity board", + "maharashtra state", + "power purchase", + "state electricity", + "reserved enron" + ], + [ + "electricity board", + "maharashtra state", + "state electricity", + "purchase agreement", + "independent power" + ], + [ + "maharashtra state", + "reserved enron", + "federal government", + "state government", + "dabhol project" + ], + [ + "purchase agreement", + "power purchase", + "electricity board", + "maharashtra state", + "state government" + ], + [ + "investment grade", + "portland general", + "general electric", + "holding company", + "transmission lines" + ], + [ + "state government", + "state electricity", + "purchase agreement", + "electricity board", + "maharashtra state" + ], + [ + "electricity board", + "state electricity", + "energy management", + "maharashtra state", + "energy markets" + ], + [ + "electricity board", + "maharashtra state", + "state electricity", + "state government", + "second quarter" + ] + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 978 + } + ] + } +} +---- + +== Multi K-means Clustering + +K-means clustering will produce different results depending on +the initial placement of the centroids. K-means is fast enough +that multiple trials can be performed and the best outcome selected. +The `multiKmeans` function runs the K-means +clustering algorithm for a given number of trials and selects the +best result based on which trial produces the lowest intra-cluster +variance.
+ +The example below is identical to the centroids example except that +it uses `multiKmeans` with 100 trials, rather than a single +trial of the `kmeans` function. + +[source,text] +---- +let(a=select(random(collection3, q="body:oil", rows="500", fl="id, body"), + id, + analyze(body, body_bigram) as terms), + b=termVectors(a, maxDocFreq=.09, minDocFreq=.03, minTermLength=14, exclude="_,copyright"), + c=multiKmeans(b, 5, 100), + d=getCentroids(c), + e=topFeatures(d, 5)) +---- + +This expression returns the following response: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "e": [ + [ + "enron enronxgate", + "energy trading", + "energy markets", + "energy services", + "unleaded gasoline" + ], + [ + "maharashtra state", + "electricity board", + "state electricity", + "energy trading", + "chief financial" + ], + [ + "price controls", + "electricity prices", + "francisco chronicle", + "wholesale electricity", + "power generators" + ], + [ + "southern california", + "california edison", + "public utilities", + "francisco chronicle", + "utilities commission" + ], + [ + "california edison", + "power purchases", + "system operator", + "term contracts", + "independent system" + ] + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 1182 + } + ] + } +} +---- + +== Fuzzy K-means Clustering + +The `fuzzyKmeans` function is a soft clustering algorithm which +allows vectors to be assigned to more than one cluster. The *fuzziness* parameter +is a value between 1 and 2 that determines how fuzzy to make the cluster assignment. + +After the clustering has been performed the `getMembershipMatrix` function can be called +on the clustering result to return a matrix describing which clusters each vector belongs to. +There is a row in the matrix for each vector that was clustered. There is a column in the matrix +for each cluster. The values in the columns are the probability that the vector belongs to the specific +cluster. + +A simple example will make this more clear. In the example below 300 documents are analyzed and +then turned into a term vector matrix. Then the `fuzzyKmeans` function clusters the +term vectors into 12 clusters with a fuzziness factor of 1.25. + +The `getMembershipMatrix` function is used to return the membership matrix and the first row +of the membership matrix is retrieved with the `rowAt` function. The `precision` function is then applied to the first row +of the matrix to make it easier to read. + +The output shows a single vector representing the cluster membership probabilities for the first +term vector. Notice that the term vector has the highest association with the 12th cluster, +but also has significant associations with the 3rd, 5th, 6th and 7th clusters. + +[source,text] +---- +let(a=select(random(collection3, q="body:oil", rows="300", fl="id, body"), + id, + analyze(body, body_bigram) as terms), + b=termVectors(a, maxDocFreq=.09, minDocFreq=.03, minTermLength=14, exclude="_,copyright"), + c=fuzzyKmeans(b, 12, fuzziness=1.25), + d=getMembershipMatrix(c), + e=rowAt(d, 0), + f=precision(e, 5)) +---- + +This expression returns the following response: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "f": [ + 0, + 0, + 0.178, + 0, + 0.17707, + 0.17775, + 0.16214, + 0, + 0, + 0, + 0, + 0.30504 + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 2157 + } + ] + } +} +---- + +== K-nearest Neighbor + +The `knn` function searches the rows of a matrix for the +K-nearest neighbors of a search vector. The `knn` function +returns a *matrix* of the K-nearest neighbors.
The `knn` function +has a *named parameter* called *distance* which specifies the distance measure. +There are four distance measures currently supported: + +* euclidean (default) +* manhattan +* canberra +* earthMovers + +The example below builds on the clustering examples to demonstrate +the `knn` function. + +In the example, the centroids matrix is set to variable *d*. The first +centroid vector is selected from the matrix with the `rowAt` function. +Then the `knn` function is used to find the 3 nearest neighbors +to the centroid vector in the term vector matrix (variable b). + +The `knn` function returns a matrix with the 3 nearest neighbors based on the +default distance measure, which is euclidean. Finally, the top 4 features +of the term vectors in the nearest neighbor matrix are returned. + +[source,text] +---- +let(a=select(random(collection3, q="body:oil", rows="500", fl="id, body"), + id, + analyze(body, body_bigram) as terms), + b=termVectors(a, maxDocFreq=.09, minDocFreq=.03, minTermLength=14, exclude="_,copyright"), + c=multiKmeans(b, 5, 100), + d=getCentroids(c), + e=rowAt(d, 0), + g=knn(b, e, 3), + h=topFeatures(g, 4)) +---- + +This expression returns the following response: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "h": [ + [ + "california power", + "electricity supply", + "concerned about", + "companies like" + ], + [ + "maharashtra state", + "california power", + "electricity board", + "alternative energy" + ], + [ + "electricity board", + "maharashtra state", + "state electricity", + "houston chronicle" + ] + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 1243 + } + ] + } +} +---- \ No newline at end of file diff --git a/solr/solr-ref-guide/src/math-expressions.adoc b/solr/solr-ref-guide/src/math-expressions.adoc new file mode 100644 index 00000000000..e2ed438b798 --- /dev/null +++ b/solr/solr-ref-guide/src/math-expressions.adoc @@ -0,0 +1,59 @@ += Math Expressions +:page-children: scalar-math, vector-math, variables, matrix-math, vectorization, term-vectors, statistics, probability, montecarlo, time-series, regression, numerical-analysis, curve-fitting, machine-learning + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +The Streaming Expression library includes a powerful +mathematical programming syntax with many of the features of a +functional programming language. The syntax includes variables, +data structures and a growing set of mathematical functions. + +This user guide provides an overview of the different areas of +mathematical coverage starting with basic scalar math and +ending with machine learning.
Along the way the guide covers variables +and data structures and techniques for combining Solr's +powerful streams with mathematical functions to make every +record in your SolrCloud cluster computable. + +== link:scalar-math.adoc[Scalar Math] + +== link:vector-math.adoc[Vector Math] + +== link:variables.adoc[Variables] + +== link:matrix-math.adoc[Matrix Math] + +== link:vectorization.adoc[Streams and Vectorization] + +== link:term-vectors.adoc[Text Analysis and Term Vectors] + +== link:statistics.adoc[Statistics] + +== link:probability.adoc[Probability] + +== link:montecarlo.adoc[Monte Carlo Simulations] + +== link:time-series.adoc[Time Series] + +== link:regression.adoc[Linear Regression] + +== link:numerical-analysis.adoc[Interpolation, Derivatives and Integrals] + +== link:curve-fitting.adoc[Curve Fitting] + +== link:machine-learning.adoc[Machine Learning] diff --git a/solr/solr-ref-guide/src/matrix-math.adoc b/solr/solr-ref-guide/src/matrix-math.adoc new file mode 100644 index 00000000000..ba45ccaaba0 --- /dev/null +++ b/solr/solr-ref-guide/src/matrix-math.adoc @@ -0,0 +1,443 @@ += Matrices and Matrix Math +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +This section of the user guide covers the +basics of matrix creation, manipulation and matrix math. Other sections +of the user guide demonstrate how matrices are used by the statistics, +probability and machine learning functions. + +== Matrix Creation + +A matrix can be created with the `matrix` function. +The matrix function is passed a list of `arrays` with +each array representing a *row* in the matrix. + +The example below creates a two-by-two matrix. + +[source,text] +---- +matrix(array(1, 2), + array(4, 5)) +---- + +When this expression is sent to the /stream handler it +responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "return-value": [ + [ + 1, + 2 + ], + [ + 4, + 5 + ] + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +== Accessing Rows and Columns + +The rows and columns of a matrix can be accessed using the `rowAt` +and `colAt` functions. + +The example below creates a two-by-two matrix and returns the second column of the matrix. +Notice that in this example the matrix is passed variables rather than +a list of arrays directly. + +[source,text] +---- +let(a=array(1, 2), + b=array(4, 5), + c=matrix(a, b), + d=colAt(c, 1)) +---- + +When this expression is sent to the /stream handler it +responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "d": [ + 2, + 5 + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +== Row and Column Labels + +A matrix can have row and column labels.
The functions +`setRowLabels`, `setColumnLabels`, `getRowLabels` and `getColumnLabels` +can be used to set and get the labels. The label values +are set using string arrays. + +In other sections of the +user guide examples are shown where functions return matrices +with the labels already set. + +Below is a simple example of setting and +getting row and column labels +on a matrix. + +[source,text] +---- +let(echo="d, e", + a=matrix(array(1, 2), + array(4, 5)), + b=setRowLabels(a, array("row0", "row1")), + c=setColumnLabels(b, array("col0", "col1")), + d=getRowLabels(c), + e=getColumnLabels(c)) +---- + +When this expression is sent to the /stream handler it +responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "d": [ + "row0", + "row1" + ], + "e": [ + "col0", + "col1" + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +== Matrix Attributes + +A matrix can also have an arbitrary set of named attributes associated +with it. Certain functions, such as the `termVectors` function, +return matrices that contain attributes that describe data in the matrix. + +Attributes can be retrieved by name using the `getAttribute` function and +the entire attribute map can be returned using the `getAttributes` +function. + +== Matrix Dimensions + +The dimensions of a matrix can be determined using the +`rowCount` and `columnCount` functions. + +The example below retrieves the dimensions of a matrix. + +[source,text] +---- +let(echo="b,c", + a=matrix(array(1, 2, 3), + array(4, 5, 6)), + b=rowCount(a), + c=columnCount(a)) +---- + +When this expression is sent to the /stream handler it +responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "b": 2, + "c": 3 + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +== Matrix Transposition + +A matrix can be https://en.wikipedia.org/wiki/Transpose[transposed] +using the `transpose` function. + +An example of matrix transposition is shown below: + +[source,text] +---- +let(a=matrix(array(1, 2), + array(4, 5)), + b=transpose(a)) +---- + +When this expression is sent to the /stream handler it +responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "b": [ + [ + 1, + 4 + ], + [ + 2, + 5 + ] + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 24 + } + ] + } +} +---- + +== Matrix Summations + +The rows and columns of a matrix can be summed with the `sumRows` and `sumColumns` functions. +Below is an example of the `sumRows` function which returns an +array with the sum of each row. + +[source,text] +---- +let(a=matrix(array(1, 2, 3), + array(4, 5, 6)), + b=sumRows(a)) +---- + +When this expression is sent to the /stream handler it +responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "b": [ + 6, + 15 + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 2 + } + ] + } +} +---- + +The `grandSum` function returns the sum of all values in the matrix. +Below is an example of the `grandSum` function: + +[source,text] +---- +let(a=matrix(array(1, 2, 3), + array(4, 5, 6)), + b=grandSum(a)) +---- + +When this expression is sent to the /stream handler it +responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "b": 21 + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +== Scalar Matrix Math + +The same scalar math functions that apply to vectors can also be applied to matrices: `scalarAdd`, `scalarSubtract`, +`scalarMultiply`, `scalarDivide`.
Below is an example of the `scalarAdd` function +which adds a scalar value to each element in a matrix. + + +[source,text] +---- +let(a=matrix(array(1, 2), + array(4, 5)), + b=scalarAdd(10, a)) +---- + +When this expression is sent to the /stream handler it +responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "b": [ + [ + 11, + 12 + ], + [ + 14, + 15 + ] + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +== Matrix Addition and Subtraction + +Two matrices can be added and subtracted using the `ebeAdd` and `ebeSubtract` functions, +which perform element-by-element addition +and subtraction of matrices. + +Below is a simple example of an element-by-element addition of a matrix by itself: + +[source,text] +---- +let(a=matrix(array(1, 2), + array(4, 5)), + b=ebeAdd(a, a)) +---- + +When this expression is sent to the /stream handler it +responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "b": [ + [ + 2, + 4 + ], + [ + 8, + 10 + ] + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +== Matrix Multiplication + +Matrix multiplication can be accomplished using the `matrixMult` function. Below is a simple +example of matrix multiplication: + +[source,text] +---- +let(a=matrix(array(1, 2), + array(4, 5)), + b=matrix(array(11, 12), + array(14, 15)), + c=matrixMult(a, b)) +---- + +When this expression is sent to the /stream handler it +responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "c": [ + [ + 39, + 42 + ], + [ + 114, + 123 + ] + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- \ No newline at end of file diff --git a/solr/solr-ref-guide/src/montecarlo.adoc b/solr/solr-ref-guide/src/montecarlo.adoc new file mode 100644 index 00000000000..814110ffefc --- /dev/null +++ b/solr/solr-ref-guide/src/montecarlo.adoc @@ -0,0 +1,213 @@ += Monte Carlo Simulations +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +Monte Carlo simulations are commonly used to model the behavior of +stochastic systems. This section of the user guide describes +how to perform both *uncorrelated* and *correlated* Monte Carlo simulations +using the *sampling* capabilities of the probability distribution framework. + +== Uncorrelated Simulations + +Uncorrelated Monte Carlo simulations model stochastic systems with the assumption + that the underlying random variables move independently of each other. + A simple example of a Monte Carlo simulation using two independently changing random variables + is described below. + +In this example a Monte Carlo simulation is used to determine the probability that a simple hinge assembly will +fall within a required length specification. + +The hinge has two components *A* and *B*.
The combined length of the two components must be less than 5 centimeters +to fall within specification. + +A random sampling of lengths for component *A* has shown that its length conforms to a +normal distribution with a mean of 2.2 centimeters and a standard deviation of .0195 +centimeters. + +A random sampling of lengths for component *B* has shown that its length conforms +to a normal distribution with a mean of 2.71 centimeters and a standard deviation of .0198 centimeters. + +The Monte Carlo simulation below performs the following steps: + +* A normal distribution with a mean of 2.2 and a standard deviation of .0195 is created to model the length of componentA. +* A normal distribution with a mean of 2.71 and a standard deviation of .0198 is created to model the length of componentB. +* The `monteCarlo` function is used to simulate component pairs. The `monteCarlo` function + calls the *add(sample(componentA), sample(componentB))* function 100000 times and collects the results in an array. Each + time the function is called a random sample is drawn from the componentA + and componentB length distributions. The `add` function adds the two samples to calculate the combined length. + The result of each function run is collected in an array and assigned to the *simresults* variable. +* An `empiricalDistribution` function is then created from the *simresults* array to model the distribution of the + simulation results. +* Finally, the `cumulativeProbability` function is called on the *simmodel* to determine the cumulative probability + that the combined length of the components is 5 or less. +* Based on the simulation there is a .9994371944629039 probability that the combined length of a component pair will +be 5 or less. + +[source,text] +---- +let(componentA=normalDistribution(2.2, .0195), + componentB=normalDistribution(2.71, .0198), + simresults=monteCarlo(add(sample(componentA), sample(componentB)), 100000), + simmodel=empiricalDistribution(simresults), + prob=cumulativeProbability(simmodel, 5)) +---- + +When this expression is sent to the /stream handler it responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "prob": 0.9994371944629039 + }, + { + "EOF": true, + "RESPONSE_TIME": 660 + } + ] + } +} +---- + +== Correlated Simulations + +The simulation above assumes that the lengths of *componentA* and *componentB* vary independently. +What would happen to the probability model if there was a correlation between the lengths of +*componentA* and *componentB*? + +In the example below a database containing assembled pairs of components is used to determine +if there is a correlation between the lengths of the components, and how the correlation affects the model. + +Before performing a simulation of the effects of correlation on the probability model it's +useful to understand what the correlation is between the lengths of *componentA* and *componentB*. + +In the example below 5000 random samples are selected from a collection +of assembled hinges. Each sample contains +lengths of the components in the fields *componentA_d* and *componentB_d*. + +Both fields are then vectorized. The *componentA_d* vector is stored in +variable *b* and the *componentB_d* vector is stored in variable *c*. + +Then the correlation of the two vectors is calculated using the `corr` function. Note that the outcome +from `corr` is 0.9996931313216989. This means that *componentA_d* and *componentB_d* are almost +perfectly correlated.
+ +[source,text] +---- +let(a=random(collection5, q="*:*", rows="5000", fl="componentA_d, componentB_d"), + b=col(a, componentA_d), + c=col(a, componentB_d), + d=corr(b, c)) +---- + +When this expression is sent to the /stream handler it responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "d": 0.9996931313216989 + }, + { + "EOF": true, + "RESPONSE_TIME": 309 + } + ] + } +} +---- + +How does correlation affect the probability model? + +The example below explores how to use a *multivariate normal distribution* function +to model how correlation affects the probability of hinge defects. + +In this example 5000 random samples are selected from a collection +containing length data for assembled hinges. Each sample contains +the fields *componentA_d* and *componentB_d*. + +Both fields are then vectorized. The *componentA_d* vector is stored in +variable *b* and the *componentB_d* vector is stored in variable *c*. + +An array is created that contains the *means* of the two vectorized fields. + +Then both vectors are added to a matrix which is transposed. This creates +an *observation* matrix where each row contains one observation of +*componentA_d* and *componentB_d*. A covariance matrix is then created from the columns of +the observation matrix with the +`cov` function. The covariance matrix describes the covariance between +*componentA_d* and *componentB_d*. + +The `multiVariateNormalDistribution` function is then called with the +array of means for the two fields and the covariance matrix. The model +for the multivariate normal distribution is stored in variable *g*. + +The `monteCarlo` function then calls the function *add(sample(g))* 50000 times +and collects the results in a vector. Each time the function is called a single sample +is drawn from the multivariate normal distribution. Each sample is a vector containing +one *componentA* and *componentB* pair. The `add` function adds the values in the vector to +calculate the length of the pair. Over the long term the samples drawn from the +multivariate normal distribution will conform to the covariance matrix used to construct it. + +Just as in the uncorrelated example an empirical distribution is used to model probabilities +of the simulation vector and the `cumulativeProbability` function is used to compute the cumulative +probability that the combined component length will be 5 centimeters or less. + +Notice that the probability of a hinge meeting specification has dropped to 0.9889517439980468. +This is because the strong correlation +between the lengths of components means that their lengths rise together, causing more hinges to +fall out of the 5 centimeter specification.
+ +[source,text] +---- +let(a=random(hinges, q="*:*", rows="5000", fl="componentA_d, componentB_d"), + b=col(a, componentA_d), + c=col(a, componentB_d), + cor=corr(b,c), + d=array(mean(b), mean(c)), + e=transpose(matrix(b, c)), + f=cov(e), + g=multiVariateNormalDistribution(d, f), + h=monteCarlo(add(sample(g)), 50000), + i=empiricalDistribution(h), + j=cumulativeProbability(i, 5)) +---- + +When this expression is sent to the /stream handler it responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "j": 0.9889517439980468 + }, + { + "EOF": true, + "RESPONSE_TIME": 599 + } + ] + } +} +---- + diff --git a/solr/solr-ref-guide/src/numerical-analysis.adoc b/solr/solr-ref-guide/src/numerical-analysis.adoc new file mode 100644 index 00000000000..cb2bc2e7d92 --- /dev/null +++ b/solr/solr-ref-guide/src/numerical-analysis.adoc @@ -0,0 +1,430 @@ += Interpolation, Derivatives and Integrals +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +This section of the math expressions user guide covers *interpolation*, *derivatives* and *integrals*. +These three interrelated topics are part of the field of mathematics called *numerical analysis*. + +== Interpolation + +Interpolation is used to construct new data points between a set of known control points. +The ability to *predict* new data points allows for *sampling* along the curve defined by the +control points. + +The interpolation functions described below all return an *interpolation model* +that can be passed to other functions which make use of the sampling capability. + +If returned directly the interpolation model returns an array containing predictions for each of the +control points. This is useful in the case of `loess` interpolation which first smooths the control points +and then interpolates the smoothed points. All other interpolation functions simply return the original +control points because interpolation predicts a curve that passes through the original control points. + +There are different algorithms for interpolation that will result in different predictions +along the curve. The math expressions library currently supports the following +interpolation functions: + +* `lerp`: Linear interpolation predicts points that pass through each control point and + form straight lines between control points. +* `spline`: Spline interpolation predicts points that pass through each control point +and form a smooth curve between control points. +* `akima`: Akima spline interpolation is similar to spline interpolation but is stable in the presence of outliers. +* `loess`: Loess interpolation first performs a non-linear local regression to smooth the original +control points. Then a spline is used to interpolate the smoothed control points.
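+ +As a brief illustration, the minimal sketch below (using illustrative data only) creates a spline interpolation model from a small set of control points and then uses the `predict` function to estimate a *y* value that falls between two of the control points: + +[source,text] +---- +let(x=array(0, 2, 4, 6, 8, 10), + y=array(5, 10, 60, 190, 100, 130), + model=spline(x, y), + p=predict(model, 5)) +---- + +The sections that follow demonstrate these sampling and prediction capabilities in more detail.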
+ +=== Upsampling + +Interpolation can be used to increase the sampling rate along a curve. One example +of this would be to take a time series with samples every minute and create a data set with +samples every second. In order to do this the data points between the minutes must be created. + +The `predict` function can be used to predict values anywhere within the bounds of the interpolation +range. The example below shows a very simple example of upsampling. + +In the example linear interpolation is performed on the arrays in variables *x* and *y*. The *x* variable, +which is the x axis, is a sequence from 0 to 20 with a stride of 2. The *y* variable defines the curve +along the x axis. + +The `lerp` function performs the interpolation and returns the interpolation model. + +The `u` value is an array from 0 to 20 with a stride of 1. This fills in the gaps of the original x axis. +The `predict` function then uses the interpolation function in variable *l* to predict values for +every point in the array assigned to variable *u*. + +The variable *p* is the array of predictions, which is the upsampled set of y values. + +[source,text] +---- +let(x=array(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20), + y=array(5, 10, 60, 190, 100, 130, 100, 20, 30, 10, 5), + l=lerp(x, y), + u=array(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20), + p=predict(l, u)) +---- + +When this expression is sent to the /stream handler it +responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "p": [ + 5, + 7.5, + 10, + 35, + 60, + 125, + 190, + 145, + 100, + 115, + 130, + 115, + 100, + 60, + 20, + 25, + 30, + 20, + 10, + 7.5, + 5 + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +=== Smoothing Interpolation + +The `loess` function is a smoothing interpolator which means it doesn't derive +a function that passes through the original control points. Instead the `loess` function +returns a function that smooths the original control points. + +A technique known as local regression is used to compute the smoothed curve. The size of the +neighborhood of the local regression can be adjusted +to control how closely the new curve conforms to the original control points. + +The `loess` function is passed *x* and *y* axes and fits a smooth curve to the data. +If only a single array is provided it is treated as the *y* axis and a sequence is generated +for the *x* axis. + +The example below uses the `loess` function to fit a curve to a set of *y* values in an array. +The bandwidth parameter defines the percent of data to use for the local +regression. The lower the percent, the smaller the neighborhood used for the local +regression and the closer the curve will be to the original data. + +In the example the fitted curve is subtracted from the original curve using the +`ebeSubtract` function. The output shows the error between the +fitted curve and the original curve, known as the residuals. The output also includes +the sum-of-squares of the residuals which provides a measure +of how large the error is.
+ +[source,text] +---- +let(echo="residuals, sumSqError", + y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 7, 7, 6, 7, 7, 7, 6, 5, 5, 3, 2, 1, 0), + curve=loess(y, bandwidth=.3), + residuals=ebeSubtract(y, curve), + sumSqError=sumSq(residuals)) +---- + +When this expression is sent to the /stream handler it +responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "residuals": [ + 0, + 0, + 0, + -0.040524802275866634, + -0.10531988096456502, + 0.5906115002526198, + 0.004215074334896762, + 0.4201374330912433, + 0.09618315578013803, + 0.012107948556718817, + -0.9892939034492398, + 0.012014364143757561, + 0.1093830927709325, + 0.523166271893805, + 0.09658362075164639, + -0.011433819306139625, + 0.9899403519886416, + -0.011707983372932773, + -0.004223284004140737, + -0.00021462867928434548, + 0.0018723112875456138 + ], + "sumSqError": 2.8016013870800616 + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +In the next example the curve is fit using a bandwidth of .25. Notice that the curve +is a closer fit, shown by the smaller residuals and lower value for the sum-of-squares of the +residuals. + +[source,text] +---- +let(echo="residuals, sumSqError", + y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 6, 5, 5, 3, 2, 1, 0), + curve=loess(y, .25), + residuals=ebeSubtract(y, curve), + sumSqError=sumSq(residuals)) +---- + +When this expression is sent to the /stream handler it +responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "residuals": [ + 0, + 0, + 0, + 0, + -0.19117650587715396, + 0.442863451538809, + -0.18553845993358564, + 0.29990769020356645, + 0, + 0.23761890236245709, + -0.7344358765888117, + 0.2376189023624491, + 0, + 0.30373119215254984, + -3.552713678800501e-15, + -0.23761890236245264, + 0.7344358765888046, + -0.2376189023625095, + 0, + 2.842170943040401e-14, + -2.4868995751603507e-14 + ], + "sumSqError": 1.7539413576337557 + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +== Derivatives + +The derivative of a function measures the rate of change of the *y* value with respect to the +rate of change of the *x* value. + +The `derivative` function can compute the derivative of any *interpolation* function. +The `derivative` function can also compute the derivative of a derivative. + +The example below computes the derivative for a `loess` interpolation function. + +[source,text] +---- +let(x=array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 7, 7, 6, 7, 7, 7, 6, 5, 5, 3, 2, 1, 0), + curve=loess(x, y, bandwidth=.3), + derivative=derivative(curve)) +---- + +When this expression is sent to the /stream handler it +responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "derivative": [ + 1.0022002675659012, + 0.9955994648681976, + 1.0154018729613081, + 1.0643674501141696, + 1.0430879694757085, + 0.9698717643975381, + 0.7488201070357539, + 0.44627000894357516, + 0.19019561285422165, + 0.01703599324311178, + -0.001908408138535126, + -0.009121607450087499, + -0.2576361507216319, + -0.49378951291352746, + -0.7288073815664, + -0.9871806872210384, + -1.0025400632604322, + -1.001836567536853, + -1.0076227586138085, + -1.0021524620888589, + -1.0020541789058157 + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +== Integrals + +An integral is a measure of the area underneath a curve. +The `integrate` function computes an integral for a specific +range of an interpolated curve.
+ +In the example below the `integrate` function computes an +integral for the entire range of the curve, 0 through 20. + +[source,text] +---- +let(x=array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 7, 7, 6, 7, 7, 7, 6, 5, 5, 3, 2, 1, 0), + curve=loess(x, y, bandwidth=.3), + integral=integrate(curve, 0, 20)) +---- + +When this expression is sent to the /stream handler it +responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "integral": 90.17446104846645 + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +In the next example an integral is computed for the range of 0 through 10. + +[source,text] +---- +let(x=array(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + y=array(0, 1, 2, 3, 4, 5.7, 6, 7, 7, 7, 6, 7, 7, 7, 6, 5, 5, 3, 2, 1, 0), + curve=loess(x, y, bandwidth=.3), + integral=integrate(curve, 0, 10)) +---- + +When this expression is sent to the /stream handler it +responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "integral": 45.300912584519914 + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +== Bicubic Spline + +The `bicubicSpline` function can be used to interpolate and predict values +anywhere within a grid of data. + +A simple example will make this more clear. + +In the example below a bicubic spline is used to interpolate a matrix of real estate data. +Each row of the matrix represents a specific *year*. Each column of the matrix +represents a *floor* of the building. The grid of numbers is the average selling price of +an apartment for each year and floor. For example in 2002 the average selling price for +the 9th floor was 415000 (row 3, column 3). + +The `bicubicSpline` function is then used to +interpolate the grid, and the `predict` function is used to predict a value for year 2003, floor 8. +Notice that the matrix does not include a data point for year 2003, floor 8. The `bicubicSpline` +function creates that data point based on the surrounding data in the matrix. + +[source,text] +---- +let(years=array(1998, 2000, 2002, 2004, 2006), + floors=array(1, 5, 9, 13, 17, 19), + prices = matrix(array(300000, 320000, 330000, 350000, 360000, 370000), + array(320000, 330000, 340000, 350000, 365000, 380000), + array(400000, 410000, 415000, 425000, 430000, 440000), + array(410000, 420000, 425000, 435000, 445000, 450000), + array(420000, 430000, 435000, 445000, 450000, 470000)), + bspline=bicubicSpline(years, floors, prices), + prediction=predict(bspline, 2003, 8)) +---- + +When this expression is sent to the /stream handler it +responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "prediction": 418279.5009328358 + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + diff --git a/solr/solr-ref-guide/src/probability.adoc b/solr/solr-ref-guide/src/probability.adoc new file mode 100644 index 00000000000..9c46d087f21 --- /dev/null +++ b/solr/solr-ref-guide/src/probability.adoc @@ -0,0 +1,415 @@ += Probability Distributions +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+This section of the user guide covers the
+*probability distribution
+framework* included in the math expressions library.
+
+== Probability Distributions
+
+The probability distribution framework includes
+many commonly used *real* and *discrete* probability
+distributions, including support for *empirical* and
+*enumerated* distributions that model real-world data.
+
+The probability distribution framework also includes a set
+of functions that use the probability distributions
+to support probability calculations and sampling.
+
+=== Real Distributions
+
+The probability distribution framework has the following functions
+which support well-known real probability distributions:
+
+* `normalDistribution`: Creates a normal distribution function.
+
+* `logNormalDistribution`: Creates a log normal distribution function.
+
+* `gammaDistribution`: Creates a gamma distribution function.
+
+* `betaDistribution`: Creates a beta distribution function.
+
+* `uniformDistribution`: Creates a uniform real distribution function.
+
+* `weibullDistribution`: Creates a Weibull distribution function.
+
+* `triangularDistribution`: Creates a triangular distribution function.
+
+* `constantDistribution`: Creates a constant real distribution function.
+
+=== Empirical Distribution
+
+The `empiricalDistribution` function creates a real probability
+distribution from actual data. An empirical distribution
+can be used interchangeably with any of the theoretical
+real distributions.
+
+=== Discrete Distributions
+
+The probability distribution framework has the following functions
+which support well-known discrete probability distributions:
+
+* `poissonDistribution`: Creates a Poisson distribution function.
+
+* `binomialDistribution`: Creates a binomial distribution function.
+
+* `uniformIntegerDistribution`: Creates a uniform integer distribution function.
+
+* `geometricDistribution`: Creates a geometric distribution function.
+
+* `zipFDistribution`: Creates a Zipf distribution function.
+
+=== Enumerated Distributions
+
+The `enumeratedDistribution` function creates a discrete
+distribution function from a data set of discrete values,
+or from an enumerated list of values and probabilities.
+
+Enumerated distribution functions can be used interchangeably
+with any of the theoretical discrete distributions.
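+
+As an illustration of the second form, the sketch below creates an enumerated
+distribution directly from a list of values and their probabilities. The values and
+probabilities are hypothetical, and the expression is a sketch of the usage described
+above rather than output captured from an actual run. The `probability` function used
+here is covered later in this section.
+
+[source,text]
+----
+let(a=enumeratedDistribution(array(1, 2, 3, 4),
+                             array(.40, .30, .20, .10)),
+    b=probability(a, 2))
+----
+
+Based on the probabilities supplied, the expected value of *b* is .30.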
+
+=== Cumulative Probability
+
+The `cumulativeProbability` function can be used with all
+probability distributions to calculate the
+cumulative probability of encountering a specific
+random variable within a specific distribution.
+
+Below is an example of calculating the cumulative probability
+of a random variable within a normal distribution.
+
+In the example a normal distribution function is created
+with a mean of 10 and a standard deviation of 5. Then
+the cumulative probability of the value 12 is calculated for this
+specific distribution.
+
+[source,text]
+----
+let(a=normalDistribution(10, 5),
+    b=cumulativeProbability(a, 12))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "b": 0.6554217416103242
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 0
+      }
+    ]
+  }
+}
+----
+
+Below is an example of a cumulative probability calculation
+using an empirical distribution.
+
+In the example an empirical distribution is created from a random
+sample taken from the *price_f* field.
+
+The cumulative probability of the value .75 is then calculated.
+The *price_f* field in this example was generated using a
+uniform real distribution between 0 and 1, so the output of the
+`cumulativeProbability` function is very close to .75.
+
+[source,text]
+----
+let(a=random(collection1, q="*:*", rows="30000", fl="price_f"),
+    b=col(a, price_f),
+    c=empiricalDistribution(b),
+    d=cumulativeProbability(c, .75))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "d": 0.7554217416103242
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 0
+      }
+    ]
+  }
+}
+----
+
+=== Probability
+
+The `probability` function can be used with any discrete
+distribution function to compute the probability of a
+discrete value.
+
+Below is an example which calculates the probability
+of a discrete value within a Poisson distribution.
+
+In the example a Poisson distribution function is created
+with a mean of 100. Then the
+probability of encountering a sample of the discrete value 101 is calculated for this
+specific distribution.
+
+[source,text]
+----
+let(a=poissonDistribution(100),
+    b=probability(a, 101))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "b": 0.039466333474403106
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 0
+      }
+    ]
+  }
+}
+----
+
+Below is an example of a probability calculation
+using an enumerated distribution.
+
+In the example an enumerated distribution is created from a random
+sample taken from the *day_i* field, which was created
+using a uniform integer distribution between 0 and 30.
+
+The probability of the discrete value 10 is then calculated.
+
+[source,text]
+----
+let(a=random(collection1, q="*:*", rows="30000", fl="day_i"),
+    b=col(a, day_i),
+    c=enumeratedDistribution(b),
+    d=probability(c, 10))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "d": 0.03356666666666666
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 488
+      }
+    ]
+  }
+}
+----
+
+=== Sampling
+
+All probability distributions support sampling. The `sample`
+function returns 1 or more random samples from a probability
+distribution.
+
+Below is an example drawing a single sample from
+a normal distribution.
+
+[source,text]
+----
+let(a=normalDistribution(10, 5),
+    b=sample(a))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "b": 11.24578055004963
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 0
+      }
+    ]
+  }
+}
+----
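+
+Because samples are random draws, the summary statistics of a large sample should
+approximate the parameters of the distribution that produced it. The sketch below
+illustrates this idea by describing a large sample with the `describe` function, which
+is covered in the Statistics section of the user guide. This is an illustrative sketch
+rather than output captured from an actual run.
+
+[source,text]
+----
+let(a=normalDistribution(10, 5),
+    b=sample(a, 10000),
+    c=describe(b))
+----
+
+The *mean* and *stdev* fields returned by `describe` should land close to 10 and 5.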
+Below is an example drawing 10 samples from a normal
+distribution.
+
+[source,text]
+----
+let(a=normalDistribution(10, 5),
+    b=sample(a, 10))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "b": [
+          10.18444709339441,
+          9.466947971749377,
+          1.2420697166234458,
+          11.074501226984806,
+          7.659629052136225,
+          0.4440887839190708,
+          13.710925254778786,
+          2.089566359480239,
+          0.7907293097654424,
+          2.8184587681006734
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 3
+      }
+    ]
+  }
+}
+----
+
+=== Multivariate Normal Distribution
+
+The multivariate normal distribution is a generalization of the
+univariate normal distribution to higher dimensions.
+
+The multivariate normal distribution models two or more random
+variables that are normally distributed. The relationship between
+the variables is defined by a covariance matrix.
+
+==== Sampling
+
+The `sample` function can be used to draw samples
+from a multivariate normal distribution in much the same
+way as a univariate normal distribution.
+The difference is that each sample will be an array containing a sample
+drawn from each of the underlying normal distributions.
+If multiple samples are drawn, the `sample` function returns a matrix with a
+sample in each row. Over the long term the columns of the sample
+matrix will conform to the covariance matrix used to parametrize the
+multivariate normal distribution.
+
+The example below demonstrates how to initialize and draw samples
+from a multivariate normal distribution.
+
+In this example 5000 random samples are selected from a collection
+of log records. Each sample contains
+the fields *filesize_d* and *response_d*. The values of both fields conform
+to a normal distribution.
+
+Both fields are then vectorized. The *filesize_d* vector is stored in
+variable *b* and the *response_d* vector is stored in variable *c*.
+
+An array is created that contains the *means* of the two vectorized fields.
+
+Then both vectors are added to a matrix which is transposed. This creates
+an *observation* matrix where each row contains one observation of
+*filesize_d* and *response_d*. A covariance matrix is then created from the columns of
+the observation matrix with the
+`cov` function. The covariance matrix describes the covariance between
+*filesize_d* and *response_d*.
+
+The `multiVariateNormalDistribution` function is then called with the
+array of means for the two fields and the covariance matrix. The model for the
+multivariate normal distribution is assigned to variable *g*.
+
+Finally five samples are drawn from the multivariate normal distribution. The samples
+are returned as a matrix, with each row representing one sample. There are two
+columns in the matrix. The first column contains samples for *filesize_d* and the second
+column contains samples for *response_d*. Over the long term the covariance between
+the columns will conform to the covariance matrix used to instantiate the
+multivariate normal distribution.
+ +[source,text] +---- +let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, response_d"), + b=col(a, filesize_d), + c=col(a, response_d), + d=array(mean(b), mean(c)), + e=transpose(matrix(b, c)), + f=cov(e), + g=multiVariateNormalDistribution(d, f), + h=sample(g, 5)) +---- + +When this expression is sent to the /stream handler it responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "h": [ + [ + 41974.85669321393, + 779.4097049705296 + ], + [ + 42869.19876441414, + 834.2599296790783 + ], + [ + 38556.30444839889, + 720.3683470060988 + ], + [ + 37689.31290928216, + 686.5549428100018 + ], + [ + 40564.74398214547, + 769.9328090774 + ] + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 162 + } + ] + } +} +---- + diff --git a/solr/solr-ref-guide/src/regression.adoc b/solr/solr-ref-guide/src/regression.adoc new file mode 100644 index 00000000000..b57c62bcd5c --- /dev/null +++ b/solr/solr-ref-guide/src/regression.adoc @@ -0,0 +1,439 @@ += Linear Regression +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +This section of the math expressions user guide covers simple and multivariate linear regression. + + +== Simple Linear Regression + +The `regress` function is used to build a linear regression model +between two random variables. Sample observations are provided with two +numeric arrays. The first numeric array is the *independent variable* and +the second array is the *dependent variable*. + +In the example below the `random` function selects 5000 random samples each containing +the fields *filesize_d* and *response_d*. The two fields are vectorized +and stored in variables *b* and *c*. Then the `regress` function performs a regression +analysis on the two numeric arrays. + +The `regress` function returns a single tuple with the results of the regression +analysis. + +Note that in this regression analysis the value of *RSquared* is *.75*. This means that changes in +*filesize_d* explain 75% of the variability of the *response_d* variable. 
+
+[source,text]
+----
+let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, response_d"),
+    b=col(a, filesize_d),
+    c=col(a, response_d),
+    d=regress(b, c))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "d": {
+          "significance": 0,
+          "totalSumSquares": 10564812.895147054,
+          "R": 0.8674822407146515,
+          "RSquared": 0.7525254379553127,
+          "meanSquareError": 523.1137343558588,
+          "intercept": -49.528134913099095,
+          "slopeConfidenceInterval": 0.0003171801710329995,
+          "regressionSumSquares": 7950290.450836472,
+          "slope": 0.019945557923159506,
+          "interceptStdErr": 6.489732340389941,
+          "N": 5000
+        }
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 98
+      }
+    ]
+  }
+}
+----
+
+=== Prediction
+
+The `predict` function uses the regression model to make predictions.
+Using the example above the regression model can be used to predict the value
+of *response_d* given a value for *filesize_d*.
+
+In the example below the `predict` function uses the regression analysis to predict
+the value of *response_d* for the *filesize_d* value of 40000.
+
+[source,text]
+----
+let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, response_d"),
+    b=col(a, filesize_d),
+    c=col(a, response_d),
+    d=regress(b, c),
+    e=predict(d, 40000))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "e": 748.079241022975
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 95
+      }
+    ]
+  }
+}
+----
+
+The `predict` function can also make predictions for an array of values. In this
+case it returns an array of predictions.
+
+In the example below the `predict` function uses the regression analysis to
+predict values for each of the 5000 samples of `filesize_d` used to generate the model.
+In this case 5000 predictions are returned.
+
+[source,text]
+----
+let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, response_d"),
+    b=col(a, filesize_d),
+    c=col(a, response_d),
+    d=regress(b, c),
+    e=predict(d, b))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "e": [
+          742.2525322514165,
+          709.6972488729955,
+          687.8382568904871,
+          820.2511324266264,
+          720.4006432289061,
+          761.1578181053039,
+          759.1304101159126,
+          699.5597256337142,
+          742.4738911248204,
+          769.0342605881644,
+          746.6740473150268,
+          ...
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 113
+      }
+    ]
+  }
+}
+----
+
+=== Residuals
+
+The difference between the observed value and the predicted value is known as the
+residual. There isn't a specific function to calculate the residuals but vector
+math can be used to perform the calculation.
+
+In the example below the predictions are stored in variable *e*. The `ebeSubtract`
+function is then used to subtract the predictions
+from the actual *response_d* values stored in variable *c*. Variable *f* contains
+the array of residuals.
+
+[source,text]
+----
+let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, response_d"),
+    b=col(a, filesize_d),
+    c=col(a, response_d),
+    d=regress(b, c),
+    e=predict(d, b),
+    f=ebeSubtract(c, e))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "f": [
+          31.30678554491226,
+          -30.292830927953446,
+          -30.49508862647258,
+          -30.499884780783532,
+          -9.696458959319784,
+          -30.521563961535094,
+          -30.28380938033081,
+          -9.890289849359306,
+          30.819723560583157,
+          -30.213178859683012,
+          -30.609943619066826,
+          10.527700442607625,
+          10.68046928406568,
+          ...
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 113
+      }
+    ]
+  }
+}
+----
+
+== Multivariate Linear Regression
+
+The `olsRegress` function performs a multivariate linear regression analysis. Multivariate linear
+regression models the linear relationship between two or more *independent* variables and a *dependent* variable.
+
+The example below extends the simple linear regression example by introducing a new independent variable
+called *service_d*. The *service_d* variable is the service level of the request and it can range from 1 to 4
+in the data set. The higher the service level, the higher the bandwidth available for the request.
+
+Notice that the two independent variables *filesize_d* and *service_d* are vectorized and stored
+in the variables *b* and *c*. The variables *b* and *c* are then added as rows to a `matrix`. The matrix is
+then transposed so that each row in the matrix represents one observation with *filesize_d* and *service_d*.
+The `olsRegress` function then performs the multivariate regression analysis using the observation matrix as the
+independent variables and the *response_d* values, stored in variable *d*, as the dependent variable.
+
+Notice that the RSquared of the regression analysis is 1. This means that the linear relationship between
+*filesize_d* and *service_d* describes 100% of the variability of the *response_d* variable.
+
+[source,text]
+----
+let(a=random(collection2, q="*:*", rows="30000", fl="filesize_d, service_d, response_d"),
+    b=col(a, filesize_d),
+    c=col(a, service_d),
+    d=col(a, response_d),
+    e=transpose(matrix(b, c)),
+    f=olsRegress(e, d))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "f": {
+          "regressionParametersStandardErrors": [
+            2.0660690430026933e-13,
+            5.1212982077663434e-18,
+            9.10920932555875e-15
+          ],
+          "RSquared": 1,
+          "regressionParameters": [
+            6.553210695971329e-12,
+            0.019999999999999858,
+            -20.49999999999968
+          ],
+          "regressandVariance": 2124.130825172683,
+          "regressionParametersVariance": [
+            [
+              0.013660174897582315,
+              -3.361258014840509e-7,
+              -0.00006893737578369605
+            ],
+            [
+              -3.361258014840509e-7,
+              8.393183709503206e-12,
+              6.430253229589981e-11
+            ],
+            [
+              -0.00006893737578369605,
+              6.430253229589981e-11,
+              0.000026553878455570856
+            ]
+          ],
+          "adjustedRSquared": 1,
+          "residualSumSquares": 9.373703759269822e-20
+        }
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 690
+      }
+    ]
+  }
+}
+----
+
+=== Prediction
+
+The `predict` function can also be used to make predictions for multivariate linear regression. Below is an example
+of a single prediction using the multivariate linear regression model and a single observation. The observation
+is an array that matches the structure of the observation matrix used to build the model.
+In this case
+the first value represents a *filesize_d* of 40000 and the second value represents a *service_d* of 4.
+
+[source,text]
+----
+let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, service_d, response_d"),
+    b=col(a, filesize_d),
+    c=col(a, service_d),
+    d=col(a, response_d),
+    e=transpose(matrix(b, c)),
+    f=olsRegress(e, d),
+    g=predict(f, array(40000, 4)))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "g": 718.0000000000005
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 117
+      }
+    ]
+  }
+}
+----
+
+The `predict` function can also make predictions for more than one multivariate observation. In this scenario
+an observation matrix is used. In the example below the observation matrix used to build the multivariate regression model
+is passed to the `predict` function and it returns an array of predictions.
+
+[source,text]
+----
+let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, service_d, response_d"),
+    b=col(a, filesize_d),
+    c=col(a, service_d),
+    d=col(a, response_d),
+    e=transpose(matrix(b, c)),
+    f=olsRegress(e, d),
+    g=predict(f, e))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "g": [
+          685.498283591961,
+          801.2175699959365,
+          776.7638245911025,
+          610.3559852681935,
+          751.0925865965207,
+          787.2914663381897,
+          744.3632053810668,
+          688.3729301599697,
+          765.367783417171,
+          724.9309687628346,
+          834.4350712384264,
+          ...
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 113
+      }
+    ]
+  }
+}
+----
+
+=== Residuals
+
+Once the predictions are generated the residuals can be calculated using the same approach used with
+simple linear regression.
+
+Below is an example of the residuals calculation following a multivariate linear regression. In the example
+the predictions stored in variable *g* are subtracted from the observed values stored in variable *d*.
+
+[source,text]
+----
+let(a=random(collection2, q="*:*", rows="5000", fl="filesize_d, service_d, response_d"),
+    b=col(a, filesize_d),
+    c=col(a, service_d),
+    d=col(a, response_d),
+    e=transpose(matrix(b, c)),
+    f=olsRegress(e, d),
+    g=predict(f, e),
+    h=ebeSubtract(d, g))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "h": [
+          1.1368683772161603e-13,
+          1.1368683772161603e-13,
+          0,
+          1.1368683772161603e-13,
+          0,
+          1.1368683772161603e-13,
+          0,
+          2.2737367544323206e-13,
+          1.1368683772161603e-13,
+          2.2737367544323206e-13,
+          1.1368683772161603e-13,
+          ...
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 113
+      }
+    ]
+  }
+}
+----
+
diff --git a/solr/solr-ref-guide/src/scalar-math.adoc b/solr/solr-ref-guide/src/scalar-math.adoc
new file mode 100644
index 00000000000..07b1eb570de
--- /dev/null
+++ b/solr/solr-ref-guide/src/scalar-math.adoc
@@ -0,0 +1,137 @@
+= Scalar Math
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+The most basic math expressions are scalar expressions. Scalar expressions
+perform mathematical operations on numbers.
+
+For example, the expression below adds two numbers together:
+
+[source,text]
+----
+add(1, 1)
+----
+
+When this expression is sent to the /stream handler it
+responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "return-value": 2
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 2
+      }
+    ]
+  }
+}
+----
+
+Math expressions can be nested. For example, in the expression
+below the output of the `add` function is the second parameter
+of the `pow` function:
+
+[source,text]
+----
+pow(10, add(1,1))
+----
+
+This expression returns the following response:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "return-value": 100
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 0
+      }
+    ]
+  }
+}
+----
+
+== Streaming Scalar Math
+
+Scalar math expressions can also be applied to each tuple in a stream
+through use of the `select` stream decorator. The `select` function wraps a
+stream of tuples and selects fields to include in each tuple.
+The `select` function can also use math expressions to compute
+new values and add them to the outgoing tuples.
+
+In the example below the `select` expression is wrapping a `search`
+expression. The `select` function is selecting the *price_f* field
+and computing a new field called *newPrice* using the `mult` math
+expression.
+
+The first parameter of the `mult` expression is the *price_f* field.
+The second parameter is the scalar value 10. This multiplies the value
+of the *price_f* field in each tuple by 10.
+
+[source,text]
+----
+select(search(collection2, q="*:*", fl="price_f", sort="price_f desc", rows="3"),
+       price_f,
+       mult(price_f, 10) as newPrice)
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "price_f": 0.99999994,
+        "newPrice": 9.9999994
+      },
+      {
+        "price_f": 0.99999994,
+        "newPrice": 9.9999994
+      },
+      {
+        "price_f": 0.9999992,
+        "newPrice": 9.999992
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 3
+      }
+    ]
+  }
+}
+----
+
+== More Scalar Math Functions
+
+The following scalar math functions are available in the math expressions library:
+
+`abs`, `add`, `div`, `mult`, `sub`, `log`,
+`pow`, `mod`, `ceil`, `floor`, `sin`, `asin`,
+`sinh`, `cos`, `acos`, `cosh`, `tan`, `atan`,
+`tanh`, `round`, `precision`, `sqrt`, `cbrt`
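+
+As a quick illustration of how these functions compose, the hedged sketch below nests
+several of them to compute the hypotenuse of a 3-4-5 right triangle. This is a sketch of
+the nesting pattern described above, not output captured from an actual run.
+
+[source,text]
+----
+round(sqrt(add(pow(3, 2), pow(4, 2))))
+----
+
+The expected return value is 5: the squares of 3 and 4 are added, the square root of the
+sum is taken, and the result is rounded.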
diff --git a/solr/solr-ref-guide/src/statistics.adoc b/solr/solr-ref-guide/src/statistics.adoc
new file mode 100644
index 00000000000..74da76b7960
--- /dev/null
+++ b/solr/solr-ref-guide/src/statistics.adoc
@@ -0,0 +1,575 @@
+= Statistics
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+This section of the user guide covers the core statistical functions
+available in math expressions.
+
+== Descriptive Statistics
+
+The `describe` function can be used to return descriptive statistics about a
+numeric array. The `describe` function returns a single *tuple* with name/value
+pairs containing descriptive statistics.
+
+Below is a simple example that selects a random sample of documents,
+vectorizes the *price_f* field in the result set and uses the `describe` function to
+return descriptive statistics about the vector:
+
+[source,text]
+----
+let(a=random(collection1, q="*:*", rows="15000", fl="price_f"),
+    b=col(a, price_f),
+    c=describe(b))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "c": {
+          "sumsq": 4999.041975263254,
+          "max": 0.99995726,
+          "var": 0.08344429493940454,
+          "geometricMean": 0.36696588922559575,
+          "sum": 7497.460565552007,
+          "kurtosis": -1.2000739963006035,
+          "N": 15000,
+          "min": 0.00012338161,
+          "mean": 0.49983070437013266,
+          "popVar": 0.08343873198640858,
+          "skewness": -0.001735537500095477,
+          "stdev": 0.28886726179926403
+        }
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 305
+      }
+    ]
+  }
+}
+----
+
+== Histograms and Frequency Tables
+
+Histograms and frequency tables are tools for understanding the distribution
+of a random variable.
+
+The `hist` function creates a histogram designed for usage with continuous data. The
+`freqTable` function creates a frequency table for use with discrete data.
+
+=== Histograms
+
+Below is an example that selects a random sample, creates a vector from the
+result set and uses the `hist` function to return a histogram with 5 bins.
+The `hist` function returns a list of tuples with summary statistics for each bin.
+
+[source,text]
+----
+let(a=random(collection1, q="*:*", rows="15000", fl="price_f"),
+    b=col(a, price_f),
+    c=hist(b, 5))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "c": [
+          {
+            "prob": 0.2057939717603699,
+            "min": 0.000010371208,
+            "max": 0.19996578,
+            "mean": 0.10010319358402578,
+            "var": 0.003366805016271609,
+            "cumProb": 0.10293732468049072,
+            "sum": 309.0185585938884,
+            "stdev": 0.058024176136086666,
+            "N": 3087
+          },
+          {
+            "prob": 0.19381868629885585,
+            "min": 0.20007741,
+            "max": 0.3999073,
+            "mean": 0.2993590803885827,
+            "var": 0.003401644034068929,
+            "cumProb": 0.3025295802728267,
+            "sum": 870.5362057700005,
+            "stdev": 0.0583236147205309,
+            "N": 2908
+          },
+          {
+            "prob": 0.20565789836690007,
+            "min": 0.39995712,
+            "max": 0.5999038,
+            "mean": 0.4993620963792545,
+            "var": 0.0033158364923609046,
+            "cumProb": 0.5023006239697967,
+            "sum": 1540.5320673300018,
+            "stdev": 0.05758330046429177,
+            "N": 3085
+          },
+          {
+            "prob": 0.19437108496008693,
+            "min": 0.6000449,
+            "max": 0.79973197,
+            "mean": 0.7001752711861512,
+            "var": 0.0033895105082360185,
+            "cumProb": 0.7026537198687285,
+            "sum": 2042.4112660500066,
+            "stdev": 0.058219502816805456,
+            "N": 2917
+          },
+          {
+            "prob": 0.20019582213899467,
+            "min": 0.7999126,
+            "max": 0.99987316,
+            "mean": 0.8985428275824184,
+            "var": 0.003312360017780078,
+            "cumProb": 0.899450457219298,
+            "sum": 2698.3241112299997,
+            "stdev": 0.05755310606544253,
+            "N": 3003
+          }
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 322
+      }
+    ]
+  }
+}
+----
+
+The `col` function can be used to *vectorize* a column of data from the list of tuples
+returned by the `hist` function.
+
+In the example below, the *N* field,
+which is the number of observations in each bin, is returned as a vector.
+
+[source,text]
+----
+let(a=random(collection1, q="*:*", rows="15000", fl="price_f"),
+    b=col(a, price_f),
+    c=hist(b, 11),
+    d=col(c, N))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "d": [
+          1387,
+          1396,
+          1391,
+          1357,
+          1384,
+          1360,
+          1367,
+          1375,
+          1307,
+          1310,
+          1366
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 307
+      }
+    ]
+  }
+}
+----
+
+=== Frequency Tables
+
+The `freqTable` function returns a frequency distribution for a discrete data set.
+The `freqTable` function doesn't create bins like the histogram. Instead it counts
+the occurrence of each discrete data value and returns a list of tuples with the
+frequency statistics for each value. Fields from a frequency table can be vectorized
+using the `col` function in the same manner as a histogram.
+
+Below is a simple example of a frequency table built from a random sample of
+a discrete variable.
+
+[source,text]
+----
+let(a=random(collection1, q="*:*", rows="15000", fl="day_i"),
+    b=col(a, day_i),
+    c=freqTable(b))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "c": [
+          {
+            "pct": 0.0318,
+            "count": 477,
+            "cumFreq": 477,
+            "cumPct": 0.0318,
+            "value": 0
+          },
+          {
+            "pct": 0.033133333333333334,
+            "count": 497,
+            "cumFreq": 974,
+            "cumPct": 0.06493333333333333,
+            "value": 1
+          },
+          {
+            "pct": 0.03426666666666667,
+            "count": 514,
+            "cumFreq": 1488,
+            "cumPct": 0.0992,
+            "value": 2
+          },
+          {
+            "pct": 0.0346,
+            "count": 519,
+            "cumFreq": 2007,
+            "cumPct": 0.1338,
+            "value": 3
+          },
+          {
+            "pct": 0.03133333333333333,
+            "count": 470,
+            "cumFreq": 2477,
+            "cumPct": 0.16513333333333333,
+            "value": 4
+          },
+          {
+            "pct": 0.03333333333333333,
+            "count": 500,
+            "cumFreq": 2977,
+            "cumPct": 0.19846666666666668,
+            "value": 5
+          }
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 281
+      }
+    ]
+  }
+}
+----
+
+== Percentiles
+
+The `percentile` function returns the estimated value for a specific percentile in
+a sample set. The example below returns the estimate for the 95th percentile
+of the *price_f* field.
+
+[source,text]
+----
+let(a=random(collection1, q="*:*", rows="15000", fl="price_f"),
+    b=col(a, price_f),
+    c=percentile(b, 95))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "c": 312.94
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 286
+      }
+    ]
+  }
+}
+----
+
+== Covariance and Correlation
+
+Covariance and correlation measure how random variables move
+together.
+
+=== Covariance and Covariance Matrices
+
+The `cov` function calculates the covariance of two sample sets of data.
+
+In the example below covariance is calculated for two numeric
+arrays.
+
+The example below uses arrays created by the `array` function. It's important to note that
+vectorized data from SolrCloud collections can be used with any function that
+operates on arrays.
+
+[source,text]
+----
+let(a=array(1, 2, 3, 4, 5),
+    b=array(100, 200, 300, 400, 500),
+    c=cov(a, b))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "c": 250
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 286
+      }
+    ]
+  }
+}
+----
+
+If a matrix is passed to the `cov` function it will automatically compute a covariance
+matrix for the columns of the matrix.
+
+Notice in the example three numeric arrays are added as rows
+in a matrix. The matrix is then transposed to turn the rows into
+columns, and the covariance matrix is computed for the columns of the
+matrix.
+
+[source,text]
+----
+let(a=array(1, 2, 3, 4, 5),
+    b=array(100, 200, 300, 400, 500),
+    c=array(30, 40, 80, 90, 110),
+    d=transpose(matrix(a, b, c)),
+    e=cov(d))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "e": [
+          [
+            2.5,
+            250,
+            52.5
+          ],
+          [
+            250,
+            25000,
+            5250
+          ],
+          [
+            52.5,
+            5250,
+            1150
+          ]
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 2
+      }
+    ]
+  }
+}
+----
+
+=== Correlation and Correlation Matrices
+
+Correlation is a measure of covariance that has been scaled between
+-1 and 1.
+
+Three correlation types are supported:
+
+* *pearsons* (default)
+* *kendalls*
+* *spearmans*
+
+The type of correlation is specified by adding the *type* named
+parameter in the function call. The example below demonstrates the use of the
+*type* named parameter.
+
+[source,text]
+----
+let(a=array(1, 2, 3, 4, 5),
+    b=array(100, 200, 300, 400, 5000),
+    c=corr(a, b, type=spearmans))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "c": 1
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 0
+      }
+    ]
+  }
+}
+----
+
+Because *b* increases monotonically with *a*, the Spearman's rank correlation is a
+perfect 1 even though the relationship between the raw values is not linear.
+
+Like the `cov` function, the `corr` function automatically builds a correlation matrix
+if a matrix is passed as a parameter. The correlation matrix is built by correlating the columns
+of the matrix passed in.
+
+== Statistical Inference Tests
+
+Statistical inference tests test a hypothesis on *random samples* and return p-values which
+can be used to infer the reliability of the test for the entire population.
+
+The following statistical inference tests are available:
+
+* `anova`: One-Way-Anova tests if there is a statistically significant difference in the
+means of two or more random samples.
+
+* `ttest`: The T-test tests if there is a statistically significant difference in the means of two
+random samples.
+
+* `pairedTtest`: The paired t-test tests if there is a statistically significant difference
+in the means of two random samples with paired data.
+
+* `gTestDataSet`: The G-test tests if two samples of binned discrete data were drawn
+from the same population.
+
+* `chiSquareDataSet`: The Chi-Squared test tests if two samples of binned discrete data were
+drawn from the same population.
+
+* `mannWhitney`: The Mann-Whitney test is a non-parametric test that tests if two
+samples of continuous data were drawn
+from the same population. The Mann-Whitney test is often used instead of the T-test when the
+underlying assumptions of the T-test are not
+met.
+
+* `ks`: The Kolmogorov-Smirnov test tests if two samples of continuous data were drawn from
+the same distribution.
+
+Below is a simple example of a T-test performed on two random samples.
+The returned p-value of .93 means there is no statistically significant difference
+between the means of the two samples, so the null hypothesis cannot be rejected.
+
+[source,text]
+----
+let(a=random(collection1, q="*:*", rows="1500", fl="price_f"),
+    b=random(collection1, q="*:*", rows="1500", fl="price_f"),
+    c=col(a, price_f),
+    d=col(b, price_f),
+    e=ttest(c, d))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "e": {
+          "p-value": 0.9350135639249795,
+          "t-statistic": 0.081545541074817
+        }
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 48
+      }
+    ]
+  }
+}
+----
+
+== Transformations
+
+In statistical analysis it's often useful to transform data sets before performing
+statistical calculations. The statistical function library includes the following
+commonly used transformations:
+
+* `rank`: Returns a numeric array with the rank-transformed value of each element of the original
+array. A sketch appears directly after this list.
+
+* `log`: Returns a numeric array with the natural log of each element of the original array.
+
+* `sqrt`: Returns a numeric array with the square root of each element of the original array.
+
+* `cbrt`: Returns a numeric array with the cube root of each element of the original array.
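+
+To make the `rank` transformation concrete, the sketch below ranks a small hypothetical
+array. The expected output is reasoned from the usual definition of rank transformation,
+in which tied values receive the average of their ranks; it is a sketch rather than output
+captured from an actual run.
+
+[source,text]
+----
+rank(array(0, 50, 50, 10, 30))
+----
+
+The expected return value is the array (1, 4.5, 4.5, 2, 3): 0 is the smallest value and
+receives rank 1, while the two 50s tie for the top two ranks and each receive 4.5.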
+ +Below is an example of a ttest performed on log transformed data sets: + +[source,text] +---- +let(a=random(collection1, q="*:*", rows="1500", fl="price_f"), + b=random(collection1, q="*:*", rows="1500", fl="price_f"), + c=log(col(a, price_f)), + d=log(col(b, price_f)), + e=ttest(c, d)) +---- + +When this expression is sent to the /stream handler it responds with: + +[source,json] +---- +{ + "result-set": { + "docs": [ + { + "e": { + "p-value": 0.9655110070265056, + "t-statistic": -0.04324265449471238 + } + }, + { + "EOF": true, + "RESPONSE_TIME": 58 + } + ] + } +} +---- diff --git a/solr/solr-ref-guide/src/streaming-expressions.adoc b/solr/solr-ref-guide/src/streaming-expressions.adoc index ed37ce158ed..ccf3bf3d1c0 100644 --- a/solr/solr-ref-guide/src/streaming-expressions.adoc +++ b/solr/solr-ref-guide/src/streaming-expressions.adoc @@ -1,5 +1,5 @@ = Streaming Expressions -:page-children: stream-source-reference, stream-decorator-reference, stream-evaluator-reference, statistical-programming, graph-traversal +:page-children: stream-source-reference, stream-decorator-reference, stream-evaluator-reference, statistical-programming, math-expressions, graph-traversal // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information diff --git a/solr/solr-ref-guide/src/term-vectors.adoc b/solr/solr-ref-guide/src/term-vectors.adoc new file mode 100644 index 00000000000..cbd21a06db8 --- /dev/null +++ b/solr/solr-ref-guide/src/term-vectors.adoc @@ -0,0 +1,237 @@ += Text Analysis and Term Vectors +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +TF-IDF term vectors are often used to represent text documents when performing text mining +and machine learning operations. This section of the user guide describes how to +use math expressions to perform text analysis and create TF-IDF term vectors. + +== Text Analysis + +The `analyze` function applies a Solr analyzer to a text field and returns the tokens +emitted by the analyzer in an array. Any analyzer chain that is attached to a field in Solr's +schema can be used with the `analyze` function. + +In the example below, the text "hello world" is analyzed using the analyzer chain attached to the *subject* field in +the schema. The *subject* field is defined as the field type *text_general* and the text is analyzed using the +analysis chain configured for the *text_general* field type. 
+
+[source,text]
+----
+analyze("hello world", subject)
+----
+
+When this expression is sent to the /stream handler it
+responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "return-value": [
+          "hello",
+          "world"
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 0
+      }
+    ]
+  }
+}
+----
+
+=== Annotating Documents
+
+The `analyze` function can be used inside of a `select` function to annotate documents with the tokens
+generated by the analysis.
+
+The example below is performing a `search` in collection1. Each tuple returned by the `search`
+contains an *id* and *subject*. For each tuple, the
+`select` function is selecting the *id* field and calling the `analyze` function on the *subject* field.
+The analyzer chain specified by the *subject_bigram* field is configured to perform a bigram analysis.
+The tokens generated by the `analyze` function are added to each tuple in a field called `terms`.
+
+Notice in the output that an array of bigram terms has been added to the tuples.
+
+[source,text]
+----
+select(search(collection1, q="*:*", fl="id, subject", sort="id asc"),
+       id,
+       analyze(subject, subject_bigram) as terms)
+----
+
+When this expression is sent to the /stream handler it
+responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "terms": [
+          "text analysis",
+          "analysis example"
+        ],
+        "id": "1"
+      },
+      {
+        "terms": [
+          "example number",
+          "number two"
+        ],
+        "id": "2"
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 4
+      }
+    ]
+  }
+}
+----
+
+== Term Vectors
+
+The `termVectors` function can be used to build *TF-IDF*
+term vectors from the terms generated by the `analyze` function.
+
+The `termVectors` function operates over a list of tuples that contain a field
+called *id* and a field called *terms*. Notice
+that this is the exact output structure of the *document annotation* example above.
+
+The `termVectors` function builds a *matrix* from the list of tuples. There is a *row* in the
+matrix for each tuple in the list and a *column* in the matrix for each term in the *terms*
+field.
+
+The example below builds on the *document annotation* example.
+The list of tuples is stored in variable *a*. The `termVectors` function
+operates over variable *a* and builds a matrix with *2 rows* and *4 columns*.
+
+The `termVectors` function also sets the *row* and *column* labels of the term vectors matrix.
+The row labels are the document ids and the
+column labels are the terms.
+
+In the example below, the `getRowLabels` and `getColumnLabels` functions return
+the row and column labels which are then stored in variables *c* and *d*.
+The *echo* parameter is echoing variables *c* and *d*, so the output includes
+the row and column labels.
+
+[source,text]
+----
+let(echo="c, d",
+    a=select(search(collection3, q="*:*", fl="id, subject", sort="id asc"),
+             id,
+             analyze(subject, subject_bigram) as terms),
+    b=termVectors(a, minTermLength=4, minDocFreq=0, maxDocFreq=1),
+    c=getRowLabels(b),
+    d=getColumnLabels(b))
+----
+
+When this expression is sent to the /stream handler it
+responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "c": [
+          "1",
+          "2"
+        ],
+        "d": [
+          "analysis example",
+          "example number",
+          "number two",
+          "text analysis"
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 5
+      }
+    ]
+  }
+}
+----
+
+=== TF-IDF Values
+
+The values within the term vectors matrix are the TF-IDF values for each term in each document. The
+example below shows the values of the matrix.
+
+[source,text]
+----
+let(a=select(search(collection3, q="*:*", fl="id, subject", sort="id asc"),
+             id,
+             analyze(subject, subject_bigram) as terms),
+    b=termVectors(a, minTermLength=4, minDocFreq=0, maxDocFreq=1))
+----
+
+When this expression is sent to the /stream handler it
+responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "b": [
+          [
+            1.4054651081081644,
+            0,
+            0,
+            1.4054651081081644
+          ],
+          [
+            0,
+            1.4054651081081644,
+            1.4054651081081644,
+            0
+          ]
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 5
+      }
+    ]
+  }
+}
+----
+
+=== Limiting the Noise
+
+One of the key challenges when working with term vectors is that text often has a significant amount of noise
+which can obscure the important terms in the data. The `termVectors` function has several parameters
+designed to filter out the less meaningful terms. This is also important because eliminating
+the noisy terms helps keep the term vector matrix small enough to fit comfortably in memory.
+
+There are four parameters designed to filter noisy terms from the term vector matrix;
+a usage sketch follows the list:
+
+* *minTermLength*: The minimum term length required to include the term in the matrix.
+* *minDocFreq*: The minimum *percentage* (0 to 1) of documents the term must appear in to be included in the matrix.
+* *maxDocFreq*: The maximum *percentage* (0 to 1) of documents the term can appear in to be included in the matrix.
+* *exclude*: A comma delimited list of strings used to exclude terms. If a term contains any of the exclude strings that
+term will be excluded from the term vector.
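+
+The sketch below shows the four parameters used together. It restricts the matrix to terms
+of at least four characters that appear in between 5% and 50% of documents and that do not
+contain the string "test". The thresholds and the exclude string are hypothetical choices
+for illustration, not output from an actual run.
+
+[source,text]
+----
+let(a=select(search(collection3, q="*:*", fl="id, subject", sort="id asc"),
+             id,
+             analyze(subject, subject_bigram) as terms),
+    b=termVectors(a, minTermLength=4, minDocFreq=.05, maxDocFreq=.5, exclude="test"))
+----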
+
diff --git a/solr/solr-ref-guide/src/time-series.adoc b/solr/solr-ref-guide/src/time-series.adoc
new file mode 100644
index 00000000000..e76527006ac
--- /dev/null
+++ b/solr/solr-ref-guide/src/time-series.adoc
@@ -0,0 +1,431 @@
+= Time Series
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+This section of the user guide provides an overview of time series *aggregation*,
+*smoothing* and *differencing*.
+
+== Time Series Aggregation
+
+The `timeseries` function performs fast, distributed time
+series aggregation leveraging Solr's built-in faceting and date math capabilities.
+
+The example below performs a monthly time series aggregation:
+
+[source,text]
+----
+timeseries(collection1,
+           q=*:*,
+           field="recdate_dt",
+           start="2012-01-20T17:33:18Z",
+           end="2012-12-20T17:33:18Z",
+           gap="+1MONTH",
+           format="YYYY-MM",
+           count(*))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "recdate_dt": "2012-01",
+        "count(*)": 8703
+      },
+      {
+        "recdate_dt": "2012-02",
+        "count(*)": 8648
+      },
+      {
+        "recdate_dt": "2012-03",
+        "count(*)": 8621
+      },
+      {
+        "recdate_dt": "2012-04",
+        "count(*)": 8533
+      },
+      {
+        "recdate_dt": "2012-05",
+        "count(*)": 8792
+      },
+      {
+        "recdate_dt": "2012-06",
+        "count(*)": 8598
+      },
+      {
+        "recdate_dt": "2012-07",
+        "count(*)": 8679
+      },
+      {
+        "recdate_dt": "2012-08",
+        "count(*)": 8469
+      },
+      {
+        "recdate_dt": "2012-09",
+        "count(*)": 8637
+      },
+      {
+        "recdate_dt": "2012-10",
+        "count(*)": 8536
+      },
+      {
+        "recdate_dt": "2012-11",
+        "count(*)": 8785
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 16
+      }
+    ]
+  }
+}
+----
+
+== Vectorizing the Time Series
+
+Before a time series result can be operated on by math expressions
+the data needs to be vectorized. Specifically,
+in the example above, the aggregation field count(*) needs to be moved into an array.
+As described in the Streams and Vectorization section of the user guide, the `col` function can be used
+to copy a numeric column from a list of tuples into an array.
+
+The expression below demonstrates the vectorization of the count(*) field.
+
+[source,text]
+----
+let(a=timeseries(collection1,
+                 q=*:*,
+                 field="test_dt",
+                 start="2012-01-20T17:33:18Z",
+                 end="2012-12-20T17:33:18Z",
+                 gap="+1MONTH",
+                 format="YYYY-MM",
+                 count(*)),
+    b=col(a, count(*)))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "b": [
+          8703,
+          8648,
+          8621,
+          8533,
+          8792,
+          8598,
+          8679,
+          8469,
+          8637,
+          8536,
+          8785
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 5
+      }
+    ]
+  }
+}
+----
+
+== Smoothing
+
+Time series smoothing is often used to remove the noise from a time series and help
+spot the underlying trends.
+The math expressions library has three *sliding window* approaches
+for time series smoothing. The *sliding window* approaches use a summary value
+from a sliding window of the data to calculate a new set of smoothed data points.
+
+The three *sliding window* functions are lagging indicators, which means
+they don't start to move in the direction of the trend until the trend affects
+the summary value of the sliding window. Because of this lagging quality these smoothing
+functions are often used to confirm the direction of the trend.
+
+=== Moving Average
+
+The `movingAvg` function computes a simple moving average over a sliding window of data.
+The example below generates a time series, vectorizes the count(*) field and computes the
+moving average with a window size of 3.
+
+The moving average function returns an array that is of shorter length
+than the original data set. This is because results are generated only when a full window of data
+is available for computing the average. With a window size of three the moving average will
+begin generating results at the 3rd value. The prior values are not included in the result.
+
+This is true for all the sliding window functions.
+
+[source,text]
+----
+let(a=timeseries(collection1,
+                 q=*:*,
+                 field="test_dt",
+                 start="2012-01-20T17:33:18Z",
+                 end="2012-12-20T17:33:18Z",
+                 gap="+1MONTH",
+                 format="YYYY-MM",
+                 count(*)),
+    b=col(a, count(*)),
+    c=movingAvg(b, 3))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "c": [
+          8657.333333333334,
+          8600.666666666666,
+          8648.666666666666,
+          8641,
+          8689.666666666666,
+          8582,
+          8595,
+          8547.333333333334,
+          8652.666666666666
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 7
+      }
+    ]
+  }
+}
+----
+
+=== Exponential Moving Average
+
+The `expMovingAvg` function uses a different formula for computing the moving average that
+responds faster to changes in the underlying data. This means that it is
+less of a lagging indicator than the simple moving average.
+
+Below is an example that computes an exponential moving average:
+
+[source,text]
+----
+let(a=timeseries(collection1, q=*:*,
+                 field="test_dt",
+                 start="2012-01-20T17:33:18Z",
+                 end="2012-12-20T17:33:18Z",
+                 gap="+1MONTH",
+                 format="YYYY-MM",
+                 count(*)),
+    b=col(a, count(*)),
+    c=expMovingAvg(b, 3))
+----
+
+When this expression is sent to the /stream handler it responds with:

+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "c": [
+          8657.333333333334,
+          8595.166666666668,
+          8693.583333333334,
+          8645.791666666668,
+          8662.395833333334,
+          8565.697916666668,
+          8601.348958333334,
+          8568.674479166668,
+          8676.837239583334
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 5
+      }
+    ]
+  }
+}
+----
+
+=== Moving Median
+
+The `movingMedian` function uses the median of the sliding window rather than the average.
+In many cases the moving median will be more *robust* to outliers than moving averages.
+
+Below is an example computing the moving median:
+
+[source,text]
+----
+let(a=timeseries(collection1,
+                 q=*:*,
+                 field="test_dt",
+                 start="2012-01-20T17:33:18Z",
+                 end="2012-12-20T17:33:18Z",
+                 gap="+1MONTH",
+                 format="YYYY-MM",
+                 count(*)),
+    b=col(a, count(*)),
+    c=movingMedian(b, 3))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "c": [
+          8648,
+          8621,
+          8621,
+          8598,
+          8679,
+          8598,
+          8637,
+          8536,
+          8637
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 7
+      }
+    ]
+  }
+}
+----
+
+== Differencing
+
+Differencing is often used to remove the
+trend or seasonality from a time series. This is known as making a time series
+*stationary*.
+
+=== First Difference
+
+The actual technique of differencing is to use the difference between values rather than the
+original values. The *first difference* takes the difference between a value and the value
+that came directly before it. The first difference is often used to remove the trend
+from a time series.
+
+In the example below, the `diff` function computes the first difference of a time series.
+The result array length is one value smaller than the original array.
+This is because the `diff` function only returns a result for values
+where the prior value has been subtracted.
+
+[source,text]
+----
+let(a=timeseries(collection1,
+                 q=*:*,
+                 field="test_dt",
+                 start="2012-01-20T17:33:18Z",
+                 end="2012-12-20T17:33:18Z",
+                 gap="+1MONTH",
+                 format="YYYY-MM",
+                 count(*)),
+    b=col(a, count(*)),
+    c=diff(b))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "c": [
+          -55,
+          -27,
+          -88,
+          259,
+          -194,
+          81,
+          -210,
+          168,
+          -101,
+          249
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 11
+      }
+    ]
+  }
+}
+----
+
+=== Lagged Differences
+
+The `diff` function has an optional second parameter to specify a lag in the difference.
+If a lag is specified the difference is taken between a value and the value at a specified
+lag in the past. Lagged differences are often used to remove seasonality from a time series.
+
+The simple example below demonstrates how lagged differencing works.
+Notice that the array in the example follows a simple repeated pattern. This type of pattern
+is often displayed by data with seasonality. In this example we can remove this pattern using
+the `diff` function with a lag of 4. This will subtract the value lagging four indexes
+behind the current index. Notice that the result set size is the original array size minus the lag.
+This is because the `diff` function only returns results for values where the lag of 4
+is possible to compute.
+
+[source,text]
+----
+let(a=array(1,2,5,2,1,2,5,2,1,2,5),
+    b=diff(a, 4))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "b": [
+          0,
+          0,
+          0,
+          0,
+          0,
+          0,
+          0
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 0
+      }
+    ]
+  }
+}
+----
diff --git a/solr/solr-ref-guide/src/variables.adoc b/solr/solr-ref-guide/src/variables.adoc
new file mode 100644
index 00000000000..7e12e7517fc
--- /dev/null
+++ b/solr/solr-ref-guide/src/variables.adoc
@@ -0,0 +1,147 @@
+= Variables
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+== The Let Expression
+
+The `let` expression sets variables and returns
+the value of the last variable by default. The output of any streaming expression
+or math expression can be set to a variable.
+
+Below is a simple example setting three variables *a*, *b*
+and *c*. Variables *a* and *b* are set to arrays. The variable *c* is set
+to the output of the `ebeAdd` function which performs element-by-element
+addition of the two arrays.
+
+Notice that the last variable, *c*, is returned.
+
+[source,text]
+----
+let(a=array(1, 2, 3),
+    b=array(10, 20, 30),
+    c=ebeAdd(a, b))
+----
+
+When this expression is sent to the /stream handler it
+responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "c": [
+          11,
+          22,
+          33
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 4
+      }
+    ]
+  }
+}
+----
+
+== Echoing Variables
+
+All variables can be output by setting the *echo* variable to *true*.
+
+[source,text]
+----
+let(echo=true,
+    a=array(1, 2, 3),
+    b=array(10, 20, 30),
+    c=ebeAdd(a, b))
+----
+
+When this expression is sent to the /stream handler it
+responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "a": [
+          1,
+          2,
+          3
+        ],
+        "b": [
+          10,
+          20,
+          30
+        ],
+        "c": [
+          11,
+          22,
+          33
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 0
+      }
+    ]
+  }
+}
+----
+
+A specific set of variables can be echoed by providing a comma-delimited
+list of variables to the echo parameter.
+
+[source,text]
+----
+let(echo="a,b",
+    a=array(1, 2, 3),
+    b=array(10, 20, 30),
+    c=ebeAdd(a, b))
+----
+
+When this expression is sent to the /stream handler it
+responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "a": [
+          1,
+          2,
+          3
+        ],
+        "b": [
+          10,
+          20,
+          30
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 0
+      }
+    ]
+  }
+}
+----
\ No newline at end of file
diff --git a/solr/solr-ref-guide/src/vector-math.adoc b/solr/solr-ref-guide/src/vector-math.adoc
new file mode 100644
index 00000000000..22d610f6236
--- /dev/null
+++ b/solr/solr-ref-guide/src/vector-math.adoc
@@ -0,0 +1,343 @@
+= Vector Math
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+This section of the user guide covers vector math and
+vector manipulation functions.
+
+== Arrays
+
+Arrays can be created with the `array` function.
+
+For example, the expression below creates a numeric array with
+three elements:
+
+[source,text]
+----
+array(1, 2, 3)
+----
+
+When this expression is sent to the /stream handler it responds with
+a JSON array:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "return-value": [
+          1,
+          2,
+          3
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 0
+      }
+    ]
+  }
+}
+----
+
+== Array Operations
+
+Arrays can be passed as parameters to functions that operate on arrays.
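+
+The next few examples cover `rev`, `length` and `copyOfRange`. As a local point of
+reference, the Python sketch below shows the equivalent operations on an ordinary
+list; it illustrates the semantics only and is not the Solr implementation.
+
+[source,python]
+----
+values = [1, 2, 3, 4, 5, 6]
+
+print(list(reversed(values)))  # rev:         [6, 5, 4, 3, 2, 1]
+print(len(values))             # length:      6
+print(values[1:4])             # copyOfRange: [2, 3, 4], start inclusive, end exclusive
+----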
+
+For example, an array can be reversed with the `rev` function:
+
+[source,text]
+----
+rev(array(1, 2, 3))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "return-value": [
+          3,
+          2,
+          1
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 0
+      }
+    ]
+  }
+}
+----
+
+Another example is the `length` function,
+which returns the length of an array:
+
+[source,text]
+----
+length(array(1, 2, 3))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "return-value": 3
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 0
+      }
+    ]
+  }
+}
+----
+
+A slice of an array can be taken with the `copyOfRange` function, which
+copies a range of elements from an array. The start index is inclusive
+and the end index is exclusive.
+
+[source,text]
+----
+copyOfRange(array(1,2,3,4,5,6), 1, 4)
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "return-value": [
+          2,
+          3,
+          4
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 0
+      }
+    ]
+  }
+}
+----
+
+== Vector Summarizations and Norms
+
+A set of functions performs
+summarizations and returns norms of arrays. These functions
+operate over an array and return a single
+value. The following vector summarization and norm functions are available:
+`mult`, `add`, `sumSq`, `mean`, `l1norm`, `l2norm`, `linfnorm`.
+
+The example below uses the `mult` function,
+which multiplies all the values of an array together.
+
+[source,text]
+----
+mult(array(2,4,8))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "return-value": 64
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 0
+      }
+    ]
+  }
+}
+----
+
+The vector norm functions provide different formulas for calculating vector magnitude.
+
+The example below calculates the *l2norm* of an array.
+
+[source,text]
+----
+l2norm(array(2,4,8))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "return-value": 9.16515138991168
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 0
+      }
+    ]
+  }
+}
+----
+
+== Scalar Vector Math
+
+Scalar vector math functions add, subtract, multiply, or divide a scalar value with every value in a vector.
+The following functions perform these operations: `scalarAdd`, `scalarSubtract`, `scalarMultiply`
+and `scalarDivide`.
+
+Below is an example of the `scalarMultiply` function, which multiplies the scalar value 3 with
+every value of an array.
+
+[source,text]
+----
+scalarMultiply(3, array(1,2,3))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "return-value": [
+          3,
+          6,
+          9
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 0
+      }
+    ]
+  }
+}
+----
+
+== Element-By-Element Vector Math
+
+Two vectors can be added, subtracted, multiplied and divided using element-by-element
+vector math functions. The following element-by-element vector math functions are available:
+`ebeAdd`, `ebeSubtract`, `ebeMultiply`, `ebeDivide`.
+
+The expression below performs the element-by-element subtraction of two arrays.
+
+[source,text]
+----
+ebeSubtract(array(10, 15, 20), array(1,2,3))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "return-value": [
+          9,
+          13,
+          17
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 5
+      }
+    ]
+  }
+}
+----
+
+== Dot Product and Cosine Similarity
+
+The `dotProduct` and `cosineSimilarity` functions are often used as similarity measures between two
+sparse vectors. The `dotProduct` function is a measure of both angle and magnitude, while `cosineSimilarity`
+is a measure of angle only.
+
+Below is an example of the `dotProduct` function:
+
+[source,text]
+----
+dotProduct(array(2,3,0,0,0,1), array(2,0,1,0,0,3))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "return-value": 7
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 15
+      }
+    ]
+  }
+}
+----
+
+Below is an example of the `cosineSimilarity` function:
+
+[source,text]
+----
+cosineSimilarity(array(2,3,0,0,0,1), array(2,0,1,0,0,3))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "return-value": 0.5
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 7
+      }
+    ]
+  }
+}
+----
+
+In this case both vectors have a magnitude of `sqrt(14)`, so the cosine similarity
+is the dot product, 7, divided by 14, or 0.5.
\ No newline at end of file
diff --git a/solr/solr-ref-guide/src/vectorization.adoc b/solr/solr-ref-guide/src/vectorization.adoc
new file mode 100644
index 00000000000..b01dcc8275e
--- /dev/null
+++ b/solr/solr-ref-guide/src/vectorization.adoc
@@ -0,0 +1,243 @@
+= Streams and Vectorization
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+This section of the user guide explores techniques
+for retrieving streams of data from Solr and vectorizing the
+*numeric* fields.
+
+The next chapter of the user guide covers
+Text Analysis and Term Vectors, which describes how to
+vectorize *text* fields.
+
+== Streams
+
+Streaming Expressions provides a wide range of stream sources that can be used to
+retrieve data from Solr Cloud collections. Math expressions can be used
+to vectorize and analyze the result sets.
+
+Below are some of the key stream sources:
+
+* *random*: Random sampling is widely used in statistics, probability and machine learning.
+The `random` function returns a random sample of search results that match a
+query. The random samples can be vectorized and operated on by math expressions, and the results
+can be used to describe and make inferences about the entire population.
+
+* *timeseries*: The `timeseries`
+expression provides fast distributed time series aggregations, which can be
+vectorized and analyzed with math expressions.
+
+* *knnSearch*: K-nearest neighbor is a core machine learning algorithm.
+The `knnSearch` function is a specialized k-nearest neighbor implementation optimized to find
+the k-nearest neighbors of a document in a distributed index. Once the nearest neighbors are
+retrieved, they can be vectorized and operated on by machine learning and text mining algorithms.
+
+* *sql*: SQL is the primary query language used by data scientists. The `sql` function supports
+data retrieval using a subset of SQL that includes both full text search and
+fast distributed aggregations. The result sets can then be vectorized and operated
+on by math expressions.
+
+* *jdbc*: The `jdbc` function allows data from any JDBC-compliant data source to be combined with
+streams originating from Solr. Result sets from outside data sources can be vectorized and operated
+on by math expressions in the same manner as result sets originating from Solr.
+
+* *topic*: Messaging is an important foundational technology for large scale computing. The `topic`
+function provides publish/subscribe messaging capabilities by treating
+Solr Cloud as a distributed message queue. Topics are extremely powerful
+because they allow subscription by query. Topics can be used to support a broad set of
+use cases including bulk text mining operations and AI alerting.
+
+* *nodes*: Graph queries are frequently used by recommendation engines and are an important
+machine learning tool. The `nodes` function provides fast, distributed, breadth-first
+graph traversal over documents in a Solr Cloud collection. The node sets collected
+by the `nodes` function can be operated on by statistical and machine learning expressions to
+gain more insight into the graph.
+
+* *search*: Ranked search results are a powerful tool for finding the most relevant
+documents from a large document corpus. The `search` expression
+returns the top N ranked search results that match any
+Solr query, including geo-spatial queries. The smaller set of relevant
+documents can then be explored with statistical, machine learning and
+text mining expressions to gather insights about the data set.
+
+== Assigning Streams to Variables
+
+The output of any streaming expression can be set to a variable.
+Below is a very simple example using the `random` function to fetch
+three random samples from collection1. The random samples are returned
+as *tuples*, which contain name/value pairs.
+
+[source,text]
+----
+let(a=random(collection1, q="*:*", rows="3", fl="price_f"))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "a": [
+          {
+            "price_f": 0.7927976
+          },
+          {
+            "price_f": 0.060795486
+          },
+          {
+            "price_f": 0.55128294
+          }
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 11
+      }
+    ]
+  }
+}
+----
+
+== Creating a Vector with the *col* Function
+
+The `col` function iterates over a list of tuples and copies the values
+from a specific column into an *array*.
+
+The output of the `col` function is a numeric array that can be set to a
+variable and operated on by math expressions.
+
+Below is an example of the `col` function:
+
+[source,text]
+----
+let(a=random(collection1, q="*:*", rows="3", fl="price_f"),
+    b=col(a, price_f))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "b": [
+          0.42105234,
+          0.85237443,
+          0.7566981
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 9
+      }
+    ]
+  }
+}
+----
+
+== Applying Math Expressions to the Vector
+
+Once a vector has been created, any math expression that operates on vectors
+can be applied.
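+
+As a quick check on the arithmetic, the sketch below computes the mean of the
+three-element price vector from the `col` example above in plain Python; it is an
+illustration only, not the Solr `mean` function.
+
+[source,python]
+----
+# Arithmetic mean of the price_f vector shown in the col example above.
+prices = [0.42105234, 0.85237443, 0.7566981]
+print(sum(prices) / len(prices))  # 0.67670829
+----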
+In the example below, the `mean` function is applied to
+the vector assigned to variable *b*.
+
+[source,text]
+----
+let(a=random(collection1, q="*:*", rows="15000", fl="price_f"),
+    b=col(a, price_f),
+    c=mean(b))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "c": 0.5016035594638814
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 306
+      }
+    ]
+  }
+}
+----
+
+== Creating Matrices
+
+Matrices can be created by vectorizing multiple numeric fields
+and adding them to a matrix. The matrices can then be operated on by
+any math expression that operates on matrices.
+
+Note that this section deals with the creation of matrices
+from numeric data. The next chapter of the user guide covers
+Text Analysis and Term Vectors, which describes how to build TF-IDF
+term vector matrices from text fields.
+
+Below is a simple example where four random samples are taken
+from different sub-populations in the data. The *price_f* field of
+each random sample is
+vectorized and the vectors are added as rows to a matrix.
+Then the `sumRows`
+function is applied to the matrix to return a vector containing
+the sum of each row.
+
+[source,text]
+----
+let(a=random(collection1, q="market:A", rows="5000", fl="price_f"),
+    b=random(collection1, q="market:B", rows="5000", fl="price_f"),
+    c=random(collection1, q="market:C", rows="5000", fl="price_f"),
+    d=random(collection1, q="market:D", rows="5000", fl="price_f"),
+    e=col(a, price_f),
+    f=col(b, price_f),
+    g=col(c, price_f),
+    h=col(d, price_f),
+    i=matrix(e, f, g, h),
+    j=sumRows(i))
+----
+
+When this expression is sent to the /stream handler it responds with:
+
+[source,json]
+----
+{
+  "result-set": {
+    "docs": [
+      {
+        "j": [
+          154390.1293375,
+          167434.89453,
+          159293.258493,
+          149773.42769
+        ]
+      },
+      {
+        "EOF": true,
+        "RESPONSE_TIME": 9
+      }
+    ]
+  }
+}
+----
\ No newline at end of file
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/FieldValueEvaluator.java b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/FieldValueEvaluator.java
index fac42740bdf..a12a74e37e4 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/FieldValueEvaluator.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/io/eval/FieldValueEvaluator.java
@@ -31,10 +31,12 @@ public class FieldValueEvaluator extends SourceEvaluator {
   private static final long serialVersionUID = 1L;
 
   private String fieldName;
+  private boolean literal;
 
   public FieldValueEvaluator(String fieldName) {
-    if(fieldName.startsWith("'") && fieldName.endsWith("'") && fieldName.length() > 1){
+    if(fieldName.startsWith("\"") && fieldName.endsWith("\"") && fieldName.length() > 1){
       fieldName = fieldName.substring(1, fieldName.length() - 1);
+      literal = true;
     }
 
     this.fieldName = fieldName;
@@ -42,6 +44,10 @@ public class FieldValueEvaluator extends SourceEvaluator {
 
   @Override
   public Object evaluate(Tuple tuple) throws IOException {
+    if(literal) {
+      return fieldName;
+    }
+
     Object value = tuple.get(fieldName);
 
     // This is somewhat radical.
@@ -84,10 +90,6 @@ public class FieldValueEvaluator extends SourceEvaluator {
       }
     }
 
-    if(value == null) {
-      return fieldName;
-    }
-
     return value;
   }