From b7b7f4df02f28bfe602d3a195650d39635da1243 Mon Sep 17 00:00:00 2001 From: Joel Bernstein Date: Thu, 17 Aug 2017 16:06:14 -0400 Subject: [PATCH] SOLR-10651: Statistical function docs for 7.0 Part 3 --- .../src/statistical-programming.adoc | 768 ++++++++++++++++++ 1 file changed, 768 insertions(+) create mode 100644 solr/solr-ref-guide/src/statistical-programming.adoc diff --git a/solr/solr-ref-guide/src/statistical-programming.adoc b/solr/solr-ref-guide/src/statistical-programming.adoc new file mode 100644 index 00000000000..2bc79116c49 --- /dev/null +++ b/solr/solr-ref-guide/src/statistical-programming.adoc @@ -0,0 +1,768 @@ += Statistical Programming +:page-shortname: statistical-programming +:page-permalink: statistical-programming.html +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +The Streaming Expression language includes a powerful statistical programing syntax with many of the +features of a functional programming language. The syntax includes *variables*, *data structures* +and a growing set of *mathematical functions*. + +Using the statistical programing syntax Solr's powerful `data retrieval` +capabilities can be combined with in-depth `statistical analysis`. + +The `data retrieval` methods include: + + * SQL + * time series aggregation + * random sampling + * faceted aggregation + * KNN searches + * topic message queues + * MapReduce (parallel relational algebra) + * JDBC calls to outside databases + * Graph Expressions + +Once the data is retrieved, the statistical programming syntax can be used to create *arrays* from the data so it +can be *manipulated*, *transformed* and *analyzed*. + +The statistical function library includes functions that perform: + +* Correlation +* Cross-correlation +* Covariance +* Moving averages +* Percentiles +* Simple regression and prediction +* ANOVA +* Histograms +* Convolution +* Euclidean distance +* Descriptive statistics +* Rank transformation +* Normalization transformation +* Sequences +* Array manipulation functions (creation, copying, length, scaling, reverse etc...) + +The statistical function library is backed by `Apache Commons Math`. + +This document provides an overview of the how to apply the variables, data structures +and mathematical functions. + +== /stream handler + +Like all Streaming Expressions the statistical functions can be run by Solr's /stream handler. + +== Math + +Streaming Expressions contain a suite of *mathematical* functions which can be called on +their own or as part of a larger expression. + +Solr's /stream handler evaluates the mathematical expression and returns a result. + +For example sending the following expression to the /stream handler: + +[source,text] +---- +add(1, 1) +---- + +Returns the following response: + +[source,text] +---- +{ + "result-set": { + "docs": [ + { + "return-value": 2 + }, + { + "EOF": true, + "RESPONSE_TIME": 2 + } + ] + } +} +---- + +You can nest math functions within each other. For example: + +[source,text] +---- +pow(10, add(1,1)) +---- + +Returns the following response: + +[source,text] +---- +{ + "result-set": { + "docs": [ + { + "return-value": 100 + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +You can also perform math on a stream of Tuples. + +For example: + +[source,text] +---- +select(search(collection2, q="*:*", fl="price_f", sort="price_f desc", rows="3"), + price_f, + mult(price_f, 10) as newPrice) +---- + +Returns the following response: + +[source, text] +---- +{ + "result-set": { + "docs": [ + { + "price_f": 0.99999994, + "newPrice": 9.9999994 + }, + { + "price_f": 0.99999994, + "newPrice": 9.9999994 + }, + { + "price_f": 0.9999992, + "newPrice": 9.999992 + }, + { + "EOF": true, + "RESPONSE_TIME": 3 + } + ] + } +} +---- + +== Arrays + +The first data structure we'll explore is the *array*. + +We can create an array with the `array` function: + +For example: + +[source,text] +---- +array(1, 2, 3) +---- + +Returns the following response: + +[source,text] +---- +{ + "result-set": { + "docs": [ + { + "return-value": [ + 1, + 2, + 3 + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +We can nest arrays within arrays to form a *matrix*: + +[source,text] +---- +array(array(1, 2, 3), + array(4, 5, 6)) +---- + +Returns the following response: + +[source,text] +---- +{ + "result-set": { + "docs": [ + { + "return-value": [ + [ + 1, + 2, + 3 + ], + [ + 4, + 5, + 6 + ] + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +We can manipulate arrays with functions. + +For example we can reverse and array with `rev` function. + +[source,text] +---- +rev(array(1, 2, 3)) +---- + +Returns the following response: + +[source,text] +---- +{ + "result-set": { + "docs": [ + { + "return-value": [ + 3, + 2, + 1 + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +Functions can return arrays. + +For example the sequence function: + +[source,text] +---- +sequence(5,0,1) +---- + +This returns an array of size *5* starting from *0* with a stride of *1*. + +[source,text] +---- +{ + "result-set": { + "docs": [ + { + "return-value": [ + 0, + 1, + 2, + 3, + 4 + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 4 + } + ] + } +} +---- + +We can perform math on an array. + +For example we can scale an array with the `scale` function: + +Expression: + +[source,text] +---- +scale(sequence(5,0,1), 10) +---- + +Returns the following response: + +[source,text] +---- +{ + "result-set": { + "docs": [ + { + "return-value": [ + 0, + 10, + 20, + 30, + 40 + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +We can perform `statistical analysis` on arrays. + +For example we can correlate two sequences with the `corr` function: + +[source,text] +---- +corr(sequence(5,1,1), sequence(5,10,10)) +---- + +Returns the following response: + +[source,text] +---- +{ + "result-set": { + "docs": [ + { + "return-value": 1 + }, + { + "EOF": true, + "RESPONSE_TIME": 1 + } + ] + } +} +---- + + +== Tuple + +The *tuple* is the next data structure we'll explore. + +The `tuple` function returns a map of name/value pairs. A tuple is a very flexible data structure +that can hold values that are strings, numerics, arrays and lists of tuples. + +A tuple can be used to return a complex result from a statistical expression. + +Here is an example: + +[source,text] +---- +tuple(title="hello world", + array1=array(1,2,3,4), + array2=array(4,5,6,7)) + +Returns the following response: + +---- +[source,text] +---- +{ + "result-set": { + "docs": [ + { + "title": "hello world", + "array1": [ + 1, + 2, + 3, + 4 + ], + "array2": [ + 4, + 5, + 6, + 7 + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +== List + +Next up, we have the *list* data structure. + +The `list` function is a data structure that wraps Streaming Expressions and emits their tuples as a single +concatenated stream. + +Below is an example of a list of tuples: + +[source,text] +---- +list(tuple(id=1, data=array(1, 2, 3)), + tuple(id=2, data=array(10, 12, 14))) +---- + +Returns the following response: + +[source,text] +---- + +{ + "result-set": { + "docs": [ + { + "id": "1", + "data": [ + 1, + 2, + 3 + ] + }, + { + "id": "2", + "data": [ + 10, + 12, + 14 + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 0 + } + ] + } +} +---- + +== Let + +The `let` function sets variables and runs a streaming expression that references the variables. + +Th output of any Streaming Expression can be stored in a variable. + +Lets see how `let` works. + +Here is a very simple example: + +[source,text] +---- +let(a=random(collection2, q="*:*", rows="3", fl="price_f"), + b=random(collection2, q="*:*", rows="3", fl="price_f"), + tuple(sample1=a, sample2=b)) +---- + +The `let` expression above is setting variables *a* and *b* to random +samples taken from collection2. + +The `let` function then executes the `tuple` streaming expression +which references the two variables. + +Here is the output: + +[source,text] +---- +{ + "result-set": { + "docs": [ + { + "sample1": [ + { + "price_f": 0.39729273 + }, + { + "price_f": 0.063344836 + }, + { + "price_f": 0.42020327 + } + ], + "sample2": [ + { + "price_f": 0.659244 + }, + { + "price_f": 0.58797807 + }, + { + "price_f": 0.57520163 + } + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 20 + } + ] + } +} +---- + +== Col + +The `col` function is used to move a column of numbers from a list of tuples into an array. +This is an important step because Streaming Expressions such as SQL, random and timeseries return tuples, +but the statistical functions operate on arrays. + +Below is an example of the `col` function: + +[source,text] +---- +let(a=random(collection2, q="*:*", rows="3", fl="price_f"), + b=random(collection2, q="*:*", rows="3", fl="price_f"), + c=col(a, price_f), + d=col(b, price_f), + tuple(sample1=c, sample2=d)) +---- + +The example above is using the `col` function to create arrays from the tuples stored in +variables *a* and *b*. + +Variable *c* contains an array of values from the *price_f* field, +taken from the tuples stored in variable *a*. + +Variable *d* contains an array of values from the *price_f* field, +taken from the tuples stored in variable *b*. + +Also notice that the response `tuple` is now pointing to the arrays in variables *c* and *d*. + +The response shows the arrays: + +[source,text] +---- + +{ + "result-set": { + "docs": [ + { + "sample1": [ + 0.06490427, + 0.6751543, + 0.07063508 + ], + "sample2": [ + 0.8884564, + 0.8878821, + 0.3504665 + ] + }, + { + "EOF": true, + "RESPONSE_TIME": 17 + } + ] + } +} +---- + +== Statistical Programming + +We've covered how the *data structures*, *variables* and a few *statistical functions* work. +Let's dive into an example that puts these tools to use. + +=== Use case + +We have an existing hotel in *cityA* that is very profitable. +We are contemplating opening up a new hotel in a different city. +We're considering 4 different cities: *cityB*, *cityC*, *cityD*, *cityE*. +We'd like to open a hotel in a city that has a very similar room rate to *cityA*. + +How do we determine which of the 4 cities we're considering has the most similar room rates to *cityA* + +=== The Data + +We have a data set of un-aggregated hotel *bookings*. Each booking record has a rate and city. + +=== Can we simply aggregate? + +One approach would be to aggregate the data from each city and compare the *mean* room rates. This approach will +give us some useful information, but the mean is a summary statistic which loses a significant amount of information +about the data. For example we don't have an understanding of how the distribution of room rates is effecting the +mean. + +The *median* room rate provides another interesting data point but it's still not the entire picture. It sill just +one point of reference. + +Is there a way that we can compare the markets without losing valuable information in the data? + +=== K Nearest Neighbor + +The use case we're thinking about can often be approached using a k Nearest Neighbor (knn) algorithm. + +With knn we use a *distance* measure to compare vectors of data to find the k nearest neighbors to +a specific vector. + +=== Distance + +The Streaming Expression statistical function library has a function called `distance`. The `distance` function +computes the Euclidean distance between two vectors. This looks promising for comparing vectors of room rates. + +=== Vectors + +But how to create the vectors from a our data set. Remember we have un-aggregated room rates from each of the cities. +How can we vectorize the data so it can be compared using the `distance` function. + +We have a Streaming Expression that can take a *random sample* from each of the cities. The name of this +expression is `random`. So we could take a random sample of 1000 room rates from each of the five markets. + +But random vectors of room rates are not comparable because the distance algorithm compares values at each index +in the vector. How can make these vectors comparable? + +We can make them comparable by *sorting* them. Then as the distance algorithm moves along the vectors it will be +comparing room rates from lowest to highest in both markets. + +=== The code + +let(cityA=sort(random(bookings, q="city:cityA", rows="1000", fl="rate_d"), by="rate_d asc"), + cityB=sort(random(bookings, q="city:cityB", rows="1000", fl="rate_d"), by="rate_d asc"), + cityC=sort(random(bookings, q="city:cityC", rows="1000", fl="rate_d"), by="rate_d asc"), + city4=sort(random(bookings, q="city:cityD", rows="1000", fl="rate_d"), by="rate_d asc"), + city5=sort(random(bookings, q="city:cityE", rows="1000", fl="rate_d"), by="rate_d asc"), + ratesA=col(cityA, rate_d), + ratesB=col(cityB, rate_d), + ratesC=col(cityC, rate_d), + ratesD=col(cityD, rate_d), + ratesE=col(cityE, rate_d), + top(n=1, + sort="distance asc", + list(tuple(city=B, distance=distance(rateA, rateB)), + tuple(city=C, distance=distance(rateA, rateC)), + tuple(city=D, distance=distance(rateA, rateD)), + tuple(city=E, distance=distance(rateA, rateD))))) + + +==== The code explained + +The `let` expression sets variables first. + +The first 5 variables (cityA, cityB, cityC, cityD, cityE), contain the random samples from the `bookings` collection. +the `random` function is pulling 1000 random samples from each city and including the `rate_d` field in the +tuples that are returned. + +The `random` function is wrapped by a `sort` function which is sorting the tuples in +ascending order based on the rate_d field. + +The next five variables (rates1, rates2, rates3, rates4, rates5) contain the arrays of room rates for each +city. The `col` function is used to move the `rate_d` field from the random sample tuples +into an array for each city. + +Now we have five sorted vectors of room rates that we can compare with our `distance` function. + +After the variables are set the `let` expression runs the `top` expression. + +The `top` expression is wrapping a `list` of `tuples`. Inside each tuple the distance function is used to compare +the rateA vector with one of the other cities. The output of the distance function is stored in the distance field +in the tuple. + +The `list` function emits each `tuple` and the `top` function returns only the tuple with lowest distance. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +[source,text] +---- +let(tuples=list(sort(random(collection2, q="month_i:1", rows="1000", fl="response_d"), by="reponse_d asc"), + sort(random(collection2, q="month_i:2", rows="1000", fl="response_d"), by="reponse_d asc"), + sort(random(collection2, q="month_i:3", rows="1000", fl="response_d"), by="reponse_d asc"), + sort(random(collection2, q="month_i:4", rows="1000", fl="response_d"), by="reponse_d asc"), + sort(random(collection2, q="month_i:5", rows="1000", fl="response_d"), by="reponse_d asc")), + allSamples=col(tuples, response_d), + rankedSamples=rank(allSamples), + sample1=copyOfRange(rankedSamples, 0, 1000), + sample2=copyOfRange(rankedSamples, 1000, 2000), + sample3=copyOfRange(rankedSamples, 2000, 3000), + sample4=copyOfRange(rankedSamples, 3000, 4000), + sample5=copyOfRange(rankedSamples, 4000, 5000), + top(n=1, sort="distance asc", + list(tuple(sample=2, distance=distance(sample1, sample2)), + tuple(sample=3, distance=distance(sample1, sample3)), + tuple(sample=4, distance=distance(sample1, sample4)), + tuple(sample=5, distance=distance(sample1, sample5))))) +---- + +[source,text] +---- + +---- + + + + + + +