How to architect an analytics stack

fjy 2015-03-26 10:41:16 -07:00
parent aea7f9d192
commit 74af1f947e
36 changed files with 3438 additions and 0 deletions

publications/radstack/.gitignore

@@ -0,0 +1,4 @@
*.aux
*.out
*.bbl
*.blg

publications/radstack/Makefile

@@ -0,0 +1,12 @@
all : radstack.pdf

clean :
	@rm -f *.aux *.bbl *.blg *.log

%.tex : %.bib

%.pdf : %.tex %.bib
	lualatex $(*F)
	bibtex $(*F)
	lualatex $(*F)
	lualatex $(*F)
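The pattern rule runs lualatex, then bibtex, then lualatex twice more so citations and cross-references resolve. Typical usage, assuming lualatex and bibtex are on the PATH:

```bash
make        # builds radstack.pdf via the %.pdf pattern rule
make clean  # removes the .aux/.bbl/.blg/.log build artifacts
```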

publications/radstack/README.md

@@ -0,0 +1,4 @@
Download [MacTeX](http://tug.org/mactex/), then build the paper:
```bash
make
```

Binary files added but not shown (the paper's figure images; the sizes the viewer reports range from 17 KiB to 85 KiB).

publications/radstack/radstack.bib

@@ -0,0 +1,536 @@
@article{cattell2011scalable,
title={Scalable SQL and NoSQL data stores},
author={Cattell, Rick},
journal={ACM SIGMOD Record},
volume={39},
number={4},
pages={12--27},
year={2011},
publisher={ACM}
}
@article{chang2008bigtable,
title={Bigtable: A distributed storage system for structured data},
author={Chang, Fay and Dean, Jeffrey and Ghemawat, Sanjay and Hsieh, Wilson C and Wallach, Deborah A and Burrows, Mike and Chandra, Tushar and Fikes, Andrew and Gruber, Robert E},
journal={ACM Transactions on Computer Systems (TOCS)},
volume={26},
number={2},
pages={4},
year={2008},
publisher={ACM}
}
@inproceedings{decandia2007dynamo,
title={Dynamo: Amazon's highly available key-value store},
author={DeCandia, Giuseppe and Hastorun, Deniz and Jampani, Madan and Kakulapati, Gunavardhan and Lakshman, Avinash and Pilchin, Alex and Sivasubramanian, Swaminathan and Vosshall, Peter and Vogels, Werner},
booktitle={ACM SIGOPS Operating Systems Review},
volume={41},
number={6},
pages={205--220},
year={2007},
organization={ACM}
}
@inproceedings{abadi2008column,
title={Column-Stores vs. Row-Stores: How different are they really?},
author={Abadi, Daniel J and Madden, Samuel R and Hachem, Nabil},
booktitle={Proceedings of the 2008 ACM SIGMOD international conference on Management of data},
pages={967--980},
year={2008},
organization={ACM}
}
@inproceedings{bear2012vertica,
title={The Vertica database: SQL RDBMS for managing big data},
author={Bear, Chuck and Lamb, Andrew and Tran, Nga},
booktitle={Proceedings of the 2012 workshop on Management of big data systems},
pages={37--38},
year={2012},
organization={ACM}
}
@article{lakshman2010cassandra,
title={Cassandra: a decentralized structured storage system},
author={Lakshman, Avinash and Malik, Prashant},
journal={ACM SIGOPS Operating Systems Review},
volume={44},
number={2},
pages={35},
year={2010}
}
@article{melnik2010dremel,
title={Dremel: interactive analysis of web-scale datasets},
author={Melnik, Sergey and Gubarev, Andrey and Long, Jing Jing and Romer, Geoffrey and Shivakumar, Shiva and Tolton, Matt and Vassilakis, Theo},
journal={Proceedings of the VLDB Endowment},
volume={3},
number={1-2},
pages={330--339},
year={2010},
publisher={VLDB Endowment}
}
@article{hall2012processing,
title={Processing a trillion cells per mouse click},
author={Hall, Alexander and Bachmann, Olaf and B{\"u}ssow, Robert and G{\u{a}}nceanu, Silviu and Nunkesser, Marc},
journal={Proceedings of the VLDB Endowment},
volume={5},
number={11},
pages={1436--1446},
year={2012},
publisher={VLDB Endowment}
}
@inproceedings{shvachko2010hadoop,
title={The Hadoop distributed file system},
author={Shvachko, Konstantin and Kuang, Hairong and Radia, Sanjay and Chansler, Robert},
booktitle={Mass Storage Systems and Technologies (MSST), 2010 IEEE 26th Symposium on},
pages={1--10},
year={2010},
organization={IEEE}
}
@article{colantonio2010concise,
title={Concise: Compressed 'n' Composable Integer Set},
author={Colantonio, Alessandro and Di Pietro, Roberto},
journal={Information Processing Letters},
volume={110},
number={16},
pages={644--650},
year={2010},
publisher={Elsevier}
}
@inproceedings{stonebraker2005c,
title={C-store: a column-oriented DBMS},
author={Stonebraker, Mike and Abadi, Daniel J and Batkin, Adam and Chen, Xuedong and Cherniack, Mitch and Ferreira, Miguel and Lau, Edmond and Lin, Amerson and Madden, Sam and O'Neil, Elizabeth and others},
booktitle={Proceedings of the 31st international conference on Very large data bases},
pages={553--564},
year={2005},
organization={VLDB Endowment}
}
@article{stonebraker1987extendability,
title={Extendability in POSTGRES.},
author={Stonebraker, Michael and Anton, Jeff and Hirohama, Michael},
journal={IEEE Data Eng. Bull.},
volume={10},
number={2},
pages={16--23},
year={1987}
}
@book{george2011hbase,
title={HBase: the definitive guide},
author={George, Lars},
year={2011},
publisher={" O'Reilly Media, Inc."}
}
@inproceedings{engle2012shark,
title={Shark: fast data analysis using coarse-grained distributed memory},
author={Engle, Cliff and Lupher, Antonio and Xin, Reynold and Zaharia, Matei and Franklin, Michael J and Shenker, Scott and Stoica, Ion},
booktitle={Proceedings of the 2012 international conference on Management of Data},
pages={689--692},
year={2012},
organization={ACM}
}
@inproceedings{zaharia2012discretized,
title={Discretized streams: an efficient and fault-tolerant model for stream processing on large clusters},
author={Zaharia, Matei and Das, Tathagata and Li, Haoyuan and Shenker, Scott and Stoica, Ion},
booktitle={Proceedings of the 4th USENIX conference on Hot Topics in Cloud Computing},
pages={10--10},
year={2012},
organization={USENIX Association}
}
@misc{marz2013storm,
author = {Marz, Nathan},
title = {Storm: Distributed and Fault-Tolerant Realtime Computation},
month = {February},
year = {2013},
howpublished = "\url{http://storm-project.net/}"
}
@misc{2014samza,
title = {Apache Samza},
year = {2014},
howpublished = "\url{http://samza.apache.org/}"
}
@misc{2013linkedin,
title = {Camus},
year = {2013},
howpublished = "\url{https://github.com/linkedin/camus}"
}
@misc{yang2014radstack,
title = {Real Time Analytics with Open Source Technologies},
year = {2014},
howpublished = "\url{https://speakerdeck.com/druidio/real-time-analytics-with-open-source-technologies-1}"
}
@inproceedings{yang2014druid,
title={Druid: a real-time analytical data store},
author={Yang, Fangjin and Tschetter, Eric and L{\'e}aut{\'e}, Xavier and Ray, Nelson and Merlino, Gian and Ganguli, Deep},
booktitle={Proceedings of the 2014 ACM SIGMOD international conference on Management of data},
pages={157--168},
year={2014},
organization={ACM}
}
@misc{2014yahoo,
title = {Pushing the limits of Realtime Analytics using Druid},
year = {2014},
howpublished = "\url{http://www.slideshare.net/ydn/pushing-thelimitsofrealtimeanalyticswithdruidv3}"
}
@misc{tschetter2011druid,
author = {Eric Tschetter},
title = {Introducing Druid: Real-Time Analytics at a Billion Rows Per Second},
month = {April},
year = {2011},
howpublished = "\url{http://druid.io/blog/2011/04/30/introducing-druid.html}"
}
@article{farber2012sap,
title={SAP HANA database: data management for modern business applications},
author={F{\"a}rber, Franz and Cha, Sang Kyun and Primsch, J{\"u}rgen and Bornh{\"o}vd, Christof and Sigg, Stefan and Lehner, Wolfgang},
journal={ACM Sigmod Record},
volume={40},
number={4},
pages={45--51},
year={2012},
publisher={ACM}
}
@misc{voltdb2010voltdb,
title={VoltDB Technical Overview},
author={VoltDB, LLC},
year={2010},
howpublished = "\url{https://voltdb.com/}"
}
@inproceedings{macnicol2004sybase,
title={Sybase IQ multiplex-designed for analytics},
author={MacNicol, Roger and French, Blaine},
booktitle={Proceedings of the Thirtieth international conference on Very large data bases-Volume 30},
pages={1227--1230},
year={2004},
organization={VLDB Endowment}
}
@inproceedings{singh2011introduction,
title={Introduction to the IBM Netezza warehouse appliance},
author={Singh, Malcolm and Leonhardi, Ben},
booktitle={Proceedings of the 2011 Conference of the Center for Advanced Studies on Collaborative Research},
pages={385--386},
year={2011},
organization={IBM Corp.}
}
@inproceedings{miner2012unified,
title={Unified analytics platform for big data},
author={Miner, Donald},
booktitle={Proceedings of the WICSA/ECSA 2012 Companion Volume},
pages={176--176},
year={2012},
organization={ACM}
}
@inproceedings{fink2012distributed,
title={Distributed computation on dynamo-style distributed storage: riak pipe},
author={Fink, Bryan},
booktitle={Proceedings of the eleventh ACM SIGPLAN workshop on Erlang workshop},
pages={43--50},
year={2012},
organization={ACM}
}
@misc{paraccel2013,
key = {ParAccel Analytic Database},
title = {ParAccel Analytic Database},
month = {March},
year = {2013},
howpublished = "\url{http://www.paraccel.com/resources/Datasheets/ParAccel-Core-Analytic-Database.pdf}"
}
@misc{cloudera2013,
key = {Cloudera Impala},
title = {Cloudera Impala},
month = {March},
year = {2013},
howpublished = "\url{http://blog.cloudera.com/blog}"
}
@inproceedings{hunt2010zookeeper,
title={ZooKeeper: Wait-free coordination for Internet-scale systems},
author={Hunt, Patrick and Konar, Mahadev and Junqueira, Flavio P and Reed, Benjamin},
booktitle={USENIX ATC},
volume={10},
year={2010}
}
@inproceedings{kreps2011kafka,
title={Kafka: A distributed messaging system for log processing},
author={Kreps, Jay and Narkhede, Neha and Rao, Jun},
booktitle={Proceedings of 6th International Workshop on Networking Meets Databases (NetDB), Athens, Greece},
year={2011}
}
@misc{liblzf2013,
title = {LibLZF},
key = {LibLZF},
month = {March},
year = {2013},
howpublished = "\url{http://freecode.com/projects/liblzf}"
}
@inproceedings{tomasic1993performance,
title={Performance of inverted indices in shared-nothing distributed text document information retrieval systems},
author={Tomasic, Anthony and Garcia-Molina, Hector},
booktitle={Parallel and Distributed Information Systems, 1993., Proceedings of the Second International Conference on},
pages={8--17},
year={1993},
organization={IEEE}
}
@inproceedings{antoshenkov1995byte,
title={Byte-aligned bitmap compression},
author={Antoshenkov, Gennady},
booktitle={Data Compression Conference, 1995. DCC'95. Proceedings},
pages={476},
year={1995},
organization={IEEE}
}
@inproceedings{van2011memory,
title={A memory efficient reachability data structure through bit vector compression},
author={van Schaik, Sebastiaan J and de Moor, Oege},
booktitle={Proceedings of the 2011 international conference on Management of data},
pages={913--924},
year={2011},
organization={ACM}
}
@inproceedings{o1993lru,
title={The LRU-K page replacement algorithm for database disk buffering},
author={O'Neil, Elizabeth J and O'Neil, Patrick E and Weikum, Gerhard},
booktitle={ACM SIGMOD Record},
volume={22},
number={2},
pages={297--306},
year={1993},
organization={ACM}
}
@article{kim2001lrfu,
title={LRFU: A spectrum of policies that subsumes the least recently used and least frequently used policies},
author={Kim, Chong Sang},
journal={IEEE Transactions on Computers},
volume={50},
number={12},
year={2001}
}
@article{wu2006optimizing,
title={Optimizing bitmap indices with efficient compression},
author={Wu, Kesheng and Otoo, Ekow J and Shoshani, Arie},
journal={ACM Transactions on Database Systems (TODS)},
volume={31},
number={1},
pages={1--38},
year={2006},
publisher={ACM}
}
@misc{twitter2013,
key = {Twitter Public Streams},
title = {Twitter Public Streams},
month = {March},
year = {2013},
howpublished = "\url{https://dev.twitter.com/docs/streaming-apis/streams/public}"
}
@article{fitzpatrick2004distributed,
title={Distributed caching with memcached},
author={Fitzpatrick, Brad},
journal={Linux journal},
number={124},
pages={72--74},
year={2004}
}
@inproceedings{amdahl1967validity,
title={Validity of the single processor approach to achieving large scale computing capabilities},
author={Amdahl, Gene M},
booktitle={Proceedings of the April 18-20, 1967, spring joint computer conference},
pages={483--485},
year={1967},
organization={ACM}
}
@book{sarawagi1998discovery,
title={Discovery-driven exploration of OLAP data cubes},
author={Sarawagi, Sunita and Agrawal, Rakesh and Megiddo, Nimrod},
year={1998},
publisher={Springer}
}
@article{hu2011stream,
title={Stream Database Survey},
author={Hu, Bo},
year={2011}
}
@article{dean2008mapreduce,
title={MapReduce: simplified data processing on large clusters},
author={Dean, Jeffrey and Ghemawat, Sanjay},
journal={Communications of the ACM},
volume={51},
number={1},
pages={107--113},
year={2008},
publisher={ACM}
}
@misc{linkedin2013senseidb,
author = {LinkedIn},
title = {SenseiDB},
month = {July},
year = {2013},
howpublished = "\url{http://www.senseidb.com/}"
}
@misc{apache2013solr,
author = {Apache},
title = {Apache Solr},
month = {February},
year = {2013},
howpublished = "\url{http://lucene.apache.org/solr/}"
}
@misc{banon2013elasticsearch,
author = {Banon, Shay},
title = {ElasticSearch},
month = {July},
year = {2013},
howpublished = "\url{http://www.elasticseach.com/}"
}
@book{oehler2012ibm,
title={IBM Cognos TM1: The Official Guide},
author={Oehler, Karsten and Gruenes, Jochen and Ilacqua, Christopher and Perez, Manuel},
year={2012},
publisher={McGraw-Hill}
}
@book{schrader2009oracle,
title={Oracle Essbase \& Oracle OLAP},
author={Schrader, Michael and Vlamis, Dan and Nader, Mike and Claterbos, Chris and Collins, Dave and Campbell, Mitch and Conrad, Floyd},
year={2009},
publisher={McGraw-Hill, Inc.}
}
@book{lachev2005applied,
title={Applied Microsoft Analysis Services 2005: And Microsoft Business Intelligence Platform},
author={Lachev, Teo},
year={2005},
publisher={Prologika Press}
}
@article{o1996log,
title={The log-structured merge-tree (LSM-tree)},
author={O'Neil, Patrick and Cheng, Edward and Gawlick, Dieter and O'Neil, Elizabeth},
journal={Acta Informatica},
volume={33},
number={4},
pages={351--385},
year={1996},
publisher={Springer}
}
@inproceedings{o1997improved,
title={Improved query performance with variant indexes},
author={O'Neil, Patrick and Quass, Dallan},
booktitle={ACM Sigmod Record},
volume={26},
number={2},
pages={38--49},
year={1997},
organization={ACM}
}
@inproceedings{cipar2012lazybase,
title={LazyBase: trading freshness for performance in a scalable database},
author={Cipar, James and Ganger, Greg and Keeton, Kimberly and Morrey III, Charles B and Soules, Craig AN and Veitch, Alistair},
booktitle={Proceedings of the 7th ACM european conference on Computer Systems},
pages={169--182},
year={2012},
organization={ACM}
}
@article{collet2013lz4,
title={LZ4: Extremely fast compression algorithm},
author={Collet, Yann},
journal={code.google.com},
year={2013}
}
@inproceedings{beyer1999bottom,
title={Bottom-up computation of sparse and iceberg cube},
author={Beyer, Kevin and Ramakrishnan, Raghu},
booktitle={ACM SIGMOD Record},
volume={28},
number={2},
pages={359--370},
year={1999},
organization={ACM}
}
@inproceedings{vavilapalli2013apache,
title={Apache Hadoop YARN: Yet another resource negotiator},
author={Vavilapalli, Vinod Kumar and Murthy, Arun C and Douglas, Chris and Agarwal, Sharad and Konar, Mahadev and Evans, Robert and Graves, Thomas and Lowe, Jason and Shah, Hitesh and Seth, Siddharth and others},
booktitle={Proceedings of the 4th annual Symposium on Cloud Computing},
pages={5},
year={2013},
organization={ACM}
}
@article{boykin2014summingbird,
title={Summingbird: A Framework for Integrating Batch and Online MapReduce Computations},
author={Boykin, Oscar and Ritchie, Sam and O'Connell, Ian and Lin, Jimmy},
journal={Proceedings of the VLDB Endowment},
volume={7},
number={13},
year={2014}
}
@inproceedings{zaharia2012resilient,
title={Resilient distributed datasets: A fault-tolerant abstraction for in-memory cluster computing},
author={Zaharia, Matei and Chowdhury, Mosharaf and Das, Tathagata and Dave, Ankur and Ma, Justin and McCauley, Murphy and Franklin, Michael J and Shenker, Scott and Stoica, Ion},
booktitle={Proceedings of the 9th USENIX conference on Networked Systems Design and Implementation},
pages={2--2},
year={2012},
organization={USENIX Association}
}
@inproceedings{stonebraker2009requirements,
title={Requirements for Science Data Bases and SciDB.},
author={Stonebraker, Michael and Becla, Jacek and DeWitt, David J and Lim, Kian-Tat and Maier, David and Ratzesberger, Oliver and Zdonik, Stanley B},
booktitle={CIDR},
volume={7},
pages={173--184},
year={2009}
}
@article{stonebraker2010mapreduce,
title={MapReduce and parallel DBMSs: friends or foes?},
author={Stonebraker, Michael and Abadi, Daniel and DeWitt, David J and Madden, Sam and Paulson, Erik and Pavlo, Andrew and Rasin, Alexander},
journal={Communications of the ACM},
volume={53},
number={1},
pages={64--71},
year={2010},
publisher={ACM}
}

Binary file not shown.

File diff suppressed because it is too large.

@@ -0,0 +1,77 @@
# Plot the Concise-compressed size of each Twitter-dataset dimension against
# its cardinality, for unsorted vs. sorted row order.
library(ggplot2)
library(stringr)
library(plyr)

# Tab-separated measurements: dimension, cardinality, compressed size (unsorted rows).
x <-
"Dimension Cardinality Concise compressed size (bytes)
Has_mention 2 586,400
Has_links 2 580,872
Has_geo 2 144,004
Is_retweet 2 584,592
Is_viral 2 358,380
User_lang 21 1,414,000
User_time_zone 142 3,876,244
URL_domain 31,165 1,562,428
First_hashtag 100,728 1,837,144
Rt_name 182,704 2,235,288
Reply_to_name 620,421 5,673,504
User_location 637,774 9,511,844
User_mention_name 923,842 9,086,416
User_name 1,784,369 16,000,028"
# Parse the tab/newline-delimited table into a data frame with numeric columns,
# keeping a copy of the size column (ytext) for label placement.
foo <- function(x){
  m <- matrix(unlist(str_split(x, "\t|\n")), ncol = 3, byrow = TRUE)
  df <- data.frame(m[-1, ], stringsAsFactors = FALSE)
  names(df) <- m[1, ]
  df[, 2] <- as.numeric(str_replace_all(df[, 2], ",", ""))
  df[, 3] <- as.numeric(str_replace_all(df[, 3], ",", ""))
  df <- transform(df, ytext = `Concise\ compressed\ size\ (bytes)`)
  names(df) <- c(m[1, ], "ytext")
  df
}
df <- foo(x)
## df$ytext[12] <- 1.05 * df$ytext[12]
## df$ytext[13] <- .93 * df$ytext[13]
## df$ytext[1] <- 1.13 * df$ytext[1]
## df$ytext[4] <- .87 * df$ytext[4]
## qplot(x = Cardinality, y = `Concise\ compressed\ size\ (bytes)`, data = df, geom = "point") +
## geom_text(aes(x = Cardinality * 1.2, y = ytext, label = Dimension), hjust = 0, size = 4) +
## scale_x_log10(limits = c(1, 10^8.5)) +
## scale_y_log10() +
## geom_hline(aes(yintercept = 9089180)) +
## geom_text(aes(x = 1e2, y = 9089180 * 1.1, label = "Integer array size (bytes)"), hjust = 0, size = 4) +
## ggtitle("The Relationship of Compressed Size to Cardinality")
# The same dimensions measured after sorting the rows.
y <-
"Dimension Cardinality Concise compressed size (bytes)
Has_mention 2 744
Has_links 2 1,504
Has_geo 2 2,840
Is_retweet 2 1,616
Is_viral 2 1,488
User_lang 21 38,416
User_time_zone 142 319,644
URL_domain 31,165 700,752
First_hashtag 100,728 1,505,292
Rt_name 182,704 1,874,180
Reply_to_name 620,421 5,404,108
User_location 637,774 9,091,016
User_mention_name 923,842 8,686,384
User_name 1,784,369 16,204,900"
df2 <- foo(y)

# Tag and combine the unsorted and sorted measurements.
df$sorted <- "unsorted"
df2$sorted <- "sorted"
dat <- rbind(df, df2)

# Log-log scatter of compressed size vs. cardinality; the horizontal line marks
# the size of an uncompressed integer array.
ggplot(data = dat, aes(x = Cardinality, y = `Concise\ compressed\ size\ (bytes)`)) +
geom_point(aes(color = sorted, shape = sorted), alpha = .9, size = 4) +
scale_x_log10(limits = c(1, 10^8.5)) +
scale_y_log10() +
geom_hline(aes(yintercept = 9089180)) +
geom_text(aes(x = 1e1, y = 9089180 * 1.4, label = "Integer array size (bytes)"), hjust = 0, size = 5)
#ggsave("concise_plot.png", width = 10, height = 8)
ggsave("../figures/concise_plot.pdf", width = 6, height = 4.5)

@@ -0,0 +1,253 @@
# Plot cluster-level and per-core scan rates for the six benchmark queries,
# across node counts and configurations (in-memory vs. mmap).
library(stringr)
library(xtable)
library(plyr)
library(ggplot2)

# Parse a tab/newline-delimited string into a data frame (first row = header).
stringToDF <- function(x, ncol){
  m <- matrix(unlist(str_split(x, "\t|\n")), ncol = ncol, byrow = TRUE)
  df <- data.frame(m[-1, ], stringsAsFactors = FALSE)
  names(df) <- m[1, ]
  df
}
##print(xtable(stringToDF(x, 3)), include.rownames = FALSE)
# Like stringToDF, but numeric-cleans the two rate columns and splits the
# "cores, nodes, configuration" label into separate columns, tagging each
# row with the query name.
stringToDF2 <- function(x, query){
  m <- matrix(unlist(str_split(x, "\t|\n")), ncol = 3, byrow = TRUE)
  df <- data.frame(m[-1, ], stringsAsFactors = FALSE)
  names(df) <- m[1, ]
  df[, 2] <- as.numeric(str_replace_all(df[, 2], ",", ""))
  df[, 3] <- as.numeric(str_replace_all(df[, 3], ",", ""))
  df2 <- ldply(1:nrow(df), function(i) gsub("^\\s+|\\s+$", "", str_split(df[i, 1], ",")[[1]]))
  names(df2) <- c("cores", "nodes", "configuration")
  df2$query <- query
  cbind(df, df2)
}
x2 <- "Cluster
Cluster scan rate (rows/sec)
Core scan rate
15-core, 100 nodes, in-memory
26,610,386,635
17,740,258
15-core, 75 nodes, mmap
25,224,873,928
22,422,110
15-core, 50 nodes, mmap
20,387,152,160
27,182,870
15-core, 25 nodes, mmap
11,910,388,894
31,761,037
4-core, 131 nodes, in-memory
10,008,730,163
19,100,630
4-core, 131 nodes, mmap
10,129,695,120
19,331,479
4-core, 50 nodes, mmap
6,626,570,688
33,132,853"
x3 <- "Cluster
Cluster scan rate (rows/sec)
Core scan rate
15-core, 100 nodes, in-memory
16,223,081,703
10,815,388
15-core, 75 nodes, mmap
9,860,968,285
8,765,305
15-core, 50 nodes, mmap
8,093,611,909
10,791,483
15-core, 25 nodes, mmap
4,126,502,352
11,004,006
4-core, 131 nodes, in-memory
5,755,274,389
10,983,348
4-core, 131 nodes, mmap
5,032,185,657
9,603,408
4-core, 50 nodes, mmap
1,720,238,609
8,601,193"
x4 <- "Cluster
Cluster scan rate (rows/sec)
Core scan rate
15-core, 100 nodes, in-memory
7,591,604,822
5,061,070
15-core, 75 nodes, mmap
4,319,179,995
3,839,271
15-core, 50 nodes, mmap
3,406,554,102
4,542,072
15-core, 25 nodes, mmap
1,826,451,888
4,870,538
4-core, 131 nodes, in-memory
1,936,648,601
3,695,894
4-core, 131 nodes, mmap
2,210,367,152
4,218,258
4-core, 50 nodes, mmap
1,002,291,562
5,011,458"
x5 <- "Cluster
Cluster scan rate (rows/sec)
Core scan rate
15-core, 100 nodes, in-memory
10,241,183,745
6,827,456
15-core, 75 nodes, mmap
4,891,097,559
4,347,642
15-core, 50 nodes, mmap
3,616,707,511
4,822,277
15-core, 25 nodes, mmap
1,665,053,263
4,440,142
4-core, 131 nodes, in-memory
4,388,159,569
8,374,350
4-core, 131 nodes, mmap
2,444,344,232
4,664,779
4-core, 50 nodes, mmap
1,215,737,558
6,078,688"
x6 <- "Cluster
Cluster scan rate (rows/sec)
Core scan rate
15-core, 100 nodes, in-memory
7,309,984,688
4,873,323
15-core, 75 nodes, mmap
3,333,628,777
2,963,226
15-core, 50 nodes, mmap
2,555,300,237
3,407,067
15-core, 25 nodes, mmap
1,384,674,717
3,692,466
4-core, 131 nodes, in-memory
3,237,907,984
6,179,214
4-core, 131 nodes, mmap
1,740,481,380
3,321,529
4-core, 50 nodes, mmap
863,170,420
4,315,852"
x7 <- "Cluster
Cluster scan rate (rows/sec)
Core scan rate
15-core, 100 nodes, in-memory
4,064,424,274
2,709,616
15-core, 75 nodes, mmap
2,014,067,386
1,790,282
15-core, 50 nodes, mmap
1,499,452,617
1,999,270
15-core, 25 nodes, mmap
810,143,518
2,160,383
4-core, 131 nodes, in-memory
1,670,214,695
3,187,433
4-core, 131 nodes, mmap
1,116,635,690
2,130,984
4-core, 50 nodes, mmap
531,389,163
2,656,946"
# Assemble all six queries into one data frame.
dat <- ldply(2:7, function(i){
  df <- eval(parse(text = paste("x", i, sep = "")))
  stringToDF2(df, paste("Query", i - 1, sep = " "))
})
# Overview: cluster scan rate vs. node count for every query and configuration.
ggplot(data = dat, aes(
x = as.numeric(str_extract(nodes, "[0-9]+")),
y = `Cluster scan rate (rows/sec)` / 1e9,
shape = configuration,
color = query,
size = factor(cores, levels = c("4-core", "15-core"))
)) +
scale_size_manual(values = c(3, 6), guide = guide_legend(title = "Cores Per Node")) +
scale_color_discrete(guide = guide_legend(title = "Query")) +
scale_shape_discrete(guide = guide_legend(title = "Configuration")) +
geom_point() +
scale_y_log10(breaks = 2^(-1:5)) +
facet_grid(configuration~.) +
xlab("Number of Nodes") +
ylab("Cluster Scan Rate (billion rows/sec.)")
# Restrict to the 15-core mmap runs and compute a linear-scaling projection
# anchored at each query's 25-node measurement.
dat2 <- subset(dat, cores == "15-core" & configuration == "mmap")
dat2$x <- as.numeric(str_extract(dat2$nodes, "[0-9]+"))
baseRate <- list()
d_ply(subset(dat2, x == 25), .(query), function(df) baseRate[df$query] <<- df$`Cluster scan rate (rows/sec)`)
dat2 <- ddply(dat2, .(query, x), function(df){
  df$projected <- df$x / 25 * unlist(baseRate[df$query])
  df
})
# Measured cluster scan rates (points) against linear-scaling projections (lines).
ggplot(data = dat2, aes(
x = as.numeric(str_extract(nodes, "[0-9]+")),
y = `Cluster scan rate (rows/sec)` / 1e9,
color = query,
shape = query
)) +
# scale_y_log10(breaks = 2^(-1:5)) +
# scale_color_discrete(guide = guide_legend(title = "Query")) +
geom_point(size = 3) +
# geom_path(aes(y = projected)) +
geom_line(aes(y = projected / 1e9)) +
# scale_y_log10() +
xlab("Number of Nodes") +
ylab("Cluster Scan Rate (billion rows/sec.)")
ggsave("../figures/cluster_scan_rate.pdf", width = 4, height = 3)
## ggplot(data = dat, aes(
## x = as.numeric(str_extract(nodes, "[0-9]+")),
## y = `Core scan rate` / 1e6,
## shape = configuration,
## color = query,
## size = factor(cores, levels = c("4-core", "15-core"))
## )) +
## scale_size_manual(values = c(3, 6), guide = guide_legend(title = "Cores Per Node")) +
## scale_color_discrete(guide = guide_legend(title = "Query")) +
## scale_shape_discrete(guide = guide_legend(title = "Configuration")) +
## geom_point() +
## scale_y_log10(breaks = 2^(-1:5)) +
## xlab("Number of Nodes") +
## ylab("Core Scan Rate (million rows/sec.)")
# Per-core scan rate, showing scan efficiency as the cluster grows.
ggplot(data = dat2, aes(
x = as.numeric(str_extract(nodes, "[0-9]+")),
y = `Core scan rate` / 1e6,
color = query,
shape = query
)) +
geom_point(size = 3) +
xlab("Number of Nodes") +
ylab("Core Scan Rate (million rows/sec.)")
ggsave("../figures/core_scan_rate.pdf", width = 4, height = 3)

File diff suppressed because it is too large.