mirror of https://github.com/apache/druid.git

How to architect an analytics stack

parent aea7f9d192
commit 74af1f947e
@@ -0,0 +1,4 @@
*.aux
*.out
*.bbl
*.blg
@@ -0,0 +1,12 @@
all : radstack.pdf

clean :
	@rm -f *.aux *.bbl *.blg *.log

%.tex : %.bib

# lualatex runs three times around bibtex so that citations and
# cross-references fully resolve.
%.pdf : %.tex %.bib
	lualatex $(*F)
	bibtex $(*F)
	lualatex $(*F)
	lualatex $(*F)
@@ -0,0 +1,4 @@
Download [MacTeX](http://tug.org/mactex/), then run:
```bash
make
```
Binary files (figure images) not shown.
@@ -0,0 +1,536 @@
@article{cattell2011scalable,
  title={Scalable SQL and NoSQL data stores},
  author={Cattell, Rick},
  journal={ACM SIGMOD Record},
  volume={39},
  number={4},
  pages={12--27},
  year={2011},
  publisher={ACM}
}

@article{chang2008bigtable,
  title={Bigtable: A distributed storage system for structured data},
  author={Chang, Fay and Dean, Jeffrey and Ghemawat, Sanjay and Hsieh, Wilson C and Wallach, Deborah A and Burrows, Mike and Chandra, Tushar and Fikes, Andrew and Gruber, Robert E},
  journal={ACM Transactions on Computer Systems (TOCS)},
  volume={26},
  number={2},
  pages={4},
  year={2008},
  publisher={ACM}
}

@inproceedings{decandia2007dynamo,
  title={Dynamo: amazon's highly available key-value store},
  author={DeCandia, Giuseppe and Hastorun, Deniz and Jampani, Madan and Kakulapati, Gunavardhan and Lakshman, Avinash and Pilchin, Alex and Sivasubramanian, Swaminathan and Vosshall, Peter and Vogels, Werner},
  booktitle={ACM SIGOPS Operating Systems Review},
  volume={41},
  number={6},
  pages={205--220},
  year={2007},
  organization={ACM}
}

@inproceedings{abadi2008column,
  title={Column-Stores vs. Row-Stores: How different are they really?},
  author={Abadi, Daniel J and Madden, Samuel R and Hachem, Nabil},
  booktitle={Proceedings of the 2008 ACM SIGMOD international conference on Management of data},
  pages={967--980},
  year={2008},
  organization={ACM}
}

@inproceedings{bear2012vertica,
  title={The vertica database: SQL RDBMS for managing big data},
  author={Bear, Chuck and Lamb, Andrew and Tran, Nga},
  booktitle={Proceedings of the 2012 workshop on Management of big data systems},
  pages={37--38},
  year={2012},
  organization={ACM}
}

@article{lakshman2010cassandra,
  title={Cassandra: a decentralized structured storage system},
  author={Lakshman, Avinash and Malik, Prashant},
  journal={Operating Systems Review},
  volume={44},
  number={2},
  pages={35},
  year={2010}
}

@article{melnik2010dremel,
  title={Dremel: interactive analysis of web-scale datasets},
  author={Melnik, Sergey and Gubarev, Andrey and Long, Jing Jing and Romer, Geoffrey and Shivakumar, Shiva and Tolton, Matt and Vassilakis, Theo},
  journal={Proceedings of the VLDB Endowment},
  volume={3},
  number={1-2},
  pages={330--339},
  year={2010},
  publisher={VLDB Endowment}
}

@article{hall2012processing,
  title={Processing a trillion cells per mouse click},
  author={Hall, Alexander and Bachmann, Olaf and B{\"u}ssow, Robert and G{\u{a}}nceanu, Silviu and Nunkesser, Marc},
  journal={Proceedings of the VLDB Endowment},
  volume={5},
  number={11},
  pages={1436--1446},
  year={2012},
  publisher={VLDB Endowment}
}

@inproceedings{shvachko2010hadoop,
  title={The hadoop distributed file system},
  author={Shvachko, Konstantin and Kuang, Hairong and Radia, Sanjay and Chansler, Robert},
  booktitle={Mass Storage Systems and Technologies (MSST), 2010 IEEE 26th Symposium on},
  pages={1--10},
  year={2010},
  organization={IEEE}
}

@article{colantonio2010concise,
  title={Concise: Compressed 'n' Composable Integer Set},
  author={Colantonio, Alessandro and Di Pietro, Roberto},
  journal={Information Processing Letters},
  volume={110},
  number={16},
  pages={644--650},
  year={2010},
  publisher={Elsevier}
}

@inproceedings{stonebraker2005c,
  title={C-store: a column-oriented DBMS},
  author={Stonebraker, Mike and Abadi, Daniel J and Batkin, Adam and Chen, Xuedong and Cherniack, Mitch and Ferreira, Miguel and Lau, Edmond and Lin, Amerson and Madden, Sam and O'Neil, Elizabeth and others},
  booktitle={Proceedings of the 31st international conference on Very large data bases},
  pages={553--564},
  year={2005},
  organization={VLDB Endowment}
}

@article{stonebraker1987extendability,
  title={Extendability in POSTGRES},
  author={Stonebraker, Michael and Anton, Jeff and Hirohama, Michael},
  journal={IEEE Data Eng. Bull.},
  volume={10},
  number={2},
  pages={16--23},
  year={1987}
}

@book{george2011hbase,
  title={HBase: the definitive guide},
  author={George, Lars},
  year={2011},
  publisher={O'Reilly Media, Inc.}
}

@inproceedings{engle2012shark,
  title={Shark: fast data analysis using coarse-grained distributed memory},
  author={Engle, Cliff and Lupher, Antonio and Xin, Reynold and Zaharia, Matei and Franklin, Michael J and Shenker, Scott and Stoica, Ion},
  booktitle={Proceedings of the 2012 international conference on Management of Data},
  pages={689--692},
  year={2012},
  organization={ACM}
}

@inproceedings{zaharia2012discretized,
  title={Discretized streams: an efficient and fault-tolerant model for stream processing on large clusters},
  author={Zaharia, Matei and Das, Tathagata and Li, Haoyuan and Shenker, Scott and Stoica, Ion},
  booktitle={Proceedings of the 4th USENIX conference on Hot Topics in Cloud Computing},
  pages={10--10},
  year={2012},
  organization={USENIX Association}
}

@misc{marz2013storm,
  author = {Marz, Nathan},
  title = {Storm: Distributed and Fault-Tolerant Realtime Computation},
  month = {February},
  year = {2013},
  howpublished = "\url{http://storm-project.net/}"
}

@misc{2014samza,
  title = {Apache Samza},
  year = {2014},
  howpublished = "\url{http://samza.apache.org/}"
}

@misc{2013linkedin,
  title = {Camus},
  year = {2013},
  howpublished = "\url{https://github.com/linkedin/camus}"
}

@misc{yang2014radstack,
  title = {Real Time Analytics with Open Source Technologies},
  year = {2014},
  howpublished = "\url{https://speakerdeck.com/druidio/real-time-analytics-with-open-source-technologies-1}"
}

@inproceedings{yang2014druid,
  title={Druid: a real-time analytical data store},
  author={Yang, Fangjin and Tschetter, Eric and L{\'e}aut{\'e}, Xavier and Ray, Nelson and Merlino, Gian and Ganguli, Deep},
  booktitle={Proceedings of the 2014 ACM SIGMOD international conference on Management of data},
  pages={157--168},
  year={2014},
  organization={ACM}
}

@misc{2014yahoo,
  title = {Pushing the limits of Realtime Analytics using Druid},
  year = {2014},
  howpublished = "\url{http://www.slideshare.net/ydn/pushing-thelimitsofrealtimeanalyticswithdruidv3}"
}

@misc{tschetter2011druid,
  author = {Eric Tschetter},
  title = {Introducing Druid: Real-Time Analytics at a Billion Rows Per Second},
  month = {April},
  year = {2011},
  howpublished = "\url{http://druid.io/blog/2011/04/30/introducing-druid.html}"
}

@article{farber2012sap,
  title={SAP HANA database: data management for modern business applications},
  author={F{\"a}rber, Franz and Cha, Sang Kyun and Primsch, J{\"u}rgen and Bornh{\"o}vd, Christof and Sigg, Stefan and Lehner, Wolfgang},
  journal={ACM SIGMOD Record},
  volume={40},
  number={4},
  pages={45--51},
  year={2012},
  publisher={ACM}
}

@misc{voltdb2010voltdb,
  title={VoltDB Technical Overview},
  author={VoltDB, LLC},
  year={2010},
  howpublished = "\url{https://voltdb.com/}"
}

@inproceedings{macnicol2004sybase,
  title={Sybase IQ multiplex-designed for analytics},
  author={MacNicol, Roger and French, Blaine},
  booktitle={Proceedings of the Thirtieth international conference on Very large data bases-Volume 30},
  pages={1227--1230},
  year={2004},
  organization={VLDB Endowment}
}

@inproceedings{singh2011introduction,
  title={Introduction to the IBM Netezza warehouse appliance},
  author={Singh, Malcolm and Leonhardi, Ben},
  booktitle={Proceedings of the 2011 Conference of the Center for Advanced Studies on Collaborative Research},
  pages={385--386},
  year={2011},
  organization={IBM Corp.}
}

@inproceedings{miner2012unified,
  title={Unified analytics platform for big data},
  author={Miner, Donald},
  booktitle={Proceedings of the WICSA/ECSA 2012 Companion Volume},
  pages={176--176},
  year={2012},
  organization={ACM}
}

@inproceedings{fink2012distributed,
  title={Distributed computation on dynamo-style distributed storage: riak pipe},
  author={Fink, Bryan},
  booktitle={Proceedings of the eleventh ACM SIGPLAN workshop on Erlang workshop},
  pages={43--50},
  year={2012},
  organization={ACM}
}

@misc{paraccel2013,
  key = {ParAccel Analytic Database},
  title = {ParAccel Analytic Database},
  month = {March},
  year = {2013},
  howpublished = "\url{http://www.paraccel.com/resources/Datasheets/ParAccel-Core-Analytic-Database.pdf}"
}

@misc{cloudera2013,
  key = {Cloudera Impala},
  title = {Cloudera Impala},
  month = {March},
  year = {2013},
  howpublished = "\url{http://blog.cloudera.com/blog}"
}

@inproceedings{hunt2010zookeeper,
  title={ZooKeeper: Wait-free coordination for Internet-scale systems},
  author={Hunt, Patrick and Konar, Mahadev and Junqueira, Flavio P and Reed, Benjamin},
  booktitle={USENIX ATC},
  volume={10},
  year={2010}
}

@inproceedings{kreps2011kafka,
  title={Kafka: A distributed messaging system for log processing},
  author={Kreps, Jay and Narkhede, Neha and Rao, Jun},
  booktitle={Proceedings of 6th International Workshop on Networking Meets Databases (NetDB), Athens, Greece},
  year={2011}
}

@misc{liblzf2013,
  title = {LibLZF},
  key = {LibLZF},
  month = {March},
  year = {2013},
  howpublished = "\url{http://freecode.com/projects/liblzf}"
}

@inproceedings{tomasic1993performance,
  title={Performance of inverted indices in shared-nothing distributed text document information retrieval systems},
  author={Tomasic, Anthony and Garcia-Molina, Hector},
  booktitle={Parallel and Distributed Information Systems, 1993., Proceedings of the Second International Conference on},
  pages={8--17},
  year={1993},
  organization={IEEE}
}

@inproceedings{antoshenkov1995byte,
  title={Byte-aligned bitmap compression},
  author={Antoshenkov, Gennady},
  booktitle={Data Compression Conference, 1995. DCC'95. Proceedings},
  pages={476},
  year={1995},
  organization={IEEE}
}

@inproceedings{van2011memory,
  title={A memory efficient reachability data structure through bit vector compression},
  author={van Schaik, Sebastiaan J and de Moor, Oege},
  booktitle={Proceedings of the 2011 international conference on Management of data},
  pages={913--924},
  year={2011},
  organization={ACM}
}

@inproceedings{o1993lru,
  title={The LRU-K page replacement algorithm for database disk buffering},
  author={O'Neil, Elizabeth J and O'Neil, Patrick E and Weikum, Gerhard},
  booktitle={ACM SIGMOD Record},
  volume={22},
  number={2},
  pages={297--306},
  year={1993},
  organization={ACM}
}

@article{kim2001lrfu,
  title={LRFU: A spectrum of policies that subsumes the least recently used and least frequently used policies},
  author={Kim, Chong Sang},
  journal={IEEE Transactions on Computers},
  volume={50},
  number={12},
  year={2001}
}

@article{wu2006optimizing,
  title={Optimizing bitmap indices with efficient compression},
  author={Wu, Kesheng and Otoo, Ekow J and Shoshani, Arie},
  journal={ACM Transactions on Database Systems (TODS)},
  volume={31},
  number={1},
  pages={1--38},
  year={2006},
  publisher={ACM}
}

@misc{twitter2013,
  key = {Twitter Public Streams},
  title = {Twitter Public Streams},
  month = {March},
  year = {2013},
  howpublished = "\url{https://dev.twitter.com/docs/streaming-apis/streams/public}"
}

@article{fitzpatrick2004distributed,
  title={Distributed caching with memcached},
  author={Fitzpatrick, Brad},
  journal={Linux Journal},
  number={124},
  pages={72--74},
  year={2004}
}

@inproceedings{amdahl1967validity,
  title={Validity of the single processor approach to achieving large scale computing capabilities},
  author={Amdahl, Gene M},
  booktitle={Proceedings of the April 18-20, 1967, spring joint computer conference},
  pages={483--485},
  year={1967},
  organization={ACM}
}

@book{sarawagi1998discovery,
  title={Discovery-driven exploration of OLAP data cubes},
  author={Sarawagi, Sunita and Agrawal, Rakesh and Megiddo, Nimrod},
  year={1998},
  publisher={Springer}
}

@article{hu2011stream,
  title={Stream Database Survey},
  author={Hu, Bo},
  year={2011}
}

@article{dean2008mapreduce,
  title={MapReduce: simplified data processing on large clusters},
  author={Dean, Jeffrey and Ghemawat, Sanjay},
  journal={Communications of the ACM},
  volume={51},
  number={1},
  pages={107--113},
  year={2008},
  publisher={ACM}
}

@misc{linkedin2013senseidb,
  author = {LinkedIn},
  title = {SenseiDB},
  month = {July},
  year = {2013},
  howpublished = "\url{http://www.senseidb.com/}"
}

@misc{apache2013solr,
  author = {Apache},
  title = {Apache Solr},
  month = {February},
  year = {2013},
  howpublished = "\url{http://lucene.apache.org/solr/}"
}

@misc{banon2013elasticsearch,
  author = {Banon, Shay},
  title = {ElasticSearch},
  month = {July},
  year = {2013},
  howpublished = "\url{http://www.elasticsearch.com/}"
}

@book{oehler2012ibm,
  title={IBM Cognos TM1: The Official Guide},
  author={Oehler, Karsten and Gruenes, Jochen and Ilacqua, Christopher and Perez, Manuel},
  year={2012},
  publisher={McGraw-Hill}
}

@book{schrader2009oracle,
  title={Oracle Essbase \& Oracle OLAP},
  author={Schrader, Michael and Vlamis, Dan and Nader, Mike and Claterbos, Chris and Collins, Dave and Campbell, Mitch and Conrad, Floyd},
  year={2009},
  publisher={McGraw-Hill, Inc.}
}

@book{lachev2005applied,
  title={Applied Microsoft Analysis Services 2005: And Microsoft Business Intelligence Platform},
  author={Lachev, Teo},
  year={2005},
  publisher={Prologika Press}
}

@article{o1996log,
  title={The log-structured merge-tree (LSM-tree)},
  author={O'Neil, Patrick and Cheng, Edward and Gawlick, Dieter and O'Neil, Elizabeth},
  journal={Acta Informatica},
  volume={33},
  number={4},
  pages={351--385},
  year={1996},
  publisher={Springer}
}

@inproceedings{o1997improved,
  title={Improved query performance with variant indexes},
  author={O'Neil, Patrick and Quass, Dallan},
  booktitle={ACM SIGMOD Record},
  volume={26},
  number={2},
  pages={38--49},
  year={1997},
  organization={ACM}
}

@inproceedings{cipar2012lazybase,
  title={LazyBase: trading freshness for performance in a scalable database},
  author={Cipar, James and Ganger, Greg and Keeton, Kimberly and Morrey III, Charles B and Soules, Craig AN and Veitch, Alistair},
  booktitle={Proceedings of the 7th ACM european conference on Computer Systems},
  pages={169--182},
  year={2012},
  organization={ACM}
}

@article{collet2013lz4,
  title={LZ4: Extremely fast compression algorithm},
  author={Collet, Yann},
  journal={code.google.com},
  year={2013}
}

@inproceedings{beyer1999bottom,
  title={Bottom-up computation of sparse and iceberg cube},
  author={Beyer, Kevin and Ramakrishnan, Raghu},
  booktitle={ACM SIGMOD Record},
  volume={28},
  number={2},
  pages={359--370},
  year={1999},
  organization={ACM}
}

@inproceedings{vavilapalli2013apache,
  title={Apache hadoop yarn: Yet another resource negotiator},
  author={Vavilapalli, Vinod Kumar and Murthy, Arun C and Douglas, Chris and Agarwal, Sharad and Konar, Mahadev and Evans, Robert and Graves, Thomas and Lowe, Jason and Shah, Hitesh and Seth, Siddharth and others},
  booktitle={Proceedings of the 4th annual Symposium on Cloud Computing},
  pages={5},
  year={2013},
  organization={ACM}
}

@article{boykin2014summingbird,
  title={Summingbird: A Framework for Integrating Batch and Online MapReduce Computations},
  author={Boykin, Oscar and Ritchie, Sam and O'Connell, Ian and Lin, Jimmy},
  journal={Proceedings of the VLDB Endowment},
  volume={7},
  number={13},
  year={2014}
}

@inproceedings{zaharia2012resilient,
  title={Resilient distributed datasets: A fault-tolerant abstraction for in-memory cluster computing},
  author={Zaharia, Matei and Chowdhury, Mosharaf and Das, Tathagata and Dave, Ankur and Ma, Justin and McCauley, Murphy and Franklin, Michael J and Shenker, Scott and Stoica, Ion},
  booktitle={Proceedings of the 9th USENIX conference on Networked Systems Design and Implementation},
  pages={2--2},
  year={2012},
  organization={USENIX Association}
}

@inproceedings{stonebraker2009requirements,
  title={Requirements for Science Data Bases and SciDB},
  author={Stonebraker, Michael and Becla, Jacek and DeWitt, David J and Lim, Kian-Tat and Maier, David and Ratzesberger, Oliver and Zdonik, Stanley B},
  booktitle={CIDR},
  volume={7},
  pages={173--184},
  year={2009}
}

@article{stonebraker2010mapreduce,
  title={MapReduce and parallel DBMSs: friends or foes?},
  author={Stonebraker, Michael and Abadi, Daniel and DeWitt, David J and Madden, Sam and Paulson, Erik and Pavlo, Andrew and Rasin, Alexander},
  journal={Communications of the ACM},
  volume={53},
  number={1},
  pages={64--71},
  year={2010},
  publisher={ACM}
}
Binary file not shown.
File diff suppressed because it is too large.
@@ -0,0 +1,77 @@
library(ggplot2)
library(stringr)
library(plyr)

## Concise-compressed bitmap size per Twitter dimension, unsorted data.
## Fields are tab-separated: dimension, cardinality, compressed bytes.
x <-
"Dimension	Cardinality	Concise compressed size (bytes)
Has_mention	2	586,400
Has_links	2	580,872
Has_geo	2	144,004
Is_retweet	2	584,592
Is_viral	2	358,380
User_lang	21	1,414,000
User_time_zone	142	3,876,244
URL_domain	31,165	1,562,428
First_hashtag	100,728	1,837,144
Rt_name	182,704	2,235,288
Reply_to_name	620,421	5,673,504
User_location	637,774	9,511,844
User_mention_name	923,842	9,086,416
User_name	1,784,369	16,000,028"

## Parse the tab-separated text above into a data frame with numeric
## cardinality and size columns, plus a ytext column for label placement.
foo <- function(x){
  m <- matrix(unlist(str_split(x, "\t|\n")), ncol = 3, byrow = TRUE)
  df <- data.frame(m[-1, ], stringsAsFactors = FALSE)
  names(df) <- m[1, ]
  df[, 2] <- as.numeric(str_replace_all(df[, 2], ",", ""))
  df[, 3] <- as.numeric(str_replace_all(df[, 3], ",", ""))
  df <- transform(df, ytext = `Concise compressed size (bytes)`)
  names(df) <- c(m[1, ], "ytext")
  df
}
df <- foo(x)
## df$ytext[12] <- 1.05 * df$ytext[12]
## df$ytext[13] <- .93 * df$ytext[13]
## df$ytext[1] <- 1.13 * df$ytext[1]
## df$ytext[4] <- .87 * df$ytext[4]

## qplot(x = Cardinality, y = `Concise compressed size (bytes)`, data = df, geom = "point") +
##   geom_text(aes(x = Cardinality * 1.2, y = ytext, label = Dimension), hjust = 0, size = 4) +
##   scale_x_log10(limits = c(1, 10^8.5)) +
##   scale_y_log10() +
##   geom_hline(aes(yintercept = 9089180)) +
##   geom_text(aes(x = 1e2, y = 9089180 * 1.1, label = "Integer array size (bytes)"), hjust = 0, size = 4) +
##   ggtitle("The Relationship of Compressed Size to Cardinality")

## The same dimensions after sorting the data, which improves compressibility.
y <-
"Dimension	Cardinality	Concise compressed size (bytes)
Has_mention	2	744
Has_links	2	1,504
Has_geo	2	2,840
Is_retweet	2	1,616
Is_viral	2	1,488
User_lang	21	38,416
User_time_zone	142	319,644
URL_domain	31,165	700,752
First_hashtag	100,728	1,505,292
Rt_name	182,704	1,874,180
Reply_to_name	620,421	5,404,108
User_location	637,774	9,091,016
User_mention_name	923,842	8,686,384
User_name	1,784,369	16,204,900"
df2 <- foo(y)

df$sorted <- "unsorted"
df2$sorted <- "sorted"
dat <- rbind(df, df2)

## Compressed size vs. cardinality on log-log axes, with the raw integer
## array size (9,089,180 bytes) drawn as a horizontal reference line.
ggplot(data = dat, aes(x = Cardinality, y = `Concise compressed size (bytes)`)) +
  geom_point(aes(color = sorted, shape = sorted), alpha = .9, size = 4) +
  scale_x_log10(limits = c(1, 10^8.5)) +
  scale_y_log10() +
  geom_hline(aes(yintercept = 9089180)) +
  geom_text(aes(x = 1e1, y = 9089180 * 1.4, label = "Integer array size (bytes)"), hjust = 0, size = 5)
#ggsave("concise_plot.png", width = 10, height = 8)
ggsave("../figures/concise_plot.pdf", width = 6, height = 4.5)
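A natural follow-up to the script above is to quantify how much sorting helps per dimension. This is an illustrative sketch only, assuming the `dat` frame and column names exactly as built above:

```r
## Per-dimension ratio of sorted to unsorted Concise-compressed size;
## values below 1 mean sorting improved compressibility.
ddply(dat, .(Dimension), function(d){
  data.frame(ratio = d$`Concise compressed size (bytes)`[d$sorted == "sorted"] /
                     d$`Concise compressed size (bytes)`[d$sorted == "unsorted"])
})
```

For the low-cardinality flags (e.g. Has_mention, 586,400 bytes down to 744), sorting shrinks the bitmaps by orders of magnitude, while the highest-cardinality dimension (User_name) barely changes and in fact grows slightly.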
@@ -0,0 +1,253 @@
library(stringr)
library(xtable)
library(plyr)
library(ggplot2)

## Parse tab/newline-separated text into a data frame (first row = header).
stringToDF <- function(x, ncol){
  m <- matrix(unlist(str_split(x, "\t|\n")), ncol = ncol, byrow = TRUE)
  df <- data.frame(m[-1, ], stringsAsFactors = FALSE)
  names(df) <- m[1, ]
  df
}

##print(xtable(stringToDF(x, 3)), include.rownames = FALSE)

## As above, but also split the cluster description ("15-core, 100 nodes,
## in-memory") into cores / nodes / configuration columns and tag the query.
stringToDF2 <- function(x, query){
  m <- matrix(unlist(str_split(x, "\t|\n")), ncol = 3, byrow = TRUE)
  df <- data.frame(m[-1, ], stringsAsFactors = FALSE)
  names(df) <- m[1, ]
  df[, 2] <- as.numeric(str_replace_all(df[, 2], ",", ""))
  df[, 3] <- as.numeric(str_replace_all(df[, 3], ",", ""))
  df2 <- ldply(1:nrow(df), function(i) gsub("^\\s+|\\s+$", "", str_split(df[i, 1], ",")[[1]]))
  names(df2) <- c("cores", "nodes", "configuration")
  df2$query <- query
  cbind(df, df2)
}

## Benchmark results, one block per query (x2 = Query 1, ..., x7 = Query 6):
## cluster description, cluster scan rate (rows/sec), per-core scan rate.
x2 <- "Cluster
Cluster scan rate (rows/sec)
Core scan rate
15-core, 100 nodes, in-memory
26,610,386,635
17,740,258
15-core, 75 nodes, mmap
25,224,873,928
22,422,110
15-core, 50 nodes, mmap
20,387,152,160
27,182,870
15-core, 25 nodes, mmap
11,910,388,894
31,761,037
4-core, 131 nodes, in-memory
10,008,730,163
19,100,630
4-core, 131 nodes, mmap
10,129,695,120
19,331,479
4-core, 50 nodes, mmap
6,626,570,688
33,132,853"

x3 <- "Cluster
Cluster scan rate (rows/sec)
Core scan rate
15-core, 100 nodes, in-memory
16,223,081,703
10,815,388
15-core, 75 nodes, mmap
9,860,968,285
8,765,305
15-core, 50 nodes, mmap
8,093,611,909
10,791,483
15-core, 25 nodes, mmap
4,126,502,352
11,004,006
4-core, 131 nodes, in-memory
5,755,274,389
10,983,348
4-core, 131 nodes, mmap
5,032,185,657
9,603,408
4-core, 50 nodes, mmap
1,720,238,609
8,601,193"

x4 <- "Cluster
Cluster scan rate (rows/sec)
Core scan rate
15-core, 100 nodes, in-memory
7,591,604,822
5,061,070
15-core, 75 nodes, mmap
4,319,179,995
3,839,271
15-core, 50 nodes, mmap
3,406,554,102
4,542,072
15-core, 25 nodes, mmap
1,826,451,888
4,870,538
4-core, 131 nodes, in-memory
1,936,648,601
3,695,894
4-core, 131 nodes, mmap
2,210,367,152
4,218,258
4-core, 50 nodes, mmap
1,002,291,562
5,011,458"

x5 <- "Cluster
Cluster scan rate (rows/sec)
Core scan rate
15-core, 100 nodes, in-memory
10,241,183,745
6,827,456
15-core, 75 nodes, mmap
4,891,097,559
4,347,642
15-core, 50 nodes, mmap
3,616,707,511
4,822,277
15-core, 25 nodes, mmap
1,665,053,263
4,440,142
4-core, 131 nodes, in-memory
4,388,159,569
8,374,350
4-core, 131 nodes, mmap
2,444,344,232
4,664,779
4-core, 50 nodes, mmap
1,215,737,558
6,078,688"

x6 <- "Cluster
Cluster scan rate (rows/sec)
Core scan rate
15-core, 100 nodes, in-memory
7,309,984,688
4,873,323
15-core, 75 nodes, mmap
3,333,628,777
2,963,226
15-core, 50 nodes, mmap
2,555,300,237
3,407,067
15-core, 25 nodes, mmap
1,384,674,717
3,692,466
4-core, 131 nodes, in-memory
3,237,907,984
6,179,214
4-core, 131 nodes, mmap
1,740,481,380
3,321,529
4-core, 50 nodes, mmap
863,170,420
4,315,852"

x7 <- "Cluster
Cluster scan rate (rows/sec)
Core scan rate
15-core, 100 nodes, in-memory
4,064,424,274
2,709,616
15-core, 75 nodes, mmap
2,014,067,386
1,790,282
15-core, 50 nodes, mmap
1,499,452,617
1,999,270
15-core, 25 nodes, mmap
810,143,518
2,160,383
4-core, 131 nodes, in-memory
1,670,214,695
3,187,433
4-core, 131 nodes, mmap
1,116,635,690
2,130,984
4-core, 50 nodes, mmap
531,389,163
2,656,946"

## Stack all six query blocks into one data frame.
dat <- ldply(2:7, function(i){
  df <- get(paste("x", i, sep = ""))
  stringToDF2(df, paste("Query", i - 1, sep = " "))
})

ggplot(data = dat, aes(
  x = as.numeric(str_extract(nodes, "[0-9]+")),
  y = `Cluster scan rate (rows/sec)` / 1e9,
  shape = configuration,
  color = query,
  size = factor(cores, levels = c("4-core", "15-core"))
)) +
  scale_size_manual(values = c(3, 6), guide = guide_legend(title = "Cores Per Node")) +
  scale_color_discrete(guide = guide_legend(title = "Query")) +
  scale_shape_discrete(guide = guide_legend(title = "Configuration")) +
  geom_point() +
  scale_y_log10(breaks = 2^(-1:5)) +
  facet_grid(configuration~.) +
  xlab("Number of Nodes") +
  ylab("Cluster Scan Rate (billion rows/sec.)")

## Focus on the 15-core mmap runs and project linear scaling from the
## 25-node baseline for each query.
dat2 <- subset(dat, cores == "15-core" & configuration == "mmap")
dat2$x <- as.numeric(str_extract(dat2$nodes, "[0-9]+"))
baseRate <- list()
d_ply(subset(dat2, x == 25), .(query), function(df) baseRate[df$query] <<- df$`Cluster scan rate (rows/sec)`)
dat2 <- ddply(dat2, .(query, x), function(df){
  df$projected <- df$x / 25 * unlist(baseRate[df$query])
  df
})

## Measured scan rates (points) against the linear projection (lines).
ggplot(data = dat2, aes(
  x = as.numeric(str_extract(nodes, "[0-9]+")),
  y = `Cluster scan rate (rows/sec)` / 1e9,
  color = query,
  shape = query
)) +
# scale_y_log10(breaks = 2^(-1:5)) +
# scale_color_discrete(guide = guide_legend(title = "Query")) +
  geom_point(size = 3) +
# geom_path(aes(y = projected)) +
  geom_line(aes(y = projected / 1e9)) +
# scale_y_log10() +
  xlab("Number of Nodes") +
  ylab("Cluster Scan Rate (billion rows/sec.)")
ggsave("../figures/cluster_scan_rate.pdf", width = 4, height = 3)

## ggplot(data = dat, aes(
##   x = as.numeric(str_extract(nodes, "[0-9]+")),
##   y = `Core scan rate` / 1e6,
##   shape = configuration,
##   color = query,
##   size = factor(cores, levels = c("4-core", "15-core"))
## )) +
##   scale_size_manual(values = c(3, 6), guide = guide_legend(title = "Cores Per Node")) +
##   scale_color_discrete(guide = guide_legend(title = "Query")) +
##   scale_shape_discrete(guide = guide_legend(title = "Configuration")) +
##   geom_point() +
##   scale_y_log10(breaks = 2^(-1:5)) +
##   xlab("Number of Nodes") +
##   ylab("Core Scan Rate (million rows/sec.)")

ggplot(data = dat2, aes(
  x = as.numeric(str_extract(nodes, "[0-9]+")),
  y = `Core scan rate` / 1e6,
  color = query,
  shape = query
)) +
  geom_point(size = 3) +
  xlab("Number of Nodes") +
  ylab("Core Scan Rate (million rows/sec.)")
ggsave("../figures/core_scan_rate.pdf", width = 4, height = 3)
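To read the projected lines in the cluster scan rate plot, it can help to collapse each configuration into a single parallel-efficiency number. A minimal sketch, assuming the `dat2` frame as built above, where 1.0 means perfectly linear scaling from the 25-node baseline:

```r
## Efficiency = measured cluster scan rate / linearly projected rate;
## rows with x == 25 are 1.0 by construction.
eff <- transform(dat2, efficiency = `Cluster scan rate (rows/sec)` / projected)
eff[, c("query", "x", "efficiency")]
```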
File diff suppressed because it is too large.