Merge pull request #442 from metamx/demo-paper
Writing a demonstration paper for a conference
@@ -36,8 +36,6 @@ druid.processing.numThreads=1
 druid.segmentCache.locations=[{"path": "/tmp/druid/indexCache", "maxSize"\: 10000000000}]
 ```
 
-Note: This will spin up a Historical node with the local filesystem as deep storage.
-
 Production Configs
 ------------------
 These production configs are using S3 as a deep store.
@@ -0,0 +1,12 @@
all : druid_demo.pdf

clean :
	@rm -f *.aux *.bbl *.blg *.log

%.tex : %.bib

%.pdf : %.tex %.bib
	lualatex $(*F)
	bibtex $(*F)
	lualatex $(*F)
	lualatex $(*F)
@@ -0,0 +1,54 @@
\relax
\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
\global\let\oldcontentsline\contentsline
\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}}
\global\let\oldnewlabel\newlabel
\gdef\newlabel#1#2{\newlabelxx{#1}#2}
\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
\AtEndDocument{\ifx\hyper@anchor\@undefined
\let\contentsline\oldcontentsline
\let\newlabel\oldnewlabel
\fi}
\fi}
\global\let\hyper@last\relax
\gdef\HyperFirstAtBeginDocument#1{#1}
\providecommand\HyField@AuxAddToFields[1]{}
\citation{hunt2010zookeeper}
\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.1}The Need for Druid}{1}{subsection.1.1}}
\@writefile{toc}{\contentsline {section}{\numberline {2}Architecture}{1}{section.2}}
\citation{abadi2008column}
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces An overview of a Druid cluster and the flow of data through the cluster.}}{2}{figure.1}}
\newlabel{fig:cluster}{{1}{2}{An overview of a Druid cluster and the flow of data through the cluster}{figure.1}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Real-time Nodes}{2}{subsection.2.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Historical Nodes}{2}{subsection.2.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Broker Nodes}{2}{subsection.2.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Coordinator Nodes}{2}{subsection.2.4}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.5}Query Processing}{2}{subsection.2.5}}
\citation{tomasic1993performance}
\citation{colantonio2010concise}
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Sample sales data set.}}{3}{table.1}}
\newlabel{tab:sample_data}{{1}{3}{Sample sales data set}{table.1}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.6}Query Capabilities}{3}{subsection.2.6}}
\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Query latencies of production data sources.}}{3}{figure.2}}
\newlabel{fig:query_latency}{{2}{3}{Query latencies of production data sources}{figure.2}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Druid \& MySQL benchmarks -- 100GB TPC-H data.}}{3}{figure.3}}
\newlabel{fig:tpch_100gb}{{3}{3}{Druid \& MySQL benchmarks -- 100GB TPC-H data}{figure.3}{}}
\@writefile{toc}{\contentsline {section}{\numberline {3}Performance}{3}{section.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Query Performance}{3}{subsection.3.1}}
\bibstyle{abbrv}
\bibdata{druid_demo}
\bibcite{abadi2008column}{1}
\bibcite{colantonio2010concise}{2}
\bibcite{hunt2010zookeeper}{3}
\bibcite{tomasic1993performance}{4}
\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Combined cluster ingestion rates.}}{4}{figure.4}}
\newlabel{fig:ingestion_rate}{{4}{4}{Combined cluster ingestion rates}{figure.4}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Data Ingestion Performance}{4}{subsection.3.2}}
\@writefile{toc}{\contentsline {section}{\numberline {4}Demonstration Details}{4}{section.4}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Setup}{4}{subsection.4.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Goals}{4}{subsection.4.2}}
\@writefile{toc}{\contentsline {section}{\numberline {5}Acknowledgments}{4}{section.5}}
\@writefile{toc}{\contentsline {section}{\numberline {6}Additional Authors}{4}{section.6}}
\@writefile{toc}{\contentsline {section}{\numberline {7}References}{4}{section.7}}
@@ -0,0 +1,27 @@
\begin{thebibliography}{1}

\bibitem{abadi2008column}
D.~J. Abadi, S.~R. Madden, and N.~Hachem.
\newblock Column-stores vs. row-stores: How different are they really?
\newblock In {\em Proceedings of the 2008 ACM SIGMOD international conference
on Management of data}, pages 967--980. ACM, 2008.

\bibitem{colantonio2010concise}
A.~Colantonio and R.~Di~Pietro.
\newblock Concise: Compressed ‘n’composable integer set.
\newblock {\em Information Processing Letters}, 110(16):644--650, 2010.

\bibitem{hunt2010zookeeper}
P.~Hunt, M.~Konar, F.~P. Junqueira, and B.~Reed.
\newblock Zookeeper: Wait-free coordination for internet-scale systems.
\newblock In {\em USENIX ATC}, volume~10, 2010.

\bibitem{tomasic1993performance}
A.~Tomasic and H.~Garcia-Molina.
\newblock Performance of inverted indices in shared-nothing distributed text
document information retrieval systems.
\newblock In {\em Parallel and Distributed Information Systems, 1993.,
Proceedings of the Second International Conference on}, pages 8--17. IEEE,
1993.

\end{thebibliography}
@@ -0,0 +1,420 @@
@article{cattell2011scalable,
title={Scalable SQL and NoSQL data stores},
author={Cattell, Rick},
journal={ACM SIGMOD Record},
volume={39},
number={4},
pages={12--27},
year={2011},
publisher={ACM}
}

@article{chang2008bigtable,
title={Bigtable: A distributed storage system for structured data},
author={Chang, Fay and Dean, Jeffrey and Ghemawat, Sanjay and Hsieh, Wilson C and Wallach, Deborah A and Burrows, Mike and Chandra, Tushar and Fikes, Andrew and Gruber, Robert E},
journal={ACM Transactions on Computer Systems (TOCS)},
volume={26},
number={2},
pages={4},
year={2008},
publisher={ACM}
}

@inproceedings{decandia2007dynamo,
title={Dynamo: amazon's highly available key-value store},
author={DeCandia, Giuseppe and Hastorun, Deniz and Jampani, Madan and Kakulapati, Gunavardhan and Lakshman, Avinash and Pilchin, Alex and Sivasubramanian, Swaminathan and Vosshall, Peter and Vogels, Werner},
booktitle={ACM SIGOPS Operating Systems Review},
volume={41},
number={6},
pages={205--220},
year={2007},
organization={ACM}
}

@inproceedings{abadi2008column,
title={Column-Stores vs. Row-Stores: How different are they really?},
author={Abadi, Daniel J and Madden, Samuel R and Hachem, Nabil},
booktitle={Proceedings of the 2008 ACM SIGMOD international conference on Management of data},
pages={967--980},
year={2008},
organization={ACM}
}

@inproceedings{bear2012vertica,
title={The vertica database: SQL RDBMS for managing big data},
author={Bear, Chuck and Lamb, Andrew and Tran, Nga},
booktitle={Proceedings of the 2012 workshop on Management of big data systems},
pages={37--38},
year={2012},
organization={ACM}
}

@article{lakshman2010cassandra,
title={Cassandra—A decentralized structured storage system},
author={Lakshman, Avinash and Malik, Prashant},
journal={Operating systems review},
volume={44},
number={2},
pages={35},
year={2010}
}

@article{melnik2010dremel,
title={Dremel: interactive analysis of web-scale datasets},
author={Melnik, Sergey and Gubarev, Andrey and Long, Jing Jing and Romer, Geoffrey and Shivakumar, Shiva and Tolton, Matt and Vassilakis, Theo},
journal={Proceedings of the VLDB Endowment},
volume={3},
number={1-2},
pages={330--339},
year={2010},
publisher={VLDB Endowment}
}

@article{hall2012processing,
title={Processing a trillion cells per mouse click},
author={Hall, Alexander and Bachmann, Olaf and B{\"u}ssow, Robert and G{\u{a}}nceanu, Silviu and Nunkesser, Marc},
journal={Proceedings of the VLDB Endowment},
volume={5},
number={11},
pages={1436--1446},
year={2012},
publisher={VLDB Endowment}
}

@inproceedings{shvachko2010hadoop,
title={The hadoop distributed file system},
author={Shvachko, Konstantin and Kuang, Hairong and Radia, Sanjay and Chansler, Robert},
booktitle={Mass Storage Systems and Technologies (MSST), 2010 IEEE 26th Symposium on},
pages={1--10},
year={2010},
organization={IEEE}
}

@article{colantonio2010concise,
title={Concise: Compressed ‘n’Composable Integer Set},
author={Colantonio, Alessandro and Di Pietro, Roberto},
journal={Information Processing Letters},
volume={110},
number={16},
pages={644--650},
year={2010},
publisher={Elsevier}
}

@inproceedings{stonebraker2005c,
title={C-store: a column-oriented DBMS},
author={Stonebraker, Mike and Abadi, Daniel J and Batkin, Adam and Chen, Xuedong and Cherniack, Mitch and Ferreira, Miguel and Lau, Edmond and Lin, Amerson and Madden, Sam and O'Neil, Elizabeth and others},
booktitle={Proceedings of the 31st international conference on Very large data bases},
pages={553--564},
year={2005},
organization={VLDB Endowment}
}

@inproceedings{engle2012shark,
title={Shark: fast data analysis using coarse-grained distributed memory},
author={Engle, Cliff and Lupher, Antonio and Xin, Reynold and Zaharia, Matei and Franklin, Michael J and Shenker, Scott and Stoica, Ion},
booktitle={Proceedings of the 2012 international conference on Management of Data},
pages={689--692},
year={2012},
organization={ACM}
}

@inproceedings{zaharia2012discretized,
title={Discretized streams: an efficient and fault-tolerant model for stream processing on large clusters},
author={Zaharia, Matei and Das, Tathagata and Li, Haoyuan and Shenker, Scott and Stoica, Ion},
booktitle={Proceedings of the 4th USENIX conference on Hot Topics in Cloud Computing},
pages={10--10},
year={2012},
organization={USENIX Association}
}

@misc{marz2013storm,
author = {Marz, Nathan},
title = {Storm: Distributed and Fault-Tolerant Realtime Computation},
month = {February},
year = {2013},
howpublished = "\url{http://storm-project.net/}"
}

@misc{tschetter2011druid,
author = {Eric Tschetter},
title = {Introducing Druid: Real-Time Analytics at a Billion Rows Per Second},
month = {April},
year = {2011},
howpublished = "\url{http://druid.io/blog/2011/04/30/introducing-druid.html}"
}

@article{farber2012sap,
title={SAP HANA database: data management for modern business applications},
author={F{\"a}rber, Franz and Cha, Sang Kyun and Primsch, J{\"u}rgen and Bornh{\"o}vd, Christof and Sigg, Stefan and Lehner, Wolfgang},
journal={ACM Sigmod Record},
volume={40},
number={4},
pages={45--51},
year={2012},
publisher={ACM}
}

@misc{voltdb2010voltdb,
title={VoltDB Technical Overview},
author={VoltDB, LLC},
year={2010},
howpublished = "\url{https://voltdb.com/}"
}

@inproceedings{macnicol2004sybase,
title={Sybase IQ multiplex-designed for analytics},
author={MacNicol, Roger and French, Blaine},
booktitle={Proceedings of the Thirtieth international conference on Very large data bases-Volume 30},
pages={1227--1230},
year={2004},
organization={VLDB Endowment}
}

@inproceedings{singh2011introduction,
title={Introduction to the IBM Netezza warehouse appliance},
author={Singh, Malcolm and Leonhardi, Ben},
booktitle={Proceedings of the 2011 Conference of the Center for Advanced Studies on Collaborative Research},
pages={385--386},
year={2011},
organization={IBM Corp.}
}

@inproceedings{miner2012unified,
title={Unified analytics platform for big data},
author={Miner, Donald},
booktitle={Proceedings of the WICSA/ECSA 2012 Companion Volume},
pages={176--176},
year={2012},
organization={ACM}
}

@inproceedings{fink2012distributed,
title={Distributed computation on dynamo-style distributed storage: riak pipe},
author={Fink, Bryan},
booktitle={Proceedings of the eleventh ACM SIGPLAN workshop on Erlang workshop},
pages={43--50},
year={2012},
organization={ACM}
}

@misc{paraccel2013,
key = {ParAccel Analytic Database},
title = {ParAccel Analytic Database},
month = {March},
year = {2013},
howpublished = "\url{http://www.paraccel.com/resources/Datasheets/ParAccel-Core-Analytic-Database.pdf}"
}

@misc{cloudera2013,
key = {Cloudera Impala},
title = {Cloudera Impala},
month = {March},
year = {2013},
url = {},
howpublished = "\url{http://blog.cloudera.com/blog}"
}

@inproceedings{hunt2010zookeeper,
title={ZooKeeper: Wait-free coordination for Internet-scale systems},
author={Hunt, Patrick and Konar, Mahadev and Junqueira, Flavio P and Reed, Benjamin},
booktitle={USENIX ATC},
volume={10},
year={2010}
}

@inproceedings{kreps2011kafka,
title={Kafka: A distributed messaging system for log processing},
author={Kreps, Jay and Narkhede, Neha and Rao, Jun},
booktitle={Proceedings of 6th International Workshop on Networking Meets Databases (NetDB), Athens, Greece},
year={2011}
}

@misc{liblzf2013,
title = {LibLZF},
key = {LibLZF},
month = {March},
year = {2013},
howpublished = "\url{http://freecode.com/projects/liblzf}"
}

@inproceedings{tomasic1993performance,
title={Performance of inverted indices in shared-nothing distributed text document information retrieval systems},
author={Tomasic, Anthony and Garcia-Molina, Hector},
booktitle={Parallel and Distributed Information Systems, 1993., Proceedings of the Second International Conference on},
pages={8--17},
year={1993},
organization={IEEE}
}

@inproceedings{antoshenkov1995byte,
title={Byte-aligned bitmap compression},
author={Antoshenkov, Gennady},
booktitle={Data Compression Conference, 1995. DCC'95. Proceedings},
pages={476},
year={1995},
organization={IEEE}
}

@inproceedings{van2011memory,
title={A memory efficient reachability data structure through bit vector compression},
author={van Schaik, Sebastiaan J and de Moor, Oege},
booktitle={Proceedings of the 2011 international conference on Management of data},
pages={913--924},
year={2011},
organization={ACM}
}

@inproceedings{o1993lru,
title={The LRU-K page replacement algorithm for database disk buffering},
author={O'neil, Elizabeth J and O'neil, Patrick E and Weikum, Gerhard},
booktitle={ACM SIGMOD Record},
volume={22},
number={2},
pages={297--306},
year={1993},
organization={ACM}
}

@article{kim2001lrfu,
title={LRFU: A spectrum of policies that subsumes the least recently used and least frequently used policies},
author={Kim, Chong Sang},
journal={IEEE Transactions on Computers},
volume={50},
number={12},
year={2001}
}

@article{wu2006optimizing,
title={Optimizing bitmap indices with efficient compression},
author={Wu, Kesheng and Otoo, Ekow J and Shoshani, Arie},
journal={ACM Transactions on Database Systems (TODS)},
volume={31},
number={1},
pages={1--38},
year={2006},
publisher={ACM}
}

@misc{twitter2013,
key = {Twitter Public Streams},
title = {Twitter Public Streams},
month = {March},
year = {2013},
howpublished = "\url{https://dev.twitter.com/docs/streaming-apis/streams/public}"
}

@article{fitzpatrick2004distributed,
title={Distributed caching with memcached},
author={Fitzpatrick, Brad},
journal={Linux journal},
number={124},
pages={72--74},
year={2004}
}
@inproceedings{amdahl1967validity,
title={Validity of the single processor approach to achieving large scale computing capabilities},
author={Amdahl, Gene M},
booktitle={Proceedings of the April 18-20, 1967, spring joint computer conference},
pages={483--485},
year={1967},
organization={ACM}
}
@book{sarawagi1998discovery,
title={Discovery-driven exploration of OLAP data cubes},
author={Sarawagi, Sunita and Agrawal, Rakesh and Megiddo, Nimrod},
year={1998},
publisher={Springer}
}
@article{hu2011stream,
title={Stream Database Survey},
author={Hu, Bo},
year={2011}
}

@article{dean2008mapreduce,
title={MapReduce: simplified data processing on large clusters},
author={Dean, Jeffrey and Ghemawat, Sanjay},
journal={Communications of the ACM},
volume={51},
number={1},
pages={107--113},
year={2008},
publisher={ACM}
}

@misc{linkedin2013senseidb,
author = {LinkedIn},
title = {SenseiDB},
month = {July},
year = {2013},
howpublished = "\url{http://www.senseidb.com/}"
}

@misc{apache2013solr,
author = {Apache},
title = {Apache Solr},
month = {February},
year = {2013},
howpublished = "\url{http://lucene.apache.org/solr/}"
}

@misc{banon2013elasticsearch,
author = {Banon, Shay},
title = {ElasticSearch},
month = {July},
year = {2013},
howpublished = "\url{http://www.elasticseach.com/}"
}

@book{oehler2012ibm,
title={IBM Cognos TM1: The Official Guide},
author={Oehler, Karsten and Gruenes, Jochen and Ilacqua, Christopher and Perez, Manuel},
year={2012},
publisher={McGraw-Hill}
}

@book{schrader2009oracle,
title={Oracle Essbase \& Oracle OLAP},
author={Schrader, Michael and Vlamis, Dan and Nader, Mike and Claterbos, Chris and Collins, Dave and Campbell, Mitch and Conrad, Floyd},
year={2009},
publisher={McGraw-Hill, Inc.}
}

@book{lachev2005applied,
title={Applied Microsoft Analysis Services 2005: And Microsoft Business Intelligence Platform},
author={Lachev, Teo},
year={2005},
publisher={Prologika Press}
}

@article{o1996log,
title={The log-structured merge-tree (LSM-tree)},
author={O’Neil, Patrick and Cheng, Edward and Gawlick, Dieter and O’Neil, Elizabeth},
journal={Acta Informatica},
volume={33},
number={4},
pages={351--385},
year={1996},
publisher={Springer}
}

@inproceedings{o1997improved,
title={Improved query performance with variant indexes},
author={O'Neil, Patrick and Quass, Dallan},
booktitle={ACM Sigmod Record},
volume={26},
number={2},
pages={38--49},
year={1997},
organization={ACM}
}

@inproceedings{cipar2012lazybase,
title={LazyBase: trading freshness for performance in a scalable database},
author={Cipar, James and Ganger, Greg and Keeton, Kimberly and Morrey III, Charles B and Soules, Craig AN and Veitch, Alistair},
booktitle={Proceedings of the 7th ACM european conference on Computer Systems},
pages={169--182},
year={2012},
organization={ACM}
}
@@ -0,0 +1,46 @@
This is BibTeX, Version 0.99d (TeX Live 2012)
Capacity: max_strings=35307, hash_size=35307, hash_prime=30011
The top-level auxiliary file: druid_demo.aux
The style file: abbrv.bst
Database file #1: druid_demo.bib
You've used 4 entries,
2118 wiz_defined-function locations,
524 strings with 4556 characters,
and the built_in function-call counts, 1592 in all, are:
= -- 160
> -- 67
< -- 3
+ -- 26
- -- 22
* -- 105
:= -- 251
add.period$ -- 14
call.type$ -- 4
change.case$ -- 23
chr.to.int$ -- 0
cite$ -- 4
duplicate$ -- 67
empty$ -- 133
format.name$ -- 22
if$ -- 349
int.to.chr$ -- 0
int.to.str$ -- 4
missing$ -- 4
newline$ -- 23
num.names$ -- 8
pop$ -- 30
preamble$ -- 1
purify$ -- 19
quote$ -- 0
skip$ -- 47
stack$ -- 0
substring$ -- 96
swap$ -- 22
text.length$ -- 3
text.prefix$ -- 0
top$ -- 0
type$ -- 16
warning$ -- 0
while$ -- 16
width$ -- 5
write$ -- 48
@@ -0,0 +1,18 @@
\BOOKMARK [1][-]{section.1}{Introduction}{}% 1
\BOOKMARK [2][-]{subsection.1.1}{The Need for Druid}{section.1}% 2
\BOOKMARK [1][-]{section.2}{Architecture}{}% 3
\BOOKMARK [2][-]{subsection.2.1}{Real-time Nodes}{section.2}% 4
\BOOKMARK [2][-]{subsection.2.2}{Historical Nodes}{section.2}% 5
\BOOKMARK [2][-]{subsection.2.3}{Broker Nodes}{section.2}% 6
\BOOKMARK [2][-]{subsection.2.4}{Coordinator Nodes}{section.2}% 7
\BOOKMARK [2][-]{subsection.2.5}{Query Processing}{section.2}% 8
\BOOKMARK [2][-]{subsection.2.6}{Query Capabilities}{section.2}% 9
\BOOKMARK [1][-]{section.3}{Performance}{}% 10
\BOOKMARK [2][-]{subsection.3.1}{Query Performance}{section.3}% 11
\BOOKMARK [2][-]{subsection.3.2}{Data Ingestion Performance}{section.3}% 12
\BOOKMARK [1][-]{section.4}{Demonstration Details}{}% 13
\BOOKMARK [2][-]{subsection.4.1}{Setup}{section.4}% 14
\BOOKMARK [2][-]{subsection.4.2}{Goals}{section.4}% 15
\BOOKMARK [1][-]{section.5}{Acknowledgments}{}% 16
\BOOKMARK [1][-]{section.6}{Additional Authors}{}% 17
\BOOKMARK [1][-]{section.7}{References}{}% 18
@@ -0,0 +1,464 @@
% THIS IS AN EXAMPLE DOCUMENT FOR VLDB 2012
% based on ACM SIGPROC-SP.TEX VERSION 2.7
% Modified by Gerald Weber <gerald@cs.auckland.ac.nz>
% Removed the requirement to include *bbl file in here. (AhmetSacan, Sep2012)
% Fixed the equation on page 3 to prevent line overflow. (AhmetSacan, Sep2012)

\documentclass{vldb}
\usepackage{graphicx}
\usepackage{balance} % for \balance command ON LAST PAGE (only there!)
\usepackage{fontspec}
\usepackage{hyperref}
\graphicspath{{figures/}}
\usepackage{enumitem}

\begin{document}

% ****************** TITLE ****************************************

\title{Druid: Open Source Real-time Analytics at Scale}

% possible, but not really needed or used for PVLDB:
%\subtitle{[Extended Abstract]
%\titlenote{A full version of this paper is available as\textit{Author's Guide to Preparing ACM SIG Proceedings Using \LaTeX$2_\epsilon$\ and BibTeX} at \texttt{www.acm.org/eaddress.htm}}}

% ****************** AUTHORS **************************************

% You need the command \numberofauthors to handle the 'placement
% and alignment' of the authors beneath the title.
%
% For aesthetic reasons, we recommend 'three authors at a time'
% i.e. three 'name/affiliation blocks' be placed beneath the title.
%
% NOTE: You are NOT restricted in how many 'rows' of
% "name/affiliations" may appear. We just ask that you restrict
% the number of 'columns' to three.
%
% Because of the available 'opening page real-estate'
% we ask you to refrain from putting more than six authors
% (two rows with three columns) beneath the article title.
% More than six makes the first-page appear very cluttered indeed.
%
% Use the \alignauthor commands to handle the names
% and affiliations for an 'aesthetic maximum' of six authors.
% Add names, affiliations, addresses for
% the seventh etc. author(s) as the argument for the
% \additionalauthors command.
% These 'additional authors' will be output/set for you
% without further effort on your part as the last section in
% the body of your article BEFORE References or any Appendices.

\numberofauthors{6} % in this sample file, there are a *total*
% of EIGHT authors. SIX appear on the 'first-page' (for formatting
% reasons) and the remaining two appear in the \additionalauthors section.

\author{
% You can go ahead and credit any number of authors here,
% e.g. one 'row of three' or two rows (consisting of one row of three
% and a second row of one, two or three).
%
% The command \alignauthor (no curly braces needed) should
% precede each author name, affiliation/snail-mail address and
% e-mail address. Additionally, tag each line of
% affiliation/address with \affaddr, and tag the
% e-mail address with \email.
%
% 1st. author
\alignauthor
Fangjin Yang\\
\affaddr{Metamarkets Group, Inc.}\\
\email{fangjin@metamarkets.com}
% 2nd. author
\alignauthor
Eric Tschetter\\
\email{echeddar@gmail.com}
% 3rd. author
\alignauthor
Xavier Léauté\\
\affaddr{Metamarkets Group, Inc.}\\
\email{xavier@metamarkets.com}
\and % use '\and' if you need 'another row' of author names
% 4th. author
\alignauthor
Nishant Bangarwa\\
\affaddr{Metamarkets Group, Inc.}\\
\email{nishant@metamarkets.com}
% 5th. author
\alignauthor
Nelson Ray\\
\email{ncray86@gmail.com}
% 6th. author
\alignauthor
Gian Merlino\\
\affaddr{Metamarkets Group, Inc.}\\
\email{gian@metamarkets.com}
}
% There's nothing stopping you putting the seventh, eighth, etc.
% author on the opening page (as the 'third row') but we ask,
% for aesthetic reasons that you place these 'additional authors'
% in the \additional authors block, viz.
\additionalauthors{Additional authors: Deep Ganguli (Metamarkets Group, Inc., {\texttt{deep@metamarkets.com}}), Himadri Singh (Metamarkets Group, Inc., {\texttt{himadri@metamarkets.com}}), Igal Levy (Metamarkets Group, Inc., {\texttt{igal@metamarkets.com}})}
\date{14 March 2014}
% Just remember to make sure that the TOTAL number of authors
% is the number that will appear on the first page PLUS the
% number that will appear in the \additionalauthors section.


\maketitle

\begin{abstract}
Druid is an open
source\footnote{\href{https://github.com/metamx/druid}{https://github.com/metamx/druid}}
data store built for exploratory analytics on large data sets. Druid supports
fast data aggregation, low latency data ingestion, and arbitrary data
exploration. The system combines a column-oriented storage layout, a
distributed, shared-nothing architecture, and an advanced indexing structure to
return queries on billions of rows in milliseconds. Druid is petabyte scale and
is deployed in production at several technology companies.
\end{abstract}

\section{Introduction}
The recent proliferation of internet technology has created a surge
in machine-generated events. Individually, these events contain minimal useful
information and are of low value. Given the time and resources required to
extract meaning from large collections of events, many companies were willing
to discard this data instead.

A few years ago, Google introduced MapReduce as their mechanism of leveraging
commodity hardware to index the internet and analyze logs. The Hadoop project
soon followed and was largely patterned after the insights that came out of the
original MapReduce paper. Hadoop has contributed much to helping companies
convert their low-value event streams into high-value aggregates for a variety
of applications such as business intelligence and A/B testing.

As with a lot of great systems, Hadoop has opened our eyes to a new space of
problems. Specifically, Hadoop excels at storing and providing access to large
amounts of data; however, it does not make any performance guarantees around
how quickly that data can be accessed. Furthermore, although Hadoop is a
highly available system, performance degrades under heavy concurrent load.
Lastly, while Hadoop works well for storing data, it is not optimized for
ingesting data and making that data immediately readable.

\subsection{The Need for Druid}
Druid was originally designed to solve problems around ingesting and exploring
large quantities of transactional events (log data). This form of timeseries
data (OLAP data) is commonly found in the business intelligence
space and the nature of the data tends to be very append-heavy. Events typically
have three distinct components: a timestamp column indicating when the event
occurred, a set of dimension columns indicating various attributes about the
event, and a set of metric columns containing values (usually numeric) that can
be aggregated. Queries are typically issued for the sum of some set of metrics,
filtered by some set of dimensions, over some span of time.

The Druid project first began out of necessity at Metamarkets to power a
business intelligence dashboard that allowed users to arbitrarily explore and
visualize event streams. Existing open source Relational Database Management
Systems, cluster computing frameworks, and NoSQL key/value stores were unable
to provide a low latency data ingestion and query platform for an interactive
dashboard. Queries needed to return fast enough to allow the data
visualizations in the dashboard to update interactively.

In addition to the query latency needs, the system had to be multi-tenant and
highly available, as the dashboard is used in a highly concurrent environment.
Downtime is costly and many businesses cannot afford to wait if a system is
unavailable in the face of software upgrades or network failure. Finally,
Metamarkets also wanted to allow users and alerting systems to make
business decisions in ``real-time''. The time from when an event is created to
when that event is queryable determines how fast users and systems are able to
react to potentially catastrophic occurrences in their systems.

The problems of data exploration, ingestion, and availability span multiple
industries. Since Druid was open sourced in October 2012, it has been deployed as a
video, network monitoring, operations monitoring, and online advertising
analytics platform at multiple companies\footnote{\href{http://druid.io/druid.html}{http://druid.io/druid.html}}.

\begin{figure*}
\centering
\includegraphics[width = 4.5in]{cluster}
\caption{An overview of a Druid cluster and the flow of data through the cluster.}
\label{fig:cluster}
\end{figure*}

\section{Architecture}
A Druid cluster consists of different types of nodes and each node type is
designed to perform a specific set of tasks. We believe this design separates
concerns and reduces the overall complexity of the system. The different node types
operate fairly independently of each other and there is minimal interaction among
them. Hence, intra-cluster communication failures have minimal impact on data
availability. To solve complex data analysis problems, the different node
types come together to form a fully working system. The composition of a Druid
cluster and the flow of data through it are shown in Figure~\ref{fig:cluster}. All Druid
nodes announce their availability and the data they are serving over
ZooKeeper~\cite{hunt2010zookeeper}.

\subsection{Real-time Nodes}
Real-time nodes encapsulate the functionality to ingest and query event
streams. Events indexed via these nodes are immediately available for querying.
These nodes are only concerned with events for some small time range. They
periodically hand off batches of immutable events to other nodes in the Druid
cluster that are specialized in dealing with batches of immutable events.

Real-time nodes maintain an in-memory index buffer for all incoming events.
These indexes are incrementally populated as new events are ingested and the
indexes are also directly queryable. To avoid heap overflow problems, real-time
nodes persist their in-memory indexes to disk either periodically or after some
maximum row limit is reached. This persist process converts data stored in the
in-memory buffer to a column oriented storage format. Each persisted index is
immutable and real-time nodes load persisted indexes into off-heap memory such
that they can still be queried. On a periodic basis, each real-time node will
schedule a background task that searches for all locally persisted indexes. The
task merges these indexes together and builds an immutable block of data that
contains all the events that have been ingested by a real-time node for some span of
time. We refer to this block of data as a ``segment''. During the handoff stage,
a real-time node uploads this segment to permanent backup storage, typically
a distributed file system that Druid calls ``deep storage''.
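
As a rough, purely illustrative sketch (plain Python; the row limit and helper
names below are placeholders rather than Druid's actual code), this
ingest/persist/handoff cycle can be summarized as:
{\small\begin{verbatim}
# Illustrative sketch of a real-time node's ingest/persist/handoff
# cycle; names and the row limit are hypothetical placeholders.
ROW_LIMIT = 500000

in_memory_index = []    # incrementally built, directly queryable
persisted_indexes = []  # immutable once written

def ingest(event):
    in_memory_index.append(event)
    if len(in_memory_index) >= ROW_LIMIT:
        persist()

def persist():
    # convert the write-optimized buffer into an immutable index
    persisted_indexes.append(tuple(in_memory_index))
    in_memory_index.clear()

def upload_to_deep_storage(segment):
    pass  # stand-in for the upload to S3, HDFS, etc.

def handoff():
    # periodic background task: merge local indexes into a segment
    segment = [row for index in persisted_indexes for row in index]
    upload_to_deep_storage(segment)
\end{verbatim}}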

\subsection{Historical Nodes}
Historical nodes encapsulate the functionality to load and serve the immutable
blocks of data (segments) created by real-time nodes. In many real-world
workflows, most of the data loaded in a Druid cluster is immutable and hence
historical nodes are typically the main workers of a Druid cluster. Historical
nodes follow a shared-nothing architecture and there is no single point of
contention among the nodes. The nodes have no knowledge of one another and are
operationally simple; they only know how to load, drop, and serve immutable
segments.

\subsection{Broker Nodes}
Broker nodes act as query routers to historical and real-time nodes. Broker
nodes understand what segments are queryable and where those segments are
located. Broker nodes route incoming queries such that the queries hit the
right historical or real-time nodes. Broker nodes also merge partial results
from historical and real-time nodes before returning a final consolidated
result to the caller.
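
Illustratively (again in plain Python, not the broker's actual code), merging
partial per-node aggregates into a consolidated result amounts to:
{\small\begin{verbatim}
# Hypothetical partial aggregates from two historical nodes and one
# real-time node, merged by the broker before returning to the caller.
partial_results = [
    {"us": 10, "ca": 3},
    {"us": 7},
    {"us": 2, "ca": 1},
]

merged = {}
for partial in partial_results:
    for key, value in partial.items():
        merged[key] = merged.get(key, 0) + value

# merged == {"us": 19, "ca": 4}
\end{verbatim}}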

\subsection{Coordinator Nodes}
Druid coordinator nodes are primarily in charge of data management and
distribution on historical nodes. The coordinator nodes tell historical nodes
to load new data, drop outdated data, replicate data, and move data to balance
load. Coordinator nodes undergo a
leader-election process that determines a single node that runs the coordinator
functionality. The remaining coordinator nodes act as redundant backups.

A coordinator node runs periodically to determine the current state of the
cluster. It makes decisions by comparing the expected state of the cluster with
the actual state of the cluster at the time of the run. Coordinator nodes also
maintain a connection to a MySQL database that contains additional operational
parameters and configurations. One of the key pieces of information located in
the MySQL database is a table that contains a list of all segments that should
be served by historical nodes. This table can be updated by any service that
creates segments, such as real-time nodes.

\subsection{Query Processing}
Data tables in Druid (called \emph{data sources}) are collections of
timestamped events partitioned into a set of segments, where each segment
is typically 5--10 million rows. Formally, we define a segment as a collection
of rows of data that span some period in time. Segments represent the
fundamental storage unit in Druid and replication and distribution are done at
a segment level.

Druid segments are stored in a column orientation. Given that Druid is best
used for aggregating event streams (all data going into Druid must have a
timestamp), the advantages of storing aggregate information as columns rather than
rows are well documented \cite{abadi2008column}. Column storage allows for more
efficient CPU usage as only what is needed is actually loaded and scanned.

Druid has multiple column types to represent various data formats. Depending on
the column type, different compression methods are used to reduce the cost of
storing a column in memory and on disk. For example, if an entire column only
contains string values, storing the raw strings is unnecessarily costly.
String columns can be dictionary encoded instead. Dictionary encoding is a
common method to compress data in column stores.
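
As a purely illustrative sketch (plain Python, not Druid's implementation),
dictionary encoding a string column amounts to:
{\small\begin{verbatim}
# Illustrative dictionary encoding of a string column.
column = ["San Francisco", "San Francisco",
          "New York", "New York"]

dictionary = {}  # string value -> integer id
encoded = []     # the column stored as integer ids
for value in column:
    code = dictionary.setdefault(value, len(dictionary))
    encoded.append(code)

# dictionary == {"San Francisco": 0, "New York": 1}
# encoded    == [0, 0, 1, 1]
\end{verbatim}}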

In many real world OLAP workflows, queries are issued for the aggregated
results of some set of metrics where some set of dimension specifications are
met. Consider Table~\ref{tab:sample_data}. An example query for this table may
ask: ``How much revenue was generated in the first hour of 2014-01-01 in the
city of San Francisco?''. This query is filtering a sales data set based on a
Boolean expression of dimension values. In many real world data sets, dimension
columns contain strings and metric columns contain numbers. Druid creates
additional lookup indices for string columns such that only those rows that
pertain to a particular query filter are ever scanned.

\begin{table}
\centering
\begin{tabular}{| l | l | l |}
\hline
\textbf{Timestamp} & \textbf{City} & \textbf{Revenue} \\ \hline
2014-01-01T01:00:00Z & San Francisco & 25 \\ \hline
2014-01-01T01:00:00Z & San Francisco & 42 \\ \hline
2014-01-01T02:00:00Z & New York & 17 \\ \hline
2014-01-01T02:00:00Z & New York & 170 \\ \hline
\end{tabular}
\caption{Sample sales data set.}
\label{tab:sample_data}
\end{table}

For each unique city in
Table~\ref{tab:sample_data}, we can form some representation
indicating in which table rows a particular city is seen. We can
store this information in a binary array where the array indices
represent our rows. If a particular city is seen in a certain
row, that array index is marked as \texttt{1}. For example:
{\small\begin{verbatim}
San Francisco -> rows [0, 1] -> [1][1][0][0]
New York      -> rows [2, 3] -> [0][0][1][1]
\end{verbatim}}

\texttt{San Francisco} is seen in rows \texttt{0} and \texttt{1}. This mapping of column values
to row indices forms an inverted index \cite{tomasic1993performance}. To know which
rows contain \texttt{San Francisco} or \texttt{New York}, we can \texttt{OR} together
the two arrays.
{\small\begin{verbatim}
[1][1][0][0] OR [0][0][1][1] = [1][1][1][1]
\end{verbatim}}

This approach of performing Boolean operations on large bitmap sets is commonly
used in search engines. Druid compresses each bitmap index using the Concise
algorithm \cite{colantonio2010concise}. All Boolean operations on top of these
Concise sets are done without decompressing the set.
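
For illustration only, the inverted indexes above can be built and combined
using plain Python integers as bitmaps; Druid itself stores them as
compressed Concise sets:
{\small\begin{verbatim}
# Illustrative bitmap index over the City column of Table 1
# (plain integers as bitmaps, not Concise-compressed sets).
rows = ["San Francisco", "San Francisco", "New York", "New York"]

bitmaps = {}  # city -> bitmap with one bit per row
for row_id, city in enumerate(rows):
    bitmaps[city] = bitmaps.get(city, 0) | (1 << row_id)

sf_or_ny = bitmaps["San Francisco"] | bitmaps["New York"]
matching = [r for r in range(len(rows)) if (sf_or_ny >> r) & 1]
# matching == [0, 1, 2, 3]
\end{verbatim}}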

\subsection{Query Capabilities}
Druid supports many types of aggregations including double sums, long sums,
minimums, maximums, and complex aggregations such as cardinality estimation and
approximate quantile estimation. The results of aggregations can be combined
in mathematical expressions to form other aggregations. Druid supports
several query types, ranging from simple aggregates over a time interval to
groupBys and approximate top-K queries.
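
As a rough analogy in plain Python (not Druid's query API), the filtered
aggregations and groupBys described above correspond to operations such as:
{\small\begin{verbatim}
# Rough analogy for a filtered sum and a groupBy over Table 1.
events = [
    {"ts": "2014-01-01T01:00:00Z", "city": "San Francisco", "rev": 25},
    {"ts": "2014-01-01T01:00:00Z", "city": "San Francisco", "rev": 42},
    {"ts": "2014-01-01T02:00:00Z", "city": "New York", "rev": 17},
    {"ts": "2014-01-01T02:00:00Z", "city": "New York", "rev": 170},
]

# "Revenue in the first hour of 2014-01-01 in San Francisco" -> 67
sf_first_hour = sum(e["rev"] for e in events
                    if e["city"] == "San Francisco"
                    and e["ts"].startswith("2014-01-01T01"))

# groupBy city with a sum aggregator
revenue_by_city = {}
for e in events:
    revenue_by_city[e["city"]] = revenue_by_city.get(e["city"], 0) + e["rev"]
# revenue_by_city == {"San Francisco": 67, "New York": 187}
\end{verbatim}}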

\section{Performance}
Druid runs in production at several organizations, and to briefly demonstrate its
performance, we have chosen to share some real-world numbers for the main production
cluster running at Metamarkets in early 2014. For comparison with other databases
we also include results from synthetic workloads on TPC-H data.

\subsection{Query Performance}
Query latencies are shown in Figure~\ref{fig:query_latency} for a cluster
hosting approximately 10.5TB of data using 1302 processing threads and 672
total cores (hyperthreaded). There are approximately 50 billion rows of data in
this cluster.

\begin{figure}
\centering
\includegraphics[width = 2.3in]{avg_query_latency}
\caption{Query latencies of production data sources.}
\label{fig:query_latency}
\end{figure}

\begin{figure}
\centering
\includegraphics[width = 2.3in]{tpch_100gb}
\caption{Druid \& MySQL benchmarks -- 100GB TPC-H data.}
\label{fig:tpch_100gb}
\end{figure}

The average query volume during this period was approximately
1000 queries per minute. The number of dimensions in the various data sources
varies from 25 to 78, and the number of metrics from 8 to 35. Across all the
various data sources, average query latency is approximately 550 milliseconds,
with 90\% of queries returning in less than 1 second, 95\% in under 2 seconds,
and 99\% of queries returning in less than 10 seconds.

Approximately 30\% of the queries are standard
aggregates involving different types of metrics and filters, 60\% of queries
are ordered group bys over one or more dimensions with aggregates, and 10\% of
queries are search queries and metadata retrieval queries. The number of
columns scanned in aggregate queries roughly follows an exponential
distribution. Queries involving a single column are very frequent, and queries
involving all columns are very rare.

We also present Druid benchmarks on TPC-H data in Figure~\ref{fig:tpch_100gb}.
Most TPC-H queries do not directly apply to Druid, so we selected queries more
typical of Druid's workload to demonstrate query performance. As a comparison,
we also provide the results of the same queries using MySQL with the MyISAM
engine (InnoDB was slower in our experiments).

We benchmarked Druid's scan rate at 53,539,211 rows/second/core for a
\texttt{select count(*)} equivalent query over a given time interval and
36,246,530 rows/second/core for a \texttt{select sum(float)} type query.

\subsection{Data Ingestion Performance}
To showcase Druid's data ingestion performance, we selected several production
datasources of varying dimensions, metrics, and event volumes. Druid's data
ingestion latency is heavily dependent on the complexity of the data set being
ingested. The data complexity is determined by the number of dimensions in each
event, the number of metrics in each event, and the types of aggregations we
want to perform on those metrics.

\begin{figure}
\centering
\includegraphics[width = 2.3in]{ingestion_rate}
\caption{Combined cluster ingestion rates.}
\label{fig:ingestion_rate}
\end{figure}

For the given datasources, the number of dimensions varies from 5 to 35, and the
number of metrics varies from 2 to 24. The peak ingestion rate we measured in
production was 22914.43 events/second/core on a datasource with 30 dimensions
and 19 metrics.

The measurements we presented are sufficient to address our stated
problems of interactivity. We would prefer less variability in the latencies,
which could be achieved by adding additional
hardware, but we have not done so because of cost concerns.

\section{Demonstration Details}

We would like to do an end-to-end demonstration of Druid, from setting up a
cluster and ingesting data to structuring queries and obtaining results. We would
also like to showcase how to solve real-world data analysis problems with Druid
and demonstrate tools that can be built on top of it, including interactive
data visualizations, approximate algorithms, and machine-learning components.
We already use similar tools in production.

\subsection{Setup}

Users will be able to set up a local Druid cluster to better understand the
components and architecture of the system. Druid is designed to run on
commodity hardware and Druid nodes are simply Java processes that need to be
started up. The local setup will allow users to ingest data from Twitter's
public API and query it. We will also provide users access to an AWS hosted
Druid cluster that contains several terabytes of Twitter data that we have been
collecting for over 2 years. There are over 3 billion tweets in this data set,
and new events are constantly being ingested. We will walk through a variety of
different queries to demonstrate Druid's arbitrary data-exploration
capabilities.

Finally, we will teach users how to build a simple interactive dashboard on top
of Druid. The dashboard will use some of Druid's more powerful features such as
approximate algorithms for quickly determining the cardinality of sets, and
machine learning algorithms for scientific computing problems such as anomaly
detection. These use cases represent some of the more interesting problems we
use Druid for in production.

\subsection{Goals}

We will not only walk users through solving real-world problems with Druid and
different tools that have been built on top of Druid, but also answer
conference-specific questions such as which tweets and topics are trending
at VLDB and what netizens in the surrounding area are discussing, and even
perform a sentiment analysis of VLDB. Our goal is to clearly explain why the
architecture of Druid makes it well suited to certain types of queries, and
the potential of the system as a real-time analytics platform.

%\end{document} % This is where a 'short' article might terminate

% ensure same length columns on last page (might need two sub-sequent latex runs)
\balance

%ACKNOWLEDGMENTS are optional
\section{Acknowledgments}
Druid could not have been built without the help of many great people in the
community. We want to thank everyone that has contributed to the Druid
codebase for their invaluable support.

% The following two commands are all you need in the
% initial runs of your .tex file to
% produce the bibliography for the citations in your paper.
\bibliographystyle{abbrv}
\bibliography{druid_demo} % druid_demo.bib is the name of the Bibliography in this case
% You must have a proper ".bib" file
% and remember to run:
% latex bibtex latex latex
% to resolve all references

\end{document}
After Width: | Height: | Size: 35 KiB |
After Width: | Height: | Size: 53 KiB |
After Width: | Height: | Size: 28 KiB |
After Width: | Height: | Size: 51 KiB |
After Width: | Height: | Size: 35 KiB |
After Width: | Height: | Size: 36 KiB |
After Width: | Height: | Size: 74 KiB |
After Width: | Height: | Size: 73 KiB |
After Width: | Height: | Size: 85 KiB |
After Width: | Height: | Size: 43 KiB |