Merge pull request #442 from metamx/demo-paper
Writing a demonstration paper for a conference

@@ -36,8 +36,6 @@ druid.processing.numThreads=1
druid.segmentCache.locations=[{"path": "/tmp/druid/indexCache", "maxSize"\: 10000000000}]
```

Note: This will spin up a Historical node with the local filesystem as deep storage.

Production Configs
------------------
These production configs are using S3 as a deep store.

@@ -0,0 +1,12 @@
all : druid_demo.pdf

clean :
	@rm -f *.aux *.bbl *.blg *.log

%.tex : %.bib

%.pdf : %.tex %.bib
	lualatex $(*F)
	bibtex $(*F)
	lualatex $(*F)
	lualatex $(*F)

@@ -0,0 +1,54 @@
\relax
\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
\global\let\oldcontentsline\contentsline
\gdef\contentsline#1#2#3#4{\oldcontentsline{#1}{#2}{#3}}
\global\let\oldnewlabel\newlabel
\gdef\newlabel#1#2{\newlabelxx{#1}#2}
\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
\AtEndDocument{\ifx\hyper@anchor\@undefined
\let\contentsline\oldcontentsline
\let\newlabel\oldnewlabel
\fi}
\fi}
\global\let\hyper@last\relax
\gdef\HyperFirstAtBeginDocument#1{#1}
\providecommand\HyField@AuxAddToFields[1]{}
\citation{hunt2010zookeeper}
\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}{section.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.1}The Need for Druid}{1}{subsection.1.1}}
\@writefile{toc}{\contentsline {section}{\numberline {2}Architecture}{1}{section.2}}
\citation{abadi2008column}
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces An overview of a Druid cluster and the flow of data through the cluster.}}{2}{figure.1}}
\newlabel{fig:cluster}{{1}{2}{An overview of a Druid cluster and the flow of data through the cluster}{figure.1}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Real-time Nodes}{2}{subsection.2.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Historical Nodes}{2}{subsection.2.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Broker Nodes}{2}{subsection.2.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Coordinator Nodes}{2}{subsection.2.4}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.5}Query Processing}{2}{subsection.2.5}}
\citation{tomasic1993performance}
\citation{colantonio2010concise}
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Sample sales data set.}}{3}{table.1}}
\newlabel{tab:sample_data}{{1}{3}{Sample sales data set}{table.1}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.6}Query Capabilities}{3}{subsection.2.6}}
\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Query latencies of production data sources.}}{3}{figure.2}}
\newlabel{fig:query_latency}{{2}{3}{Query latencies of production data sources}{figure.2}{}}
\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces Druid \& MySQL benchmarks -- 100GB TPC-H data.}}{3}{figure.3}}
\newlabel{fig:tpch_100gb}{{3}{3}{Druid \& MySQL benchmarks -- 100GB TPC-H data}{figure.3}{}}
\@writefile{toc}{\contentsline {section}{\numberline {3}Performance}{3}{section.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Query Performance}{3}{subsection.3.1}}
\bibstyle{abbrv}
\bibdata{druid_demo}
\bibcite{abadi2008column}{1}
\bibcite{colantonio2010concise}{2}
\bibcite{hunt2010zookeeper}{3}
\bibcite{tomasic1993performance}{4}
\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Combined cluster ingestion rates.}}{4}{figure.4}}
\newlabel{fig:ingestion_rate}{{4}{4}{Combined cluster ingestion rates}{figure.4}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Data Ingestion Performance}{4}{subsection.3.2}}
\@writefile{toc}{\contentsline {section}{\numberline {4}Demonstration Details}{4}{section.4}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Setup}{4}{subsection.4.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Goals}{4}{subsection.4.2}}
\@writefile{toc}{\contentsline {section}{\numberline {5}Acknowledgments}{4}{section.5}}
\@writefile{toc}{\contentsline {section}{\numberline {6}Additional Authors}{4}{section.6}}
\@writefile{toc}{\contentsline {section}{\numberline {7}References}{4}{section.7}}

@@ -0,0 +1,27 @@
\begin{thebibliography}{1}

\bibitem{abadi2008column}
D.~J. Abadi, S.~R. Madden, and N.~Hachem.
\newblock Column-stores vs. row-stores: How different are they really?
\newblock In {\em Proceedings of the 2008 ACM SIGMOD international conference
  on Management of data}, pages 967--980. ACM, 2008.

\bibitem{colantonio2010concise}
A.~Colantonio and R.~Di~Pietro.
\newblock Concise: Compressed ‘n’ composable integer set.
\newblock {\em Information Processing Letters}, 110(16):644--650, 2010.

\bibitem{hunt2010zookeeper}
P.~Hunt, M.~Konar, F.~P. Junqueira, and B.~Reed.
\newblock Zookeeper: Wait-free coordination for internet-scale systems.
\newblock In {\em USENIX ATC}, volume~10, 2010.

\bibitem{tomasic1993performance}
A.~Tomasic and H.~Garcia-Molina.
\newblock Performance of inverted indices in shared-nothing distributed text
  document information retrieval systems.
\newblock In {\em Parallel and Distributed Information Systems, 1993.,
  Proceedings of the Second International Conference on}, pages 8--17. IEEE,
  1993.

\end{thebibliography}

@@ -0,0 +1,420 @@
@article{cattell2011scalable,
  title={Scalable SQL and NoSQL data stores},
  author={Cattell, Rick},
  journal={ACM SIGMOD Record},
  volume={39},
  number={4},
  pages={12--27},
  year={2011},
  publisher={ACM}
}

@article{chang2008bigtable,
  title={Bigtable: A distributed storage system for structured data},
  author={Chang, Fay and Dean, Jeffrey and Ghemawat, Sanjay and Hsieh, Wilson C and Wallach, Deborah A and Burrows, Mike and Chandra, Tushar and Fikes, Andrew and Gruber, Robert E},
  journal={ACM Transactions on Computer Systems (TOCS)},
  volume={26},
  number={2},
  pages={4},
  year={2008},
  publisher={ACM}
}

@inproceedings{decandia2007dynamo,
  title={Dynamo: amazon's highly available key-value store},
  author={DeCandia, Giuseppe and Hastorun, Deniz and Jampani, Madan and Kakulapati, Gunavardhan and Lakshman, Avinash and Pilchin, Alex and Sivasubramanian, Swaminathan and Vosshall, Peter and Vogels, Werner},
  booktitle={ACM SIGOPS Operating Systems Review},
  volume={41},
  number={6},
  pages={205--220},
  year={2007},
  organization={ACM}
}

@inproceedings{abadi2008column,
  title={Column-Stores vs. Row-Stores: How different are they really?},
  author={Abadi, Daniel J and Madden, Samuel R and Hachem, Nabil},
  booktitle={Proceedings of the 2008 ACM SIGMOD international conference on Management of data},
  pages={967--980},
  year={2008},
  organization={ACM}
}

@inproceedings{bear2012vertica,
  title={The vertica database: SQL RDBMS for managing big data},
  author={Bear, Chuck and Lamb, Andrew and Tran, Nga},
  booktitle={Proceedings of the 2012 workshop on Management of big data systems},
  pages={37--38},
  year={2012},
  organization={ACM}
}

@article{lakshman2010cassandra,
  title={Cassandra—A decentralized structured storage system},
  author={Lakshman, Avinash and Malik, Prashant},
  journal={Operating systems review},
  volume={44},
  number={2},
  pages={35},
  year={2010}
}

@article{melnik2010dremel,
  title={Dremel: interactive analysis of web-scale datasets},
  author={Melnik, Sergey and Gubarev, Andrey and Long, Jing Jing and Romer, Geoffrey and Shivakumar, Shiva and Tolton, Matt and Vassilakis, Theo},
  journal={Proceedings of the VLDB Endowment},
  volume={3},
  number={1-2},
  pages={330--339},
  year={2010},
  publisher={VLDB Endowment}
}

@article{hall2012processing,
  title={Processing a trillion cells per mouse click},
  author={Hall, Alexander and Bachmann, Olaf and B{\"u}ssow, Robert and G{\u{a}}nceanu, Silviu and Nunkesser, Marc},
  journal={Proceedings of the VLDB Endowment},
  volume={5},
  number={11},
  pages={1436--1446},
  year={2012},
  publisher={VLDB Endowment}
}

@inproceedings{shvachko2010hadoop,
  title={The hadoop distributed file system},
  author={Shvachko, Konstantin and Kuang, Hairong and Radia, Sanjay and Chansler, Robert},
  booktitle={Mass Storage Systems and Technologies (MSST), 2010 IEEE 26th Symposium on},
  pages={1--10},
  year={2010},
  organization={IEEE}
}

@article{colantonio2010concise,
  title={Concise: Compressed ‘n’ Composable Integer Set},
  author={Colantonio, Alessandro and Di Pietro, Roberto},
  journal={Information Processing Letters},
  volume={110},
  number={16},
  pages={644--650},
  year={2010},
  publisher={Elsevier}
}

@inproceedings{stonebraker2005c,
  title={C-store: a column-oriented DBMS},
  author={Stonebraker, Mike and Abadi, Daniel J and Batkin, Adam and Chen, Xuedong and Cherniack, Mitch and Ferreira, Miguel and Lau, Edmond and Lin, Amerson and Madden, Sam and O'Neil, Elizabeth and others},
  booktitle={Proceedings of the 31st international conference on Very large data bases},
  pages={553--564},
  year={2005},
  organization={VLDB Endowment}
}

@inproceedings{engle2012shark,
  title={Shark: fast data analysis using coarse-grained distributed memory},
  author={Engle, Cliff and Lupher, Antonio and Xin, Reynold and Zaharia, Matei and Franklin, Michael J and Shenker, Scott and Stoica, Ion},
  booktitle={Proceedings of the 2012 international conference on Management of Data},
  pages={689--692},
  year={2012},
  organization={ACM}
}

@inproceedings{zaharia2012discretized,
  title={Discretized streams: an efficient and fault-tolerant model for stream processing on large clusters},
  author={Zaharia, Matei and Das, Tathagata and Li, Haoyuan and Shenker, Scott and Stoica, Ion},
  booktitle={Proceedings of the 4th USENIX conference on Hot Topics in Cloud Computing},
  pages={10--10},
  year={2012},
  organization={USENIX Association}
}

@misc{marz2013storm,
  author = {Marz, Nathan},
  title = {Storm: Distributed and Fault-Tolerant Realtime Computation},
  month = {February},
  year = {2013},
  howpublished = "\url{http://storm-project.net/}"
}

@misc{tschetter2011druid,
  author = {Eric Tschetter},
  title = {Introducing Druid: Real-Time Analytics at a Billion Rows Per Second},
  month = {April},
  year = {2011},
  howpublished = "\url{http://druid.io/blog/2011/04/30/introducing-druid.html}"
}

@article{farber2012sap,
  title={SAP HANA database: data management for modern business applications},
  author={F{\"a}rber, Franz and Cha, Sang Kyun and Primsch, J{\"u}rgen and Bornh{\"o}vd, Christof and Sigg, Stefan and Lehner, Wolfgang},
  journal={ACM Sigmod Record},
  volume={40},
  number={4},
  pages={45--51},
  year={2012},
  publisher={ACM}
}

@misc{voltdb2010voltdb,
  title={VoltDB Technical Overview},
  author={VoltDB, LLC},
  year={2010},
  howpublished = "\url{https://voltdb.com/}"
}

@inproceedings{macnicol2004sybase,
  title={Sybase IQ multiplex-designed for analytics},
  author={MacNicol, Roger and French, Blaine},
  booktitle={Proceedings of the Thirtieth international conference on Very large data bases-Volume 30},
  pages={1227--1230},
  year={2004},
  organization={VLDB Endowment}
}

@inproceedings{singh2011introduction,
  title={Introduction to the IBM Netezza warehouse appliance},
  author={Singh, Malcolm and Leonhardi, Ben},
  booktitle={Proceedings of the 2011 Conference of the Center for Advanced Studies on Collaborative Research},
  pages={385--386},
  year={2011},
  organization={IBM Corp.}
}

@inproceedings{miner2012unified,
  title={Unified analytics platform for big data},
  author={Miner, Donald},
  booktitle={Proceedings of the WICSA/ECSA 2012 Companion Volume},
  pages={176--176},
  year={2012},
  organization={ACM}
}

@inproceedings{fink2012distributed,
  title={Distributed computation on dynamo-style distributed storage: riak pipe},
  author={Fink, Bryan},
  booktitle={Proceedings of the eleventh ACM SIGPLAN workshop on Erlang workshop},
  pages={43--50},
  year={2012},
  organization={ACM}
}

@misc{paraccel2013,
  key = {ParAccel Analytic Database},
  title = {ParAccel Analytic Database},
  month = {March},
  year = {2013},
  howpublished = "\url{http://www.paraccel.com/resources/Datasheets/ParAccel-Core-Analytic-Database.pdf}"
}

@misc{cloudera2013,
  key = {Cloudera Impala},
  title = {Cloudera Impala},
  month = {March},
  year = {2013},
  url = {},
  howpublished = "\url{http://blog.cloudera.com/blog}"
}

@inproceedings{hunt2010zookeeper,
  title={ZooKeeper: Wait-free coordination for Internet-scale systems},
  author={Hunt, Patrick and Konar, Mahadev and Junqueira, Flavio P and Reed, Benjamin},
  booktitle={USENIX ATC},
  volume={10},
  year={2010}
}

@inproceedings{kreps2011kafka,
  title={Kafka: A distributed messaging system for log processing},
  author={Kreps, Jay and Narkhede, Neha and Rao, Jun},
  booktitle={Proceedings of 6th International Workshop on Networking Meets Databases (NetDB), Athens, Greece},
  year={2011}
}

@misc{liblzf2013,
  title = {LibLZF},
  key = {LibLZF},
  month = {March},
  year = {2013},
  howpublished = "\url{http://freecode.com/projects/liblzf}"
}

@inproceedings{tomasic1993performance,
  title={Performance of inverted indices in shared-nothing distributed text document information retrieval systems},
  author={Tomasic, Anthony and Garcia-Molina, Hector},
  booktitle={Parallel and Distributed Information Systems, 1993., Proceedings of the Second International Conference on},
  pages={8--17},
  year={1993},
  organization={IEEE}
}

@inproceedings{antoshenkov1995byte,
  title={Byte-aligned bitmap compression},
  author={Antoshenkov, Gennady},
  booktitle={Data Compression Conference, 1995. DCC'95. Proceedings},
  pages={476},
  year={1995},
  organization={IEEE}
}

@inproceedings{van2011memory,
  title={A memory efficient reachability data structure through bit vector compression},
  author={van Schaik, Sebastiaan J and de Moor, Oege},
  booktitle={Proceedings of the 2011 international conference on Management of data},
  pages={913--924},
  year={2011},
  organization={ACM}
}

@inproceedings{o1993lru,
  title={The LRU-K page replacement algorithm for database disk buffering},
  author={O'neil, Elizabeth J and O'neil, Patrick E and Weikum, Gerhard},
  booktitle={ACM SIGMOD Record},
  volume={22},
  number={2},
  pages={297--306},
  year={1993},
  organization={ACM}
}

@article{kim2001lrfu,
  title={LRFU: A spectrum of policies that subsumes the least recently used and least frequently used policies},
  author={Kim, Chong Sang},
  journal={IEEE Transactions on Computers},
  volume={50},
  number={12},
  year={2001}
}

@article{wu2006optimizing,
  title={Optimizing bitmap indices with efficient compression},
  author={Wu, Kesheng and Otoo, Ekow J and Shoshani, Arie},
  journal={ACM Transactions on Database Systems (TODS)},
  volume={31},
  number={1},
  pages={1--38},
  year={2006},
  publisher={ACM}
}

@misc{twitter2013,
  key = {Twitter Public Streams},
  title = {Twitter Public Streams},
  month = {March},
  year = {2013},
  howpublished = "\url{https://dev.twitter.com/docs/streaming-apis/streams/public}"
}

@article{fitzpatrick2004distributed,
  title={Distributed caching with memcached},
  author={Fitzpatrick, Brad},
  journal={Linux journal},
  number={124},
  pages={72--74},
  year={2004}
}
@inproceedings{amdahl1967validity,
  title={Validity of the single processor approach to achieving large scale computing capabilities},
  author={Amdahl, Gene M},
  booktitle={Proceedings of the April 18-20, 1967, spring joint computer conference},
  pages={483--485},
  year={1967},
  organization={ACM}
}
@book{sarawagi1998discovery,
  title={Discovery-driven exploration of OLAP data cubes},
  author={Sarawagi, Sunita and Agrawal, Rakesh and Megiddo, Nimrod},
  year={1998},
  publisher={Springer}
}
@article{hu2011stream,
  title={Stream Database Survey},
  author={Hu, Bo},
  year={2011}
}

@article{dean2008mapreduce,
  title={MapReduce: simplified data processing on large clusters},
  author={Dean, Jeffrey and Ghemawat, Sanjay},
  journal={Communications of the ACM},
  volume={51},
  number={1},
  pages={107--113},
  year={2008},
  publisher={ACM}
}

@misc{linkedin2013senseidb,
  author = {LinkedIn},
  title = {SenseiDB},
  month = {July},
  year = {2013},
  howpublished = "\url{http://www.senseidb.com/}"
}

@misc{apache2013solr,
  author = {Apache},
  title = {Apache Solr},
  month = {February},
  year = {2013},
  howpublished = "\url{http://lucene.apache.org/solr/}"
}

@misc{banon2013elasticsearch,
  author = {Banon, Shay},
  title = {ElasticSearch},
  month = {July},
  year = {2013},
  howpublished = "\url{http://www.elasticseach.com/}"
}

@book{oehler2012ibm,
  title={IBM Cognos TM1: The Official Guide},
  author={Oehler, Karsten and Gruenes, Jochen and Ilacqua, Christopher and Perez, Manuel},
  year={2012},
  publisher={McGraw-Hill}
}

@book{schrader2009oracle,
  title={Oracle Essbase \& Oracle OLAP},
  author={Schrader, Michael and Vlamis, Dan and Nader, Mike and Claterbos, Chris and Collins, Dave and Campbell, Mitch and Conrad, Floyd},
  year={2009},
  publisher={McGraw-Hill, Inc.}
}

@book{lachev2005applied,
  title={Applied Microsoft Analysis Services 2005: And Microsoft Business Intelligence Platform},
  author={Lachev, Teo},
  year={2005},
  publisher={Prologika Press}
}

@article{o1996log,
  title={The log-structured merge-tree (LSM-tree)},
  author={O’Neil, Patrick and Cheng, Edward and Gawlick, Dieter and O’Neil, Elizabeth},
  journal={Acta Informatica},
  volume={33},
  number={4},
  pages={351--385},
  year={1996},
  publisher={Springer}
}

@inproceedings{o1997improved,
  title={Improved query performance with variant indexes},
  author={O'Neil, Patrick and Quass, Dallan},
  booktitle={ACM Sigmod Record},
  volume={26},
  number={2},
  pages={38--49},
  year={1997},
  organization={ACM}
}

@inproceedings{cipar2012lazybase,
  title={LazyBase: trading freshness for performance in a scalable database},
  author={Cipar, James and Ganger, Greg and Keeton, Kimberly and Morrey III, Charles B and Soules, Craig AN and Veitch, Alistair},
  booktitle={Proceedings of the 7th ACM european conference on Computer Systems},
  pages={169--182},
  year={2012},
  organization={ACM}
}

@@ -0,0 +1,46 @@
This is BibTeX, Version 0.99d (TeX Live 2012)
Capacity: max_strings=35307, hash_size=35307, hash_prime=30011
The top-level auxiliary file: druid_demo.aux
The style file: abbrv.bst
Database file #1: druid_demo.bib
You've used 4 entries,
2118 wiz_defined-function locations,
524 strings with 4556 characters,
and the built_in function-call counts, 1592 in all, are:
= -- 160
> -- 67
< -- 3
+ -- 26
- -- 22
* -- 105
:= -- 251
add.period$ -- 14
call.type$ -- 4
change.case$ -- 23
chr.to.int$ -- 0
cite$ -- 4
duplicate$ -- 67
empty$ -- 133
format.name$ -- 22
if$ -- 349
int.to.chr$ -- 0
int.to.str$ -- 4
missing$ -- 4
newline$ -- 23
num.names$ -- 8
pop$ -- 30
preamble$ -- 1
purify$ -- 19
quote$ -- 0
skip$ -- 47
stack$ -- 0
substring$ -- 96
swap$ -- 22
text.length$ -- 3
text.prefix$ -- 0
top$ -- 0
type$ -- 16
warning$ -- 0
while$ -- 16
width$ -- 5
write$ -- 48

@@ -0,0 +1,18 @@
\BOOKMARK [1][-]{section.1}{Introduction}{}% 1
\BOOKMARK [2][-]{subsection.1.1}{The Need for Druid}{section.1}% 2
\BOOKMARK [1][-]{section.2}{Architecture}{}% 3
\BOOKMARK [2][-]{subsection.2.1}{Real-time Nodes}{section.2}% 4
\BOOKMARK [2][-]{subsection.2.2}{Historical Nodes}{section.2}% 5
\BOOKMARK [2][-]{subsection.2.3}{Broker Nodes}{section.2}% 6
\BOOKMARK [2][-]{subsection.2.4}{Coordinator Nodes}{section.2}% 7
\BOOKMARK [2][-]{subsection.2.5}{Query Processing}{section.2}% 8
\BOOKMARK [2][-]{subsection.2.6}{Query Capabilities}{section.2}% 9
\BOOKMARK [1][-]{section.3}{Performance}{}% 10
\BOOKMARK [2][-]{subsection.3.1}{Query Performance}{section.3}% 11
\BOOKMARK [2][-]{subsection.3.2}{Data Ingestion Performance}{section.3}% 12
\BOOKMARK [1][-]{section.4}{Demonstration Details}{}% 13
\BOOKMARK [2][-]{subsection.4.1}{Setup}{section.4}% 14
\BOOKMARK [2][-]{subsection.4.2}{Goals}{section.4}% 15
\BOOKMARK [1][-]{section.5}{Acknowledgments}{}% 16
\BOOKMARK [1][-]{section.6}{Additional Authors}{}% 17
\BOOKMARK [1][-]{section.7}{References}{}% 18

@@ -0,0 +1,464 @@
% THIS IS AN EXAMPLE DOCUMENT FOR VLDB 2012
% based on ACM SIGPROC-SP.TEX VERSION 2.7
% Modified by Gerald Weber <gerald@cs.auckland.ac.nz>
% Removed the requirement to include *bbl file in here. (AhmetSacan, Sep2012)
% Fixed the equation on page 3 to prevent line overflow. (AhmetSacan, Sep2012)

\documentclass{vldb}
\usepackage{graphicx}
\usepackage{balance} % for \balance command ON LAST PAGE (only there!)
\usepackage{fontspec}
\usepackage{hyperref}
\graphicspath{{figures/}}
\usepackage{enumitem}

\begin{document}

% ****************** TITLE ****************************************

\title{Druid: Open Source Real-time Analytics at Scale}

% possible, but not really needed or used for PVLDB:
%\subtitle{[Extended Abstract]
%\titlenote{A full version of this paper is available as\textit{Author's Guide to Preparing ACM SIG Proceedings Using \LaTeX$2_\epsilon$\ and BibTeX} at \texttt{www.acm.org/eaddress.htm}}}

% ****************** AUTHORS **************************************

% You need the command \numberofauthors to handle the 'placement
% and alignment' of the authors beneath the title.
%
% For aesthetic reasons, we recommend 'three authors at a time'
% i.e. three 'name/affiliation blocks' be placed beneath the title.
%
% NOTE: You are NOT restricted in how many 'rows' of
% "name/affiliations" may appear. We just ask that you restrict
% the number of 'columns' to three.
%
% Because of the available 'opening page real-estate'
% we ask you to refrain from putting more than six authors
% (two rows with three columns) beneath the article title.
% More than six makes the first-page appear very cluttered indeed.
%
% Use the \alignauthor commands to handle the names
% and affiliations for an 'aesthetic maximum' of six authors.
% Add names, affiliations, addresses for
% the seventh etc. author(s) as the argument for the
% \additionalauthors command.
% These 'additional authors' will be output/set for you
% without further effort on your part as the last section in
% the body of your article BEFORE References or any Appendices.

\numberofauthors{6} % in this sample file, there are a *total*
% of EIGHT authors. SIX appear on the 'first-page' (for formatting
% reasons) and the remaining two appear in the \additionalauthors section.

\author{
% You can go ahead and credit any number of authors here,
% e.g. one 'row of three' or two rows (consisting of one row of three
% and a second row of one, two or three).
%
% The command \alignauthor (no curly braces needed) should
% precede each author name, affiliation/snail-mail address and
% e-mail address. Additionally, tag each line of
% affiliation/address with \affaddr, and tag the
% e-mail address with \email.
%
% 1st. author
\alignauthor
Fangjin Yang\\
\affaddr{Metamarkets Group, Inc.}\\
\email{fangjin@metamarkets.com}
% 2nd. author
\alignauthor
Eric Tschetter\\
\email{echeddar@gmail.com}
% 3rd. author
\alignauthor
Xavier Léauté\\
\affaddr{Metamarkets Group, Inc.}\\
\email{xavier@metamarkets.com}
\and % use '\and' if you need 'another row' of author names
% 4th. author
\alignauthor
Nishant Bangarwa\\
\affaddr{Metamarkets Group, Inc.}\\
\email{nishant@metamarkets.com}
% 5th. author
\alignauthor
Nelson Ray\\
\email{ncray86@gmail.com}
% 6th. author
\alignauthor
Gian Merlino\\
\affaddr{Metamarkets Group, Inc.}\\
\email{gian@metamarkets.com}
}
% There's nothing stopping you putting the seventh, eighth, etc.
% author on the opening page (as the 'third row') but we ask,
% for aesthetic reasons that you place these 'additional authors'
% in the \additional authors block, viz.
\additionalauthors{Additional authors: Deep Ganguli (Metamarkets Group, Inc., {\texttt{deep@metamarkets.com}}), Himadri Singh (Metamarkets Group, Inc., {\texttt{himadri@metamarkets.com}}), Igal Levy (Metamarkets Group, Inc., {\texttt{igal@metamarkets.com}})}
\date{14 March 2014}
% Just remember to make sure that the TOTAL number of authors
% is the number that will appear on the first page PLUS the
% number that will appear in the \additionalauthors section.


\maketitle

\begin{abstract}
Druid is an open
source\footnote{\href{https://github.com/metamx/druid}{https://github.com/metamx/druid}}
data store built for exploratory analytics on large data sets. Druid supports
fast data aggregation, low latency data ingestion, and arbitrary data
exploration. The system combines a column-oriented storage layout, a
distributed, shared-nothing architecture, and an advanced indexing structure to
return queries on billions of rows in milliseconds. Druid is petabyte scale and
is deployed in production at several technology companies.
\end{abstract}

\section{Introduction}
The recent proliferation of internet technology has created a surge
in machine-generated events. Individually, these events contain minimal useful
information and are of low value. Given the time and resources required to
extract meaning from large collections of events, many companies were willing
to discard this data instead.

A few years ago, Google introduced MapReduce as their mechanism of leveraging
commodity hardware to index the internet and analyze logs. The Hadoop project
soon followed and was largely patterned after the insights that came out of the
original MapReduce paper. Hadoop has contributed much to helping companies
convert their low-value event streams into high-value aggregates for a variety
of applications such as business intelligence and A-B testing.

As with a lot of great systems, Hadoop has opened our eyes to a new space of
problems. Specifically, Hadoop excels at storing and providing access to large
amounts of data; however, it does not make any performance guarantees around
how quickly that data can be accessed. Furthermore, although Hadoop is a
highly available system, performance degrades under heavy concurrent load.
Lastly, while Hadoop works well for storing data, it is not optimized for
ingesting data and making that data immediately readable.

\subsection{The Need for Druid}
Druid was originally designed to solve problems around ingesting and exploring
large quantities of transactional events (log data). This form of timeseries
data (OLAP data) is commonly found in the business intelligence
space and the nature of the data tends to be very append heavy. Events typically
have three distinct components: a timestamp column indicating when the event
occurred, a set of dimension columns indicating various attributes about the
event, and a set of metric columns containing values (usually numeric) that can
be aggregated. Queries are typically issued for the sum of some set of metrics,
filtered by some set of dimensions, over some span of time.
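
To make this concrete, the sketch below (illustrative Python with hypothetical
field names, not Druid's actual ingestion or query API) shows the shape of such
an event and of a typical aggregation over it:
{\small\begin{verbatim}
# A hypothetical event: a timestamp, two dimension
# columns, and two metric columns.
event = {
    "timestamp": "2014-01-01T01:00:00Z",
    "city": "San Francisco",     # dimension
    "device": "mobile",          # dimension
    "revenue": 25.0,             # metric
    "clicks": 1,                 # metric
}

# The shape of a typical query: sum a metric over a
# time span, filtered on dimension values.
query = {
    "interval": ("2014-01-01T00:00:00Z",
                 "2014-01-02T00:00:00Z"),
    "filter": {"city": "San Francisco"},
    "aggregate": ("sum", "revenue"),
}
\end{verbatim}}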

The Druid project first began out of necessity at Metamarkets to power a
business intelligence dashboard that allowed users to arbitrarily explore and
visualize event streams. Existing open source Relational Database Management
Systems, cluster computing frameworks, and NoSQL key/value stores were unable
to provide a low latency data ingestion and query platform for an interactive
dashboard. Queries needed to return fast enough to allow the data
visualizations in the dashboard to update interactively.

In addition to the query latency needs, the system had to be multi-tenant and
highly available, as the dashboard is used in a highly concurrent environment.
Downtime is costly and many businesses cannot afford to wait if a system is
unavailable in the face of software upgrades or network failure. Finally,
Metamarkets also wanted to allow users and alerting systems to be able to make
business decisions in ``real-time''. The time from when an event is created to
when that event is queryable determines how fast users and systems are able to
react to potentially catastrophic occurrences in their systems.

The problems of data exploration, ingestion, and availability span multiple
industries. Since Druid was open sourced in October 2012, it has been deployed as a
video, network monitoring, operations monitoring, and online advertising
analytics platform at multiple companies\footnote{\href{http://druid.io/druid.html}{http://druid.io/druid.html}}.

\begin{figure*}
\centering
\includegraphics[width = 4.5in]{cluster}
\caption{An overview of a Druid cluster and the flow of data through the cluster.}
\label{fig:cluster}
\end{figure*}

\section{Architecture}
A Druid cluster consists of different types of nodes and each node type is
designed to perform a specific set of tasks. We believe this design separates
concerns and reduces the overall complexity of the system. The different node
types operate fairly independently of each other and there is minimal
interaction among them. Hence, intra-cluster communication failures have
minimal impact on data availability. To solve complex data analysis problems,
the different node types come together to form a fully working system. The
composition of and flow of data in a Druid cluster are shown in
Figure~\ref{fig:cluster}. All Druid nodes announce their availability and the
data they are serving over Zookeeper~\cite{hunt2010zookeeper}.

\subsection{Real-time Nodes}
Real-time nodes encapsulate the functionality to ingest and query event
streams. Events indexed via these nodes are immediately available for querying.
These nodes are only concerned with events for some small time range. They
periodically hand off batches of immutable events to other nodes in the Druid
cluster that are specialized in dealing with batches of immutable events.

Real-time nodes maintain an in-memory index buffer for all incoming events.
These indexes are incrementally populated as new events are ingested and the
indexes are also directly queryable. To avoid heap overflow problems, real-time
nodes persist their in-memory indexes to disk either periodically or after some
maximum row limit is reached. This persist process converts data stored in the
in-memory buffer to a column oriented storage format. Each persisted index is
immutable and real-time nodes load persisted indexes into off-heap memory such
that they can still be queried. On a periodic basis, each real-time node will
schedule a background task that searches for all locally persisted indexes. The
task merges these indexes together and builds an immutable block of data that
contains all the events that have been ingested by a real-time node for some
span of time. We refer to this block of data as a ``segment''. During the
handoff stage, a real-time node uploads this segment to permanent backup
storage, typically a distributed file system that Druid calls ``deep storage''.
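
This ingest, persist, merge, and handoff cycle can be summarized with the
following self-contained sketch (a simplified Python model of the behavior
described above, not Druid's implementation):
{\small\begin{verbatim}
class RealtimeNode:
    """Simplified model of a real-time node."""
    def __init__(self, max_rows=500000):
        self.buffer = []     # in-memory, queryable index
        self.persisted = []  # immutable on-disk indexes
        self.max_rows = max_rows

    def ingest(self, event):
        self.buffer.append(event)  # queryable right away
        if len(self.buffer) >= self.max_rows:
            self.persist()

    def persist(self):
        # freeze the buffer as an immutable columnar index
        self.persisted.append(tuple(self.buffer))
        self.buffer = []

    def handoff(self):
        # merge persisted indexes into one immutable
        # "segment", which is then sent to deep storage
        segment = tuple(e for idx in self.persisted
                          for e in idx)
        self.persisted = []
        return segment
\end{verbatim}}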

\subsection{Historical Nodes}
Historical nodes encapsulate the functionality to load and serve the immutable
blocks of data (segments) created by real-time nodes. In many real-world
workflows, most of the data loaded in a Druid cluster is immutable and hence
historical nodes are typically the main workers of a Druid cluster. Historical
nodes follow a shared-nothing architecture and there is no single point of
contention among the nodes. The nodes have no knowledge of one another and are
operationally simple; they only know how to load, drop, and serve immutable
segments.

\subsection{Broker Nodes}
Broker nodes act as query routers to historical and real-time nodes. Broker
nodes understand what segments are queryable and where those segments are
located. Broker nodes route incoming queries such that the queries hit the
right historical or real-time nodes. Broker nodes also merge partial results
from historical and real-time nodes before returning a final consolidated
result to the caller.
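
The scatter/gather behavior of a broker can be pictured with a minimal sketch
(hypothetical names, additive aggregations only, not Druid's API):
{\small\begin{verbatim}
# nodes: node name -> set of segment ids it serves
# run_partial(node, query, segs): partial sums per key
def broker_query(query, nodes, run_partial):
    merged = {}
    for node, segments in nodes.items():
        wanted = segments & query["segments"]
        if not wanted:
            continue  # route only to nodes holding data
        partial = run_partial(node, query, wanted)
        for key, value in partial.items():
            merged[key] = merged.get(key, 0) + value
    return merged     # consolidated result to caller
\end{verbatim}}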

\subsection{Coordinator Nodes}
Druid coordinator nodes are primarily in charge of data management and
distribution on historical nodes. The coordinator nodes tell historical nodes
to load new data, drop outdated data, replicate data, and move data to balance
load. Coordinator nodes undergo a
leader-election process that determines a single node that runs the coordinator
functionality. The remaining coordinator nodes act as redundant backups.

A coordinator node runs periodically to determine the current state of the
cluster. It makes decisions by comparing the expected state of the cluster with
the actual state of the cluster at the time of the run. Coordinator nodes also
maintain a connection to a MySQL database that contains additional operational
parameters and configurations. One of the key pieces of information located in
the MySQL database is a table that contains a list of all segments that should
be served by historical nodes. This table can be updated by any service that
creates segments, such as real-time nodes.
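
A single coordinator pass can be thought of as a reconciliation step: the
expected placement (from the MySQL segment table) is diffed against the actual
placement (announced over Zookeeper), and load or drop commands are issued to
historical nodes. A minimal sketch of that diff, with hypothetical names:
{\small\begin{verbatim}
# expected/actual: segment id -> set of historical nodes
def coordinate(expected, actual):
    commands = []
    for seg, nodes in expected.items():
        for node in nodes - actual.get(seg, set()):
            commands.append(("load", seg, node))
    for seg, nodes in actual.items():
        for node in nodes - expected.get(seg, set()):
            commands.append(("drop", seg, node))
    return commands   # sent to historical nodes
\end{verbatim}}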

\subsection{Query Processing}
Data tables in Druid (called \emph{data sources}) are collections of
timestamped events partitioned into a set of segments, where each segment
is typically 5--10 million rows. Formally, we define a segment as a collection
of rows of data that span some period in time. Segments represent the
fundamental storage unit in Druid and replication and distribution are done at
a segment level.

Druid segments are stored in a column orientation. Given that Druid is best
used for aggregating event streams (all data going into Druid must have a
timestamp), the advantages of storing aggregate information as columns rather
than rows are well documented \cite{abadi2008column}. Column storage allows for
more efficient CPU usage as only what is needed is actually loaded and scanned.

Druid has multiple column types to represent various data formats. Depending on
the column type, different compression methods are used to reduce the cost of
storing a column in memory and on disk. For example, if an entire column only
contains string values, storing the raw strings is unnecessarily costly.
String columns can be dictionary encoded instead. Dictionary encoding is a
common method to compress data in column stores.
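
As an illustration of the idea (plain Python, not Druid's on-disk format), a
small column of city names can be dictionary encoded as follows:
{\small\begin{verbatim}
def dict_encode(column):
    dictionary, ids, lookup = [], [], {}
    for value in column:
        if value not in lookup:
            lookup[value] = len(dictionary)
            dictionary.append(value)   # store once
        ids.append(lookup[value])      # small int id
    return dictionary, ids

cities = ["San Francisco", "San Francisco",
          "New York", "New York"]
print(dict_encode(cities))
# (['San Francisco', 'New York'], [0, 0, 1, 1])
\end{verbatim}}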

In many real world OLAP workflows, queries are issued for the aggregated
results of some set of metrics where some set of dimension specifications are
met. Consider Table~\ref{tab:sample_data}. An example query for this table may
ask: ``How much revenue was generated in the first hour of 2014-01-01 in the
city of San Francisco?''. This query is filtering a sales data set based on a
Boolean expression of dimension values. In many real world data sets, dimension
columns contain strings and metric columns contain numbers. Druid creates
additional lookup indices for string columns such that only those rows that
pertain to a particular query filter are ever scanned.

\begin{table}
\centering
\begin{tabular}{| l | l | l |}
\hline
\textbf{Timestamp} & \textbf{City} & \textbf{Revenue} \\ \hline
2014-01-01T01:00:00Z & San Francisco & 25 \\ \hline
2014-01-01T01:00:00Z & San Francisco & 42 \\ \hline
2014-01-01T02:00:00Z & New York & 17 \\ \hline
2014-01-01T02:00:00Z & New York & 170 \\ \hline
\end{tabular}
\caption{Sample sales data set.}
\label{tab:sample_data}
\end{table}

For each unique city in
Table~\ref{tab:sample_data}, we can form some representation
indicating in which table rows a particular city is seen. We can
store this information in a binary array where the array indices
represent our rows. If a particular city is seen in a certain
row, that array index is marked as \texttt{1}. For example:
{\small\begin{verbatim}
San Francisco -> rows [0, 1] -> [1][1][0][0]
New York      -> rows [2, 3] -> [0][0][1][1]
\end{verbatim}}

\texttt{San Francisco} is seen in rows \texttt{0} and \texttt{1}. This mapping of column values
to row indices forms an inverted index \cite{tomasic1993performance}. To know which
rows contain {\ttfamily San Francisco} or {\ttfamily New York}, we can \texttt{OR} together
the two arrays.
{\small\begin{verbatim}
[1][1][0][0] OR [0][0][1][1] = [1][1][1][1]
\end{verbatim}}

This approach of performing Boolean operations on large bitmap sets is commonly
used in search engines. Druid compresses each bitmap index using the Concise
algorithm \cite{colantonio2010concise}. All Boolean operations on top of these
Concise sets are done without decompressing the set.
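
The same example can be reproduced with a short, self-contained sketch that
uses plain Python integers as uncompressed bitsets (Druid itself operates on
Concise-compressed bitmaps, but the Boolean algebra is identical):
{\small\begin{verbatim}
rows = ["San Francisco", "San Francisco",
        "New York", "New York"]
index = {}
for row_id, city in enumerate(rows):
    index[city] = index.get(city, 0) | (1 << row_id)

sf, ny = index["San Francisco"], index["New York"]
print(bin(sf))       # 0b11   -> rows 0 and 1
print(bin(ny))       # 0b1100 -> rows 2 and 3
print(bin(sf | ny))  # 0b1111 -> rows with either city
\end{verbatim}}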

\subsection{Query Capabilities}
Druid supports many types of aggregations including double sums, long sums,
minimums, maximums, and complex aggregations such as cardinality estimation and
approximate quantile estimation. The results of aggregations can be combined
in mathematical expressions to form other aggregations. Druid supports
several query types, ranging from simple aggregates over a time interval to
groupBys and approximate top-K queries.
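
For example (hypothetical metric names, not Druid's query syntax), a derived
aggregate such as revenue per click is formed by combining two simple sums:
{\small\begin{verbatim}
rows = [{"revenue": 25, "clicks": 5},
        {"revenue": 42, "clicks": 7}]
total_revenue = sum(r["revenue"] for r in rows)
total_clicks  = sum(r["clicks"]  for r in rows)

# combine the two aggregates arithmetically
revenue_per_click = total_revenue / total_clicks
print(revenue_per_click)   # 67 / 12 ~ 5.58
\end{verbatim}}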

\section{Performance}
Druid runs in production at several organizations, and to briefly demonstrate its
performance, we have chosen to share some real world numbers for the main production
cluster running at Metamarkets in early 2014. For comparison with other databases
we also include results from synthetic workloads on TPC-H data.

\subsection{Query Performance}
Query latencies are shown in Figure~\ref{fig:query_latency} for a cluster
hosting approximately 10.5TB of data using 1302 processing threads and 672
total cores (hyperthreaded). There are approximately 50 billion rows of data in
this cluster.

\begin{figure}
\centering
\includegraphics[width = 2.3in]{avg_query_latency}
\caption{Query latencies of production data sources.}
\label{fig:query_latency}
\end{figure}

\begin{figure}
\centering
\includegraphics[width = 2.3in]{tpch_100gb}
\caption{Druid \& MySQL benchmarks -- 100GB TPC-H data.}
\label{fig:tpch_100gb}
\end{figure}

The average query volume during this time was approximately 1000 queries per
minute. The number of dimensions in the various data sources varies from 25 to
78, and the number of metrics varies from 8 to 35. Across all the various data
sources, average query latency is approximately 550 milliseconds, with 90\% of
queries returning in less than 1 second, 95\% in under 2 seconds, and 99\% of
queries returning in less than 10 seconds.

Approximately 30\% of the queries are standard
aggregates involving different types of metrics and filters, 60\% of queries
are ordered group bys over one or more dimensions with aggregates, and 10\% of
queries are search queries and metadata retrieval queries. The number of
columns scanned in aggregate queries roughly follows an exponential
distribution. Queries involving a single column are very frequent, and queries
involving all columns are very rare.

We also present Druid benchmarks on TPC-H data in Figure~\ref{fig:tpch_100gb}.
Most TPC-H queries do not directly apply to Druid, so we selected queries more
typical of Druid's workload to demonstrate query performance. As a comparison,
we also provide the results of the same queries using MySQL with the MyISAM
engine (InnoDB was slower in our experiments).

We benchmarked Druid's scan rate at 53,539,211 rows/second/core for a
\texttt{select count(*)} equivalent query over a given time interval and
36,246,530 rows/second/core for a \texttt{select sum(float)} type query.
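
As a rough back-of-the-envelope illustration only (it assumes the single-core
benchmark rate would carry over unchanged to the production cluster described
above), sustaining the count-style scan rate on all 672 cores would give an
aggregate capacity on the order of
\[
672 \times 5.4 \times 10^{7} \approx 3.6 \times 10^{10} \mbox{ rows/second},
\]
so even a query forced to touch all of the roughly 50 billion rows in that
cluster would complete within a few seconds; the inverted indexes described
earlier ensure that most queries scan far less data than that.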

\subsection{Data Ingestion Performance}
To showcase Druid's data ingestion performance, we selected several production
datasources of varying dimensions, metrics, and event volumes. Druid's data
ingestion latency is heavily dependent on the complexity of the data set being
ingested. The data complexity is determined by the number of dimensions in each
event, the number of metrics in each event, and the types of aggregations we
want to perform on those metrics.

\begin{figure}
\centering
\includegraphics[width = 2.3in]{ingestion_rate}
\caption{Combined cluster ingestion rates.}
\label{fig:ingestion_rate}
\end{figure}

For the given datasources, the number of dimensions varies from 5 to 35, and
the number of metrics varies from 2 to 24. The peak ingestion rate we measured
in production was 22,914.43 events/second/core on a datasource with 30
dimensions and 19 metrics.

The latency measurements we presented are sufficient to address our stated
problems of interactivity. We would prefer less variability in the latencies,
which could be achieved by adding additional hardware, but we have not done so
because of cost concerns.

\section{Demonstration Details}

We would like to give an end-to-end demonstration of Druid, from setting up a
cluster and ingesting data to structuring queries and obtaining results. We
would also like to showcase how to solve real-world data analysis problems with
Druid and demonstrate tools that can be built on top of it, including
interactive data visualizations, approximate algorithms, and machine-learning
components. We already use similar tools in production.

\subsection{Setup}

Users will be able to set up a local Druid cluster to better understand the
components and architecture of the system. Druid is designed to run on
commodity hardware and Druid nodes are simply Java processes that need to be
started up. The local setup will allow users to ingest data from Twitter's
public API and query it. We will also provide users access to an AWS hosted
Druid cluster that contains several terabytes of Twitter data that we have been
collecting for over 2 years. There are over 3 billion tweets in this data set,
and new events are constantly being ingested. We will walk through a variety of
different queries to demonstrate Druid's arbitrary data-exploration
capabilities.

Finally, we will teach users how to build a simple interactive dashboard on top
of Druid. The dashboard will use some of Druid's more powerful features such as
approximate algorithms for quickly determining the cardinality of sets, and
machine learning algorithms for scientific computing problems such as anomaly
detection. These use cases represent some of the more interesting problems we
use Druid for in production.

\subsection{Goals}

We will not only walk users through solving real-world problems with Druid and
different tools that have been built on top of Druid, but also answer
conference-specific questions such as which tweets and topics are trending at
VLDB and what people nearby are discussing, and even perform a sentiment
analysis of tweets about VLDB. Our goal is to clearly explain why Druid's
architecture makes it well suited for certain types of queries, and to show the
potential of the system as a real-time analytics platform.

%\end{document} % This is where a 'short' article might terminate

% ensure same length columns on last page (might need two sub-sequent latex runs)
\balance

%ACKNOWLEDGMENTS are optional
\section{Acknowledgments}
Druid could not have been built without the help of many great people in the
community. We want to thank everyone who has contributed to the Druid
codebase for their invaluable support.

% The following two commands are all you need in the
% initial runs of your .tex file to
% produce the bibliography for the citations in your paper.
\bibliographystyle{abbrv}
\bibliography{druid_demo} % druid_demo.bib is the name of the Bibliography in this case
% You must have a proper ".bib" file
% and remember to run:
% latex bibtex latex latex
% to resolve all references

\end{document}

After Width: | Height: | Size: 35 KiB |
After Width: | Height: | Size: 53 KiB |
After Width: | Height: | Size: 28 KiB |
After Width: | Height: | Size: 51 KiB |
After Width: | Height: | Size: 35 KiB |
After Width: | Height: | Size: 36 KiB |
After Width: | Height: | Size: 74 KiB |
After Width: | Height: | Size: 73 KiB |
After Width: | Height: | Size: 85 KiB |
After Width: | Height: | Size: 43 KiB |