# druid/publications/radstack/src/druid_plot.R
# History: 2015-03-26 13:41:16 -04:00
library(ggplot2)
library(stringr)
library(plyr)
# Cardinality and Concise compressed size per dimension for the series that
# is labelled "unsorted" in the final plot.
#
# Each element is one table row; the first is the header. Fields are joined
# with explicit "\t" escapes (and rows with "\n") because foo() splits on
# tab/newline, and literal tab characters inside a string are easily turned
# into spaces by editors or copy-paste.
x <- paste(
  c(
    "Dimension\tCardinality\tConcise compressed size (bytes)",
    "Has_mention\t2\t586,400",
    "Has_links\t2\t580,872",
    "Has_geo\t2\t144,004",
    "Is_retweet\t2\t584,592",
    "Is_viral\t2\t358,380",
    "User_lang\t21\t1,414,000",
    "User_time_zone\t142\t3,876,244",
    "URL_domain\t31,165\t1,562,428",
    "First_hashtag\t100,728\t1,837,144",
    "Rt_name\t182,704\t2,235,288",
    "Reply_to_name\t620,421\t5,673,504",
    "User_location\t637,774\t9,511,844",
    "User_mention_name\t923,842\t9,086,416",
    "User_name\t1,784,369\t16,000,028"
  ),
  collapse = "\n"
)
# Parse a three-column text table into a data frame ready for plotting.
#
# Args:
#   x: A single character string. Rows are separated by newlines and the
#      first non-blank row is the header. Fields are separated by tabs. A row
#      that contains no tab at all is split on runs of whitespace instead,
#      with everything after the second field kept as the third field, so a
#      table whose tabs were converted to spaces still parses.
#
# Returns:
#   A data frame with one row per data line and four columns: the three
#   header-named columns (column 1 character; columns 2 and 3 numeric, with
#   "," thousands separators removed) plus `ytext`, a copy of column 3 meant
#   as an adjustable y position for text labels.
#
# Errors:
#   Stops if `x` is not a single string, contains no rows, or any row does
#   not yield exactly three fields.
foo <- function(x) {
  stopifnot(is.character(x), length(x) == 1L, !is.na(x))

  # Blank lines (e.g. from a trailing newline) carry no data; skip them.
  lines <- strsplit(x, "\r?\n")[[1]]
  lines <- lines[grepl("\\S", lines)]
  if (length(lines) == 0L) {
    stop("table text is empty; expected at least a header row", call. = FALSE)
  }

  split_row <- function(line) {
    if (grepl("\t", line, fixed = TRUE)) {
      return(strsplit(line, "\t", fixed = TRUE)[[1]])
    }
    # No tab present: the first two fields are single tokens, the third is
    # the remainder of the line (it may itself contain spaces).
    pattern <- "^\\s*(\\S+)\\s+(\\S+)\\s+(.*\\S)\\s*$"
    regmatches(line, regexec(pattern, line))[[1]][-1]
  }
  cells <- lapply(lines, split_row)

  n_fields <- vapply(cells, length, integer(1))
  if (any(n_fields != 3L)) {
    stop(
      "expected 3 fields per row; malformed row(s): ",
      paste(which(n_fields != 3L), collapse = ", "),
      call. = FALSE
    )
  }

  m <- do.call(rbind, cells)
  header <- m[1, ]

  # drop = FALSE keeps a matrix even when there is only one data row.
  df <- data.frame(m[-1, , drop = FALSE], stringsAsFactors = FALSE)
  names(df) <- header
  df[[2]] <- as.numeric(gsub(",", "", df[[2]], fixed = TRUE))
  df[[3]] <- as.numeric(gsub(",", "", df[[3]], fixed = TRUE))

  # Direct assignment leaves the header names (which contain spaces) intact.
  df$ytext <- df[[3]]
  df
}
# Parse the first table; this frame is tagged "unsorted" further down.
df <- foo(x)
# Disabled earlier version of the figure: a single-series log-log scatter of
# size against cardinality, with each point labelled by its Dimension and a
# horizontal reference line for the integer array size. The ytext multipliers
# shift individual labels vertically (presumably to keep neighbouring labels
# from colliding -- TODO confirm).
## df$ytext[12] <- 1.05 * df$ytext[12]
## df$ytext[13] <- .93 * df$ytext[13]
## df$ytext[1] <- 1.13 * df$ytext[1]
## df$ytext[4] <- .87 * df$ytext[4]
## qplot(x = Cardinality, y = `Concise\ compressed\ size\ (bytes)`, data = df, geom = "point") +
## geom_text(aes(x = Cardinality * 1.2, y = ytext, label = Dimension), hjust = 0, size = 4) +
## scale_x_log10(limits = c(1, 10^8.5)) +
## scale_y_log10() +
## geom_hline(aes(yintercept = 9089180)) +
## geom_text(aes(x = 1e2, y = 9089180 * 1.1, label = "Integer array size (bytes)"), hjust = 0, size = 4) +
## ggtitle("The Relationship of Compressed Size to Cardinality")
# Cardinality and Concise compressed size per dimension for the series that
# is labelled "sorted" in the final plot.
#
# Each element is one table row; the first is the header. Fields are joined
# with explicit "\t" escapes (and rows with "\n") because foo() splits on
# tab/newline, and literal tab characters inside a string are easily turned
# into spaces by editors or copy-paste.
y <- paste(
  c(
    "Dimension\tCardinality\tConcise compressed size (bytes)",
    "Has_mention\t2\t744",
    "Has_links\t2\t1,504",
    "Has_geo\t2\t2,840",
    "Is_retweet\t2\t1,616",
    "Is_viral\t2\t1,488",
    "User_lang\t21\t38,416",
    "User_time_zone\t142\t319,644",
    "URL_domain\t31,165\t700,752",
    "First_hashtag\t100,728\t1,505,292",
    "Rt_name\t182,704\t1,874,180",
    "Reply_to_name\t620,421\t5,404,108",
    "User_location\t637,774\t9,091,016",
    "User_mention_name\t923,842\t8,686,384",
    "User_name\t1,784,369\t16,204,900"
  ),
  collapse = "\n"
)
# Parse the second table, tag each frame with its series label, and stack
# them so colour and shape can be mapped to `sorted`.
df2 <- foo(y)
df$sorted <- "unsorted"
df2$sorted <- "sorted"
dat <- rbind(df, df2)

# Height of the horizontal reference line that the plot labels
# "Integer array size (bytes)".
integer_array_bytes <- 9089180

# Log-log scatter of compressed size against cardinality for both series.
# The reference line and its label use constants, so they are added with a
# fixed yintercept and annotate(); mapping constants through aes() would
# inherit `dat` and draw one overlapping copy per data row.
concise_plot <- ggplot(
  data = dat,
  aes(x = Cardinality, y = `Concise compressed size (bytes)`)
) +
  geom_point(aes(color = sorted, shape = sorted), alpha = .9, size = 4) +
  scale_x_log10(limits = c(1, 10^8.5)) +
  scale_y_log10() +
  geom_hline(yintercept = integer_array_bytes) +
  annotate(
    "text",
    x = 1e1, y = integer_array_bytes * 1.4,
    label = "Integer array size (bytes)", hjust = 0, size = 5
  )
# Bare top-level expression: displays the plot wherever the original
# unassigned ggplot() expression would have auto-printed.
concise_plot
#ggsave("concise_plot.png", width = 10, height = 8)
# Pass the plot explicitly rather than relying on last_plot().
ggsave(
  "../figures/concise_plot.pdf",
  plot = concise_plot, width = 6, height = 4.5
)