# Plot the Concise-compressed size of each dimension against its cardinality,
# for both the unsorted and the sorted measurements, and save the figure as a
# PDF.

library(ggplot2)
library(stringr)
library(plyr)

# Measurements labelled "unsorted" below. Columns are separated by tabs
# (written as explicit \t escapes so that editors converting tabs to spaces
# cannot silently break the parsing) and rows by newlines.
x <- "Dimension\tCardinality\tConcise compressed size (bytes)
Has_mention\t2\t586,400
Has_links\t2\t580,872
Has_geo\t2\t144,004
Is_retweet\t2\t584,592
Is_viral\t2\t358,380
User_lang\t21\t1,414,000
User_time_zone\t142\t3,876,244
URL_domain\t31,165\t1,562,428
First_hashtag\t100,728\t1,837,144
Rt_name\t182,704\t2,235,288
Reply_to_name\t620,421\t5,673,504
User_location\t637,774\t9,511,844
User_mention_name\t923,842\t9,086,416
User_name\t1,784,369\t16,000,028"

# Parse a three-column, tab/newline-delimited table into a data frame.
#
# Args:
#   x: a single string; the first row holds the column names, the second and
#      third columns hold numbers that may use "," as a thousands separator.
#
# Returns:
#   A data frame with the three columns named after the header row (columns
#   2 and 3 converted to numeric) plus `ytext`, a copy of the third column
#   intended as an adjustable y position for text labels.
#
# Stops with an error if the number of fields is not a positive multiple of
# three.
foo <- function(x) {
  fields <- unlist(str_split(x, "\t|\n"))
  if (length(fields) == 0 || length(fields) %% 3 != 0) {
    stop("expected a tab/newline-delimited table with 3 columns",
         call. = FALSE)
  }
  m <- matrix(fields, ncol = 3, byrow = TRUE)

  # drop = FALSE keeps a one-data-row table as a 1 x 3 matrix instead of
  # collapsing it to a vector (which would yield a 3 x 1 data frame).
  df <- data.frame(m[-1, , drop = FALSE], stringsAsFactors = FALSE)
  names(df) <- m[1, ]

  # Strip thousands separators before converting to numeric.
  df[[2]] <- as.numeric(str_replace_all(df[[2]], fixed(","), ""))
  df[[3]] <- as.numeric(str_replace_all(df[[3]], fixed(","), ""))

  # Assign directly rather than via transform(), which would mangle the
  # non-syntactic column names.
  df$ytext <- df[[3]]
  df
}

df <- foo(x)

# Earlier single-series labelled plot, kept for reference (disabled):
## df$ytext[12] <- 1.05 * df$ytext[12]
## df$ytext[13] <- .93 * df$ytext[13]
## df$ytext[1] <- 1.13 * df$ytext[1]
## df$ytext[4] <- .87 * df$ytext[4]
## qplot(x = Cardinality, y = `Concise\ compressed\ size\ (bytes)`, data = df, geom = "point") +
##   geom_text(aes(x = Cardinality * 1.2, y = ytext, label = Dimension), hjust = 0, size = 4) +
##   scale_x_log10(limits = c(1, 10^8.5)) +
##   scale_y_log10() +
##   geom_hline(aes(yintercept = 9089180)) +
##   geom_text(aes(x = 1e2, y = 9089180 * 1.1, label = "Integer array size (bytes)"), hjust = 0, size = 4) +
##   ggtitle("The Relationship of Compressed Size to Cardinality")

# Measurements labelled "sorted" below; same layout as `x`.
y <- "Dimension\tCardinality\tConcise compressed size (bytes)
Has_mention\t2\t744
Has_links\t2\t1,504
Has_geo\t2\t2,840
Is_retweet\t2\t1,616
Is_viral\t2\t1,488
User_lang\t21\t38,416
User_time_zone\t142\t319,644
URL_domain\t31,165\t700,752
First_hashtag\t100,728\t1,505,292
Rt_name\t182,704\t1,874,180
Reply_to_name\t620,421\t5,404,108
User_location\t637,774\t9,091,016
User_mention_name\t923,842\t8,686,384
User_name\t1,784,369\t16,204,900"

df2 <- foo(y)

df$sorted <- "unsorted"
df2$sorted <- "sorted"
dat <- rbind(df, df2)

# Height of the horizontal reference line, labelled
# "Integer array size (bytes)" in the figure.
integer_array_bytes <- 9089180

concise_plot <- ggplot(
  data = dat,
  aes(x = Cardinality, y = `Concise compressed size (bytes)`)
) +
  geom_point(aes(color = sorted, shape = sorted), alpha = 0.9, size = 4) +
  scale_x_log10(limits = c(1, 10^8.5)) +
  scale_y_log10() +
  # Pass the constant as a parameter and use annotate() for the label: inside
  # aes() both would be drawn once per row of `dat`, stacking identical
  # copies of the line and the text on top of each other.
  geom_hline(yintercept = integer_array_bytes) +
  annotate(
    "text",
    x = 1e1,
    y = integer_array_bytes * 1.4,
    label = "Integer array size (bytes)",
    hjust = 0,
    size = 5
  )

# Top-level expression so the plot is still displayed when the script is run
# interactively or through Rscript.
concise_plot

#ggsave("concise_plot.png", width = 10, height = 8)
# Pass the plot explicitly instead of relying on last_plot().
ggsave(
  "../figures/concise_plot.pdf",
  plot = concise_plot,
  width = 6,
  height = 4.5
)