---
output:
  pdf_document: default
  html_document: default
---

```{r setup, include=FALSE}
# Echo the source of every chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)

# Evaluate all chunks relative to the lesson directory so the data files
# (pseudo_facebook.tsv, yogurt.csv, nci.tsv) are read with bare file names.
# NOTE(review): this is a machine-specific absolute path; it has to be
# adjusted before knitting on any other machine.
knitr::opts_knit$set(
  root.dir = normalizePath(
    "C:/Users/Dusty/Documents/coding/projects/Udacity/Data Analysis/eda/lesson5"
  )
)

library(ggplot2)
library(dplyr)
library(reshape2)
```

Lesson 5
========================================================

### Multivariate Data
Notes:

***

### Moira Perceived Audience Size Colored by Age
Notes:

***

### Third Qualitative Variable
Notes:

```{r Third Qualitative Variable}
pf <- read.csv("pseudo_facebook.tsv", sep = "\t")

# One row per (age, gender) pair with the mean and median friend count and
# the number of users in the group; rows with a missing gender are dropped.
pf.fc_by_age_gender <- pf %>%
  filter(!is.na(gender)) %>%
  group_by(age, gender) %>%
  summarize(
    mean_friend_count = mean(friend_count),
    median_friend_count = median(friend_count),
    n = n()
  ) %>%
  ungroup() %>%
  arrange(age)

head(pf.fc_by_age_gender)
```

***

### Plotting Conditional Summaries
Notes:

```{r Plotting Conditional Summaries}
# Median friend count by age, one line per gender.
ggplot(data = pf.fc_by_age_gender,
       mapping = aes(x = age, y = median_friend_count)) +
  geom_line(aes(color = gender))
```

***

### Thinking in Ratios
Notes:

What is the ratio of friends for males vs females?

### Wide and Long Format
Notes:

***

### Reshaping Data
Notes:

```{r}
# install.packages("reshape2")

# Long -> wide: one row per age, one column per gender level, each cell
# holding that group's median friend count.
pf.fc_by_age_gender.wide <- dcast(
  pf.fc_by_age_gender,
  age ~ gender,
  value.var = "median_friend_count"
)

head(pf.fc_by_age_gender.wide)
```

***

### Ratio Plot
Notes:

```{r Ratio Plot}
# Ratio of female to male median friend count by age.
# The reference line at 1 is set as a fixed parameter rather than inside
# aes(): mapping a constant in aes() draws one line per data row, and the
# stacked lines cancel out the intended transparency.
ggplot(data = pf.fc_by_age_gender.wide,
       mapping = aes(x = age, y = female / male)) +
  geom_line() +
  geom_hline(yintercept = 1, alpha = 0.3, linetype = 2)
```

***

### Third Quantitative Variable
Notes:

```{r Third Quantitative Variable}
# Year the user joined, counted back from 2014.
# NOTE(review): dividing by 365 presumes tenure is measured in days - confirm.
pf$year_joined <- floor(2014 - pf$tenure / 365)

head(pf)
```

***

### Cut a Variable
Notes:

```{r Cut a Variable}
# Bin year_joined into four cohorts using the break points below; with cut()'s
# defaults the intervals are (2004,2009], (2009,2011], (2011,2012], (2012,2014].
pf$year_joined.bucket <- cut(pf$year_joined, c(2004, 2009, 2011, 2012, 2014))

table(pf$year_joined.bucket)
```

***

### Plotting it All Together
Notes:

```{r Plotting it All Together}
# Median friend count by age, one line per join-year cohort; users that fall
# outside every bucket (NA) are excluded.
# `fun` replaces the deprecated `fun.y` argument of the summary stat.
ggplot(data = subset(pf, !is.na(year_joined.bucket)),
       mapping = aes(x = age, y = friend_count)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = median)
```

***

### Plot the Grand Mean
Notes:

```{r Plot the Grand Mean}
# Mean friend count by age per cohort, plus the mean over all cohorts drawn
# as a dashed line.
ggplot(data = subset(pf, !is.na(year_joined.bucket)),
       mapping = aes(x = age, y = friend_count)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean) +
  geom_line(stat = "summary", fun = mean, linetype = 2)
```

***

### Friending Rate
Notes:

```{r Friending Rate}
# Friends per unit of tenure, restricted to tenure >= 1 to avoid dividing
# by zero.
with(subset(pf, tenure >= 1), summary(friend_count / tenure))
```

***

### Friendships Initiated
Notes:

What is the median friend rate?
0.2205

What is the maximum friend rate?
417.0

```{r Friendships Initiated}
# Mean friendships initiated per unit of tenure, plotted against tenure,
# one line per join-year cohort.
ggplot(data = subset(pf, tenure >= 1),
       mapping = aes(x = tenure, y = friendships_initiated / tenure)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean)
```

***

### Bias-Variance Tradeoff Revisited
Notes:

```{r Bias-Variance Tradeoff Revisited}
# Earlier variants kept for reference: the same plot with tenure binned to
# multiples of 7, 30 and 90 before averaging.
#
# ggplot(aes(x = tenure, y = friendships_initiated / tenure),
#        data = subset(pf, tenure >= 1)) +
#   geom_line(aes(color = year_joined.bucket),
#             stat = "summary",
#             fun = mean)
#
# ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure),
#        data = subset(pf, tenure > 0)) +
#   geom_line(aes(color = year_joined.bucket),
#             stat = "summary",
#             fun = mean)
#
# ggplot(aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure),
#        data = subset(pf, tenure > 0)) +
#   geom_line(aes(color = year_joined.bucket),
#             stat = "summary",
#             fun = mean)
#
# ggplot(aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure),
#        data = subset(pf, tenure > 0)) +
#   geom_line(aes(color = year_joined.bucket),
#             stat = "summary",
#             fun = mean)

# Smoothed version of the friending-rate plot, one smoother per cohort.
ggplot(data = subset(pf, tenure >= 1),
       mapping = aes(x = tenure, y = friendships_initiated / tenure)) +
  geom_smooth(aes(color = year_joined.bucket))
```

***

### Sean's NFL Fan Sentiment Study
Notes:

***

### Introducing the Yogurt Data Set
Notes:

***

### Histograms Revisited
Notes:

```{r Histograms Revisited}
yo <- read.csv("yogurt.csv")

# Treat the id column as a categorical label rather than a number.
yo$id <- factor(yo$id)

head(yo)

ggplot(data = yo, mapping = aes(x = price)) +
  geom_histogram()
```

***

### Number of Purchases
Notes:

```{r Number of Purchases}
# Total purchases per row, summed over the five flavour columns.
yo <- transform(
  yo,
  all.purchases = strawberry + blueberry + pina.colada + plain + mixed.berry
)

head(yo)
```

***

### Prices over Time
Notes:

```{r Prices over Time}
ggplot(data = yo, mapping = aes(x = time, y = price)) +
  geom_point(alpha = 1 / 10)

# Same scatterplot, coloured by the total number of purchases.
ggplot(data = yo, mapping = aes(x = time, y = price)) +
  geom_point(alpha = 1 / 10, aes(color = all.purchases))
```

***

### Sampling Observations
Notes:

***

### Looking at Samples of Households

```{r Looking at Sample of Households}
# Fixed seed so the same 16 ids are drawn on every knit.
set.seed(1)
sample.ids <- sample(levels(yo$id), 16)

# One panel per sampled id: price over time, with point size showing the
# number of purchases.
ggplot(data = subset(yo, id %in% sample.ids),
       mapping = aes(x = time, y = price)) +
  facet_wrap(~ id) +
  geom_line() +
  geom_point(aes(size = all.purchases), pch = 1)
```

***

### The Limits of Cross Sectional Data
Notes:

***

### Many Variables
Notes:

***

### Scatterplot Matrix
Notes:

```{r}
# install.packages("GGally")
library(GGally)

# Applies to every plot drawn from here on.
theme_set(theme_minimal(20))

# Fixed seed so the 1000-row sample below is reproducible.
set.seed(1836)
pf_subset <- pf[, 2:15]
names(pf_subset)

ggpairs(pf_subset[sample.int(nrow(pf_subset), 1000), ])
```

### Even More Variables
Notes:

***

### Heat Maps
Notes:

```{r}
nci <- read.table("nci.tsv")

# Label the 64 columns 1..64.
colnames(nci) <- 1:64
```

```{r}
# Melt the first 2000 rows into long format: one row per (gene, case) cell.
nci.long.samp <- melt(as.matrix(nci[1:2000, ]))
names(nci.long.samp) <- c("gene", "case", "value")
head(nci.long.samp)

# Heat map of the values on a 100-step blue-to-red gradient.
ggplot(data = nci.long.samp,
       mapping = aes(y = gene, x = case, fill = value)) +
  geom_tile() +
  scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))
```

***

### Analyzing Three or More Variables
Reflection:

***

Click **KnitHTML** to see all of your hard work and to have an html page of
this lesson, your answers, and your notes!