323 lines
5.9 KiB
Plaintext
323 lines
5.9 KiB
Plaintext
---
|
|
output:
|
|
pdf_document: default
|
|
html_document: default
|
|
---
|
|
|
|
```{r setup, include=FALSE}
|
|
# Echo the source of every chunk in the knitted output.
knitr::opts_chunk$set(echo = TRUE)

# Folder holding the lesson's data files on the original author's machine.
lesson_dir <- file.path(
  "C:/Users/Dusty/Documents/coding/projects",
  "Udacity/Data Analysis/eda/lesson5"
)
# Only redirect knitr when that folder actually exists. Elsewhere, keep knitr's
# default root (the directory of this document) so the data files can simply
# sit next to the .Rmd instead of the knit failing on a missing path.
if (dir.exists(lesson_dir)) {
  knitr::opts_knit$set(root.dir = normalizePath(lesson_dir))
}

library(ggplot2)
library(dplyr)
library(reshape2)
|
|
```
|
|
|
|
Lesson 5
|
|
========================================================
|
|
|
|
### Multivariate Data
|
|
Notes:
|
|
|
|
***
|
|
|
|
### Moira Perceived Audience Size Colored by Age
|
|
Notes:
|
|
|
|
***
|
|
|
|
### Third Qualitative Variable
|
|
Notes:
|
|
|
|
```{r Third Qualitative Variable}
|
|
# Load the tab-separated pseudo-Facebook data set.
pf <- read.delim("pseudo_facebook.tsv")

# One row per (age, gender) pair with the mean and median friend count and the
# number of users in that cell; rows with a missing gender are dropped first.
pf.fc_by_age_gender <- pf %>%
  filter(!is.na(gender)) %>%
  group_by(age, gender) %>%
  summarise(
    mean_friend_count = mean(friend_count),
    median_friend_count = median(friend_count),
    n = n()
  ) %>%
  ungroup() %>%
  arrange(age)

head(pf.fc_by_age_gender)
|
|
```
|
|
|
|
***
|
|
|
|
### Plotting Conditional Summaries
|
|
Notes:
|
|
|
|
```{r Plotting Conditional Summaries}
|
|
# Median friend count against age, one coloured line per gender.
ggplot(
  pf.fc_by_age_gender,
  aes(x = age, y = median_friend_count, color = gender)
) +
  geom_line()
|
|
```
|
|
|
|
***
|
|
|
|
### Thinking in Ratios
|
|
Notes:
|
|
|
|
What is the ratio of friends for males vs. females?
|
|
|
|
### Wide and Long Format
|
|
Notes:
|
|
|
|
***
|
|
|
|
### Reshaping Data
|
|
Notes:
|
|
|
|
```{r}
|
|
# install.packages("reshape2")

# Long -> wide: one row per age, one column per gender level, each cell holding
# that group's median friend count.
pf.fc_by_age_gender.wide <- dcast(
  data = pf.fc_by_age_gender,
  formula = age ~ gender,
  value.var = "median_friend_count"
)

head(pf.fc_by_age_gender.wide)
|
|
```
|
|
|
|
|
|
***
|
|
|
|
### Ratio Plot
|
|
Notes:
|
|
|
|
```{r Ratio Plot}
|
|
# Ratio of the female to the male median friend count at each age.
ggplot(aes(x = age, y = female / male), data = pf.fc_by_age_gender.wide) +
  geom_line() +
  # Reference line at a ratio of 1 (equal medians). The intercept is passed as
  # a constant rather than inside aes(): mapping it would draw one line per
  # data row, and the stacked semi-transparent copies would cancel the alpha.
  geom_hline(yintercept = 1, alpha = 0.3, linetype = 2)
|
|
```
|
|
|
|
***
|
|
|
|
### Third Quantitative Variable
|
|
Notes:
|
|
|
|
```{r Third Quantitative Variable}
|
|
# Derive the year each user joined from their tenure in days, counting back
# from 2014 and rounding down to a whole year.
pf[["year_joined"]] <- floor(2014 - pf[["tenure"]] / 365)
head(pf)
|
|
```
|
|
|
|
***
|
|
|
|
### Cut a Variable
|
|
Notes:
|
|
|
|
```{r Cut a Variable}
|
|
# Bin the join year into four cohorts:
# (2004,2009], (2009,2011], (2011,2012], (2012,2014].
pf$year_joined.bucket <- cut(
  pf$year_joined,
  breaks = c(2004, 2009, 2011, 2012, 2014)
)
# Number of users in each cohort.
table(pf$year_joined.bucket)
|
|
```
|
|
|
|
***
|
|
|
|
### Plotting it All Together
|
|
Notes:
|
|
|
|
```{r Plotting it All Together}
|
|
# Median friend count by age, one coloured line per join-year cohort; users
# without a cohort are left out.
ggplot(aes(x = age, y = friend_count),
       data = subset(pf, !is.na(year_joined.bucket))) +
  # `fun` is the summary argument in ggplot2 >= 3.3.0. The older `fun.y` is
  # not recognised when passed through a geom there, so the layer would fall
  # back to the default summary (a mean) instead of the median.
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = median)
|
|
```
|
|
|
|
***
|
|
|
|
### Plot the Grand Mean
|
|
Notes:
|
|
|
|
```{r Plot the Grand Mean}
|
|
# Mean friend count by age: one coloured line per join-year cohort, plus a
# dashed line for the mean over all cohorts together (the grand mean).
# `fun` replaces the older `fun.y`, which ggplot2 >= 3.3.0 no longer recognises
# when it is passed through a geom.
ggplot(aes(x = age, y = friend_count),
       data = subset(pf, !is.na(year_joined.bucket))) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean) +
  geom_line(stat = "summary", fun = mean, linetype = 2)
|
|
|
|
```
|
|
|
|
***
|
|
|
|
### Friending Rate
|
|
Notes:
|
|
|
|
```{r Friending Rate}
|
|
# Five-number summary (plus mean) of friends per day of tenure, restricted to
# users with at least one day of tenure so the division is well defined.
pf %>%
  filter(tenure >= 1) %>%
  with(summary(friend_count / tenure))
|
|
```
|
|
|
|
***
|
|
|
|
### Friendships Initiated
|
|
Notes:
|
|
|
|
What is the median friend rate?
|
|
0.2205
|
|
|
|
What is the maximum friend rate?
|
|
417.0
|
|
|
|
```{r Friendships Initiated}
|
|
# Mean friendships initiated per day of tenure, plotted against tenure with one
# coloured line per join-year cohort (users with at least one day of tenure).
# `fun` replaces the older `fun.y`, which ggplot2 >= 3.3.0 no longer recognises
# when it is passed through a geom.
ggplot(aes(x = tenure, y = friendships_initiated / tenure),
       data = subset(pf, tenure >= 1)) +
  geom_line(aes(color = year_joined.bucket), stat = "summary", fun = mean)
|
|
```
|
|
|
|
***
|
|
|
|
### Bias-Variance Tradeoff Revisited
|
|
Notes:
|
|
|
|
```{r Bias-Variance Tradeoff Revisited}
|
|
|
|
#ggplot(aes(x = tenure, y = friendships_initiated / tenure),
|
|
# data = subset(pf, tenure >= 1)) +
|
|
# geom_line(aes(color = year_joined.bucket),
|
|
# stat = 'summary',
|
|
# fun.y = mean)
|
|
#
|
|
#ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure),
|
|
# data = subset(pf, tenure > 0)) +
|
|
# geom_line(aes(color = year_joined.bucket),
|
|
# stat = "summary",
|
|
# fun.y = mean)
|
|
#
|
|
#ggplot(aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure),
|
|
# data = subset(pf, tenure > 0)) +
|
|
# geom_line(aes(color = year_joined.bucket),
|
|
# stat = "summary",
|
|
# fun.y = mean)
|
|
#
|
|
#ggplot(aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure),
|
|
# data = subset(pf, tenure > 0)) +
|
|
# geom_line(aes(color = year_joined.bucket),
|
|
# stat = "summary",
|
|
# fun.y = mean)
|
|
|
|
# Smoothed friending rate (friendships initiated per day of tenure) against
# tenure, one smoother per join-year cohort.
ggplot(
  subset(pf, tenure >= 1),
  aes(
    x = tenure,
    y = friendships_initiated / tenure,
    color = year_joined.bucket
  )
) +
  geom_smooth()
|
|
|
|
```
|
|
|
|
***
|
|
|
|
### Sean's NFL Fan Sentiment Study
|
|
Notes:
|
|
|
|
***
|
|
|
|
### Introducing the Yogurt Data Set
|
|
Notes:
|
|
|
|
***
|
|
|
|
### Histograms Revisited
|
|
Notes:
|
|
|
|
```{r Histograms Revisited}
|
|
# Load the yogurt purchase data.
yo <- read.csv("yogurt.csv")

# Treat the id column as a categorical label rather than a number.
yo[["id"]] <- factor(yo[["id"]])
head(yo)

# Distribution of price, using geom_histogram()'s default binning.
ggplot(yo, aes(x = price)) +
  geom_histogram()
|
|
```
|
|
|
|
***
|
|
|
|
### Number of Purchases
|
|
Notes:
|
|
|
|
```{r Number of Purchases}
|
|
# Total purchases per row: the sum of the five flavour count columns.
yo$all.purchases <- with(
  yo,
  strawberry + blueberry + pina.colada + plain + mixed.berry
)
head(yo)
|
|
```
|
|
|
|
***
|
|
|
|
### Prices over Time
|
|
Notes:
|
|
|
|
```{r Prices over Time}
|
|
# Price against time; heavy transparency so dense regions stand out.
ggplot(yo, aes(x = time, y = price)) +
  geom_point(alpha = 0.1)

# Same scatter, with points coloured by the total number of purchases.
ggplot(yo, aes(x = time, y = price)) +
  geom_point(aes(color = all.purchases), alpha = 0.1)
|
|
```
|
|
|
|
***
|
|
|
|
### Sampling Observations
|
|
Notes:
|
|
|
|
***
|
|
|
|
### Looking at Samples of Households
|
|
|
|
```{r Looking at Sample of Households}
|
|
# Fixed seed so the same 16 ids are drawn on every knit.
set.seed(1)
sample.ids <- sample(levels(yo$id), size = 16)

# One panel per sampled id: price over time as a line, with open circles sized
# by the number of purchases on each row.
ggplot(yo[yo$id %in% sample.ids, ], aes(x = time, y = price)) +
  facet_wrap(~id) +
  geom_line() +
  geom_point(aes(size = all.purchases), shape = 1)
|
|
```
|
|
|
|
***
|
|
|
|
### The Limits of Cross Sectional Data
|
|
Notes:
|
|
|
|
***
|
|
|
|
### Many Variables
|
|
Notes:
|
|
|
|
***
|
|
|
|
### Scatterplot Matrix
|
|
Notes:
|
|
|
|
```{r}
|
|
# install.packages("GGally")
library(GGally)
# Set the default theme for plots drawn from here on: minimal, base size 20.
theme_set(theme_minimal(20))

# Fixed seed so the same 1000 rows are sampled on every knit.
set.seed(1836)
# Keep columns 2 through 15 of pf for the pairwise plots.
pf_subset <- pf[, 2:15]
names(pf_subset)
# Pairwise plot matrix on a random sample of 1000 rows.
ggpairs(pf_subset[sample.int(nrow(pf_subset), size = 1000), ])
|
|
```
|
|
|
|
### Even More Variables
|
|
Notes:
|
|
|
|
***
|
|
|
|
### Heat Maps
|
|
Notes:
|
|
|
|
```{r}
|
|
# Read the expression table and label its 64 columns 1..64.
nci <- read.table("nci.tsv")
colnames(nci) <- seq_len(64)
|
|
```
|
|
|
|
```{r}
|
|
# Melt the first 2000 rows into long form: one (row, column, value) triple per
# matrix cell, renamed to gene / case / value.
nci.long.samp <- melt(as.matrix(nci[seq_len(2000), ]))
colnames(nci.long.samp) <- c("gene", "case", "value")
head(nci.long.samp)

# Heat map: case on x, gene on y, fill running from blue to red in 100 steps.
ggplot(nci.long.samp, aes(x = case, y = gene, fill = value)) +
  geom_tile() +
  scale_fill_gradientn(colours = colorRampPalette(c("blue", "red"))(100))
|
|
```
|
|
|
|
|
|
***
|
|
|
|
### Analyzing Three or More Variables
|
|
Reflection:
|
|
|
|
***
|
|
|
|
Click **KnitHTML** to see all of your hard work and to have an html
|
|
page of this lesson, your answers, and your notes!
|
|
|