udacity_eda/lesson3/Problem Set.Rmd
2018-04-23 22:58:05 -08:00

186 lines
4.7 KiB
Plaintext

---
title: 'Problem Set: Explore One Variable'
author: "Dusty P"
date: "April 19, 2018"
output: html_document
---
```{r setup, include=FALSE}
# Show each chunk's source code in the knitted output by default.
knitr::opts_chunk$set(echo = TRUE)

# Attach ggplot2, then copy its `diamonds` data set into the workspace so the
# chunks below can refer to it directly.
library(ggplot2)
data(list = "diamonds")
```
## Diamonds Summary
```{r diamonds}
# Per-column summary of the diamonds data frame.
summary(object = diamonds)
```
## Histogram of Diamonds Prices
```{r prices}
# Histogram of price over the full data set, using bins 10 units wide.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 10)
```
## Diamond Counts
```{r counts}
# Count diamonds priced under 500, under 250, and at 15000 or more.
# Summing a logical vector counts its TRUE entries.
with(diamonds, sum(price < 500))
with(diamonds, sum(price < 250))
with(diamonds, sum(price >= 15000))
```
## Cheaper Diamonds
```{r cheaper}
# Price histogram restricted to the 300-1500 range, with an x-axis tick
# every 100 units.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 11) +
  scale_x_continuous(
    limits = c(300, 1500),
    breaks = seq(from = 300, to = 1500, by = 100)
  )
```
## Price by cut
```{r price_by_cut}
# One price histogram per cut; all panels share the same axes.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 10) +
  facet_wrap(facets = ~cut)
```
## Price by Cut Stats
```{r price_cut_stats}
# Summary statistics of price, computed separately for each cut.
by(data = diamonds$price, INDICES = diamonds$cut, FUN = summary)
```
## Scales and Multiple Histograms
```{r scales}
# Same faceted histogram as before, but each panel gets its own y-axis range
# so cuts with few diamonds remain readable.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 10) +
  facet_wrap(facets = ~cut, scales = "free_y")
```
## Price per Carat by Cut
```{r carat_by_cut}
# Price per carat, faceted by cut, on a log10 x axis with free y axes.
# The bin width of 0.05 is expressed in log10 units because of the log scale.
ggplot(data = diamonds, mapping = aes(x = price / carat)) +
  geom_histogram(binwidth = 0.05) +
  facet_wrap(facets = ~cut, scales = "free_y") +
  scale_x_log10()
```
## Price Box Plots
```{r price_box_plot}
# Box plot of price for each cut, coloured by cut.
# coord_cartesian() zooms the view to 0-7500 without discarding data, so the
# box statistics are still computed from all prices.
ggplot(data = diamonds, mapping = aes(x = cut, y = price, colour = cut)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 7500))
```
## Interquartile Range
```{r iqr}
# Summary statistics of price for each colour grade.
by(data = diamonds$price, INDICES = diamonds$color, FUN = summary)

# Interquartile range of price for colour grades D and J.
IQR(diamonds$price[which(diamonds$color == "D")])
IQR(diamonds$price[which(diamonds$color == "J")])
```
## Price per Carat Box Plots by Color
```{r price_carat_box}
# Box plot of price per carat for each colour grade, coloured by grade.
# The view is zoomed to 1000-6000; statistics still use every observation.
ggplot(
  data = diamonds,
  mapping = aes(x = color, y = price / carat, colour = color)
) +
  geom_boxplot() +
  coord_cartesian(ylim = c(1000, 6000))
```
## Carat Frequency Polygon
```{r carat_freq_poly}
# Frequency polygon of carat weight in bins of 0.01 carat.
# No colour aesthetic is mapped: carat is continuous and varies within the
# single group, so the binning stat cannot carry it through to the drawn line
# (recent ggplot2 versions warn that the aesthetic was dropped).
ggplot(data = diamonds, mapping = aes(x = carat)) +
  geom_freqpoly(binwidth = 0.01) +
  # Zoom the y axis to 0-5000 counts without removing any data.
  coord_cartesian(ylim = c(0, 5000)) +
  scale_x_continuous(breaks = seq(0, 5, 0.1))

# Exact count of diamonds at each distinct carat value.
table(diamonds$carat)
```
## Gapminder Data
```{r gapminer_data}
library(tidyr)
library(gridExtra)

# Read the fertility table. The code below relies on a `Total.fertility.rate`
# column holding the country name; every other column is gathered as a year.
births <- read.csv('total_fertility.csv')

# Reshape to long form: one row per (country, year) with the value in `cases`.
b_2000 <- gather(births, -Total.fertility.rate, key = 'year', value = 'cases')

# Build the fertility line chart for a single country.
#
# long_births:  long-form data with columns Total.fertility.rate, year, cases.
# country_name: value of Total.fertility.rate to plot; also used as the title.
#
# Returns a ggplot object. All countries share the same y-axis window
# (1.5 to 8) so the panels can be compared directly, and the x-axis ticks are
# suppressed because there is one discrete level per year column.
plot_country_fertility <- function(long_births, country_name) {
  # which() drops rows where the comparison is NA, matching subset().
  is_country <- which(long_births$Total.fertility.rate == country_name)
  country_rows <- long_births[is_country, ]

  # group = 1 joins all points into one line even though `year` is discrete.
  ggplot(country_rows, aes(x = year, y = cases, group = 1)) +
    geom_line() +
    labs(x = "Year", y = "Births per Woman") +
    ggtitle(country_name) +
    coord_cartesian(ylim = c(1.5, 8)) +
    scale_x_discrete(breaks = NULL)
}

g1 <- plot_country_fertility(b_2000, 'United States')
g2 <- plot_country_fertility(b_2000, 'Germany')
g3 <- plot_country_fertility(b_2000, 'United Kingdom')
g4 <- plot_country_fertility(b_2000, 'Spain')
g5 <- plot_country_fertility(b_2000, 'France')
g6 <- plot_country_fertility(b_2000, 'Canada')

# Lay the six country plots out on one page.
grid.arrange(g1, g2, g3, g4, g5, g6)
```
```{r fertility}
library(tidyr)
library(gridExtra)

births <- read.csv('total_fertility.csv')

# Reshape only the X1920 through X2000 columns into long form; the remaining
# year columns stay as ordinary columns and are not used below.
b_2000 <- gather(births, 'X1920':'X2000', key = 'year', value = 'cases')

# Both plots show the same country, so filter once.
us_births <- subset(b_2000, Total.fertility.rate == 'United States')

# Line chart of births per woman by year.
# group = 1 joins the points into a single line across the discrete x axis.
p1 <- ggplot(us_births, aes(x = year, y = cases, group = 1)) +
  geom_line() +
  labs(x = "Year", y = "Births per Woman") +
  ggtitle('United States') +
  theme(axis.text.x = element_text(size = 6, angle = 90))

# Bar chart of the same values.
# geom_col() draws bars whose heights are the y values themselves, which is
# what a histogram with stat = "identity" was standing in for. `cases` is kept
# numeric so bar height is proportional to the fertility rate; converting it
# to a factor would make the height reflect only the value's rank.
p2 <- ggplot(us_births, aes(x = year, y = cases)) +
  geom_col() +
  labs(x = "Year", y = "Births per Woman") +
  ggtitle('United States') +
  # Ticks every 0.5 keep a continuous axis legible in a half-height panel.
  scale_y_continuous(breaks = seq(1, 5, 0.5)) +
  theme(axis.text.x = element_text(size = 6, angle = 90))

# Stack the line chart above the bar chart.
grid.arrange(p1, p2)
```