udacity_eda/lesson3/Problem Set.Rmd

---
title: 'Problem Set: Explore One Variable'
author: "Dusty P"
date: "April 19, 2018"
output: html_document
---

```{r setup, include=FALSE}
# Show the source of every chunk in the knitted report.
knitr::opts_chunk$set(echo = TRUE)

# ggplot2 provides the plotting functions and ships the diamonds data set
# used throughout this problem set.
library(ggplot2)
data("diamonds", package = "ggplot2")
```

## Diamonds Summary

```{r diamonds}
# Per-column summary (quantiles for numeric columns, level counts for factors).
summary(object = diamonds)
```

## Histogram of Diamonds Prices

```{r prices}
# Distribution of price over all diamonds, using bins 10 price units wide.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 10)
```

## Diamond Counts

```{r counts}
# Number of diamonds below / at-or-above each price threshold.
# Summing a logical vector counts its TRUE entries.
with(diamonds, sum(price < 500))
with(diamonds, sum(price < 250))
with(diamonds, sum(price >= 15000))
```

## Cheaper Diamonds

```{r cheaper}
# Close-up of the low end of the price distribution.
# The x scale is limited to 300-1500 with a tick every 100.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 11) +
  scale_x_continuous(
    limits = c(300, 1500),
    breaks = seq(from = 300, to = 1500, by = 100)
  )
```

## Price by Cut

```{r price_by_cut}
# One price histogram per cut; all panels share the same axes.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 10) +
  facet_wrap(facets = ~ cut)
```

## Price by Cut Stats

```{r price_cut_stats}
# Summary statistics of price, computed separately for each cut.
by(data = diamonds$price, INDICES = diamonds$cut, FUN = summary)
```

## Scales and Multiple Histograms

```{r scales}
# Same faceted histogram as before, but each panel gets its own y axis so the
# shape of the less common cuts is not flattened by the most common one.
ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_histogram(binwidth = 10) +
  facet_wrap(facets = ~ cut, scales = "free_y")
```

## Price per Carat by Cut

```{r carat_by_cut}
# Price per carat, faceted by cut, on a log10 x axis.
# The binwidth of 0.05 is applied on the log10-transformed axis.
ggplot(data = diamonds, mapping = aes(x = price / carat)) +
  geom_histogram(binwidth = 0.05) +
  scale_x_log10() +
  facet_wrap(facets = ~ cut, scales = "free_y")
```

## Price Box Plots

```{r price_box_plot}
# Box plot of price for each cut, coloured by cut.
# coord_cartesian() only zooms the view to 0-7500; the box statistics are
# still computed from every diamond.
ggplot(data = diamonds, mapping = aes(x = cut, y = price, colour = cut)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 7500))
```

## Interquartile Range

```{r iqr}
# Summary statistics of price for each colour grade.
by(data = diamonds$price, INDICES = diamonds$color, FUN = summary)

# Interquartile range of price for colour grades D and J.
IQR(diamonds$price[diamonds$color == "D"])
IQR(diamonds$price[diamonds$color == "J"])
```

## Price per Carat Box Plots by Color

```{r price_carat_box}
# Box plot of price per carat for each colour grade, coloured by grade.
# The view is zoomed to 1000-6000 without discarding any observations.
ggplot(
  data = diamonds,
  mapping = aes(x = color, y = price / carat, colour = color)
) +
  geom_boxplot() +
  coord_cartesian(ylim = c(1000, 6000))
```

## Carat Frequency Polygon

```{r carat_freq_poly}
# Frequency polygon of carat weight in 0.01-carat bins.
# No colour mapping is used: mapping colour to the continuous `carat` variable
# has no effect on a binned line (the aesthetic is dropped by the binning
# stat) and only triggers a warning in recent ggplot2 releases.
ggplot(data = diamonds, mapping = aes(x = carat)) +
  geom_freqpoly(binwidth = 0.01) +
  # Zoom the count axis to 0-5000 without removing data.
  coord_cartesian(ylim = c(0, 5000)) +
  scale_x_continuous(breaks = seq(from = 0, to = 5, by = 0.1))

# Exact number of diamonds at each distinct carat value.
table(diamonds$carat)
```

## Gapminder Data

```{r gapminer_data}
# tidyr supplies gather(); gridExtra supplies grid.arrange().
library(tidyr)
library(gridExtra)

births <- read.csv('total_fertility.csv')

# Reshape to long form: every column except Total.fertility.rate is stacked
# into a (year, cases) pair. Total.fertility.rate is the column the plots
# below filter on by country name.
b_2000 <- gather(births, -Total.fertility.rate, key = "year", value = "cases")

# Build the fertility line chart for a single country.
#
# data:    long-form data frame with columns Total.fertility.rate, year, cases.
# country: value of Total.fertility.rate to keep; also used as the plot title.
# Returns a ggplot object.
plot_country_fertility <- function(data, country) {
  # which() drops rows where the comparison is NA, as subset() would.
  country_rows <- data[which(data$Total.fertility.rate == country), ]

  ggplot(country_rows, aes(x = year, y = cases, group = 1)) +
    geom_line() +
    labs(x = "Year", y = "Births per Woman") +
    ggtitle(country) +
    # Identical y window on every panel so the countries can be compared.
    coord_cartesian(ylim = c(1.5, 8)) +
    # Suppress the x-axis breaks and their labels.
    scale_x_discrete(breaks = NULL)
}

g1 <- plot_country_fertility(b_2000, "United States")
g2 <- plot_country_fertility(b_2000, "Germany")
g3 <- plot_country_fertility(b_2000, "United Kingdom")
g4 <- plot_country_fertility(b_2000, "Spain")
g5 <- plot_country_fertility(b_2000, "France")
g6 <- plot_country_fertility(b_2000, "Canada")

# Lay the six country panels out on one page.
grid.arrange(g1, g2, g3, g4, g5, g6)
```

```{r fertility}
# tidyr supplies gather(); gridExtra supplies grid.arrange().
library(tidyr)
library(gridExtra)

births <- read.csv('total_fertility.csv')

# Long form restricted to the columns X1920 through X2000; all other columns
# are carried along unchanged.
b_2000 <- gather(births, X1920:X2000, key = "year", value = "cases")

# Rows for the United States only (which() skips NA comparisons).
us_births <- b_2000[which(b_2000$Total.fertility.rate == "United States"), ]

# Line chart of the yearly values.
p1 <- ggplot(us_births, aes(x = year, y = cases, group = 1)) +
  geom_line() +
  labs(x = "Year", y = "Births per Woman") +
  ggtitle("United States") +
  theme(axis.text.x = element_text(size = 6, angle = 90))

# Bar chart of the same values.
# geom_col() draws bars whose height is the y value itself. `cases` is kept
# numeric: wrapping it in factor() would place bars at the rank of each
# distinct value instead of at the value, distorting the heights.
p2 <- ggplot(us_births, aes(x = year, y = cases)) +
  geom_col() +
  labs(x = "Year", y = "Births per Woman") +
  ggtitle("United States") +
  # Ticks every 0.5 keep the axis readable in the half-page panel.
  scale_y_continuous(breaks = seq(from = 0, to = 5, by = 0.5)) +
  theme(axis.text.x = element_text(size = 6, angle = 90))

# Stack the two views of the same series.
grid.arrange(p1, p2)
```