186 lines
4.7 KiB
Plaintext
186 lines
4.7 KiB
Plaintext
---
|
|
title: 'Problem Set: Explore One Variable'
|
|
author: "Dusty P"
|
|
date: "April 19, 2018"
|
|
output: html_document
|
|
---
|
|
|
|
```{r setup, include=FALSE}
|
|
# Show chunk source alongside its output in the knitted document.
knitr::opts_chunk$set(echo = TRUE)

# ggplot2 provides the plotting functions and ships the diamonds data set;
# data() copies that data set into the workspace.
library(ggplot2)
data("diamonds", package = "ggplot2")
|
|
```
|
|
|
|
## Diamonds Summary
|
|
|
|
```{r diamonds}
|
|
# Print a per-column summary of the diamonds data set loaded in the setup chunk.
summary(diamonds)
|
|
```
|
|
|
|
## Histogram of Diamond Prices
|
|
|
|
```{r prices}
|
|
# Histogram of diamond prices using bins 10 price units wide.
ggplot(diamonds) +
  geom_histogram(aes(x = price), binwidth = 10)
|
|
```
|
|
|
|
## Diamond Counts
|
|
|
|
```{r counts}
|
|
# Count diamonds below / at-or-above selected price thresholds.
# Summing a logical vector counts its TRUE entries.
with(diamonds, sum(price < 500))
with(diamonds, sum(price < 250))
with(diamonds, sum(price >= 15000))
|
|
```
|
|
|
|
## Cheaper Diamonds
|
|
|
|
```{r cheaper}
|
|
# Histogram restricted to the 300-1500 price range, with an x-axis tick
# every 100 price units.
ggplot(diamonds, aes(x = price)) +
  scale_x_continuous(
    limits = c(300, 1500),
    breaks = seq(from = 300, to = 1500, by = 100)
  ) +
  geom_histogram(binwidth = 11)
|
|
```
|
|
|
|
## Price by Cut
|
|
|
|
```{r price_by_cut}
|
|
# One price histogram per cut, all panels sharing the same axes.
ggplot(diamonds, aes(x = price)) +
  facet_wrap(~ cut) +
  geom_histogram(binwidth = 10)
|
|
```
|
|
|
|
## Price by Cut Stats
|
|
|
|
```{r price_cut_stats}
|
|
# Apply summary() to the price values separately for each level of cut.
by(diamonds$price, diamonds$cut, summary)
|
|
```
|
|
|
|
## Scales and Multiple Histograms
|
|
|
|
```{r scales}
|
|
# Price histogram per cut; each panel gets its own y-axis range so that
# cuts with few diamonds are not flattened by the larger groups.
ggplot(diamonds, aes(x = price)) +
  facet_wrap(~ cut, scales = "free_y") +
  geom_histogram(binwidth = 10)
|
|
```
|
|
|
|
## Price per Carat by Cut
|
|
|
|
```{r carat_by_cut}
|
|
# Price-per-carat histogram for each cut on a log10 x-axis.
# The binwidth is applied on the transformed (log10) scale.
ggplot(diamonds, aes(x = price / carat)) +
  scale_x_log10() +
  geom_histogram(binwidth = 0.05) +
  facet_wrap(~ cut, scales = "free_y")
|
|
```
|
|
|
|
## Price Box Plots
|
|
|
|
```{r price_box_plot}
|
|
# Box plot of price for each cut, coloured by cut.
# coord_cartesian() zooms the y-axis without discarding data, so the box
# statistics are still computed from every diamond.
ggplot(diamonds, aes(x = cut, y = price, colour = cut)) +
  coord_cartesian(ylim = c(0, 7500)) +
  geom_boxplot()
|
|
```
|
|
|
|
## Interquartile Range
|
|
|
|
```{r iqr}
|
|
# Price summary for every colour grade.
by(diamonds$price, diamonds$color, summary)

# Interquartile range of price for the D and J colour grades.
IQR(diamonds$price[diamonds$color == "D"])
IQR(diamonds$price[diamonds$color == "J"])
|
|
```
|
|
|
|
## Price per Carat Box Plots by Color
|
|
|
|
```{r price_carat_box}
|
|
# Box plot of price per carat for each colour grade, coloured by grade.
# The y-axis is zoomed to 1000-6000 without dropping any observations.
ggplot(diamonds, aes(x = color, y = price / carat, colour = color)) +
  coord_cartesian(ylim = c(1000, 6000)) +
  geom_boxplot()
|
|
```
|
|
|
|
## Carat Frequency Polygon
|
|
|
|
```{r carat_freq_poly}
|
|
# Frequency polygon of carat weight using 0.01-wide bins.
# No colour aesthetic is mapped: carat is the variable being binned, so a
# per-row colour value cannot be carried through the binning step and would
# only be dropped.
ggplot(diamonds, aes(x = carat)) +
  geom_freqpoly(binwidth = 0.01) +
  # Zoom the count axis without removing data.
  coord_cartesian(ylim = c(0, 5000)) +
  # One x-axis tick every 0.1 carat.
  scale_x_continuous(breaks = seq(0, 5, 0.1))

# Exact number of diamonds at each distinct carat value.
table(diamonds$carat)
|
|
```
|
|
|
|
## Gapminder Data
|
|
|
|
```{r gapminer_data}
|
|
# Reshaping (tidyr) and multi-plot layout (gridExtra) helpers.
library(tidyr)
library(gridExtra)

# Wide table read from a local CSV. The Total.fertility.rate column holds the
# country name; every other column is treated as one year of data.
births <- read.csv("total_fertility.csv")

# Long form: one row per country/year pair, with the value in `cases`.
b_2000 <- pivot_longer(
  births,
  cols = -Total.fertility.rate,
  names_to = "year",
  values_to = "cases"
)

# Build the fertility line plot for a single country.
#
# country: value of Total.fertility.rate to select; also used as the title.
# data:    long-format table with Total.fertility.rate, year and cases.
# Returns a ggplot object.
plot_fertility <- function(country, data = b_2000) {
  # %in% never yields NA, so rows with a missing country name are excluded.
  country_rows <- data[data$Total.fertility.rate %in% country, ]

  ggplot(country_rows, aes(x = year, y = cases, group = 1)) +
    geom_line() +
    labs(x = "Year", y = "Births per Woman") +
    ggtitle(country) +
    # Same y range in every panel so the countries can be compared directly.
    coord_cartesian(ylim = c(1.5, 8)) +
    # Suppress the x-axis breaks and their labels.
    scale_x_discrete(breaks = NULL)
}

g1 <- plot_fertility("United States")
g2 <- plot_fertility("Germany")
g3 <- plot_fertility("United Kingdom")
g4 <- plot_fertility("Spain")
g5 <- plot_fertility("France")
g6 <- plot_fertility("Canada")

# Lay the six country plots out in a single grid.
grid.arrange(g1, g2, g3, g4, g5, g6)
|
|
```
|
|
|
|
```{r fertility}
|
|
# Reshaping (tidyr) and multi-plot layout (gridExtra) helpers.
library(tidyr)
library(gridExtra)

# Wide table read from a local CSV; Total.fertility.rate holds the country name.
births <- read.csv("total_fertility.csv")

# Reshape only the X1920 through X2000 columns to long form: the column name
# goes to `year` and the value to `cases`.
b_2000 <- pivot_longer(
  births,
  cols = X1920:X2000,
  names_to = "year",
  values_to = "cases"
)

# Rows for the United States only; %in% never yields NA, so rows with a
# missing country name are excluded.
us_fertility <- b_2000[b_2000$Total.fertility.rate %in% "United States", ]

# Rotated, small x-axis labels so every year label fits under the plot.
year_label_theme <- theme(axis.text.x = element_text(size = 6, angle = 90))

# Line chart of the fertility rate per year.
p1 <- ggplot(us_fertility, aes(x = year, y = cases, group = 1)) +
  geom_line() +
  labs(x = "Year", y = "Births per Woman") +
  ggtitle("United States") +
  year_label_theme

# Bar chart of the same series. `cases` stays numeric so that bar heights are
# proportional to the value; geom_col() draws bars from precomputed heights.
p2 <- ggplot(us_fertility, aes(x = year, y = cases)) +
  geom_col() +
  labs(x = "Year", y = "Births per Woman") +
  ggtitle("United States") +
  scale_y_continuous(breaks = seq(0, 5, 0.5)) +
  year_label_theme

# Stack the two views vertically.
grid.arrange(p1, p2)
|
|
```
|
|
|
|
|
|
|
|
|