---
title: 'Problem Set: Explore One Variable'
author: "Dusty P"
date: "April 19, 2018"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
# Load the diamonds data set shipped with ggplot2 into the session.
data(diamonds)
```

## Diamonds Summary

```{r diamonds}
# Per-column summary of the whole data set.
summary(diamonds)
```

## Histogram of Diamonds Prices

```{r prices}
# Distribution of price in bins of width 10.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 10)
```

## Diamond Counts

```{r counts}
# Number of diamonds priced below 500, below 250, and at or above 15000.
sum(diamonds$price < 500)
sum(diamonds$price < 250)
sum(diamonds$price >= 15000)
```

## Cheaper Diamonds

```{r cheaper}
# Same histogram with the x scale restricted to 300-1500 and a tick every 100.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 11) +
  scale_x_continuous(
    limits = c(300, 1500),
    breaks = seq(300, 1500, 100)
  )
```

## Price by Cut

```{r price_by_cut}
# One price histogram per cut; all panels share the same axes.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 10) +
  facet_wrap(~cut)
```

## Price by Cut Stats

```{r price_cut_stats}
# Summary statistics of price within each cut.
by(diamonds$price, diamonds$cut, summary)
```

## Scales and Multiple Histograms

```{r scales}
# As above, but each panel gets its own y axis so small groups stay readable.
ggplot(diamonds, aes(x = price)) +
  geom_histogram(binwidth = 10) +
  facet_wrap(~cut, scales = "free_y")
```

## Price per Carat by Cut

```{r carat_by_cut}
# Price per carat on a log10 x scale, one panel per cut with free y axes.
ggplot(diamonds, aes(x = price / carat)) +
  geom_histogram(binwidth = 0.05) +
  facet_wrap(~cut, scales = "free_y") +
  scale_x_log10()
```

## Price Box Plots

```{r price_box_plot}
# Price by cut, with the visible y range limited to 0-7500.
ggplot(diamonds, aes(x = cut, y = price, color = cut)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 7500))
```

## Interquartile Range

```{r iqr}
# Summary statistics of price within each color grade.
by(diamonds$price, diamonds$color, summary)

# Interquartile range of price for color grades D and J.
IQR(diamonds$price[diamonds$color == "D"])
IQR(diamonds$price[diamonds$color == "J"])
```

## Price per Carat Box Plots by Color

```{r price_carat_box}
# Price per carat by color grade, with the visible y range limited to 1000-6000.
ggplot(diamonds, aes(x = color, y = price / carat, color = color)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(1000, 6000))
```

## Carat Frequency Polygon

```{r carat_freq_poly}
# Frequency of each carat size in bins of 0.01, with a tick every 0.1 carat.
# carat is deliberately not mapped to colour: it is continuous and is binned
# as a single group, so a colour mapping cannot vary along the line and had no
# visible effect.
ggplot(diamonds, aes(x = carat)) +
  geom_freqpoly(binwidth = 0.01) +
  coord_cartesian(ylim = c(0, 5000)) +
  scale_x_continuous(breaks = seq(0, 5, 0.1))
table(diamonds$carat)
```

## Gapminder Data

```{r gapminer_data}
library(tidyr)
library(gridExtra)

# The first column, Total.fertility.rate, holds the country name; every other
# column is treated as one year's value.
births <- read.csv("total_fertility.csv")

# Reshape from one column per year to one row per country/year pair. `year`
# holds the original column names as character strings.
b_2000 <- pivot_longer(
  births,
  cols = -Total.fertility.rate,
  names_to = "year",
  values_to = "cases"
)

# Line chart of births per woman over time for a single country.
#
# data:    long-format fertility data with columns Total.fertility.rate,
#          year and cases.
# country: country name to match against Total.fertility.rate.
# Returns a ggplot object titled with the country name.
plot_fertility <- function(data, country) {
  # %in% never yields NA, so rows with a missing country name are excluded.
  country_rows <- data[data$Total.fertility.rate %in% country, ]

  # year is discrete, so group = 1 is needed to join the points into one line.
  ggplot(country_rows, aes(x = year, y = cases, group = 1)) +
    geom_line() +
    labs(title = country, x = "Year", y = "Births per Woman") +
    # Same y range on every panel so countries can be compared directly.
    coord_cartesian(ylim = c(1.5, 8)) +
    # One tick label per year column would be unreadable; suppress them.
    scale_x_discrete(breaks = NULL)
}

g1 <- plot_fertility(b_2000, "United States")
g2 <- plot_fertility(b_2000, "Germany")
g3 <- plot_fertility(b_2000, "United Kingdom")
g4 <- plot_fertility(b_2000, "Spain")
g5 <- plot_fertility(b_2000, "France")
g6 <- plot_fertility(b_2000, "Canada")

grid.arrange(g1, g2, g3, g4, g5, g6)
```