258 lines
7.4 KiB
Plaintext
258 lines
7.4 KiB
Plaintext
---
|
|
title: 'Problem Set: Explore One Variable'
|
|
author: "Dusty P"
|
|
date: "April 19, 2018"
|
|
output: html_document
|
|
---
|
|
|
|
```{r setup, include=FALSE}
|
|
knitr::opts_chunk$set(echo = TRUE)
|
|
library(ggplot2)
|
|
data(diamonds)
|
|
```
|
|
|
|
## Diamonds Summary
|
|
|
|
```{r diamonds}
|
|
summary(diamonds)
|
|
```
|
|
|
|
## Histogram of Diamonds Prices
|
|
|
|
```{r prices}
|
|
ggplot(aes(price), data = diamonds) +
|
|
geom_histogram(binwidth = 10)
|
|
```
|
|
|
|
## Diamond Counts
|
|
|
|
```{r counts}
|
|
sum(diamonds$price < 500)
|
|
sum(diamonds$price < 250)
|
|
sum(diamonds$price >= 15000)
|
|
```
|
|
|
|
## Cheaper Diamonds
|
|
|
|
```{r cheaper}
|
|
ggplot(aes(price), data = diamonds) +
|
|
geom_histogram(binwidth = 11) +
|
|
scale_x_continuous(limits = c(300, 1500), breaks = seq(300, 1500, 100))
|
|
```
|
|
|
|
## Price by cut
|
|
|
|
```{r price_by_cut}
|
|
ggplot(aes(price), data = diamonds) +
|
|
geom_histogram(binwidth = 10) +
|
|
facet_wrap(~cut)
|
|
```
|
|
|
|
## Price by Cut Stats
|
|
|
|
```{r price_cut_stats}
|
|
by(diamonds$price, diamonds$cut, summary)
|
|
```
|
|
|
|
## Scales and Multiple Histograms
|
|
|
|
```{r scales}
|
|
ggplot(aes(price), data = diamonds) +
|
|
geom_histogram(binwidth = 10) +
|
|
facet_wrap(~cut, scales = "free_y")
|
|
```
|
|
|
|
## Price per Carat by Cut
|
|
|
|
```{r carat_by_cut}
|
|
ggplot(aes(price/carat), data = diamonds) +
|
|
geom_histogram(binwidth = 0.05) +
|
|
facet_wrap(~cut, scales = "free_y") +
|
|
scale_x_log10()
|
|
```
|
|
|
|
## Price Box Plots
|
|
|
|
```{r price_box_plot}
|
|
ggplot(aes(y = price, x = cut, color = cut), data = diamonds) +
|
|
geom_boxplot() +
|
|
coord_cartesian(ylim = c(0, 7500))
|
|
```
|
|
|
|
## Interquartile Range
|
|
|
|
```{r iqr}
|
|
by(diamonds$price, diamonds$color, summary)
|
|
IQR(subset(diamonds, color == 'D')$price)
|
|
IQR(subset(diamonds, color == 'J')$price)
|
|
```
|
|
|
|
## Price per Carat Box Plots by Color
|
|
|
|
```{r price_carat_box}
|
|
ggplot(aes(y = price/carat, x = color, color = color), data = diamonds) +
|
|
geom_boxplot() +
|
|
coord_cartesian(ylim = c(1000, 6000))
|
|
```
|
|
|
|
## Carat Frequency Polygon
|
|
|
|
```{r carat_freq_poly}
|
|
ggplot(aes(x = carat, color = carat), data = diamonds) +
|
|
geom_freqpoly(binwidth = 0.01) +
|
|
coord_cartesian(ylim = c(0, 5000)) +
|
|
scale_x_continuous(breaks = seq(0, 5, 0.1))
|
|
table(diamonds$carat)
|
|
```
|
|
|
|
## Gapminer Data
|
|
|
|
```{r gapminer_data}
|
|
births <- read.csv('total_fertility.csv')
|
|
library(tidyr)
|
|
library(gridExtra)
|
|
births <- t(births)
|
|
|
|
ggplot(aes(x = 'United States'), data = births)
|
|
```
|
|
|
|
```{r fertility}
|
|
births <- read.csv('total_fertility.csv')
|
|
library(tidyr)
|
|
library(gridExtra)
|
|
#b_2000 <- gather(births, 'X1920':'X2000', key = 'year', value = 'cases')
|
|
b_2000 <- gather(births, -Total.fertility.rate, key = 'year', value = 'cases')
|
|
|
|
data = subset(b_2000, Total.fertility.rate == 'United States')
|
|
|
|
p1 = ggplot(aes(x = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'United States')) +
|
|
geom_histogram(binwidth = 0.1) +
|
|
labs(y = "Years",
|
|
x = "Births per Woman") +
|
|
ggtitle('United States') +
|
|
theme(axis.text.x= element_text(size = 6, angle = 90))
|
|
|
|
p2 = ggplot(aes(x = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'United Kingdom')) +
|
|
geom_histogram(binwidth = 0.1) +
|
|
labs(y = "Years",
|
|
x = "Births per Woman") +
|
|
ggtitle('United Kingdom')
|
|
scale_y_discrete(breaks = seq(1, 5, .1)) +
|
|
theme(axis.text.x= element_text(size = 6, angle = 90))
|
|
|
|
p3 = ggplot(aes(x = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'Brazil')) +
|
|
geom_histogram(binwidth = 0.1) +
|
|
labs(y = "Years",
|
|
x = "Births per Woman") +
|
|
ggtitle('Brazil')
|
|
scale_y_discrete(breaks = seq(1, 5, .1)) +
|
|
theme(axis.text.x= element_text(size = 6, angle = 90))
|
|
|
|
p4 = ggplot(aes(x = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'India')) +
|
|
geom_histogram(binwidth = 0.1) +
|
|
labs(y = "Years",
|
|
x = "Births per Woman") +
|
|
ggtitle('India')
|
|
scale_y_discrete(breaks = seq(1, 5, .1)) +
|
|
theme(axis.text.x= element_text(size = 6, angle = 90))
|
|
|
|
p5 = ggplot(aes(x = cases, group = 1), data = b_2000) +
|
|
geom_histogram(binwidth = 0.1) +
|
|
labs(y = "Years",
|
|
x = "Births per Woman") +
|
|
ggtitle('Global') +
|
|
scale_x_continuous(breaks = seq(0.0, 9.3, 1))
|
|
|
|
p1 = ggplot(aes(x = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'United States')) +
|
|
geom_histogram(binwidth = 0.1) +
|
|
labs(y = "Years",
|
|
x = "Births per Woman") +
|
|
ggtitle('United States') +
|
|
theme(axis.text.x= element_text(size = 6, angle = 90))
|
|
|
|
grid.arrange(p1, p2, p3, p4, p5, ncol = 2)
|
|
|
|
summary(data['cases'])
|
|
summary(b_2000['cases'])
|
|
```
|
|
|
|
|
|
The data I chose was the number of births per woman by country and year. From the Graphs it looks like
|
|
the more third world or developing countries have a higher birth rate than first world countries such as
|
|
the US and UK. If you graph the data using the year as the x axis and the number of births as the y axis
|
|
it becomes apparent that in countries such as Brazil and India which are developing countries the birth
|
|
rate has dropped drastically in the last couple decades as they are becoming more advanced.
|
|
|
|
Here are some basic statistics for the global data as well as the US
|
|
|
|
US Global
|
|
Min. :1.740 Min. :0.840
|
|
1st Qu.:2.308 1st Qu.:4.620
|
|
Median :3.700 Median :5.900
|
|
Mean :4.033 Mean :5.397
|
|
3rd Qu.:5.562 3rd Qu.:6.580
|
|
Max. :7.030 Max. :9.220
|
|
NA's :12532
|
|
|
|
```{r birthdays}
|
|
library(lubridate)
|
|
library(gridExtra)
|
|
|
|
# Import Sample Birthdays data
|
|
birthdays <- read.csv('birthdaysExample.csv')
|
|
|
|
# Convert the data frame into datetime objects sorted by date
|
|
dates <- strptime(birthdays$dates[order(as.Date(birthdays$dates, format = '%m/%d/%y'))], '%m/%d/%y')
|
|
|
|
# Create a histogram showing the amount of birthdays for every day in the dataset
|
|
p1 = ggplot(birthdays, aes(x = dates)) +
|
|
geom_histogram(stat='count', binwidth = 1)
|
|
|
|
# Extract the Months and Days from the dates into new columns
|
|
birthdays$months <- month(dates)
|
|
birthdays$days <- day(dates)
|
|
|
|
# Create a histogram showing the distribution of birthdays by month
|
|
p2 <- ggplot(birthdays, aes(months)) +
|
|
geom_histogram() +
|
|
scale_x_continuous(breaks = seq(1, 12, 1))
|
|
|
|
# Create a historgram showing the distribution of birthdays by day of the month.
|
|
p3 <- ggplot(birthdays, aes(days)) +
|
|
geom_histogram() +
|
|
scale_x_continuous(breaks = seq(1, 31, 1))
|
|
|
|
# Show all 3 histograms on the same image
|
|
grid.arrange(p1, p2, p3)
|
|
|
|
# Show basic statistics of the data including the Quartiles, Min, Max, Mean, and Median
|
|
summary(birthdays)
|
|
|
|
```
|
|
|
|
From this we can see that the days that have the most births
|
|
are Feb 6th,May 22nd, and July 16th with 8 people each. But
|
|
as we can tell from the other distributions the number of
|
|
birthdays per month is fairly even. The mean is 6.474 which
|
|
is very close to half way through they year. The Median is
|
|
slightly higher at 7 which indicates that there are slightly
|
|
more birthdays in the latter half of the year. The quartiles
|
|
also indicate an even distribution with the 25% quartile at
|
|
month 3 and the 75% quartile at month 9.
|
|
|
|
Similarly the day of the month data shows a fairly even
|
|
distribution as well. But there is one notible anomaly.
|
|
The 15th day has a drastically higher number of birthdays
|
|
than any other day. Whether this is an error or not I can't
|
|
tell. The 31st has understandibly fewer birthdays than the
|
|
other days since there are only 30 days in many months.
|
|
|
|
dates months days
|
|
2/6/14 : 8 Min. : 1.000 Min. : 1.0
|
|
5/22/14: 8 1st Qu.: 3.000 1st Qu.: 8.0
|
|
7/16/14: 8 Median : 7.000 Median :16.0
|
|
1/14/14: 7 Mean : 6.474 Mean :15.7
|
|
2/2/14 : 7 3rd Qu.: 9.000 3rd Qu.:23.0
|
|
2/23/14: 7 Max. :12.000 Max. :31.0
|
|
(Other):988
|