Lesson 3 Problem Set Finished
This commit is contained in:
parent
e0855a30da
commit
56f9478edc
@ -111,75 +111,147 @@ table(diamonds$carat)
|
|||||||
births <- read.csv('total_fertility.csv')
|
births <- read.csv('total_fertility.csv')
|
||||||
library(tidyr)
|
library(tidyr)
|
||||||
library(gridExtra)
|
library(gridExtra)
|
||||||
b_2000 <- gather(births, -Total.fertility.rate, key = 'year', value = 'cases')
|
births <- t(births)
|
||||||
g1 <- ggplot(aes(x = year, y = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'United States')) +
|
|
||||||
geom_line() +
|
ggplot(aes(x = 'United States'), data = births)
|
||||||
labs(x = "Year",
|
|
||||||
y = "Births per Woman") +
|
|
||||||
ggtitle('United States') +
|
|
||||||
coord_cartesian(ylim = c(1.5, 8)) +
|
|
||||||
scale_x_discrete(breaks = NULL)
|
|
||||||
g2 <- ggplot(aes(x = year, y = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'Germany')) +
|
|
||||||
geom_line() +
|
|
||||||
labs(x = "Year",
|
|
||||||
y = "Births per Woman") +
|
|
||||||
ggtitle('Germany') +
|
|
||||||
coord_cartesian(ylim = c(1.5, 8)) +
|
|
||||||
scale_x_discrete(breaks = NULL)
|
|
||||||
g3 <- ggplot(aes(x = year, y = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'United Kingdom')) +
|
|
||||||
geom_line() +
|
|
||||||
labs(x = "Year",
|
|
||||||
y = "Births per Woman") +
|
|
||||||
ggtitle('United Kingdom') +
|
|
||||||
coord_cartesian(ylim = c(1.5, 8)) +
|
|
||||||
scale_x_discrete(breaks = NULL)
|
|
||||||
g4 <- ggplot(aes(x = year, y = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'Spain')) +
|
|
||||||
geom_line() +
|
|
||||||
labs(x = "Year",
|
|
||||||
y = "Births per Woman") +
|
|
||||||
ggtitle('Spain') +
|
|
||||||
coord_cartesian(ylim = c(1.5, 8)) +
|
|
||||||
scale_x_discrete(breaks = NULL)
|
|
||||||
g5 <- ggplot(aes(x = year, y = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'France')) +
|
|
||||||
geom_line() +
|
|
||||||
labs(x = "Year",
|
|
||||||
y = "Births per Woman") +
|
|
||||||
ggtitle('France') +
|
|
||||||
coord_cartesian(ylim = c(1.5, 8)) +
|
|
||||||
scale_x_discrete(breaks = NULL)
|
|
||||||
g6 <- ggplot(aes(factor(year), cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'Canada')) +
|
|
||||||
geom_line() +
|
|
||||||
labs(x = "Year",
|
|
||||||
y = "Births per Woman") +
|
|
||||||
ggtitle('Canada') +
|
|
||||||
coord_cartesian(ylim = c(1.5, 8)) +
|
|
||||||
scale_x_discrete(breaks = NULL)
|
|
||||||
grid.arrange(g1, g2, g3, g4, g5, g6)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
```{r fertility}
|
```{r fertility}
|
||||||
births <- read.csv('total_fertility.csv')
|
births <- read.csv('total_fertility.csv')
|
||||||
library(tidyr)
|
library(tidyr)
|
||||||
library(gridExtra)
|
library(gridExtra)
|
||||||
b_2000 <- gather(births, 'X1920':'X2000', key = 'year', value = 'cases')
|
#b_2000 <- gather(births, 'X1920':'X2000', key = 'year', value = 'cases')
|
||||||
|
b_2000 <- gather(births, -Total.fertility.rate, key = 'year', value = 'cases')
|
||||||
|
|
||||||
p1 = ggplot(aes(x = year, y = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'United States')) +
|
data = subset(b_2000, Total.fertility.rate == 'United States')
|
||||||
geom_line() +
|
|
||||||
labs(x = "Year",
|
p1 = ggplot(aes(x = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'United States')) +
|
||||||
y = "Births per Woman") +
|
geom_histogram(binwidth = 0.1) +
|
||||||
|
labs(y = "Years",
|
||||||
|
x = "Births per Woman") +
|
||||||
ggtitle('United States') +
|
ggtitle('United States') +
|
||||||
theme(axis.text.x= element_text(size = 6, angle = 90))
|
theme(axis.text.x= element_text(size = 6, angle = 90))
|
||||||
|
|
||||||
p2 = ggplot(aes(x = year, y = factor(cases), group = 1), data = subset(b_2000, Total.fertility.rate == 'United States')) +
|
p2 = ggplot(aes(x = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'United Kingdom')) +
|
||||||
geom_histogram(stat = 'identity') +
|
geom_histogram(binwidth = 0.1) +
|
||||||
labs(x = "Year",
|
labs(y = "Years",
|
||||||
y = "Births per Woman") +
|
x = "Births per Woman") +
|
||||||
ggtitle('United States') +
|
ggtitle('United Kingdom')
|
||||||
scale_y_discrete(breaks = seq(1, 5, .1)) +
|
scale_y_discrete(breaks = seq(1, 5, .1)) +
|
||||||
theme(axis.text.x= element_text(size = 6, angle = 90))
|
theme(axis.text.x= element_text(size = 6, angle = 90))
|
||||||
grid.arrange(p1, p2)
|
|
||||||
|
# Histogram of Brazil's fertility rate in 0.1-wide bins. b_2000 is presumably
# one row per country and year (verify against the gather() call above), so the
# count axis reads as "number of years".
p3 <- ggplot(aes(x = cases, group = 1),
             data = subset(b_2000, Total.fertility.rate == 'Brazil')) +
  geom_histogram(binwidth = 0.1) +
  labs(y = "Years",
       x = "Births per Woman") +
  # The trailing `+` matters: without it the plot ends at the title and the
  # remaining layers become a separate expression that is never attached.
  ggtitle('Brazil') +
  # No scale_y_discrete() here: the histogram's count axis is continuous, so
  # discrete breaks do not apply to it.
  theme(axis.text.x = element_text(size = 6, angle = 90))
|
||||||
|
|
||||||
|
# Histogram of India's fertility rate in 0.1-wide bins. b_2000 is presumably
# one row per country and year (verify against the gather() call above), so the
# count axis reads as "number of years".
p4 <- ggplot(aes(x = cases, group = 1),
             data = subset(b_2000, Total.fertility.rate == 'India')) +
  geom_histogram(binwidth = 0.1) +
  labs(y = "Years",
       x = "Births per Woman") +
  # The trailing `+` matters: without it the plot ends at the title and the
  # remaining layers become a separate expression that is never attached.
  ggtitle('India') +
  # No scale_y_discrete() here: the histogram's count axis is continuous, so
  # discrete breaks do not apply to it.
  theme(axis.text.x = element_text(size = 6, angle = 90))
|
||||||
|
|
||||||
|
# Histogram of the fertility rate over every row of b_2000 (all countries),
# in 0.1-wide bins, with x-axis ticks at each whole number from 0 to 9.
p5 <- ggplot(data = b_2000, mapping = aes(x = cases, group = 1)) +
  ggtitle('Global') +
  geom_histogram(binwidth = 0.1) +
  scale_x_continuous(breaks = seq(from = 0.0, to = 9.3, by = 1)) +
  labs(x = "Births per Woman",
       y = "Years")
|
||||||
|
|
||||||
|
# Histogram of the United States fertility rate in 0.1-wide bins, with small
# vertical x-axis tick labels.
p1 <- ggplot(
  data = subset(b_2000, Total.fertility.rate == 'United States'),
  mapping = aes(x = cases, group = 1)
) +
  ggtitle('United States') +
  geom_histogram(binwidth = 0.1) +
  labs(x = "Births per Woman",
       y = "Years") +
  theme(axis.text.x = element_text(size = 6, angle = 90))
|
||||||
|
|
||||||
|
grid.arrange(p1, p2, p3, p4, p5, ncol = 2)
|
||||||
|
|
||||||
|
summary(data['cases'])
|
||||||
|
summary(b_2000['cases'])
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
The data I chose was the number of births per woman by country and year. From the graphs it looks like
|
||||||
|
developing countries have a higher birth rate than developed countries such as
|
||||||
|
the US and UK. If you graph the data using the year as the x-axis and the number of births as the y-axis,
|
||||||
|
it becomes apparent that in developing countries such as Brazil and India, the birth
|
||||||
|
rate has dropped drastically over the last couple of decades as they have become more advanced.
|
||||||
|
|
||||||
|
Here are some basic statistics for the global data as well as the US:
|
||||||
|
|
||||||
|
US Global
|
||||||
|
Min. :1.740 Min. :0.840
|
||||||
|
1st Qu.:2.308 1st Qu.:4.620
|
||||||
|
Median :3.700 Median :5.900
|
||||||
|
Mean :4.033 Mean :5.397
|
||||||
|
3rd Qu.:5.562 3rd Qu.:6.580
|
||||||
|
Max. :7.030 Max. :9.220
|
||||||
|
NA's :12532
|
||||||
|
|
||||||
|
```{r birthdays}
|
||||||
|
library(lubridate)
|
||||||
|
library(gridExtra)
|
||||||
|
|
||||||
|
# Import the sample birthdays data. The code below reads a `dates` column of
# month/day/two-digit-year strings (format '%m/%d/%y').
birthdays <- read.csv('birthdaysExample.csv')

# Parse the date strings in their original row order, so that element i of
# `dates` describes row i of `birthdays`. Sorting the dates first would attach
# the months/days derived below to the wrong rows.
dates <- as.Date(birthdays$dates, format = '%m/%d/%y')

# Histogram of the number of birthdays on each calendar day. Plotting the
# parsed Date values (not the raw strings) puts the x-axis in chronological
# order and makes binwidth = 1 mean "one bar per day".
p1 <- ggplot(data.frame(dates = dates), aes(x = dates)) +
  geom_histogram(binwidth = 1)

# Extract the month and the day of the month into new columns
birthdays$months <- month(dates)
birthdays$days <- day(dates)
|
||||||
|
|
||||||
|
# Histogram of birthdays by month. binwidth = 1 gives each month its own bar;
# the default of 30 bins would spread the 12 integer months over
# fractional-width bins and leave empty gaps between the bars.
p2 <- ggplot(birthdays, aes(months)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = seq(1, 12, 1))
|
||||||
|
|
||||||
|
# Histogram of birthdays by day of the month. binwidth = 1 gives each of the
# 31 days its own bar. With the default of 30 bins the 31 distinct day values
# cannot each get a bin, so two neighbouring days are merged into one bar and
# show up as a spurious spike.
p3 <- ggplot(birthdays, aes(days)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = seq(1, 31, 1))
|
||||||
|
|
||||||
|
# Show all 3 histograms on the same image
|
||||||
|
grid.arrange(p1, p2, p3)
|
||||||
|
|
||||||
|
# Show basic statistics of the data including the Quartiles, Min, Max, Mean, and Median
|
||||||
|
summary(birthdays)
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
From this we can see that the days that have the most births
|
||||||
|
are Feb 6th, May 22nd, and July 16th, with 8 people each. But
|
||||||
|
as we can tell from the other distributions the number of
|
||||||
|
birthdays per month is fairly even. The mean is 6.474 which
|
||||||
|
is very close to halfway through the year. The median is
|
||||||
|
slightly higher at 7 which indicates that there are slightly
|
||||||
|
more birthdays in the latter half of the year. The quartiles
|
||||||
|
also indicate an even distribution with the 25% quartile at
|
||||||
|
month 3 and the 75% quartile at month 9.
|
||||||
|
|
||||||
|
Similarly the day of the month data shows a fairly even
|
||||||
|
distribution as well. But there is one notable anomaly.
|
||||||
|
The 15th day has a drastically higher number of birthdays
|
||||||
|
than any other day. Whether this is an error or not I can't
|
||||||
|
tell. The 31st has understandably fewer birthdays than the
|
||||||
|
other days since there are only 30 days in many months.
|
||||||
|
|
||||||
|
dates months days
|
||||||
|
2/6/14 : 8 Min. : 1.000 Min. : 1.0
|
||||||
|
5/22/14: 8 1st Qu.: 3.000 1st Qu.: 8.0
|
||||||
|
7/16/14: 8 Median : 7.000 Median :16.0
|
||||||
|
1/14/14: 7 Mean : 6.474 Mean :15.7
|
||||||
|
2/2/14 : 7 3rd Qu.: 9.000 3rd Qu.:23.0
|
||||||
|
2/23/14: 7 Max. :12.000 Max. :31.0
|
||||||
|
(Other):988
|
||||||
|
|||||||
1
lesson3/birthdaysExample.csv
Normal file
1
lesson3/birthdaysExample.csv
Normal file
File diff suppressed because one or more lines are too long
Loading…
x
Reference in New Issue
Block a user