Lesson 3 Problem Set Finished

2018-04-24 22:58:37 -08:00 · 2018-04-24 22:58:37 -08:00 · 56f9478edc
commit 56f9478edc
parent e0855a30da
2 changed files with 128 additions and 55 deletions
--- a/lesson3/Problem
+++ b/lesson3/Problem
@ -111,75 +111,147 @@ table(diamonds$carat)
 births <- read.csv('total_fertility.csv')
 library(tidyr)
 library(gridExtra)
-b_2000 <- gather(births, -Total.fertility.rate, key = 'year', value = 'cases')
+births <- t(births)
-g1 <- ggplot(aes(x = year, y = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'United States')) +
+
-  geom_line() +
+ggplot(aes(x = 'United States'), data = births)
  labs(x = "Year",
       y = "Births per Woman") +
  ggtitle('United States') +
  coord_cartesian(ylim = c(1.5, 8)) +
  scale_x_discrete(breaks = NULL)
 g2 <- ggplot(aes(x = year, y = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'Germany')) +
  geom_line() +
  labs(x = "Year",
       y = "Births per Woman") +
  ggtitle('Germany') +
  coord_cartesian(ylim = c(1.5, 8)) +
  scale_x_discrete(breaks = NULL)
 g3 <- ggplot(aes(x = year, y = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'United Kingdom')) +
  geom_line() +
  labs(x = "Year",
       y = "Births per Woman") +
  ggtitle('United Kingdom') +
  coord_cartesian(ylim = c(1.5, 8)) +
  scale_x_discrete(breaks = NULL)
 g4 <- ggplot(aes(x = year, y = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'Spain')) +
  geom_line() +
  labs(x = "Year",
       y = "Births per Woman") +
  ggtitle('Spain') +
  coord_cartesian(ylim = c(1.5, 8)) +
  scale_x_discrete(breaks = NULL)
 g5 <- ggplot(aes(x = year, y = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'France')) +
  geom_line() +
  labs(x = "Year",
       y = "Births per Woman") +
  ggtitle('France') +
  coord_cartesian(ylim = c(1.5, 8)) +
  scale_x_discrete(breaks = NULL)
 g6 <- ggplot(aes(factor(year), cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'Canada')) +
  geom_line() +
  labs(x = "Year",
       y = "Births per Woman") +
  ggtitle('Canada') +
  coord_cartesian(ylim = c(1.5, 8)) +
  scale_x_discrete(breaks = NULL)
 grid.arrange(g1, g2, g3, g4, g5, g6)
 ```
 ```{r fertility}
 births <- read.csv('total_fertility.csv')
 library(tidyr)
 library(gridExtra)
-b_2000 <- gather(births, 'X1920':'X2000', key = 'year', value = 'cases')
+#b_2000 <- gather(births, 'X1920':'X2000', key = 'year', value = 'cases')
 b_2000 <- gather(births, -Total.fertility.rate, key = 'year', value = 'cases')
-p1 = ggplot(aes(x = year, y = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'United States')) +
+data = subset(b_2000, Total.fertility.rate == 'United States')
-  geom_line() +
+
-  labs(x = "Year",
+p1 = ggplot(aes(x = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'United States')) +
-       y = "Births per Woman") +
+  geom_histogram(binwidth = 0.1) +
  labs(y = "Years",
       x = "Births per Woman") +
  ggtitle('United States') +
  theme(axis.text.x= element_text(size = 6, angle = 90))
-p2 = ggplot(aes(x = year, y = factor(cases), group = 1), data = subset(b_2000, Total.fertility.rate == 'United States')) +
+p2 = ggplot(aes(x = cases, group = 1), data = subset(b_2000, Total.fertility.rate == 'United Kingdom')) +
-  geom_histogram(stat = 'identity') +
+  geom_histogram(binwidth = 0.1) +
-  labs(x = "Year",
+  labs(y = "Years",
-       y = "Births per Woman") +
+       x = "Births per Woman") +
-  ggtitle('United States') +
+  ggtitle('United Kingdom')
  scale_y_discrete(breaks = seq(1, 5, .1)) +
  theme(axis.text.x= element_text(size = 6, angle = 90))
-grid.arrange(p1, p2)
+
 # Histogram of the yearly fertility values (births per woman) for Brazil.
 # Every layer has to be chained with `+`: without it after ggtitle(), the
 # trailing scale/theme lines become a separate expression and never reach
 # the plot. The discrete y scale is dropped because a histogram's count
 # axis is continuous; what remains matches the layers used for p1.
 p3 <- ggplot(subset(b_2000, Total.fertility.rate == 'Brazil'),
              aes(x = cases, group = 1)) +
   geom_histogram(binwidth = 0.1) +
   labs(y = "Years",
        x = "Births per Woman") +
   ggtitle('Brazil') +
   theme(axis.text.x = element_text(size = 6, angle = 90))
 # Histogram of the yearly fertility values (births per woman) for India.
 # Every layer has to be chained with `+`: without it after ggtitle(), the
 # trailing scale/theme lines become a separate expression and never reach
 # the plot. The discrete y scale is dropped because a histogram's count
 # axis is continuous; what remains matches the layers used for p1.
 p4 <- ggplot(subset(b_2000, Total.fertility.rate == 'India'),
              aes(x = cases, group = 1)) +
   geom_histogram(binwidth = 0.1) +
   labs(y = "Years",
        x = "Births per Woman") +
   ggtitle('India') +
   theme(axis.text.x = element_text(size = 6, angle = 90))
 # Histogram of `cases` over the whole of b_2000 (no country filter),
 # in 0.1-wide bins, with x-axis ticks at every whole number.
 p5 <- ggplot(b_2000, aes(x = cases, group = 1)) +
   geom_histogram(binwidth = 0.1) +
   scale_x_continuous(breaks = seq(0.0, 9.3, 1)) +
   ggtitle('Global') +
   labs(x = "Births per Woman", y = "Years")
 # United States histogram of `cases` in 0.1-wide bins, with small,
 # rotated x-axis tick labels.
 p1 <- ggplot(subset(b_2000, Total.fertility.rate == 'United States'),
              aes(x = cases, group = 1)) +
   geom_histogram(binwidth = 0.1) +
   theme(axis.text.x = element_text(size = 6, angle = 90)) +
   ggtitle('United States') +
   labs(x = "Births per Woman", y = "Years")
 # Lay the five plots out in a two-column grid.
 grid.arrange(p1, p2, p3, p4, p5, ncol = 2)
 # Summary statistics of `cases`: first for `data` (the United States subset
 # assigned earlier in this chunk), then for the full b_2000 table.
 summary(data['cases'])
 summary(b_2000['cases'])
 ```
 The data I chose was the number of births per woman by country and year. From the graphs, it looks like
 developing countries have a higher birth rate than developed countries such as
 the US and UK. If you graph the data with the year on the x-axis and the number of births on the y-axis,
 it becomes apparent that in developing countries such as Brazil and India the birth
 rate has dropped drastically over the last couple of decades as they have become more developed.
 Here are some basic statistics for the global data as well as for the US:
  US                 Global      
 Min.   :1.740     Min.   :0.840  
 1st Qu.:2.308     1st Qu.:4.620  
 Median :3.700     Median :5.900  
 Mean   :4.033     Mean   :5.397 
 3rd Qu.:5.562     3rd Qu.:6.580  
 Max.   :7.030     Max.   :9.220  
                  NA's   :12532 
 ```{r birthdays}
 library(lubridate)
 library(gridExtra)
 # Import Sample Birthdays data
 birthdays <- read.csv('birthdaysExample.csv')
 # Parse the date strings in their original row order, so that every value
 # derived below stays aligned with the row it came from
 birth_dates <- as.Date(birthdays$dates, format = '%m/%d/%y')
 # Chronologically sorted date-time objects (same value this chunk has always
 # assigned to `dates`; no longer used for the per-row columns below)
 dates <- strptime(birthdays$dates[order(birth_dates)], '%m/%d/%y')
 # Create a histogram showing the amount of birthdays for every day in the dataset.
 # Plotting parsed Date values with one-day bins keeps the x axis in calendar
 # order instead of the order of the raw date strings
 p1 <- ggplot(data.frame(dates = birth_dates), aes(x = dates)) +
   geom_histogram(binwidth = 1)
 # Extract the Months and Days from the row-aligned dates into new columns
 birthdays$months <- month(birth_dates)
 birthdays$days <- day(birth_dates)
 # Create a histogram showing the distribution of birthdays by month.
 # binwidth = 1 gives every month exactly one bar
 p2 <- ggplot(birthdays, aes(months)) +
   geom_histogram(binwidth = 1) +
   scale_x_continuous(breaks = seq(1, 12, 1))
 # Create a histogram showing the distribution of birthdays by day of the month.
 # binwidth = 1 gives every day exactly one bar; the default of 30 bins cannot
 # keep 31 distinct day values apart
 p3 <- ggplot(birthdays, aes(days)) +
   geom_histogram(binwidth = 1) +
   scale_x_continuous(breaks = seq(1, 31, 1))
 # Show all 3 histograms on the same image
 grid.arrange(p1, p2, p3)
 # Show basic statistics of the data including the Quartiles, Min, Max, Mean, and Median
 summary(birthdays)
 ```
 From this we can see that the days with the most births
 are Feb 6th, May 22nd, and July 16th, with 8 people each. But
 as we can tell from the other distributions, the number of
 birthdays per month is fairly even. The mean month is 6.474, which
 is very close to halfway through the year. The median is
 slightly higher at 7, which indicates that there are slightly
 more birthdays in the latter half of the year. The quartiles
 also indicate an even distribution, with the 25% quartile at
 month 3 and the 75% quartile at month 9.
 Similarly, the day-of-the-month data shows a fairly even
 distribution as well. But there is one notable anomaly:
 the 15th day has a drastically higher number of birthdays
 than any other day. Whether this is an error or not I can't
 tell (it may be an artifact of the histogram's default 30 bins
 merging two adjacent days into one bar). The 31st has understandably fewer birthdays than the
 other days, since there are only 30 days in many months.
  dates         months            days     
 2/6/14 :  8   Min.   : 1.000   Min.   : 1.0  
 5/22/14:  8   1st Qu.: 3.000   1st Qu.: 8.0  
 7/16/14:  8   Median : 7.000   Median :16.0  
 1/14/14:  7   Mean   : 6.474   Mean   :15.7  
 2/2/14 :  7   3rd Qu.: 9.000   3rd Qu.:23.0  
 2/23/14:  7   Max.   :12.000   Max.   :31.0  
 (Other):988                                  
--- a/lesson3/birthdaysExample.csv
+++ b/lesson3/birthdaysExample.csv