From 860c600a9d8f775adad0ed9b1d1c50c706976f21 Mon Sep 17 00:00:00 2001
From: "Dusty.P" <dustin@djpianalto.com>
Date: Sat, 2 Jun 2018 00:50:59 -0800
Subject: [PATCH] Final Project initial Univariate plots done

---
 EDA_Project/EDA_Project.rmd | 152 +++++++++++++++++++++++++++++++++---
 1 file changed, 143 insertions(+), 9 deletions(-)

diff --git a/EDA_Project/EDA_Project.rmd b/EDA_Project/EDA_Project.rmd
index 4f86189..5206130 100644
--- a/EDA_Project/EDA_Project.rmd
+++ b/EDA_Project/EDA_Project.rmd
@@ -29,21 +29,155 @@ This report explores a dataset containing chemical information and ratings on al
 ```{r echo=FALSE, message=FALSE, warning=FALSE, Load_the_Data}
 # Load the Data
 wqw <- read.csv('wineQualityWhites.csv')
+# The first column (X) only holds row numbers, so drop it.
+wqw <- subset(wqw, select = -X)
 ```
 
 # Univariate Plots Section
 
-> **Tip**: In this section, you should perform some preliminary exploration of
-your dataset. Run some summaries of the data and create univariate plots to
-understand the structure of the individual variables in your dataset. Don't
-forget to add a comment after each plot or closely-related group of plots!
-There should be multiple code chunks and text sections; the first one below is
-just to help you get started.
-
-```{r echo=FALSE, Univariate_Plots}
-
+```{r echo=FALSE, Data_Dimensions}
+dim(wqw)  # number of rows and columns
 ```
 
+```{r echo=FALSE, Data_Structure}
+str(wqw)  # compact overview of every column
+```
+
+```{r echo=FALSE, Data_Summary}
+summary(wqw)  # per-column summary statistics
+```
+
+Our data consists of almost 4,900 observations of 11 numerical variables and one integer attribute, which is the output.
+
+```{r echo=FALSE, quality_histogram}
+ggplot(data = wqw, mapping = aes(x = quality)) +  # histogram of quality
+  geom_histogram(binwidth = 1)
+```
+
+The distribution of quality seems fairly normal, with a peak at 6.
+
+```{r echo=FALSE, alcohol_histogram}
+ggplot(data = wqw, mapping = aes(x = alcohol)) +  # histogram of alcohol
+  geom_histogram(binwidth = 0.1)
+```
+
+Alcohol seems to be slightly long-tailed, so I want to see what it looks like with a log10 transformation.
+
+```{r echo=FALSE, alcohol_log10_histogram}
+ggplot(aes(x = alcohol), data = wqw) +
+  geom_histogram(binwidth = .005) +  # binwidth applies on the log10-transformed axis
+  scale_x_log10()
+```
+
+```{r echo=FALSE, fixed.acidity_histogram}
+ggplot(data = wqw, mapping = aes(x = fixed.acidity)) +  # full range, outliers included
+  geom_histogram(binwidth = 0.1)
+```
+
+The fixed.acidity definitely has some outliers, but besides that it has a pretty normal distribution.
+
+```{r echo=FALSE, fixed.acidity_summary}
+summary(wqw[["fixed.acidity"]])  # summary statistics for fixed.acidity
+```
+
+Most wines have a fixed acidity between 6.3 and 7.3.
+I am going to plot the data again, dropping both the highest and lowest 1% of values to remove the outliers.
+
+```{r echo=FALSE, fixed.acidity_trimmed_histogram}
+ggplot(aes(x = fixed.acidity), data = wqw) +
+  geom_histogram(binwidth = .1) +  # x axis limited to the 1st-99th percentile range
+  xlim(quantile(wqw$fixed.acidity, 0.01), quantile(wqw$fixed.acidity, 0.99))
+```
+
+And we see a fairly normal distribution with a peak around 6.8 which matches both the median (6.8) and mean (6.855) from the summary above.
+
+```{r echo=FALSE, volatile.acidity_histogram}
+ggplot(data = wqw, mapping = aes(x = volatile.acidity)) +  # untransformed scale
+  geom_histogram(binwidth = 0.01)
+```
+
+We have another long-tailed distribution. I am going to plot it again, this time with a log10 transformation.
+
+```{r echo=FALSE, volatile.acidity_log10_histogram}
+ggplot(aes(x = volatile.acidity), data = wqw) +
+  geom_histogram(binwidth = .04) +  # binwidth applies on the log10-transformed axis
+  scale_x_log10()
+```
+
+```{r echo=FALSE, citric.acid_histogram}
+ggplot(data = wqw, mapping = aes(x = citric.acid)) +
+  geom_histogram(binwidth = 0.01) +  # x axis limited to the 1st-99th percentile range
+  xlim(quantile(wqw$citric.acid, c(0.01, 0.99)))
+```
+
+There is an odd spike at about 0.49; I might want to look into that more later.
+
+```{r echo=FALSE, residual.sugar_histogram}
+ggplot(data = wqw, mapping = aes(x = residual.sugar)) +
+  geom_histogram(binwidth = 0.1) +  # x axis limited to the 1st-99th percentile range
+  xlim(quantile(wqw$residual.sugar, c(0.01, 0.99)))
+```
+
+Even with the top and bottom 1% removed, the plot is still very long-tailed.
+
+```{r echo=FALSE, residual.sugar_log10_histograms}
+p1 <- ggplot(aes(x = residual.sugar), data = wqw) +  # coarse bins on a log10 axis
+  geom_histogram(binwidth = .05) +
+  scale_x_log10()
+p2 <- ggplot(aes(x = residual.sugar), data = wqw) +  # fine bins on a log10 axis
+  geom_histogram(binwidth = .01) +
+  scale_x_log10(breaks = seq(2, 20, 2))  # start at 2: a break at 0 is undefined on a log axis
+grid.arrange(p1, p2)
+```
+
+```{r echo=FALSE, residual.sugar_summary}
+summary(wqw[["residual.sugar"]])  # summary statistics for residual.sugar
+```
+
+Using a log10 transform with a bin width of 0.05 suggests a bimodal distribution, but decreasing the bin width to 0.01 shows that, while there are a lot of observations between ~4 and ~20, they are much more spread out, whereas each individual value from ~0.5 to ~2 has more observations. The summary of the data shows that the median is 5.2 and the mean is 6.4, which puts both of them in between the two peaks.
+
+```{r echo=FALSE, chlorides_histogram}
+ggplot(data = wqw, mapping = aes(x = chlorides)) +
+  geom_histogram(binwidth = 0.001) +  # x axis runs from 0 to the 97th percentile
+  xlim(c(0, quantile(wqw$chlorides, 0.97)))
+```
+
+Here I just removed the top 3% of values to remove the long tail.
+
+```{r echo=FALSE, sulfur.dioxide_histograms}
+p1 <- ggplot(data = wqw, mapping = aes(x = free.sulfur.dioxide)) +
+  geom_histogram(binwidth = 1) +  # x axis runs from 0 to the 99th percentile
+  xlim(c(0, quantile(wqw$free.sulfur.dioxide, 0.99)))
+p2 <- ggplot(data = wqw, mapping = aes(x = total.sulfur.dioxide)) +
+  geom_histogram(binwidth = 1) +  # x axis limited to the 1st-99th percentile range
+  xlim(quantile(wqw$total.sulfur.dioxide, c(0.01, 0.99)))
+grid.arrange(p1, p2)  # draw both plots in one figure
+```
+
+I plotted the Free Sulphur Dioxide and Total Sulphur Dioxide together to save room and because they are related. Note the difference in scales on both axes.
+
+```{r echo=FALSE, density_histogram}
+ggplot(data = wqw, mapping = aes(x = density)) +
+  geom_histogram(binwidth = 0.0001) +  # x axis limited to the 1st-99th percentile range
+  xlim(quantile(wqw$density, c(0.01, 0.99)))
+```
+
+```{r echo=FALSE, pH_histogram}
+ggplot(data = wqw, mapping = aes(x = pH)) +  # full range, no trimming or transform
+  geom_histogram(binwidth = 0.01)
+```
+
+The pH plot doesn't need any modification.
+
+```{r echo=FALSE, sulphates_histogram}
+ggplot(data = wqw, mapping = aes(x = sulphates)) +  # full range, no trimming or transform
+  geom_histogram(binwidth = 0.01)
+```
+
+
+
+<!-- leftover scratch note (variables still to plot): density, pH, sulphates -->
+
 > **Tip**: Make sure that you leave a blank line between the start / end of
 each code block and the end / start of your Markdown text so that it is
 formatted nicely in the knitted text. Note as well that text on consecutive