821 lines
32 KiB
TeX
821 lines
32 KiB
TeX
\documentclass[]{article}
|
|
\usepackage{lmodern}
|
|
\usepackage{amssymb,amsmath}
|
|
\usepackage{ifxetex,ifluatex}
|
|
\usepackage{fixltx2e} % provides \textsubscript
|
|
\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex
|
|
\usepackage[T1]{fontenc}
|
|
\usepackage[utf8]{inputenc}
|
|
\else % if luatex or xelatex
|
|
\ifxetex
|
|
\usepackage{mathspec}
|
|
\else
|
|
\usepackage{fontspec}
|
|
\fi
|
|
\defaultfontfeatures{Ligatures=TeX,Scale=MatchLowercase}
|
|
\fi
|
|
% use upquote if available, for straight quotes in verbatim environments
|
|
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
|
|
% use microtype if available
|
|
\IfFileExists{microtype.sty}{%
|
|
\usepackage{microtype}
|
|
\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
|
|
}{}
|
|
\usepackage[margin=1in]{geometry}
|
|
\usepackage{hyperref}
|
|
\hypersetup{unicode=true,
|
|
pdfborder={0 0 0},
|
|
breaklinks=true}
|
|
\urlstyle{same} % don't use monospace font for urls
|
|
\usepackage{color}
|
|
\usepackage{fancyvrb}
|
|
\newcommand{\VerbBar}{|}
|
|
\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
|
|
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
|
|
% Add ',fontsize=\small' for more characters per line
|
|
\usepackage{framed}
|
|
\definecolor{shadecolor}{RGB}{248,248,248}
|
|
\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}}
|
|
\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
|
|
\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}}
|
|
\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
|
|
\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
|
|
\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
|
|
\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
|
|
\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
|
|
\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
|
|
\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
|
|
\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
|
|
\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
|
|
\newcommand{\ImportTok}[1]{#1}
|
|
\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
|
|
\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
|
|
\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
|
|
\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
|
|
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}}
|
|
\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
|
|
\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
|
|
\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
|
|
\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}}
|
|
\newcommand{\BuiltInTok}[1]{#1}
|
|
\newcommand{\ExtensionTok}[1]{#1}
|
|
\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
|
|
\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.77,0.63,0.00}{#1}}
|
|
\newcommand{\RegionMarkerTok}[1]{#1}
|
|
\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
|
|
\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
|
|
\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}}
|
|
\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}}
|
|
\newcommand{\NormalTok}[1]{#1}
|
|
\usepackage{graphicx,grffile}
|
|
\makeatletter
|
|
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
|
|
\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
|
|
\makeatother
|
|
% Scale images if necessary, so that they will not overflow the page
|
|
% margins by default, and it is still possible to overwrite the defaults
|
|
% using explicit options in \includegraphics[width, height, ...]{}
|
|
\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
|
|
\IfFileExists{parskip.sty}{%
|
|
\usepackage{parskip}
|
|
}{% else
|
|
\setlength{\parindent}{0pt}
|
|
\setlength{\parskip}{6pt plus 2pt minus 1pt}
|
|
}
|
|
\setlength{\emergencystretch}{3em} % prevent overfull lines
|
|
\providecommand{\tightlist}{%
|
|
\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
|
|
\setcounter{secnumdepth}{0}
|
|
% Redefines (sub)paragraphs to behave more like sections
|
|
\ifx\paragraph\undefined\else
|
|
\let\oldparagraph\paragraph
|
|
\renewcommand{\paragraph}[1]{\oldparagraph{#1}\mbox{}}
|
|
\fi
|
|
\ifx\subparagraph\undefined\else
|
|
\let\oldsubparagraph\subparagraph
|
|
\renewcommand{\subparagraph}[1]{\oldsubparagraph{#1}\mbox{}}
|
|
\fi
|
|
|
|
%%% Use protect on footnotes to avoid problems with footnotes in titles
|
|
\let\rmarkdownfootnote\footnote%
|
|
\def\footnote{\protect\rmarkdownfootnote}
|
|
|
|
%%% Change title format to be more compact
|
|
\usepackage{titling}
|
|
|
|
% Create subtitle command for use in maketitle
|
|
\newcommand{\subtitle}[1]{
|
|
\posttitle{
|
|
\begin{center}\large#1\end{center}
|
|
}
|
|
}
|
|
|
|
\setlength{\droptitle}{-2em}
|
|
\title{}
|
|
\pretitle{\vspace{\droptitle}}
|
|
\posttitle{}
|
|
\author{}
|
|
\preauthor{}\postauthor{}
|
|
\date{}
|
|
\predate{}\postdate{}
|
|
|
|
|
|
\begin{document}
|
|
|
|
\section{Lesson 4}\label{lesson-4}
|
|
|
|
\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}
|
|
|
|
\subsubsection{Scatterplots and Perceived Audience
|
|
Size}\label{scatterplots-and-perceived-audience-size}
|
|
|
|
Notes:
|
|
|
|
\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}
|
|
|
|
\subsubsection{Scatterplots}\label{scatterplots}
|
|
|
|
Notes:
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{library}\NormalTok{(ggplot2)}
|
|
\NormalTok{pf <-}\StringTok{ }\KeywordTok{read.csv}\NormalTok{(}\StringTok{'pseudo_facebook.tsv'}\NormalTok{, }\DataTypeTok{sep =} \StringTok{'}\CharTok{\textbackslash{}t}\StringTok{'}\NormalTok{)}
|
|
|
|
\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ age, }\DataTypeTok{y =}\NormalTok{ friend_count), }\DataTypeTok{data =}\NormalTok{ pf) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_point}\NormalTok{()}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\includegraphics{lesson4_student_files/figure-latex/Scatterplots-1.pdf}
|
|
|
|
\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}
|
|
|
|
\paragraph{What are some things that you notice right
|
|
away?}\label{what-are-some-things-that-you-notice-right-away}
|
|
|
|
Response:
|
|
|
|
All of the data points are grouped into vertical lines, and the
|
|
younger the age the more likely they are to have more friends.
|
|
|
|
\subsubsection{ggplot Syntax}\label{ggplot-syntax}
|
|
|
|
Notes:
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ age, }\DataTypeTok{y =}\NormalTok{ friend_count), }\DataTypeTok{data =}\NormalTok{ pf) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_point}\NormalTok{() }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{xlim}\NormalTok{(}\DecValTok{13}\NormalTok{, }\DecValTok{90}\NormalTok{)}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\begin{verbatim}
|
|
## Warning: Removed 4906 rows containing missing values (geom_point).
|
|
\end{verbatim}
|
|
|
|
\includegraphics{lesson4_student_files/figure-latex/ggplot Syntax-1.pdf}
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{summary}\NormalTok{(pf}\OperatorTok{$}\NormalTok{age)}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\begin{verbatim}
|
|
## Min. 1st Qu. Median Mean 3rd Qu. Max.
|
|
## 13.00 20.00 28.00 37.28 50.00 113.00
|
|
\end{verbatim}
|
|
|
|
Build the plot one layer at a time to make errors easier to find.
|
|
|
|
\subsubsection{Overplotting}\label{overplotting}
|
|
|
|
Notes:
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ age, }\DataTypeTok{y =}\NormalTok{ friend_count), }\DataTypeTok{data =}\NormalTok{ pf) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_jitter}\NormalTok{(}\DataTypeTok{alpha =} \DecValTok{1}\OperatorTok{/}\DecValTok{20}\NormalTok{) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{xlim}\NormalTok{(}\DecValTok{13}\NormalTok{, }\DecValTok{90}\NormalTok{)}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\begin{verbatim}
|
|
## Warning: Removed 5190 rows containing missing values (geom_point).
|
|
\end{verbatim}
|
|
|
|
\includegraphics{lesson4_student_files/figure-latex/Overplotting-1.pdf}
|
|
|
|
\paragraph{What do you notice in the
|
|
plot?}\label{what-do-you-notice-in-the-plot}
|
|
|
|
Response:
|
|
|
|
The bar for 69 is still clearly visible and it is more obvious that the
|
|
friend count generally decreases as the age increases.
|
|
|
|
\subsubsection{Coord\_trans()}\label{coord_trans}
|
|
|
|
Notes:
|
|
|
|
\paragraph{Look up the documentation for coord\_trans() and add a layer
|
|
to the plot that transforms friend\_count using the square root
|
|
function. Create your
|
|
plot!}\label{look-up-the-documentation-for-coord_trans-and-add-a-layer-to-the-plot-that-transforms-friend_count-using-the-square-root-function.-create-your-plot}
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ age, }\DataTypeTok{y =}\NormalTok{ friend_count), }\DataTypeTok{data =}\NormalTok{ pf) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_point}\NormalTok{(}\DataTypeTok{alpha =} \DecValTok{1}\OperatorTok{/}\DecValTok{20}\NormalTok{) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{xlim}\NormalTok{(}\DecValTok{13}\NormalTok{, }\DecValTok{90}\NormalTok{) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{coord_trans}\NormalTok{(}\DataTypeTok{y =} \StringTok{"sqrt"}\NormalTok{)}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\begin{verbatim}
|
|
## Warning: Removed 4906 rows containing missing values (geom_point).
|
|
\end{verbatim}
|
|
|
|
\includegraphics{lesson4_student_files/figure-latex/unnamed-chunk-1-1.pdf}
|
|
|
|
\paragraph{What do you notice?}\label{what-do-you-notice}
|
|
|
|
First, coord\_trans does not work with geom\_jitter; second, the
|
|
datapoints near the bottom are more spread out vertically to present
|
|
them as more of a focus.
|
|
|
|
To use jitter you need more advanced syntax to only jitter the ages,
|
|
also to prevent possible negatives if 0 is jittered. To do this in
|
|
\texttt{geom\_point()} pass
|
|
\texttt{position\ =\ position\_jitter(h\ =\ 0)}.
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ age, }\DataTypeTok{y =}\NormalTok{ friend_count), }\DataTypeTok{data =}\NormalTok{ pf) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_point}\NormalTok{(}\DataTypeTok{alpha =} \DecValTok{1}\OperatorTok{/}\DecValTok{20}\NormalTok{, }\DataTypeTok{position =} \KeywordTok{position_jitter}\NormalTok{(}\DataTypeTok{h =} \DecValTok{0}\NormalTok{)) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{xlim}\NormalTok{(}\DecValTok{13}\NormalTok{, }\DecValTok{90}\NormalTok{) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{coord_trans}\NormalTok{(}\DataTypeTok{y =} \StringTok{"sqrt"}\NormalTok{)}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\begin{verbatim}
|
|
## Warning: Removed 5185 rows containing missing values (geom_point).
|
|
\end{verbatim}
|
|
|
|
\includegraphics{lesson4_student_files/figure-latex/coord_trans_advanced-1.pdf}
|
|
|
|
\subsubsection{Alpha and Jitter}\label{alpha-and-jitter}
|
|
|
|
Notes:
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ age, }\DataTypeTok{y =}\NormalTok{ friendships_initiated, }\DataTypeTok{color =}\NormalTok{ gender), }\DataTypeTok{data =}\NormalTok{ pf) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_point}\NormalTok{(}\DataTypeTok{alpha =} \DecValTok{1}\OperatorTok{/}\DecValTok{10}\NormalTok{, }\DataTypeTok{position =} \KeywordTok{position_jitter}\NormalTok{(}\DataTypeTok{h =} \DecValTok{0}\NormalTok{)) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{xlim}\NormalTok{(}\DecValTok{13}\NormalTok{, }\DecValTok{90}\NormalTok{) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{coord_trans}\NormalTok{(}\DataTypeTok{y =} \StringTok{"sqrt"}\NormalTok{)}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\begin{verbatim}
|
|
## Warning: Removed 5170 rows containing missing values (geom_point).
|
|
\end{verbatim}
|
|
|
|
\includegraphics{lesson4_student_files/figure-latex/Alpha and Jitter-1.pdf}
|
|
|
|
\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}
|
|
|
|
\subsubsection{Overplotting and Domain
|
|
Knowledge}\label{overplotting-and-domain-knowledge}
|
|
|
|
Notes:
|
|
|
|
plotting as a percentage of the whole
|
|
|
|
\subsubsection{Conditional Means}\label{conditional-means}
|
|
|
|
Notes:
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{library}\NormalTok{(dplyr)}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\begin{verbatim}
|
|
##
|
|
## Attaching package: 'dplyr'
|
|
\end{verbatim}
|
|
|
|
\begin{verbatim}
|
|
## The following objects are masked from 'package:stats':
|
|
##
|
|
## filter, lag
|
|
\end{verbatim}
|
|
|
|
\begin{verbatim}
|
|
## The following objects are masked from 'package:base':
|
|
##
|
|
## intersect, setdiff, setequal, union
|
|
\end{verbatim}
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\NormalTok{age_groups <-}\StringTok{ }\KeywordTok{group_by}\NormalTok{(pf, age)}
|
|
\NormalTok{pf.fc_by_age <-}\StringTok{ }\KeywordTok{summarise}\NormalTok{(age_groups,}
|
|
\DataTypeTok{friend_count_mean =} \KeywordTok{mean}\NormalTok{(friend_count),}
|
|
\DataTypeTok{friend_count_median =} \KeywordTok{median}\NormalTok{(friend_count),}
|
|
\DataTypeTok{n =} \KeywordTok{n}\NormalTok{())}
|
|
\NormalTok{pf.fc_by_age <-}\StringTok{ }\KeywordTok{arrange}\NormalTok{(pf.fc_by_age, age)}
|
|
|
|
\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ age, }\DataTypeTok{y =}\NormalTok{ friend_count_mean), }\DataTypeTok{data =}\NormalTok{ pf.fc_by_age) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_line}\NormalTok{() }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{xlim}\NormalTok{(}\DecValTok{13}\NormalTok{,}\DecValTok{90}\NormalTok{)}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\begin{verbatim}
|
|
## Warning: Removed 23 rows containing missing values (geom_path).
|
|
\end{verbatim}
|
|
|
|
\includegraphics{lesson4_student_files/figure-latex/Conditional Means-1.pdf}
|
|
|
|
\subsubsection{Overlaying Summaries with Raw
|
|
Data}\label{overlaying-summaries-with-raw-data}
|
|
|
|
Notes:
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ age, }\DataTypeTok{y =}\NormalTok{ friendships_initiated), }\DataTypeTok{data =}\NormalTok{ pf) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_point}\NormalTok{(}\DataTypeTok{alpha =} \DecValTok{1}\OperatorTok{/}\DecValTok{10}\NormalTok{, }\DataTypeTok{position =} \KeywordTok{position_jitter}\NormalTok{(}\DataTypeTok{h =} \DecValTok{0}\NormalTok{), }\DataTypeTok{color =} \StringTok{'orange'}\NormalTok{) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{xlim}\NormalTok{(}\DecValTok{13}\NormalTok{, }\DecValTok{90}\NormalTok{) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{coord_trans}\NormalTok{(}\DataTypeTok{y =} \StringTok{"sqrt"}\NormalTok{) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_line}\NormalTok{(}\DataTypeTok{stat =} \StringTok{'summary'}\NormalTok{, }\DataTypeTok{fun.y =}\NormalTok{ mean) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_line}\NormalTok{(}\DataTypeTok{stat =} \StringTok{'summary'}\NormalTok{, }\DataTypeTok{fun.y =}\NormalTok{ median, }\DataTypeTok{color =} \StringTok{'blue'}\NormalTok{) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_line}\NormalTok{(}\DataTypeTok{stat =} \StringTok{'summary'}\NormalTok{, }\DataTypeTok{fun.y =}\NormalTok{ quantile, }\DataTypeTok{fun.args =} \KeywordTok{list}\NormalTok{(}\DataTypeTok{probs =} \FloatTok{0.1}\NormalTok{), }\DataTypeTok{color =} \StringTok{'red'}\NormalTok{, }\DataTypeTok{linetype =} \DecValTok{2}\NormalTok{) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_line}\NormalTok{(}\DataTypeTok{stat =} \StringTok{'summary'}\NormalTok{, }\DataTypeTok{fun.y =}\NormalTok{ quantile, }\DataTypeTok{fun.args =} \KeywordTok{list}\NormalTok{(}\DataTypeTok{probs =} \FloatTok{0.9}\NormalTok{), }\DataTypeTok{color =} \StringTok{'red'}\NormalTok{, }\DataTypeTok{linetype =} \DecValTok{2}\NormalTok{) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{coord_cartesian}\NormalTok{(}\DataTypeTok{xlim =} \KeywordTok{c}\NormalTok{(}\DecValTok{13}\NormalTok{,}\DecValTok{70}\NormalTok{), }\DataTypeTok{ylim =} \KeywordTok{c}\NormalTok{(}\DecValTok{0}\NormalTok{,}\DecValTok{1000}\NormalTok{))}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\begin{verbatim}
|
|
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
|
|
|
|
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
|
|
|
|
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
|
|
|
|
## Warning: Removed 4906 rows containing non-finite values (stat_summary).
|
|
\end{verbatim}
|
|
|
|
\begin{verbatim}
|
|
## Warning: Removed 5183 rows containing missing values (geom_point).
|
|
\end{verbatim}
|
|
|
|
\includegraphics{lesson4_student_files/figure-latex/Overlaying Summaries with Raw Data-1.pdf}
|
|
|
|
\paragraph{What are some of your observations of the
|
|
plot?}\label{what-are-some-of-your-observations-of-the-plot}
|
|
|
|
Response:
|
|
|
|
I notice that the median is always lower than the mean and that the
|
|
median is closer to the center of the main body of datapoints. It
|
|
appears that the data is long tailed towards the high friend counts
|
|
which pulls the mean upwards.
|
|
|
|
\subsubsection{Moira: Histogram Summary and
|
|
Scatterplot}\label{moira-histogram-summary-and-scatterplot}
|
|
|
|
See the Instructor Notes of this video to download Moira's paper on
|
|
perceived audience size and to see the final plot.
|
|
|
|
Notes:
|
|
|
|
\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}
|
|
|
|
\subsubsection{Correlation}\label{correlation}
|
|
|
|
Notes:
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{cor.test}\NormalTok{(pf}\OperatorTok{$}\NormalTok{age, pf}\OperatorTok{$}\NormalTok{friend_count)}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\begin{verbatim}
|
|
##
|
|
## Pearson's product-moment correlation
|
|
##
|
|
## data: pf$age and pf$friend_count
|
|
## t = -8.6268, df = 99001, p-value < 2.2e-16
|
|
## alternative hypothesis: true correlation is not equal to 0
|
|
## 95 percent confidence interval:
|
|
## -0.03363072 -0.02118189
|
|
## sample estimates:
|
|
## cor
|
|
## -0.02740737
|
|
\end{verbatim}
|
|
|
|
Look up the documentation for the cor.test function.
|
|
|
|
What's the correlation between age and friend count? Round to three
|
|
decimal places. Response:
|
|
|
|
\(-0.027\)
|
|
|
|
\subsubsection{Correlation on Subsets}\label{correlation-on-subsets}
|
|
|
|
Notes:
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{with}\NormalTok{(pf[pf}\OperatorTok{$}\NormalTok{age }\OperatorTok{<=}\StringTok{ }\DecValTok{70}\NormalTok{,], }\KeywordTok{cor.test}\NormalTok{(age, friend_count))}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\begin{verbatim}
|
|
##
|
|
## Pearson's product-moment correlation
|
|
##
|
|
## data: age and friend_count
|
|
## t = -52.592, df = 91029, p-value < 2.2e-16
|
|
## alternative hypothesis: true correlation is not equal to 0
|
|
## 95 percent confidence interval:
|
|
## -0.1780220 -0.1654129
|
|
## sample estimates:
|
|
## cor
|
|
## -0.1717245
|
|
\end{verbatim}
|
|
|
|
\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}
|
|
|
|
\subsubsection{Correlation Methods}\label{correlation-methods}
|
|
|
|
Notes:
|
|
|
|
\url{http://www.statisticssolutions.com/correlation-pearson-kendall-spearman/}
|
|
|
|
\subsection{Create Scatterplots}\label{create-scatterplots}
|
|
|
|
Notes:
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{library}\NormalTok{(ggplot2)}
|
|
\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ www_likes_received, }\DataTypeTok{y =}\NormalTok{ likes_received), }\DataTypeTok{data =}\NormalTok{ pf) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_point}\NormalTok{()}\CommentTok{#alpha = 1/20, position = position_jitter(h = 0)) +}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\includegraphics{lesson4_student_files/figure-latex/unnamed-chunk-2-1.pdf}
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\CommentTok{#xlim(13, 90) +}
|
|
\CommentTok{#coord_trans(y = "sqrt")}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}
|
|
|
|
\subsubsection{Strong Correlations}\label{strong-correlations}
|
|
|
|
Notes:
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ www_likes_received, }\DataTypeTok{y =}\NormalTok{ likes_received), }\DataTypeTok{data =}\NormalTok{ pf) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_point}\NormalTok{() }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{xlim}\NormalTok{(}\DecValTok{0}\NormalTok{, }\KeywordTok{quantile}\NormalTok{(pf}\OperatorTok{$}\NormalTok{www_likes_received, }\FloatTok{0.95}\NormalTok{)) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{ylim}\NormalTok{(}\DecValTok{0}\NormalTok{, }\KeywordTok{quantile}\NormalTok{(pf}\OperatorTok{$}\NormalTok{likes_received, }\FloatTok{0.95}\NormalTok{)) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_smooth}\NormalTok{(}\DataTypeTok{method =} \StringTok{'lm'}\NormalTok{, }\DataTypeTok{color =} \StringTok{'red'}\NormalTok{)}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\begin{verbatim}
|
|
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
|
|
\end{verbatim}
|
|
|
|
\begin{verbatim}
|
|
## Warning: Removed 6075 rows containing missing values (geom_point).
|
|
\end{verbatim}
|
|
|
|
\includegraphics{lesson4_student_files/figure-latex/Strong Correlations-1.pdf}
|
|
|
|
What's the correlation between the two variables? Include the top 5\% of
|
|
values for the variable in the calculation and round to 3 decimal
|
|
places.
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{with}\NormalTok{(pf, }\KeywordTok{cor.test}\NormalTok{(www_likes_received, likes_received))}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\begin{verbatim}
|
|
##
|
|
## Pearson's product-moment correlation
|
|
##
|
|
## data: www_likes_received and likes_received
|
|
## t = 937.1, df = 99001, p-value < 2.2e-16
|
|
## alternative hypothesis: true correlation is not equal to 0
|
|
## 95 percent confidence interval:
|
|
## 0.9473553 0.9486176
|
|
## sample estimates:
|
|
## cor
|
|
## 0.9479902
|
|
\end{verbatim}
|
|
|
|
Response:
|
|
|
|
0.948. One variable is a superset of the other.
|
|
|
|
\subsubsection{Moira on Correlation}\label{moira-on-correlation}
|
|
|
|
Notes:
|
|
|
|
High correlation can mean that variables are dependent on the same thing
|
|
or are similar.
|
|
|
|
\subsubsection{More Caution with
|
|
Correlation}\label{more-caution-with-correlation}
|
|
|
|
Notes:
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\CommentTok{#install.packages('alr3')}
|
|
\KeywordTok{library}\NormalTok{(alr3)}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\begin{verbatim}
|
|
## Loading required package: car
|
|
\end{verbatim}
|
|
|
|
\begin{verbatim}
|
|
## Loading required package: carData
|
|
\end{verbatim}
|
|
|
|
\begin{verbatim}
|
|
##
|
|
## Attaching package: 'car'
|
|
\end{verbatim}
|
|
|
|
\begin{verbatim}
|
|
## The following object is masked from 'package:dplyr':
|
|
##
|
|
## recode
|
|
\end{verbatim}
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{library}\NormalTok{(ggplot2)}
|
|
\KeywordTok{data}\NormalTok{(Mitchell)}
|
|
\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ Month, }\DataTypeTok{y =}\NormalTok{ Temp), }\DataTypeTok{data =}\NormalTok{ Mitchell) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_point}\NormalTok{()}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\includegraphics{lesson4_student_files/figure-latex/More Caution With Correlation-1.pdf}
|
|
|
|
Create your plot!
|
|
|
|
\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}
|
|
|
|
\subsubsection{Noisy Scatterplots}\label{noisy-scatterplots}
|
|
|
|
\begin{enumerate}
|
|
\def\labelenumi{\alph{enumi}.}
|
|
\tightlist
|
|
\item
|
|
Take a guess for the correlation coefficient for the scatterplot.
|
|
\end{enumerate}
|
|
|
|
0.9
|
|
|
|
\begin{enumerate}
|
|
\def\labelenumi{\alph{enumi}.}
|
|
\setcounter{enumi}{1}
|
|
\tightlist
|
|
\item
|
|
What is the actual correlation of the two variables? (Round to the
|
|
thousandths place)
|
|
\end{enumerate}
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{with}\NormalTok{(Mitchell, }\KeywordTok{cor.test}\NormalTok{(Month, Temp))}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\begin{verbatim}
|
|
##
|
|
## Pearson's product-moment correlation
|
|
##
|
|
## data: Month and Temp
|
|
## t = 0.81816, df = 202, p-value = 0.4142
|
|
## alternative hypothesis: true correlation is not equal to 0
|
|
## 95 percent confidence interval:
|
|
## -0.08053637 0.19331562
|
|
## sample estimates:
|
|
## cor
|
|
## 0.05747063
|
|
\end{verbatim}
|
|
|
|
\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}
|
|
|
|
\subsubsection{Making Sense of Data}\label{making-sense-of-data}
|
|
|
|
Notes:
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(Month, Temp), }\DataTypeTok{data =}\NormalTok{ Mitchell) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_point}\NormalTok{() }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{scale_x_continuous}\NormalTok{(}\DataTypeTok{breaks =} \KeywordTok{seq}\NormalTok{(}\DecValTok{0}\NormalTok{, }\DecValTok{204}\NormalTok{, }\DecValTok{12}\NormalTok{))}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\includegraphics{lesson4_student_files/figure-latex/Making Sense of Data-1.pdf}
|
|
|
|
\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}
|
|
|
|
\subsubsection{A New Perspective}\label{a-new-perspective}
|
|
|
|
What do you notice? Response:
|
|
|
|
There is a cyclical pattern to the data going from low to high and back
|
|
to low every 12 months. This is why I originally said there seems to be
|
|
a 0.9 correlation coefficient to the data because I saw this pattern the
|
|
first time I looked at the plot.
|
|
|
|
Watch the solution video and check out the Instructor Notes! Notes:
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ (Month}\OperatorTok{%%}\DecValTok{12}\NormalTok{), }\DataTypeTok{y =}\NormalTok{ Temp), }\DataTypeTok{data =}\NormalTok{ Mitchell) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_point}\NormalTok{()}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\includegraphics{lesson4_student_files/figure-latex/unnamed-chunk-3-1.pdf}
|
|
|
|
\subsubsection{Understanding Noise: Age to Age
|
|
Months}\label{understanding-noise-age-to-age-months}
|
|
|
|
Notes:
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\NormalTok{pf}\OperatorTok{$}\NormalTok{age_with_months <-}\StringTok{ }\NormalTok{(pf}\OperatorTok{$}\NormalTok{age) }\OperatorTok{+}\StringTok{ }\NormalTok{(}\DecValTok{1} \OperatorTok{-}\StringTok{ }\NormalTok{(pf}\OperatorTok{$}\NormalTok{dob_month}\OperatorTok{/}\DecValTok{12}\NormalTok{))}
|
|
\KeywordTok{head}\NormalTok{(pf)}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\begin{verbatim}
|
|
## userid age dob_day dob_year dob_month gender tenure friend_count
|
|
## 1 2094382 14 19 1999 11 male 266 0
|
|
## 2 1192601 14 2 1999 11 female 6 0
|
|
## 3 2083884 14 16 1999 11 male 13 0
|
|
## 4 1203168 14 25 1999 12 female 93 0
|
|
## 5 1733186 14 4 1999 12 male 82 0
|
|
## 6 1524765 14 1 1999 12 male 15 0
|
|
## friendships_initiated likes likes_received mobile_likes
|
|
## 1 0 0 0 0
|
|
## 2 0 0 0 0
|
|
## 3 0 0 0 0
|
|
## 4 0 0 0 0
|
|
## 5 0 0 0 0
|
|
## 6 0 0 0 0
|
|
## mobile_likes_received www_likes www_likes_received age_with_months
|
|
## 1 0 0 0 14.08333
|
|
## 2 0 0 0 14.08333
|
|
## 3 0 0 0 14.08333
|
|
## 4 0 0 0 14.00000
|
|
## 5 0 0 0 14.00000
|
|
## 6 0 0 0 14.00000
|
|
\end{verbatim}
|
|
|
|
\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}
|
|
|
|
\subsubsection{Age with Months Means}\label{age-with-months-means}
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{library}\NormalTok{(dplyr)}
|
|
|
|
\NormalTok{age_with_months <-}\StringTok{ }\KeywordTok{group_by}\NormalTok{(pf, age_with_months)}
|
|
\NormalTok{pf.fc_by_age_months <-}\StringTok{ }\KeywordTok{summarize}\NormalTok{(}
|
|
\NormalTok{ age_with_months,}
|
|
\DataTypeTok{friend_count_mean =} \KeywordTok{mean}\NormalTok{(friend_count),}
|
|
\DataTypeTok{friend_count_median =} \KeywordTok{median}\NormalTok{(friend_count),}
|
|
\DataTypeTok{n =} \KeywordTok{n}\NormalTok{()}
|
|
\NormalTok{)}
|
|
|
|
\NormalTok{pf.fc_by_age_months <-}\StringTok{ }\KeywordTok{arrange}\NormalTok{(pf.fc_by_age_months, age_with_months)}
|
|
|
|
\KeywordTok{head}\NormalTok{(pf.fc_by_age_months)}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\begin{verbatim}
|
|
## # A tibble: 6 x 4
|
|
## age_with_months friend_count_mean friend_count_median n
|
|
## <dbl> <dbl> <dbl> <int>
|
|
## 1 13.2 46.3 30.5 6
|
|
## 2 13.2 115. 23.5 14
|
|
## 3 13.3 136. 44.0 25
|
|
## 4 13.4 164. 72.0 33
|
|
## 5 13.5 131. 66.0 45
|
|
## 6 13.6 157. 64.0 54
|
|
\end{verbatim}
|
|
|
|
\subsubsection{Noise in Conditional
|
|
Means}\label{noise-in-conditional-means}
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ age_with_months, }\DataTypeTok{y =}\NormalTok{ friend_count_mean), }\DataTypeTok{data =} \KeywordTok{subset}\NormalTok{(pf.fc_by_age_months, age_with_months}\OperatorTok{<}\DecValTok{71}\NormalTok{)) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_line}\NormalTok{()}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\includegraphics{lesson4_student_files/figure-latex/Noise in Conditional Means-1.pdf}
|
|
|
|
\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}
|
|
|
|
\subsubsection{Smoothing Conditional
|
|
Means}\label{smoothing-conditional-means}
|
|
|
|
Notes:
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\KeywordTok{library}\NormalTok{(gridExtra)}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\begin{verbatim}
|
|
##
|
|
## Attaching package: 'gridExtra'
|
|
\end{verbatim}
|
|
|
|
\begin{verbatim}
|
|
## The following object is masked from 'package:dplyr':
|
|
##
|
|
## combine
|
|
\end{verbatim}
|
|
|
|
\begin{Shaded}
|
|
\begin{Highlighting}[]
|
|
\NormalTok{p1 <-}\StringTok{ }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ age, }\DataTypeTok{y =}\NormalTok{ friend_count_mean), }\DataTypeTok{data =} \KeywordTok{subset}\NormalTok{(pf.fc_by_age, age }\OperatorTok{<}\StringTok{ }\DecValTok{71}\NormalTok{)) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_line}\NormalTok{() }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_smooth}\NormalTok{()}
|
|
\NormalTok{p2 <-}\StringTok{ }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =}\NormalTok{ age_with_months, }\DataTypeTok{y =}\NormalTok{ friend_count_mean), }\DataTypeTok{data =} \KeywordTok{subset}\NormalTok{(pf.fc_by_age_months, age_with_months }\OperatorTok{<}\StringTok{ }\DecValTok{71}\NormalTok{)) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_line}\NormalTok{() }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_smooth}\NormalTok{()}
|
|
\NormalTok{p3 <-}\StringTok{ }\KeywordTok{ggplot}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \KeywordTok{round}\NormalTok{(age }\OperatorTok{/}\StringTok{ }\DecValTok{5}\NormalTok{) }\OperatorTok{*}\StringTok{ }\DecValTok{5}\NormalTok{, }\DataTypeTok{y =}\NormalTok{ friend_count), }\DataTypeTok{data =} \KeywordTok{subset}\NormalTok{(pf, age }\OperatorTok{<}\StringTok{ }\DecValTok{71}\NormalTok{)) }\OperatorTok{+}
|
|
\StringTok{ }\KeywordTok{geom_line}\NormalTok{(}\DataTypeTok{stat =} \StringTok{'summary'}\NormalTok{, }\DataTypeTok{fun.y =} \StringTok{'mean'}\NormalTok{)}
|
|
\KeywordTok{grid.arrange}\NormalTok{(p1, p2, p3)}
|
|
\end{Highlighting}
|
|
\end{Shaded}
|
|
|
|
\begin{verbatim}
|
|
## `geom_smooth()` using method = 'loess'
|
|
\end{verbatim}
|
|
|
|
\begin{verbatim}
|
|
## `geom_smooth()` using method = 'loess'
|
|
\end{verbatim}
|
|
|
|
\includegraphics{lesson4_student_files/figure-latex/Smoothing Conditional Means-1.pdf}
|
|
|
|
\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}
|
|
|
|
\subsubsection{Which Plot to Choose?}\label{which-plot-to-choose}
|
|
|
|
Notes:
|
|
|
|
Make multiple plots during the exploratory phase and then refine them
|
|
down into the best plots for distribution.
|
|
|
|
\subsubsection{Analyzing Two Variables}\label{analyzing-two-variables}
|
|
|
|
Reflection:
|
|
|
|
Making multiple plots can show different features of the data. Also
|
|
while summaries and correlations are good for a lot of things, they are
|
|
not always the best at portraying the data.
|
|
|
|
Click \textbf{KnitHTML} to see all of your hard work and to have an html
|
|
page of this lesson, your answers, and your notes!
|
|
|
|
|
|
\end{document}
|