# 611 Highlights in R (part 1)

Check out an updated version of this post on RPubs: http://rpubs.com/rosemm/64056

## 611 Syllabus in R

Here’s the 611 schedule of topics from this year, and some code highlights from each week.

# a toy data set for us to work with
n <- 100
gender <- as.factor(c(rep("M", n/2), rep("F", n/2)))
IQ <- rnorm(n, mean=100, sd=15)
degree <- rep(c("HS", "BA", "MS", "PhD"), n/4)
height <- as.numeric(gender)-2 + rnorm(n, mean=5.5, sd=1)
RT1 <- rchisq(n, 4)
RT2 <- rchisq(n, 4)
DV <- 50 - 5*(as.numeric(gender)-1) + .1*IQ + rnorm(n)

df <- data.frame(awesome=DV, gender=gender, IQ=IQ, degree=degree, height=height, RT1=RT1, RT2=RT2)

### Exploratory Data Analysis

head(df)
##   awesome gender     IQ degree height   RT1    RT2
## 1   56.22      M 114.60     HS  6.260 1.965 0.4609
## 2   56.30      M 118.47     BA  5.010 1.516 4.6082
## 3   59.22      M 119.18     MS  4.455 5.396 2.2837
## 4   54.42      M  92.82    PhD  6.796 3.476 7.3616
## 5   53.95      M  92.41     HS  4.052 7.327 4.2431
## 6   58.60      M 115.49     BA  5.155 6.912 6.4011
View(df)

summary(df)
##     awesome     gender       IQ        degree       height
##  Min.   :51.3   F:50   Min.   : 58.4   BA :25   Min.   :2.19
##  1st Qu.:54.8   M:50   1st Qu.: 91.8   HS :25   1st Qu.:4.25
##  Median :57.5          Median : 98.1   MS :25   Median :4.86
##  Mean   :57.4          Mean   : 99.2   PhD:25   Mean   :4.89
##  3rd Qu.:59.4          3rd Qu.:109.3            3rd Qu.:5.60
##  Max.   :63.2          Max.   :131.1            Max.   :7.99
##       RT1              RT2
##  Min.   : 0.467   Min.   : 0.371
##  1st Qu.: 2.006   1st Qu.: 2.008
##  Median : 4.140   Median : 3.163
##  Mean   : 4.513   Mean   : 4.121
##  3rd Qu.: 6.457   3rd Qu.: 5.301
##  Max.   :13.278   Max.   :20.295
library(psych)
describe(df)
##         vars   n  mean    sd median trimmed   mad   min    max range  skew
## awesome    1 100 57.39  2.87  57.46   57.31  3.44 51.34  63.16 11.82  0.11
## gender*    2 100  1.50  0.50   1.50    1.50  0.74  1.00   2.00  1.00  0.00
## IQ         3 100 99.21 14.87  98.09   99.78 12.01 58.41 131.05 72.64 -0.34
## degree*    4 100  2.50  1.12   2.50    2.50  1.48  1.00   4.00  3.00  0.00
## height     5 100  4.89  1.16   4.86    4.88  1.08  2.19   7.99  5.80  0.03
## RT1        6 100  4.51  2.94   4.14    4.20  3.23  0.47  13.28 12.81  0.81
## RT2        7 100  4.12  3.13   3.16    3.71  2.18  0.37  20.29 19.92  1.97
##         kurtosis   se
## awesome    -0.94 0.29
## gender*    -2.02 0.05
## IQ          0.28 1.49
## degree*    -1.39 0.11
## height     -0.24 0.12
## RT1         0.03 0.29
## RT2         6.27 0.31
describe(df[,c(1,3)])
##         vars   n  mean    sd median trimmed   mad   min    max range  skew
## awesome    1 100 57.39  2.87  57.46   57.31  3.44 51.34  63.16 11.82  0.11
## IQ         2 100 99.21 14.87  98.09   99.78 12.01 58.41 131.05 72.64 -0.34
##         kurtosis   se
## awesome    -0.94 0.29
## IQ          0.28 1.49
plot(df$awesome ~ df$IQ)

plot(df$awesome ~ df$gender)

plot(df$awesome ~ df$degree)

plot(df$height ~ df$gender)

hist(df$IQ) hist(df$RT1)

cor(df$awesome, df$IQ)
## [1] 0.4604
round(cor(df[,c(1,3,5:7)]),3)
##         awesome     IQ height    RT1   RT2
## awesome   1.000  0.460 -0.419 -0.042 0.091
## IQ        0.460  1.000 -0.026 -0.003 0.142
## height   -0.419 -0.026  1.000  0.014 0.142
## RT1      -0.042 -0.003  0.014  1.000 0.012
## RT2       0.091  0.142  0.142  0.012 1.000
var(df$awesome, df$IQ)
## [1] 19.62
table(df[,c(2,4)])
##       degree
## gender BA HS MS PhD
##      F 12 12 13  13
##      M 13 13 12  12
# the big guns
library(ggplot2)
##
## Attaching package: 'ggplot2'
##
## The following object is masked from 'package:psych':
##
##     %+%

library(GGally)
ggpairs(df)

### Distributions

qqnorm(df$IQ) qqline(df$IQ, col="red")

qqnorm(df$RT1) qqline(df$RT1, col="red")

df$RT1_sqrt <- sqrt(df$RT1)
qqnorm(df$RT1_sqrt) qqline(df$RT1_sqrt, col="red")

### Hypothesis Testing & Estimation: T-tests

# one-sample
t.test(IQ, mu=100, data=df, conf.level=.95)
##
##  One Sample t-test
##
## data:  IQ
## t = -0.5292, df = 99, p-value = 0.5979
## alternative hypothesis: true mean is not equal to 100
## 95 percent confidence interval:
##   96.26 102.16
## sample estimates:
## mean of x
##     99.21
# IST
t.test(IQ ~ gender, var.equal=TRUE, data=df)
##
##  Two Sample t-test
##
## data:  IQ by gender
## t = -0.9859, df = 98, p-value = 0.3266
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -8.832  2.969
## sample estimates:
## mean in group F mean in group M
##           97.75          100.68
t.test(x=df[gender=="F",3], y=df[gender=="M",3], var.equal=TRUE)
##
##  Two Sample t-test
##
## data:  df[gender == "F", 3] and df[gender == "M", 3]
## t = -0.9859, df = 98, p-value = 0.3266
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -8.832  2.969
## sample estimates:
## mean of x mean of y
##     97.75    100.68
# DST
t.test(x=RT1, y=RT2, paired=TRUE, data=df)
##
##  Paired t-test
##
## data:  RT1 and RT2
## t = 0.9187, df = 99, p-value = 0.3605
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.4547  1.2388
## sample estimates:
## mean of the differences
##                   0.392

### One-Way ANOVAs

aov(awesome ~ degree, data=df)
## Call:
##    aov(formula = awesome ~ degree, data = df)
##
## Terms:
##                 degree Residuals
## Sum of Squares     5.6     807.9
## Deg. of Freedom      3        96
##
## Residual standard error: 2.901
## Estimated effects may be unbalanced
summary(aov(awesome ~ degree, data=df))
##             Df Sum Sq Mean Sq F value Pr(>F)
## degree       3      6    1.86    0.22   0.88
## Residuals   96    808    8.42

### Contrasts

Contrasts in R are unnecessarily confusing. Here is an rpubs doc that will help, hopefully!

# a little messy :)

### Two-Way ANOVAs

aov(awesome ~ degree*gender, data=df)
## Call:
##    aov(formula = awesome ~ degree * gender, data = df)
##
## Terms:
##                 degree gender degree:gender Residuals
## Sum of Squares     5.6  472.2           5.8     329.9
## Deg. of Freedom      3      1             3        92
##
## Residual standard error: 1.894
## Estimated effects may be unbalanced
summary(aov(awesome ~ degree*gender, data=df))
##               Df Sum Sq Mean Sq F value Pr(>F)
## degree         3      6       2    0.52   0.67
## gender         1    472     472  131.71 <2e-16 ***
## degree:gender  3      6       2    0.54   0.66
## Residuals     92    330       4
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# regression coefficients output
summary(lm(awesome ~ degree*gender, data=df))
##
## Call:
## lm(formula = awesome ~ degree * gender, data = df)
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -5.517 -1.256 -0.268  1.328  3.795
##
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)
## (Intercept)        59.5427     0.5466  108.93  < 2e-16 ***
## degreeHS           -0.4621     0.7730   -0.60     0.55
## degreeMS            0.4083     0.7580    0.54     0.59
## degreePhD           0.1155     0.7580    0.15     0.88
## genderM            -4.5616     0.7580   -6.02  3.6e-08 ***
## degreeHS:genderM    1.0268     1.0720    0.96     0.34
## degreeMS:genderM    0.0397     1.0720    0.04     0.97
## degreePhD:genderM  -0.2190     1.0720   -0.20     0.84
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.89 on 92 degrees of freedom
## Multiple R-squared:  0.594,  Adjusted R-squared:  0.564
## F-statistic: 19.3 on 7 and 92 DF,  p-value: 1.25e-15

### Correlation and Regression

cor(df$awesome, df$IQ)
## [1] 0.4604
summary(lm(awesome ~ IQ, data=df))
##
## Call:
## lm(formula = awesome ~ IQ, data = df)
##
## Residuals:
##    Min     1Q Median     3Q    Max
## -4.493 -2.538  0.131  2.285  5.202
##
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  48.5816     1.7345   28.01  < 2e-16 ***
## IQ            0.0888     0.0173    5.13  1.4e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.56 on 98 degrees of freedom
## Multiple R-squared:  0.212,  Adjusted R-squared:  0.204
## F-statistic: 26.4 on 1 and 98 DF,  p-value: 1.44e-06

### Analysis of Categorical Data

table(df[,c(2,4)])
##       degree
## gender BA HS MS PhD
##      F 12 12 13  13
##      M 13 13 12  12
# goodness of fit
chisq.test(table(df$degree)) # are degrees equally distributed? ## ## Chi-squared test for given probabilities ## ## data: table(df$degree)
## X-squared = 0, df = 3, p-value = 1
chisq.test(table(df$degree), p=c(.4, .3, .2, .1)) # is it 40% HS, 30% BS, 20% MS, and 10% PhD? ## ## Chi-squared test for given probabilities ## ## data: table(df$degree)
## X-squared = 30.21, df = 3, p-value = 1.248e-06
# test of independence
chisq.test(table(df[,c(2,4)])) 
##
##  Pearson's Chi-squared test
##
## data:  table(df[, c(2, 4)])
## X-squared = 0.16, df = 3, p-value = 0.9838
# Get measures of association
library(vcd)
## Loading required package: grid
assocstats(table(df[,c(2,4)]))
##                      X^2 df P(> X^2)
## Likelihood Ratio 0.16004  3  0.98377
## Pearson          0.16000  3  0.98377
##
## Phi-Coefficient   : 0.04
## Contingency Coeff.: 0.04
## Cramer's V        : 0.04