# Basic Data Visualization with ggplot2

###Basic Data Visualization with ggplot2
###R Club - 5/1/2014
###Joe Hoover

#Let's make some data

library(ggplot2)
library(gridExtra)
# Fix the RNG seed so the simulated data set is identical on every run.
set.seed(10005)

# Simulate 2000 students: 1500 drawn around lower attendance/grade means,
# followed by 500 drawn around higher means.
ATTEND <- c(rnorm(1500, mean = 16),
            rnorm(500, mean = 19))
GRADE <- c(rnorm(1500, mean = 83, sd = 4),
           rnorm(500, mean = 92, sd = 2))

# NOTE(review): the original definition of BOOKS was lost when this script was
# extracted (only the bare names "BOOKS data" survived). This is a
# reconstruction -- confirm against the original post. It assumes three
# categories of books read (the manual colour scales further down supply three
# colours) and stores them as a factor, because later plots map BOOKS directly
# to discrete fill/colour scales. The group sizes are a guess that lines the
# third category up with the 500 high-attendance students.
BOOKS <- factor(rep(c(1, 2, 3), times = c(750, 750, 500)))

# One row per student; every plot below reads from this data frame.
data <- data.frame(BOOKS, ATTEND, GRADE)

#We have books read, classes attended, and grade.
#We're interested in predicting student grades from
#the number of books they've read, and the number
#of classes they attended.

##Graphing with ggplot involves a process of specifying layers of graphic objects.
#Let's start with a histogram

#What does the grade distribution look like?
# Step 1: hand ggplot() the data frame and say which variable to graph
#         (GRADE, found in data, goes on the x axis).
# Step 2: add a layer that says how the data should be drawn -- a histogram.
ggplot(data = data, mapping = aes(x = GRADE)) +
  geom_histogram()

# The solid default bars are not very nice; draw hollow bars with a black
# outline instead, and keep the plot so more layers can be added to it.
grade_hist <- ggplot(data = data, mapping = aes(x = GRADE)) +
  geom_histogram(fill = NA, colour = "black")
grade_hist

# Add axis labels:
grade_hist + labs(x = "Grades", y = "Number of Grades")

#We can also display grades in a density plot:
# Grades can also be displayed as a smoothed density curve:
d1 <- ggplot(data, aes(GRADE)) +
  geom_density()
d1

# Colour?
# A constant colour is *set* as an argument of the layer, not *mapped* inside
# aes(). Writing aes(fill = "red") treats "red" as a one-level variable: the
# curve is filled with the first colour of the default palette (not red) and
# a meaningless legend is added.
d1 <- ggplot(data, aes(GRADE)) +
  geom_density(fill = "red")
d1

# Density of grades broken down by number of books read. Here fill really is
# a mapping (one colour per level of BOOKS); alpha makes the overlapping
# curves translucent.
d2 <- ggplot(data, aes(GRADE, fill = factor(BOOKS))) +
  geom_density(alpha = 0.2)
d2

# A single translucent red curve with the legend switched off.
# theme(legend.position = "none") is how any legend is suppressed; with the
# fill set outside aes() no legend is generated here, so the call is a no-op.
d2 <- ggplot(data, aes(GRADE)) +
  geom_density(fill = "red", alpha = 0.2) +
  theme(legend.position = "none")
d2

#Let's look at grades by books read:

# Mapping fill to factor(BOOKS) tells ggplot to use a different colour for
# each level of BOOKS.
b1 <- ggplot(data, aes(x = GRADE, fill = factor(BOOKS))) +
  geom_density()
b1

#What about barplots?

# First attempt: draw each grade as-is. The bars for every student in a
# category are stacked on top of each other.
ggplot(data, aes(x = factor(BOOKS), y = GRADE)) +
  geom_bar(stat = "identity")
# What happened? Why are the grades so high?
# It looks like we have summed grades, which is not particularly interesting.
# Let's use stat_summary to tell ggplot we want the mean grade for each level
# of books read.

# `fun` replaces the deprecated `fun.y` argument (ggplot2 >= 3.3.0).
b1 <- ggplot(data, aes(x = factor(BOOKS), y = GRADE, fill = factor(BOOKS))) +
  stat_summary(fun = mean, geom = "bar") +
  scale_fill_manual(values = c("purple", "orange", "red"))
b1

#Box plots, jitter plots, and violins:

# Box plot: one box of grades per level of BOOKS, filled by BOOKS.
b1 <- ggplot(data, aes(x = BOOKS, y = GRADE, fill = BOOKS)) +
  geom_boxplot()
b1

# Jitter plot: every grade drawn as a semi-transparent point, nudged sideways
# at random so the points do not all sit on one vertical line.
b2 <- ggplot(data, aes(x = BOOKS, y = GRADE, colour = BOOKS)) +
  geom_jitter(alpha = 0.25) +
  theme(legend.position = "none")
b2

#violin plot
# Hand-built violin plot: the density of GRADE is drawn as a ribbon mirrored
# around zero (ymin = -density, ymax = +density), one facet per level of
# BOOKS, then flipped so that grades run along the vertical axis.
# after_stat() replaces the deprecated ..density.. notation for referring to
# a variable computed by the stat.
b3 <- ggplot(data, aes(x = GRADE)) +
  stat_density(
    aes(
      ymax = after_stat(density),
      ymin = after_stat(-density),
      fill = BOOKS,
      color = BOOKS
    ),
    geom = "ribbon",
    position = "identity"
  ) +
  facet_grid(. ~ BOOKS) +
  coord_flip() +
  theme(legend.position = "none")
b3

#jitter plot + boxplot
# Layers are drawn in the order they are added: the jittered points first,
# then the filled box plots on top of them.
b4 <- ggplot(data, aes(x = BOOKS, y = GRADE)) +
  geom_jitter(aes(colour = BOOKS), alpha = 0.25) +
  geom_boxplot(aes(fill = BOOKS)) +
  theme(legend.position = "none")
b4

# Show all four plots side by side in a single row:

grid.arrange(b1, b2, b3, b4, nrow = 1)

#Alright, how about scatterplots?
#First we need to declare the data we want to graph.
#Let's look at grades by attendance:

# s1 only declares the data and the x/y mapping; it has no layers yet.
s1 <- ggplot(data, aes(x = ATTEND, y = GRADE))

# Add a layer of points to get a basic scatterplot.
s2 <- s1 + geom_point()
s2

# Use BOOKS to colour the points, with one hand-picked colour per level:
s2 <- s1 +
  geom_point(aes(colour = factor(BOOKS))) +
  scale_colour_manual(values = c("purple", "orange", "red"))

s2

# Add a linear regression line, with a separate panel for each level of BOOKS
s2 +
  geom_smooth(method = "lm") +
  facet_grid(. ~ BOOKS)

# One regression line per level of BOOKS, all in the same panel
s2 +
  geom_smooth(aes(colour = factor(BOOKS)), method = "lm")

#No error bars
# se = FALSE drops the confidence band around each fitted line.
ggplot(data, aes(x = ATTEND, y = GRADE)) +
  geom_point(aes(colour = factor(BOOKS))) +
  scale_colour_manual(values = c("purple", "orange", "red")) +
  geom_smooth(aes(colour = factor(BOOKS)), method = "lm", se = FALSE)

# The lines are hard to see behind the points, so plot the regression lines
# on their own:

# Same plot without the point layer (still no error bars)
ggplot(data, aes(x = ATTEND, y = GRADE)) +
  scale_colour_manual(values = c("purple", "orange", "red")) +
  geom_smooth(aes(colour = factor(BOOKS)), method = "lm", se = FALSE)

#Combining graphs:

#Let's plot several different graphs together.
#We're going to use grid.arrange to do this.
#First, however, we need to create an empty placeholder plot:

#placeholder plot - prints nothing at all
# Blank plot used to fill the unused top-right cell of the 2 x 2 layout.
# NOTE(review): the assignment and the base plot were lost when this script
# was extracted (only "empty theme(" survived). The single white point on an
# empty ggplot() is a reconstruction -- confirm against the original post.
# The theme() call then removes every visible plot element.
empty <- ggplot() +
  geom_point(aes(1, 1), colour = "white") +
  theme(
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )

#scatterplot of x and y variables
# NOTE(review): the "<- ggplot(...) +" part of the three assignments below was
# lost when this script was extracted. The base plots are reconstructed from
# the section comments and the earlier scatterplot (x = ATTEND, y = GRADE,
# grouped by BOOKS) -- confirm against the original post.

# Scatterplot of the x and y variables, with the legend tucked into the
# top-right corner of the panel.
# (ggplot2 >= 3.5.0 prefers legend.position = "inside" together with
# legend.position.inside = c(1, 1); the numeric form is kept here so the
# script also runs on older versions.)
scatter <- ggplot(data, aes(x = ATTEND, y = GRADE)) +
  geom_point(aes(color = BOOKS)) +
  scale_color_manual(values = c("orange", "purple", "red")) +
  theme(legend.position = c(1, 1), legend.justification = c(1, 1))

# Marginal density of x (ATTEND) - plot on top
plot_top <- ggplot(data, aes(x = ATTEND, fill = BOOKS)) +
  geom_density(alpha = .5) +
  scale_fill_manual(values = c("orange", "purple", "red")) +
  theme(legend.position = "none")

# Marginal density of y (GRADE) - plot on the right; coord_flip() turns it so
# that GRADE runs vertically, like the scatterplot's y axis.
plot_right <- ggplot(data, aes(x = GRADE, fill = BOOKS)) +
  geom_density(alpha = .5) +
  coord_flip() +
  scale_fill_manual(values = c("orange", "purple", "red")) +
  theme(legend.position = "none")
plot_right

# Arrange the plots in a 2 x 2 grid (filled row by row): the wide/tall cell
# holds the scatterplot, the narrow ones hold the marginal densities.
grid.arrange(plot_top, empty, scatter, plot_right,
             ncol = 2, nrow = 2, widths = c(4, 1), heights = c(1, 4))

#scatterplot of x and y variables
# Same layout as before, but with hand-picked axis breaks on the two marginal
# density plots so their tick labels do not crowd the small panels.
# NOTE(review): the "<- ggplot(...) +" part of the three assignments below was
# lost when this script was extracted. The base plots are reconstructed from
# the section comments and the earlier scatterplot (x = ATTEND, y = GRADE,
# grouped by BOOKS) -- confirm against the original post.

# Scatterplot of the x and y variables, legend in the top-right corner.
scatter <- ggplot(data, aes(x = ATTEND, y = GRADE)) +
  geom_point(aes(color = BOOKS)) +
  scale_color_manual(values = c("orange", "purple", "red")) +
  theme(legend.position = c(1, 1), legend.justification = c(1, 1))

# Marginal density of x (ATTEND) - plot on top
plot_top <- ggplot(data, aes(x = ATTEND, fill = BOOKS)) +
  geom_density(alpha = .5) +
  scale_fill_manual(values = c("orange", "purple", "red")) +
  theme(legend.position = "none") +
  scale_y_continuous(breaks = c(0.0, .2, .4)) # Define breaks for y-axis

# Marginal density of y (GRADE) - plot on the right. The y scale is still the
# density axis; coord_flip() only changes where it is drawn.
plot_right <- ggplot(data, aes(x = GRADE, fill = BOOKS)) +
  geom_density(alpha = .5) +
  coord_flip() +
  scale_fill_manual(values = c("orange", "purple", "red")) +
  theme(legend.position = "none") +
  scale_y_continuous(breaks = c(0.0, .1, .2)) # Define breaks for y-axis

# Arrange the plots in a 2 x 2 grid (filled row by row) with the same
# widths and heights as the first version.
grid.arrange(plot_top, empty, scatter, plot_right,
             ncol = 2, nrow = 2, widths = c(4, 1), heights = c(1, 4))

##############################################
#######################RESOURCES##############

##Much of the code above was adapted from this excellent blog:
##http://rforpublichealth.blogspot.com/
##Other useful sites:
## sape.inf.usi.ch/quick-reference/ggplot2
## www.cookbook-r.com/Graphs
##docs.ggplot2.org

# One comment