# Basic Data Visualization with ggplot2

```r
### Basic Data Visualization with ggplot2
### R Club - 5/1/2014
### Joe Hoover
```

```r
# Tutorial script: simulate student data (classes attended, grade, books
# read) and walk through histograms, density plots, bar plots, box/jitter/
# violin plots, scatterplots with regression lines, and a combined
# scatter + marginal-density layout built with gridExtra::grid.arrange().
#
# NOTE(review): the assignment statements in this script had been stripped
# (everything from `<-` up to the first call was missing). They have been
# reconstructed from how each object is used later on; places where the
# original text could not be recovered are flagged individually below.

# Let's make some data
library(ggplot2)   # not loaded in the original; every ggplot() call needs it
library(gridExtra) # provides grid.arrange()

set.seed(10005)

# Two sub-populations: 1500 typical students and 500 high achievers.
ATTEND <- c(rnorm(1500, mean = 16), rnorm(500, mean = 19))
GRADE <- c(rnorm(1500, mean = 83, sd = 4), rnorm(500, mean = 92, sd = 2))

# NOTE(review): the original definition of BOOKS was lost. The plots below
# require a discrete variable with at most three levels (three manual
# colours are supplied) and 2000 values; this split is an assumption --
# confirm against the author's original.
BOOKS <- factor(rep(c(1, 2, 3), times = c(750, 750, 500)))

data <- data.frame(ATTEND, GRADE, BOOKS)

# We have books read, classes attended, and grade.
# We're interested in predicting student grades from
# the number of books they've read, and the number
# of classes they attended.

## Graphing with ggplot involves a process of specifying layers of
## graphic objects.

# Histograms ----

# Let's start with a histogram
# What does the grade distribution look like?
ggplot(data, aes(GRADE)) +
  # The first step is to identify the data you want to graph.
  # In this case, we want to graph the variable GRADE, which can be
  # found in data
  geom_histogram()
# Now, we need to tell ggplot2 how we want the data represented.

# That's not very nice. Let's eliminate the black fill:
grade_hist <- ggplot(data, aes(GRADE)) +
  geom_histogram(fill = NA, color = "black")
grade_hist

# Let's add some labels:
grade_hist + labs(x = "Grades", y = "Number of Grades")

# Density plots ----

# We can also display grades in a density plot:
d1 <- ggplot(data, aes(GRADE)) +
  geom_density()
d1

# Color?
# Note: a constant inside aes() is treated as data, so this maps the single
# value "red" through the default fill scale (and draws a legend) rather
# than painting literal red. Use geom_density(fill = "red") for a fixed
# colour.
d1 <- ggplot(data, aes(GRADE, fill = "red")) +
  geom_density()
d1

# Now, what if we want to look at the density distribution of grades
# broken down into categories of number of books read?
d2 <- ggplot(data, aes(GRADE, fill = factor(BOOKS))) +
  geom_density(alpha = 0.2)
d2

# We probably don't need the legend...
d2 <- ggplot(data, aes(GRADE, fill = "red")) +
  geom_density(alpha = 0.2) +
  theme(legend.position = "none")
d2

# Let's look at grades by books read:
# Here, we tell ggplot we want different colors for our factor BOOKS
b1 <- ggplot(data, aes(GRADE)) +
  geom_density(aes(fill = factor(BOOKS)))
b1

# Bar plots ----

# What about barplots?
ggplot(data, aes(x = factor(BOOKS), y = GRADE)) +
  geom_bar(stat = "identity")
# What happened? Why are the grades so high?
# It looks like we have summed grades, which is not particularly
# interesting.

# Let's use stat_summary to tell ggplot we want the mean grade for each
# level of book read.
# (`fun` replaces the deprecated `fun.y` argument.)
b1 <- ggplot(
  data,
  aes(x = factor(BOOKS), y = GRADE, fill = factor(BOOKS))
) +
  stat_summary(fun = mean, geom = "bar") +
  scale_fill_manual(values = c("purple", "orange", "red"))
b1

# Box plots, jitter plots, and violins ----

b1 <- ggplot(data, aes(BOOKS, GRADE)) +
  geom_boxplot(aes(fill = BOOKS))
b1

# jitter plot
b2 <- ggplot(data, aes(BOOKS, GRADE)) +
  geom_jitter(alpha = 1 / 4, aes(color = BOOKS)) +
  theme(legend.position = "none")
b2

# violin plot
# Built by hand: mirror the density estimate around zero as a ribbon, one
# facet per level of BOOKS, then flip so GRADE runs vertically.
# (`after_stat(density)` replaces the deprecated `..density..` notation.)
b3 <- ggplot(data, aes(x = GRADE)) +
  stat_density(
    aes(
      ymax = after_stat(density),
      ymin = -after_stat(density),
      fill = BOOKS,
      color = BOOKS
    ),
    geom = "ribbon",
    position = "identity"
  ) +
  facet_grid(. ~ BOOKS) +
  coord_flip() +
  theme(legend.position = "none")
b3

# jitter plot + boxplot
b4 <- ggplot(data, aes(BOOKS, GRADE)) +
  geom_jitter(alpha = 1 / 4, aes(color = BOOKS)) +
  geom_boxplot(aes(fill = BOOKS)) +
  theme(legend.position = "none")
b4

# Let's display them all together:
grid.arrange(b1, b2, b3, b4, nrow = 1)

# Scatterplots ----

# Alright, how about scatterplots?
# First we need to declare the data we want to graph.
# Let's look at grades by attendance:
s1 <- ggplot(data, aes(x = ATTEND, y = GRADE))
s2 <- s1 + geom_point() # add a layer of plot points
s2

# Let's use BOOKS to color the points:
s2 <- s1 +
  geom_point(aes(color = factor(BOOKS))) +
  scale_color_manual(values = c("purple", "orange", "red"))
s2

# Let's add a regression line
s2 +
  geom_smooth(method = "lm") +
  facet_grid(. ~ BOOKS)

# How about regression lines for each level of books?
s2 + geom_smooth(method = "lm", aes(color = factor(BOOKS)))

# No error bars
ggplot(data, aes(x = ATTEND, y = GRADE)) +
  geom_point(aes(color = factor(BOOKS))) +
  scale_color_manual(values = c("purple", "orange", "red")) +
  geom_smooth(method = "lm", se = FALSE, aes(color = factor(BOOKS)))

## Kind of hard to see, let's plot the regression lines without the data
## points:
# No error bars
ggplot(data, aes(x = ATTEND, y = GRADE)) +
  scale_color_manual(values = c("purple", "orange", "red")) +
  geom_smooth(method = "lm", se = FALSE, aes(color = factor(BOOKS)))

# Combining graphs ----

# Let's plot several different graphs together.
# We're going to use grid.arrange to do this.
# First, however, we need to create an empty placeholder plot:

# placeholder plot - prints nothing at all
# NOTE(review): the expression before theme() was lost; a single white
# point on an empty ggplot() is an assumed reconstruction -- confirm.
empty <- ggplot() +
  geom_point(aes(1, 1), colour = "white") +
  theme(
    plot.background = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank()
  )

# NOTE(review): the ggplot(...) calls opening `scatter`, `plot_top` and
# `plot_right` were lost. x = ATTEND / y = GRADE is assumed, matching s1
# above and the axis breaks set further down -- confirm.

# scatterplot of x and y variables
# legend.position / legend.justification of c(1, 1) anchor the legend's
# top-right corner to the top-right of the panel. Newer ggplot2 releases
# deprecate the numeric form of legend.position; it is kept here so the
# script also runs on older versions.
scatter <- ggplot(data, aes(ATTEND, GRADE)) +
  geom_point(aes(color = BOOKS)) +
  scale_color_manual(values = c("orange", "purple", "red")) +
  theme(legend.position = c(1, 1), legend.justification = c(1, 1))

# marginal density of x - plot on top
plot_top <- ggplot(data, aes(ATTEND, fill = BOOKS)) +
  geom_density(alpha = .5) +
  scale_fill_manual(values = c("orange", "purple", "red")) +
  theme(legend.position = "none")

# marginal density of y - plot on the right
plot_right <- ggplot(data, aes(GRADE, fill = BOOKS)) +
  geom_density(alpha = .5) +
  coord_flip() +
  scale_fill_manual(values = c("orange", "purple", "red")) +
  theme(legend.position = "none")
plot_right

# arrange the plots together, with appropriate height and width for each
# row and column
grid.arrange(
  plot_top, empty, scatter, plot_right,
  ncol = 2, nrow = 2, widths = c(4, 1), heights = c(1, 4)
)

# Same layout again, this time with explicit breaks on the density axes
# of the two marginal plots.

# scatterplot of x and y variables
scatter <- ggplot(data, aes(ATTEND, GRADE)) +
  geom_point(aes(color = BOOKS)) +
  scale_color_manual(values = c("orange", "purple", "red")) +
  theme(legend.position = c(1, 1), legend.justification = c(1, 1))

# marginal density of x - plot on top
plot_top <- ggplot(data, aes(ATTEND, fill = BOOKS)) +
  geom_density(alpha = .5) +
  scale_fill_manual(values = c("orange", "purple", "red")) +
  theme(legend.position = "none") +
  scale_y_continuous(breaks = c(0.0, .2, .4)) # Define breaks for y-axis

# marginal density of y - plot on the right
plot_right <- ggplot(data, aes(GRADE, fill = BOOKS)) +
  geom_density(alpha = .5) +
  coord_flip() +
  scale_fill_manual(values = c("orange", "purple", "red")) +
  theme(legend.position = "none") +
  scale_y_continuous(breaks = c(0.0, .1, .2)) # Define breaks for y-axis

# arrange the plots together, with appropriate height and width for each
# row and column
grid.arrange(
  plot_top, empty, scatter, plot_right,
  ncol = 2, nrow = 2, widths = c(4, 1), heights = c(1, 4)
)

##############################################
#######################RESOURCES##############
```

```r
## Much of the code above was adapted from this excellent blog:
## http://rforpublichealth.blogspot.com/
## Other useful sites:
## sape.inf.usi.ch/quick-reference/ggplot2
## www.cookbook-r.com/Graphs
## docs.ggplot2.org
```