Visualise all the things
viridis
package to set colours
# libraries needed for these graphs
library(tidyverse)
library(viridis)
library(plotly)
# cowplot will change the default theme of graphs, so we're loading it later
# library(cowplot)
Before you read ahead, come up with an example of each type of variable combination and sketch the types of graphs that would best display these data.
Here we’ve created some data frames with different types of data.
pets has a column with pet type.
demog has height and age for 500 men and 500 women.
x_vs_y has two correlated continuous variables (x and y).
overlap has two correlated ordinal variables and 1000 observations, so there is a lot of overlap.
overplot has two correlated continuous variables and 10000 observations.
First, think about what kinds of graphs are best for representing these different types of data.
# 100 pets drawn with replacement; dogs and cats are far more likely
# than ferrets, birds or fish
pets <- tibble(
  pet = sample(
    x = c("dog", "cat", "ferret", "bird", "fish"),
    size = 100,
    replace = TRUE,
    prob = c(0.45, 0.40, 0.05, 0.05, 0.05)
  )
)
# 500 men followed by 500 women, each with a simulated height and age
demog <- tibble(
  sex = rep(c("male", "female"), each = 500),
  height = c(
    rnorm(500, mean = 70, sd = 4),   # male heights
    rnorm(500, mean = 65, sd = 3.5)  # female heights
  ),
  age = 20 + rpois(1000, lambda = 3)
)
# 100 observations; y is x plus normally distributed noise
x_vs_y <- tibble(
  x = rnorm(n = 100),
  y = x + rnorm(n = 100, mean = 0, sd = 0.5)
)
# 1000 observations of two correlated integer-valued variables,
# so many (x, y) pairs coincide
overlap <- tibble(
  x = rbinom(n = 1000, size = 10, prob = 0.5),
  y = x + rbinom(n = 1000, size = 20, prob = 0.5)
)
# 10000 observations of two correlated continuous variables
overplot <- tibble(
  x = rnorm(n = 10000),
  y = x + rnorm(n = 10000, mean = 0, sd = 0.5)
)
Bar plots are good for categorical data where you want to represent the count.
# Bar plot: one bar per pet type, bar height is the number of rows
pets %>%
  ggplot(mapping = aes(x = pet)) +
  geom_bar()
Density plots are good for one continuous variable, but only if you have a fairly large number of observations.
# Smoothed distribution of height across all participants
demog %>%
  ggplot(mapping = aes(x = height)) +
  geom_density()
You can represent subsets of a variable by assigning the category variable to the argument group
, fill
, or color
.
# One semi-transparent density curve per sex, overlaid
demog %>%
  ggplot(aes(x = height, fill = sex)) +
  geom_density(alpha = 0.5)
Try changing the alpha
argument to figure out what it does.
If you don’t want smoothed distributions, try geom_freqpoly()
.
# Unsmoothed alternative: a frequency polygon per sex, 1-unit bins
demog %>%
  ggplot(aes(x = height, colour = sex)) +
  geom_freqpoly(binwidth = 1)
Try changing the binwidth
argument to 5 and 0.1. How do you figure out the right value?
Histograms are also good for one continuous variable, and work well if you don’t have many observations. Set the binwidth
to control how wide each bar is.
# Histogram of height with 1-unit bins, white bars outlined in black
demog %>%
  ggplot(aes(x = height)) +
  geom_histogram(
    binwidth = 1,
    fill = "white",
    colour = "black"
  )
If you show grouped histograms, you also probably want to change the default position
argument.
# Grouped histogram: the bars for each sex are placed side by side
demog %>%
  ggplot(aes(x = height, fill = sex)) +
  geom_histogram(
    binwidth = 1,
    alpha = 0.5,
    position = "dodge"
  )
Try changing the position
argument to “identity”, “fill”, “dodge”, and “stack”.
Boxplots are great for representing the distribution of grouped continuous variables. They fix most of the problems with using barplots for continuous data.
# Boxplot of height for each sex
demog %>%
  ggplot(aes(x = sex, y = height, fill = sex)) +
  geom_boxplot(alpha = 0.5)
Violin plots are like sideways, mirrored density plots. They give even more information than a boxplot about distribution and are especially useful when you have non-normal distributions.
# Violin plot of height for each sex; untrimmed tails, with lines drawn
# at the 25%, 50% and 75% quantiles
demog %>%
  ggplot(aes(x = sex, y = height, fill = sex)) +
  geom_violin(
    alpha = 0.5,
    trim = FALSE,
    draw_quantiles = c(0.25, 0.5, 0.75)
  )
Try changing the numbers in the draw_quantiles
argument.
# Violin plot of height by sex with the group mean +/- 1 SD overlaid
# as a point-range.
ggplot(demog, aes(sex, height, fill = sex)) +
  geom_violin(
    trim = FALSE,
    alpha = 0.5
  ) +
  stat_summary(
    # fun.data is given the y values of one group and should return a
    # data frame with the columns y, ymin and ymax
    fun.data = function(x) {
      m <- mean(x)
      s <- sd(x)  # named `s` so the sd() function is not shadowed
      data.frame(
        y = m,
        ymin = m - s,
        ymax = m + s
      )
    },
    geom = "pointrange"
  )
To demonstrate the use of facet_grid()
for factorial designs, I created a new column called agegroup
to split the data into participants older than the median age or younger than the median age.
# Violin + boxplot of height by sex, in one panel per age group.
# Participants below the median age are "Younger"; everyone else is "Older".
demog %>%
  mutate(
    agegroup = ifelse(age >= median(age), "Older", "Younger")
  ) %>%
  ggplot(aes(x = sex, y = height, fill = sex)) +
  geom_violin(
    trim = FALSE,
    alpha = 0.5,
    show.legend = FALSE
  ) +
  geom_boxplot(
    width = 0.25,
    fill = "white"
  ) +
  facet_grid(. ~ agegroup) +
  scale_fill_manual(values = c("orange", "green"))
Set the show.legend
argument to FALSE
to hide the legend. We do this here because the x-axis already labels the sexes.
If you don’t have a lot of data points, it’s good to represent them individually. You can use geom_point
to do this, setting position
to “jitter”.
# Violin plot of a random subset of 50 participants, with every
# individual observation drawn on top as a jittered point
demog %>%
  sample_n(50) %>%
  ggplot(aes(x = sex, y = height, fill = sex)) +
  geom_violin(
    alpha = 0.5,
    trim = FALSE,
    draw_quantiles = c(0.25, 0.5, 0.75)
  ) +
  geom_jitter(alpha = 0.7, size = 3)
Column plots are the worst way to represent grouped continuous data, but also one of the most common.
To make column plots with error bars, you first need to calculate the means, error bar upper limits (ymax
) and error bar lower limits (ymin
) for each category. You’ll learn more about how to use the code below in the next two lessons.
# Summarise height to one mean and one SD per sex, then draw the means
# as columns with error bars spanning mean +/- 1 SD
demog %>%
  group_by(sex) %>%
  summarise(mean = mean(height), sd = sd(height)) %>%
  ggplot(aes(x = sex, y = mean, fill = sex)) +
  geom_bar(stat = "identity", alpha = 0.5) +
  geom_errorbar(
    aes(ymin = mean - sd, ymax = mean + sd),
    width = 0.25
  ) +
  geom_hline(yintercept = 40)
What do you think geom_hline()
does?
You can save a ggplot using ggsave()
. It saves the last ggplot you made, by default, but you can specify which plot you want to save if you assigned that plot to a variable.
You can set the width
and height
of your plot. The default units are inches, but you can change the units
argument to “in”, “cm”, or “mm”.
# Build two plots and keep them in variables so they can be saved later.
demog_box <- ggplot(demog, aes(sex, height, fill=sex)) +
geom_boxplot(alpha = 0.5)
demog_violin <- ggplot(demog, aes(sex, height, fill=sex)) +
geom_violin(alpha = 0.5)
# No `plot` argument: ggsave() falls back to the last ggplot made
# (presumably demog_violin, defined just above -- this depends on
# statement order, so do not reorder these lines).
ggsave("demog_violin_plot.png", width = 5, height = 7)
# Explicit `plot` argument: saves demog_box regardless of which plot was
# made last. The file type is taken from the filename suffix.
ggsave("demog_box_plot.jpg", plot = demog_box, width = 5, height = 7)
You can use the cowplot
package to easily make grids of different graphs. First, you have to assign each plot a name. Then you list all the plots as the first arguments of plot_grid()
and provide a list of labels.
You can get back the default ggplot theme by running theme_set(theme_grey())
.
library(cowplot)
# Dodged histogram of height by sex, without a legend
my_hist <- demog %>%
  ggplot(aes(x = height, fill = sex)) +
  geom_histogram(
    position = "dodge",
    binwidth = 1,
    alpha = 0.5,
    show.legend = FALSE
  )
# Violin plot of height by sex with a line at the median, without a legend
my_violin <- demog %>%
  ggplot(aes(x = sex, y = height, fill = sex)) +
  geom_violin(
    alpha = 0.5,
    trim = FALSE,
    draw_quantiles = c(0.5),
    show.legend = FALSE
  )
# Boxplot of height by sex, without a legend
my_box <- demog %>%
  ggplot(aes(x = sex, y = height, fill = sex)) +
  geom_boxplot(show.legend = FALSE, alpha = 0.5)
# Overlaid density curves of height by sex, without a legend
my_density <- demog %>%
  ggplot(aes(x = height, fill = sex)) +
  geom_density(show.legend = FALSE, alpha = 0.5)
# Column plot of mean height per sex with mean +/- 1 SD error bars,
# without a legend
my_bar <- demog %>%
  group_by(sex) %>%
  summarise(mean = mean(height), sd = sd(height)) %>%
  ggplot(aes(x = sex, y = mean, fill = sex)) +
  geom_col(alpha = 0.5, show.legend = FALSE) +
  geom_errorbar(
    aes(ymin = mean - sd, ymax = mean + sd),
    width = 0.25
  )
# Arrange four of the plots in one figure, labelled A to D in the order given
plot_grid(
  my_violin, my_box, my_density, my_bar,
  labels = c("A", "B", "C", "D")
)
Once you load the cowplot package, your ggplot default theme will change.
Scatter plots are a good way to represent the relationship between two continuous variables.
# Scatter plot: one point per observation
x_vs_y %>%
  ggplot(aes(x = x, y = y)) +
  geom_point()
You often want to represent the relationship as a single line.
# Linear-model fit of y on x, drawn as a single line
x_vs_y %>%
  ggplot(aes(x = x, y = y)) +
  geom_smooth(method = "lm")
If your graph isn’t too complicated, it’s good to also show the individual data points behind the line.
# Faint individual points with the linear-model fit drawn over them
x_vs_y %>%
  ggplot(aes(x = x, y = y)) +
  geom_point(alpha = 0.25) +
  geom_smooth(method = "lm")
You can deal with overlapping data points (very common if you’re using Likert scales) by reducing the opacity of the points. You need to use trial and error to adjust these so they look right.
# Large, nearly transparent points: darker spots mean more observations
# share that (x, y) pair
overlap %>%
  ggplot(aes(x = x, y = y)) +
  geom_point(alpha = 0.05, size = 5) +
  geom_smooth(method = "lm")
Or you can set the size of the dot proportional to the number of overlapping observations using geom_count()
.
# Dot size reflects how many observations share each (x, y) pair
ggplot(overlap, aes(x = x, y = y)) +
  geom_count(colour = "#663399")
Alternatively, you can transform your data to create a count column and use the count to set the dot colour.
# Count the observations at each (x, y) pair and map that count to colour
overlap %>%
  group_by(x, y) %>%
  summarise(count = n()) %>%
  ggplot(mapping = aes(x = x, y = y, colour = count)) +
  geom_point(size = 5) +
  scale_color_viridis()
The viridis package changes the colour themes to be easier to read by people with colourblindness and to print better in greyscale. Use scale_color_viridis()
to set the colour palette and scale_fill_viridis()
to set the fill palette in ggplot. If you need discrete (as opposed to continuous) colours, use scale_color_viridis(discrete=TRUE)
or scale_fill_viridis(discrete=TRUE)
instead.
Even if the variables are continuous, overplotting might obscure any relationships if you have lots of data.
# Plain scatter plot of all 10000 points (heavily overplotted)
ggplot(overplot, aes(x = x, y = y)) +
  geom_point()
Use geom_density2d()
to create a contour map.
# Contour map of the 2D density of the points
ggplot(overplot, aes(x = x, y = y)) +
  geom_density2d()
You can use stat_density_2d(aes(fill = ..level..), geom = "polygon")
to create a heatmap-style density plot.
# Filled 2D density: each contour band is a polygon coloured by its level
ggplot(overplot, aes(x = x, y = y)) +
  stat_density_2d(
    mapping = aes(fill = ..level..),
    geom = "polygon"
  ) +
  scale_fill_viridis()
Use geom_bin2d()
to create a rectangular heatmap of bin counts. Set the binwidth
to the x and y dimensions to capture in each box.
# Rectangular heatmap of counts in bins 1 unit wide and 1 unit tall
ggplot(overplot, aes(x = x, y = y)) +
  geom_bin2d(binwidth = c(1, 1))
Use geom_hex()
to create a hexagonal heatmap of bin counts. Adjust the binwidth
, xlim()
, ylim()
and/or the figure dimensions to make the hexagons more or less stretched.
# Hexagonal heatmap of counts in bins of 0.25 x 0.25
ggplot(overplot, aes(x = x, y = y)) +
  geom_hex(binwidth = c(0.25, 0.25))
I’ve included the code for creating a correlation matrix from a table of variables, but you don’t need to understand how this is done yet. We’ll cover mutate
and gather
functions in the dplyr
and tidyr
lessons.
# Simulate two families of correlated variables (a1-a4 and b1-b4):
# within a family, variables 2-4 are the first variable plus noise.
# Then reshape their correlation matrix to long format:
# one row per (V1, V2) pair with the correlation in r.
heatmap <- tibble(a1 = rnorm(100), b1 = rnorm(100)) %>%
  mutate(
    a2 = a1 + rnorm(100),
    a3 = a1 + rnorm(100),
    a4 = a1 + rnorm(100),
    b2 = b1 + rnorm(100),
    b3 = b1 + rnorm(100),
    b4 = b1 + rnorm(100)
  ) %>%
  cor() %>%                            # correlation matrix
  as.data.frame() %>%                  # matrix -> data frame
  rownames_to_column(var = "V1") %>%   # row names become column V1
  gather(key = "V2", value = "r", a1:b4)  # wide -> long
Once you have a correlation matrix in the correct (long) format, it’s easy to make a heatmap using geom_tile()
.
# One tile per variable pair, filled by the correlation r
heatmap %>%
  ggplot(aes(x = V1, y = V2, fill = r)) +
  geom_tile() +
  scale_fill_viridis()
The file type is set from the filename suffix, or by specifying the argument device
, which can take the following values: “eps”, “ps”, “tex”, “pdf”, “jpeg”, “tiff”, “png”, “bmp”, “svg” or “wmf”.
You can use the plotly
package to make interactive graphs. Just assign your ggplot to a variable and use the function ggplotly()
.
# Height against age, jittered horizontally only, then converted to an
# interactive plotly graph
demog_plot <- demog %>%
  ggplot(aes(x = age, y = height, fill = sex)) +
  geom_point(
    size = 2,
    position = position_jitter(width = 0.2, height = 0)
  )
ggplotly(demog_plot)
Hover over the data points above and click on the legend items.
Generate a violin plot, boxplot, histogram, density plot, and column plot for the following data.
# dog weights estimated from http://petobesityprevention.org/ideal-weight-ranges/
# 100 simulated weights per breed, in the order beagle, boxer, bulldog
dogs <- tibble(
  breed = rep(c("beagle", "boxer", "bulldog"), each = 100),
  weight = c(
    rnorm(100, mean = 24, sd = 6),
    rnorm(100, mean = 62.5, sd = 12.5),
    rnorm(100, mean = 45, sd = 5)
  )
)
Basic: Create each plot.
Intermediate: Change the axis labels and colours. Save each plot as a PNG file.
Advanced: Create a grid of different plot styles. In your RMarkdown file, display just the graph, but not the r
code for the graph.
# Violin plot of weight per breed with a line at the median
dogs %>%
  ggplot(aes(x = breed, y = weight, fill = breed)) +
  geom_violin(
    draw_quantiles = c(0.5),
    show.legend = FALSE
  )
# Boxplot of weight per breed
dogs %>%
  ggplot(aes(x = breed, y = weight, fill = breed)) +
  geom_boxplot(show.legend = FALSE)
# Dodged histogram of weight per breed, 3-unit bins
dogs %>%
  ggplot(aes(x = weight, fill = breed)) +
  geom_histogram(position = "dodge", binwidth = 3)
# Overlaid density curves of weight per breed, x-axis limited to 0-110
dogs %>%
  ggplot(aes(x = weight, fill = breed)) +
  geom_density(alpha = 0.5) +
  xlim(0, 110)
# Mean weight per breed as columns, with mean +/- 1 SD error bars
dogs %>%
  group_by(breed) %>%
  summarise(mean = mean(weight), sd = sd(weight)) %>%
  ggplot(aes(x = breed, y = mean, fill = breed)) +
  geom_bar(stat = "identity", alpha = 0.5) +
  geom_errorbar(
    aes(ymin = mean - sd, ymax = mean + sd),
    width = 0.25
  )
Represent the relationships among moral, sexual and pathogen disgust scores from the dataset disgust_scores.csv.
Basic: Graph the linear relationship between moral and pathogen disgust. Make sure the axes run from the minimum to maximum possible scores on both axes. Give the graph an appropriate title and axis labels.
# Load the disgust scores, then plot the linear fit of pathogen disgust
# on moral disgust with both axes limited to 0-6
disgust <- read_csv("data/disgust_scores.csv")
disgust %>%
  ggplot(aes(x = moral, y = pathogen)) +
  geom_smooth(method = "lm") +
  ggtitle("Moral vs Pathogen") +
  xlab("Moral Disgust Score") +
  ylab("Pathogen Disgust Score") +
  xlim(0, 6) +
  ylim(0, 6)
Intermediate: Create a 2d density plot of the relationship between pathogen and sexual disgust.
Use stat_density_2d(aes(fill = ..level..), geom = "polygon", n = ?, h = c(?, ?))
, set n and h to values that make the graph look good, and figure out what n
and h
represent.
# n changes the smoothness of the contours (number of grid points in each direction)
# h changes how densely the contours are drawn (bandwidth)
# Filled 2D density of sexual against pathogen disgust.
ggplot(disgust, aes(pathogen, sexual)) +
  stat_density_2d(
    aes(fill = ..level..),
    geom = "polygon",
    n = 100,
    h = c(1, 1)
  ) +
  scale_fill_viridis()
Advanced: Create a 3x3 grid of plots with columns representing the x-axis and rows representing the y-axis. Put a density plot of each variable along the diagonal. Make sure the graphs have appropriate titles and axis labels and that the range of the axes are the same in all graphs.
| | moral | sexual | pathogen |
|---|---|---|---|
| moral | density | line | line |
| sexual | line | density | line |
| pathogen | line | line | density |
# Off-diagonal panels: a smoothed line for each ordered pair of disgust
# scores. Objects are named <x variable>_<y variable>; both axes run 0-6.
moral_sexual <- ggplot(disgust, aes(moral, sexual)) +
  geom_smooth() +
  labs(title = "Moral vs Sexual") +
  xlim(0, 6) + ylim(0, 6)
moral_pathogen <- ggplot(disgust, aes(moral, pathogen)) +
  geom_smooth() +
  labs(title = "Moral vs Pathogen") +
  xlim(0, 6) + ylim(0, 6)
pathogen_moral <- ggplot(disgust, aes(pathogen, moral)) +
  geom_smooth() +
  labs(title = "Pathogen vs Moral") +
  xlim(0, 6) + ylim(0, 6)
pathogen_sexual <- ggplot(disgust, aes(pathogen, sexual)) +
  geom_smooth() +
  labs(title = "Pathogen vs Sexual") +
  xlim(0, 6) + ylim(0, 6)
sexual_moral <- ggplot(disgust, aes(sexual, moral)) +
  geom_smooth() +
  labs(title = "Sexual vs Moral") +
  xlim(0, 6) + ylim(0, 6)
sexual_pathogen <- ggplot(disgust, aes(sexual, pathogen)) +
  geom_smooth() +
  labs(title = "Sexual vs Pathogen") +
  xlim(0, 6) + ylim(0, 6)

# Diagonal panels: the density of each disgust score, x-axis 0-6.
moral_moral <- ggplot(disgust, aes(moral)) +
  geom_density() +
  labs(title = "Moral Disgust") +
  xlim(0, 6)
sexual_sexual <- ggplot(disgust, aes(sexual)) +
  geom_density() +
  labs(title = "Sexual Disgust") +
  xlim(0, 6)
pathogen_pathogen <- ggplot(disgust, aes(pathogen)) +
  geom_density() +
  labs(title = "Pathogen Disgust") +
  xlim(0, 6)

# Plots are listed row by row: each row shares a y-axis variable
# (moral, sexual, pathogen) and each column shares an x-axis variable.
plot_grid(
  moral_moral, sexual_moral, pathogen_moral,
  moral_sexual, sexual_sexual, pathogen_sexual,
  moral_pathogen, sexual_pathogen, pathogen_pathogen,
  ncol = 3
)