# Clear the memory from any pre-existing objects
# (removes every object in the current environment; loaded packages are unaffected)
rm(list=ls())
# Load packages
library(tidyverse)
library(magrittr)
library(ggplot2)
library(haven)
library(tidyverse)
# Import dataset
# read_dta() comes from haven and reads a Stata .dta file; the path is
# relative to the current working directory.
fake_data <- read_dta("../econ490-r/fake_data.dta")
09 - Combining Graphs
Prerequisites
- Load data and packages.
- Create variables and objects.
- Have some familiarity with the syntax of commands to create basic graphs.
Learning Outcomes
- Identify best practices for data visualization.
- Feel comfortable with combining graphs using facets in `ggplot2`.
9.0 Intro
We’ll continue working with the fake data set we have been using so far. Recall that this data set is simulating information for workers in the years 1982-2012 in a fake country where a training program was introduced in 2003 to boost their earnings.
In this module, we will work on two examples. The first example covers combining two graphs with the same schema, while the second covers combining two graphs with different schemas. It will soon be very clear what we mean by schema.
9.1 Example 1
For this example, we want to generate two graphs with the same schema (they are the same type of graph and use the same variables as their x and y axis). For example, let’s say we want to see the evolution of average earnings over time for treated and untreated workers in two different regions. Instead of having four lines in one graph, we would like to separate the two regions in two different panels of the same graph.
Let’s do this step by step. We start by creating the data: we want a data frame with average log-earnings by year, region, and treatment status for the first two regions. We use `group_by`, as seen in Module 6.
# Build the data for Figure 1: average log-earnings for every combination of
# year, region, and treatment status, restricted to regions 1 and 2.
figure1_data <- fake_data %>%
  mutate(log_earnings = log(earnings)) %>% # take log of earnings
  group_by(year, region, treated) %>% # group by time, region, and treatment status
  # take average by group; .groups = "drop" returns an ungrouped data frame
  summarise(mean_earnings = mean(log_earnings), .groups = "drop") %>%
  filter(region == 1 | region == 2) %>% # keep only first two regions
  # create a character variable for treatment (used for the legend)
  mutate(treatment = case_when(treated == 1 ~ "Treated", treated == 0 ~ "Untreated"))
Once we have created our data, we proceed with the same steps we have used in section 8.2.2 of Module 8 to create a line plot with one line for each treatment status (treated and untreated).
In this case, we need to add a crucial component: `facet_grid`. This allows us to split up our data by one or two variables that vary in the horizontal and/or vertical direction. The syntax is `facet_grid(vertical ~ horizontal)`.
In the code below, we split our data vertically for the two regions by adding `facet_grid(region ~ .)` to our code.
# Specify data and axis
figure1 <- ggplot(
  data = figure1_data, # referencing the data we want to use
  mapping = aes(
    x = year, # x is year
    y = mean_earnings, # our y is avg log-earnings
    group = treatment, # each line is data for one value of treatment
    color = treatment # each value of treatment as one color
  )
)
# Tell R the graph will be a line graph
figure1 <- figure1 + geom_line()
# Add labels
figure1 <- figure1 + labs(x = "Year", y = "Average Log-earnings")
# "Split" the graph vertically by region: one panel (row) per value of region.
# The result is not assigned, so the faceted plot is printed.
figure1 + facet_grid(region ~ .)
Notice that our graph is now made of two panels whose titles are the region codes. However, we do not know what region 1 and region 2 mean. We can add a character variable to our data, named region_name, and split the graph horizontally by the two names stored in region_name.
# Add names to regions
figure1_data <- figure1_data %>%
  mutate(region_name = case_when(region == 1 ~ "Ontario", region == 2 ~ "Manitoba"))
# Specify data and axis
figure1 <- ggplot(
  data = figure1_data, # referencing the data we want to use
  mapping = aes(
    x = year, # x is year
    y = mean_earnings, # our y is avg log-earnings
    group = treatment, # each line is data for one value of treatment
    color = treatment # each value of treatment as one color
  )
)
# Tell R the graph will be a line graph
figure1 <- figure1 + geom_line()
# Add labels
figure1 <- figure1 + labs(x = "Year", y = "Average Log-earnings")
# Split horizontally and use labels: one panel (column) per region_name.
# The result is not assigned, so the faceted plot is printed.
figure1 + facet_grid(. ~ region_name)
We can also add a vertical line for year 2003, the year in which the treatment was introduced. More information about how to do this is available in Module 8.
# Add dashed red vertical line at 2003, the year the training program was introduced.
# xintercept is a constant, so it is passed directly rather than inside aes().
figure1 <- figure1 + geom_vline(xintercept = 2003, color = "#bb0000", linetype = "dashed")
# Split horizontally by region_name: one panel (column) per region.
# The result is not assigned, so the faceted plot is printed.
figure1 + facet_grid(. ~ region_name)
9.2 Example 2
For this example, we want to combine graphs that do not follow the same schema. Let’s say we are interested in seeing if there is any relationship between the distribution of earnings (log_earnings) and how workers’ earnings change over time in region 1. Which graphs do you think would best present this information?
As we have seen in Module 8, we usually use histograms to represent density distributions and we can use a scatterplot or a line plot for the graph of earnings over time.
We will now see how to use `grid.arrange` to put multiple graphs on one page. The `grid.arrange` function is stored in the library gridExtra, which we need to install. We do so below.
# Install and load gridExtra
# Uncomment the next line to install the package if it is not installed yet.
#install.packages("gridExtra")
library(gridExtra)
Let’s first create the two plots separately and then combine them together.
Start with the histogram: use `geom_histogram` to create a histogram for the density of log-earnings and store it into the object plot1. You can find a detailed explanation in section 8.2.3 of Module 8.
# Add log earnings to dataset
fake_data <- fake_data %>% mutate(log_earnings = log(earnings))
# Plot 1: histogram of log-earnings
# bins = 30 is the geom_histogram() default; stating it explicitly avoids the
# "stat_bin() using bins = 30" message without changing the plot.
plot1 <- ggplot(data = fake_data, aes(x = log_earnings)) + geom_histogram(bins = 30)
Then, create a line graph for the average log-earnings by year using `geom_line` and store it into the object plot2. You can find a detailed explanation in section 8.2.2 of Module 8.
# Create a dataframe with the average log-earnings by year
# (uses the log_earnings column of fake_data)
plot2_data <- fake_data %>%
  group_by(year) %>%
  summarise(mean_earnings = mean(log_earnings))
# Plot 2: line graph of average log-earnings over time
plot2 <- ggplot(data = plot2_data, aes(x = year, y = mean_earnings)) + geom_line()
Now combine the objects plot1 and plot2 into one single page using the function `grid.arrange`. Notice that we can specify how many columns or rows we want with `ncol` or `nrow`, respectively.
# Draw the histogram and the line graph side by side in a single row
grid.arrange(grobs = list(plot1, plot2), nrow = 1)
9.3 Wrap Up
In this module we learned how to combine graphs using `facet_grid` and `grid.arrange`, whether they have the same schema or not. When producing a research paper we might want to compare statistics from different countries or regions, such as GDP, population density, inflation, exports, etc. These types of graphs allow us to see how the same variables diverge across different categories (as in Example 1) or how different variables influence each other (as in Example 2).
9.4 Wrap-up Table
| Command | Function |
|---|---|
| `facet_grid(vertical ~ horizontal)` | It combines graphs with the same schema into panels, split by one or two variables. |
| `grid.arrange(plot1, plot2, nrow=, ncol=)` | It combines two graphs with the same or different schemas. |