library(tidyverse)
library(viridis)
library(patchwork)
library(gghighlight)
Data vizualisation
Generating a sample dataset and performing basic data exploration centered around M&A deal portfolio. Collapsible code sections are included.
Creating a dataset
Load libraries
Create objects
# Create dates
<- seq.Date(from = as.Date("2014-01-01"), to = Sys.Date(), by = 1)
datesx # Sample random dates and sort in chronologically
set.seed(123)
<- sample(datesx, size = 400, replace = TRUE)
dates <- sort(dates, decreasing = FALSE)
dates
# Create sectors and regions (categorical)
<- sample(x = c("air", "space", "land", "sea", "ground", "radio"), size = 400, replace = TRUE, prob=c(0.1, 0.2, 0.30, 0.2, 0.1, 0.2))
sectors <- sample(x = c("NAM", "LATAM", "EMEA", "APAC", "AUS"), size = 400, replace = TRUE, prob=c(0.1, 0.2, 0.40, 0.2, 0.1))
regions
# Create deal sizes and revenues (numeric)
<- sample(1:1000, size = 400, replace = TRUE)
dealsize <- round(rnorm(400, 20, 5)^2, 0) revenues
Create data frame
# create tibbles by combining vectors
<-
evolution tibble(dates, sectors, regions, dealsize, revenues)
#convert character variables to factors
<-
evolution %>%
evolution mutate_if(sapply(evolution, is.character), as.factor) %>%
print()
# A tibble: 400 × 5
dates sectors regions dealsize revenues
<date> <fct> <fct> <int> <dbl>
1 2014-01-13 radio NAM 285 56
2 2014-02-10 space LATAM 148 148
3 2014-04-27 land LATAM 93 385
4 2014-05-07 radio APAC 269 442
5 2014-05-17 space APAC 717 457
6 2014-05-21 radio EMEA 675 581
7 2014-05-31 ground APAC 951 362
8 2014-06-02 land EMEA 372 680
9 2014-06-09 sea AUS 517 237
10 2014-06-14 radio APAC 56 195
# ℹ 390 more rows
#filter year
<-
evolution %>%
evolution filter(dates > "2015-01-01",
< "2023-01-01") dates
Set theme
theme_set(theme_minimal())
Deal count
Code
<-
plot.bar.vertical.count %>%
evolution ggplot(mapping = aes(x = fct_rev(fct_infreq(sectors)))) +
geom_bar() +
geom_text(aes(label = after_stat(count)),
stat = "count",
vjust = 2.5,
colour = "white") + #add labels
labs(x = "Sectors",
title = "Number of deals by sector")
plot.bar.vertical.count
Code
<-
plot.percent.stacked %>%
evolution ggplot(mapping = aes(x=sectors, fill = regions)) +
geom_bar(position = "fill")
plot.percent.stacked
Code
<-
plot.count.dodge %>%
evolution ggplot(mapping = aes(x=fct_infreq(sectors), fill = fct_infreq(regions))) +
geom_bar(position = "dodge", color = "white")
plot.count.dodge
Highlight variable
Overlay two plots on top of each other to highlight a variable.
Code
<-
plot.bar.horizontal %>%
evolution count(sectors) %>%
mutate(prop = (n/sum(n))) %>%
ggplot(mapping = aes(x = fct_reorder(sectors, prop), y = prop)) +
geom_col(fill = "red3") +
coord_flip() +
gghighlight(sectors == "radio") +
geom_text(aes(label = paste0(round(prop*100,2), "%")),
vjust = .5,
hjust = 1.2,
size = 4,
color = "white") + #add labels
scale_y_continuous(labels = scales::percent) + # axis label in %
labs(x = "Sectors",
title = "Proportion of deals by sector",
subtitle = "By number of deals") #relabel x axis
plot.bar.horizontal
Code
#create base data
<-
evolutioncount %>%
evolution group_by(year = year(dates)) %>%
count(sectors) %>%
mutate(year = year, n = n)
#create highlight data
<-
evolutioncount.filt %>%
evolution filter(sectors == "radio") %>% # with filter
group_by(year = year(dates)) %>%
count(sectors) %>%
mutate(year = year, n = n)
#plot base and highlight data on top of each other
<-
plot.bar.time.highlight ggplot(data = evolutioncount, aes(x = year, y = n)) +
geom_col(fill = "red3") +
geom_text(evolutioncount.filt, mapping = aes(label = n),
vjust = -0.8,
color = "red3") +
gghighlight(sectors == "radio")
plot.bar.time.highlight
Change rate
Using the lag()
function which helps calculating the YoY change.
Code
# Prepare data using lag()
<-
data.change.roll.avg %>%
evolutioncount.filt ungroup() %>% #remember to ungroup()
mutate(lag0 = n,
lag1 = lag(lag0),
yoy.change = (lag0 - lag1),
yoy.changeperc = ((lag0/lag1-1)*100)
%>%
) mutate(yoy.changeperc = round(yoy.changeperc, 1))
#Plot 1
<-
plot.change1 %>%
data.change.roll.avg select(year, yoy.change, yoy.changeperc) %>%
filter(!is.na(yoy.change)) %>%
ggplot(aes(x = year, y = yoy.change)) +
geom_col(aes(fill = yoy.change > 0)) + # conditional formatting with > 0
geom_text(aes(label = yoy.change, y = yoy.change -0.5 * sign(yoy.change)), color = "white", fontface = "bold") + # data labels, use sign() to fit diverging labels, see : https://tinyurl.com/z5n7uk5k or https://tinyurl.com/3uakpz7y
labs(x = NULL) + # remove x axis title
scale_x_continuous(labels = NULL) + # remove x axis tick labels
scale_fill_manual(values = c("#c9191e", "#27a658" )) +
theme(legend.position = "none") # remove legend of col chart
#Plot 2
<-
plot.change2 %>%
data.change.roll.avg select(year, yoy.change, yoy.changeperc) %>%
filter(!is.na(yoy.changeperc)) %>%
ggplot(aes(x = year, y = yoy.changeperc)) +
geom_hline(aes(yintercept = 0), color = "gray", linetype = "dashed") + #highlight x axis
geom_line(color = "blue4") +
labs(x = NULL)
# Combining Plot 1 & 2 using patchwork
/ plot.change2) + plot_layout(ncol = 1, heights = c(2.5, 1)) (plot.change1
Code
# Resources :
# https://tinyurl.com/23yh29hu
Line plots : cumulated sums
Counting the number of deals, and grouping line geoms by color.
Code
<-
plot.line %>%
evolution group_by(sectors) %>%
arrange(dates) %>%
mutate(count = row_number()) %>%
ggplot(mapping = aes(x = dates, y = count, color = sectors)) +
geom_line(linewidth = 0.8)
plot.line
Without gghighlight
Now doing two things on this line plot :
Highlighting one line without using the
gghighlight
package.Graphing cumulated dealsize, instead of count previously. We use the function
cumsum()
Code
#create background line geoms (will be in grey)
<-
data.evolutionback %>%
evolution group_by(sectors) %>%
arrange(dates) %>%
mutate(evol = cumsum(dealsize))
#create the highlighted line (will be in red)
<- data.evolutionback %>%
data.evolutionfilt filter(sectors == "radio")
#plot
<- ggplot(data = data.evolutionback, mapping = aes(x = dates, y = evol, group = sectors)) +
plot.line.highlight geom_line(linewidth = 0.8,
color = "gray85") + #main lines in gray
geom_line(data = data.evolutionfilt,
linewidth = 0.8,
color = "red") + #highlight line in red
#add a dot at max value of line (max value of evol)
geom_point(data = data.evolutionfilt %>% filter(data.evolutionfilt$evol == max(evol)),
color = "red",
size = 2) +
#add text label to end of line
geom_text(data = data.evolutionfilt %>% filter(dates == last(dates)),
mapping = aes(label = sectors),
color = "red",
hjust = -0.2,
vjust = 0.1) +
# edit plot axis labels
labs(title = "Cumulated value over time",
subtitle = "Radio segment",
x = "Time",
y = "Cumulated deal value")
plot.line.highlight
To allow labels to bleed past the canvas boundaries we could use coord_cartesian(clip = 'off')
Adding a trend line with geom_smooth
Code
<-
plot.line.highlight +
plot.line.highlight # geom smooth trend line
geom_smooth(group = 1, color = "black",
linetype = "dashed",
linewidth = 0.4, se = FALSE,
method = "loess",
formula = y ~ x)
plot.line.highlight
Code
# Prepare data to get proportion per sector and per year
<-
data.line.percent %>%
evolution mutate(year = year(dates)) %>% #get year name
group_by(year, sectors) %>%
summarise(n = length(dates)) %>% #get length of vectors containing dates (a.k.a "number of dates"; for each sectors in each year)
group_by(year) %>%
mutate(sumperyear = sum(n),
prop = (n/sumperyear*100)) #sum the total number of dates per year and get prop of "each sectors for each year" compared to total number of all sectors in a year
# Filter data for highlight
<- data.line.percent %>% filter(sectors == "radio")
data.line.percent.filt
<-
plot.line.percent.highlight ggplot(data = data.line.percent, mapping = aes(x = year, y = prop, group = sectors)) +
geom_line(color = "gray80") +
geom_line(data = data.line.percent.filt,
color = "red3",
linewidth = 1) +
geom_point(data = data.line.percent.filt %>% filter(year == max(data.line.percent.filt$year)),
color = "red3") +
geom_text(data = data.line.percent.filt %>% filter(year == max(data.line.percent.filt$year)),
mapping = aes(label = paste0(round(prop), "", "%"),
hjust = 0.5,
vjust = -1,
color = "red3",
fontface = "bold")) +
theme(legend.position = "none") +
scale_y_continuous(limits = c(0,100)) +
labs(title = "Proportion per year",
subtitle = "Radio sector",
x = "Year",
y = "Share (%)")
plot.line.percent.highlight
With gghighlight
Code
+
plot.line gghighlight(sectors == "space")
With facets + gghighlight
Code
# No need for gghighlight here
+
plot.line facet_wrap(~sectors)
Code
# Highlighting segments with gghighlight
+
plot.line gghighlight(label_params = list(size = 3)) + #adjust size of labels
facet_wrap(~sectors) +
theme(
strip.text.x = element_blank() #remove titles from facets
)
Code
# With multiple gghighlights
+
plot.line gghighlight(sectors %in% c("space", "radio"),
label_params = list(nudge_x = 2))
- For fun we could try library(geomtextpath)
- For better reliability we could add labels without using additional packages:
https://stackoverflow.com/questions/29357612/plot-labels-at-ends-of-lines (answered Jul 2, 2018 at 15:57)
Describing deal value
Summary stats
Code
<-
data.evolution.summary %>%
evolution group_by(sectors) %>%
summarize(meandeal = mean(dealsize),
mediandeal = median(dealsize),
max = max(dealsize),
min = min(dealsize),
IQR = IQR(dealsize),
NBdeals = n()) %>%
arrange(mediandeal)
data.evolution.summary
# A tibble: 6 × 7
sectors meandeal mediandeal max min IQR NBdeals
<fct> <dbl> <dbl> <int> <int> <dbl> <int>
1 air 404. 326. 981 5 473. 22
2 land 466. 433 964 4 452. 79
3 radio 468. 440. 978 11 553. 56
4 space 524. 535 993 2 542 61
5 ground 475. 536 974 1 489 34
6 sea 589. 654. 993 35 607. 52
Code
# Note : it is also possible to use a library such as Janitor for this.
Sum
Which is the most important sector in terms of deal size ?
Code
<-
plot.bar.vertical.sum %>%
evolution group_by(sectors) %>%
summarise(sumdeal = sum(dealsize)) %>%
ggplot(mapping = aes(x = fct_reorder(sectors, sumdeal), y = sumdeal)) +
geom_col() +
geom_text(aes(label = sumdeal),
vjust = 2,
size = 4,
color = "white") +
labs(x = NULL)
+
plot.bar.vertical.sum labs(title = "Sum of deals' value per sector",
subtitle = "In USD",
x = NULL)
Code
plot.bar.vertical.sum
Frequency
How are the deals distributed in terms of deal size ?
Code
## Distribution of deal size
<-
plot.hist.dealsize %>%
evolution ggplot(aes(x = dealsize)) +
geom_histogram(binwidth = 20)
plot.hist.dealsize
Density
Code
## Density of deal size per sector
<-
plot.density %>%
evolution ggplot(aes(x = dealsize, fill = sectors)) +
geom_density(alpha = 0.3) +
geom_rug(alpha = 0.6) + #lower part of graph
facet_wrap(~sectors)
plot.density
Boxplot
Code
<-
plot.boxplot %>%
evolution group_by(sectors) %>%
mutate(mediandeal = median(dealsize)) %>%
ggplot(mapping = aes(x = fct_reorder(sectors, mediandeal), y = dealsize)) +
geom_boxplot() +
geom_jitter(width = 0.1, alpha = 0.4) +
labs(title = "Deal size distribution per sector",
subtitle = "In USD",
x = "Sector") +
coord_flip()
plot.boxplot
Code
<-
plot.boxplot.color %>%
evolution group_by(sectors) %>%
mutate(mediandeal = median(dealsize)) %>%
ggplot(mapping = aes(x = fct_reorder(sectors, mediandeal), y = dealsize)) +
geom_boxplot(fill = "blue3", alpha = 0.1, color = "blue3") +
geom_jitter(aes(color = sectors), width = 0.1, alpha = 0.4) +
labs(title = "Deal size distribution per sector",
subtitle = "In USD",
x = "Sector") +
guides(color = guide_legend(override.aes = list(size = 5))) + #increase legend items size
coord_flip()
plot.boxplot.color
Code
ggsave("plot.boxplot.color.png")
Saving 7 x 5 in image
Binning
Let’s create 3 buckets of deal sizes and count the number of deals in each of them. The key function here is cut()
were we define breaks within a variable. The breaks are defined in a vector like breaks = c(0,100,500,999)
.
Code
# binning deal sizes
# add column named "bin" containing 3 types of deal sizes
<-
bin %>%
evolution mutate(bin = cut(evolution$dealsize,
breaks = c(0,100,500,999)))
#relabeled bin
<-
bin %>%
evolution mutate(bin = cut(evolution$dealsize,
breaks = c(0,100,500,999),
labels = c("small < 100", "100 < medium < 500 ", "big > 500")))
# plot bins
<-
plot.bar.binned %>%
bin ggplot(mapping = aes(x = bin)) +
geom_bar() +
geom_text(aes(label = after_stat(count)),
stat = "count",
vjust = 2.5,
colour = "white")
plot.bar.binned
Code
# plot proportions for each bin in %
%>%
bin group_by(bin) %>%
summarise(sumperbin = sum(dealsize)) %>%
mutate(tot = sum(sumperbin), prop = sumperbin/tot*100) %>% #proportion
ggplot(mapping = aes(x = bin, y = prop)) +
geom_col() +
geom_text(aes(label = round(prop)), vjust = 2.5, colour = "white")