World Economic Report

# import libraries
library(tidyverse)
library(readxl)
library(RColorBrewer)
library(viridis)
library(patchwork)
library(skimr)

Data Preparation and QA

# Read WorldBank.xlsx into a tibble and preview the first rows
world_bank <- read_xlsx("WorldBank.xlsx")
head(world_bank)

# A tibble: 6 × 15
  `Country Name` `Country Code` Region  IncomeGroup  Year Birth rate, crude (p…¹
  <chr>          <chr>          <chr>   <chr>       <dbl>                  <dbl>
1 Afghanistan    AFG            South … Low income   2018                   NA  
2 Afghanistan    AFG            South … Low income   2017                   33.2
3 Afghanistan    AFG            South … Low income   2016                   34.0
4 Afghanistan    AFG            South … Low income   2015                   34.8
5 Afghanistan    AFG            South … Low income   2014                   35.7
6 Afghanistan    AFG            South … Low income   2013                   36.7
# ℹ abbreviated name: ¹`Birth rate, crude (per 1,000 people)`
# ℹ 9 more variables: `Death rate, crude (per 1,000 people)` <dbl>,
#   `Electric power consumption (kWh per capita)` <dbl>, `GDP (USD)` <dbl>,
#   `GDP per capita (USD)` <dbl>,
#   `Individuals using the Internet (% of population)` <dbl>,
#   `Infant mortality rate (per 1,000 live births)` <dbl>,
#   `Life expectancy at birth (years)` <dbl>, …

# Inspect every column's name and type
dplyr::glimpse(world_bank)

Rows: 12,449
Columns: 15
$ `Country Name`                                                 <chr> "Afghan…
$ `Country Code`                                                 <chr> "AFG", …
$ Region                                                         <chr> "South …
$ IncomeGroup                                                    <chr> "Low in…
$ Year                                                           <dbl> 2018, 2…
$ `Birth rate, crude (per 1,000 people)`                         <dbl> NA, 33.…
$ `Death rate, crude (per 1,000 people)`                         <dbl> NA, 6.5…
$ `Electric power consumption (kWh per capita)`                  <dbl> NA, NA,…
$ `GDP (USD)`                                                    <dbl> 1936300…
$ `GDP per capita (USD)`                                         <dbl> 520.897…
$ `Individuals using the Internet (% of population)`             <dbl> NA, 13.…
$ `Infant mortality rate (per 1,000 live births)`                <dbl> 47.9, 4…
$ `Life expectancy at birth (years)`                             <dbl> NA, 64.…
$ `Population density (people per sq. km of land area)`          <dbl> 56.9378…
$ `Unemployment (% of total labor force) (modeled ILO estimate)` <dbl> 1.542, …

# Derive population (in millions) as total GDP divided by GDP per capita
world_bank <- mutate(
  world_bank,
  `Population (M)` = `GDP (USD)` / `GDP per capita (USD)` / 1e6
)
head(world_bank)

# A tibble: 6 × 16
  `Country Name` `Country Code` Region  IncomeGroup  Year Birth rate, crude (p…¹
  <chr>          <chr>          <chr>   <chr>       <dbl>                  <dbl>
1 Afghanistan    AFG            South … Low income   2018                   NA  
2 Afghanistan    AFG            South … Low income   2017                   33.2
3 Afghanistan    AFG            South … Low income   2016                   34.0
4 Afghanistan    AFG            South … Low income   2015                   34.8
5 Afghanistan    AFG            South … Low income   2014                   35.7
6 Afghanistan    AFG            South … Low income   2013                   36.7
# ℹ abbreviated name: ¹`Birth rate, crude (per 1,000 people)`
# ℹ 10 more variables: `Death rate, crude (per 1,000 people)` <dbl>,
#   `Electric power consumption (kWh per capita)` <dbl>, `GDP (USD)` <dbl>,
#   `GDP per capita (USD)` <dbl>,
#   `Individuals using the Internet (% of population)` <dbl>,
#   `Infant mortality rate (per 1,000 live births)` <dbl>,
#   `Life expectancy at birth (years)` <dbl>, …

# Percentage of missing values in each column of the data frame
(world_bank %>% is.na() %>% colSums()) / nrow(world_bank) * 100

                                                Country Name 
                                                    0.000000 
                                                Country Code 
                                                    0.000000 
                                                      Region 
                                                    0.000000 
                                                 IncomeGroup 
                                                    0.000000 
                                                        Year 
                                                    0.000000 
                        Birth rate, crude (per 1,000 people) 
                                                    8.105069 
                        Death rate, crude (per 1,000 people) 
                                                    8.297855 
                 Electric power consumption (kWh per capita) 
                                                   53.024339 
                                                   GDP (USD) 
                                                   23.062093 
                                        GDP per capita (USD) 
                                                   23.086192 
            Individuals using the Internet (% of population) 
                                                   59.322034 
               Infant mortality rate (per 1,000 live births) 
                                                   19.800787 
                            Life expectancy at birth (years) 
                                                   10.225721 
         Population density (people per sq. km of land area) 
                                                    4.851795 
Unemployment (% of total labor force) (modeled ILO estimate) 
                                                   58.165314 
                                              Population (M) 
                                                   23.086192

# Per-column summary statistics for the whole dataset
skimr::skim(world_bank)

Data summary
Name	world_bank
Number of rows	12449
Number of columns	16
_______________________
Column type frequency:
character	4
numeric	12
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
Country Name	1	4	30	211
Country Code	1	3	3	211
Region	1	10	26	7
IncomeGroup	1	10	20	5

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Year	0	1.00	1.98900e+03	1.703000e+01	1960.00	1.97400e+03	1.989000e+03	2.004000e+03	2.01800e+03	▇▇▇▇▇
Birth rate, crude (per 1,000 people)	1009	0.92	2.86400e+01	1.313000e+01	6.90	1.66000e+01	2.755000e+01	4.088000e+01	5.82300e+01	▇▇▆▇▂
Death rate, crude (per 1,000 people)	1033	0.92	1.05900e+01	5.490000e+00	1.13	6.86000e+00	9.200000e+00	1.269000e+01	5.44400e+01	▇▃▁▁▁
Electric power consumption (kWh per capita)	6601	0.47	3.17529e+03	4.467140e+03	0.00	3.90390e+02	1.541890e+03	4.313770e+03	5.47992e+04	▇▁▁▁▁
GDP (USD)	2871	0.77	1.70074e+11	8.979866e+11	8824450.00	1.39301e+09	7.275305e+09	4.857783e+10	2.05000e+13	▇▁▁▁▁
GDP per capita (USD)	2874	0.77	8.23181e+03	1.617354e+04	34.79	5.13150e+02	1.852810e+03	7.774570e+03	1.89171e+05	▇▁▁▁▁
Individuals using the Internet (% of population)	7385	0.41	2.33300e+01	2.832000e+01	0.00	5.90000e-01	8.410000e+00	4.130000e+01	1.00000e+02	▇▂▁▁▁
Infant mortality rate (per 1,000 live births)	2465	0.80	5.17000e+01	4.613000e+01	1.40	1.44700e+01	3.700000e+01	7.820000e+01	2.79400e+02	▇▃▁▁▁
Life expectancy at birth (years)	1273	0.90	6.40400e+01	1.149000e+01	18.91	5.59200e+01	6.728000e+01	7.269000e+01	8.54200e+01	▁▂▅▇▅
Population density (people per sq. km of land area)	604	0.95	3.18860e+02	1.593410e+03	0.10	1.97800e+01	6.401000e+01	1.448200e+02	2.13891e+04	▇▁▁▁▁
Unemployment (% of total labor force) (modeled ILO estimate)	7241	0.42	8.30000e+00	6.290000e+00	0.14	3.69000e+00	6.780000e+00	1.121000e+01	3.79400e+01	▇▅▁▁▁
Population (M)	2874	0.77	2.99600e+01	1.158000e+02	0.01	1.05000e+00	5.390000e+00	1.708000e+01	1.39190e+03	▇▁▁▁▁

# Keep only the observations recorded for 2014
world_bank_2014 <- filter(world_bank, Year == 2014)

head(world_bank_2014)

# A tibble: 6 × 16
  `Country Name` `Country Code` Region  IncomeGroup  Year Birth rate, crude (p…¹
  <chr>          <chr>          <chr>   <chr>       <dbl>                  <dbl>
1 Afghanistan    AFG            South … Low income   2014                   35.7
2 Albania        ALB            Europe… Upper midd…  2014                   12.3
3 Algeria        DZA            Middle… Upper midd…  2014                   25.5
4 American Samoa ASM            East A… Upper midd…  2014                   17.5
5 Andorra        AND            Europe… High incom…  2014                   NA  
6 Angola         AGO            Sub-Sa… Upper midd…  2014                   43.2
# ℹ abbreviated name: ¹`Birth rate, crude (per 1,000 people)`
# ℹ 10 more variables: `Death rate, crude (per 1,000 people)` <dbl>,
#   `Electric power consumption (kWh per capita)` <dbl>, `GDP (USD)` <dbl>,
#   `GDP per capita (USD)` <dbl>,
#   `Individuals using the Internet (% of population)` <dbl>,
#   `Infant mortality rate (per 1,000 live births)` <dbl>,
#   `Life expectancy at birth (years)` <dbl>, …

# Read HDI.csv into a tibble and preview the first rows
hdi <- read_csv(file = "HDI.csv")
head(hdi)

# A tibble: 6 × 1,008
  iso3  country hdicode region hdi_rank_2021 hdi_1990 hdi_1991 hdi_1992 hdi_1993
  <chr> <chr>   <chr>   <chr>          <dbl>    <dbl>    <dbl>    <dbl>    <dbl>
1 AFG   Afghan… Low     SA               180    0.273    0.279    0.287    0.297
2 AGO   Angola  Medium  SSA              148   NA       NA       NA       NA    
3 ALB   Albania High    ECA               67    0.647    0.629    0.614    0.617
4 AND   Andorra Very H… <NA>              40   NA       NA       NA       NA    
5 ARE   United… Very H… AS                26    0.728    0.739    0.742    0.748
6 ARG   Argent… Very H… LAC               47    0.723    0.73     0.735    0.739
# ℹ 999 more variables: hdi_1994 <dbl>, hdi_1995 <dbl>, hdi_1996 <dbl>,
#   hdi_1997 <dbl>, hdi_1998 <dbl>, hdi_1999 <dbl>, hdi_2000 <dbl>,
#   hdi_2001 <dbl>, hdi_2002 <dbl>, hdi_2003 <dbl>, hdi_2004 <dbl>,
#   hdi_2005 <dbl>, hdi_2006 <dbl>, hdi_2007 <dbl>, hdi_2008 <dbl>,
#   hdi_2009 <dbl>, hdi_2010 <dbl>, hdi_2011 <dbl>, hdi_2012 <dbl>,
#   hdi_2013 <dbl>, hdi_2014 <dbl>, hdi_2015 <dbl>, hdi_2016 <dbl>,
#   hdi_2017 <dbl>, hdi_2018 <dbl>, hdi_2019 <dbl>, hdi_2020 <dbl>, …

# Keep only the ISO country code and the 2014 HDI value, renaming the code
# column in the same step so it matches the key column of the World Bank data
hdi_2014 <- hdi %>%
  select(`Country Code` = iso3, hdi_2014)

# Join 2014 data with the HDI table on an explicit key, so the join column
# is stated in the code rather than guessed from shared column names.
# full_join() keeps country codes that appear in only one of the two tables;
# those rows carry NA in the columns coming from the other table.
wb_hdi_2014 <- full_join(world_bank_2014, hdi_2014, by = "Country Code")
head(wb_hdi_2014)

# A tibble: 6 × 17
  `Country Name` `Country Code` Region  IncomeGroup  Year Birth rate, crude (p…¹
  <chr>          <chr>          <chr>   <chr>       <dbl>                  <dbl>
1 Afghanistan    AFG            South … Low income   2014                   35.7
2 Albania        ALB            Europe… Upper midd…  2014                   12.3
3 Algeria        DZA            Middle… Upper midd…  2014                   25.5
4 American Samoa ASM            East A… Upper midd…  2014                   17.5
5 Andorra        AND            Europe… High incom…  2014                   NA  
6 Angola         AGO            Sub-Sa… Upper midd…  2014                   43.2
# ℹ abbreviated name: ¹`Birth rate, crude (per 1,000 people)`
# ℹ 11 more variables: `Death rate, crude (per 1,000 people)` <dbl>,
#   `Electric power consumption (kWh per capita)` <dbl>, `GDP (USD)` <dbl>,
#   `GDP per capita (USD)` <dbl>,
#   `Individuals using the Internet (% of population)` <dbl>,
#   `Infant mortality rate (per 1,000 live births)` <dbl>,
#   `Life expectancy at birth (years)` <dbl>, …

# Percentage of missing values in each column of the joined table
(wb_hdi_2014 %>% is.na() %>% colSums()) / nrow(wb_hdi_2014) * 100

                                                Country Name 
                                                    6.222222 
                                                Country Code 
                                                    0.000000 
                                                      Region 
                                                    6.222222 
                                                 IncomeGroup 
                                                    6.222222 
                                                        Year 
                                                    6.222222 
                        Birth rate, crude (per 1,000 people) 
                                                    9.333333 
                        Death rate, crude (per 1,000 people) 
                                                    9.333333 
                 Electric power consumption (kWh per capita) 
                                                   38.222222 
                                                   GDP (USD) 
                                                   10.666667 
                                        GDP per capita (USD) 
                                                   10.666667 
            Individuals using the Internet (% of population) 
                                                   11.555556 
               Infant mortality rate (per 1,000 live births) 
                                                   15.555556 
                            Life expectancy at birth (years) 
                                                   12.444444 
         Population density (people per sq. km of land area) 
                                                    7.555556 
Unemployment (% of total labor force) (modeled ILO estimate) 
                                                   17.333333 
                                              Population (M) 
                                                   10.666667 
                                                    hdi_2014 
                                                   10.666667

Prepare data for visualization

# Summarize GDP by Year and Region, expressed in trillions of USD.
# Missing GDP values are skipped by the sum (na.rm = TRUE).
# .groups = "drop" returns an ungrouped tibble; without it the result stays
# grouped by Year and summarise() emits a regrouping message.
gdp_pivot <- world_bank %>%
  group_by(Year, Region) %>%
  summarise(
    `GDP (Trillions)` = sum(`GDP (USD)`, na.rm = TRUE) / 1e12,
    .groups = "drop"
  )

head(gdp_pivot)

# A tibble: 6 × 3
   Year Region                     `GDP (Trillions)`
  <dbl> <chr>                                  <dbl>
1  1960 East Asia & Pacific                   0.146 
2  1960 Europe & Central Asia                 0.283 
3  1960 Latin America & Caribbean             0.0574
4  1960 Middle East & North Africa            0.0141
5  1960 North America                         0.543 
6  1960 South Asia                            0.0475

# Summarize Population by Year and Region, expressed in billions
# (`Population (M)` is in millions, hence the division by 1e3).
# Missing population values are skipped by the sum (na.rm = TRUE).
# .groups = "drop" returns an ungrouped tibble; without it the result stays
# grouped by Year and summarise() emits a regrouping message.
pop_pivot <- world_bank %>%
  group_by(Year, Region) %>%
  summarise(
    `Population (Billions)` = sum(`Population (M)`, na.rm = TRUE) / 1e3,
    .groups = "drop"
  )

head(pop_pivot)

# A tibble: 6 × 3
   Year Region                     `Population (Billions)`
  <dbl> <chr>                                        <dbl>
1  1960 East Asia & Pacific                         0.872 
2  1960 Europe & Central Asia                       0.281 
3  1960 Latin America & Caribbean                   0.187 
4  1960 Middle East & North Africa                  0.0593
5  1960 North America                               0.181 
6  1960 South Asia                                  0.573

# Keep only rows that have both a Region and a 2014 HDI value
wb_hdi_2014 <- filter(wb_hdi_2014, !is.na(Region), !is.na(hdi_2014))

# Mean 2014 HDI per Region, sorted from highest to lowest
wb_hdi_by_region <- wb_hdi_2014 %>%
  group_by(Region) %>%
  summarise(avg_hdi = mean(hdi_2014)) %>%
  arrange(desc(avg_hdi))

wb_hdi_by_region

# A tibble: 7 × 2
  Region                     avg_hdi
  <chr>                        <dbl>
1 North America                0.922
2 Europe & Central Asia        0.845
3 Latin America & Caribbean    0.740
4 Middle East & North Africa   0.737
5 East Asia & Pacific          0.730
6 South Asia                   0.611
7 Sub-Saharan Africa           0.528

Data Visualization

Create initial graphs for later integration into a final one-page infographic report

# Area chart of GDP over time, filled by Region
gdp_pivot %>%
  ggplot(aes(x = Year, y = `GDP (Trillions)`, fill = Region)) +
  geom_area() +
  labs(x = "Year", y = "GDP (Trillions)")

# Area chart of Population over time, filled by Region
ggplot(
  pop_pivot,
  aes(x = Year, y = `Population (Billions)`, fill = Region)
) +
  geom_area() +
  labs(x = "Year", y = "Population (Billions)")

# Rows usable in the bubble chart: life expectancy, GDP per capita and
# population must all be present
bubble_chart_data <- wb_hdi_2014 %>%
  filter(
    !is.na(`Life expectancy at birth (years)`) &
      !is.na(`GDP per capita (USD)`) &
      !is.na(`Population (M)`)
  )

# Smallest and largest population, used below as limits of the size scale
min_population <- bubble_chart_data %>% pull(`Population (M)`) %>% min()
max_population <- bubble_chart_data %>% pull(`Population (M)`) %>% max()

# Bubble chart: life expectancy (x) vs GDP per capita (y, log10 scale),
# point size mapped to population and colour mapped to Region
bubble_chart_data %>%
  ggplot(aes(
    x = `Life expectancy at birth (years)`,
    y = `GDP per capita (USD)`,
    size = `Population (M)`,
    color = Region
  )) +
  geom_point(alpha = 0.60) +
  scale_size_continuous(
    range = c(1, 15),
    limits = c(min_population, max_population),
    breaks = c(250, 500, 750, 1000, 1250)
  ) +
  scale_y_log10(labels = scales::comma)

# Bar chart of average HDI per Region, ordered by HDI and flipped so the
# regions run along the vertical axis
wb_hdi_by_region %>%
  ggplot(aes(x = reorder(Region, avg_hdi), y = avg_hdi)) +
  geom_bar(stat = "identity", fill = "steelblue3", alpha = 0.7) +
  coord_flip()

# Drop Iceland from the 2014 table
# NOTE(review): presumably excluded as an outlier for this chart — confirm
wb_hdi_2014 <- filter(wb_hdi_2014, `Country Name` != "Iceland")

# Electric power consumption vs. GDP per capita, coloured by 2014 HDI.
# The plot is stored in plot5 and is not printed at this point.
plot5 <- ggplot(
  wb_hdi_2014,
  aes(
    x = `Electric power consumption (kWh per capita)`,
    y = `GDP per capita (USD)`,
    color = hdi_2014
  )
) +
  geom_point(alpha = 0.6, size = 3) +
  # Zoom the visible window without discarding points outside it
  coord_cartesian(xlim = c(0, 20000), ylim = c(0, 125000))

Build final report

Combine visualizations into a single report-style graphic

# Define a custom theme: theme_minimal() as the base, Times New Roman text,
# no panel grid, and solid black axis lines and ticks.
# Line thickness is set with `linewidth`; the `size` argument of
# element_line() is deprecated for lines since ggplot2 3.4.0.
custom_theme <- theme_minimal() +
  theme(
    text = element_text(family = "Times New Roman"),
    axis.text = element_text(size = 12, colour = "#000000"),
    axis.title = element_text(size = 13),
    panel.grid = element_blank(),
    axis.line = element_line(linewidth = 0.5, color = "black"),
    axis.ticks = element_line(linewidth = 0.5, color = "black")
  )

# Apply the custom theme globally so every plot built afterwards uses it
theme_set(custom_theme)

# Panel 1: GDP by Region over time, with the legend drawn inside the panel
plot1 <- gdp_pivot %>%
  ggplot(aes(x = Year, y = `GDP (Trillions)`, fill = Region)) +
  geom_area(alpha = 0.8, colour = "white") +
  scale_fill_brewer(palette = "Set2") +
  scale_y_continuous(expand = c(0, 0)) +
  labs(x = NULL, y = "GDP (Trillions)") +
  theme(
    legend.title = element_blank(),
    legend.position = c(0.02, 1),
    legend.justification = c(0, 1), # Anchor the legend by its top-left corner
    legend.margin = margin(0, 0, 0, 0)
  )

plot1

# Panel 2: Population by Region over time; the legend is hidden here
plot2 <- pop_pivot %>%
  ggplot(aes(x = Year, y = `Population (Billions)`, fill = Region)) +
  geom_area(alpha = 0.8, colour = "white") +
  scale_fill_brewer(palette = "Set2") +
  scale_y_continuous(
    breaks = seq(0, 7, by = 1),
    expand = c(0, 0)
  ) +
  labs(x = NULL, y = "Population (Billions)") +
  theme(
    legend.position = "none",
    plot.margin = margin(t = 20, r = 80, b = 20, l = 20)
  )
plot2

# Panel 3: life expectancy vs GDP per capita (log10 y axis), point size
# mapped to population and colour mapped to Region
plot3 <- bubble_chart_data %>%
  ggplot(aes(
    x = `Life expectancy at birth (years)`,
    y = `GDP per capita (USD)`,
    size = `Population (M)`,
    color = Region
  )) +
  geom_point(alpha = 0.8) +
  scale_size_continuous(
    range = c(1, 15),
    limits = c(min_population, max_population),
    breaks = c(250, 500, 750, 1000, 1250)
  ) +
  scale_color_brewer(palette = "Set2") +
  scale_x_continuous(breaks = seq(50, 85, by = 5)) +
  scale_y_log10(labels = scales::comma) +
  theme(
    plot.margin = margin(t = 20, r = 80, b = 20, l = 10),
    legend.text = element_text(size = 10),
    legend.position = c(0, 1), # Place the legend at the top left
    legend.justification = c(0, 1),
    legend.direction = "horizontal" # Lay the legend keys out in a row
  ) +
  # Hide the colour legend; only the size legend remains
  guides(colour = "none")

plot3

# Panel 4: average HDI per Region, bars ordered by HDI. The x-axis labels
# and ticks are removed, so regions are told apart by fill colour only.
plot4 <- wb_hdi_by_region %>%
  ggplot(aes(
    x = reorder(Region, avg_hdi),
    y = avg_hdi,
    fill = Region
  )) +
  geom_col(alpha = 0.8) +
  scale_fill_brewer(palette = "Set2") +
  scale_y_continuous(
    breaks = seq(0, 0.8, by = 0.2),
    expand = c(0, 0)
  ) +
  labs(x = "Region", y = "Human Development Index (HDI)") +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.position = "none"
  )

plot4

# Panel 5: electric power consumption vs GDP per capita, coloured by 2014 HDI
plot5 <- wb_hdi_2014 %>%
  ggplot(aes(
    x = `Electric power consumption (kWh per capita)`,
    y = `GDP per capita (USD)`,
    color = hdi_2014
  )) +
  geom_point(alpha = 0.8, size = 3) +
  scale_color_viridis(
    option = "H",
    direction = -1,
    breaks = seq(0.4, 0.9, by = 0.1),
    # Draw the colour key as discrete legend entries with larger solid points
    guide = guide_legend(override.aes = list(size = 4, shape = 16))
  ) +
  scale_x_continuous(labels = scales::comma) +
  scale_y_continuous(labels = scales::comma) +
  # Zoom the visible window without discarding points outside it
  coord_cartesian(xlim = c(0, 20000), ylim = c(0, 125000)) +
  theme(plot.margin = margin(t = 20, r = 80, b = 80, l = 20))

plot5

 # Combine all plots using patchwork package
# Layout: plot1 and plot2 side by side on top, plot3 across the middle,
# plot4 and plot5 side by side at the bottom, with a centred bold title
combined_plot <- (plot1 | plot2) /
  plot3 /
  (plot4 | plot5) +
  plot_annotation(
    title = "World Economic Report (1960-2018)",
    theme = theme(
      plot.title = element_text(
        hjust = 0.5,
        size = 18,
        face = "bold",
        margin = margin(t = 7, r = 0, b = 20, l = 0)
      )
    )
  )

# Write the combined figure to a PNG with a white background
ggsave(
  filename = "econ_dev_report.png",
  plot = combined_plot,
  width = 10.5,
  height = 13.5,
  units = "in",
  bg = "white"
)

# Final edit with Canva