R
Visualization

RAship Code

Acknowledgements

The following is an R project I developed to perform data analysis on historic economic data for my position as a Research Assistant for Prof. Nicolas Schmitt at Simon Fraser University. The version of this file published on Github will load redacted versions of proprietary data. All categorical variables are aliased, and all numeric fields are shuffled at random to preserve confidentiality. File names and headers are kept the same.

Note from the future: If I were to tackle this project today I would approach the task much differently: I would employ a more conrete naming convention for variables and files, use version control, and would overall use more functions for DRYer code. I include this project here because it served as an important learning opportunity for me, and demonstrates that I’ve solved a real-world assignment.

Data Processing

rm(list=ls()) #clear environment

#load libraries
library(purrr)
library(writexl)
library(openxlsx)
library(readxl)
library(tidyverse)
library(graphics)
library(ggthemes)
library(outliers)
library(ggplot2)
library(plyr)
library(dplyr)
library(ggforce)
library(ggrepel)
library(hexbin)
library(latex2exp)
library(rmarkdown)

Summarizing Function: Recategorized Data

nameG <- paste("Cat", seq(1,17), sep = ".")

recatData <- function(file_path) { #defining summary function

  data <- read_excel(file_path)
  data$Category <- as.factor(data$Category)
  data$`General Import Tax` <- sample(data$`General Import Tax`) #randomizing values
  
  data_summary <- data %>%
    group_by(Category) %>%
    summarise_at(vars(`General Import Tax`), list(mean=mean, sd=sd)) %>%
    as.data.frame()

  data_count <- data %>% dplyr::count(Category)
  data_merged <- left_join(data_summary, data_count, by= 'Category')
  levels(data_merged$Category) <- nameG
  return(data_merged[-c(17), ])

}

1902

oTwoDF <- recatData("D:\\Everything\\School\\RA position\\Tariff Schedules\\Analysis\\Sector Recategorization\\1902_Recat_V3.xlsx")
head(oTwoDF) # example of year = 1902
##   Category     mean        sd   n
## 1    Cat.1 32.24651  56.68118 129
## 2    Cat.2 51.88235 101.51224  17
## 3    Cat.3 27.00161  32.95000  31
## 4    Cat.4 67.33871 106.33315  31
## 5    Cat.5 24.38889  39.31782  18
## 6    Cat.6 39.35234  79.68471  64

1891

nineOneDF <- recatData("D:\\Everything\\School\\RA position\\Tariff Schedules\\Analysis\\Sector Recategorization\\1891_Recat_V2.xlsx")
head(nineOneDF) # example of year = 1891
##   Category     mean       sd  n
## 1    Cat.1 31.44384 55.13447 73
## 2    Cat.2 29.25000 39.12935 14
## 3    Cat.3 20.35000 33.41192 24
## 4    Cat.4 13.50769 15.91035 13
## 5    Cat.5 15.30714 24.42592  7
## 6    Cat.6 22.79333 32.87008 30

1887

eightSevDF <-recatData("D:\\Everything\\School\\RA position\\Tariff Schedules\\Analysis\\Sector Recategorization\\1891_Recat_V2.xlsx")
head(eightSevDF)
##   Category     mean       sd  n
## 1    Cat.1 32.47260 65.17196 73
## 2    Cat.2 30.52143 39.16315 14
## 3    Cat.3 25.12500 41.59255 24
## 4    Cat.4 27.08462 41.09249 13
## 5    Cat.5 34.92857 65.38376  7
## 6    Cat.6 30.63500 39.58481 30

1884

eightFourDF <- recatData("D:\\Everything\\School\\RA position\\Tariff Schedules\\Analysis\\Sector Recategorization\\1884_Recat_V2.xlsx")
head(eightFourDF)
##   Category     mean       sd  n
## 1    Cat.1 15.73359 21.27716 78
## 2    Cat.2 10.60417 10.90530 12
## 3    Cat.3 16.12500 22.99966 26
## 4    Cat.4 17.49167 18.87497 12
## 5    Cat.5 15.11111 14.17475  9
## 6    Cat.6 17.26167 24.61115 30

1851_accepted

fiveOneDF <- recatData("D:\\Everything\\School\\RA position\\Tariff Schedules\\Analysis\\Sector Recategorization\\1851_accepted_Recat_V3.xlsx")
head(fiveOneDF)
##   Category      mean        sd  n
## 1    Cat.1 4.3239130 4.6650800 46
## 2    Cat.2 4.2115385 4.4371610 13
## 3    Cat.3 0.9384615 0.8677446 13
## 4    Cat.4 6.7083333 5.4369492  6
## 5    Cat.5 4.7100000 6.1842946  5
## 6    Cat.6 5.2285714 5.8470477 14

1849

fourNine <- read_excel("D:\\Everything\\School\\RA position\\Tariff Schedules\\Analysis\\Sector Recategorization\\1849_Recat_V4.xlsx")
fourNine$Category<-as.factor(fourNine$Category)

#converting Batz to Franc
fourNine <- fourNine %>% 
  mutate(`General Import Tax` = Batz / 7)

fourNine$`General Import Tax` <- sample(fourNine$`General Import Tax`) #randomizing values

fourNineT <- fourNine %>%
  group_by(Category) %>%
  summarise_at(vars(`General Import Tax`), list(mean=mean, sd=sd)) %>% 
  as.data.frame()

fourNineN <- fourNine %>% dplyr::count(Category)
fourNineDF <- left_join(fourNineT, fourNineN, by= join_by(Category))


levels(fourNineDF$Category) <- nameG

fourNineDF <- fourNineDF[-c(17), ]

Creating Nominal Master Data Frame: 1902, 1891, 1887, 1884, 1851, and 1849

## Master DF... needs to be pivoted
{ppp <- left_join(oTwoDF, nineOneDF, by = join_by(Category));
Tppp <- left_join(ppp, eightSevDF, by = join_by(Category));
Cppp <- left_join(Tppp, eightFourDF, by = join_by(Category));
Gppp <- left_join(Cppp, fiveOneDF, by = join_by(Category));
Qppp<- left_join(Gppp, fourNineDF, by = join_by(Category));
}

## Pivoting Master DF long...
# Pivoting Mean
{mmm = subset(Qppp, select = c(Category, mean.x, mean.y, mean.x.x, mean.y.y, mean.x.x.x, mean.y.y.y));
mmm <- mmm %>%
  pivot_longer(cols=-Category, names_to = 'Year', names_prefix = 'mean.', values_to = "mean");
  
mmm$Year[mmm$Year == "x"] = "1902"; mmm$Year[mmm$Year == "y"] = "1891";
  mmm$Year[mmm$Year == "x.x"] = "1887"; mmm$Year[mmm$Year == "y.y"] = "1884"; mmm$Year[mmm$Year == "x.x.x"]="1851";
  mmm$Year[mmm$Year == "y.y.y"]="1849"
}


# Pivoting SD
{sdDF = subset(Qppp, select = c(Category, sd.x, sd.y, sd.x.x, sd.y.y, sd.x.x.x, sd.y.y.y));
sdDF <- sdDF %>%
  pivot_longer(cols=-Category, names_to = 'Year', names_prefix = 'sd.', values_to = "sd");
  
sdDF$Year[sdDF$Year == "x"] = "1902"; sdDF$Year[sdDF$Year == "y"] = "1891"; sdDF$Year[sdDF$Year == "sd"] = "1887" ;
  sdDF$Year[sdDF$Year == "x.x"] = "1887"; sdDF$Year[sdDF$Year == "y.y"] = "1884"; sdDF$Year[sdDF$Year == "x.x.x"] = "1851";
  sdDF$Year[sdDF$Year == "y.y.y"]="1849"
}

# Pivoting n
{nDF = subset(Qppp, select = c(Category, n.x, n.y, n.x.x, n.y.y, n.x.x.x, n.y.y.y));
nDF <- nDF %>%
  pivot_longer(cols=-Category, names_to = 'Year', names_prefix = 'n.', values_to = "n")
  
nDF$Year[nDF$Year == "x"] = "1902"; nDF$Year[nDF$Year == "y"] = "1891"; nDF$Year[nDF$Year == "n"] = "1887";
  nDF$Year[nDF$Year == "x.x"] = "1887"; nDF$Year[nDF$Year == "y.y"] = "1884"; nDF$Year[nDF$Year == "x.x.x"] = "1851";
  nDF$Year[nDF$Year == "y.y.y"] = "1849"
}


# joining long-pivoted DFs into final DF
fDFt <- left_join(mmm, sdDF, by= join_by(Category, Year))
fDF <- left_join(fDFt, nDF, by= join_by(Category, Year))
fDF[is.na(fDF)] <- 0 # Final Dataframe: "fDF"
# Optional code to write to xlsx file
#Write to excel file (warning: will overwrite file with same name. Make sure to change name and path) 
#write_xlsx(fDF, "D:\\Everything\\School\\RA position\\Tariff Schedules\\Analysis\\Sector Recategorization\\1902-1849 Master - Real_V2.xlsx")

Real Values

Accounting for a price index; Without accounting for index = “Nominal values”

nameG <- paste("Cat", seq(1,17), sep = ".")

realData <- function(df) { #defining summary function

  data <- df
  data$Category <- as.factor(data$Category)
  data$`General Import Tax` <- sample(data$`General Import Tax`) #randomizing values
  
  data_summary <- data %>%
    group_by(Category) %>%
    summarise_at(vars(`General Import Tax`), list(mean=mean, sd=sd)) %>%
    as.data.frame()

  data_count <- data %>% dplyr::count(Category)
  data_merged <- left_join(data_summary, data_count, by= 'Category')
  levels(data_merged$Category) <- nameG
  return(data_merged[-c(17), ])

}


nineOneR <- read_xlsx("D:\\Everything\\School\\RA position\\Tariff Schedules\\Analysis\\Sector Recategorization\\1891_Recat_V2.xlsx") %>% 
    mutate(`General Import Tax` = `General Import Tax` / 1.048)

eightSevR <- read_xlsx("D:\\Everything\\School\\RA position\\Tariff Schedules\\Analysis\\Sector Recategorization\\1887_Recat_V2.xlsx") %>% 
    mutate(`General Import Tax` = `General Import Tax` / .922)

eightFourR <- read_xlsx("D:\\Everything\\School\\RA position\\Tariff Schedules\\Analysis\\Sector Recategorization\\1884_Recat_V2.xlsx") %>% 
    mutate(`General Import Tax` = `General Import Tax` / 1.051)

fiveOneR <- read_xlsx("D:\\Everything\\School\\RA position\\Tariff Schedules\\Analysis\\Sector Recategorization\\1851_accepted_Recat_V3.xlsx") %>% 
    mutate(`General Import Tax` = `General Import Tax` / 0.864)

fourNineR <- read_xlsx("D:\\Everything\\School\\RA position\\Tariff Schedules\\Analysis\\Sector Recategorization\\1849_Recat_V4.xlsx") %>% 
  mutate(`General Import Tax` = Batz / 7)%>%
  mutate(`General Import Tax` = `General Import Tax` / 0.745)

RnineOneDF <- realData(nineOneR)
ReightSevDF <- realData(eightSevR)
ReightFourDF <- realData(eightFourR)
RfiveOneDF <- realData(fiveOneR)
RfourNineDF <- realData(fourNineR)

Creating Real Master Data Frame: 1902, 1891, 1887, 1884, 1851, and 1849

## Master DF... needs to be pivoted
{Rppp <- left_join(oTwoDF, RnineOneDF, by = join_by(Category));
TRppp <- left_join(Rppp, ReightSevDF, by = join_by(Category));
CRppp <- left_join(TRppp, ReightFourDF, by = join_by(Category));
GRppp <- left_join(CRppp, RfiveOneDF, by = join_by(Category));
QRppp<- left_join(GRppp, RfourNineDF, by = join_by(Category));
}

## Pivoting Master DF long...
# Pivoting Mean
{Rmmm = subset(QRppp, select = c(Category, mean.x, mean.y, mean.x.x, mean.y.y, mean.x.x.x, mean.y.y.y));
Rmmm <- Rmmm %>%
  pivot_longer(cols=-Category, names_to = 'Year', names_prefix = 'mean.', values_to = "mean");
  
Rmmm$Year[Rmmm$Year == "x"] = "1902"; Rmmm$Year[Rmmm$Year == "y"] = "1891";
  Rmmm$Year[Rmmm$Year == "x.x"] = "1887"; Rmmm$Year[Rmmm$Year == "y.y"] = "1884"; Rmmm$Year[Rmmm$Year == "x.x.x"]="1851";
  Rmmm$Year[Rmmm$Year == "y.y.y"]="1849"
}


# Pivoting SD
{RsdDF = subset(QRppp, select = c(Category, sd.x, sd.y, sd.x.x, sd.y.y, sd.x.x.x, sd.y.y.y));
RsdDF <- RsdDF %>%
  pivot_longer(cols=-Category, names_to = 'Year', names_prefix = 'sd.', values_to = "sd");
  
RsdDF$Year[RsdDF$Year == "x"] = "1902"; RsdDF$Year[RsdDF$Year == "y"] = "1891"; RsdDF$Year[RsdDF$Year == "sd"] = "1887" ;
  RsdDF$Year[RsdDF$Year == "x.x"] = "1887"; RsdDF$Year[RsdDF$Year == "y.y"] = "1884"; RsdDF$Year[RsdDF$Year == "x.x.x"] = "1851";
  RsdDF$Year[RsdDF$Year == "y.y.y"]="1849"
}

# Pivoting n
{RnDF = subset(QRppp, select = c(Category, n.x, n.y, n.x.x, n.y.y, n.x.x.x, n.y.y.y));
RnDF <- RnDF %>%
  pivot_longer(cols=-Category, names_to = 'Year', names_prefix = 'n.', values_to = "n")
  
RnDF$Year[RnDF$Year == "x"] = "1902"; RnDF$Year[RnDF$Year == "y"] = "1891"; RnDF$Year[RnDF$Year == "n"] = "1887";
  RnDF$Year[RnDF$Year == "x.x"] = "1887"; RnDF$Year[RnDF$Year == "y.y"] = "1884"; RnDF$Year[RnDF$Year == "x.x.x"] = "1851";
  RnDF$Year[RnDF$Year == "y.y.y"] = "1849"
}


# joining long-pivoted DFs into final DF
RfDFt <- left_join(Rmmm, RsdDF, by= join_by(Category, Year))
RealfDF <- left_join(RfDFt, RnDF, by= join_by(Category, Year))
RealfDF[is.na(RealfDF)] <- 0 # Final Dataframe: "fDF"
# Optional code to write to xlsx file
#Write to excel file (warning: will overwrite file with same name. Make sure to change name and path) 
#write_xlsx(fDF, "D:\\Everything\\School\\RA position\\Tariff Schedules\\Analysis\\Sector Recategorization\\1902-1849 Master - Real_V2.xlsx")

Graphs

The % change of the number of items in each category over consecutive years: \(\frac{(p_{it}-p_{it-1})}{p_{it-1}}\cdot 100\%\)

fDF <- fDF %>%
  arrange(Category, Year) %>%
  ungroup()

n_percentage_change_df <- fDF %>%
  group_by(Category) %>%
  mutate(n_Percentage_Change = (n - lag(n)) / lag(n) * 100) %>%
  mutate(n_Percentage_Change = ifelse(Year == 1849, NA, n_Percentage_Change)) %>% #setting all 1849 rows to NA to be removed later
  ungroup()

n_percentage_change_df <- na.omit(n_percentage_change_df) #removing all 1849 rows because they have no previous year to be compared to

# Histogram
ggplot(n_percentage_change_df , aes(x=Category, y=n_Percentage_Change, group=Year)) + 
  geom_bar(stat="identity", aes(fill=Year), size=2, position="dodge")+
  #geom_errorbar(aes(fill= Year, ymin= ifelse(mean-sd < 0, 0, mean-sd), ymax=mean+sd), position= "dodge") +
  geom_point(size=2, position= position_dodge(width=0.9))+
  #theme_economist()+
  scale_x_discrete(labels=reCatLabs)+
  labs(x="", y="% Change in 'n'")+
  ggtitle(TeX("Difference in Category(i) n-values over consecutive Years (t): $\\frac{(n_{it}-n_{it-1})}{n_{it-1}}\\cdot 100$"))+ # edit graph title
  geom_text(data = n_percentage_change_df, aes(label = round(n_Percentage_Change, 1)), 
            position = position_dodge(width = 0.9), vjust = -0.25)

The % change in Tariff SDs over consecutive years: [(SD_(it)-SD_(it-1))/SD_(it-1)]*100

(make sure: are you using REAL or NOMINAL tariffs?)

fDF <- fDF %>%
  arrange(Category, Year) %>%
  ungroup()

realSD_change_df <- fDF %>%
  group_by(Category) %>%
  mutate(realSD_Percentage_Change = (sd - lag(sd)) / lag(sd) * 100) %>%
  mutate(realSD_Percentage_Change = ifelse(Year == 1849, NA, realSD_Percentage_Change)) %>% #setting all 1849 rows to NA to be removed later
  ungroup()

realSD_change_df <- na.omit(realSD_change_df)

# Histogram
ggplot(realSD_change_df , aes(x=Category, y=realSD_Percentage_Change, group=Year)) + 
  geom_bar(stat="identity", aes(fill=Year), size=2, position="dodge")+
  #geom_errorbar(aes(fill= Year, ymin= ifelse(mean-sd < 0, 0, mean-sd), ymax=mean+sd), position= "dodge") +
  geom_point(size=2, position= position_dodge(width=0.9))+
  #theme_economist()+
  scale_x_discrete(labels=reCatLabs)+
  labs(x="", y="% Change in Nominal Tariff SD")+
  ggtitle(TeX("Difference in Nominal Tariff SDs (SD) arranged by Category (i) over consecutive Years (t): $\\frac{(SD_{it}-SD_{it-1})}{SD_{it-1}}\\cdot 100$"))+ # edit graph title
  geom_text(data = realSD_change_df, aes(label = round(realSD_Percentage_Change, 1)), 
            position = position_dodge(width = 0.9), vjust = -0.25)