Repository: bkelly-lab/ReplicationCrisis Branch: master Commit: 2434da1281aa Files: 29 Total size: 430.1 KB Directory structure: gitextract_789kjfg8/ ├── .gitignore ├── Analysis/ │ ├── .gitignore │ ├── 0 - Functions.R │ ├── 1 - Prepare Data.R │ ├── 2 - Determine Clusters.R │ ├── 3 - Analysis.R │ ├── 4 - Output.R │ ├── Analysis.Rproj │ ├── Country Classification.xlsx │ ├── Factor Details.xlsx │ ├── README.md │ ├── country_stats.R │ ├── hxz_decomp.R │ └── main.R ├── GlobalFactors/ │ ├── CHANGELOG.md │ ├── Cluster Labels.csv │ ├── Country Classification.xlsx │ ├── Factor Details.xlsx │ ├── GlobalFactors.Rproj │ ├── MD │ ├── README.md │ ├── accounting_chars.sas │ ├── char_macros.sas │ ├── ind_identification.sas │ ├── main.sas │ ├── market_chars.sas │ ├── portfolios.R │ └── project_macros.sas └── README.md ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ GlobalFactors/.Rproj.user GlobalFactors/.Rhistory GlobalFactors/.RData GlobalFactors/.Ruserdata Analysis/Data/ Analysis/Objects/ Analysis/Figures/ Analysis/.Rproj.user Analysis/.Rhistory Analysis/.RData Analysis/.Ruserdata .Rproj.user ================================================ FILE: Analysis/.gitignore ================================================ .Rhistory .RData .Rproj.user Data/ Objects/ Figures/ Scribbles/ ================================================ FILE: Analysis/0 - Functions.R ================================================ eb_prepare <- function(data, scale_alphas, overlapping) { if (overlapping) { data <- data %>% group_by(region, characteristic) %>% mutate(obs = n()) %>% ungroup() %>% filter(obs == max(obs)) %>% select(-obs) } # Adjust for Beta data <- data %>% group_by(region, characteristic) %>% mutate( beta = cov(ret, mkt_vw_exc)/var(mkt_vw_exc), ret_neu = (ret - mkt_vw_exc * beta)*100, scaling_fct = sqrt(10^2/12) / 
sd(ret_neu), ret_neu_scaled = ret_neu * scaling_fct ) %>% ungroup() # Make Wide data <- data %>% mutate(name_wide = str_c(characteristic, "__", region)) if(scale_alphas) { data_wide <- data %>% select(name_wide, eom, ret_neu_scaled) %>% spread(key = name_wide, value = ret_neu_scaled) } else { data_wide <- data %>% select(name_wide, eom, ret_neu) %>% spread(key = name_wide, value = ret_neu) } # Return list( "long" = data, "wide" = data_wide ) } block_cluster_func <- function(cor_mat, cl_lables) { cor_long <- cor_mat %>% as_tibble(rownames = "char1") %>% gather(-char1, key = "char2", value = "cor") %>% separate(col = "char1", into = c("char1", "region1"), sep = "__") %>% separate(col = "char2", into = c("char2", "region2"), sep = "__") %>% left_join(cl_lables %>% select(characteristic, "hcl1" = hcl_label), by = c("char1"="characteristic")) %>% left_join(cl_lables %>% select(characteristic, "hcl2" = hcl_label), by = c("char2"="characteristic")) %>% rowwise() %>% mutate( hclreg1 = str_c(hcl1, "__", region1), hclreg2 = str_c(hcl2, "__", region2) ) %>% select(-hcl1, -hcl2) %>% mutate(hcl_pair = str_c(min(c(hclreg1, hclreg2)), "_x_", max(c(hclreg1, hclreg2)))) %>% unite(col = "name1", char1, region1, sep = "__", remove = T) %>% unite(col = "name2", char2, region2, sep = "__", remove = T) %>% ungroup() cluster_wise_cor <- cor_long %>% filter(name1 != name2) %>% # Exclude cor(factor_i, factor_i)=1 group_by(hcl_pair) %>% summarise( cor_avg = mean(cor) ) cluster_block_cor <- cor_long %>% left_join(cluster_wise_cor, by = "hcl_pair") %>% mutate(cor_avg = if_else(name1 == name2, 1, cor_avg)) %>% # ONLY IF SAME REGION!! 
select(name1, name2, cor_avg) %>% spread(key = name2, value = cor_avg) cbc_rows <- cluster_block_cor$name1 cluster_block_cor <- cluster_block_cor %>% select(-name1) %>% as.matrix() rownames(cluster_block_cor) <- cbc_rows return(cluster_block_cor) } # Empirical Bayes ---------------- emp_bayes <- function(data, cluster_labels, min_obs = 5 * 12, fix_alpha = F, bs_cov = F, cor_type = "sample", shrinkage = 0, layers = 3, bs_samples = 10000, seed, priors = NULL, sigma = NULL, plot = T) { # cor_type %in% c("raw", "block_2", "block_clusters") set.seed(seed) y_raw <- data$wide %>% select(-eom) %>% as.matrix() obs <- y_raw %>% apply(2, function(x) sum(!is.na(x))) y <- y_raw[, obs >= min_obs] n_fcts <- ncol(y) y_mean <- y %>% apply(2, mean, na.rm = T) if (is.null(sigma)) { if (bs_cov) { bs_full <- y %>% rsample::bootstraps(times = bs_samples) %>% mutate( res = splits %>% map(~.x %>% rsample::analysis() %>% apply(2, mean, na.rm = T) %>% as_tibble(rownames = "characteristic")) ) %>% select(-splits) %>% unnest(res) bs_full_cov <- bs_full %>% spread(key = characteristic, value = value) %>% select(-id) %>% cov() alpha_sd <- sqrt(diag(bs_full_cov)) alpha_cor <- solve(diag(alpha_sd)) %*% bs_full_cov %*% solve(diag(alpha_sd)) colnames(alpha_cor) <- names(alpha_sd) rownames(alpha_cor) <- names(alpha_sd) } else { y_sd <- y %>% apply(2, sd, na.rm=T) y_scor <- y %>% cor(use = "complete.obs") alpha_sd <- y_sd / sqrt(nrow(y)) alpha_cor <- y_scor } # Apply Shrinkage alpha_cor_shrunk <- alpha_cor * (1-shrinkage) + diag(n_fcts) * shrinkage # Correlation Block Adjustment if (cor_type == "sample") { alpha_cor_adj <- alpha_cor_shrunk } if (cor_type == "block_clusters") { alpha_cor_adj <- alpha_cor_shrunk %>% block_cluster_func(cl_lables = cluster_labels) } sigma <- diag(alpha_sd) %*% alpha_cor_adj %*% diag(alpha_sd) # This is really the equivalent of sigma/T from the paper colnames(sigma) <- colnames(alpha_cor_shrunk) print(str_c("Condition Number: Raw = ", round(kappa(alpha_cor_shrunk), 2), ", 
Adjusted = ", round(kappa(alpha_cor_adj), 2))) } else { alpha_sd <- sqrt(diag(sigma)) names(alpha_sd) <- colnames(sigma) } # Cluster Membership cm <- y_mean %>% as_tibble(rownames = "char_reg") %>% mutate( characteristic = str_split(char_reg, "__", simplify = T)[, 1] ) %>% left_join(cluster_labels, by = "characteristic") m <- cm %>% mutate(cm = 1) %>% select(char_reg, hcl_label, cm) %>% spread(key = hcl_label, value = cm) %>% select(-char_reg) %>% as.matrix() m[is.na(m)] <- 0 mm <- m %*% t(m) n_cl <- ncol(m) # Signal Membership z <- cm %>% mutate(sm = 1) %>% select(char_reg, characteristic, sm) %>% spread(key = characteristic, value = sm) %>% select(-char_reg) %>% as.matrix() z[is.na(z)] <- 0 zz <- z %*% t(z) n_s <- ncol(z) # Starting Values starting_values <- cm %>% group_by(hcl_label, characteristic) %>% summarise( n_s = n(), signal_mean = mean(value), signal_sd = sd(value) ) %>% group_by(hcl_label) %>% summarise( n_c = sum(n_s), cl_mean = mean(signal_mean), cl_sd = sd(signal_mean), cl_signal_within = mean(signal_sd) ) %>% ungroup() %>% mutate(cl_sd = if_else(n_c == 1, 0, cl_sd)) %>% summarise( alpha_mean = mean(cl_mean), sd_cl_mean = if_else(condition = fix_alpha, sqrt(sum((cl_mean^2) / (n() - 1))), sd(cl_mean)), sd_within_cl = mean(cl_sd), sd_within_signal = mean(cl_signal_within) ) if (fix_alpha) { sd_all <- sqrt(sum(y_mean^2) / (length(y_mean) - 1)) } else { sd_all <- sd(y_mean) } # Maximum Likelihood omega_func <- function(layers, tau_c, tau_s, tau_w) { if (layers == 1) { a_omega <- diag(n_fcts) * tau_c^2 # All alphas are drawn from same distribution } if (layers == 2) { a_omega <- diag(n_fcts) * tau_s^2 + mm * tau_c^2 # All cluster alphas are drawn from the same distribution, could be done with signals as well } if (layers == 3) { a_omega <- diag(n_fcts) * tau_w^2 + zz * tau_s^2 + mm * tau_c^2 # Cluster distrib., signal distrib. factor distrib. 
} return(a_omega) } # Choose between specifying prior parameters or finding them via EB if (is.null(priors)) { if (layers == 1) { start_list <- list( a = starting_values$alpha_mean, tc = sd_all) mle_func <- function(a, tc) { a_vec <- rep(a, n_fcts) a_omega <- omega_func(layers = layers, tau_c = tc, tau_s = NULL, tau_w = NULL) a_cov <- sigma + a_omega # / t_mat -(mvtnorm::dmvnorm(x = y_mean, mean = a_vec, sigma = a_cov, log = T)) # + dgamma(param[2], 2, 5, log = T)*sum(mm)/2 + dgamma(param[3], 2, 10, log = T)*140 } } if (layers == 2) { start_list <- list( a = starting_values$alpha_mean, tc = starting_values$sd_cl_mean, ts = starting_values$sd_within_cl) mle_func <- function(a, tc, ts) { a_vec <- rep(a, n_fcts) a_omega <- omega_func(layers = layers, tau_c = tc, tau_s = ts, tau_w = NULL) a_cov <- sigma + a_omega # / t_mat -(mvtnorm::dmvnorm(x = y_mean, mean = a_vec, sigma = a_cov, log = T)) } } if (layers == 3) { start_list <- list( a = starting_values$alpha_mean, tc = starting_values$sd_cl_mean, ts = starting_values$sd_within_cl, tw = starting_values$sd_within_signal) mle_func <- function(a, tc, ts, tw) { a_vec <- rep(a, n_fcts) a_omega <- omega_func(layers = layers, tau_c = tc, tau_s = ts, tau_w = tw) a_cov <- sigma + a_omega -(mvtnorm::dmvnorm(x = y_mean, mean = a_vec, sigma = a_cov, log = T)) } } # Maximum likelihood estimation for (k in 1:10) { initial_params <- start_list %>% lapply(function(x) max(x+rnorm(1, mean = 0, sd = 0.01), 0)) # Max is just to ensure that variances are not negative, never in use if (fix_alpha) { (hyper_pars <- stats4::mle(minuslogl = mle_func, start = initial_params, lower = c(-Inf, 0, 0, 0)[1:length(start_list)], fixed = list(a = 0))) } else { (hyper_pars <- stats4::mle(minuslogl = mle_func, start = initial_params, lower = c(-Inf, 0, 0, 0)[1:length(start_list)])) } if (hyper_pars@details$convergence==0) break } # Check convergence if (hyper_pars@details$convergence != 0) { warning("MLE step did not converge!!!") return(NULL) } mu <- 
hyper_pars@fullcoef["a"] tau_c <- hyper_pars@fullcoef["tc"] tau_s <- hyper_pars@fullcoef["ts"] tau_w <- hyper_pars@fullcoef["tw"] } else { mu <- priors$alpha tau_c <- priors$tau_c tau_s <- priors$tau_s tau_w <- priors$tau_w } theta <- omega_func(layers = layers, tau_c = tau_c, tau_s = tau_s, tau_w = tau_w) colnames(theta) <- rownames(theta) <- names(y_mean) print(paste("Condition Number Omega =", round(kappa(theta)))) # Signal Posteriors ------------------ if (layers == 3) { as_mean <- tau_w^2*t(z) %*% (theta + sigma) %*% (y_mean - rep(mu, n_fcts)) as_cov <- tau_w^2 * diag(n_s) - tau_w^4 * t(z) %*% (theta + sigma) %*% z as_sd <- sqrt(diag(as_cov)) colnames(as_mean) <- "post_mean" signal_summary <- as_mean %>% as_tibble(rownames = "characteristic") %>% left_join(as_sd %>% as_tibble(rownames = "characteristic") %>% rename("post_sd" = value), by = "characteristic") } # Factor Posteriors ------------------ ai_cov <- solve(solve(theta) + solve(sigma)) # t_mat * solve(sigma) ai_sd <- sqrt(diag(ai_cov)) ai_mean <- ai_cov %*% (solve(theta) %*% rep(mu, n_fcts) + solve(sigma) %*% y_mean) ## (t_mat * solve(sigma)) rownames(ai_mean) <- names(y_mean) colnames(ai_mean) <- "post_mean" names(ai_sd) <- names(y_mean) factor_summary <- ai_mean %>% as_tibble(rownames = "char_reg") %>% left_join(ai_sd %>% as_tibble(rownames = "char_reg") %>% rename("post_sd" = value), by = "char_reg") %>% left_join(y_mean %>% as_tibble(rownames = "char_reg") %>% rename("ols_est" = value), by = "char_reg") %>% left_join(alpha_sd %>% as_tibble(rownames = "char_reg") %>% rename("ols_se" = value), by = "char_reg") %>% mutate( characteristic = str_split(char_reg, "__", simplify = T)[, 1], # characteristic = char_reg %>% str_extract(".+[?=_{2}]") %>% str_remove("__") p025 = post_mean - 1.96 * post_sd, p975 = post_mean + 1.96 * post_sd ) %>% left_join(cluster_labels, by = "characteristic") %>% mutate( region = char_reg %>% str_extract(pattern = "(?<=_{2}).+") ) %>% select(char_reg, characteristic, hcl_label, 
region, everything())
  # Output: compare the crude (moment-based) starting values with the ML estimates
  if (is.null(priors)) {
    comparison <- tibble(
      estimate = c("alpha", "tau_c", "tau_s", "tau_w")[1:(layers + 1)],
      crude = drop(unlist(start_list)),
      ml_est = c(mu, tau_c, tau_s, tau_w)[1:(layers + 1)]
    )
    if (fix_alpha) {
      ml_se <- c(NA_real_, sqrt(diag(solve(hyper_pars@details$hessian))))
    } else {
      ml_se <- sqrt(diag(solve(hyper_pars@details$hessian)))
    }
    comparison$ml_se <- ml_se
    print(comparison)
  }
  if (plot == T) {
    list("factors" = factor_summary) %>% eb_plots()
  }
  ret_list <- list(
    "input" = data,
    # "clusters" = cluster_summary,
    "factors" = factor_summary,
    "factor_mean" = ai_mean,
    "factor_cov" = ai_cov,
    "theta" = theta,
    "sigma" = sigma
  )
  # NOTE(review): `sigma` is re-assigned earlier in this function whenever it
  # enters as NULL, so this condition can never be TRUE here and the raw/adjusted
  # correlation matrices are never returned -- confirm whether a flag captured on
  # entry (e.g. `sigma_supplied <- is.null(sigma)`) was intended.
  if (is.null(sigma)) {
    ret_list[["alpha_cor_raw"]] <- alpha_cor_shrunk
    ret_list[["alpha_cor_adj"]] <- alpha_cor_adj
  }
  if (is.null(priors)) {
    ret_list[["mle"]] <- comparison
  }
  if (layers == 3) {
    ret_list$signal <- signal_summary
  }
  return(ret_list)
}

# False-discovery / family-wise rates as a function of the t-cutoff.
#
# For each candidate cutoff (the observed t-values above `t_low`, except the
# largest), factors with t >= cutoff are declared significant. `fdr` is the
# average share of posterior draws whose sign disagrees with the point
# estimate among the significant factors; `fwr` is the share of draws with at
# least one such sign disagreement.
#
# Args:
#   t_low: lower bound on candidate t-cutoffs.
#   a_vec: vector of (posterior) alpha estimates.
#   a_cov: covariance matrix of `a_vec` (diagonal gives squared std. errors).
#   n_sim: number of posterior draws.
#   seed:  RNG seed for reproducibility.
# Returns: a tibble with one row per cutoff (t_cutoff, n_sig, fdr, fwr).
fdr_sim <- function(t_low, a_vec, a_cov, n_sim = 10000, seed = 1) {
  set.seed(seed)
  t_all <- a_vec / sqrt(diag(a_cov))
  t_steps <- sort(t_all[t_all > t_low])
  t_steps <- head(t_steps, -1) # Don't include the last t-value (no significant)
  # Simulated alphas
  sims <- mvtnorm::rmvnorm(n = n_sim, mean = a_vec, sigma = a_cov)
  # False Discovery as a Function of t-cutoff
  t_steps %>% lapply(function(t) {
    # Significant alphas under t-cutoff
    sig <- (t_all >= t)
    # drop = FALSE keeps a one-column matrix when a single factor is
    # significant (plain `sims[, sig]` would collapse to a vector and break
    # rowMeans). sweep compares each COLUMN against its own point-estimate
    # sign; the original `sign(sims[, sig]) != sign(a_vec[sig])` recycled the
    # sign vector down the rows (column-major), which is only correct when all
    # significant point estimates share the same sign.
    sig_draws <- sims[, sig, drop = FALSE]
    wrong_sign <- sweep(sign(sig_draws), 2, sign(a_vec[sig]), FUN = "!=")
    sims_fdr <- rowMeans(wrong_sign)
    # Output
    tibble(t_cutoff = t, n_sig = sum(sig), fdr = mean(sims_fdr), fwr = mean(sims_fdr > 0))
  }) %>% bind_rows()
}

fdr_fwer_rates <- function(t_cutoff, a_vec, a_cov, orig_sig = F, n_sim = 10000, seed=1) {
  set.seed(seed)
  # Simulate from full posterior
  sims <- mvtnorm::rmvnorm(n = n_sim, mean = a_vec, sigma = a_cov)
  if (orig_sig == T) {
    orig_factors <- char_info %>% filter(significance == T) %>% pull(characteristic) %>% str_c("__world")
    sims <- sims[, match(x = orig_factors, table = rownames(a_vec))]
    a_vec <- a_vec[orig_factors, ]
    a_cov <-
a_cov[orig_factors, orig_factors] } t_all <- a_vec / sqrt(diag(a_cov)) sig <- (t_all >= t_cutoff) sig_sims <- sims[, sig] false_discoveries <- sig_sims %>% apply(1, function(x) mean(x<0)) # FDR Distribution fdr_dist <- tibble( min = min(false_discoveries), p025 = quantile(false_discoveries, 0.025), p50 = quantile(false_discoveries, 0.5), p975 = quantile(false_discoveries, 0.975), max = max(false_discoveries), mean = mean(false_discoveries), sd = sd(false_discoveries) ) fwer_dist <- tibble( min = min(false_discoveries!=0), p025 = quantile(false_discoveries!=0, 0.025), p50 = quantile(false_discoveries!=0, 0.5), p975 = quantile(false_discoveries!=0, 0.975), max = max(false_discoveries!=0), mean = mean(false_discoveries!=0), sd = sd(false_discoveries!=0) ) # FWER Distribution # Output print(paste("Factors:", length(t_all), "- Sig:", sum(sig))) print(paste("Mean p-value:", round(mean(1-pnorm(t_all[sig])), 6))) fwer_fdr <- tibble(t_cutoff = t_cutoff, n_sig = sum(sig), fdr = mean(false_discoveries), fwer = mean(false_discoveries > 0)) list("fdr_dist"=fdr_dist, "fwer_dist" = fwer_dist, "fwer_fdr"=fwer_fdr) } # True Factors true_factors <- function(t_cutoff, a_vec, a_cov, orig_sig = T, n_sim = 10000, seed=1) { set.seed(seed) post_vol <- sqrt(diag(a_cov)) # Simulate using all factors sims <- mvtnorm::rmvnorm(n = n_sim, mean = a_vec, sigma = a_cov) # Decide which factors to look at if (orig_sig == T) { orig_factors <- char_info %>% filter(significance == T) %>% pull(characteristic) %>% str_c("__world") orig_factors_match <- match(x = orig_factors, table = rownames(a_vec)) sims <- sims[, orig_factors_match] post_vol <- post_vol[orig_factors_match] a_vec <- a_vec[orig_factors_match] } sims <- 1:ncol(sims) %>% sapply(function(i) sims[, i] / post_vol[i]) true_factors_dist <- sims %>% apply(1, function(x) mean(x > t_cutoff)) true_stat <- mean(a_vec/post_vol > t_cutoff) # From point 4 in 
https://influentialpoints.com/Training/bootstrap_confidence_intervals-principles-properties-assumptions.htm bc_ci <- function(stat, bootstraps, alpha=0.05) { # bias corrected bootstrap standard errors # estimate bias in std. norm deviates b <- qnorm((sum(bootstraps > stat)+sum(bootstraps==stat)/2)/length(bootstraps)) # Proportion of bootstrap samples above the "population" estimate. If unbiased, b=0.5. To handle discrete statistics, half of the sample at the population estimate is assumed to lie above z <- qnorm(c(alpha/2,1-alpha/2)) # Std. norm. limits p <- pnorm(z-2*b) # bias-correct & convert to proportions quantile(bootstraps,p=p) # Bias-corrected percentile lims. } bias_corrected <- bc_ci(stat = true_stat, bootstraps = true_factors_dist, alpha = 0.05) tibble( min = min(true_factors_dist), p025 = quantile(true_factors_dist, 0.025), p50 = quantile(true_factors_dist, 0.5), p975 = quantile(true_factors_dist, 0.975), max = max(true_factors_dist), mean = mean(true_factors_dist), sd = sd(true_factors_dist), p025_bc = bias_corrected[1], p975_bc = bias_corrected[2] ) } # Simulations for figure 2 sim_mt_control <- function(sim_settings) { # Cluster membership m <- matrix(0, nrow = sim_settings$n, ncol = sim_settings$clusters) # Cluster membership j <- 0 for (i in 1:sim_settings$clusters) { m[(j+1):(j + sim_settings$fct_pr_cl), i] <- 1 j <- j + sim_settings$fct_pr_cl } # Correlation Matrix corr_mat <- m %*% t(m) corr_mat[corr_mat == 0] <- sim_settings$corr_across corr_mat[corr_mat == 1] <- sim_settings$corr_within diag(corr_mat) <- 1 # Sigma sigma <- sim_settings$se^2 * corr_mat # Predefine variables alpha_0_vec <- rep(sim_settings$alpha_0, sim_settings$n) i_n <- diag(sim_settings$n) # Simulation search_grid <- expand.grid("tau_c" = sim_settings$tau_c, "tau_w" = sim_settings$tau_w) 1:nrow(search_grid) %>% lapply(function(i) { tau_c <- search_grid[i, "tau_c"] tau_w <- search_grid[i, "tau_w"] print(paste("Iteration", i, "out of", nrow(search_grid))) alpha_noise <- 
MASS::mvrnorm(n = sim_settings$n_sims, mu = rep(0, sim_settings$n), Sigma = sigma) # Preallocate alpha noise for efficiency s <- 1:sim_settings$n_sims %>% lapply(function(s) { omega <- m %*% t(m) * tau_c^2 + i_n * tau_w^2 alpha_c <- rnorm(sim_settings$clusters) * tau_c alpha_w <- rnorm(sim_settings$n) * tau_w alpha_true <- alpha_0_vec + m %*% alpha_c + alpha_w alpha_hat <- alpha_true + alpha_noise[s, ] post_var <- solve(solve(omega) + solve(sigma)) post_alpha <- post_var %*% (solve(omega) %*% alpha_0_vec + solve(sigma) %*% alpha_hat) eb <- tibble("type" = "eb", "true_alpha" = drop(alpha_true), "z" = drop(post_alpha / sqrt(diag(post_var))), "p" = 2 * pnorm(abs(z), lower.tail = F)) ols <- tibble("type" = "ols", "true_alpha" = drop(alpha_true), "z" = drop(alpha_hat / sqrt(diag(sigma))), "p" = 2 * pnorm(abs(z), lower.tail = F)) by <- tibble("type" = "by", "true_alpha" = drop(alpha_true), "z" = ols$z) by$p <- p.adjust(ols$p, method = "BY") rbind(eb, ols, by) %>% mutate(sig = z > 0 & p < 0.025) %>% group_by(type) %>% summarise( sim = s, n_disc = sum(sig), true_disc = sum(sign(true_alpha[sig == T]) == sign(z[sig == T])), false_disc = n_disc - true_disc ) }) %>% bind_rows() s %>% group_by(type) %>% mutate(fdp = if_else(n_disc == 0, 0, false_disc / n_disc)) %>% summarise( fdr = mean(fdp), n_disc = mean(n_disc), false_disc = mean(false_disc), true_disc = mean(true_disc), tau_c = tau_c, tau_w = tau_w, n = n() ) %>% mutate(true_disc_rate = true_disc / (sim_settings$n / 2)) }) %>% bind_rows() } multiple_testing <- function(eb_all, eb_world = NULL) { eb_all$factors %>% bind_rows(eb_world$factors) %>% mutate( t_ols = ols_est/ols_se, p_ols = 2*pnorm(abs(t_ols), lower.tail = F) ) %>% group_by(region) %>% mutate( n = n(), p_bonf = p_ols %>% p.adjust(method = "bonferroni"), p_holm = p_ols %>% p.adjust(method = "holm"), p_bh = p_ols %>% p.adjust(method = "BH"), p_by = p_ols %>% p.adjust(method = "BY") ) %>% select(n, region, char_reg, "estimate" = ols_est, "statistic" = t_ols, "se" = 
ols_se, starts_with("p_")) %>%
    gather(starts_with("p_"), key = "method", value = "p") %>%
    mutate(
      method = method %>% str_remove("^p_"),
      mt_adj = case_when(
        method == "ols" ~ "None",
        method == "bh" ~ "FDR",
        method == "by" ~ "FDR",
        method == "bonf" ~ "FWR",
        method == "holm" ~ "FWR"
      ),
      method = case_when(
        method == "ols" ~ "OLS",
        method == "bh" ~ "BH",
        method == "by" ~ "BY",
        method == "bonf" ~ "Bonferroni",
        method == "holm" ~ "Holm",
        TRUE ~ method
      )
    )
}

# Bootstrap Tangency Portfolio --
# Bootstrap tangency-portfolio weights. Each factor column is scaled to unit
# volatility, then weights come from regressing a constant on the returns
# (so weights are proportional to the mean-variance solution) and are
# normalised to sum to one. With shorting = FALSE the regression is replaced
# by a non-negative least-squares fit via glmnet (lambda = 0, lower limit 0).
bootstrap_tpf <- function(data, n_boots = 100, shorting = T, seed = 1) {
  set.seed(seed)
  boot_func <- if (shorting) {
    function(splits, ...) {
      scaled <- analysis(splits) %>%
        apply(2, function(col) col / sd(col)) %>%
        as.data.frame()
      lm(rep(1, nrow(scaled)) ~ -1 + ., data = scaled) %>%
        broom::tidy() %>%
        mutate(weight = estimate / sum(estimate)) %>%
        mutate(term = term %>% str_remove_all("`")) %>%
        select(term, weight)
    }
  } else {
    function(splits, ...) {
      scaled <- analysis(splits) %>% apply(2, function(col) col / sd(col))
      glmnet::glmnet(y = rep(1, nrow(scaled)), x = scaled %>% as.matrix(),
                     lambda = 0, lower.limits = 0, intercept = F) %>%
        broom::tidy(return_zeros = T) %>%
        filter(term != "(Intercept)") %>%
        mutate(weight = estimate / sum(estimate)) %>%
        select(term, weight)
    }
  }
  # Apparent = T --> Generate original data (full sample kept as an extra draw)
  data %>%
    bootstraps(times = n_boots, apparent = T) %>%
    mutate(coef = splits %>% map(.f = boot_func))
}

# Full tpf
tpf_cluster <- function(data, mkt_region, orig_sig, min_date, n_boots, shorting, seed) {
  if (orig_sig) {
    orig_sig_values <- T
  } else {
    orig_sig_values <- c(T, F)
  }
  market_ret <- regional_mkt_ret[region == mkt_region]
  cluster_pf <- data %>%
    left_join(cluster_labels, by = "characteristic") %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig %in% orig_sig_values) %>%
    group_by(hcl_label, eom) %>%
    summarise(
      ret = mean(ret)
    )
  tpf_data <- cluster_pf %>%
    filter(eom >= min_date) %>%
    spread(key = hcl_label, value = ret)
%>% left_join(market_ret %>% select(eom, market), by = c("eom")) %>% rename(Market = market) tpf_data %>% select(-eom) %>% bootstrap_tpf(n_boots = n_boots, shorting = shorting, seed = seed) %>% mutate(market_region = mkt_region) } # In-Sample / Out-of-Sample Functions prepare_is_oos <- function(input, min_obs, orig_group, ret_scaled, type, print=F) { # ret_scaled in ('none', "all", "is") & type in ('is_oos', 'is_post', 'is_pre') data <- input %>% select(characteristic, eom, ret, mkt_vw_exc) %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% filter(orig_sig %in% orig_group) %>% left_join(char_info %>% select(characteristic, sample_start, sample_end), by = "characteristic") %>% mutate( period = case_when( year(eom) >= sample_start & year(eom) <= sample_end ~ "is", type == "pre" & year(eom) < sample_start ~ "oos", type == "post" & year(eom) > sample_end ~ "oos", type == "pre_post" & (year(eom) < sample_start | year(eom) > sample_end) ~ "oos" ), ret = ret * 100, mkt_vw_exc = mkt_vw_exc * 100 ) %>% filter(!is.na(period)) # Exclude data data_excl <- data %>% group_by(characteristic) %>% mutate(n_is = sum(period == "is"), n_oos = sum(period == "oos")) %>% filter(n_is >= min_obs & n_oos >= min_obs) if (ret_scaled == "none") { data_adj <- data_excl %>% mutate(ret_adj = ret) } if (ret_scaled == "all") { data_excl <- data_excl %>% group_by(characteristic, period) %>% mutate( ret_neu = ret - cov(ret, mkt_vw_exc)/var(mkt_vw_exc) * mkt_vw_exc, ret_adj = ret * (10/sqrt(12))/sd(ret_neu)) %>% select(-ret_neu) %>% ungroup() } if (ret_scaled == "is") { is_vol <- data_excl %>% filter(period == "is") %>% group_by(characteristic) %>% mutate(ret_neu = ret - cov(ret, mkt_vw_exc)/var(mkt_vw_exc) * mkt_vw_exc) %>% summarise( is_sd = sd(ret_neu) ) data_excl <- data_excl %>% left_join(is_vol, by = c("region", "characteristic")) %>% group_by(characteristic, period) %>% mutate(ret_adj = ret * (10/sqrt(12))/is_sd) %>% ungroup() %>% 
select(-is_sd)
  }
  full <- data %>% summarise(fct_all = uniqueN(characteristic))
  excl <- data_excl %>% summarise(fct_excl = uniqueN(characteristic))
  if (print) {
    print(tibble("type"=type, full, excl))
  }
  return(data_excl)
}

# Economic Benefit of more Power
# For originally-significant factors, at each estimation date on/after the end
# of the original sample, classify each factor as significant under the
# empirical-Bayes posterior (p025 > 0) and under BY-adjusted OLS p-values,
# then return the average return of the selected factors per region, selection
# rule ("type") and month. Also prints a diagnostic plot of rejection rates.
trading_on_significance <- function(posterior_is) {
  rejection_panel <- posterior_is %>%
    left_join(char_info %>% select(characteristic, significance, sample_end),
              by = "characteristic") %>%
    filter(significance == 1 & est_date >= sample_end) %>%
    group_by(est_date) %>%
    mutate(
      ols_p = pnorm(abs(ols_est / ols_se), lower.tail = F) * 2,
      by_p = p.adjust(ols_p, method = "BY")
    )
  # Diagnostic plot: rejection rates over time for EB, OLS and BY
  rejection_rates <- rejection_panel %>%
    summarise(
      rr_eb = mean(p025 > 0),
      rr_ols = mean(ols_p <= 0.05 & ols_est > 0),
      rr_by = mean(by_p <= 0.05 & ols_est > 0)
    ) %>%
    gather(rr_eb, rr_ols, rr_by, key = "type", value = "rr")
  print(rejection_rates %>% ggplot(aes(est_date, rr, colour = type)) + geom_point() + geom_line())
  # Positions are taken in the calendar year AFTER the estimation date
  positions <- rejection_panel %>%
    ungroup() %>%
    mutate(
      position_year = year(est_date) + 1,
      eb_sig = (p025 > 0),
      by_sig = (by_p <= 0.05 & ols_est > 0)
    ) %>%
    select(position_year, characteristic, eb_sig, by_sig)
  candidate_factors <- regional_pfs %>%
    mutate(year = year(eom)) %>%
    left_join(positions, by = c("characteristic", "year" = "position_year")) %>%
    mutate(marg_sig = (eb_sig == T & by_sig == F)) %>% # significant under EB but missed by BY
    gather(marg_sig, eb_sig, by_sig, key = "type", value = "significant")
  # Average return of the selected factors per region/rule/month
  candidate_factors %>%
    filter(significant == T) %>%
    group_by(region, type, significant, eom) %>%
    summarise(
      n = n(),
      ret = mean(ret),
      mkt = mean(mkt_vw_exc)
    )
}

# Simulation according to specification for Harvey et al (2016)
harvey_et_al_sim <- function(sim_settings, seed) {
  set.seed(seed)
  # Cluster membership
  m <- matrix(0, nrow = sim_settings$n, ncol = sim_settings$cl) # Cluster membership
  j <- 0
  for (i in 1:sim_settings$cl) {
    m[(j+1):(j + sim_settings$fct_pr_cl), i] <- 1
    j <- j + sim_settings$fct_pr_cl
  }
  mm <- m %*% t(m)
  # Correlation Matrix
  corr_mat <- mm
  corr_mat[corr_mat == 0] <- sim_settings$corr_across
  corr_mat[corr_mat ==
1] <- sim_settings$corr_within diag(corr_mat) <- 1 # Average Correlation (Should be close to zero) mean(corr_mat[lower.tri(corr_mat)]) # Sigma sigma <- sim_settings$se^2 * corr_mat # Predefine variables alpha_0_vec <- rep(sim_settings$alpha_0, sim_settings$n) i_n <- diag(sim_settings$n) # Simulation sim_settings$tau_ws %>% lapply(function(tau_w) { start <- proc.time() alpha_noise <- MASS::mvrnorm(n = sim_settings$n_sims, mu = rep(0, sim_settings$n), Sigma = sigma) # Preallocate alpha noise for efficiency tau_sim <- 1:sim_settings$n_sims %>% sapply(simplify = F, USE.NAMES = T, function(s) { print(paste("Tau_w:", tau_w, "- Simulation", s, "out of", sim_settings$n_sims)) # Simulate Alphas alpha_c <- c(rep(sim_settings$ret, times = sim_settings$cl_true), rep(0, times = (sim_settings$cl - sim_settings$cl_true))) alpha_w <- c(rnorm(sim_settings$n_true) * tau_w, rep(0, sim_settings$n - sim_settings$n_true)) alpha_true <- alpha_0_vec + m %*% alpha_c + alpha_w alpha_hat <- as.vector(alpha_true + alpha_noise[s, ]) # MLE Function mle_func <- function(a, tc, tw) { a_vec <- rep(a, sim_settings$n) a_omega <- i_n * tw^2 + mm * tc^2 a_cov <- sigma + a_omega # / t_mat -(mvtnorm::dmvnorm(x = alpha_hat, mean = a_vec, sigma = a_cov, log = T)) } # Starting Values starting_values <- tibble(a = alpha_hat, cl = rep(1:sim_settings$cl, each = sim_settings$fct_pr_cl)) %>% group_by(cl) %>% summarise( cl_mean = mean(a), cl_sd = sd(a) ) %>% summarize( crude_a0 = mean(cl_mean), crude_tc = if_else(sim_settings$fix_alpha, sqrt(sum((cl_mean^2) / (n() - 1))), sd(cl_mean)), crude_tw = mean(cl_sd) ) start_list <- list( a = starting_values$crude_a0, tc = starting_values$crude_tc, tw = starting_values$crude_tw) # Estimate Parameters if (sim_settings$fix_alpha) { (hyper_pars <- stats4::mle(minuslogl = mle_func, start = start_list, lower = c(-Inf, 0, 0), fixed = list(a = 0))) } else { (hyper_pars <- stats4::mle(minuslogl = mle_func, start = start_list, lower = c(-Inf, 0, 0))) } # Check convergence if 
(hyper_pars@details$convergence != 0) {
        warning("MLE step did not converge!!!")
        return(NULL)
      }
      mu <- hyper_pars@fullcoef["a"]
      tc <- hyper_pars@fullcoef["tc"]
      tw <- hyper_pars@fullcoef["tw"]
      mle <- tibble(
        s = rep(s, 3),
        coef = c("a", "tc", "tw"),
        mle = c(mu, tc, tw),
        crude = c(start_list$a, start_list$tc, start_list$tw)
      )
      print(mle)
      # Specify Posterior
      omega <- i_n * tw^2 + mm * tc^2
      post_cov <- solve(solve(omega) + solve(sigma))
      post_alpha <- post_cov %*% (solve(omega) %*% alpha_0_vec + solve(sigma) %*% alpha_hat)
      list("alpha_true" = alpha_true, "alpha_hat" = alpha_hat, "post_alpha" = post_alpha,
           "post_cov" = post_cov, "mle" = mle)
    })
    print(proc.time() - start) # 5 iterations took 232.13/60 = 4 minutes
    return(tau_sim)
  })
}

# Single Factor TP --
# Sharpe ratio of portfolio weights `w` over the return columns of `data`
# (mean over volatility at the data's native frequency, not annualised).
sr_func <- function(data, w) {
  mu <- colMeans(data)
  v <- cov(data)
  drop(w %*% mu / sqrt(t(w) %*% v %*% w))
}

# Enhanced-portfolio-optimization tangency weights: shrink the correlation
# matrix towards the identity by `s` (s = 0 is plain mean-variance, s = 1
# ignores correlations entirely), rebuild the covariance from the per-column
# volatilities, and normalise the mean-variance weights to sum to one.
epo_tpf <- function(data, s) {
  vols <- apply(data, 2, sd)
  corr <- cor(data)
  mu <- colMeans(data)
  corr_shrunk <- diag(length(vols)) * s + corr * (1 - s)
  cov_shrunk <- diag(vols) %*% corr_shrunk %*% diag(vols)
  raw_w <- solve(cov_shrunk) %*% mu
  drop(raw_w / sum(raw_w))
}

# Prepare data for
prepare_tpf_factors <- function(region, orig_sig_values, start, scale) {
  mkt <- regional_mkt_ret %>% filter(region == !!region) %>% select(-region)
  tpf_factors <- eb_est[[region]]$input$long %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig %in% orig_sig_values) %>%
    group_by(eom) %>%
    mutate(n = n()) %>%
    ungroup() %>%
    filter(eom >= start) %>%
    select(characteristic, eom, n, ret)
  # Ensure that all factors have data
  missing <- tpf_factors %>% filter(n != max(n)) %>% select(eom, n) %>% distinct()
  if (nrow(missing) > 0) {
    warning("UNBALANCED PANEL - SOME FACTORS ARE MISSING DATA!")
  }
  tpf_factors <- tpf_factors %>% filter(n == max(n)) %>% select(-n)
  # 1.
Overall TPF tpf_factors <- tpf_factors %>% bind_rows(mkt %>% rename("ret"="market") %>% mutate(characteristic = "market") %>% filter(eom %in% tpf_factors$eom)) if (scale) { tpf_factors <- tpf_factors %>% group_by(characteristic) %>% mutate(ret = ret * (0.1 / sqrt(12)) / sd(ret)) } tpf_factors_wide <- tpf_factors %>% pivot_wider(names_from = characteristic, values_from = ret) %>% select(-eom) # Output list("long"=tpf_factors, "wide" = tpf_factors_wide) } # Optimal Shrinkage optimal_shrinkage <- function(data, k, epo_range = seq(0, 1, 0.1)) { finance_kfold <- function(dates, k, horizon) { # Helper Function helper_eom_seq <- function(ends, horizon) { all <- ends %>% lapply(function(d) seq.Date(from = ceiling_date(d, unit = "month") - months(horizon - 1), to = ceiling_date(d, unit = "month"), by = "1 month") - 1) all_unique <- do.call(c, all) %>% unique() } # Split Dates into k groups date_vec <- dates %>% unique() %>% sort() n <- length(date_vec) n_fold <- floor(length(date_vec) / k) split <- split(date_vec, cut(seq_along(date_vec), k, labels = FALSE)) # Create train/test split 1:k %>% lapply(function(i) { test_ends <- split[[i]] test_dates <- test_ends %>% helper_eom_seq(horizon = horizon) train_ends <- do.call(c, split[-i]) train_dates <- train_ends %>% helper_eom_seq(horizon = horizon) # Avoid Data Leakage train_dates <- train_dates[!(train_dates %in% test_dates)] tibble(fold = i, train = list(train_dates), test = list(test_dates)) }) %>% bind_rows() } date_split <- unique(data$eom) %>% finance_kfold(k = k, horizon = 1) cross_val <- 1:k %>% lapply(function(i) { test_dates <- date_split$test[[i]] test <- data %>% filter(eom %in% test_dates) %>% pivot_wider(names_from = characteristic, values_from = ret) test_eom <- test$eom test <- test %>% select(-eom) train_dates <- date_split$train[[i]] train <- data %>% filter(eom %in% train_dates) %>% pivot_wider(names_from = characteristic, values_from = ret) %>% select(-eom) # Create Weights nonneg_w <- glmnet::glmnet(y = 
rep(1, nrow(train)), x = train %>% as.matrix(), lambda = 0, lower.limits = 0, intercept = F) %>% tidy(return_zeros = T) %>% filter(term != "(Intercept)") %>% mutate( type = "Non-Negative", weight = estimate / sum(estimate) ) %>% select(type, weight) epo_w <- epo_range %>% lapply(function(s) { epo <- train %>% epo_tpf(s = s) tibble(type = paste0("EPO s=", s), weight = epo) }) # OOS Performance c(list(nonneg_w), epo_w) %>% lapply(function(x) { w <- x$weight tibble( type = unique(x$type), eom = test_eom, ret = drop(as.matrix(test) %*% w) ) }) %>% bind_rows() %>% mutate( i = i, test_range = paste0(year(min(test_dates)), "-", year(max(test_dates))) ) }) %>% bind_rows() cross_val_summary <- cross_val %>% group_by(type) %>% summarise( ann_ret = mean(ret), sd = sd(ret), sr = ann_ret/sd ) %>% mutate( type_overall = if_else(str_detect(type, "EPO"), "EPO", "Non-Negative"), type = if_else(type == "EPO s=0", "Unconstrained", type), type = type %>% factor(levels = c("Non-Negative", "Unconstrained", paste0("EPO s=", seq(0.1, 1, 0.1)))) ) print(cross_val_summary %>% ggplot(aes(type, sr, group=type_overall)) + geom_point() + geom_path() + labs(colour = "Test Period:", y = "Monthly OOS SR of TPF") + theme(legend.position = "top", axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, vjust=0.5))) opt_s_summary <- cross_val_summary %>% filter(type_overall == "EPO" & sr == max(sr)) %>% mutate(s = type %>% str_remove("EPO s=") %>% as.numeric()) print(paste0("Highest OOS SR: ", opt_s_summary %>% pull(sr) %>% round(2), ", Standard MVO: ", cross_val_summary %>% filter(type == "Unconstrained") %>% pull(sr) %>% round(2))) # Optimal shrinkage opt_s_summary %>% pull(s) } # Table Functions -------------------------------------------------------- table_is_oos_ols <- function(is_oos_regs, is_post_regs) { oos_us <- lm(oos ~ is, data = is_oos_regs %>% filter(region == "us")) post_us <- lm(post ~ is, data = is_post_regs %>% filter(region == "us")) stargazer::stargazer( post_us, 
    oos_us,
    title = "OLS - Biased: $\\hat{\\alpha}_\\text{Out-of-Sample} = \\gamma_0 + \\gamma_1\\times\\hat{\\alpha}_\\text{In-Sample}$",
    out.header=T, no.space=T, digits=3, type='latex', single.row=F, align = T, notes.align = "l",
    omit.stat = c("adj.rsq", "f", "ser"),
    covariate.labels = c("Intercept", "$\\hat{\\alpha}_\\text{IS}$"),
    dep.var.caption = "Dependent Variable:",
    dep.var.labels = c("$\\hat{\\alpha}_\\text{Post IS}$", "$\\hat{\\alpha}_\\text{Pre \\& Post IS}$"),
    notes.append=T, report = "vc*t", intercept.bottom = F)
}

# table_is_oos_nls: stargazer table for the two NLS fits (post-sample and
# OOS). stargazer cannot render nls objects directly, so a throwaway lm with
# matching term names is built and its coefficients/SEs/t/p are overridden
# with the NLS results via stargazer's coef=/se=/t=/p= arguments.
table_is_oos_nls <- function(nls_post, nls_oos) {
  info <- list(
    "post" = list(
      "dep" = "post",
      "fit" = nls_post
    ),
    "oos" = list(
      "dep" = "oos",
      "fit" = nls_oos
    )
  )
  op <- info %>% sapply(simplify = F, USE.NAMES = T, function(x) {
    # Create Fake Linear Model (placeholder for stargazer layout only)
    fake_data <- tibble(y = rnorm(10), k0 = rnorm(10), kh = rnorm(10))
    colnames(fake_data) <- c(x$dep, "k0", "kh")
    lm_string <- paste0(x$dep, "~k0 + kh -1")
    fake_lm = lm(lm_string, data = fake_data)
    fake_x = c("k0", "kh")
    # Generate various parts for output from the real NLS fit
    sum_xx = summary(x$fit$nls_fit)
    mat_xx = sum_xx$coefficients[1:2, ]
    colnames(mat_xx) = c("coef","se", "t", "p")
    indVarNames = rownames(mat_xx)
    # Generate coefficients, se, t-stat and p values
    df_xx = as.data.frame(mat_xx)
    vCoef = df_xx$coef; names(vCoef)=fake_x
    vSE = df_xx$se; names(vSE)=fake_x
    vT = df_xx$t; names(vT)=fake_x
    vP = df_xx$p; names(vP)=fake_x
    # NOTE(review): formulaTxt/nParameters/nDF/convTolerance are computed but
    # not returned — kept for fidelity.
    formulaTxt = sum_xx$formula
    nParameters = sum_xx$df[1]
    nDF = sum_xx$df[2]
    obs <- length(x$fit$nls_fit$m$resid())
    n_fcts <- uniqueN(x$fit$nls_data$c)
    rss = round(sum_xx$sigma, 3)
    convTolerance = x$fit$nls_fit$m$conv()
    list("lm"=fake_lm, "coef" = vCoef, "se" = vSE, "t" = vT, "p" = vP, "rss" = rss, "obs" = obs, "n_fcts" = n_fcts)
  })
  # Determine order of the two columns in the table
  y1 <- "post"
  y2 <- "oos"
  # Aesthetics
  vTitle = "NLS - Unbiased: $R_{i,t} = \\alpha_i + (\\kappa_0 + \\kappa_h \\times \\alpha_i)\\times 1_\\text{\\{Out-of-Sample\\}}$"
  vType = "latex"
  # v_col_label = c("USA", "Developed", "Emerging")
  lines_obs <- c("Observations", sprintf("\\multicolumn{1}{r}{%s}", prettyNum(op[[y1]]$obs, big.mark = ",")), sprintf("\\multicolumn{1}{r}{%s}", prettyNum(op[[y2]]$obs, big.mark = ",")))
  lines_fcts <- c("Factors", sprintf("\\multicolumn{1}{r}{%s}", op[[y1]]$n_fcts), sprintf("\\multicolumn{1}{r}{%s}", op[[y2]]$n_fcts))
  dep_var_options <- list("post" = "Post IS", "oos" = "Pre \\& Post IS")
  dep_var_lbls <- c(dep_var_options[[y1]], dep_var_options[[y2]])
  # Output: fake lms carry the layout; real NLS stats injected below
  stargazer::stargazer(
    op[[y1]]$lm, op[[y2]]$lm,
    title = vTitle, out.header=T, no.space=T, digits=3, type=vType, single.row=F, align = T, notes.align = "l",
    omit.stat = c("rsq","adj.rsq", "f", "n", "ser"),
    covariate.labels = c("$\\kappa_0$", "$\\kappa_h$"),
    dep.var.labels.include = T,
    dep.var.caption = "OOS Period:",
    dep.var.labels = dep_var_lbls,
    add.lines=list(lines_fcts, lines_obs),
    report = "vc*t", intercept.bottom = F,
    # notes=vNotes,
    notes.append=T,
    coef=list(op[[y1]]$coef, op[[y2]]$coef),
    se=list(op[[y1]]$se, op[[y2]]$se),
    t=list(op[[y1]]$t, op[[y2]]$t),
    p=list(op[[y1]]$p, op[[y2]]$p)
  )
}

# table_taus: LaTeX table of the maximum-likelihood tau parameters for each
# sample/region. Reads from the global `eb_est` list — defined elsewhere.
table_taus <- function(){
  taus <- list(
    list("USA", "us"),
    list("Developed", "developed"),
    list("Emerging", "emerging"),
    list("USA, Developed & Emerging", "all"),
    list("World", "world"),
    list("World ex. US", "world_ex_us"),
    list("USA - Mega", "us_mega"),
    list("USA - Large", "us_large"),
    list("USA - Small", "us_small"),
    list("USA - Micro", "us_micro"),
    list("USA - Nano", "us_nano")
  ) %>% lapply(function(x) {
    eb_est[[x[[2]]]]$mle %>%
      select(estimate, ml_est) %>%
      spread(key = estimate, value = ml_est) %>%
      mutate(sample = x[[1]])
  }) %>% bind_rows() %>% select(sample, tau_c, tau_s, tau_w)
  tau_cap <- paste(
    "The table shows the tau parameters estimated by maximum likelihood.",
    "$\\tau_c$ is the estimated dispersion in cluster alphas.",
    "$\\tau_w$ is the estimated dispersion in factor alphas with a cluster.",
    "$\\tau_s$ is the estimated dispersion in alpha of the same factor in different regions."
  )
  # NOTE(review): the tau_w/tau_s column swap below is intentional per the
  # original inline comment — do not "fix" it.
  taus %>%
    select("Sample" = sample, "$\\tau_c$" = tau_c, "$\\tau_w$" = tau_s, "$\\tau_s$" = tau_w) %>% # Here I use the notation from eq 23
    xtable(auto=T, digits = 2, caption = tau_cap) %>%
    print(include.rownames = F, caption.placement = "top", sanitize.colnames.function = identity)
}

# Table - Factor Performance
# table_factor_info: wide table of OLS estimate, EB posterior mean, and
# P(alpha <= 0) per factor and region, with empty spacer columns between
# regions; factors never significant in the literature are starred.
# Reads globals `eb_est` and `char_info`.
table_factor_info <- function() {
  table <- eb_est$all$factors %>%
    mutate(p_zero = pnorm(q = 0, mean = post_mean, sd = post_sd)) %>%
    select(characteristic, region, ols_est, "eb_est" = post_mean, p_zero) %>%
    pivot_wider(names_from = region, values_from = c(ols_est, eb_est, p_zero), names_sep = "_") %>%
    select(characteristic, ends_with("_us"), ends_with("_developed"), ends_with("_emerging")) %>%
    left_join(char_info %>% select(characteristic, significance), by = "characteristic") %>%
    mutate(char_name = if_else(significance == 0, paste0(characteristic, "*"), characteristic)) %>%
    select(-characteristic, -significance) %>%
    select(char_name, everything()) %>%
    arrange(ols_est_us) %>%
    as.data.frame()
  # Insert empty spacer columns between the three region groups
  data.frame(table[, 1:4], "empty1" = rep("", nrow(table)), table[, 5:7], "empty2" = rep("", nrow(table)), table[, 8:10]) %>%
    xtable() %>%
    print()
}

# table_economic_benefit: CAPM regressions (Newey-West SEs, 6 lags) of the
# marginally-significant portfolios per region; stargazer LaTeX output.
table_economic_benefit <- function(sig_pfs) {
  sig_regs <- c("us", "developed", "emerging") %>% lapply(function(x) {
    fit <- lm(ret ~ mkt, data = sig_pfs %>% filter(type == "marg_sig" & region == x) %>% mutate(ret = ret*100, mkt = mkt*100)) # Ensures alpha is in Percentages
    nw <- fit %>% lmtest::coeftest(vcov = sandwich::NeweyWest(fit, lag = 6))
    list("fit"=fit, "nw"=nw)
  })
  lines_obs <- c("Observations",
                 sprintf("\\multicolumn{1}{r}{%s}", prettyNum(length(sig_regs[[1]]$fit$residuals), big.mark = ",")),
                 sprintf("\\multicolumn{1}{r}{%s}", prettyNum(length(sig_regs[[2]]$fit$residuals), big.mark = ",")),
                 sprintf("\\multicolumn{1}{r}{%s}", prettyNum(length(sig_regs[[3]]$fit$residuals), big.mark = ",")))
  lines_r2 <- c("Adjusted $R^2$",
                sprintf("\\multicolumn{1}{r}{%s}", formatC(summary(sig_regs[[1]]$fit)$adj.r.squared, digits = 2, format = "f")),
                sprintf("\\multicolumn{1}{r}{%s}", formatC(summary(sig_regs[[2]]$fit)$adj.r.squared, digits = 2, format = "f")),
                sprintf("\\multicolumn{1}{r}{%s}", formatC(summary(sig_regs[[3]]$fit)$adj.r.squared, digits = 2, format = "f")))
  stargazer::stargazer(sig_regs[[1]]$nw, sig_regs[[2]]$nw, sig_regs[[3]]$nw,
                       dep.var.labels.include = F,
                       dep.var.caption = "Region",
                       no.space = F,
                       intercept.bottom = F,
                       report = "vc*t",
                       column.labels = c("US", "Developed ex. US", "Emerging"),
                       add.lines=list(lines_obs, lines_r2),
                       covariate.labels = c("Alpha", "Market Beta"),
                       align=T, digits=2)
}

# PLOT FUNCTIONS ---------------------------------------------------------

# cluster_val: average pairwise correlation between clusters, returned as a
# zero-argument plotting closure (corrplot draws to the active device, so the
# plot must be re-run inside the output device). `colours_theme` is a global.
cluster_val <- function(cor, labels, op_format = "pdf") {
  pairwise_cor <- cor %>%
    as_tibble(rownames = "char1") %>%
    gather(-char1, key = "char2", value = "cor") %>%
    left_join(select(labels, characteristic, "label1" = hcl_label), by = c("char1"="characteristic")) %>%
    left_join(select(labels, characteristic, "label2" = hcl_label), by = c("char2"="characteristic")) %>%
    filter(char1 != char2) %>%
    mutate(hcl_pair = str_c(label1, "_", label2)) %>%
    group_by(hcl_pair) %>%
    summarise(
      n = n(),
      cor_avg = mean(cor)
    ) %>%
    ungroup() %>%
    separate(hcl_pair, c("hcl1", "hcl2"), sep = "_") %>%
    select(hcl1, hcl2, cor_avg) %>%
    spread(key = hcl2, cor_avg)
  pairwise_cor_names <- pairwise_cor$hcl1
  pairwise_cor <- pairwise_cor %>% select(-hcl1) %>% as.matrix()
  rownames(pairwise_cor) <- pairwise_cor_names
  # Needs to Be saved as a functional
  if (op_format == "tex") {
    corrplot_cex <- list(tl = 0.8, number = 0.5)
  }
  if (op_format == "pdf") {
    corrplot_cex <- list(tl = 0.7, number = 0.45)
  }
  function() {
    par(xpd=TRUE)
    pairwise_cor %>% corrplot::corrplot(method = "color", addCoef.col = "black", type = "lower", mar = c(0, 0, 3, 0), tl.cex = corrplot_cex$tl, number.cex = corrplot_cex$number, tl.col = "black", col = colorRampPalette(c(colours_theme[2], "white", colours_theme[1]))(200))
  }
}

# plot_mt_eb_comp: compares replication rates under multiple-testing
# adjustments vs. empirical-Bayes posteriors, by region. Returns a list with
# a per-factor alpha plot ("mt") and a replication-rate bar plot ("repl").
plot_mt_eb_comp <- function(mt, eb_all, eb_us = NULL,
eb_developed = NULL, eb_emerging = NULL, eb_world = NULL,
                            mts = c("OLS", "Bonferroni", "BY"),
                            regs = c("us", "developed", "emerging", "world"),
                            se_methods, se_regions) {
  mt_sub <- mt %>% mutate(method = method %>% factor(levels = c("OLS", "Bonferroni", "Holm", "BH", "BY", "EB - Region", "EB - Full"))) %>% filter(method %in% mts & region %in% regs)
  # Implied t-cutoff per method/region: midpoint between the smallest
  # significant and largest insignificant |t| (outer parens auto-print).
  (t_cutoff <- mt_sub %>% group_by(method, region) %>% summarise(
    t_cut = (min(abs(statistic)[p < 0.05]) + max(abs(statistic)[p > 0.05])) / 2
  ))
  # Stack the empirical-Bayes results: "EB - All" is the joint estimation,
  # "EB - Region" the region-by-region estimations.
  eb_comb <- bind_rows(
    eb_all$factors %>% mutate(method = "EB - All"),
    eb_us$factors %>% mutate(method = "EB - Region"),
    eb_developed$factors %>% mutate(method = "EB - Region"),
    eb_emerging$factors %>% mutate(method = "EB - Region"),
    eb_world$factors %>% mutate(method = "EB - Region")
  ) %>% mutate(
    method = method %>% factor(levels = c("OLS", "Bonferroni", "Holm", "BH", "BY", "EB - Region", "EB - All"))
  )
  # Replication rate per region/method, frequentist side
  mt_table <- mt_sub %>%
    mutate(characteristic = char_reg %>% str_remove("__.+")) %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig == T) %>%
    group_by(region, method, mt_adj) %>%
    summarise(
      n = n(),
      significant = mean(p < 0.05 & estimate > 0), # Estimates also needs to be positive
      max_t_insig = max(abs(statistic[p > 0.05]))
    )
  # Replication rate per region/method, Bayesian side (2.5% posterior
  # quantile above zero counts as replicated)
  eb_table <- eb_comb %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig == T) %>%
    mutate(ols_t = ols_est / ols_se) %>%
    group_by(region, method) %>%
    summarise(
      mt_adj = "Bayesian",
      n = n(),
      significant = mean(p025 > 0),
      max_t_insig = max(abs(ols_t[p025 < 0 & p975 > 0]))
    )
  (comp_table <- mt_table %>% bind_rows(eb_table))
  # To install drlib put: devtools::install_github("dgrtwo/drlib")
  # Old drlib-based version of the bar plot, kept disabled for reference.
  if (FALSE) {
    repl_plot <- comp_table %>%
      mutate(
        region_pretty = case_when(
          region == "us" ~ "US",
          region == "developed" ~ "Developed Ex. US",
          region == "emerging" ~ "Emerging",
          region == "world" ~ "World"
        ),
        region_pretty = region_pretty %>% factor(levels = c("US", "Developed Ex. US", "Emerging", "World"))
      ) %>%
      filter((region == "world" & method == "EB - All") | region != "world") %>%
      filter(method != "Bonferroni") %>%
      # filter(!(region == "world" & method != "Empirical Bayes")) %>%
      ggplot(aes(drlib::reorder_within(method, significant, region_pretty), significant, fill = method)) +
      geom_col() +
      drlib::scale_x_reordered() +
      geom_text(aes(label = str_c(formatC(round(significant * 100, 2), digits = 2, format = "f"), "%")), nudge_y = 0.025, size = 3) +
      facet_wrap(~region_pretty, nrow = 1, scales = "free_x") +
      labs(x = "Method", fill = "Multiple Testing Adj.", y = "Replication Rate (%)") +
      theme(legend.position = "none")
  }
  # Replication-rate bar chart by region and method
  repl_plot <- comp_table %>%
    # filter((region == "world" & method == "EB - All") | region != "world") %>%
    filter(method != "Bonferroni") %>%
    group_by(method) %>%
    # mutate(sort_var = significant[region == "us"]) %>%
    mutate(
      region_pretty = case_when(
        region == "us" ~ "US",
        region == "developed" ~ "Developed Ex. US",
        region == "emerging" ~ "Emerging",
        region == "world" ~ "World"
      ),
      region_pretty = region_pretty %>% factor(levels = c("US", "Developed Ex. US", "Emerging", "World")),
      method_pretty = case_when(
        method == "BY" ~ "Benjamini-Yekutieli",
        method == "EB - Region" ~ "Empirical Bayes - Region",
        method == "EB - All" ~ "Empirical Bayes - All",
        method == "OLS" ~ "OLS"
      ),
      method_pretty = method_pretty %>% factor(levels = c("OLS", "Benjamini-Yekutieli", "Empirical Bayes - Region", "Empirical Bayes - All"))
    ) %>%
    ggplot(aes(method_pretty, significant*100, fill = method_pretty)) + #reorder(method_pretty, sort_var)
    geom_col() +
    geom_text(aes(label = str_c(formatC(round(significant * 100, 1), digits = 1, format = "f"), "%")), nudge_y = 2.5, size = 3) +
    facet_grid(. ~ region_pretty, scales = "free", space='free') +
    # facet_wrap(~region_pretty, nrow = 1, scales = "free_x") +
    labs(x = "Method", fill = "Multiple Testing Adj.", y = "Replication Rate (%)") +
    theme(legend.position = "none", axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 10), axis.title.x = element_blank())
  # EB results reshaped to the same columns as mt_sub so they can be stacked
  eb_overview <- eb_comb %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    mutate(
      t_cut = 1.96,
      type = case_when(
        p025 > 0 & orig_sig == 1 ~ "Replicated",
        p025 <= 0 & orig_sig == 1 ~ "Not Replicated",
        orig_sig == 0 ~ "Never Significant"
      )
    ) %>%
    select(region, method, type, char_reg, estimate = post_mean, t_cut, se = post_sd)
  # Per-factor alpha estimates with method-specific confidence bands
  mt_plot <- mt_sub %>%
    mutate(characteristic = char_reg %>% str_remove("__.+")) %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    left_join(t_cutoff, by = c("region", "method")) %>%
    mutate(
      # significant = if_else(p < 0.05 & estimate > 0, "Significant", "Insignificant"),
      type = case_when(
        p <= 0.05 & estimate > 0 & orig_sig == 1 ~ "Replicated",
        (p > 0.05 | estimate <= 0) & orig_sig == 1 ~ "Not Replicated",
        orig_sig == 0 ~ "Never Significant"
      )
    ) %>%
    bind_rows(eb_overview) %>%
    filter(region %in% se_regions & method %in% se_methods) %>%
    mutate(
      # NOTE(review): "[?=__]" is a character class, not a lookahead — the
      # intent was likely "(?=__)". Verify downstream before changing.
      characteristic = char_reg %>% str_extract(".+[?=__]") %>% str_remove("__"),
      type = type %>% factor(levels = c("Replicated", "Not Replicated", "Never Significant")),
      method_pretty = case_when(
        method == "BY" ~ "Multiple Testing - Benjamini-Yekutieli",
        method == "EB - Region" ~ "Empirical Bayes - US",
        method == "EB - All" ~ "Empirical Bayes - Global",
        method == "OLS" ~ "OLS"
      ),
      method_pretty = method_pretty %>% factor(levels = c("OLS", "Multiple Testing - Benjamini-Yekutieli", "Empirical Bayes - US", "Empirical Bayes - Global"))
    ) %>%
    group_by(characteristic) %>%
    # mutate(sort_var = statistic[method == "OLS" & region == "us"]) %>%
    mutate(sort_var = estimate[method == "OLS" & region == "us"]) %>%
    group_by(region, method) %>%
    mutate(
      ols_rank = frank(sort_var),
      repl_rate = sum(type == "Replicated") / sum(type %in% c("Replicated", "Not Replicated"))
    ) %>%
    ggplot(aes(reorder(ols_rank, sort_var), estimate, colour = type, linetype = type)) +
    geom_point() +
    geom_text(aes(x = 35, y = 1.45, label = str_c("Replication Rate: ", formatC(round(repl_rate*100, 1), digits = 1, format = "f"), "%")), colour = "black", size = 3, check_overlap = T) +
    geom_errorbar(aes(ymin = estimate - t_cut * se, ymax = estimate + t_cut * se)) +
    facet_wrap(~method_pretty, ncol = length(se_methods) / 2) +
    coord_cartesian(ylim = c(-1, 1.5)) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    guides(colour = guide_legend(override.aes = list(shape = NA))) +
    labs(y = "Monthly Alpha (%)") +
    theme(
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
      # text = element_text(size = 13),
      legend.title = element_blank(),
      legend.position = "top"
    )
  list("mt" = mt_plot, "repl" = repl_plot)
}

# plot_fdr: FDR and FWR of simulated data against the t critical value, with
# reference lines at rate 0.05 and t = 1.96.
plot_fdr <- function(simulated_fdr) {
  simulated_fdr %>%
    gather(fdr, fwr, key = "type", value = "rate") %>%
    mutate(type = type %>% str_to_upper()) %>%
    ggplot(aes(t_cutoff, rate, colour = type)) +
    geom_point() +
    geom_hline(yintercept = 0.05, linetype = "dashed") +
    geom_vline(xintercept = 1.96, linetype = "dotted") +
    scale_y_continuous(breaks = c(0, 0.05, 0.25, 0.5, 0.75, 1)) +
    scale_x_continuous(breaks = c(0, 1.96, 2.5, 5.0, 7.5, 10)) +
    labs(x = "Critical Value (t)", y = "Rate", colour = "Type:") +
    theme(
      legend.position = "top"
    )
}

# plot_factor_post: posterior mean alphas with 95% bands per factor, grouped
# and coloured by cluster. If `orig_sig`, keep only factors originally found
# significant. The tiny post_mean/1e6 term breaks ties in the cluster-median
# sort so within-cluster ordering is stable.
plot_factor_post <- function(eb, orig_sig, cluster_order) {
  if (orig_sig) {
    orig_sig_values <- T
  } else {
    orig_sig_values <- c(T, F)
  }
  eb$factors %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig %in% orig_sig_values) %>%
    group_by(hcl_label) %>%
    mutate(
      sort_var = median(post_mean) + post_mean / 1000000
    ) %>%
    mutate(hcl_label = hcl_label %>% factor(levels =
cluster_order)) %>%
    ggplot(aes(reorder(characteristic, sort_var), post_mean, colour = hcl_label, shape = hcl_label)) +
    geom_point() +
    scale_shape_manual(values=1:13) +
    geom_errorbar(aes(ymin = post_mean - 1.96 * post_sd, ymax = post_mean + 1.96 * post_sd)) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    labs(y = "Monthly Alpha with 95% Confidence Interval (%)", colour = "Cluster", shape = "Cluster") +
    theme(
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)
    )
}

# plot_repl_region: replication rate by cluster and region (bar chart,
# faceted by region). A factor counts as replicated when the lower 95%
# posterior bound is above zero. The n/1e6 term is a stable tie-breaker.
plot_repl_region <- function(eb_all, cluster_order) {
  eb_all$factors %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig == 1) %>%
    mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>%
    group_by(region, hcl_label) %>%
    summarise(n = n(), repl_rate = mean(post_mean - 1.96 * post_sd > 0)) %>%
    group_by(hcl_label) %>%
    mutate(
      sort_var = repl_rate[region == "us"] + n[region == "us"] / 1e6,
      region_pretty = case_when(
        region == "us" ~ "USA",
        region == "developed" ~ "Developed Ex. USA",
        region == "emerging" ~ "Emerging"
      ),
      region_pretty = region_pretty %>% factor(levels = c("USA", "Developed Ex. USA", "Emerging"))
    ) %>%
    ggplot(aes(reorder(hcl_label, sort_var), repl_rate*100, fill = hcl_label)) +
    geom_col() +
    labs(y = "Replication Rate (%)") +
    facet_wrap(~region_pretty, ncol = 1) +
    theme(
      legend.position = "none",
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)
    )
}

# Figure 1 - Waterfall Graph
# plot_lit_comp: waterfall of replication rates across implementations, from
# the Hou-Xue-Zhang (2020) baseline (hard-coded 0.35) through raw returns,
# CAPM alphas, multiple-testing adjustment, and the two EB estimations.
plot_lit_comp <- function(eb_us, mt_res, eb_world, excl_insig=T) {
  if (excl_insig) {
    sig_group <- T
  } else {
    sig_group <- c(T, F)
  }
  # Raw-return regressions: intercept-only lm per characteristic
  raw_reg <- eb_us$input$long %>%
    group_by(characteristic) %>%
    nest() %>%
    mutate(
      raw_reg = data %>% map(~ lm(ret ~ 1, data = .x)),
      tidied = raw_reg %>% map(tidy)
    ) %>%
    unnest(tidied) %>%
    ungroup()
  raw_overall <- raw_reg %>% summarise(repl_rate = mean(p.value < 0.05 & estimate > 0)) %>% pull(repl_rate)
  raw_sig <- raw_reg %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig == T) %>%
    summarise(repl_rate = mean(p.value < 0.05 & estimate > 0)) %>%
    pull(repl_rate)
  # CAPM-alpha replication rates, with (BY) and without (OLS) MT adjustment
  capm <- mt_res %>%
    filter(region == "us" & method %in% c("BY", "OLS")) %>%
    mutate(characteristic = char_reg %>% str_remove("__.+")) %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig %in% sig_group) %>%
    group_by(method) %>%
    summarise(repl_rate = mean(p < 0.05 & estimate > 0))
  eb_us_repl <- eb_us$factors %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig %in% sig_group) %>%
    summarise(repl_rate = mean(p025 > 0)) %>%
    pull(repl_rate)
  eb_global_repl <- eb_world$factors %>%
    ungroup() %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig %in% sig_group) %>%
    summarise(repl_rate = mean(post_mean - 1.96 * post_sd > 0)) %>%
    pull(repl_rate)
  # Waterfall Graph
  litterature_comp <- tribble(
    ~ type, ~repl_rate,
    "hxz", 0.35,
    "raw", raw_overall,
    "raw_sig", raw_sig,
    "alpha", capm %>% filter(method == "OLS") %>% pull(repl_rate),
    "mt", capm %>% filter(method == "BY") %>% pull(repl_rate),
    "eb_us", eb_us_repl,
    "eb_global", eb_global_repl
  )
  if (excl_insig == F) {
    litterature_comp <- litterature_comp %>% filter(type != "raw_sig")
  }
  litterature_comp <- litterature_comp %>%
    mutate(
      repl_rate = repl_rate * 100,
      type = type %>% factor(levels = c("hxz", "raw", "raw_sig", "alpha", "mt", "eb_us", "eb_global")),
      prev_repl_rate = dplyr::lag(repl_rate, default = 0),
      impact = if_else(repl_rate > prev_repl_rate, "Increase", "Decrease"),
      impact = impact %>% factor(levels = c("Increase", "Decrease"))
    ) %>%
    setDT()
  w <- 0.3 #use to set width of bars
  l1 <- -3
  inc <- -3
  col_top <- "black" # colours_theme[2]
  col_bot <- "black" # colours_theme[1]
  type <- litterature_comp$type
  plot <- litterature_comp %>%
    ggplot(aes(xmin = as.integer(type) - w, xmax = as.integer(type) + w, ymin = prev_repl_rate, ymax = repl_rate, fill = impact)) +
    geom_rect() +
    geom_segment(data = litterature_comp[1:(.N - 1)], aes(x = as.integer(type) + w, xend = as.integer(type) + w + 1, y = repl_rate, yend = repl_rate)) +
    scale_x_discrete(limits = type) +
    geom_text(aes(x = as.integer(type), y = repl_rate + if_else(sign(repl_rate - prev_repl_rate)!=0, sign(repl_rate - prev_repl_rate), 1) * 2, label = str_c(formatC(round(repl_rate, 1), digits = 1, format = "f"), "%"))) +
    scale_fill_manual(values = (c("Increase" = colours_theme[1], "Decrease" = colours_theme[2]))) +
    labs(x = "Implementation", y = "Replication Rate (%)") +
    coord_cartesian(ylim = c(0, 90), expand = FALSE, clip = "off") +
    # HXZ
    annotate(geom = "text", x = "hxz", y = l1, label = "Hou, Xue, and", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "hxz", y = l1 + inc*1, label = "Zhang (2020)", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "hxz", y = l1 + inc*2, label = "Raw returns", colour = col_bot, fontface = 1) +
    # Our Raw
    annotate(geom = "text", x = "raw", y = l1, label = "Our sample", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "raw", y = l1 + inc*1, label = "Raw returns,", colour = col_bot) +
    annotate(geom = "text", x = "raw", y = l1 + inc*2, label = "our methodology", colour = col_bot) +
    # Our Alpha
    annotate(geom = "text", x = "alpha", y = l1, label = "Our sample", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "alpha", y = l1 + inc*1, label = "CAPM alphas", colour = col_bot) +
    # Our MT
    annotate(geom = "text", x = "mt", y = l1, label = "Harvey, Liu, and", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "mt", y = l1 + inc*1, label = "Zhu (2016)", colour = col_bot, fontface = 2) +
    annotate(geom = "text", x = "mt", y = l1 + inc*2, label = "Multiple testing", colour = col_bot) +
    annotate(geom = "text", x = "mt", y = l1 + inc*3, label = "adjustment", colour = col_bot) +
    # Our EB-US
    annotate(geom = "text", x = "eb_us", y = l1, label = "Our Bayesian", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "eb_us", y = l1 + inc*1, label = "estimation", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "eb_us", y = l1 + inc*2, label = "US data", colour = col_bot) +
    # Our EB-Global
    annotate(geom = "text", x = "eb_global", y = l1, label = "Our Bayesian", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "eb_global", y = l1 + inc*1, label = "estimation", colour = col_top, fontface = 2) +
    annotate(geom = "text", x = "eb_global", y = l1 + inc*2, label = "Global data", colour = col_bot) +
    theme(
      legend.title = element_blank(),
      plot.margin = unit(c(1, 1, 4, 1), "lines"),
      axis.title.x = element_blank(),
      axis.text.x = element_blank(),
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank())
  if (excl_insig) {
    # Our Raw Significant only
    plot <- plot +
      annotate(geom = "text", x = "raw_sig", y = l1, label = "Our sample", colour = col_top, fontface = 2) +
      annotate(geom = "text", x = "raw_sig", y = l1 + inc*1, label = "Excl. factors", colour = col_bot) +
      annotate(geom = "text", x = "raw_sig", y = l1 + inc*2, label = "never significant", colour = col_bot)
  }
  return(plot)
}

# plot_many_factors: illustrates how 95% confidence bands widen as the number
# of tested factors grows under MT adjustments, vs. the EB posterior band.
# Reads the global `eb_est`; no arguments.
plot_many_factors <- function() {
  # The Power of Many Factors
  many_factors_se <- eb_est$us$input$long %>%
    select("char_reg" = name_wide, ret_neu_scaled, mkt_vw_exc) %>%
    mutate(
      region = char_reg %>% str_extract(pattern = "(?<=_{2}).+"),
      region = case_when(
        region == "us" ~ "USA",
        region == "developed" ~ "Developed Ex. USA",
        region == "emerging" ~ "Emerging"
      ),
      region = region %>% factor(levels = c("USA", "Developed Ex. USA", "Emerging"))
    ) %>%
    filter(!is.na(ret_neu_scaled)) %>%
    group_by(char_reg, region) %>%
    nest() %>%
    mutate(
      fit = data %>% map(~lm(ret_neu_scaled ~ mkt_vw_exc, data = .x)), # They are market neutral by construction but this takes care of the degress of freedom adjustment
      nw = fit %>% map(~ lmtest::coeftest(.x, vcov = sandwich::NeweyWest(.x)) %>% broom::tidy()),
      df = fit %>% map_dbl(~ .x$df.residual)
    ) %>%
    unnest(nw) %>%
    filter(term == "(Intercept)") %>%
    rename("p_ols" = p.value) %>%
    # group_by(region) %>%
    ungroup() %>%
    mutate(
      n = n(),
      p_bonf = p_ols %>% p.adjust(method = "bonferroni"),
      p_holm = p_ols %>% p.adjust(method = "holm"),
      p_bh = p_ols %>% p.adjust(method = "BH"),
      p_by = p_ols %>% p.adjust(method = "BY")
    ) %>%
    select(n, region, char_reg, estimate, statistic, df, "se" = std.error, starts_with("p_")) %>%
    gather(starts_with("p_"), key = "method", value = "p") %>%
    mutate(
      method = method %>% str_remove("^p_"),
      mt_adj = case_when(
        method == "ols" ~ "None",
        method == "bh" ~ "FDR",
        method == "by" ~ "FDR",
        method == "bonf" ~ "FWR",
        method == "holm" ~ "FWR"
      ),
      method = case_when(
        method == "ols" ~ "OLS",
        method == "bh" ~ "BH",
        method == "by" ~ "Benjamini-Yekutieli",
        method == "bonf" ~ "Bonferroni",
        method == "holm" ~ "Holm",
        TRUE ~ method
      ),
      method = method %>% factor(levels = c("OLS", "Bonferroni", "Holm", "BH", "Benjamini-Yekutieli", "EB - Region", "EB - Full"))
    )
  # Implied t-cutoff per method when all 140 factors are tested jointly
  (mf_t <- many_factors_se %>%
      group_by(method) %>%
      summarise(
        t_140 = (min(abs(statistic)[p < 0.05]) + max(abs(statistic)[p > 0.05])) / 2
      ))
  avg_se <- mean(many_factors_se$se)
  # Confidence bands at n = 1 (t = 1.96) and n = 140 (method-implied t)
  ols_ci <- tibble(
    method = c("OLS", "Bonferroni", "Benjamini-Yekutieli", "Empirical Bayes"),
    t_1 = rep(1.96, 4)
  ) %>%
    left_join(mf_t, by = "method") %>%
    gather(t_1, t_140, key = "n_factors", value = "t") %>%
    mutate(
      n_factors = n_factors %>% str_remove("t_") %>% as.integer(),
      p025 = -t * avg_se,
      p975 = t * avg_se
    ) %>%
    filter(!(method == "Empirical Bayes" & n_factors == 140))
  eb_ci <- eb_est$us$factors %>%
    summarise(
      eb_se = mean(post_sd)
    ) %>%
    transmute(
      method = "Empirical Bayes",
      n_factors = 140,
      p025 = -1.96 * eb_se,
      p975 = 1.96 * eb_se
    )
  comb_data <- bind_rows(ols_ci, eb_ci)
  comb_data %>%
    ggplot(aes(n_factors, colour = method, linetype = method)) +
    geom_line(aes(y = p025)) +
    geom_line(aes(y = p975)) +
    geom_ribbon(data=comb_data %>% filter(method == "Empirical Bayes"), aes(x = n_factors, ymin=p025,ymax=p975), fill=colours_theme[3], alpha=0.2, inherit.aes = F) +
    labs(x = "Number of Factors", y = "Centered 95% Confidence Interval", colour = "Method", linetype = "Method") +
    scale_y_continuous(breaks=c(avg_se * 1.96, 0, -avg_se * 1.96), labels=c(expression(hat(alpha) + sigma[hat(alpha)] %*% t), expression(hat(alpha)), expression(hat(alpha) - sigma[hat(alpha)] %*% t))) +
    scale_x_continuous(breaks = c(1, 140), expand=expansion(mult = c(0, 0.3), add = c(5, 0))) +
    geom_dl(aes(label = method, y = p975), method = list(dl.trans(x = x + 0.2), "last.points", cex = 1)) +
    theme(
      text = element_text(size = 12),
      axis.title.y = element_blank(),
      legend.position = "none"
    )
}

# Plot In-Sample vs.
# Out of Sample
# plot_is_oos: scatter of in-sample vs. post-sample alphas per region.
# `cluster_plot` (cluster-level averages) is built but not returned; the
# factor-level scatter is the function's value.
plot_is_oos <- function(ub_us, ub_dev, ub_emer) {
  is_oos_split <- ub_us$ols_regs %>%
    mutate(region = "us") %>%
    bind_rows(
      ub_dev$ols_regs %>% mutate(region = "dev"),
      ub_emer$ols_regs %>% mutate(region = "emer")
    ) %>%
    mutate(
      region = case_when(
        region == "us" ~ "USA",
        region == "dev" ~ "Developed",
        region == "emer" ~ "Emerging"
      ),
      region = region %>% factor(levels = c("USA", "Developed", "Emerging"))
    ) %>%
    rename("characteristic" = c)
  # Cluster-level averages (unused return value — kept for reference)
  cluster_plot <- is_oos_split %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig == 1) %>%
    left_join(cluster_labels, by = "characteristic") %>%
    gather(is, post_sample, key = "period", value = "estimate") %>%
    group_by(region, hcl_label, period) %>%
    summarise(
      alpha_mean = mean(estimate)
    ) %>%
    select(region, hcl_label, period, alpha_mean) %>%
    spread(key = period, value = alpha_mean) %>%
    mutate(nudge_y = -0.015) %>%
    ggplot(aes(is, post_sample)) +
    geom_point() +
    ggrepel::geom_text_repel(aes(label = hcl_label), nudge_y = -0.015) +
    geom_abline(intercept = 0, slope = 1, linetype = "dotted") +
    # geom_smooth(method = "lm", se = F) +
    facet_wrap(~region) +
    labs(x = "Monthly Alpha (%): In-Sample", y = "Monthly Alpha (%): Post Sample")
  is_oos_split %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig == 1) %>%
    ggplot(aes(is, post_sample)) +
    geom_point() +
    geom_abline(intercept = 0, slope = 1, linetype = "dotted") +
    geom_smooth(method = "lm", se = F, formula = "y ~ x") +
    facet_wrap(~region, ncol = 1) +
    labs(x = "Monthly Alpha (%): In-Sample", y = "Monthly Alpha (%): Post Sample")
}

# plot_is_oos_factors: in-sample vs. out-of-sample factor alphas by region,
# with an OLS fit line. The nested lm summary is computed (auto-printed in
# interactive use) before the plot is returned.
plot_is_oos_factors <- function(is_oos_regions) {
  is_oos_data <- is_oos_regions %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    filter(orig_sig == 1) %>%
    mutate(
      region = case_when(
        region == "us" ~ "USA",
        region == "developed" ~ "Developed Ex. USA",
        region == "emerging" ~ "Emerging"
      ),
      region = region %>% factor(levels = c("USA", "Developed Ex. USA", "Emerging"))
    ) %>%
    select(region, characteristic, period, estimate) %>%
    spread(key = period, value = estimate)
  is_oos_data %>%
    group_by(region) %>%
    nest() %>%
    mutate(
      fit = data %>% map(~lm(oos ~ is, data = .x)),
      n = fit %>% map_dbl(~length(.x$residuals)),
      tidied = fit %>% map(tidy)
    ) %>%
    unnest(tidied)
  is_oos_data %>%
    ggplot(aes(is, oos)) +
    geom_point() +
    geom_smooth(method = "lm", se = F, formula = "y ~ x") +
    facet_wrap(~region) +
    geom_hline(yintercept = 0) +
    geom_vline(xintercept = 0) +
    coord_fixed() +
    labs(x = "Monthly Alpha (%): In-Sample", y = "Monthly Alpha (%): Out-of-Sample")
}

# plot_tpf: tangency-portfolio weights per cluster with bootstrap error bars.
# "Apparent" rows hold the point estimates; remaining rows are bootstrap
# draws summarised into quantile bands.
plot_tpf <- function(tpf, cluster_order, ci_low = 0.05, ci_high = 0.95) {
  orig <- tpf %>% filter(id == "Apparent") %>% select(coef) %>% unnest(coef) %>% rename("tpf_weight" = weight)
  bs <- tpf %>%
    filter(id != "Apparent") %>%
    unnest(coef) %>%
    group_by(term) %>%
    summarise(
      bs_mean = mean(weight),
      bs_sd = sd(weight),
      bs_se = bs_sd / sqrt(n()),
      bs_low = weight %>% quantile(ci_low),
      bs_high = weight %>% quantile(ci_high),
      bs_prob_zero = mean(weight == 0)
    ) %>%
    left_join(orig, by = "term") %>%
    mutate(bs_bias = bs_mean - tpf_weight)
  print(paste0("Clusters with significantly positive TPF weight: ", sum(filter(bs, term != "Market")$bs_low>0)))
  bs %>%
    mutate(
      term = term %>% factor(levels = c(cluster_order, "Market"))
    ) %>%
    ggplot(aes(reorder(term, tpf_weight), tpf_weight*100, fill = term)) +
    geom_col() +
    coord_flip() +
    geom_errorbar(mapping = aes(ymin = bs_low*100, ymax = bs_high*100), width = 0.2, size = 0.2) +
    labs(y = "Weight in Tangency PF (%)") +
    theme(
      axis.title.y = element_blank(),
      legend.position = "none"
    )
}

# plot_tpf_region: as plot_tpf, but faceted by market region, clusters sorted
# by their US weight.
plot_tpf_region <- function(tpf_us, tpf_dev, tpf_emer, cluster_order, ci_low = 0.05, ci_high = 0.95) {
  all <- bind_rows(tpf_us, tpf_dev, tpf_emer)
  orig <- all %>% filter(id == "Apparent") %>% select(market_region, coef) %>% unnest(coef) %>% rename("tpf_weight" = weight)
  bs <- all %>%
    filter(id != "Apparent") %>%
    unnest(coef) %>%
    group_by(market_region, term) %>%
    summarise(
      bs_mean = mean(weight),
      bs_sd = sd(weight),
      bs_se = bs_sd / sqrt(n()),
      bs_low = weight %>% quantile(ci_low),
      bs_high = weight %>% quantile(ci_high),
      bs_prob_zero = mean(weight == 0)
    ) %>%
    left_join(orig, by = c("market_region", "term")) %>%
    mutate(bs_bias = bs_mean - tpf_weight)
  bs %>%
    group_by(term) %>%
    mutate(
      sort_var = tpf_weight[market_region == "us"],
      region_pretty = case_when(
        market_region == "us" ~ "USA",
        market_region == "developed" ~ "Developed Ex. USA",
        market_region == "emerging" ~ "Emerging"
      ),
      region_pretty = region_pretty %>% factor(levels = c("USA", "Developed Ex. USA", "Emerging"))
    ) %>%
    mutate(term = term %>% factor(levels = c(cluster_order, "Market"))) %>%
    ggplot(aes(reorder(term, sort_var), tpf_weight*100, fill = term)) +
    geom_col() +
    geom_errorbar(mapping = aes(ymin = bs_low*100, ymax = bs_high*100), width = 0.2, size = 0.2) +
    labs(y = "Weight in Tangency PF (%)") +
    theme(
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
      legend.position = "none"
    ) +
    facet_wrap(~region_pretty, ncol = 1)
}

# plot_tpf_size: as plot_tpf, but faceted by size group (Mega..Nano),
# clusters sorted by their weight in the mega-cap sample.
plot_tpf_size <- function(tpf_size_samples, cluster_order, ci_low = 0.05, ci_high = 0.95) {
  orig <- tpf_size_samples %>% filter(id == "Apparent") %>% select(size_grp, coef) %>% unnest(coef) %>% rename("tpf_weight" = weight)
  bs <- tpf_size_samples %>%
    filter(id != "Apparent") %>%
    unnest(coef) %>%
    group_by(size_grp, term) %>%
    summarise(
      bs_mean = mean(weight),
      bs_sd = sd(weight),
      bs_se = bs_sd / sqrt(n()),
      bs_low = weight %>% quantile(ci_low),
      bs_high = weight %>% quantile(ci_high),
      bs_prob_zero = mean(weight == 0)
    ) %>%
    left_join(orig, by = c("size_grp", "term")) %>%
    mutate(bs_bias = bs_mean - tpf_weight)
  bs %>%
    group_by(term) %>%
    mutate(
      sort_var = tpf_weight[size_grp == "mega"],
      size_grp_pretty = size_grp %>% str_to_title(),
      size_grp_pretty = size_grp_pretty %>% factor(levels = c("Mega", "Large", "Small", "Micro", "Nano"))
    ) %>%
    mutate(term = term %>% factor(levels = c(cluster_order, "Market"))) %>%
    ggplot(aes(reorder(term, sort_var), tpf_weight*100, fill = term)) +
    geom_col() +
    geom_errorbar(mapping = aes(ymin = bs_low*100, ymax = bs_high*100), width = 0.2, size = 0.2) +
    labs(y = "Weight in Tangency PF (%)") +
    theme(
      axis.title.x = element_blank(),
      axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
      legend.position = "none"
    ) +
    facet_wrap(~size_grp_pretty, ncol = 1)
}

# plot_over_time: posterior of the equal-weighted average factor alpha as the
# sample end date extends. Relies on globals `ot_region`, `char_info`,
# `settings`, `regional_pfs`, `colours_theme` — defined elsewhere.
# (Definition continues past this chunk.)
plot_over_time <- function(posterior_over_time, orig_sig, ols_incl, lb, bw) {
  if (orig_sig) {
    orig_sig_values <- T
  } else {
    orig_sig_values <- c(T, F)
  }
  all_factors <- tibble("char_reg" = rownames(posterior_over_time[[1]]$factor_mean)) %>%
    mutate(characteristic = char_reg %>% str_remove(paste0("__", ot_region))) %>%
    left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>%
    mutate(
      selected_factors = (orig_sig %in% orig_sig_values)
    )
  i <- all_factors$selected_factors
  # Posterior mean/sd of the equal-weighted average alpha at each end date
  full_posterior <- posterior_over_time %>% lapply(function(eb_act) {
    a <- eb_act$factor_mean[i]
    a_cov <- eb_act$factor_cov[i, i]
    n <- length(a)
    w <- rep(1/n, n)
    post_mean <- drop(t(a) %*% w)
    post_sd <- drop(sqrt(t(w) %*% a_cov %*% w))
    avg_ols <- mean(eb_act$factors$ols_est[i])
    tibble("end_date"= eb_act$end_date, n=n, post_mean, post_sd, avg_ols)
  }) %>% bind_rows()
  # Black and white coloring
  if (bw) {
    col1 <- "black"
    col2 <- "grey35"
  } else {
    col1 <- colours_theme[1]
    col2 <- colours_theme[2]
  }
  if (ols_incl) {
    # Create OLS benchmarks
    # NOTE(review): `scale_alpha` relies on R's partial matching against
    # eb_prepare's `scale_alphas` parameter — verify intended.
    ols_bm <- seq.Date(from = as.Date("1959-12-31"), to = settings$end_date, by = "1 year") %>% lapply(function(end_date) {
      data <- regional_pfs[region == ot_region] %>%
        filter(characteristic %in% all_factors$characteristic[i]) %>%
        filter(eom >= settings$start_date & eom <= end_date) %>%
        eb_prepare(
          scale_alpha = settings$eb$scale_alpha,
          overlapping = settings$eb$overlapping
        )
      avg_alpha_full <- data$long %>%
        group_by(characteristic) %>%
        summarise(
          n = n(),
          alpha =
mean(ret_neu_scaled) ) %>% ungroup() %>% summarise( end_date = end_date, type = "avg_alpha_full", alpha = mean(alpha) ) avg_alpha_st <- data$long %>% filter(year(eom) > (year(end_date)-lb)) %>% group_by(characteristic) %>% mutate( ret_neu_st = (ret - cov(ret, mkt_vw_exc)/var(mkt_vw_exc) * mkt_vw_exc)*100, ret_neu_st = ret_neu_st / sd(ret_neu_st) * (10 / sqrt(12)) ) %>% summarise(alpha = mean(ret_neu_st)) %>% ungroup() %>% summarise( end_date = end_date, type = "avg_alpha_st", alpha = mean(alpha) ) alpha_avg <- data$long %>% group_by(eom) %>% summarise( n = n(), ret = mean(ret_neu_scaled), mkt_vw_exc = unique(mkt_vw_exc) ) %>% ungroup() %>% mutate( ret_neu = ret - cov(ret, mkt_vw_exc) / var(mkt_vw_exc) * mkt_vw_exc ) %>% summarise( end_date = end_date, type = "alpha_avg_full", alpha = mean(ret_neu) ) bind_rows(avg_alpha_full, avg_alpha_st, alpha_avg) }) %>% bind_rows() ols_bm_wide <- ols_bm %>% spread(key = type, value = alpha) (plot_1 <- full_posterior %>% left_join(ols_bm_wide, by = "end_date") %>% ggplot(aes(end_date)) + geom_point(aes(y = post_mean, colour="Average Posterior Alpha", shape = "Average Posterior Alpha")) + geom_point(aes(y = avg_alpha_full, colour="Average OLS Alpha", shape = "Average OLS Alpha")) + geom_errorbar(aes(ymin = post_mean + 1.96 * post_sd, ymax = post_mean - 1.96 * post_sd)) + scale_colour_manual(name = "Test", values = c("Average Posterior Alpha"=col1, "Average OLS Alpha"=col2)) + scale_shape_manual(name = "Test", values = c("Average Posterior Alpha" = 16, "Average OLS Alpha" = 17)) + labs(y = "Posterior Alpha with 95% CI (%)") + ylim(c(0, NA)) + scale_x_date(breaks = seq.Date(as.Date("1960-12-31"), as.Date("2020-12-31"), by = "10 years"), date_labels = "%Y-%m") + theme( legend.title = element_blank(), legend.position = "top", axis.text.x = element_blank(), axis.title.x = element_blank() )) plot_2 <- full_posterior %>% left_join(ols_bm_wide, by = "end_date") %>% ggplot(aes(end_date, avg_alpha_st)) + geom_col() + labs(y = "5-year 
Rolling Alpha (%)") + scale_x_date(breaks = seq.Date(as.Date("1960-12-31"), as.Date("2020-12-31"), by = "10 years"), date_labels = "%Y-%m") + theme( axis.title.x = element_blank() ) # print(full_posterior %>% left_join(ols_bm_wide, by = "end_date") %>% mutate(diff_pm = post_mean - lag(post_mean), diff_aaf = avg_alpha_full - lag(avg_alpha_full)) %>% filter(end_date != as.Date("1960-12-31")) %>% summarise(sd_pm = sd(diff_pm), sd_aaf = sd(diff_aaf))) plot <- cowplot::plot_grid(plot_1, plot_2, ncol = 1, rel_heights = c(2, 1)) } else { plot <- full_posterior %>% ggplot(aes(end_date)) + geom_point(aes(y = post_mean), colour=col1, shape = 16) + geom_errorbar(aes(ymin = post_mean + 1.96 * post_sd, ymax = post_mean - 1.96 * post_sd)) + labs(y = "Posterior Alpha with 95% CI (%)") + ylim(c(0, NA)) + scale_x_date(breaks = seq.Date(as.Date("1960-12-31"), as.Date("2020-12-31"), by = "10 years"), date_labels = "%Y-%m") + theme( legend.title = element_blank(), legend.position = "top", axis.title.x = element_blank() ) } print(full_posterior %>% mutate(ci_width = post_sd*1.96*2) %>% filter(end_date %in% c(as.Date("1960-12-31"), settings$end_date))) plot } plot_taus_over_time <- function(posterior_over_time_flex) { data <- posterior_over_time_flex %>% lapply(function(x) { x$mle %>% mutate(end_date = x$end_date) }) %>% bind_rows() %>% filter(estimate != "alpha") %>% mutate(estimate_pretty = if_else(estimate == "tau_s", "tau_w", estimate)) ymax <- max(data$ml_est) data %>% ggplot(aes(end_date, ml_est, colour = estimate_pretty, linetype = estimate_pretty)) + geom_line() + scale_linetype_manual(values = c('tau_c' = "solid", 'tau_w' = "longdash"), name = '', labels = c(expression(tau[c]), expression(tau[w]))) + scale_colour_manual(values = c('tau_c' = colours_theme[1], 'tau_w' = colours_theme[2]), name = '', labels = c(expression(tau[c]), expression(tau[w]))) + scale_x_date(breaks = seq.Date(as.Date("1960-12-31"), as.Date("2020-12-31"), by = "10 years"), date_labels = "%Y-%m") + labs(y = 
"Maximum Likelihood Estimate (%)") + ylim(c(0, ymax)) + theme( legend.title = element_blank(), legend.position = "top", axis.title.x = element_blank() ) } plot_sim_fdr <- function(simulation) { tau_w_names <- c( `0.01` = expression(tau[w] ~ "= 0.01%"), `0.2` = expression(tau[w] ~ "= 0.20%") ) stat_labels <- c( `False Discovery Rate` = expression(~"False Discovery Rate"), `True Discovery Rate` = expression(~"True Discovery Rate"), `True Discoveries` = expression(~"True Discoveries"), `False Discoveries` = expression(~"False Discoveries") ) plot_data <- simulation %>% gather(n_disc, fdr, true_disc, false_disc, true_disc_rate, key = "stat", value = "number") %>% filter(stat %in% c("fdr", "true_disc", "false_disc", "true_disc_rate")) %>% mutate( stat = case_when( stat == "fdr" ~ "False Discovery Rate", stat == "true_disc" ~ "True Discoveries", stat == "false_disc" ~ "False Discoveries", stat == "true_disc_rate" ~ "True Discovery Rate" ), stat = stat %>% factor(levels = c("False Discovery Rate", "True Discovery Rate", "True Discoveries", "False Discoveries")), type = case_when( type == "by" ~ "Benjamini and Yekutieli", type == "ols" ~ "OLS", type == "eb" ~ "Empirical Bayes" ), type = type %>% factor(levels = c("OLS", "Benjamini and Yekutieli", "Empirical Bayes")), # tau_w_title = formatC(tau_w, digits = 2, format = "f"), # tau_w_title = as.character(eval(bquote(tau[w] ~ "=" ~ .(tau_w_title)~ "%"))) tau_w_title = tau_w %>% factor(labels = tau_w_names), stat_title = stat %>% factor(label = stat_labels) ) fdr_plot <- plot_data %>% filter(stat == "False Discovery Rate") %>% ggplot(aes(tau_c, number, colour = type)) + geom_point() + geom_line() + labs(x = "tau_c (%)", y = "False Discovery Rate", colour = "Adjustment") + facet_wrap(stat~tau_w_title, labeller = label_bquote(tau[w] ~ "=" ~ .(tau_w_title)~ "%")) true_disc_rate <- plot_data %>% filter(stat == "True Discovery Rate") %>% ggplot(aes(tau_c, number, colour = type)) + geom_point() + geom_line() + labs(x = "tau_c (%)", 
y = "True Discovery Rate", colour = "Adjustment") + facet_wrap(stat~tau_w_title, ncol = 2) plot_data %>% filter(stat %in% c("False Discovery Rate", "True Discovery Rate")) %>% group_by(stat) %>% mutate(scale_max = max(number)) %>% mutate(scale_min = min(number)) %>% ggplot(aes(tau_c, number, colour = type, shape = type)) + geom_point() + geom_point(aes(y = scale_max), alpha = 0) + geom_point(aes(y = scale_min), alpha = 0) + geom_line() + labs(x = bquote(bold(tau[c])~"(%)"), colour = "Type:", linetype = "Type:", shape = "Type:") + facet_wrap(stat_title~tau_w_title, scales = "free_y", labeller = label_parsed) + # facet_wrap(stat_title~tau_w_title) + # facet_wrap(stat~tau_w_title, scales = "free_y", labeller = label_bquote(tau[w] ~ "=" ~ .(tau_w_title)~ "%")) + theme( axis.title.y = element_blank(), axis.title.x = element_text(size = 12), legend.position = "top" ) } plot_size_overall <- function(eb_size, flipped = F, text = F) { size_repl <- eb_size %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% filter(orig_sig == 1) %>% group_by(size_grp) %>% summarise( repl_rate = mean(p025>0) ) if (flipped) { size_plot <- size_repl %>% mutate(size_grp = size_grp %>% factor(levels = c("Nano", "Micro", "Small", "Large", "Mega"))) %>% ggplot(aes(x = size_grp, y = repl_rate*100)) + geom_col(fill = colours_theme[1]) + coord_flip() + labs(y = "Replication Rate (%)") + theme(axis.title.y = element_blank()) if (text) { size_plot <- size_plot + geom_text(aes(label = str_c(formatC(round(repl_rate * 100, 1), digits = 1, format = "f"), "%")), nudge_y = 7, size = 5.5) } } else { size_plot <- size_repl %>% ggplot(aes(x = size_grp, y = repl_rate*100)) + geom_col(fill = colours_theme[1]) + labs(y = "Replication Rate (%)") + theme(axis.title.x = element_blank()) if (text) { size_plot <- size_plot + geom_text(aes(label = str_c(formatC(round(repl_rate * 100, 1), digits = 1, format = "f"), "%")), nudge_y = 2.5, size = 4) } } size_plot } 
# Replication rate per cluster, faceted by size group (Mega..Nano); each
# facet title carries the overall replication rate for that size group.
# Uses global char_info for original-significance flags.
plot_size_clusters <- function(eb_size, cluster_order) {
  overall <- eb_size %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% filter(orig_sig == 1) %>% group_by(size_grp) %>% summarise( overall_rr = mean(p025>0) )
  cluster_rr <- eb_size %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% filter(orig_sig == 1) %>% mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>% group_by(size_grp, hcl_label) %>% summarise( repl_rate = mean(p025>0) ) %>% group_by(hcl_label) %>% mutate(sort_var = repl_rate[size_grp == "Mega"]) %>% left_join(overall, by = "size_grp") %>% mutate(size_title = str_c(size_grp, " - Replication Rate: ", formatC(round(overall_rr * 100, 1), digits = 1, format = "f"), "%"))
  # Order the facet titles Mega -> Nano by matching on the size-group name.
  titles <- cluster_rr$size_title %>% unique()
  title_order <- c(titles[str_detect(titles, "Mega")], titles[str_detect(titles, "Large")], titles[str_detect(titles, "Small")], titles[str_detect(titles, "Micro")], titles[str_detect(titles, "Nano")])
  cluster_rr %>% mutate(size_title = size_title %>% factor(levels = title_order)) %>% ggplot(aes(x = reorder(hcl_label, sort_var), y = repl_rate*100, fill = hcl_label)) + geom_col() + labs(y = "Replication Rate (%)") + facet_wrap(~size_title, ncol = 1) + theme( axis.title.x = element_blank(), legend.position = "none", axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 10) )
}

# Cross-country sign test plots. sign_test: per-characteristic tibble with
# bootstrap p-value `p` and `pos_act` (share of countries with a positive
# return). Returns a list: $factors (per-factor bar chart) and $clusters
# (cluster-level panels A/B). Uses globals char_info, cluster_labels,
# cluster_order.
plot_sign_test <- function(sign_test) {
  sig <- sign_test %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% mutate( type = case_when( p <= 0.05 & orig_sig == 1 ~ "Replicated", p > 0.05 & orig_sig == 1 ~ "Not Replicated", orig_sig == 0 ~ "Never Significant" ), type = type %>% factor(levels = c("Replicated", "Not Replicated", "Never Significant"))
    # p_value = if_else(p < 0.05, "Significant", "Insignificant"),
    # p_value = p_value %>% factor(levels = c("Significant", "Insignificant"))
  )
  sig_overall <- sig %>% summarise(repl_rate = sum(type == "Replicated") / sum(type %in% c("Replicated", "Not Replicated"))) %>% pull(repl_rate)
  plot_sign_factors <- sig %>% ggplot(aes(reorder(characteristic, pos_act), pos_act*100, fill = type)) + geom_col() + ylim(c(0, 100)) + labs(y = "Countries with Positive Return (%)", fill = "Bootstrapped p-Value:") + geom_text(aes(x = 18, y = 100, label = str_c("Replication Rate: ", round(sig_overall*100, 2), "%")), inherit.aes = F) +
    # geom_text(aes(label = round(p, 2)), nudge_y = 1, size = 1.5) +
    theme( axis.title.x = element_blank(),
      # legend.title = element_blank(),
      legend.position = "top", axis.text.x = element_text(size = 7, angle = 90, vjust = 0, hjust = 1), text = element_text(size = 10) )
  # Panel A: average share of countries with positive alpha, per cluster.
  plot_pos <- sig %>% left_join(cluster_labels, by = "characteristic") %>% mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>% group_by(hcl_label) %>% filter(type %in% c("Replicated", "Not Replicated")) %>% summarise( repl_rate = sum(type == "Replicated") / sum(type %in% c("Replicated", "Not Replicated")), pos = mean(pos_act) ) %>% ggplot(aes(reorder(hcl_label, pos), pos*100, fill = hcl_label)) + geom_col() + labs(y = "Countries with Positive Alpha (%)") + ylim((c(0, 100))) + theme( legend.position = "none", axis.title.y = element_text(size=8), axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1) )
  # Panel B: sign-test replication rate per cluster.
  plot_sig <- sig %>% left_join(cluster_labels, by = "characteristic") %>% mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>% group_by(hcl_label) %>% summarise(repl_rate = sum(type == "Replicated") / sum(type %in% c("Replicated", "Not Replicated"))) %>% ggplot(aes(reorder(hcl_label, repl_rate), repl_rate*100, fill = hcl_label)) + geom_col() + labs(y = "Sign Test Replication Rate (%)") + theme( legend.position = "none", axis.title.y = element_text(size=8), axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1) )
  plot_sign_clusters <- cowplot::plot_grid(plot_pos, plot_sig, ncol = 1, labels = c("A", "B"), label_y = 1, label_x = 0)
  list("factors" = plot_sign_factors, "clusters" = plot_sign_clusters)
}

# World ex us versus us
# Average correlation of matching US and World-ex-US factor returns within
# each cluster; also prints the correlation quantiles.
plot_int_cor <- function(eb_us, eb_world_ex_us) {
  # NOTE(review): "monhts" is a typo for "months"; the column is not used downstream here.
  cor_data <- eb_us$input$long %>% bind_rows(eb_world_ex_us$input$long) %>% select(characteristic, region, eom, ret) %>% spread(key = region, value = ret) %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% filter(!is.na(us) & !is.na(world_ex_us)) %>% group_by(characteristic) %>% summarise( monhts = n(), cor = cor(us, world_ex_us) )
  print(cor_data %>% pull(cor) %>% quantile())
  cor_data %>% left_join(cluster_labels, by = "characteristic") %>% group_by(hcl_label) %>% summarise( cor_avg = mean(cor) ) %>% ggplot(aes(reorder(hcl_label, cor_avg), cor_avg)) + geom_col(fill = colours_theme[1]) + labs(y = "Correlation of US and World ex. US factor (Avg. within Cluster)") + coord_flip() + theme( axis.title.y = element_blank() )
}

# Scatter of World-ex-US OLS alpha on US OLS alpha per characteristic, with
# the fitted regression equation and t-statistics annotated; also prints the
# US/World correlation quantiles.
plot_world_vs_us <- function(eb_us, eb_world_ex_us) {
  cor_data <- eb_us$input$long %>% bind_rows(eb_world_ex_us$input$long) %>% select(characteristic, region, eom, ret) %>% spread(key = region, value = ret) %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% filter(!is.na(us) & !is.na(world_ex_us)) %>% group_by(characteristic) %>% summarise( monhts = n(), cor = cor(us, world_ex_us) )
  print(cor_data %>% pull(cor) %>% quantile())
  region_data <- eb_us$factors %>% bind_rows(eb_world_ex_us$factors) %>% select(characteristic, region, ols_est) %>% spread(key = region, value = ols_est) %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic")
  max_scale <- max(c(region_data$world_ex_us, region_data$us))
  min_scale <- min(c(region_data$world_ex_us, region_data$us))
  fit_all <- lm(world_ex_us ~ us, data = region_data)
  int <- fit_all$coefficients[1]
  slp <- fit_all$coefficients[2]
  r2 <- summary(fit_all)$r.squared
  # Build the "y = a + b*x, R^2 = ..." plotmath annotation.
  eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(R)^2~"="~r2, list(a = format(unname(int), digits = 2), b = format(unname(slp), digits = 2), r2 = format(r2, digits = 3)))
  lbl <- as.character(as.expression(eq))
  t_int <- formatC(round(summary(fit_all)$coefficients["(Intercept)", "t value"], 2), format='f', digits=2)
  t_us <- formatC(round(summary(fit_all)$coefficients["us", "t value"], 2), format='f', digits=2)
  region_data %>% mutate( orig_sig_pretty = if_else(orig_sig == 1, "Studied", "Not Studied"), orig_sig_pretty = orig_sig_pretty %>% factor(levels = c("Studied", "Not Studied")) ) %>% ggplot(aes(us, world_ex_us, colour = orig_sig_pretty, shape = orig_sig_pretty)) + geom_point() + geom_abline(intercept = 0, slope = 1, linetype = "dotted") +
    # geom_abline(intercept = int, slope = slp) +
    xlim(c(min_scale, max_scale)) + ylim(c(min_scale, max_scale)) + annotate("text", label=lbl, parse=TRUE, x=min_scale, y=max_scale, hjust = 0) + annotate("text", label=paste0(" (", t_int, ") (", t_us, ")"), parse=F, x=min_scale, y=max_scale-0.07, hjust = 0, size = 3.3) + labs(x = "US Alpha (%)", y = "World Ex. US Alpha (%)") + theme( legend.position = "none", legend.title = element_blank() )
}

# IS-vs-OOS posterior-alpha scatter with a linear fit, one plot per period
# ("pre", "post", "pre_post"). type: "OLS" (lm) or "GLS" (generalized least
# squares using the EB factor covariance from global eb_est$us). Returns the
# list of three ggplots.
plot_is_oos_post <- function(is_oos, type) {
  # type in c("GLS", "OLS")
  plot_list <- c("pre", "post", "pre_post") %>% lapply(function(period) {
    data <- is_oos[[period]]$regs
    if (type == "OLS") {
      fit <- lm(oos ~ is, data = data)
      # Output
      int <- unname(fit$coefficients[1])
      int_se <- summary(fit)$coefficients[1, "Std. Error"]
      slope <- unname(fit$coefficients[2])
      slope_se <- summary(fit)$coefficients[2, "Std. Error"]
      r2 <- summary(fit)$adj.r.squared # Adjusted R2
      # Label
      eq_lbl <- substitute( italic(y) == a + b %.% italic(x)*","~~italic(R)^2~"="~r2, list( a = formatC(int, digits = 2, format = "f"), b = formatC(slope, digits = 2, format = "f"), r2 = formatC(r2, digits = 2, format = "f")) )
    }
    if (type == "GLS") {
      # GLS: beta = (X' V^-1 X)^-1 X' V^-1 y with V = EB factor covariance.
      x <- cbind(rep(1, nrow(data)), data$is)
      y <- data$oos
      chars <- str_c(data$characteristic, "__us")
      gls_cov <- eb_est$us$factor_cov[chars, chars]
      gls_est <- solve(t(x) %*% solve(gls_cov) %*% x) %*% t(x) %*% solve(gls_cov) %*% y
      gls_res <- y - x %*% gls_est
      gls_e_var <- 1/(nrow(x)-ncol(x)) * t(gls_res) %*% solve(gls_cov) %*% (gls_res)
      gls_se <- sqrt(diag(drop(gls_e_var)* solve(t(x) %*% solve(gls_cov) %*% x)))
      # Output
      int <- gls_est[1, 1]
      int_se <- gls_se[1]
      slope <- gls_est[2, 1]
      slope_se <- gls_se[2]
      # Label (R2 doesn't really translate to GLS, because the mean prediction is no longer a good baseline)
      eq_lbl <- substitute( italic(y) == a + b %.% italic(x), list( a = formatC(int, digits = 2, format = "f"), b = formatC(slope, digits = 2, format = "f")) )
    }
    min_y <- min(data$is, data$oos)
    max_y <- max(data$is, data$oos)
    t_int <- formatC(round(int/int_se, 2), format='f', digits=2)
    t_is <- formatC(round(slope/slope_se, 2), format='f', digits=2)
    t_lbl <- paste0(" (", t_int, ") (", t_is, ")")
    data %>% ggplot(aes(is, oos)) + geom_point(colour = colours_theme[1]) + ylim(c(min_y, max_y)) + xlim(c(min_y, max_y)) + geom_hline(yintercept = 0, linetype = "solid") + geom_vline(xintercept = 0, linetype = "solid") + geom_abline(slope = 1, intercept = 0, linetype = "dotted") + ggtitle(label = eq_lbl, subtitle = t_lbl) + labs(x = "In-Sample", y = "Out-of-Sample")
  })
  plot_list
}

# Quadratic variant of plot_is_oos_post: regresses oos on is and is^2 and
# adds a loess smoother. Same periods, same OLS/GLS switch, same return type.
plot_is_oos_post_quad <- function(is_oos, type) {
  # type in c("OLS", "GLS")
  plot_list <- c("pre", "post", "pre_post") %>% lapply(function(period) {
    data <- is_oos[[period]]$regs
    if (type == "OLS") {
      fit <- lm(oos ~ is + I(is^2), data = data)
      # Output
      # NOTE(review): local `is` shadows the data column name inside this function.
      int <- unname(fit$coefficients[1])
      int_se <- summary(fit)$coefficients[1, "Std. Error"]
      is <- unname(fit$coefficients[2])
      is_se <- summary(fit)$coefficients[2, "Std. Error"]
      issq <- unname(fit$coefficients[3])
      issq_se <- summary(fit)$coefficients[3, "Std. Error"]
      r2 <- summary(fit)$adj.r.squared # Adjusted R2
      # Label
      eq_lbl <- substitute( italic(y) == a + b %.% italic(x)* ~ s ~ c %.% italic(x)^2*","~~italic(R)^2~"="~r2, list( a = formatC(int, digits = 2, format = "f"), b = formatC(is, digits = 2, format = "f"), s = ifelse(sign(issq)==1, "+", "-"), c = formatC(unname(abs(issq)), digits = 2, format = "f"), r2 = formatC(r2, digits = 2, format = "f")) )
    }
    if (type == "GLS") {
      x <- cbind(rep(1, nrow(data)), data$is, data$is^2)
      y <- data$oos
      chars <- str_c(data$characteristic, "__us")
      gls_cov <- eb_est$us$factor_cov[chars, chars]
      gls_est <- solve(t(x) %*% solve(gls_cov) %*% x) %*% t(x) %*% solve(gls_cov) %*% y
      gls_res <- y - x %*% gls_est
      gls_e_var <- 1/(nrow(x)-ncol(x)) * t(gls_res) %*% solve(gls_cov) %*% (gls_res)
      gls_se <- sqrt(diag(drop(gls_e_var)* solve(t(x) %*% solve(gls_cov) %*% x)))
      # Output
      int <- gls_est[1, 1]
      int_se <- gls_se[1]
      is <- gls_est[2, 1]
      is_se <- gls_se[2]
      issq <- gls_est[3, 1]
      issq_se <- gls_se[3]
      # Label
      eq_lbl <- substitute( italic(y) == a + b %.% italic(x)* ~ s ~ c %.% italic(x)^2, list( a = formatC(int, digits = 2, format = "f"), b = formatC(is, digits = 2, format = "f"), s = ifelse(sign(issq)==1, "+", "-"), c = formatC(unname(abs(issq)), digits = 2, format = "f")) )
    }
    min_y <- min(data$is, data$oos)
    max_y <- max(data$is, data$oos)
    t_int <- formatC(round(int/int_se, 2), format='f', digits=2)
    t_is <- formatC(round(is/is_se, 2), format='f', digits=2)
    t_issq <- formatC(round(issq/issq_se, 2), format='f', digits=2)
    t_lbl <- paste0(" (", t_int, ") (", t_is, ") (", t_issq, ")")
    data %>% ggplot(aes(is, oos)) + geom_point(colour = colours_theme[1]) + ylim(c(min_y, max_y)) + xlim(c(min_y, max_y)) + geom_hline(yintercept = 0, linetype = "solid") + geom_vline(xintercept = 0, linetype = "solid") + geom_abline(slope = 1, intercept = 0, linetype = "dotted") + geom_smooth(method = "loess", span = 1, formula = "y~x") + ggtitle(label = eq_lbl, subtitle = t_lbl) + labs(x = "In-Sample", y = "Out-of-Sample")
  })
  plot_list
}

# Effect Size Plot
# Average alpha per cluster: world, by region, and by size group.
# type: "ols" uses ols_est, "eb" uses post_mean. orig_sig: restrict to
# originally-significant factors. Returns a list of the three ggplots.
# Uses globals eb_est, eb_us_size, char_info.
plot_effects <- function(type, orig_sig, cluster_order) {
  # type in c("ols", "eb")
  if (orig_sig) {
    orig_sig_values <- T
  } else {
    orig_sig_values <- c(T, F)
  }
  if (type == "ols") { alpha_est = "ols_est" }
  if (type == "eb") { alpha_est = "post_mean" }
  (effect_world <- eb_est$world$factors %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% filter(orig_sig %in% orig_sig_values) %>% mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>% group_by(hcl_label) %>% summarise(mean_alpha = mean(get(alpha_est))) %>% ggplot(aes(reorder(hcl_label, mean_alpha), mean_alpha, fill = hcl_label)) + geom_col() + coord_flip() + labs(y = paste(str_to_upper(type), "Alpha Estimate (%)")) + theme( axis.title.y = element_blank(), legend.position = "none" ))
  (effect_regions <- eb_est$all$factors %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% filter(orig_sig %in% orig_sig_values) %>% mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>% group_by(region, hcl_label) %>% summarise(mean_alpha = mean(get(alpha_est))) %>% group_by(hcl_label) %>% mutate( sort_var = mean_alpha[region == "us"], region_pretty = case_when( region == "us" ~ "US", region == "developed" ~ "Developed", region == "emerging" ~ "Emerging" ), region_pretty = region_pretty %>% factor(levels = c("US", "Developed", "Emerging")) ) %>% ggplot(aes(reorder(hcl_label, sort_var), mean_alpha, fill = hcl_label)) + geom_col() + coord_flip() + scale_y_continuous(breaks = seq(-0.2, 1, 0.2)) + facet_wrap(~region_pretty, scales = "free_x") + labs(y = paste("Average", str_to_upper(type) ,"Alpha (%)")) + theme( axis.title.y = element_blank(), legend.position = "none" ))
  (effect_size <- eb_us_size %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% filter(orig_sig %in% orig_sig_values) %>% mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>% group_by(size_grp, hcl_label) %>% summarise(mean_alpha = mean(get(alpha_est))) %>% group_by(hcl_label) %>% mutate(sort_var = mean_alpha[size_grp == "Mega"]) %>% ggplot(aes(reorder(hcl_label, sort_var), mean_alpha, fill = hcl_label)) + geom_col() + coord_flip() + scale_y_continuous(breaks = seq(0, 1.5, 0.50)) + facet_wrap(~size_grp, nrow = 1, scales = "free_x") + labs(y = paste("Average", str_to_upper(type) ,"Alpha (%)")) + theme( axis.title.y = element_blank(), legend.position = "none" ))
  list(effect_world, effect_regions, effect_size)
}

# Replication Rate by Cluster
# Horizontal bar chart of the replication rate per cluster for one region's
# EB factor table; warns if the input mixes regions. orig_sig: restrict to
# originally-significant factors.
plot_repl_cluster <- function(eb_factors, orig_sig, cluster_order) {
  if (orig_sig) {
    factor_subset <- eb_factors %>% left_join(char_info %>% select(characteristic, significance), by = "characteristic") %>% filter(significance == T)
  } else {
    factor_subset <- copy(eb_factors)
  }
  if (uniqueN(factor_subset$region) != 1) { warning("!!!MULTIPLE REGIONS INCLUDED!!!") }
  # sort_var adds a tiny n-based tiebreaker so equal rates sort by cluster size.
  factor_subset %>% mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>% group_by(hcl_label) %>% summarise( n = n(), repl_rate = mean(p025 > 0), sort_var = repl_rate + n / 100000 ) %>% ggplot(aes(reorder(hcl_label, sort_var), repl_rate * 100, fill = hcl_label)) + geom_col() + labs(y = "Replication Rate (%)") + coord_flip() + theme( axis.title.y = element_blank(), legend.position = "none" )
}
# Plot Replication Rate as a Function of Tau - Benchmark against Harvey et al.
(2016) plot_harvey <- function(harvey_base_res, harvey_worst_res, tau_ws, act_rr) { mle_est_base <- harvey_base_res$sim[[1]] %>% lapply(function(x) x$mle) %>% bind_rows() %>% mutate(type = "baseline") mle_est_worst <- harvey_worst_res$sim[[1]] %>% lapply(function(x) x$mle) %>% bind_rows() %>% mutate(type = "worst_case") mle_est <- mle_est_base %>% bind_rows(mle_est_worst) mle_summary <- mle_est %>% group_by(type, coef) %>% summarise( n = n(), coef_mean = mean(mle) ) tc_harvey_base <- mle_summary %>% filter(type == "baseline" & coef == "tc") %>% pull(coef_mean) tc_harvey_worst <- mle_summary %>% filter(type == "worst_case" & coef == "tc") %>% pull(coef_mean) # Replication Rate under alternative Tau's m <- eb_est$us$factors %>% select(characteristic, ols_est, hcl_label) %>% mutate(cm = 1) %>% select(characteristic, hcl_label, cm) %>% spread(key = hcl_label, value = cm) %>% select(-characteristic) %>% as.matrix() m[is.na(m)] <- 0 mm <- m %*% t(m) alpha_hat <- eb_est$us$factors %>% pull(ols_est) alpha_0_vec <- rep(0, 153) sigma <- eb_est$us$sigma tc_act <- eb_est$us$mle %>% filter(estimate == "tau_c") %>% pull(ml_est) %>% round(2) search_grid <- expand.grid( tau_c = c(seq(0.15, 0.46, by = 0.01), tc_act, tc_harvey_base, tc_harvey_worst), tau_w = tau_ws ) repl_by_tau <- 1:nrow(search_grid) %>% lapply(function(i) { tw <- search_grid[i, "tau_w"] tc <- search_grid[i, "tau_c"] omega <- diag(153) * tw^2 + mm * tc^2 post_cov <- solve(solve(omega) + solve(sigma)) post_alpha <- post_cov %*% (solve(omega) %*% alpha_0_vec + solve(sigma) %*% alpha_hat) tibble(characteristic = eb_est$us$factors$characteristic, post_mean = drop(post_alpha), post_sd = sqrt(diag(post_cov))) %>% left_join(char_info %>% select(characteristic, "orig_sig" = significance), by = "characteristic") %>% filter(orig_sig == T) %>% summarise(repl_rate = mean(post_mean - 1.96 * post_sd > 0)) %>% mutate(tau_c = tc, tau_w = tw) }) %>% bind_rows() # Set TW labels n_tw <- length(tau_ws) tau_w_names <- vector(mode = 
"expression", length = n_tw) for (i in 1:n_tw) { tau_w_names[i] <- c(bquote(bold(tau[w]) ~ "=" ~ .(unname(tau_ws[i])) ~ "%")) } names(tau_w_names) <- tau_ws # Generate Important Points tau_points <- repl_by_tau %>% filter(tau_c %in% c(tc_act, tc_harvey_base, tc_harvey_worst)) %>% distinct() %>% mutate( tau_w_title = tau_w %>% factor(labels = tau_w_names), type = case_when( tau_c == tc_act ~ "Estimated from Data", tau_c == tc_harvey_base ~ "Harvey, Liu, and Zhu (2016): Baseline", tau_c == tc_harvey_worst ~ "Harvey, Liu, and Zhu (2016): Conservative", TRUE ~ "Other" ), type = type %>% factor(levels = c("Harvey, Liu, and Zhu (2016): Conservative", "Harvey, Liu, and Zhu (2016): Baseline", "Estimated from Data")) ) print(tau_points) plot <- repl_by_tau %>% mutate( tau_w_title = tau_w %>% factor(labels = tau_w_names), type = case_when( tau_c == tc_act ~ "Estimated from Data", tau_c == tc_harvey_base ~ "Harvey, Liu, and Zhu (2016): Baseline", tau_c == tc_harvey_worst ~ "Harvey, Liu, and Zhu (2016): Conservative", TRUE ~ " " ), type = type %>% factor(levels = c("Estimated from Data", "Harvey, Liu, and Zhu (2016): Baseline", "Harvey et al. 
(2016): Conservative", " ")) ) %>% ggplot(aes(tau_c, repl_rate * 100)) + geom_point(data = tau_points, aes(colour = type, shape = type, stroke = 1), size = 3) + geom_line(alpha = 1, size = 0.6) + geom_hline(yintercept = act_rr*100, linetype = "dotted") + scale_x_continuous(breaks = seq(0.05, max(search_grid$tau_c), 0.05)) + ylim(c(0, 100)) + theme(legend.title = element_blank(), legend.position = "top") + labs(y = "Replication Rate (%)", x = bquote(bold(tau[c])~"(%)"), colour = expression(tau[c]), shape = expression(tau[c])) if (n_tw > 1) { plot <- plot + facet_wrap(~tau_w_title, labeller = label_parsed) } return(plot) } # Single Factor TPF -- # Plot TPF Factor: Cluster + Market plot_tpf_one_cluster <- function(data_wide, cluster_labels, s) { mkt_sr <- mean(data_wide$market)/sd(data_wide$market) one_cluster <- unique(cluster_labels$hcl_label) %>% lapply(function(c) { cl_chars <- cluster_labels %>% filter(characteristic %in% colnames(data_wide) & hcl_label == c) %>% pull(characteristic) # Cluster SR cl <- data_wide %>% select(market, all_of(cl_chars)) w <- cl %>% epo_tpf(s = s) sr_all <- cl %>% sr_func(w = w) # Average SR sr_single_avg <- cl_chars %>% sapply(function(char) { cl_sub <- data_wide %>% select(market, all_of(char)) w <- cl_sub %>% epo_tpf(s = opt_s) cl_sub %>% sr_func(w = w) }) %>% mean() tibble(hcl_label=c, sr_all=sr_all, sr_single_avg=sr_single_avg) }) %>% bind_rows() one_cluster %>% mutate(hcl_label = hcl_label %>% factor(levels = cluster_order)) %>% ggplot(aes(reorder(hcl_label, sr_all), sr_all, fill = hcl_label)) + geom_col() + coord_flip() + geom_hline(yintercept = mkt_sr, linetype = "dashed") + theme(legend.position = "none", axis.title.y = element_blank()) + labs(y = "Monthly Sharpe Ratio: Market + Cluster") } # Plot TPF Factor: Exclude one cluster plot_tpf_excl_cl <- function(data_wide, cluster_labels, s) { epo_w <- data_wide %>% epo_tpf(s = s) full_sr <- data_wide %>% sr_func(w = epo_w) excl_one <- c("Market", unique(cluster_labels$hcl_label)) 
%>% lapply(function(c) {
    cl_chars <- cluster_labels %>% filter(characteristic %in% colnames(data_wide) & hcl_label != c) %>% pull(characteristic)
    if (c != "Market") { cl_chars <- c(cl_chars, "market") }
    # All minus Cluster SR
    data <- data_wide %>% select(all_of(cl_chars))
    w <- data %>% epo_tpf(s = s)
    sr <- data %>% sr_func(w = w)
    tibble(hcl_label=c, sr=sr)
  }) %>% bind_rows()
  excl_one %>%
    mutate(hcl_label = hcl_label %>% factor(levels = c(cluster_order, "Market"))) %>%
    ggplot(aes(reorder(hcl_label, -sr), (full_sr-sr) / full_sr, fill = hcl_label)) +
    geom_col() +
    coord_flip() +
    theme(legend.position = "none", axis.title.y = element_blank()) +
    labs(y = "Percentage Drop in Monthly SR from Excluding Cluster")
}
# Plot TPF Factor: Single Factor Importance
plot_tpf_factor_imp <- function(data_wide, cluster_labels, s) {
  # For each factor (and the market), compute the drop in the tangency
  # portfolio SR when that single factor is excluded, and plot the drops,
  # faceted by cluster. `s` is the shrinkage forwarded to epo_tpf().
  epo_w <- data_wide %>% epo_tpf(s = s)
  full_sr <- data_wide %>% sr_func(w = epo_w)
  each_factor <- colnames(data_wide) %>% lapply(function(c) {
    # SR excluding char
    data <- data_wide %>% select(-all_of(c))
    # BUG FIX: previously used the global `opt_s` instead of the argument `s`.
    w <- data %>% epo_tpf(s = s)
    sr <- data %>% sr_func(w = w)
    # Output
    tibble(characteristic = c, sr = sr)
  }) %>% bind_rows()
  # NOTE(review): this first (top-10) plot is computed but its value is
  # discarded -- only the faceted plot below is returned. Kept for parity.
  each_factor %>%
    left_join(cluster_labels, by = "characteristic") %>%
    mutate(
      hcl_label = if_else(characteristic == "market", "Market", hcl_label),
      hcl_label = hcl_label %>% factor(levels = c(cluster_order, "Market")),
      drop = full_sr - sr,
      drop_prop = drop/full_sr
    ) %>%
    arrange(drop_prop) %>%
    tail(10) %>%
    ggplot(aes(reorder(characteristic, drop_prop), drop_prop*100, fill = hcl_label)) +
    geom_col() +
    coord_flip() +
    theme(axis.title.y = element_blank()) +
    labs(y = "Drop in TPF SR (% of full)", fill = "Cluster")
  each_factor %>%
    left_join(cluster_labels, by = "characteristic") %>%
    mutate(
      hcl_label = if_else(characteristic == "market", "Market", hcl_label),
      hcl_label = hcl_label %>% factor(levels = c(cluster_order, "Market")),
      drop = full_sr - sr,
      drop_prop = drop/full_sr,
      rank = frank(-drop_prop)
    ) %>%
    group_by(hcl_label) %>%
    filter(characteristic != "market") %>%
    # filter(drop_prop == max(drop_prop)) %>%
    ggplot(aes(reorder(characteristic, drop_prop), drop_prop*100, fill = hcl_label)) +
    geom_col() +
    coord_flip() +
    theme(axis.title.y = element_blank(), legend.position = "none") +
    labs(y = "Drop in TPF SR (% of full)", fill = "Cluster")+
    facet_wrap(~hcl_label, scales = "free_y")
}
# Plot TPF Factor: Single Factor Importance within Cluster
plot_tpf_factor_imp_cluster <- function(data_wide, cluster_labels, s) {
  # Within each cluster: drop in the cluster-level tangency portfolio SR
  # (market + cluster factors) from excluding one factor at a time.
  within_cluster <- unique(cluster_labels$hcl_label) %>% lapply(function(c) {
    cl_chars <- cluster_labels %>% filter(characteristic %in% colnames(data_wide) & hcl_label == c) %>% pull(characteristic)
    # Full Cluster tpf
    data <- data_wide %>% select(market, all_of(cl_chars))
    # BUG FIX: previously used the global `opt_s` instead of the argument `s`.
    w_all <- data %>% epo_tpf(s = s)
    sr_all <- data %>% sr_func(w = w_all)
    # Individual
    sr_chars <- cl_chars %>% lapply(function(char) {
      sub <- data %>% select(-all_of(char))
      w_sub <- sub %>% epo_tpf(s = s)
      sr_sub <- sub %>% sr_func(w = w_sub)
      tibble(excl_char = char, sr = sr_sub)
    }) %>% bind_rows()
    sr_chars %>% mutate(hcl_label = c, sr_all = sr_all)
  }) %>% bind_rows()
  within_cluster %>%
    mutate(
      drop = sr_all - sr,
      drop_prop = drop/sr_all,
      hcl_label = hcl_label %>% factor(levels = cluster_order)
    ) %>%
    group_by(hcl_label) %>%
    # filter(drop == max(drop)) %>%
    ggplot(aes(reorder(excl_char, drop_prop), drop_prop*100, fill = hcl_label)) +
    geom_col() +
    coord_flip() +
    theme(axis.title.y = element_blank(), legend.position = "none") +
    labs(y = "Drop in cluster TPF SR (% of full cluster)", fill = "Cluster") +
    facet_wrap(~hcl_label, scales = "free")
}
# Plot TPF Factor: The Evolution of the TPF
plot_tpf_evolution <- function(data, data_wide, char_info, orig_sig_values, s) {
  mkt_sr <- mean(data_wide$market) / sd(data_wide$market)
  years <- data %>%
    filter(year(eom) > min(char_info$sample_end)) %>%
    mutate(year = year(eom)) %>%
    pull(year) %>%
    unique()
  sr_over_time <- years %>% lapply(function(y) {
    discovered_chars <- char_info %>%
filter(sample_end <= y & significance %in% orig_sig_values) %>% pull(characteristic) sub <- data_wide %>% select(all_of(discovered_chars), "market") w <- sub %>% epo_tpf(s = s) # tibble(w = w, char = colnames(sub)) %>% left_join(char_info %>% select("char"=characteristic, sample_end), by = "char") %>% arrange(-w) # 2002: seasonality coincides with noa_at that also gets a large weight tibble(year = y, tpf_sr = sr_func(sub, w), n = ncol(sub)) }) %>% bind_rows() sr_over_time <- sr_over_time %>% bind_rows(tibble(year = min(years)-1, tpf_sr = mkt_sr, n = 0)) (sr_plot <- sr_over_time %>% ggplot(aes(year, tpf_sr)) + geom_point() + geom_line() + ylim(c(0, NA)) + # geom_hline(yintercept = full_sr) + annotate("text", x = 1971, y = 0, label = "Market", colour='black') + geom_segment(aes(x = 1971, y = 0.02, xend = 1971, yend = 0.1), size=0.1,arrow = arrow(length = unit(0.2, "cm"))) + labs(y = "Ex-Post Tangency SR", x = "Year") + theme(axis.title.x = element_blank()) + annotate("text", x = 1972, y = 0.34, label = "Beta", colour='black') + geom_segment(aes(x = 1972, y = 0.31, xend = 1972, yend = 0.23), size=0.1,arrow = arrow(length = unit(0.2, "cm"))) + annotate("text", x = 1979, y = 0.45, label = "Earning-to-Price", colour='black') + geom_segment(aes(x = 1979, y = 0.42, xend = 1979, yend = 0.30), size=0.1,arrow = arrow(length = unit(0.2, "cm"))) + annotate("text", x = 1983, y = 0.08, label = "Earnings Momentum", colour='black') + geom_segment(aes(x = 1981, y = 0.10, xend = 1981, yend = 0.25), size=0.1,arrow = arrow(length = unit(0.2, "cm"))) + annotate("text", x = 1989, y = 0.25, label = "Price Momentum", colour='black') + geom_segment(aes(x = 1989, y = 0.27, xend = 1989, yend = 0.39), size=0.1,arrow = arrow(length = unit(0.2, "cm"))) + annotate("text", x = 1991, y = 0.73, label = "Operating Accruals", colour='black') + geom_segment(aes(x = 1991, y = 0.69, xend = 1991, yend = 0.6), size=0.1,arrow = arrow(length = unit(0.2, "cm"))) + annotate("text", x = 2002, y = 0.9, label = 
"Seasonality", colour='black') +
      geom_segment(aes(x = 2002, y = 0.87, xend = 2002, yend = 0.78), size=0.1,arrow = arrow(length = unit(0.2, "cm"))))
  n_plot <- sr_over_time %>%
    ggplot(aes(year, n)) +
    geom_point() +
    geom_line() +
    labs(y = "Factor Discovered", x = "Year of Discovery")
  list("plot"=cowplot::plot_grid(sr_plot, n_plot, ncol = 1, rel_heights = c(0.7, 0.3)), "data"=sr_over_time)
}
# Plot performance over time
plot_ts <- function(data, oos, alphas, scale, orig_sig, start = as.Date("1986-01-01")) {
  # Cumulative performance of the average factor in the US vs. World ex. US.
  # - oos: keep only observations after each factor's original sample period
  # - alphas: plot CAPM alphas instead of excess returns
  # - scale: scale returns to 10% annualized volatility
  # - orig_sig: keep only factors significant in the original study
  # Also prints an xtable with SR/t-stats for the figure caption.
  # Relies on globals `cluster_labels` and `char_info` (as elsewhere in this file).
  # BUG FIX: `:=` modifies a data.table by reference, so the caller's table
  # was mutated (its `region` column overwritten); work on a copy instead.
  data <- copy(data)
  data[, region := case_when(
    region == "us" ~ "US",
    region == "world_ex_us" ~ "World ex. US"
  )]
  data <- data[eom >= start]
  data <- cluster_labels[data, on = "characteristic"]
  if (oos) {
    data <- setDT(char_info)[, .(characteristic, sample_end)][data, on = .(characteristic)]
    data <- data[year(eom) > sample_end][, sample_end := NULL]
  }
  if (orig_sig) {
    data <- setDT(char_info)[, .(characteristic, significance)][data, on = .(characteristic)]
    data <- data[significance==T][, significance := NULL]
  }
  y_axis <- paste0("Cumulative ", if_else(alphas==T, "Alpha ", "Excess Return "), if_else(oos==T, "(OOS)", "(IS)"))
  agg <- data[, .(ret = mean(ret), mkt = mean(mkt_vw_exc)), by = .(region, eom)]
  if (alphas) {
    agg[, ret := ret - cov(ret,mkt)/var(mkt)*mkt, by = .(region)]
  }
  if (scale) {
    agg[, ret := ret / (sd(ret)*sqrt(12)/0.1), by = .(region)]
  }
  agg %>% setorder(region, eom)
  agg[, cumret_app := cumsum(ret), by = region]
  plot <- agg %>%
    ggplot(aes(eom, cumret_app, colour = region)) +
    geom_line() +
    labs(y = y_axis) +
    theme(
      legend.position = c(0.85, 0.35),
      legend.title = element_blank(),
      axis.title.x = element_blank()
    )
  # Table
  tbl <- agg %>%
    group_by(region) %>%
    summarise(
      n = n(),
      meanret = mean(ret),
      vol = sd(ret),
      ret_vol = meanret/vol*sqrt(12),
      t = meanret/(vol/sqrt(n))
    ) %>%
    mutate(meanret = meanret*12)
  tbl %>%
    select(region, ret_vol, t) %>%
    pivot_longer(c(ret_vol, t)) %>%
    mutate(
      value = formatC(value, digits=2, format = "f"),
      value = if_else(name == "t", paste0("(", value, ")"), value)
    ) %>%
    mutate(
      region = if_else(name == "t", "", region)
    ) %>%
    select(-name) %>%
    rename("Region"=region, "Full sample"=value) %>%
    xtable(align = "llc") %>%
    print(include.rownames = F)
  # Output
  return(plot)
}
# Plot OOS performance of significant factors
plot_sig_oos <- function(sig_oos_pfs, sig_type, cutoff_2012, first_date, leg_pos) {
  full <- sig_oos_pfs %>%
    filter(eom >= first_date) %>%
    group_by(region, type, significant) %>%
    mutate(
      a = ret - cov(mkt,ret)/var(mkt)*mkt
    ) %>%
    summarise(
      n = n(),
      meanret = mean(ret),
      sd = sd(ret),
      sr = meanret/sd * sqrt(12),
      alpha = mean(a),
      resvol = sd(a),
      ir = alpha/resvol*sqrt(12),
      t_alpha = alpha/(resvol/sqrt(n))
    ) %>%
    mutate(alpha = alpha*12) %>%
    filter(region %in% c("us", "world_ex_us") & type == sig_type) %>%
    setDT()
  post_harvey <- sig_oos_pfs %>%
    filter(eom >= first_date) %>%
    group_by(region, type, significant) %>%
    mutate(
      a = ret - cov(mkt,ret)/var(mkt)*mkt
    ) %>%
    filter(eom >= as.Date("2013-01-01")) %>%
    summarise(
      n = n(),
      meanret = mean(ret),
      sd = sd(ret),
      sr = meanret/sd * sqrt(12),
      alpha = mean(a),
      resvol = sd(a),
      ir = alpha/resvol*sqrt(12),
      t_alpha = alpha/(resvol/sqrt(n))
    ) %>%
    mutate(alpha = alpha*12) %>%
    filter(region %in% c("us", "world_ex_us") & type == sig_type) %>%
    setDT()
  cumret <- sig_oos_pfs %>%
    filter(eom >= first_date) %>%
    group_by(region, type, significant) %>%
    arrange(region, type, significant, eom) %>%
    filter(type == sig_type & region %in% c("us", "world_ex_us")) %>%
    mutate(
      alpha = ret - cov(mkt,ret)/var(mkt)*mkt,
      alpha = alpha / (sd(alpha)*sqrt(12)/0.1),
      cum_alpha = cumsum(alpha),
      region_pretty = case_when(
        region == "us" ~ "U.S.",
        region == "world_ex_us" ~ "World ex. U.S."
) ) # Figure sig_oos <- cumret %>% ggplot(aes(eom, cum_alpha, colour = region_pretty, linetype = region_pretty)) + geom_line() + labs(y = "Cumulative Alpha") + theme( axis.title.x = element_blank(), legend.position = leg_pos, legend.title = element_blank() ) # Table for caption tbl <- rbind( full %>% select(region, ir, t_alpha) %>% mutate(sample = "Full sample"), post_harvey %>% select(region, ir, t_alpha) %>% mutate(sample = "Post Harvey et al") ) %>% pivot_longer(c(ir, t_alpha)) %>% mutate( value = formatC(value, digits=2, format = "f"), value = if_else(name == "t_alpha", paste0("(", value, ")"), value) ) %>% pivot_wider(names_from = sample, values_from = value) %>% mutate( region = case_when( region=="us" ~ "IR: US", region == "world_ex_us" ~ "IR: World ex. US" ), region = if_else(name == "t_alpha", "", region) ) %>% select(-name) %>% rename("Region"=region) if (cutoff_2012) { sig_oos <- sig_oos + geom_vline(xintercept = as.Date("2012-12-31"), linetype = "dotted", alpha = 1) tbl %>% xtable(align = "llcc") %>% print(include.rownames = F) } else { tbl %>% select(-`Post Harvey et al`) %>% xtable(align = "llc") %>% print(include.rownames = F) } # Output return(sig_oos) } # EB Posterior checks eb_plots <- function(eb, plot = "shrinkage") { if (plot == "cluster_distribution") { a <- eb$mle %>% filter(estimate == "alpha") %>% pull(ml_est) tb <- eb$mle %>% filter(estimate == "tau_bar") %>% pull(ml_est) op <- data.frame(x=c(a-3*tb, a+3*tb)) %>% # data.frame(x=c(-1.2, 1.2)) %>% ggplot(aes(x)) + stat_function(fun=function(x) dnorm(x = x, mean = a, sd = tb)) + labs(x = "Alpha", y = "Density", title = "Population Cluster Distribution ~ N(a0, tau_bar)") } if (plot == "factor_distribution") { a <- eb$mle %>% filter(estimate == "alpha") %>% pull(ml_est) tb <- eb$mle %>% filter(estimate == "tau_bar") %>% pull(ml_est) tt <- eb$mle %>% filter(estimate == "tau_tilde") %>% pull(ml_est) factor_sd <- sqrt(tb^2 + tt^2) op <- data.frame(x=c(a-3*factor_sd, a+3*factor_sd)) %>% # 
data.frame(x=c(-1.2, 1.2)) %>% ggplot(aes(x)) + stat_function(fun=function(x) dnorm(x = x, mean = a, sd = factor_sd)) + labs(x = "Alpha", y = "Density", title = "Population Factor Distribution ~ N(a0, tau_bar + tau_tilde)") } if (plot == "factor") { op <- eb$factors %>% ggplot(aes(reorder(characteristic, post_mean), post_mean)) + geom_point() + geom_errorbar(aes(ymin = p025, ymax = p975)) + facet_wrap(~region) + theme( axis.title.x = element_blank(), axis.text.x = element_text(size = 5, angle = 90, vjust = 0, hjust = 1), text = element_text(size = 10) ) } if (plot == "cluster") { op <- eb$clusters %>% ggplot(aes(reorder(hcl_label, post_mean), post_mean)) + geom_point() + geom_errorbar(aes(ymin = post_mean - 1.96 * post_sd, ymax = post_mean + 1.96 * post_sd)) + geom_hline(yintercept = 0, linetype = "dotted") + labs(y = "Posterior Distribution of Cluster Alpha") + theme( axis.title.x = element_blank(), axis.text.x = element_text(size = 8) ) } if (plot == "signal") { op <- eb$signal %>% ggplot(aes(characteristic, post_mean)) + geom_point() + geom_errorbar(aes(ymin = post_mean - 1.96 * post_sd, ymax = post_mean + 1.96 * post_sd)) } if (plot == "factor_shrinkage") { op <- eb$factors %>% select(characteristic, region, "eb" = post_mean, "ols" = ols_est) %>% gather(eb, ols, key = "type", value = "alpha") %>% group_by(characteristic) %>% mutate(sort_var = alpha[region == "us" & type == "ols"]) %>% ggplot(aes(reorder(characteristic, sort_var), alpha, colour = region)) + geom_point() + facet_wrap(~type, ncol = 1) + theme( axis.title.x = element_blank(), axis.text.x = element_text(size = 5, angle = 90, vjust = 0, hjust = 1) ) } if (plot == "se") { op <- eb_act$factors %>% mutate(ols_p025 = ols_est - 1.96 * ols_se) %>% mutate(se_diff = (ols_se-post_sd) / ols_se) %>% ggplot(aes(x = region, y = se_diff*100, colour = region)) + geom_boxplot() + expand_limits(y = 0) + theme( axis.title.x = element_blank(), legend.position = "none" ) + labs(y = "(SE_ols - SE_eb) / SE_ols * 100") } 
if (plot == "repl") { repl_table <- eb$factors %>% group_by(region) %>% summarise( repl_eb = mean(p025 > 0), repl_ols = mean(ols_est - 1.96*ols_se > 0) ) print(repl_table) op <- eb$factors %>% group_by(region, hcl_label) %>% summarise( mean_alpha = mean(post_mean), rep_rate = mean(p025>0) ) %>% group_by(hcl_label) %>% mutate(sort_var = rep_rate[region == "US"]) %>% ggplot(aes(reorder(hcl_label, sort_var), rep_rate)) + geom_col() + facet_wrap(~region, ncol = 1) + theme( axis.title.x = element_blank(), axis.text.x = element_text(size = 7) ) } if (plot == "significance") { op <- bind_rows( eb$factors %>% select(char_reg, characteristic, hcl_label, region, "alpha" = post_mean, "se" = post_sd) %>% mutate(type = "hlm"), eb$factors %>% select(char_reg, characteristic, hcl_label, region, "alpha" = ols_est, "se" = ols_se) %>% mutate(type = "ols") ) %>% mutate(significant = alpha - 1.96 * se > 0) %>% group_by(hcl_label) %>% mutate( sort_var = median(alpha[type == "ols" & region == "US"]), type = case_when( type == "hlm" ~ "Empirical Bayes", type == "ols" ~ "OLS" ) ) %>% ggplot(aes(reorder(characteristic, sort_var), alpha, colour = significant)) + geom_point() + geom_errorbar(aes(ymin = alpha - 1.96*se, ymax = alpha + 1.96*se)) + geom_hline(yintercept = 0, colour = "black") + facet_wrap(region~type, ncol = 2) + labs(y = "Alpha", colour = "Significant") + theme( axis.title.x = element_blank(), axis.text.x = element_text(size = 5, angle = 90, vjust = 0, hjust = 1) ) } if (plot == "shrinkage") { op <- eb$factors %>% select(region, characteristic, "hlm_alpha_mean" = post_mean, "ols_alpha" = ols_est) %>% gather(-characteristic, -region, key = "type", value = "alpha") %>% group_by(characteristic, region) %>% mutate(sort_var = sum(alpha * (type == "ols_alpha"))) %>% group_by(region) %>% mutate( rank = frank(sort_var, ties.method = "max") / 2, type = case_when( type == "hlm_alpha_mean" ~ "Empricial Bayes Posterior Mean", type == "ols_alpha" ~ "OLS Estimate" ) ) %>% ggplot(aes(rank, 
alpha, shape = type, colour = type, group = type)) + geom_smooth(method = "lm", se = F, formula = "y ~ x") + geom_point() + facet_wrap(~region) + theme( axis.text.x = element_text(size = 7), text = element_text(size = 10) ) + labs(x = "Rank OLS Alpha", y = "Alpha", colour = "Type", shape = "Type") } if (plot == "comparison") { op <- bind_rows( eb$factors %>% select(char_reg, characteristic, hcl_label, region, "alpha" = post_mean, "se" = post_sd) %>% mutate(type = "hlm"), eb$factors %>% select(char_reg, characteristic, hcl_label, region, "alpha" = ols_est, "se" = ols_se) %>% mutate(type = "ols") ) %>% group_by(hcl_label) %>% mutate( sort_var = median(alpha[type == "ols" & region == "US"]), type = case_when( type == "hlm" ~ "Empirical Bayes", type == "ols" ~ "OLS" ) ) %>% ggplot(aes(reorder(characteristic, sort_var), alpha, colour = hcl_label)) + geom_point() + geom_errorbar(aes(ymin = alpha - 1.96*se, ymax = alpha + 1.96*se)) + geom_hline(yintercept = 0, colour = "black") + facet_wrap(region~type, ncol = 2) + labs(y = "Alpha", colour = "Cluster") + theme( axis.title.x = element_blank(), axis.text.x = element_text(size = 5, angle = 90, vjust = 0, hjust = 1) ) } if (plot == "cluster_density") { op <- eb$clusters %>% group_by(hcl_label) %>% nest() %>% mutate(randoms = data%>% map(~rnorm(1000, mean = .x$post_mean, sd = .x$post_sd))) %>% unnest(randoms) %>% ggplot(aes(x = randoms, fill = hcl_label)) + geom_density(alpha = 0.5) # + facet_wrap(~hcl_label) } print(op) } ================================================ FILE: Analysis/1 - Prepare Data.R ================================================ # Prepare Support Data --------------------------------- # Market Returns market_returns <- fread(paste0(data_path, "/market_returns.csv"), colClasses = c("eom"="character")) market_returns[, eom := eom %>% as.Date(format = "%Y-%m-%d")] market_returns <- market_returns[, .(excntry, eom, mkt_vw_exc, stocks, me_lag1)] market_returns <- market_returns[ eom >= settings$start_date & 
eom <= settings$end_date & !(excntry %in% settings$country_excl) & !(excntry == "PER" & eom == as.Date("1992-01-31") & mkt_vw_exc >= 8900) & # Huge outlier !(excntry == "VEN" & eom == as.Date("2018-02-28") & mkt_vw_exc < -1)] # Something is clearly wrong # Labels char_info <- readxl::read_xlsx("Factor Details.xlsx", sheet = "details", range = "A1:N300") %>% select("characteristic"=abr_jkp, direction, significance, date_range = `in-sample period`, "hxz_group"=group) %>% filter(!is.na(characteristic)) %>% mutate( direction = direction %>% as.integer, sample_start = date_range %>% str_extract("^\\d+") %>% as.integer(), sample_end = date_range %>% str_extract("\\d+$") %>% as.integer() ) base_chars <- char_info$characteristic # Country Classification country_classification <- readxl::read_xlsx("Country Classification.xlsx", sheet = "countries", range = "A1:C200") %>% select(excntry, msci_development, region) %>% filter(!is.na(excntry)) %>% setDT() # Regions region_info <- tibble( name = c("us", "developed", "emerging", "frontier", "world", "world_ex_us"), country_codes = list( "USA", country_classification[msci_development == "developed" & excntry != "USA"]$excntry, country_classification[msci_development == "emerging"]$excntry, country_classification[msci_development == "frontier"]$excntry, country_classification$excntry, country_classification[excntry != "USA"]$excntry ), countries_min = c(1, rep(settings$countries_min, 3), 1, 3) ) # Prepare Data -------------------------------------------------------- # HML ---------------------- hml <- fread(paste0(data_path, "/hml.csv"), colClasses = c("eom"="character")) hml[, eom := eom %>% as.Date(format = "%Y-%m-%d")] # Choose weighting hml[excntry == "USA", ret := case_when( settings$weighting$us == "vw" ~ ret_vw, settings$weighting$us == "ew" ~ ret_ew, settings$weighting$us == "vw_cap" ~ ret_vw_cap )] hml[excntry != "USA", ret := case_when( settings$weighting$global_ex_us == "vw" ~ ret_vw, settings$weighting$global_ex_us == 
"ew" ~ ret_ew, settings$weighting$global_ex_us == "vw_cap" ~ ret_vw_cap )] # Screens hml <- hml %>% filter( characteristic %in% base_chars, eom >= settings$start_date & eom <= settings$end_date, !is.na(ret) & n_stocks_min >= settings$n_stocks_min, # Min Stocks is the big one, it removes close to 50% of the obs. with n_stocks_min>=10. Perhaps it's too stringent. !(excntry %in% settings$country_excl) ) %>% select(-signal, -n_stocks_min) # Set direction as original study hml <- hml %>% left_join(char_info %>% select(characteristic, direction), by = "characteristic") %>% mutate(ret = ret * direction) %>% select(-ret_vw, -ret_ew, -ret_vw_cap) # Ensure no Duplicates if(hml[, .N, by = .(characteristic, excntry, eom)][, max(N)] > 1) { warning("THE DATA HAS DUPLICATES") } # Regional Portfolios ------------------------------------------------ regional_data <- function(data, countries, weighting, countries_min, months_min, size_grps = F) { # Determine Country Weights weights <- market_returns[, .(excntry, eom, mkt_vw_exc, "country_weight" = case_when( weighting == "market_cap" ~ me_lag1, weighting == "stocks" ~ as.double(stocks), weighting == "ew" ~ 1) )] # Portfolio Return pf <- data[excntry %in% countries] pf <- weights[pf, on = .(excntry, eom)] if (size_grps) { pf <- pf[, .( n = .N, ret = sum(ret*country_weight) / sum(country_weight), mkt_vw_exc = sum(mkt_vw_exc * country_weight) / sum(country_weight) ), by = .(characteristic, size_grp, eom)] } else { pf <- pf[, .( n = .N, ret = sum(ret*country_weight) / sum(country_weight), mkt_vw_exc = sum(mkt_vw_exc * country_weight) / sum(country_weight) ), by = .(characteristic, eom)] } # Minimum Requirement: Countries pf <- pf[n >= countries_min] # Minimum Requirement: Months pf[, months := .N, by = .(characteristic)] pf <- pf[months >= months_min] return(pf) } regional_pfs <- 1:nrow(region_info) %>% lapply(function(i) { info <- region_info[i, ] reg_pf <- hml %>% regional_data(countries = unlist(info$country_codes), weighting = 
settings$country_weighting, countries_min = info$countries_min, months_min = settings$months_min)
  reg_pf %>% mutate(region = info$name)
}) %>% bind_rows()
# Characteristic Managed Portfolios ----------------------
cmp <- fread(paste0(data_path, "/cmp.csv"), colClasses = c("eom"="character"))
cmp[, eom := eom %>% as.Date(format="%Y-%m-%d")]
# Screens
cmp <- cmp %>%
  rename(ret = ret_weighted) %>%
  filter(
    characteristic %in% base_chars,
    eom >= settings$start_date & eom <= settings$end_date,
    !is.na(ret) & signal_weighted != 0 & n_stocks >= settings$n_stocks_min * 2,
    !(excntry %in% settings$country_excl)
  ) %>%
  select(-signal_weighted, -n_stocks)
# Determine Direction
cmp <- cmp %>%
  left_join(char_info %>% select(characteristic, direction), by = "characteristic") %>%
  mutate(ret = ret * direction)
# Ensure no Duplicates
if(cmp[, .N, by = .(characteristic, excntry, size_grp, eom)][, max(N)] > 1) {
  warning("THE DATA HAS DUPLICATES")
}
# Regional Portfolios
region_info_cmp <- region_info %>% filter(name == "us")
regional_pfs_cmp <- 1:nrow(region_info_cmp) %>% lapply(function(i) {
  # BUG FIX: previously indexed `region_info[i, ]` instead of
  # `region_info_cmp[i, ]` -- correct only by coincidence because "us"
  # happens to be the first row of region_info.
  info <- region_info_cmp[i, ]
  reg_pf <- cmp %>% regional_data(countries = unlist(info$country_codes), weighting = settings$country_weighting, countries_min = info$countries_min, months_min = settings$months_min, size_grps = T)
  reg_pf %>% mutate(region = info$name)
}) %>% bind_rows()
# Regional Market Returns ---
regional_mkt_ret <- 1:nrow(region_info) %>% lapply(function(i) {
  info <- region_info[i, ]
  mkt <- market_returns[excntry %in% unlist(info$country_codes), .(n = .N, market = sum(mkt_vw_exc * me_lag1) / sum(me_lag1)), by = eom]
  mkt <- mkt[n >= info$countries_min][, n:= NULL]
  mkt %>% mutate(region = info$name)
}) %>% bind_rows()
print(paste("Total Characteristics:", uniqueN(regional_pfs$characteristic)))
================================================
FILE: Analysis/2 - Determine Clusters.R
================================================
# Hierachical Clustering
----------------------------------------------
factor_hcl <- function(cor_mat, linkage = "ward.D", k, direction_bars = T) {
  # Hierarchical clustering of factors from a correlation matrix.
  # - cor_mat: factor-by-factor correlation matrix (named rows/cols)
  # - linkage: hclust() agglomeration method
  # - k: number of clusters to cut the tree into
  # - direction_bars: add black/white bars showing each factor's direction
  # Plots the dendrogram as a side effect; returns the correlation matrix,
  # cluster labels, and dendrogram (plus bar colours when requested).
  # Relies on globals `colours_theme` and `char_info` (as elsewhere in this script).
  dist_mat <- as.dist((1-cor_mat))
  # dist_mat <- as.dist(sqrt((1-cor_mat)*2)) # With ward.D2 gives the same clusters
  hcl <- dist_mat %>% hclust(method=linkage)
  # TYPO FIX in diagnostic message: "Dendogram" -> "Dendrogram"
  print(str_c("Cophenetic Correlation between Dendrogram and Distance Matrix = ", format(cor(cophenetic(hcl), dist_mat), digits = 2, nsmall = 2)))
  hcl_labels <- hcl %>%
    cutree(k = k) %>%
    as_tibble(rownames = "characteristic") %>%
    setDT() %>%
    setnames(c("characteristic", "hcl"))
  # Recycle the theme palette so there is one colour per cluster
  hcl_col <- rep(colours_theme[c(1, 2, 3, 4, 5, 7, 9, 11)], ceiling(k/8))[1:k]
  dend <- hcl %>%
    as.dendrogram() %>%
    dendextend::set("labels_col", value = hcl_col, k=k) %>%
    dendextend::set("branches_k_color", value = hcl_col, k=k) %>%
    dendextend::set("labels_cex", value = 0.5) %>%
    dendextend::set("branches_lty", 1) %>%
    dendextend::set("branches_lwd", 0.2)
  dend %>% plot(horiz=T)
  return_list <- list(
    "cor" = cor_mat,
    "labels" = hcl_labels,
    "dend" = dend
  )
  if (direction_bars) {
    bar_colours <- tibble("characteristic" = colnames(cor_mat)) %>%
      left_join(char_info %>% select(characteristic, direction), by = "characteristic") %>%
      mutate(col_dir = if_else(direction == 1, "black", "white"))
    colored_bars(colors = bar_colours %>% select(col_dir), dend = dend, rowLabels = c("Long High"), y_shift = 3, horiz = T)
    return_list$bar_colours <- bar_colours
  }
  return(return_list)
}
hcl_input <- function(data, ret_type = "alpha", ...) { # ret_type %in% c("raw", "alpha")
  data <- copy(data) # Avoid modifying in place
  if (ret_type == "raw") {
    data[, ret_hcl := ret]
  }
  if (ret_type == "alpha") {
    data[, ret_hcl := ret - mkt_vw_exc * cov(ret, mkt_vw_exc)/var(mkt_vw_exc), by = characteristic]
  }
  data %>%
    select(characteristic, eom, ret_hcl) %>%
    spread(key = characteristic, value = ret_hcl) %>%
    select(-eom) %>%
    cor(...)
} # US Clusters ----------- clusters <- regional_pfs %>% filter(region == settings$hcl$region & year(eom) >= settings$hcl$start_year) %>% hcl_input(ret_type = settings$hcl$ret_type, method = settings$hcl$cor_method, use = "pairwise.complete.obs") %>% factor_hcl(linkage = settings$hcl$linkage, k = settings$hcl$k, direction_bars = T) # Cluster Labels if (settings$weighting$us == "vw_cap" & settings$hcl$k == 13 & settings$hcl$region == "us" & settings$hcl$start_year == 1975) { clusters$labels <- clusters$labels %>% mutate( hcl_label = case_when( hcl == 1 ~ "Low Leverage", hcl == 2 ~ "Investment", hcl == 3 ~ "Size", hcl == 4 ~ "Value", hcl == 5 ~ "Quality", hcl == 6 ~ "Low Risk", hcl == 7 ~ "Debt Issuance", hcl == 8 ~ "Seasonality", hcl == 9 ~ "Accruals", hcl == 10 ~ "Profitability", hcl == 11 ~ "Profit Growth", hcl == 12 ~ "Short-Term Reversal", hcl == 13 ~ "Momentum", TRUE ~ as.character(hcl)) ) } else { clusters$labels <- clusters$labels %>% mutate(hcl_label = hcl) } # Output cluster_labels <- clusters$labels %>% select(-hcl) ================================================ FILE: Analysis/3 - Analysis.R ================================================ # Empirical Bayes Estimation -------------------------- # search_list: c(regions, type, layers, size_grp) search_list <- list( "us" = list("us", "hml", 2), "developed" = list("developed", "hml", 2), "emerging" = list("emerging", "hml", 2), "all" = list(c("us", "developed", "emerging"), "hml", 3), "world" = list("world", "hml", 2), "world_ex_us" = list("world_ex_us", "hml", 2), "us_mega" = list("us", "cmp", 2, "mega"), "us_large" = list("us", "cmp", 2, "large"), "us_small" = list("us", "cmp", 2, "small"), "us_micro" = list("us", "cmp", 2, "micro"), "us_nano" = list("us", "cmp", 2, "nano") ) eb_est <- search_list %>% sapply(simplify = F, USE.NAMES = T, function(x) { print(paste("Region:", x[[1]])) regions <- x[[1]] if (x[[2]] == "cmp") { base_data <- copy(regional_pfs_cmp) %>% filter(size_grp == x[[4]]) } if (x[[2]] == 
"hml") { base_data <- copy(regional_pfs) } # Prepare Data data <- base_data %>% filter(eom >= settings$start_date & eom <= settings$end_date) %>% filter(region %in% regions) %>% eb_prepare( scale_alpha = settings$eb$scale_alpha, overlapping = settings$eb$overlapping ) # Run Empirical Bayes op <- data %>% emp_bayes( cluster_labels = cluster_labels, min_obs = settings$eb$min_obs, fix_alpha = settings$eb$fix_alpha, bs_cov = settings$eb$bs_cov, layers = x[[3]], shrinkage = settings$eb$shrinkage, cor_type = settings$eb$cor_type, bs_samples = settings$eb$bs_samples, seed = settings$seed ) # Output return(op) }) # Simulations EB vs. BY -------------- # Simulations if (update_sim) { # Values from Data pairwise_cor <- eb_est$us$input$long %>% select(characteristic, eom, ret_neu) %>% spread(key = characteristic, value = ret_neu) %>% select(-eom) %>% cor(use = "pairwise.complete.obs") cor_value <- pairwise_cor %>% as_tibble(rownames = "char1") %>% gather(-char1, key = "char2", value = "cor") %>% left_join(cluster_labels %>% select(characteristic, "hcl1" = hcl_label), by = c("char1"="characteristic")) %>% left_join(cluster_labels %>% select(characteristic, "hcl2" = hcl_label), by = c("char2"="characteristic")) %>% filter(char1 != char2) %>% mutate(same_cluster = (hcl1 == hcl2)) %>% group_by(same_cluster) %>% summarise(avg_cor = mean(cor)) # Time periods med_months <- eb_est$us$input$long %>% group_by(characteristic) %>% summarise(n = n()) %>% pull(n) %>% median() data <- list( yrs = round(med_months / 12), cor_within = cor_value %>% filter(same_cluster == T) %>% pull(avg_cor) %>% round(digits = 2), cor_across = cor_value %>% filter(same_cluster == F) %>% pull(avg_cor) %>% round(digits = 2) ) # Simulation Settings set.seed(settings$seed) sim <- list( "alpha_0" = 0, "t" = 12*70, # Median amount of data "clusters" = 13, "fct_pr_cl" = 10, "corr_within" = 0.58, "corr_across" = 0.02, "n_sims" = 10000, "tau_c" = c(0.01, seq(from = 0.05, to = 0.5, by = 0.05)), "tau_w" = c(0.01, 0.2) ) 
sim$se <- (10/sqrt(12))/sqrt(sim$t) sim$n <- sim$clusters * sim$fct_pr_cl # Check settings are consistent with data [Alert if there is a significant difference] if (abs(sim$t - data$yrs*12) > 12 | abs(sim$corr_within - data$cor_within) > 0.05 | abs(sim$corr_across - data$cor_across) > 0.05) { warning("SIMULATION AND DATA VALUES ARE NOT CONSISTENT!") print(data) print(list("yrs"=sim$t/12, "corr_within"=sim$corr_within, "corr_across"=sim$corr_across)) } simulation <- sim_mt_control(sim_settings = sim) simulation %>% saveRDS(file = paste0(object_path, "/fdr_sim.RDS")) } else { simulation <- readRDS(file = paste0(object_path, "/fdr_sim.RDS")) } # False Discovery Rate model_fdr <- fdr_sim(t_low = 0, a_vec = eb_est$us$factor_mean, a_cov = eb_est$us$factor_cov, n_sim = 10000, seed = settings$seed) # Multiple Testing Adjustments mt <- multiple_testing(eb_all = eb_est$all, eb_world = eb_est$world) # Tangency Portfolios ----------------------------- # Regions tpf_world <- eb_est$world$input$long %>% tpf_cluster(mkt_region = "world", orig_sig = T, min_date = settings$tpf$start$world, n_boots = settings$tpf$bs_samples, shorting = settings$tpf$shorting, seed = settings$seed) tpf_us <- eb_est$us$input$long %>% tpf_cluster(mkt_region = "us", orig_sig = T, min_date = settings$tpf$start$us, n_boots = settings$tpf$bs_samples, shorting = settings$tpf$shorting, seed = settings$seed) tpf_dev <- eb_est$developed$input$long %>% tpf_cluster(mkt_region = "developed", orig_sig = T, min_date = settings$tpf$start$developed, n_boots = settings$tpf$bs_samples, shorting = settings$tpf$shorting, seed = settings$seed) tpf_emer <- eb_est$emerging$input$long %>% tpf_cluster(mkt_region = "emerging", orig_sig = T, min_date = settings$tpf$start$emerging, n_boots = settings$tpf$bs_samples, shorting = settings$tpf$shorting, seed = settings$seed) # Size Groups tpf_size <- c("mega", "large", "small", "micro", "nano") %>% lapply(function(x) { eb_est[[paste0("us_", x)]]$input$long %>% tpf_cluster(mkt_region 
= "us", orig_sig = T, min_date = settings$tpf$start$size_grps, n_boots = settings$tpf$bs_samples, shorting = settings$tpf$shorting, seed = settings$seed) %>% mutate(size_grp = x) }) %>% bind_rows() # Single Factor TPF tpf_factors <- prepare_tpf_factors(region = settings$tpf_factors$region, orig_sig_values = settings$tpf_factors$orig_sig, start = settings$tpf_factors$start, scale = settings$tpf_factors$scale) opt_s <- tpf_factors$long %>% optimal_shrinkage(k = settings$tpf_factors$k) # Posterior Over time ----------------- ot_region <- "world" if (update_post_over_time) { for (fix_taus in c(T,F)) { if (fix_taus) { fixed_priors <- list( "alpha" = eb_est[[ot_region]]$mle %>% filter(estimate == "alpha") %>% pull(ml_est), "tau_c" = eb_est[[ot_region]]$mle %>% filter(estimate == "tau_c") %>% pull(ml_est), "tau_s" = eb_est[[ot_region]]$mle %>% filter(estimate == "tau_s") %>% pull(ml_est) ) } else { fixed_priors <- NULL } periods <- sort(unique(regional_pfs$eom)) periods <- periods[month(periods) == 12] # Only estimate once per year time_chars <- regional_pfs %>% filter(region == ot_region & eom <= as.Date("1960-12-31")) %>% group_by(characteristic) %>% filter(n() >= settings$eb$min_obs) %>% pull(characteristic) %>% unique() posterior_over_time <- periods[periods >= as.Date("1960-12-31")] %>% lapply(function(end_date) { print(end_date) # Prepare Data data <- regional_pfs %>% filter(characteristic %in% time_chars) %>% filter(eom >= settings$start_date & eom <= end_date) %>% filter(region == ot_region) %>% eb_prepare( scale_alpha = settings$eb$scale_alpha, overlapping = settings$eb$overlapping ) # Run Empirical Bayes eb_act <- data %>% emp_bayes( cluster_labels = cluster_labels, min_obs = settings$eb$min_obs, fix_alpha = settings$eb$fix_alpha, bs_cov = settings$eb$bs_cov, layers = 2, shrinkage = settings$eb$shrinkage, cor_type = settings$eb$cor_type, bs_samples = 1000, priors = fixed_priors, seed = settings$seed ) eb_act$input <- NULL eb_act$end_date <- end_date 
return(eb_act) }) if (fix_taus) { posterior_over_time %>% saveRDS(file = paste0(object_path, "/posterior_over_time.RDS")) } else { posterior_over_time %>% saveRDS(file = paste0(object_path, "/posterior_over_time_flex.RDS")) } } } posterior_over_time <- readRDS(file = paste0(object_path, "/posterior_over_time.RDS")) posterior_over_time_flex <- readRDS(file = paste0(object_path, "/posterior_over_time_flex.RDS")) # Size Dimension eb_us_size <- c("mega", "large", "small", "micro", "nano") %>% lapply(function(x) { eb_est[[str_c("us_", x)]]$factors %>% mutate(size_grp = x) }) %>% bind_rows() %>% mutate( size_grp = str_to_title(size_grp), size_grp = size_grp %>% factor(levels = c("Mega", "Large", "Small", "Micro", "Nano")) ) # In-Sample / Out-of-Sample ------------------------------------ is_oos <- c("pre", "post", "pre_post") %>% sapply(simplify = F, USE.NAMES = T, function(t) { data <- eb_est$us$input$long %>% prepare_is_oos(min_obs = 60, ret_scaled = "all", orig_group = T, type = t, print=T) regs <- data %>% group_by(characteristic, period, n_is, n_oos) %>% nest() %>% mutate( fit = data %>% map(~lm(ret_adj ~ mkt_vw_exc, data = .x)), tidied = fit %>% map(tidy) ) %>% unnest(tidied) %>% filter(term == "(Intercept)") %>% select(characteristic, period, n_is, n_oos, estimate) %>% spread(key = period, value = estimate) list(data=data, regs=regs) }) # Economi Benefit of More Powerful Multiple Comparison if (update_post_is) { periods <- sort(unique(regional_pfs$eom)) periods <- periods[month(periods) == 12 & year(periods) >= 1959] # Only estimate once per year posterior_is <- periods %>% lapply(function(end_date) { print(paste("Date", end_date, "-" , match(end_date, periods), "out of", length(periods))) # Prepare Data data <- regional_pfs %>% filter(eom >= settings$start_date & eom <= end_date) %>% filter(region == "us") %>% eb_prepare( scale_alpha = settings$eb$scale_alpha, overlapping = settings$eb$overlapping ) # Run Empirical Bayes eb_act <- data %>% emp_bayes( 
cluster_labels = cluster_labels, min_obs = settings$eb$min_obs, fix_alpha = settings$eb$fix_alpha, bs_cov = settings$eb$bs_cov, layers = 2, shrinkage = settings$eb$shrinkage, cor_type = settings$eb$cor_type, bs_samples = 1000, seed = settings$seed ) # Output eb_act$factors %>% mutate(est_date = end_date) }) %>% bind_rows() posterior_is %>% saveRDS(file = paste0(object_path, "/posterior_is.RDS")) } else { posterior_is <- readRDS(file = paste0(object_path, "/posterior_is.RDS")) } sig_oos_pfs <- posterior_is %>% trading_on_significance() # Harvey et al (2016) Simulation - Baseline ------------------------------ # We use the baseline specification from table 5 - Panel A where the average correlation is 0 (the average correlation among factors in our data is 7%) # 1300 * (1-0.396) The harvey et al numbers are m=1297 and m_true=783 harvey_base <- list( alpha_0 = 0, t = 70*12, ret = 4.4 / 12, vol = 10 / sqrt(12), cl = 26, cl_true = 16, fct_pr_cl = 50, corr_across = 0.02, corr_within = 0.58, tau_ws = c(0.21), # We estimate it at 0.21 n_sims = 50, fix_alpha = T ) harvey_base$se <- harvey_base$vol / sqrt(harvey_base$t) harvey_base$n <- harvey_base$cl * harvey_base$fct_pr_cl harvey_base$n_true <- harvey_base$cl_true * harvey_base$fct_pr_cl if (update_harvey_baseline) { harvey_base_res <- harvey_et_al_sim(sim_settings = harvey_base, seed = settings$seed) harvey_base_res <- list("settings" = harvey_base, "sim" = harvey_base_res) harvey_base_res %>% saveRDS(file = paste0(object_path, "/harvey_res_baseline.RDS")) } else { harvey_base_res <- readRDS(file = paste0(object_path, "/harvey_res_baseline.RDS")) } # Harvey et al (2016) Simulation - Worst Case # We use the worst cases specification from table 5 - Panel B where the average correlation is 0 (the average correlation among factors in our data is 7%) # 2500 * (1-0.683) = 800 The harvey numbers are m=2458 and m_true=779 harvey_worst <- list( alpha_0 = 0, t = 70*12, # Median number of years for US factor ret = 4.4 / 12, vol = 10 
/ sqrt(12), cl = 50, cl_true = 16, fct_pr_cl = 50, corr_across = 0.02, corr_within = 0.58, tau_ws = c(0.21), # We estimate it at 0.21 Same as what we estimate n_sims = 50, fix_alpha = T ) harvey_worst$se <- harvey_worst$vol / sqrt(harvey_worst$t) harvey_worst$n <- harvey_worst$cl * harvey_worst$fct_pr_cl harvey_worst$n_true <- harvey_worst$cl_true * harvey_worst$fct_pr_cl if (update_harvey_worstcase) { harvey_worst_res <- harvey_et_al_sim(sim_settings = harvey_worst, seed = settings$seed) harvey_worst_res <- list("settings" = harvey_worst, "sim" = harvey_worst_res) harvey_worst_res %>% saveRDS(file = paste0(object_path, "/harvey_res_worstcase.RDS")) } else { harvey_worst_res <- readRDS(file = paste0(object_path, "/harvey_res_worstcase.RDS")) } # Estimate parameters on OOS data ----------- if (FALSE) { reg <- "us" # Prepare Data data <- regional_pfs %>% filter(eom >= settings$start_date & eom <= settings$end_date) %>% filter(region==reg) %>% left_join(char_info %>% select(characteristic, sample_start, sample_end), by = "characteristic") %>% filter(year(eom) < sample_start | year(eom) > sample_end) %>% eb_prepare( scale_alpha = settings$eb$scale_alpha, overlapping = settings$eb$overlapping # If we start in 1955-02-28 we lose 12 factors relative to starting in 1972-11-30 ) # Run Empirical Bayes op <- data %>% emp_bayes( cluster_labels = cluster_labels, min_obs = settings$eb$min_obs, fix_alpha = settings$eb$fix_alpha, bs_cov = settings$eb$bs_cov, layers = 2, shrinkage = settings$eb$shrinkage, cor_type = settings$eb$cor_type, bs_samples = settings$eb$bs_samples, seed = settings$seed ) # OOS-replication rate op$factors %>% left_join(char_info %>% select(characteristic, significance), by = "characteristic") %>% filter(significance == 1) %>% summarise( n = n(), sd_ols = sd(ols_est), eb_rr = mean(p025 > 0), eb_ols = mean(ols_est - 1.96*ols_se > 0) ) # OOS Hyperparameters op$mle %>% mutate(estimate = if_else(estimate == "tau_s", "tau_w", estimate)) # Full sample Tau's 
eb_est[[reg]]$mle %>% mutate(estimate = if_else(estimate == "tau_s", "tau_w", estimate))

# Replication rate with OOS hyperparameters ------------------------------
# Recompute the posterior replication rate for the factors in `eb_est[[reg]]`
# under externally supplied prior hyper-parameters (alpha0, tau_c, tau_w),
# so full-sample and OOS-estimated priors can be compared on the same data.
#
# Args:
#   chars          Character vector of factor/characteristic names.
#   alphas         OLS alpha estimates, same order/length as `chars`.
#   sigma          Sampling covariance matrix of `alphas`.
#   alpha0         Scalar prior mean, replicated across all factors.
#   tau_c          Prior cluster-level dispersion (shared within a cluster).
#   tau_w          Prior within-cluster dispersion (factor-specific).
#   cluster_labels Tibble mapping `characteristic` to `hcl_label`.
#   char_info      Tibble with `characteristic` and `significance` columns.
#
# Returns: one-row tibble with `repl_rate`, the share of originally
#   significant factors whose posterior 95% CI lies above zero.
repl_rate <- function(chars, alphas, sigma, alpha0, tau_c, tau_w, cluster_labels, char_info) {
  # Prior mean vector: common alpha0 for every factor
  alpha0_vec <- rep(alpha0, length(alphas))
  # Cluster membership matrix m (factors x clusters); mm[i, j] = 1 iff
  # factors i and j share a cluster
  cm <- tibble(characteristic = chars, "alpha" = alphas) %>%
    left_join(cluster_labels, by = "characteristic")
  m <- cm %>%
    mutate(cm = 1) %>%
    select(characteristic, hcl_label, cm) %>%
    spread(key = hcl_label, value = cm) %>%
    select(-characteristic) %>%
    as.matrix()
  m[is.na(m)] <- 0
  mm <- m %*% t(m)
  # Prior covariance: within-cluster variance on the diagonal plus
  # cluster-level variance for factor pairs in the same cluster
  omega <- diag(length(alphas)) * tau_w^2 + mm * tau_c^2
  # Posterior via precision-weighted averaging of prior and data.
  # Invert each matrix once (the original recomputed solve(omega) and
  # solve(sigma) twice each).
  omega_inv <- solve(omega)
  sigma_inv <- solve(sigma)
  post_cov <- solve(omega_inv + sigma_inv)
  post_alpha <- post_cov %*% (omega_inv %*% alpha0_vec + sigma_inv %*% alphas)
  # Replication rate among originally significant factors:
  # posterior mean minus 1.96 posterior SD above zero
  tibble(
    characteristic = chars,
    alpha = drop(post_alpha),
    se = sqrt(diag(post_cov))
  ) %>%
    left_join(char_info, by = "characteristic") %>%
    filter(significance == TRUE) %>%
    summarise(
      repl_rate = mean(alpha - 1.96 * se > 0)
    )
}
# Full Sample Hyper-parameters
repl_rate(
  chars = eb_est[[reg]]$factors$characteristic,
  alphas = eb_est[[reg]]$factors$ols_est,
  sigma = eb_est[[reg]]$sigma,
  alpha0 = eb_est[[reg]]$mle %>% filter(estimate == "alpha") %>% pull(ml_est),
  tau_c = eb_est[[reg]]$mle %>% filter(estimate == "tau_c") %>% pull(ml_est),
  tau_w = eb_est[[reg]]$mle %>% filter(estimate == "tau_s") %>% pull(ml_est),
  cluster_labels = cluster_labels,
  char_info = char_info
)
# OOS Hyper-parameters
repl_rate(
  chars = eb_est[[reg]]$factors$characteristic,
  alphas = eb_est[[reg]]$factors$ols_est,
  sigma = eb_est[[reg]]$sigma,
  alpha0 = op$mle %>% filter(estimate == "alpha") %>% pull(ml_est),
  tau_c = op$mle %>% filter(estimate == "tau_c") %>% pull(ml_est),
  tau_w = op$mle %>% filter(estimate == "tau_s") %>% pull(ml_est),
  cluster_labels = cluster_labels,
  char_info = char_info
)
}
================================================
FILE: Analysis/4 - Output.R
================================================ # Determine Cluster Order cluster_order <- c("Accruals", "Debt Issuance", "Investment", "Short-Term Reversal", "Value", "Low Risk", "Quality", "Momentum", "Profitability", "Profit Growth", "Seasonality", "Size", "Low Leverage") # Collect all output in list output <- list(figures = list(), tables = list()) # Headline Replication Rate headline_rr <- eb_est$us$factors %>% left_join(char_info, by = "characteristic") %>% filter(significance == 1) %>% summarise(rr = mean(p025>0)) %>% pull(rr) # Figures -------------------------------------- # HCL output$figures$hcl_us <- function(tex = F) { par(mar = c(3,2,1,10), cex = 1) c <- 1.2 x <- 37/2 label_func <- function(x) unique(clusters$labels$hcl_label)[unique(clusters$labels$hcl_label) %>% str_detect(x)] clusters_tex <- copy(clusters$dend) if (tex == T) { labels(clusters_tex) <- labels(clusters_tex) %>% str_replace_all("_", "\\\\_") } clusters_tex %>% plot(horiz=T) colored_bars(colors = clusters$bar_colours %>% select(col_dir), dend = clusters_tex, rowLabels = c("Long High"), y_shift = 11/2, horiz = T) # Labels if (settings$hcl$k == 13) { text(x = x, y = 151, label_func("Short-Term Reversal"), cex = c, col = colours_theme[5], adj = 0) #gold-9 text(x = x, y = 141, label_func("Profitability"), cex = c, col = colours_theme[4], adj = 0) #lightgreen-7 text(x = x, y = 129, label_func("Low Risk"), cex = c, col = colours_theme[3], adj = 0) #purple-5 text(x = x, y = 112, label_func("Value"), cex = c, col = colours_theme[2], adj = 0) #orange-4 text(x = x, y = 90, label_func("Investment"), cex = c, col = colours_theme[1], adj = 0) #darkgreen-3 text(x = x, y = 73, label_func("Seasonality"), cex = c, col = colours_theme[11], adj = 0) #orange-4 text(x = x, y = 63, label_func("Debt Issuance"), cex = c, col = colours_theme[9], adj = 0) #red-2 text(x = x, y = 57, label_func("Size"), cex = c, col = colours_theme[7], adj = 0) #blue-1 text(x = x, y = 51, label_func("Accruals"), cex = c, col = 
colours_theme[5], adj = 0) #black-11 text(x = x, y = 43, label_func("Low Leverage"), cex = c, col = colours_theme[4], adj = 0) #lightgreen-7 text(x = x, y = 30, label_func("Profit Growth"), cex = c, col = colours_theme[3], adj = 0) #purple-5 text(x = x, y = 22, label_func("Momentum"), cex = c, col = colours_theme[2], adj = 0) #darkgreen-3 text(x = x, y = 12, label_func("Quality"), cex = c, col = colours_theme[1], adj = 0) #blue-1 } } # Cluster Validation output$figures$hcl_us_val <- clusters$cor %>% cluster_val(labels = clusters$labels, op_format = "pdf") # Literature comparison print(output$figures$lit_comp <- eb_est$us %>% plot_lit_comp(mt_res = mt, eb_world = eb_est$world, excl_insig=T)) # Comparing Multiple Testing with Empirical Bayes c(output$figures$mt_factors, output$figures$mt_summary) %<-% plot_mt_eb_comp( mt = mt, eb_all = eb_est$all, eb_us = eb_est$us, eb_developed = eb_est$developed, eb_world = eb_est$world, eb_emerging = eb_est$emerging, mts = c("OLS", "Bonferroni", "BY"), regs = c("us", "developed", "emerging", "world"), se_methods = c("OLS", "BY", "EB - Region", "EB - All"), se_regions = "us") # Replication Rate by Region output$figures$gl_by_cluster <- plot_repl_region(eb_all = eb_est$all, cluster_order = cluster_order) # Global Factor Posterior output$figures$gl_by_factor <- eb_est$world %>% plot_factor_post(orig_sig = T, cluster_order = cluster_order) # CI Many Factors output$figures$ci_many_fcts <- plot_many_factors() # Tangency Portfolio - US output$figures$tpf <- tpf_us %>% plot_tpf(cluster_order = cluster_order, ci_low = 0.05, ci_high = 0.95) # Tangency Portfolio - Regions output$figures$tpf_regions <- plot_tpf_region(tpf_us = tpf_us, tpf_dev = tpf_dev, tpf_emer = tpf_emer, cluster_order = cluster_order, ci_low = 0.05, ci_high = 0.95) # Tangency Portfolio - Size Groups output$figures$tpf_size <- tpf_size %>% plot_tpf_size(cluster_order = cluster_order, ci_low = 0.05, ci_high = 0.95) # Posterior over Time - Fixed Taus output$figures$overtime 
<- posterior_over_time %>% plot_over_time(orig_sig = T, ols_incl = T, lb = 5, bw=F) output$figures$overtime_bw <- posterior_over_time %>% plot_over_time(orig_sig = T, ols_incl = T, lb = 5, bw=T) # Posterior over Time - Flexible Taus output$figures$overtime_flex <- posterior_over_time_flex %>% plot_over_time(orig_sig = T, ols_incl = F, lb=5, bw=F) output$figures$overtime_flex_bw <- posterior_over_time_flex %>% plot_over_time(orig_sig = T, ols_incl = F, lb=5, bw=T) # Posterior over Time - Flexible Taus - Plot taus output$figures$overtime_flex_taus <- posterior_over_time_flex %>% plot_taus_over_time() # By Size - Overall output$figures$size_overall <- eb_us_size %>% plot_size_overall(flipped = T, text = F) # By Size - Clusters output$figures$size_clusters <- eb_us_size %>% plot_size_clusters(cluster_order = cluster_order) # Model - False Discovery Rate output$figures$model_fdr <- model_fdr %>% plot_fdr() # Simulation - False Discovery Rate output$figures$sim_fdr <- simulation %>% plot_sim_fdr() # US verus world factor output$figures$world_vs_us <- plot_world_vs_us(eb_us = eb_est$us, eb_world_ex_us = eb_est$world_ex_us) # In-sample vs. OOS and Post c(output$figures$is_pre, output$figures$is_post, output$figures$is_oos) %<-% plot_is_oos_post(is_oos = is_oos, type = "GLS") # In-sample vs. 
OOS and Post - quadratic c(output$figures$is_pre_quad, output$figures$is_post_quad, output$figures$is_oos_quad) %<-% plot_is_oos_post_quad(is_oos = is_oos, type = "GLS") # Effect Sizes c(output$figures$effect_world, output$figures$effect_regions, output$figures$effect_size) %<-% plot_effects(type = "ols", orig_sig = T, cluster_order = cluster_order) # Replicateion Rate by Cluster - US output$figures$repl_cluster_us <- eb_est$us$factors %>% plot_repl_cluster(orig_sig = T, cluster_order = cluster_order) # Simulation benchmarked to Harvey et al (2016) if (eb_est$us$mle %>% filter(estimate == "tau_s") %>% pull(ml_est) %>% round(2) != 0.21) { warning("Tau_w in Harvey et al simulation is inconsistent with data!!") } output$figures$sim_harvey <- plot_harvey(harvey_base_res = harvey_base_res, harvey_worst_res = harvey_worst_res, tau_ws = 0.21, act_rr = headline_rr) output$figures$sim_harvey_robustness <- plot_harvey(harvey_base_res = harvey_base_res, harvey_worst_res = harvey_worst_res, tau_ws = c(0.1, 0.21, 0.3), act_rr = headline_rr) # TPF Single Factors output$figures$tpf_factors_one_cluster <- tpf_factors$wide %>% plot_tpf_one_cluster(cluster_labels = cluster_labels, s = opt_s) output$figures$tpf_factors_excl_one <- tpf_factors$wide %>% plot_tpf_excl_cl(cluster_labels = cluster_labels, s = opt_s) output$figures$tpf_factors_imp <- tpf_factors$wide %>% plot_tpf_factor_imp(cluster_labels = cluster_labels, s = opt_s) output$figures$tpf_factors_imp_cluster <- tpf_factors$wide %>% plot_tpf_factor_imp_cluster(cluster_labels = cluster_labels, s = opt_s) tpf_evol <- tpf_factors$long %>% plot_tpf_evolution(data_wide = tpf_factors$wide, char_info = char_info, orig_sig_values = settings$tpf_factors$orig_sig, s = opt_s) output$figures$tpf_evolution <- tpf_evol$plot # Cumulative returns OOS - Marginally significant factors output$figures$marg_sig_oos <- sig_oos_pfs %>% plot_sig_oos(sig_type = "marg_sig", cutoff_2012 = T, first_date = as.Date("1990-01-01"), leg_pos = c(.85, .55)) # 
Also include table for caption # Cummulative returns OOS - EB significant factors output$figures$eb_sig_oos <- sig_oos_pfs %>% plot_sig_oos(sig_type = "eb_sig", cutoff_2012 = F, first_date = as.Date("1990-01-01"), leg_pos = c(.85, .40)) # Also include table for caption # Save Figures as Pictures ------------------------- if (save_figures) { # Base settings output_fig <- function(path, name, format, width, height) { file <- paste0(path, "/", name, ".eps") ggsave(file=file, width = width, height = height, units = "in", dpi = 300) } h <- 5 w <- 9 # For main text ----------------------------------------- # Figure 1 ------ output$figures$lit_comp output_fig(path=output_path, name = "fig1", width = w + 1, height = h) dev.off() output$figures$lit_comp + scale_fill_manual(values = c("black", "grey35")) output_fig(path=output_path, name = "fig1_bw", width = w + 1, height = h) dev.off() # Figure 2 ----- output$figures$marg_sig_oos output_fig(path=output_path, name = "fig2", width = w, height = h*2/3) dev.off() output$figures$marg_sig_oos + scale_colour_manual(values = c("black", "black")) output_fig(path=output_path, name = "fig2_bw", width = w, height = h*2/3) dev.off() # Figure 3 ----- output$figures$sim_fdr output_fig(path=output_path, name = "fig3", format = format, width = w, height = h) dev.off() output$figures$sim_fdr + scale_colour_manual(values = c("black", "black", "black")) output_fig(path=output_path, name = "fig3_bw", format = format, width = w, height = h) dev.off() # Figure 4 ----- output$figures$mt_factors + theme(text = element_text(size = 13), axis.text.x = element_blank()) output_fig(path=output_path, name = "fig4", format = format, width = w, height = h) dev.off() output$figures$mt_factors + theme(text = element_text(size = 13), axis.text.x = element_blank()) + scale_colour_manual(values = rep("black", 3)) output_fig(path=output_path, name = "fig4_bw", format = format, width = w, height = h) dev.off() # Figure 5a ----- output$figures$size_overall + theme( 
axis.title.x = element_text(size = 17), axis.text.x = element_text(size = 17), axis.text.y = element_text(size = 17)) output_fig(path=output_path, name = "fig5a", format = format, width = w, height = h) dev.off() output$figures$size_overall + theme( axis.title.x = element_text(size = 17), axis.text.x = element_text(size = 17), axis.text.y = element_text(size = 17)) + geom_col(fill="grey35") output_fig(path=output_path, name = "fig5a_bw", format = format, width = w, height = h) dev.off() # Figure 5b ----- output$figures$repl_cluster_us + theme( axis.text.y = element_text(size = 13), axis.text.x = element_text(size = 14), text = element_text(size = 14)) output_fig(path=output_path, name = "fig5b", format = format, width = w, height = h) dev.off() output$figures$repl_cluster_us + theme( axis.text.y = element_text(size = 13), axis.text.x = element_text(size = 14), text = element_text(size = 14)) + scale_fill_manual(values = rep("grey35", 13)) output_fig(path=output_path, name = "fig5b_bw", format = format, width = w, height = h) dev.off() # Figure 6 ----- output$figures$mt_summary + theme( text = element_text(size = 13), axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 9) ) output_fig(path=output_path, name = "fig6", format = format, width = w, height = h) dev.off() output$figures$mt_summary + theme( text = element_text(size = 13), axis.title.x = element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 9) ) + scale_fill_grey() output_fig(path=output_path, name = "fig6_bw", format = format, width = w, height = h) dev.off() # Figure 7 ----- output$figures$world_vs_us + theme(text = element_text(size = 13)) output_fig(path=output_path, name = "fig7", format = format, width = h, height = h) dev.off() output$figures$world_vs_us + theme(text = element_text(size = 13)) + scale_colour_manual(values = c("black", "grey35")) output_fig(path=output_path, name = "fig7_bw", format = format, width = h, 
height = h) dev.off() # Figure 8a ----- output$figures$is_pre + theme( text = element_text(size = 12), plot.title = element_text(size = 10, vjust = -2), plot.subtitle = element_text(size = 8, vjust = 0), plot.margin = unit(c(0,0,0,0), "cm") ) output_fig(path=output_path, name = "fig8a", format = format, width = w/3, height = w/3) dev.off() output$figures$is_pre + theme( text = element_text(size = 12), plot.title = element_text(size = 10, vjust = -2), plot.subtitle = element_text(size = 8, vjust = 0), plot.margin = unit(c(0,0,0,0), "cm") ) + geom_point(colour = "black") output_fig(path=output_path, name = "fig8a_bw", format = format, width = w/3, height = w/3) dev.off() # Figure 8b ----- output$figures$is_post + theme( text = element_text(size = 12), plot.title = element_text(size = 10, vjust = -2), plot.subtitle = element_text(size = 8, vjust = 0), plot.margin = unit(c(0,0,0,0), "cm") ) output_fig(path=output_path, name = "fig8b", format = format, width = w/3, height = w/3) dev.off() output$figures$is_post + theme( text = element_text(size = 12), plot.title = element_text(size = 10, vjust = -2), plot.subtitle = element_text(size = 8, vjust = 0), plot.margin = unit(c(0,0,0,0), "cm") ) + geom_point(colour = "black") output_fig(path=output_path, name = "fig8b_bw", format = format, width = w/3, height = w/3) dev.off() # Figure 8c ----- output$figures$is_oos + theme( text = element_text(size = 12), plot.title = element_text(size = 10, vjust = -2), plot.subtitle = element_text(size = 8, vjust = 0), plot.margin = unit(c(0,0,0,0), "cm") ) output_fig(path=output_path, name = "fig8c", format = format, width = w/3, height = w/3) dev.off() output$figures$is_oos + theme( text = element_text(size = 12), plot.title = element_text(size = 10, vjust = -2), plot.subtitle = element_text(size = 8, vjust = 0), plot.margin = unit(c(0,0,0,0), "cm") ) + geom_point(colour = "black") output_fig(path=output_path, name = "fig8c_bw", format = format, width = w/3, height = w/3) dev.off() # 
Figure 9 ----- output$figures$overtime output_fig(path=output_path, name = "fig9", format = format, width = w, height = h) dev.off() output$figures$overtime_bw output_fig(path=output_path, name = "fig9_bw", format = format, width = w, height = h) dev.off() # Figure 10 ----- output$figures$sim_harvey + theme( text = element_text(size = 12), axis.text.x = element_text(size = 11), axis.text.y = element_text(size = 11), legend.text = element_text(size = 11) ) output_fig(path=output_path, name = "fig10", format = format, width = w, height = h) dev.off() output$figures$sim_harvey + theme( text = element_text(size = 12), axis.text.x = element_text(size = 11), axis.text.y = element_text(size = 11), legend.text = element_text(size = 11) ) + scale_colour_manual(values = rep("black", 3)) output_fig(path=output_path, name = "fig10_bw", format = format, width = w, height = h) dev.off() # Figure 11 ----- output$figures$gl_by_factor + theme(text = element_text(size = 13), legend.position = "right", axis.text.x = element_text(size = 5)) output_fig(path=output_path, name = "fig11", format = format, width = w, height = h) dev.off() output$figures$gl_by_factor + theme(text = element_text(size = 13), legend.position = "right", axis.text.x = element_text(size = 5)) + scale_colour_manual(values = rep("black", 13)) output_fig(path=output_path, name = "fig11_bw", format = format, width = w, height = h) dev.off() # Figure 12 ----- output$figures$effect_size + theme( axis.text.y = element_text(size = 13), strip.text.x = element_text(size = 14), axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 13) ) output_fig(path=output_path, name = "fig12a", format = format, width = w, height = h) dev.off() output$figures$effect_size + theme( axis.text.y = element_text(size = 13), strip.text.x = element_text(size = 14), axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 13) ) + scale_fill_manual(values = rep("grey35", 13)) output_fig(path=output_path, name = 
"fig12a_bw", format = format, width = w, height = h) dev.off() # Figure 12b ----- output$figures$effect_regions + theme( axis.text.y = element_text(size = 13), strip.text.x = element_text(size = 13), axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 13) ) output_fig(path=output_path, name = "fig12b", format = format, width = w, height = h) dev.off() output$figures$effect_regions + theme( axis.text.y = element_text(size = 13), strip.text.x = element_text(size = 13), axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 13) ) + scale_fill_manual(values = rep("grey35", 13)) output_fig(path=output_path, name = "fig12b_bw", format = format, width = w, height = h) dev.off() # Figure 13 ----- output$figures$tpf + theme(text = element_text(size = 13), axis.text.x = element_text(size = 13), axis.text.y = element_text(size = 13)) output_fig(path=output_path, name = "fig13", format = format, width = w, height = h) dev.off() output$figures$tpf + theme(text = element_text(size = 13), axis.text.x = element_text(size = 13), axis.text.y = element_text(size = 13)) + scale_fill_manual(values = rep("grey35", 14)) output_fig(path=output_path, name = "fig13_bw", format = format, width = w, height = h) dev.off() # Figure 14 ----- # Evolution of TPF output$figures$tpf_evolution output_fig(path=output_path, name = "fig14", format = format, width = w, height = h*0.8) dev.off() output$figures$tpf_evolution output_fig(path=output_path, name = "fig14_bw", format = format, width = w, height = h*0.8) dev.off() # Figure IA.1 ----- # NEED TO DO!! 
if (FALSE) { # Need to run everything with settings$weighting$us = "vw" and settings$weighting$global_ex_us="vw" output$figures$lit_comp output_fig(path=output_path, name = "figIA1", format = format, width = w + 1, height = h) dev.off() } # Figure IA.2 ------ output$figures$eb_sig_oos output_fig(path=output_path, name = "figIA2", format = format, width = w, height = h*2/3) dev.off() # Figure IA.3a ----- output$figures$is_pre_quad + theme( text = element_text(size = 12), plot.title = element_text(size = 10, vjust = -2), plot.subtitle = element_text(size = 8, vjust = 0), plot.margin = unit(c(0,0,0,0), "cm") ) output_fig(path=output_path, name = "figIA3a", format = format, width = w/3, height = w/3) dev.off() # Figure IA.3b ----- output$figures$is_post_quad + theme( text = element_text(size = 12), plot.title = element_text(size = 10, vjust = -2), plot.subtitle = element_text(size = 8, vjust = 0), plot.margin = unit(c(0,0,0,0), "cm") ) output_fig(path=output_path, name = "figIA3b", format = format, width = w/3, height = w/3) dev.off() # Figure IA.3c ----- output$figures$is_oos_quad + theme( text = element_text(size = 12), plot.title = element_text(size = 10, vjust = -2), plot.subtitle = element_text(size = 8, vjust = 0), plot.margin = unit(c(0,0,0,0), "cm") ) output_fig(path=output_path, name = "figIA3c", format = format, width = w/3, height = w/3) dev.off() # Figure IA.4 ----- output$figures$overtime_flex + theme(text = element_text(size = 13)) output_fig(path=output_path, name = "figIA4", format = format, width = w, height = h) dev.off() # Figure IA.5 ----- output$figures$overtime_flex_taus + theme(text = element_text(size = 13), legend.text = element_text(size = 12)) output_fig(path=output_path, name = "figIA5", format = format, width = w, height = h) dev.off() # Figure IA.6 ---- output$figures$sim_harvey_robustness + theme( text = element_text(size = 12), axis.text.x = element_text(size = 11), axis.text.y = element_text(size = 11), legend.text = element_text(size = 
11) ) output_fig(path=output_path, name = "figIA6", format = format, width = w, height = h) dev.off() # Figure IA.7 ---- output$figures$gl_by_cluster + theme(text = element_text(size = 13)) output_fig(path=output_path, name = "figIA7", format = format, width = w, height = h) dev.off() # Figure IA.8 ---- output$figures$size_clusters output_fig(path=output_path, name = "figIA8", format = format, width = w, height = h+1) dev.off() # Figure IA.9 ---- output$figures$tpf_regions + theme(text = element_text(size = 13), axis.text.x = element_text(size = 13), axis.text.y = element_text(size = 13)) output_fig(path=output_path, name = "figIA9", format = format, width = w, height = h) dev.off() # Figure IA.10 ---- output$figures$tpf_size + theme( text = element_text(size = 13), axis.text.x = element_text(size = 13), axis.text.y = element_text(size = 11)) output_fig(path=output_path, name = "figIA10", format = format, width = w, height = h*1.5) dev.off() # Figure IA.11 ---- output$figures$tpf_factors_one_cluster output_fig(path=output_path, name = "figIA11", format = format, width = w, height = h) dev.off() # Figure IA.12 ---- output$figures$tpf_factors_excl_one output_fig(path=output_path, name = "figIA12", format = format, width = w, height = h) dev.off() # Figure IA.13 ---- output$figures$tpf_factors_imp_cluster + theme( axis.text.y = element_text(size = 6) ) output_fig(path=output_path, name = "figIA13", format = format, width = w, height = h*1.5) dev.off() # Figure IA.14 ---- output$figures$tpf_factors_imp + theme( axis.text.y = element_text(size = 6) ) output_fig(path=output_path, name = "figIA14", format = format, width = w, height = h*1.5) dev.off() # Figure IA.15 (special, because not ggplot) ---- pdf(str_c(output_path, "/figIA15.pdf"), width = w, height = h*2+1.5) output$figures$hcl_us() dev.off() # Figure IA.16 (special, because not ggplot) ---- pdf(str_c(output_path, "/figIA16.pdf"), width = 6, height = 6) output$figures$hcl_us_val() dev.off() # Other figures not 
included in paper -------- output$figures$ci_many_fcts + theme(axis.text.y = element_text(size = 13), text = element_text(size = 13)) output$figures$model_fdr + theme(text = element_text(size = 13)) # TABLES ----------------- # Estimated Taus table_taus() # Economic Benefit of more Power sig_oos_pfs %>% table_economic_benefit() # Factor Performance table_factor_info() # Numbers mentioned in paper -------- # Bayesian Multiple Testing bayes_sim <- 1000000 (fdr_196 <- fdr_fwer_rates(t_cutoff = 1.96, orig_sig = T, a_vec = eb_est$world$factor_mean, a_cov = eb_est$world$factor_cov, n_sim = bayes_sim, seed=settings$seed)) (fdr_278 <- fdr_fwer_rates(t_cutoff = 2.78, orig_sig = T, a_vec = eb_est$world$factor_mean, a_cov = eb_est$world$factor_cov, n_sim = bayes_sim, seed=settings$seed)) (true_factors_tbl <- true_factors(t_cutoff = 0, a_vec = eb_est$world$factor_mean, a_cov = eb_est$world$factor_cov, orig_sig = T, n_sim = bayes_sim, seed=settings$seed)) (rr_unc <- true_factors(t_cutoff = 1.96, a_vec = eb_est$world$factor_mean, a_cov = eb_est$world$factor_cov, orig_sig = T, n_sim = bayes_sim, seed=settings$seed)) # Mentioned in introduction paste0("Replication rate SE: ", round(rr_unc$sd*100, 2), "%") paste0("Bayesian FDR: ", round(fdr_196$fdr_dist$mean*100, 2), "%, with 95% CI of [", round(fdr_196$fdr_dist$p025*100, 2), "%, ", round(fdr_196$fdr_dist$p975*100, 2), "%], SE: ", round(fdr_196$fdr_dist$sd*100, 2)) paste0("Bayesian FWER: ", round(fdr_196$fwer_dist$mean*100, 2), "%, with SE of ", round(fdr_196$fwer_dist$sd*100, 2), "%") paste0("Expected fraction of true factors: ", round(true_factors_tbl$mean*100, 2), "%, with SE of ", round(true_factors_tbl$sd*100, 2), "%") # BY cutoff mt %>% filter(method == "BY" & region == "us") %>% mutate(sig = p<=0.05) %>% group_by(sig) %>% mutate( max = max(abs(statistic)), min = min(abs(statistic)) ) %>% filter(abs(statistic)==max & sig==F | abs(statistic)==min & sig==T) %>% ungroup() %>% summarise( by_cutoff = mean(abs(statistic)) ) %>% 
print() # Change in Book equity factor be_gr_us <- eb_est$us$factors %>% filter(region == "us" & characteristic == "be_gr1a") be_gr_all <- eb_est$all$factors %>% filter(region == "us" & characteristic == "be_gr1a") tibble( characteristic = rep("be_gr1a",2), region = rep("US", 2), data = c("US", "Global"), post_mean = c(be_gr_us$post_mean, be_gr_all$post_mean), post_vol = c(be_gr_us$post_sd, be_gr_all$post_sd), t = post_mean / post_vol ) %>% print() # IS / OOS is_oos$post$regs %>% ungroup() %>% summarise( is = mean(is), post = mean(oos), decline = post/is-1 ) %>% print() c("pre","post","pre_post") %>% lapply(function(x) { is_oos[[x]]$regs %>% mutate(period = x) }) %>% bind_rows() %>% group_by(period) %>% summarise( n = n(), is = mean(is > 0), oos = mean(oos > 0) ) %>% print() # Posterior over time width posterior_over_time %>% plot_over_time(orig_sig = T, ols_incl = T, lb = 5) # Bayesian Multiple Testing fdr_196$fdr_dist fdr_278$fwer_fdr # FWER at t>2.78 true_factors_tbl # Replication rates in different size groups eb_us_size %>% plot_size_overall(flipped = T, text = T) # Publication Bias plot_harvey(harvey_base_res = harvey_base_res, harvey_worst_res = harvey_worst_res, tau_ws = 0.21, act_rr = headline_rr) # Correlations across size and region eb_us_size %>% select(characteristic, size_grp, ols_est) %>% spread(key = size_grp, value = ols_est) %>% summarise( cor_mega_micro = cor(Mega, Micro, method = "spearman"), cor_mega_nano = cor(Mega, Nano, method = "spearman") ) %>% print() eb_est$all$factors %>% select(characteristic, region, ols_est) %>% spread(key = region, value = ols_est) %>% na.omit() %>% summarise( n = n(), cor_us_dev = cor(us, developed, method = "spearman"), cor_us_emer = cor(us, emerging, method = "spearman") ) %>% print() # TPF Evolution numbers tpf_evol$data %>% arrange(year) %>% mutate(tpf_sr_l1 = dplyr::lag(tpf_sr)) %>% filter(year %in% c(min(year), max(year), 2002, 1991)) %>% # char_info[characteristic %in% c("seas_2_5an", "oaccruals_at")] 
arrange(year) %>% mutate( new_inclusions = case_when( year == 1971 ~ "Market", year == 1991 ~ "Accruals", year == 2002 ~ "Seasonality", year == year(settings$end_date) ~ "[All factors included]", ) ) %>% print() # Average pairwise correlations eb_est$us$input$long %>% select(characteristic, eom, ret_neu) %>% spread(key = characteristic, value = ret_neu) %>% select(-eom) %>% cor(use = "pairwise.complete.obs") %>% as_tibble(rownames = "char1") %>% gather(-char1, key = "char2", value = "cor") %>% filter(char1 != char2) %>% summarise(average_cor = mean(cor)) } ================================================ FILE: Analysis/Analysis.Rproj ================================================ Version: 1.0 RestoreWorkspace: Default SaveWorkspace: Default AlwaysSaveHistory: Default EnableCodeIndexing: Yes UseSpacesForTab: Yes NumSpacesForTab: 2 Encoding: UTF-8 RnwWeave: Sweave LaTeX: pdfLaTeX ================================================ FILE: Analysis/README.md ================================================ ## Overview This repository contains code that is used for the paper [Is There a Replication Crisis in Finance?](https://onlinelibrary.wiley.com/doi/full/10.1111/jofi.13249) by Jensen, Kelly and Pedersen (2023). The code used to create the underlying dataset can be found at [https://github.com/bkelly-lab/GlobalFactor](https://github.com/bkelly-lab/GlobalFactor). Please cite Jensen, Kelly and Pedersen (2023) if using the code or data. ## How To Run the Code 1. Start with factor returns in each country. The files you need to download are `hml.csv`, `cmp.csv` and `market_returns.csv` using one of the following methods. 1. Download the factor returns used in the paper [here](https://www.dropbox.com/sh/wcrjok1qyxtrasi/AABZ90GDCUvIzDzijt8Qoo3ha?dl=0). 1. Download the latest version of the factor returns [here](https://www.dropbox.com/sh/xq278bryrj0qf9s/AABUTvTGok91kakyL07LKyQoa?dl=0). 1. 
Generate the factor returns from scratch by following the steps in [GlobalFactors](https://github.com/bkelly-lab/ReplicationCrisis/tree/master/GlobalFactors) as the output from `portfolios.R`. 2. Copy the code from this repository to a local folder. 3. Open `main.R` in the programming language "R". 4. Ensure that the current working directory is the folder from 2. To check this, write `getwd()` in the console. To change the working directory use `setwd()`. 5. Run `main.R`. ## Outputs 1. The console prints key numbers used in the paper as well as the paper tables in LaTeX format. 2. If `save_figures=TRUE` (default), the folder in `output_path` will contain figures of the same format used in the paper. ## Optional Settings 1. `data_path` is the folder with the portfolio data from step 1 (default: current working directory/Data). 2. `object_path` is a folder where R objects can be saved for faster iterations (default: current working directory/Objects). 3. `output_path` is a folder where figures can be saved (default: current working directory/Figures). 4. `save_figures` should be `TRUE` if you wish to save figures, otherwise `FALSE` (default: TRUE). 5. `settings` controls settings for the analysis, including the start and end date, the portfolio weighting scheme, the cluster settings, the empirical Bayes settings, and the tangency portfolio settings (default: the settings in the paper). ## Notes The code is divided into 5 separate R scripts. `0 - Functions.R` contains the project functions, `1 - Prepare Data.R` prepares the data, `2 - Determine Clusters.R` finds statistical clusters based on return data, `3 - Analysis.R` analyzes the data and `4 - Output.R` generates tables and figures based on the analysis. Everything is sourced from `main.R` which also contains user-defined control variables. 
================================================ FILE: Analysis/country_stats.R ================================================ library(xtable) library(tidyverse) library(data.table) # Data [output from SAS code, already screened with obs_main=1, primary_sec=1, exch_main=1] data_path <- "../../Data/Characteristics" country_files <- list.files(data_path) countries <- country_files %>% lapply(function(file) { fread(paste0(data_path, "/", file), select = c("excntry", "id", "eom", "me", "size_grp", "ret_local")) }) %>% rbindlist() countries[, eom := eom %>% as.character() %>% lubridate::fast_strptime(format = "%Y%m%d") %>% as.Date()] # Aggregate by month country_info <- countries[!is.na(me) & !is.na(ret_local), .( n = .N, n_nano = sum(size_grp == "nano"), n_mega = sum(size_grp == "mega"), me = sum(me), me_p50 = median(me) ), by = .(excntry, eom)] # Country Classification country_classification <- readxl::read_xlsx("Country Classification.xlsx", sheet = "countries", range = "A1:C200") %>% select(excntry, msci_development) %>% filter(!is.na(excntry)) %>% setDT() country_info <- country_classification[country_info, on = "excntry"] # Table table_country <- function(country_info, info_date) { tbl_caption <- paste("The table shows summary statistics by the country where a security is listed.\\", "We include common stocks that are the primary security of the underlying firm, traded on a standard exchange, with non-missing return and market equity data.\\", "\\textit{Country} is the ISO code of the underlying exchange country.\\", "For further information, see \\href{https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes}{https://en.wikipedia.org/wiki/List\\_of\\_ISO\\_3166\\_country\\_codes.}", "\\textit{MSCI} shows the MSCI classification of each country as of January 7th 2021.", "For the most recent classification, see \\href{https://www.msci.com/market-classification}{https://www.msci.com/market-classification}.", "\\textit{Start} is the first date with a valid 
observation.", "In the next 4 columns, the data is shown as of December 31st 2020.\\", "\\textit{Stocks} is the number of stocks available.\\", "\\textit{Mega stocks} is the number of stocks with a market cap above the 80th percentile of NYSE stocks.\\", "\\textit{Total Market Cap} is the aggregate market cap in million USD.\\", "\\textit{Median MC} is the median market cap in million USD.") # Country summary country_info[, start_date := min(eom), by = excntry] country_stats <- country_info[eom==info_date] countries_add <- country_info[!(excntry %in% country_stats$excntry)][, .(excntry, msci_development, eom = info_date, start_date, n=0, n_mega=0, n_nano=0, me=0, me_p50=0)] %>% unique()# Add countries without data by info_date country_stats <- country_stats %>% rbind(countries_add) %>% arrange(-me) %>% select(-c(n_nano, eom)) total <- country_stats %>% ungroup() %>% summarise( excntry = "All", msci_development = "", start_date = NA, n = sum(n), n_mega = sum(n_mega), me = sum(me), me_p50 = NA_real_ ) country_stats %>% bind_rows(total) %>% mutate( n = n %>% prettyNum(big.mark = ",", digits = 0), n_mega = n_mega %>% prettyNum(big.mark=",", digits = 0), start_date = as.character(start_date), me = me %>% formatC(format = "e", digits = 2), me_p50 = me_p50 %>% prettyNum(big.mark=",", digits = 0), msci_development = msci_development %>% str_to_title(), " "= ' ', ) %>% select(excntry, msci_development, start_date, ` `, everything()) %>% rename("Country" = excntry, "MSCI" = msci_development, "Start" = start_date, "Stocks" = n, "Mega Stocks" = n_mega, "Total Market Cap" = me, "Median MC" = me_p50) %>% xtable(digits = 0, align = "lllllrrrr", caption = tbl_caption) %>% print(include.rownames = T, floating = FALSE, latex.environments = "center", hline.after=c(-1, 0), tabular.environment = "longtable", size="\\fontsize{10pt}{12pt}\\selectfont") } # Output for paper-------------------- # Remember: # Only copy from "& country ..." and down. 
# In line 94: # - Delete "94" # - Delete "NA" # - Add hline above and below total # - Make "All" in \textbf{} country_info[!(excntry %in% c("ZWE", "VEN"))] %>% table_country(info_date = as.Date("2020-12-31")) # We exclude Zimbamwe and Venesuela due to data issues # Nano Caps in the US country_info[eom == as.Date("2020-12-31") & excntry == "USA", .(n, n_nano, nano_prop = n_nano / n)] %>% print() ================================================ FILE: Analysis/hxz_decomp.R ================================================ library(lubridate) library(tidyverse) library(data.table) options(dplyr.summarise.inform = FALSE) # User Input ----------------------- # Paths data_path <- "../../Data" # Set to path with global characteristics data. # Start Date start <- as.Date("2020-12-31") # Data ----------------------------- # Characteristics char_info <- readxl::read_xlsx("Factor Details.xlsx", sheet = "details", range = "A1:N300") %>% filter(!is.na(abr_jkp)) %>% select("characteristic"=abr_jkp, direction, significance) %>% mutate(direction = direction %>% as.numeric()) %>% setDT() chars <- char_info$characteristic # NYSE Cutoff nyse_size_cutoffs <- fread(paste0(data_path, "/nyse_cutoffs.csv"), colClasses = c("eom"="character")) nyse_size_cutoffs[, eom := as.Date(eom, format = "%Y%m%d")] # Return Cutoffs ret_cutoffs <- fread(paste0(data_path, "/return_cutoffs.csv"), colClasses = c("eom"="character")) ret_cutoffs[, eom := as.Date(eom, format = "%Y%m%d")] ret_cutoffs[, eom_lag1 := floor_date(eom, unit = "month") - 1] # Because we use ret_exc_lead1m # Data data <- fread(paste0(data_path, "/Characteristics/usa.csv"), select = c("excntry", "id", "eom", "source_crsp", "comp_exchg", "crsp_exchcd", "size_grp", "ret_exc", "ret_exc_lead1m", "me", chars), colClasses = c("eom"="character")) data[, eom := as.Date(lubridate::fast_strptime(eom, format = "%Y%m%d"))] # ME CAP data <- nyse_size_cutoffs[, .(eom, nyse_p80)][data, on = "eom"] data[, me_cap := pmin(me, nyse_p80)][, nyse_p80 := NULL] 
# Screens data <- data[!is.na(size_grp) & !is.na(me) & !is.na(ret_exc_lead1m)] # Winsorize Compustat Returns data <- ret_cutoffs[, .(eom, "p001"=ret_exc_0_1, "p999"=ret_exc_99_9)][data, on = "eom"] data[source_crsp == 0 & ret_exc > p999, ret_exc := p999] data[source_crsp == 0 & ret_exc < p001, ret_exc := p001] data[, c("p001", "p999") := NULL] data <- ret_cutoffs[, .("eom" = eom_lag1, "p001"=ret_exc_0_1, "p999"=ret_exc_99_9)][data, on = "eom"] data[source_crsp == 0 & ret_exc_lead1m > p999, ret_exc_lead1m := p999] data[source_crsp == 0 & ret_exc_lead1m < p001, ret_exc_lead1m := p001] data[, c("source_crsp", "p001", "p999") := NULL] # Create 1 month separated observations returns <- tidyr::crossing("id" = unique(data$id), "eom" = unique(data$eom)) %>% setDT() returns <- data[, .(id, eom, ret_exc)][returns, on = .(id, eom)] returns[, start := min(eom[!is.na(ret_exc)]), by = id] returns <- returns[eom >= start][, start := NULL] returns[, last := floor_date(max(eom[!is.na(ret_exc)]), unit = "month") + months(12) - 1, by = id] # To avoid lookahead bias, _horizon_ months after last obs returns <- returns[eom <= last][, last := NULL] returns %>% setorder(id, eom) pf_func <- function(chars, pfs, bps, bp_min_n, min_stocks, horizon) { # Realized Returns ret_lead <- 1:horizon %>% lapply(function(h) { if (h==1) { r <- data[, .(id, eom, lead = 1, ret_exc = ret_exc_lead1m)][!is.na(ret_exc)] } else { r <- returns[, .(eom, lead = h, ret_exc = dplyr::lead(ret_exc, n = h)), by = id][!is.na(ret_exc)] } r[, eom_ret := ceiling_date(eom, unit = "months")+months(h)-1] }) %>% rbindlist() # Portfolios chars %>% lapply(function(x) { print(paste0(" " , x, ": ", match(x, chars), " out of ", length(chars))) data[, var := as.double(get(x))] sub <- data[!is.na(var), .(id, eom, var, size_grp, me, me_cap, crsp_exchcd, comp_exchg)] # Portfolio Assignment if (bps == "nyse") { sub[, bp_stock := (crsp_exchcd == 1 & is.na(comp_exchg)) | (comp_exchg == 11 & is.na(crsp_exchcd))] } if (bps == "non_mc") { 
sub[, bp_stock := (size_grp %in% c("mega", "large", "small"))] } sub[, bp_n := sum(bp_stock), by = eom] sub <- sub[bp_n >= bp_min_n] # require at least 10 stocks for break points sub[, cdf := ecdf(var[bp_stock == T])(var), by = eom] sub[, min_cdf := min(cdf), by = eom] sub[cdf == min_cdf, cdf := 0.00000001] # To ensure that the lowest value is in portfolio 1 sub[, pf := ceiling(cdf*pfs), by = eom] sub[pf == 0, pf := 1] # Happens when non-bp stocks extend beyond bp stock range # Align with returns sub <- sub[, .(id, eom, me, me_cap, pf)][ret_lead, on = .(id, eom)][!is.na(pf)] # Returns pf_returns <- sub[, .( characteristic = x, n = .N, ret_ew = mean(ret_exc), ret_vw = sum(ret_exc * me) / sum(me), ret_vw_cap = sum(ret_exc * me_cap) / sum(me_cap) ), by = .(pf, eom_ret, lead)] # HML pf_returns[, .( characteristic = x, n_stocks_min = as.integer(min(n[pf==pfs], n[pf==1])), ret_ew = ret_ew[pf == pfs] - ret_ew[pf == 1], ret_vw = ret_vw[pf == pfs] - ret_vw[pf == 1], ret_vw_cap = ret_vw_cap[pf == pfs] - ret_vw_cap[pf == 1] ), by = .(eom_ret, lead)][!is.na(ret_ew) & n_stocks_min >= min_stocks] }) %>% rbindlist() } # Output system.time(hml_nonmc3 <- chars %>% pf_func(pfs = 3, bps = "non_mc", bp_min_n = 5, min_stocks = 5, horizon = 12)) # 47 min system.time(hml_nyse10 <- chars %>% pf_func(pfs = 10, bps = "nyse", bp_min_n = 5, min_stocks = 5, horizon = 12)) rr <- list(hml_nonmc3, hml_nyse10) %>% lapply(function(dt) { repl_data <- char_info[dt, on = "characteristic"] %>% filter(eom_ret <= start) %>% mutate( ret_ew = ret_ew*direction, ret_vw = ret_vw*direction, ret_vw_cap = ret_vw_cap*direction ) %>% pivot_longer(c(ret_ew, ret_vw, ret_vw_cap), names_to = "type", values_to = "ret") repl_data <- c(1, 6, 12) %>% lapply(function(h) { repl_data %>% filter(lead %in% 1:h) %>% group_by(characteristic, eom_ret, type, significance) %>% filter(n() == h) %>% summarise(ret = mean(ret)) %>% ungroup() %>% mutate(horizon = h) }) %>% bind_rows() rr_func <- function(sample) { sample %>% 
group_by(characteristic, type, horizon) %>% summarise( t = mean(ret)/(sd(ret)/sqrt(n())) ) %>% group_by(type) %>% summarise( n = n(), rr = mean(t > 1.96) ) } new_factors <- c( "ret_3_1", "ret_9_1", "ret_12_7", "corr_1260d", "rmax5_21d", "rmax5_rvol_21d", "ni_be", "ocf_at", "ocf_at_chg1", "mispricing_perf", "mispricing_mgmt", "qmj", "qmj_prof", "qmj_growth", "qmj_safety") # Our Benchline Raw Return baseline <- repl_data %>% filter(horizon == 1 & eom_ret <= start) %>% rr_func() %>% mutate(name = "Baseline") # Difference in sample period hor_diff <- repl_data %>% rr_func() %>% mutate(name = "Three Horizons") # Difference in horizons sample_diff <- repl_data %>% filter(eom_ret >= as.Date("1967-01-01") & eom_ret <= as.Date("2016-12-31")) %>% rr_func() %>% mutate(name = "Three Horizons, Shorter Sample") # Differences in Factors fct_diff <- repl_data %>% filter(eom_ret >= as.Date("1967-01-01") & eom_ret <= as.Date("2016-12-31")) %>% filter(!(characteristic %in% new_factors)) %>% rr_func() %>% mutate(name = "Three Horizons, Shorter Sample, Difference in Factors") bind_rows(baseline, sample_diff, hor_diff, fct_diff) %>% arrange(type, name) }) # Decomposition terc_base <- rr[[1]] %>% filter(name == "Baseline" & type == "ret_vw_cap") %>% pull(rr) terc_base_vw <- rr[[1]] %>% filter(name == "Baseline" & type == "ret_vw") %>% pull(rr) terc_hor <- rr[[1]] %>% filter(name == "Three Horizons" & type == "ret_vw") %>% pull(rr) terc_sample <- rr[[1]] %>% filter(name == "Three Horizons, Shorter Sample" & type == "ret_vw") %>% pull(rr) terc_factors <- rr[[1]] %>% filter(name == "Three Horizons, Shorter Sample, Difference in Factors" & type == "ret_vw") %>% pull(rr) dec_factors <- rr[[2]] %>% filter(name == "Three Horizons, Shorter Sample, Difference in Factors" & type == "ret_vw") %>% pull(rr) # From vw_cap to vw terc_base-terc_base_vw # Multiple Horizons terc_base_vw-terc_hor # Shorter sample terc_hor-terc_sample # New Factors terc_sample-terc_factors # Deciles instead of terciles and 
change of BP terc_factors-dec_factors # Explained Difference (expl_rr <- terc_base-((terc_base-terc_base_vw)+(terc_base_vw-terc_hor)+(terc_hor-terc_sample)+(terc_sample-terc_factors)+(terc_factors-dec_factors))) expl_rr-0.35 ================================================ FILE: Analysis/main.R ================================================ library(cowplot) library(directlabels) library(xtable) library(zeallot) library(dendextend) library(RColorBrewer) library(rsample) library(lubridate) library(tidyverse) library(data.table) options(dplyr.summarise.inform = FALSE) # How To -------------------------------------- # Paths # - data_path: Folder that contains market_returns.csv, hml.csv and cmp.csv generated from portfolio.R # - object_path: Folder to save objects too. Retrived when update_*==F # - output_path: Folder to save figures in. Not neccesary if save_figures==F # Save # - save_figures: Should figures be saved in output_path? # Update # - update_sim: Simulations for figure 2 (Simulation Comparison of False Discovery Rates) # - update_post_over_time: Posterior calculations for figure 8 (US Factor Alpha Posterior Distribution over Time) # - update_post_is: Data for regression in table E.1 (The Economic Benefit of More Powerful Tests) # - update_harvey_baseline: Data for figure 9 # - update_harvey_worstcase: Data for figure F.1 # User Input ----------------------- # Paths data_path <- "Data" object_path <- "Objects" output_path <- "Figures" # Save save_figures <- T # Update update_sim <- T update_post_over_time <- T update_post_is <- T update_harvey_baseline <- T update_harvey_worstcase <- T # Settings settings <- list( seed = 1, start_date = as.Date("1925-12-31"), end_date = as.Date("2022-12-31"), # Important that end_date <= Last_CRSP_UPDATE country_excl = c("ZWE", "VEN"), # Countries are excluded due to data issues weighting = list( # Which weighting scheme to use? 
In c("ew", "vw", "vw_cap") us = 'vw_cap', global_ex_us = 'vw_cap' ), n_stocks_min = 5, # Minimum amount of stocks in each side of the portfolios months_min = 5 * 12, # Minimum amount of observations a factor needs to be included country_weighting = "market_cap", # How to weight countries? In ("market_cap", "stocks", "ew") countries_min = 3, # Minimum number of countries necessary in a regional portfolio clusters = "hcl", # Which cluster method to use? In c("manual", "hcl") hcl = list( ret_type = "alpha", # Which return to use in clustering: In c("raw", "alpha") cor_method = "pearson", # Which cor method to base distance upon linkage = "ward.D", # Which linkage method to use k = 13, # How many clusters to colour region = "us", # Region to use for clusters start_year = 1975 # Start year cluster data ), eb = list( scale_alpha = T, overlapping = F, min_obs = 5 * 12, fix_alpha = T, bs_cov = T, shrinkage = 0, cor_type = "block_clusters", bs_samples = 10000 # Set to 10000 for paper ), tpf = list( start = list( world = as.Date("1952-01-01"), us = as.Date("1952-01-01"), developed = as.Date("1987-01-01"), emerging = as.Date("1994-01-01"), size_grps = as.Date("1963-01-01") # Dictated by start of nano-caps ), bs_samples = 10000, # Number of bootstrap samples [10.000 for paper] shorting = F # Should shorting be allowed? ), tpf_factors = list( region = "us", orig_sig = T, # Only include originally significant factor: T, include all: c(T,F) start = as.Date("1972-01-31"), scale = T, # Scale to ex-post volatility of 10%? k = 5 # Number of Folds for cross-validation exercise ) ) # Layout Settings --------------- theme_set(theme_classic()) colours_theme <- c("#0C6291", "#A63446", RColorBrewer::brewer.pal(8, "Dark2"), "darkslategrey", "blue3", "red3", "purple2", "yellow2", "aquamarine", "grey", "salmon", "antiquewhite", "chartreuse") scale_colour_discrete <- function(...) { scale_colour_manual(..., values = colours_theme) } scale_fill_discrete <- function(...) 
{ scale_fill_manual(..., values = colours_theme) } scale_linetype_discrete <- function(...) { scale_linetype_manual(..., values = c("solid", "longdash", "dotted", "dashed", "dotdash", "twodash")) } # Run Scripts ---------- source("0 - Functions.R", echo = T) source("1 - Prepare Data.R", echo = T) source("2 - Determine Clusters.R", echo = T) source("3 - Analysis.R", echo = T) source("4 - Output.R", echo = T) ================================================ FILE: GlobalFactors/CHANGELOG.md ================================================ # CHANGELOG.md This change log keeps track of changes to the underlying data set. In brackets, we highlight versions of importance. The version with _factor data set_ is the basis of the factor portfolios we upload at [https://jkpfactors.com/](https://jkpfactors.com/). The version with _paper data set_ is the basis of [Jensen, Kelly and Pedersen (2023)](https://onlinelibrary.wiley.com/doi/full/10.1111/jofi.13249). ## 05-03-2025 [Factor data set] __Changes__: - Added 2024 data - moved world_data_prelim from scratch to work folder - updated market returns macro to add a capped value option - corrected error in o-score calculation - added end_date filter in macros saving daily and monthly returns ## 11-03-2024 __Changes__: - Added 2023 data - Updated the country classification according the latest MSCI market classification ## 03-03-2023 __Changes__: - Added 2022 data - Added 'me' (market equity) and 'ret' (total return) and removed 'source_crsp' from daily return files __Impact__: - Replication rate: 83.2% ## 30-06-2022 [Paper data set] __Changes__: - Changed name of "Skewness" cluster to "Short-Term Reversal" __Impact__: - Replication rate: 82.4% ## 08-02-2022 __Changes__: - Fix error in the construction of intrinsic_value. Previously, we failed to scale intrinsic_value by market equity as done in Frankel and Lee (1998). We call the new characteristic ival_me and keep intrinsic_value in the data set. 
The alpha of the new factor based on ival_me is significantly different from zero, while the factor based on intrinsic_value is insignificant. __Impact__: - Replication rate: 82.4% (added 2020 data) ## 16-11-2021 __Changes__: - Changed return cutoffs to depend on all stocks, instead of only stocks from CRSP. - Added monthly and daily returns to the output folder. - Changed the 'source' (character) column to 'source_crsp' (integer). source_crsp is 1 if CRSP is the return data source. - Changed the 'id' column from character to integer. For stocks from CRSP, the id is just their permno. For stocks from Compustat, the first digit is 1 if the stock is traded on a US exchange, 2 if it's traded on a Canadian exchange, and 3 otherwise. The next two digits are the IID from Compustat, and the remaining six digits are the gvkey. - Adapted the primary_sec column such that all observations from CRSP have primary_sec=1. - Previously, we treated a zero return as a missing observation. Now, we have removed this screen, such that a zero return is treated like any other return. - Previously, we winsorized daily returns, market equity, and dollar volume, before creating characteristics based on daily stock market data. Now, we have removed this winsorization, and daily characteristics are based on the raw data. - Added the option to create daily factor returns in the portfolios.R code. - Added the option to create industry returns in the portfolios.R code. __Impact__: - Replication rate: 83.2% ## 27-08-2021 __Changes__: - Fixed a bug regarding how daily delisting returns from CRSP are incorporated. - Added indfmt='FS' to the international accounting data. __Impact__: - Replication rate: 83.2% ## 14-06-2021 __Changes__: - We changed the winsorization scheme. First, we removed the 0.01%/99.9% winsorization of market equity in all countries. Second, we removed the winsorization of returns from the CRSP database. 
For Compustat returns, we set returns above (below) the 99.9% (0.01%) percentile of CRSP returns in the same month, to that level. In other words, we base our winsorization of Compustat data on CRSP data from the same month. - We made several changes to the code for easier usability. Notably, the updated `main.sas` file returns a zip folder called "output" in the scratch folder, which contains all data necessary to reproduce the results in the paper. __Impact__: - Replication rate: 83.2% - The revisions impacted all factors slightly, but the overall results are qualitatively very similar. ## 02-19-2021 __Changes__: - Previously we did not exclude securities that are only traded over the counter. In the new version of the data set, we include an indicator column "exch_main" to exclude non-standard exchanges. In the US, the main exchanges are AMEX, NASDAQ and NYSE. Outside of the US, we exclude over the counter exchanges, stock connect exchanges in China and cross-country exchanges such as BATS Chi-X Europe. The documentation includes a full list of the excluded exchanges. - Included SIC, NAICS and GICS industry codes. __Impact__: - Replication rate: 84.0%. - Excluding non-standard exchanges mainly affected the US. By December 2019, the number of stocks in the US dropped from 5,256 to 4,102 (-22%) after adding the new 'exch_main' screen. The excluded securities are mainly tiny stocks traded over the counter, so the aggregate market cap only dropped by 2%. The change also mostly affected post 2000 data, because over the counter observations in Compustat are very rare before this point in time. - The change had a small effect outside of the US, because of our 'primary_sec' screen. It's very rare for Compustat to identify a security traded on a non-standard exchange as the primary security of a firm. - Because the changes mainly affected tiny stocks, our results did not change much. 
Across the 153 factors in the US, Developed and Emerging regions, the change in posterior monthly alpha ranged from -0.06% to +0.07%. ## 02-15-2021 __Changes__: - A bug caused _ivol_ff3_21d_, _iskew_ff3_21d_, _ivol_hxz4_21d_ and _iskew_hxz4_21d_ to require 17 (ff3) and 18 (hxz) observations for a valid estimate. Consistent with our original intent, we now require at least 15 observations for a valid estimate. __Impact__: - Replication Rate: 84.0%. - The changes had a negligible effect on the affected factors. ## 02-01-2021 __Changes__: - Fixed a small bug in the bidask_hl() macro. - When creating asset pricing factors (FF and HXZ), we previously required at least 5 stocks in a sub-portfolio (e.g. small stocks with high BM) for the observation to be valid. This led to missing observations in the 1950s for small stocks with low bm. We lowered this requirement to at least 3 stocks. Furthermore, when creating asset pricing factors, we changed the breakpoints to be based on NYSE stocks in the US instead of non-microcap stocks. Outside of the US, breakpoints are still based on non-microcap stocks. __Impact__: - Replication Rate: 84.0% - The _bidaskhl_21d_ factor changed slightly but is still significantly negative in all regions. The US factor IR changed from -0.11 to -0.09. - The change in asset pricing factors generally didn't affect the results much. ## 01-25-2021 __Changes__: - Changed residual momentum characteristics (resff3_12_1 & resff3_6_1) to be scaled with the standard deviation of residuals consistent with Blitz, Huij and Mertens (2011). - Fixed error in creating _qmj_prof_. The issue was that the _oaccruals_at_ used the value instead of the z-score of ranks. This effectively meant that accruals didn't impact the profitability score. - Fixed error for annual seasonality characteristics (factor names starting with seas_ and ending with _an). 
There was a bug in the screening procedure which meant that the characteristic for one stock could use information from an unrelated stock. - Rounding issues when converting a .csv file to an Excel file caused the zero_trades_* variables to not have any decimals which made the turnover tie-breaker ineffective. - Standardized unexpected earnings (niq_su) and sales (saleq_su) are computed as the actual value minus the expected value (standardized by the standard deviation of this change). Before, the expected value was computed as the mean yearly change over the last 8 quarters added to the last quarterly value. Now the expected value is the same mean yearly change, but added to the quarterly value 4 quarters ago consistent with Jegadeesh and Livnat (2006). __Impact__: - Replication Rate: 84.0% - The change to the residual momentum variables made them slightly weaker. As an example, the monthly OLS information ratio of the US resff3_12_1 factor dropped from 0.33 to 0.28. - The _qmj_prof_ change made _qmj_prof_ and _qmj_ slightly stronger. As an example, the monthly OLS information ratio of the US _qmj_prof_ factor increased from 0.16 to 0.22. - The seasonality fix didn't have a large qualitative impact for the US factors, but did have a large positive effect outside of the US. As an example, the OLS IR of the developed market _seas_11_15an_ factor changed from -0.06 to 0.11. - The zero_trades factors were missing in the developed market because of too few non-missing observations. The developed market zero trades factors are generally strong and the IR ranges from 0.07 to 0.20. Similarly, the Emerging market zero trades factors were slightly negative before. After, the factors are strong with IRs ranging from 0.14 to 0.17. The US market zero trades factors improved slightly. The IR of zero_trades_21d has the most notable increase from 0.05 to 0.09. - The standardized unexpected sales (saleq_su) variable went from a significant IR of 0.12 to an insignificant IR of 0.05. 
This explains the drop in the replication rate. On the other hand, niq_su increased from 0.11 to 0.19. ## 01-15-2021 __Changes__: - Base data set used in the first online version of Jensen, Kelly and Pedersen (2021). __Impact__: - Replication Rate: 84.9% ================================================ FILE: GlobalFactors/Cluster Labels.csv ================================================ characteristic,cluster age,Low Leverage aliq_at,Investment aliq_mat,Low Leverage ami_126d,Size at_be,Low Leverage at_gr1,Investment at_me,Value at_turnover,Quality be_gr1a,Investment be_me,Value beta_60m,Low Risk beta_dimson_21d,Low Risk betabab_1260d,Low Risk betadown_252d,Low Risk bev_mev,Value bidaskhl_21d,Low Leverage capex_abn,Debt Issuance capx_gr1,Investment capx_gr2,Investment capx_gr3,Investment cash_at,Low Leverage chcsho_12m,Value coa_gr1a,Investment col_gr1a,Investment cop_at,Quality cop_atl1,Quality corr_1260d,Seasonality coskew_21d,Seasonality cowc_gr1a,Accruals dbnetis_at,Seasonality debt_gr3,Debt Issuance debt_me,Value dgp_dsale,Quality div12m_me,Value dolvol_126d,Size dolvol_var_126d,Profitability dsale_dinv,Profit Growth dsale_drec,Profit Growth dsale_dsga,Profit Growth earnings_variability,Low Risk ebit_bev,Profitability ebit_sale,Profitability ebitda_mev,Value emp_gr1,Investment eq_dur,Value eqnetis_at,Value eqnpo_12m,Value eqnpo_me,Value eqpo_me,Value f_score,Profitability fcf_me,Value fnl_gr1a,Debt Issuance gp_at,Quality gp_atl1,Quality inv_gr1,Investment inv_gr1a,Investment iskew_capm_21d,Short-Term Reversal iskew_ff3_21d,Short-Term Reversal iskew_hxz4_21d,Short-Term Reversal ival_me,Value ivol_capm_21d,Low Risk ivol_capm_252d,Low Risk ivol_ff3_21d,Low Risk ivol_hxz4_21d,Low Risk kz_index,Seasonality lnoa_gr1a,Investment lti_gr1a,Seasonality market_equity,Size mispricing_mgmt,Investment mispricing_perf,Quality ncoa_gr1a,Investment ncol_gr1a,Debt Issuance netdebt_me,Low Leverage netis_at,Value nfna_gr1a,Debt Issuance ni_ar1,Debt Issuance ni_be,Profitability 
ni_inc8q,Quality ni_ivol,Low Leverage ni_me,Value niq_at,Quality niq_at_chg1,Profit Growth niq_be,Profitability niq_be_chg1,Profit Growth niq_su,Profit Growth nncoa_gr1a,Investment noa_at,Debt Issuance noa_gr1a,Investment o_score,Profitability oaccruals_at,Accruals oaccruals_ni,Accruals ocf_at,Profitability ocf_at_chg1,Profit Growth ocf_me,Value ocfq_saleq_std,Low Risk op_at,Quality op_atl1,Quality ope_be,Profitability ope_bel1,Profitability opex_at,Quality pi_nix,Seasonality ppeinv_gr1a,Investment prc,Size prc_highprc_252d,Momentum qmj,Quality qmj_growth,Quality qmj_prof,Quality qmj_safety,Quality rd_me,Size rd_sale,Low Leverage rd5_at,Low Leverage resff3_12_1,Momentum resff3_6_1,Momentum ret_1_0,Short-Term Reversal ret_12_1,Momentum ret_12_7,Profit Growth ret_3_1,Momentum ret_6_1,Momentum ret_60_12,Investment ret_9_1,Momentum rmax1_21d,Low Risk rmax5_21d,Low Risk rmax5_rvol_21d,Short-Term Reversal rskew_21d,Short-Term Reversal rvol_21d,Low Risk sale_bev,Quality sale_emp_gr1,Profit Growth sale_gr1,Investment sale_gr3,Investment sale_me,Value saleq_gr1,Investment saleq_su,Profit Growth seas_1_1an,Profit Growth seas_1_1na,Momentum seas_11_15an,Seasonality seas_11_15na,Seasonality seas_16_20an,Seasonality seas_16_20na,Accruals seas_2_5an,Seasonality seas_2_5na,Investment seas_6_10an,Seasonality seas_6_10na,Low Risk sti_gr1a,Seasonality taccruals_at,Accruals taccruals_ni,Accruals tangibility,Low Leverage tax_gr1a,Profit Growth turnover_126d,Low Risk turnover_var_126d,Profitability z_score,Low Leverage zero_trades_126d,Low Risk zero_trades_21d,Low Risk zero_trades_252d,Low Risk ================================================ FILE: GlobalFactors/GlobalFactors.Rproj ================================================ Version: 1.0 RestoreWorkspace: Default SaveWorkspace: Default AlwaysSaveHistory: Default EnableCodeIndexing: Yes UseSpacesForTab: Yes NumSpacesForTab: 2 Encoding: UTF-8 RnwWeave: Sweave LaTeX: pdfLaTeX ================================================ FILE: 
GlobalFactors/MD ================================================ ================================================ FILE: GlobalFactors/README.md ================================================ ## Overview This repository contains code that create a dataset of global stock returns and characteristics. The dataset was created for the paper [Is There a Replication Crisis in Finance?](https://onlinelibrary.wiley.com/doi/10.1111/jofi.13249) by Jensen, Kelly and Pedersen (2023). Please cite this paper if you are using the code or data: ``` @article{JensenKellyPedersen2023, author = {Jensen, Theis Ingerslev and Kelly, Bryan and Pedersen, Lasse Heje}, title = {Is There a Replication Crisis in Finance?}, journal = {The Journal of Finance}, volume = {78}, number = {5}, pages = {2465-2518}, year = {2023} } ``` Follow this [link](https://www.dropbox.com/sh/61j1v0sieq9z210/AACdJ68fs5_eT_eJMunwMBWia?dl=0) for a detailed documentation of the data sets. ## How to Generate Global Stock Returns and Stock Characteristics _Note: The data created in this section can be downloaded directly from WRDS ([link](https://wrds-www.wharton.upenn.edu/pages/get-data/contributed-data-forms/global-factor-data/))_ The .sas files construct the stock-level characteristics and factor portfolio returns for all countries. The code requires a connection to WRDS servers. Below we outline our preferred approach to creating the dataset: 1. Connect to the [SAS studio server hosted by WRDS](https://wrds-cloud.wharton.upenn.edu/SASStudio/index?locale=en_US). 2. Create a folder called _Global Data_ in your home directory and upload `main.sas`, `project_macros.sas`, `market_chars.sas`, `accounting_chars.sas`, `char_macros.sas`, and `ind_identification.sas` to this folder. 3. Create an empty folder in your institutions scratch folder. The scratch folder is located at "Sever Files and Folders/Files/scratch/\". 4. Open `main.sas`. 5. Replace line 8 with the path to the scratch folder created in step 3. 6. 
Run `main.sas`. For step 6, we suggest that you make a background submit. To do so, locate the `main.sas` file in your _Global Data_ folder, right-click the file and select "Background Submit". Running the code will take around 24-48 hours. When the code has finished running, your scratch folder will contain a folder called _output_. This folder contains daily and monthly stock returns, daily and monthly market returns, market equity breakpoints from NYSE and return breakpoints to use for winsorization. If `save_csv=1` on line 12, the folder will also hold the data country-by-country in .csv format. Importantly, these files only include the main observation of ordinary common stocks that are the primary securities of the underlying firm, and traded on a main exchange. To include all stocks, modify the 'save_main_data_csv()' macro. Note that the space in the scratch directory is shared across users in your institution. It's best practice to delete files after downloading what you need. Furthermore, files in the scratch directory are deleted after 7 days. ## How to Generate Global Factor Returns _Note: The data created in this section can be downloaded directly from [JKPfactors.com](https://jkpfactors.com/)_ The file `portfolio.R` generates country level factor returns based on the dataset generated by `main.sas`: 1. Run `main.sas`. 2. Save country level .csv files by running the macro `save_main_data_csv()` in `main.sas`. 3. Download the folder named _output_ to a local directory and unzip. 4. Open `portfolio.R`. 5. User defined inputs: 1. Replace the variable `data_path` with the path to the folder where you have unzipped the content of _output_. 2. Replace the variable `output_path` with the path to the folder where you wish to save the factor returns. 3. Replace the variable `legacy_path` with the path to the folder where you wish to save earlier versions of the factor output. If you don't wish to save earlier versions, set the variable to `NULL`. 4. 
The variable `countries` controls which countries to generate factor returns for. By default, it selects all the countries in `data_path`. 5. The variable `chars` controls which characteristics to use for creating factor returns. By default, it is set to the 153 characteristics from Jensen, Kelly and Pedersen (2022). Any column in the characteristic dataset can be used as the basis of a factor. 6. The variable `settings` controls the end date, number of portfolios, which breakpoints to use, the data source, whether to winsorize Compustat returns, the minimum amount of stocks for breakpoints to be valid and whether to create rank weighted (characteristic managed) portfolios in each of the 5 size groups. Default settings are the same as used in Jensen, Kelly and Pedersen (2022). 6. Run `portfolio.R`. ### Output **Files** - `pfs.csv`: We sort stocks into 3 portfolios based on non-microcap breakpoints. Portfolio 1 (3) has the stocks with the lowest (highest) value of the characteristic. - `hml.csv`: Long/short portfolios that are long stocks with high values of the underlying characteristics (portfolio 3 from pfs.csv) and short stocks with low values (portfolio 1 from pfs.csv). - `lms.csv`: Long/short portfolios based on hml.csv but with the signing convention used in Jensen, Kelly and Pedersen (2022). In particular, we sign factors so they are consistent with the literature. For example, we go long low asset growth stocks and short high asset growth stocks, because the literature generally finds that low asset growth stocks outperform. - `cmp.csv`: Rank-weighted (characteristic managed) portfolios within mega, large, small, micro and nano cap stocks in the US. - `market_returns.csv`: Monthly market returns in all the countries we cover. - `Country Factors`: Folder with lms.csv in country-by-country files for easier usability. Countries are saved by their [ISO Alpha-3 codes](https://www.nationsonline.org/oneworld/country_code_list.htm). 
- `Regional Factors`: Folder with regional factors based on lms.csv and the method in Jensen, Kelly and Pedersen (2022). **Variables** - `excntry`: Country where securities are listed as ISO Alpha-3 codes. - `eom`: End-of-month of the month used to calculate returns. - `characteristic`: Name of characteristic, refer to table J.1 in Jensen, Kelly and Pedersen (2022). - `region`: Region/MSCI country development of included factors. - `size_grp`: Size group used to create the rank weighted factors. - `pf`: Portfolio identifier. - `n`: Total number of stocks in the portfolio. - `n_stocks`: Total number of stocks in the long and short portfolio. - `n_stocks_min`: Minimum number of stocks in the long and short portfolio. For example, if the long portfolio has 10 stocks and the short portfolio has 40 stocks, n_stocks=50 and n_stocks_min=10. - `n_countries`: Number of countries included in regional portfolio. - `signal` (pfs.csv): Median characteristic value in the portfolio. - `signal` (hml.csv, lms.csv): Difference between the median characteristics in the long and short portfolio. - `signal_weighted`: Rank weighted signal. - `ret_ew`: Return with equal weights. - `ret_vw`: Return with value weights. - `ret_vw_cap`: Return with capped value weights as used in Jensen, Kelly and Pedersen (2022). - `ret_weighted`: Rank weighted returns. - `me_lag1`: Total market equity within a country at the beginning of the month. - `dolvol_lag1`: Total dollar volume traded within a country in the previous month. - `stocks`: Stocks included in the market portfolio. - `mkt_vw_lcl`: Value weighted market return in local currency. - `mkt_ew_lcl`: Equally weighted market return in local currency. - `mkt_vw`: Value weighted market return in USD. - `mkt_ew`: Equally weighted market return in USD. - `mkt_ew_exc`: Equally weighted market excess return in USD. - `mkt_vw_exc` (market_returns.csv): Value weighted market excess return in USD. 
- `mkt_vw_exc` (Regional Factors): Market cap weighted average of the market excess return in USD for countries included in the regional portfolio. ================================================ FILE: GlobalFactors/accounting_chars.sas ================================================ *************************************************************************** * Characteritics to Extract *************************************************************************** ; /* Pure Accounting Based Characteristics */ %let acc_chars= /* Accounting Based Size Measures */ assets sales book_equity net_income enterprise_value /* 1yr Growth */ at_gr1 ca_gr1 nca_gr1 lt_gr1 cl_gr1 ncl_gr1 be_gr1 pstk_gr1 debt_gr1 sale_gr1 cogs_gr1 sga_gr1 opex_gr1 /* 3yr Growth */ at_gr3 ca_gr3 nca_gr3 lt_gr3 cl_gr3 ncl_gr3 be_gr3 pstk_gr3 debt_gr3 sale_gr3 cogs_gr3 sga_gr3 opex_gr3 /* 1yr Growth Scaled by Assets */ cash_gr1a inv_gr1a rec_gr1a ppeg_gr1a lti_gr1a intan_gr1a debtst_gr1a ap_gr1a txp_gr1a debtlt_gr1a txditc_gr1a coa_gr1a col_gr1a cowc_gr1a ncoa_gr1a ncol_gr1a nncoa_gr1a oa_gr1a ol_gr1a noa_gr1a fna_gr1a fnl_gr1a nfna_gr1a gp_gr1a ebitda_gr1a ebit_gr1a ope_gr1a ni_gr1a nix_gr1a dp_gr1a ocf_gr1a fcf_gr1a nwc_gr1a eqnetis_gr1a dltnetis_gr1a dstnetis_gr1a dbnetis_gr1a netis_gr1a fincf_gr1a eqnpo_gr1a tax_gr1a div_gr1a eqbb_gr1a eqis_gr1a eqpo_gr1a capx_gr1a /* 3yr Growth Scaled by Assets */ cash_gr3a inv_gr3a rec_gr3a ppeg_gr3a lti_gr3a intan_gr3a debtst_gr3a ap_gr3a txp_gr3a debtlt_gr3a txditc_gr3a coa_gr3a col_gr3a cowc_gr3a ncoa_gr3a ncol_gr3a nncoa_gr3a oa_gr3a ol_gr3a fna_gr3a fnl_gr3a nfna_gr3a gp_gr3a ebitda_gr3a ebit_gr3a ope_gr3a ni_gr3a nix_gr3a dp_gr3a ocf_gr3a fcf_gr3a nwc_gr3a eqnetis_gr3a dltnetis_gr3a dstnetis_gr3a dbnetis_gr3a netis_gr3a fincf_gr3a eqnpo_gr3a tax_gr3a div_gr3a eqbb_gr3a eqis_gr3a eqpo_gr3a capx_gr3a /* Investment */ capx_at rd_at /* Profitability */ gp_sale ebitda_sale ebit_sale pi_sale ni_sale nix_sale ocf_sale fcf_sale /* Profit Margins */ gp_at ebitda_at 
ebit_at fi_at cop_at /* Return on Assets */ ope_be ni_be nix_be ocf_be fcf_be /* Return on Book Equity */ gp_bev ebitda_bev ebit_bev fi_bev cop_bev /* Return on Invested Capital */ gp_ppen ebitda_ppen fcf_ppen /* Return on Physical Capital */ /* Issuance */ fincf_at netis_at eqnetis_at eqis_at dbnetis_at dltnetis_at dstnetis_at /* Equity Payout */ eqnpo_at eqbb_at div_at /* Accruals */ oaccruals_at oaccruals_ni taccruals_at taccruals_ni noa_at /* Capitalization/Leverage Ratios */ be_bev debt_bev cash_bev pstk_bev debtlt_bev debtst_bev debt_mev pstk_mev debtlt_mev debtst_mev /* Financial Soundness Ratios */ int_debtlt int_debt cash_lt inv_act rec_act ebitda_debt debtst_debt cl_lt debtlt_debt profit_cl ocf_cl ocf_debt lt_ppen debtlt_be fcf_ocf opex_at nwc_at /* Solvency Ratios */ debt_at debt_be ebit_int /* Liquidity Ratios */ cash_cl caliq_cl ca_cl inv_days rec_days ap_days cash_conversion /* Activity/Efficiency Ratio */ inv_turnover at_turnover rec_turnover ap_turnover /* Non-Recurring Items */ spi_at xido_at nri_at /* Miscalleneous */ adv_sale staff_sale rd_sale div_ni sale_bev sale_be sale_nwc tax_pi /* Balance Sheet Fundamentals to Market Equity */ be_me at_me cash_me /* Income Fundamentals to Market Equity */ gp_me ebitda_me ebit_me ope_me ni_me nix_me sale_me ocf_me fcf_me cop_me rd_me /* Equity Payout/issuance to Market Equity */ div_me eqbb_me eqis_me eqpo_me eqnpo_me eqnetis_me /* Debt Issuance to Market Enterprice Value */ dltnetis_mev dstnetis_mev dbnetis_mev /* Firm Payout/issuance to Market Enterpice Value */ netis_mev /* Balance Sheet Fundamentals to Market Enterprise Value */ at_mev be_mev bev_mev ppen_mev cash_mev /* Income/CF Fundamentals to Market Enterprise Value */ gp_mev ebitda_mev ebit_mev cop_mev sale_mev ocf_mev fcf_mev fincf_mev /* New Variables from HXZ */ ni_inc8q ppeinv_gr1a lnoa_gr1a capx_gr1 capx_gr2 capx_gr3 sti_gr1a niq_at niq_at_chg1 niq_be niq_be_chg1 saleq_gr1 rd5_at dsale_dinv dsale_drec dgp_dsale dsale_dsga saleq_su niq_su 
debt_me netdebt_me capex_abn inv_gr1 be_gr1a op_at pi_nix op_atl1 gp_atl1 ope_bel1 cop_atl1 at_be ocfq_saleq_std aliq_at aliq_mat tangibility eq_dur f_score o_score z_score kz_index intrinsic_value ival_me sale_emp_gr1 emp_gr1 cash_at earnings_variability ni_ar1 ni_ivol /* New Variables not in HXZ */ niq_saleq_std ni_emp sale_emp ni_at ocf_at ocf_at_chg1 roeq_be_std roe_be_std gpoa_ch5 roe_ch5 roa_ch5 cfoa_ch5 gmar_ch5 ; %put ### In total %nwords(&acc_chars.) characteristics will be created ###; ********************************************************************************************************************** * MACRO - Add Helper Vars to Standardized Compustat Accounting ********************************************************************************************************************* Description: The main functionality of this macro is to take the output of %compustat_accounting_data and add helper variables. These helper variables have two main functionalities: 1. They are used to expand coverage of a given variable 2. They are used to create variables not directly available from the accounting statements All added variables have a suffix of '_x'. ; %macro add_helper_vars(data=, out=); /* First ensure that the gap between two dates is always one month */ proc sql; create table __comp_dates1 as select gvkey, curcd, min(datadate) as start_date, max(datadate) as end_date from &data. group by gvkey, curcd; quit; %expand(data=__comp_dates1, out=__comp_dates2, id_vars=gvkey, start_date=start_date, end_date=end_date, freq='month', new_date_name=datadate); proc sql; create table __helpers1 as select a.gvkey, a.curcd, a.datadate, b.*, not missing(b.gvkey) as data_available from __comp_dates2 as a left join &data. 
as b on a.gvkey=b.gvkey and a.curcd=b.curcd and a.datadate=b.datadate; quit; proc sort nodupkey data=__helpers1; by gvkey curcd datadate; run; data __helpers2; set __helpers1; by gvkey curcd; retain count; if first.curcd then count = 1; else count = count+1; run; /* Create Helper Variables */ data &out.; set __helpers2; by gvkey curcd; /* Require Certain Variables to Be Positive */ array var_pos at sale revt dv che; do over var_pos; if var_pos<0 then var_pos = .; end; /* X Variables to Maximize Coverage */ *Income Statement; sale_x = coalesce(sale, revt); /* in NA sale has better coverage in Global revt has better coverage. They are the same though*/ gp_x = coalesce(gp, sale_x-cogs); /*Gross Profit*/ opex_x = coalesce(xopr, cogs+xsga); /* Operating Expenses */ ebitda_x = coalesce(ebitda, oibdp, sale_x-opex_x, gp_x-xsga); /*Operating Income Before Depreciation*/ ebit_x = coalesce(ebit, oiadp, ebitda_x-dp); /*Operating Income Before Depreciation*/ op_x = ebitda_x + coalesce(xrd, 0); /* Operating Profit ala Ball et al (2015)*/ ope_x = ebitda_x-xint; /* Operating Profit to Equity ala FF*/ pi_x = coalesce(pi, ebit_x-xint+coalesce(spi,0)+coalesce(nopi,0)); /* Interest Income is included in NOPI*/ xido_x = coalesce(xido, xi+coalesce(do, 0)); ni_x = coalesce(ib, ni-xido_x, pi_x - txt - coalesce(mii, 0)); nix_x = coalesce(ni, ni_x+coalesce(xido_x, 0), ni_x + xi + do); fi_x = nix_x+xint; /* Firm income i.e. return to equity and debt holders */ div_x = coalesce(dvt, dv); /* See [1] */ * Cash Flow Statement; eqbb_x = sum(prstkc, purtshr); /* Equity Buyback is mainly PRSTKC in NA and PURTSHR in GLOBAL. Using sum() means that any of the two inputs are allowed to be missing */ eqis_x = sstk; /* Equity Issuance is SSTK which is common+preferred Stocks.*/ eqnetis_x = sum(eqis_x,-eqbb_x); /* Net Equity Issuance. 
Using sum means that the variable will be computed as long as one of the inputs are non-missing */ eqpo_x = div_x+eqbb_x; /* Net Equity Payout= Div+Buyback-Issuance*/ eqnpo_x = div_x-eqnetis_x; /* Net Equity Payout= Div+Buyback-Issuance*/ dltnetis_x = coalesce(sum(dltis,-dltr), ltdch, dif12(dltt)); /* Net Long Term Debt issuance. GLOBAL firms only have LTDCH, NA firms only have dltis and DLTR. If cash flow items are missing. Approximate by the change in long term book debt */ if missing(dltis) and missing(dltr) and missing(ltdch) and count<=12 then dltnetis_x = .; dstnetis_x = coalesce(dlcch, dif12(dlc)); /* Prefer dlcch. If this is missing, approximate by the change in short term book debt */ if missing(dlcch) and count<=12 then dstnetis_x = .; dbnetis_x = sum(dstnetis_x, dltnetis_x); netis_x = eqnetis_x+dbnetis_x; /* I require that both equity and debt issuance are available */ fincf_x = coalesce(fincf, netis_x-dv+coalesce(fiao, 0)+coalesce(txbcof, 0)); /* https://wrds-web.wharton.upenn.edu/wrds/support/Data/_001Manuals%20and%20Overviews/_001Compustat/_001North%20America%20-%20Global%20-%20Bank/_000dataguide/index.cfm?_ga=2.179697184.565214677.1585224362-1543109641.1544636729 */ * Balance Sheet Statement; debt_x = sum(dltt, dlc); /* This greatly expands coverage */ pstk_x = coalesce(pstkrv, pstkl, pstk); /* Value of Preferred Stock*/ seq_x = coalesce(seq, ceq+coalesce(pstk_x, 0), at-lt); at_x = coalesce(at, seq_x + dltt + coalesce(lct, 0) + coalesce(lo, 0) + coalesce(txditc, 0)); /* at is only available yearly in the beginning of the fundq data whereas seq and dltt is available on a quarterly basis from the beginning */ ca_x = coalesce(act, rect+invt+che+aco); /* Current Assets */ nca_x = at_x-ca_x; /* Non-Current Assets */ cl_x = coalesce(lct, ap+dlc+txp+lco); /* Current Liabilities */ ncl_x = lt-cl_x; netdebt_x = debt_x - coalesce(che, 0); /* Net Debt for calculating Enterprise Value */ txditc_x = coalesce(txditc, sum(txdb, itcb)); be_x = 
seq_x+coalesce(txditc_x, 0)-coalesce(pstk_x, 0); bev_x = coalesce(icapt+coalesce(dlc, 0)-coalesce(che, 0), netdebt_x+seq_x+coalesce(mib, 0)); coa_x = ca_x - che; /* Operating (non cash) current assets */ col_x = cl_x-coalesce(dlc, 0); /* Operating Current Liabilities */ cowc_x = coa_x-col_x; /* Current Operating Working Capital */ ncoa_x = at_x-ca_x-coalesce(ivao, 0); /* Non-Current Operating Assets */ ncol_x = lt-cl_x-dltt; /* Non-Current Operating Liabilities */ nncoa_x = ncoa_x-ncol_x; /* Net Non-Current Operatng Assets */ fna_x = coalesce(ivst,0)+coalesce(ivao,0); /* Financial Assets */ fnl_x = debt_x+coalesce(pstk_x,0); /* Financial Liabilities */ nfna_x = fna_x-fnl_x; /* Net Financial Assets */ oa_x = coa_x+ncoa_x; /* Operating Assets */ ol_x = col_x+ncol_x; /* Operating Liabilities */ noa_x = oa_x-ol_x; /* Net Operating Assets*/ lnoa_x = ppent+intan+ao-lo+dp; /* Long-term NOA (from HXZ A.3.5)*/ caliq_x = coalesce(ca_x-invt, che+rect); /* Liquid current assets use for quick ratio*/ nwc_x = ca_x-cl_x; ppeinv_x = ppegt + invt; * Should be moved to create_chars as it is a helper variables; aliq_x = che + 0.75 * coa_x + 0.5 * (at_x - ca_x - coalesce(intan, 0)); /* From Ortiz-Molina and Phillips (2014). Don't subtract gdwl since this is already included intangibles */ array var_bs be_x bev_x; do over var_bs; if var_bs<=0 then var_bs = .; end; /* Accruals + OCF/FCF + Cash Based Operating Profit*/ oacc_x = coalesce(ni_x-oancf, dif12(cowc_x)+dif12(nncoa_x)); /* Operating Accruals: Difference between Accounting Earnings and Operating Cash Flow. [2] */ tacc_x = oacc_x+dif12(nfna_x); /* Total Accruals = Accural Earnings - Cash Earnings = Change in Non-Cash Assets - Change in Liabilities. 
*/ if count<=12 then do; oacc_x = .; tacc_x = .; end; ocf_x = coalesce(oancf, ni_x-oacc_x, ni_x + dp - coalesce(wcapt, 0)); fcf_x = ocf_x-capx; /* Note that this does not include funds from financing activities */ cop_x = ebitda_x + coalesce(xrd, 0) - oacc_x; /* Cash Based Operating Profitability (Gerakos et al, 2016) add R&D while subtracting accruals */ drop count; run; proc delete data= __comp_dates1 __comp_dates2 __helpers1 __helpers2; run; %mend add_helper_vars; ********************************************************************************************************************** * MACRO - Compustat Accounting Data Standardized ********************************************************************************************************************* Description: The main functionality of this macro is to create accounting datasets which are comparable across frequency (quarterly/annual) and geography (North America/Global). To make the data comparable across frequency, we modify the quarterly data by: - Quarterize year-to-date variables. - Take the sum over the last 4 quarters for income and cash flow variables - Change name to be consistent with the annual data To make the data comparable across geography, we modify the global data by: - Create columns available in the North American dataset in the global dataset. If possible we infer the values from available columns. Otherwise we just set to missing. - If specified, we change all data to USD. 
; %macro standardized_accounting_data(coverage=, convert_to_usd=, me_data=, include_helpers_vars=1, start_date='01JAN1950'd); /* Coverage in ('na', 'global', 'world'), convert_to_usd in (0,1), include_helpers_vars in (0,1)*/ /* Compustat Accounting Vars to Extract */ %let avars_inc = sale revt gp ebitda oibdp ebit oiadp pi ib ni mii cogs xsga xopr xrd xad xlr dp xi do xido xint spi nopi txt dvt ; %let avars_cf = /* Operating */ oancf ibc dpc xidoc capx ibc dpc wcapt /* Financing */ fincf fiao txbcof ltdch dltis dltr dlcch purtshr prstkc sstk dv dvc ; %let avars_bs = /* Assets */ at act aco che invt rect ivao ivst ppent ppegt intan ao gdwl re /* Liabilities */ lt lct dltt dlc txditc txdb itcb txp ap lco lo seq ceq pstkrv pstkl pstk mib icapt ; * Variables in avars_other are not measured in currency units, and only available in annual data; %let avars_other = emp; %let avars= &avars_inc. &avars_cf. &avars_bs.; /* fdate and pdate. Unfortunately only available from 2007. RDQ is available further back for NA but should probably use 10K dates instead???*/ %put INCOME STATEMENT: %nwords(&avars_inc.) || CASH FLOW STATEMENT: %nwords(&avars_cf.) || BALANCE SHEET: %nwords(&avars_bs.) || OTHER: %nwords(&avars_other.); proc sql noprint; select distinct lowcase(name) into :qvars_q separated by ' ' from dictionary.columns where libname='COMP' and memname in ('FUNDQ', 'G_FUNDQ') and memtype='DATA' and findw(lowcase("&avars."),substr(lowcase(name),1,length(name)-1))>0 and name like "%nrstr(%%)q"; /* Quarterly names ending with q */ quit; proc sql noprint; select distinct lowcase(name) into :qvars_y separated by ' ' from dictionary.columns where libname='COMP' and memname in ('FUNDQ', 'G_FUNDQ') and memtype='DATA' and findw(lowcase("&avars."),substr(lowcase(name),1,length(name)-1))>0 and name like "%nrstr(%%)y"; /* Quarterly names ending with y (ytd_variables)*/ quit; %let qvars = &qvars_q. 
&qvars_y.; * In International but not in NA: dvtq dvty purtshr purtshry ltdch ltdchy LOC?; %if &coverage. = 'global' or &coverage. = 'world' %then %do; %let aname = __gfunda; %let qname = __gfundq; %let compcond=indfmt in ('INDL', 'FS') and datafmt='HIST_STD' and popsrc='I' and consol='C'; data g_funda1; set comp.g_funda; where &compcond. and datadate>=&start_date.; source = 'GLOBAL'; format source $char6.; * Variables Not Available in G_FUNDA with Replacement; ni = ib+coalesce(xi,0)+coalesce(do,0); /*https://wrds-www.wharton.upenn.edu/pages/support/support-articles/compustat/global/ni-net-income-variable/?_ga=2.199276522.59544135.1581901991-1543109641.1544636729 */ * Variables Not Available in G_FUNDA without Replacement; gp = .; pstkrv = .; /*captured by pstk_x*/ pstkl = .; /*captured by pstk_x*/ itcb = .; /*Only used as substitute for txditc (deffered tax and investment credit)*/ xad = .; txbcof = .; /* Part of FINCF but will automatically be set to zero*/ keep gvkey datadate indfmt curcd source &avars. &avars_other.; run; proc sql; create table &aname. as select * from g_funda1 group by gvkey, datadate having count(*)=1 or (count(*)=2 and indfmt='INDL'); /* If the accouting report is available inb both an industrial and financial format, choose financial format (happens very rarely in the international data)*/ alter table &aname. drop indfmt; quit; data g_fundq1; set comp.g_fundq; where &compcond. and datadate>=&start_date.; source = 'GLOBAL'; format source $char6.; * Variables Not Available in G_FUNDQ with Replacement; niq = ibq+coalesce(xiq, 0); /*Discontinued Operations is not available in g_fundq*/ ppegtq = ppentq+dpactq; /* See [3] */ * Variables Not Available in G_FUNDQ without Replacement; icaptq = .; niy = .; txditcq = .; txpq = .; xidoq = .; xidoy = .; xrdq = .; xrdy = .; txbcofy = .; /* Part of FINCF but will automatically be set to zero*/ keep gvkey datadate indfmt fyr fyearq fqtr curcdq source &qvars.; run; proc sql; create table &qname. 
as select * from g_fundq1 group by gvkey, datadate having count(*)=1 or (count(*)=2 and indfmt='INDL'); /* If the accouting report is available inb both an industrial and financial format, choose financial format (happens very rarely in the international data)*/ alter table &qname. drop indfmt; quit; %end; %if &coverage. = 'na' or &coverage. = 'world' %then %do; %let aname = __funda; %let qname = __fundq; %let compcond=indfmt='INDL' and datafmt='STD' and popsrc='D' and consol='C'; data &aname.; set comp.funda; where &compcond. and datadate>=&start_date.; source = 'NA'; format source $char6.; keep gvkey datadate curcd source &avars. &avars_other.; run; data &qname.; set comp.fundq; where &compcond. and datadate>=&start_date.; source = 'NA'; format source $char6.; keep gvkey datadate fyr fyearq fqtr curcdq source &qvars.; run; %end; %if &coverage. = 'world' %then %do; %let aname = __wfunda; %let qname = __wfundq; data &aname.; set __gfunda __funda; run; data &qname.; set __gfundq __fundq; run; /*proc delete data= __gfunda __gfundq __funda __fundq; run;*/ %end; /* If &convert_to_usd=1 then convert everything to USD otherwise keep as local currency*/ %if &convert_to_usd.=1 %then %do; %compustat_fx(out=fx); proc sql; create table __tempa as select a.*, b.fx from &aname. as a left join fx as b on a.datadate=b.date and a.curcd=b.curcdd; create table __tempq as select a.*, b.fx from &qname. as a left join fx as b on a.datadate=b.date and a.curcdq=b.curcdd; quit; data __compa1; set __tempa; array var &avars.; do over var; var = var*fx; end; curcd = 'USD'; drop fx; run; data __compq1; set __tempq; array var &qvars.; do over var; var = var*fx; end; curcdq = 'USD'; drop fx; run; proc delete data=fx __tempa __tempq; %end; %else %do; *Rename Data; proc sql; create table __compa1 as select * from &aname.; create table __compq1 as select * from &qname.; quit; %end; proc delete data= &aname. 
&qname.; /* Change Quarterly Data to Be Comparable to Annual Data */ %QUARTERIZE(inset=__compq1, outset=__compq2, idvar=gvkey fyr, fyear=fyearq, fqtr=fqtr); /* Quarterize the YTD flow accounting variables */ proc sort data=__compq2 nodupkey; by gvkey fyr fyearq fqtr; run; /*THEIS: 0 Deleted in the US*/ %macro ttm(var); (&var + lag1(&var) + lag2(&var) + lag3(&var)) %mend; /* Note that ttm will return missing if either of the lags are missing. This is the behavior we want. */ %macro temp(); /* Prepare quarterly data: if quarterly Compustat variable is missing, replace with quarterized version*/ data __compq3; set __compq2; by gvkey fyr fyearq fqtr; /* Replace missing &var.q with quarterized version &var.y_q*/ %do i=1 %to %nwords(&qvars_y.); %let var_ytd = %scan(&qvars_y., &i, %str(' ')); %let var = %sysfunc(prxchange(s/y$//, 1, &var_ytd.)); if missing(&var.q) then &var.q = &var.y_q; drop &var.y_q; %end; /* Create Quarterly Variables to Keep */ ni_qtr = ibq; sale_qtr = saleq; ocf_qtr = coalesce(oancfq, ibq + dpq - coalesce(wcaptq, 0)); *Cumulate Income/CF Items Over 4 Quarters (This should be made automatic at some point; %let yrl_vars = cogsq xsgaq xintq dpq txtq xrdq dvq spiq saleq revtq cogsq xoprq oibdpq oiadpq ibq niq xidoq nopiq miiq piq xiq xidocq capxq oancfq ibcq dpcq wcaptq prstkcq sstkq purtshrq dsq dltrq ltdchq dlcchq fincfq fiaoq txbcofq dvtq; %do i=1 %to %nwords(&yrl_vars.); %let var_yrl = %scan(&yrl_vars., &i, %str(' ')); %let var_yrl_name = %sysfunc(prxchange(s/q$//, 1, &var_yrl.)); &var_yrl_name. = %ttm(&var_yrl.); if (gvkey^=lag3(gvkey) or fyr^=lag3(fyr) or curcdq^=lag3(curcdq) or %ttm(fqtr)^=10) then &var_yrl_name. = .; if missing(&var_yrl_name.) and fqtr=4 then &var_yrl_name. = &var_yrl_name.y; * If financial quarter is 4, the ytd variable is yearly; drop &var_yrl. 
&var_yrl_name.y; %end; * Rename All Other (Balance Sheet and CURCDQ) Items to Facilitate Merge (This should be made automatic at some point); %let bs_vars = seqq ceqq pstkq icaptq mibq gdwlq req atq actq invtq rectq ppegtq ppentq aoq acoq intanq cheq ivaoq ivstq ltq lctq dlttq dlcq txpq apq lcoq loq txditcq txdbq; %do i=1 %to %nwords(&bs_vars.); %let var_bs = %scan(&bs_vars., &i, %str(' ')); %let var_bs_name = %sysfunc(prxchange(s/q$//, 1, &var_bs.)); rename &var_bs. = &var_bs_name.; %end; rename curcdq = curcd; run; %mend; %temp(); /* Ensure One Obs pr. Datadate */ proc sort data=__compq3 nodupkey; by gvkey datadate fyr; run; data __compq4; set __compq3; by gvkey datadate; if last.datadate; drop fyr fyearq fqtr; run; /*THEIS: US->1432 Observations are deleted in this step*/ /* Add empty quarterly variables to annual data */ data __compa2; set __compa1; ni_qtr = .; sale_qtr = .; ocf_qtr = .; run; /* Add Market Equity at Fiscal End */ proc sql; create table __me_data as select distinct gvkey, eom, me_company as me_fiscal from &me_data. where not missing(gvkey) and primary_sec=1 and not missing(me_company) and common=1 and obs_main=1 /* Notice, exch_main is not a requirement */ group by gvkey, eom having me_company=max(me_company); quit; proc sql; create table __compa3 as select a.*, b.me_fiscal from __compa2 as a left join __me_data as b on a.gvkey = b.gvkey and a.datadate = b.eom; quit; proc sql; create table __compq5 as select a.*, b.me_fiscal from __compq4 as a left join __me_data as b on a.gvkey = b.gvkey and a.datadate = b.eom; quit; /* Include Helper Variables */ %if &include_helpers_vars.=1 %then %do; %let qdata = __compq6; %let adata = __compa4; %add_helper_vars(data = __compq5, out=&qdata.); %add_helper_vars(data = __compa3, out=&adata.); proc delete data=__compq5 __compa3; run; %end; %else %do; %let qdata = __compq5; %let adata = __compa3; %end; /* Output */ proc sort data= &adata. out=acc_std_ann nodupkey; by gvkey datadate; run; proc sort data= &qdata. 
out=acc_std_qtr nodupkey; by gvkey datadate; run; proc delete data=__compq1 __compq2 __compq3 __compq4 __compa1 __compa2 &qdata. &adata.; run; %mend standardized_accounting_data; ********************************************************************************************************************** * MACRO - Create Accounting Characteristics from Compustat Standardized Data ********************************************************************************************************************* Description: The main functionality of this macro is to take the output from %standardized_accounting_data and create characteristics that require accounting data. !!! When we are satisfied with the data, we should really include labels for all accounting characteristics !!! ; %macro create_acc_chars(data=, out=, lag_to_public=, max_data_lag=, __keep_vars=, me_data=, suffix=); /* Helper Macros */ %macro mean_year(var); mean(&var, lag12(&var)) %mend; /* Note that this function will still return a value even if one of the lags are missing. To change this do (var+lag12(var))/2. Could also consider if we should take the average of all observable observations? ie var+lag1(var)+lag2(var)...+lag12(var)*/ %macro apply_to_lastq(x=, _qtrs=, func=); /* The Macro Below is a Generic way of creating lag from current to &n*/ %let mv = &func.(&x.; %do _i=1 %to &_qtrs.-1; %let mv = &mv., lag%eval(&_i.*3)(&x.); %end; %let mv = &mv.); &mv.; %mend apply_to_lastq; %macro apply_to_lasty(x=, yrs=, func=); /* The Macro Below is a Generic way of creating ANNUAL lags from current to &n */ %let mv = &func.(&x.; %do _i=1 %to &yrs.-1; %let mv = &mv., lag%eval(&_i.*12)(&x.); %end; %let mv = &mv.); &mv.; %mend apply_to_lasty; /* Start Procedure */ proc sort data=&data. 
out=__chars3; by gvkey curcd datadate; run; /* deleted _chars1 and __chars2 steps*/ data __chars4; set __chars3; by gvkey curcd; retain count; if first.curcd then count = 1; else count = count+1; run; /* Create Accounting Characteristics */ data __chars5; set __chars4; by gvkey curcd; /* Accounting Based Size Measures */ assets = at_x; sales = sale_x; book_equity = be_x; net_income = ni_x; /* Growth Characteristics */ %let growth_vars = at_x ca_x nca_x /* Assets - Aggregated */ lt cl_x ncl_x /* Liabilities - Aggregated */ be_x pstk_x debt_x /* Financing Book Values */ sale_x cogs xsga opex_x /* Sales and Operating Costs */ capx invt ; * 1yr Growth; %do i=1 %to %nwords(&growth_vars.); %let var_gr1 = %scan(&growth_vars., &i, %str(' ')); %var_growth(var_gr=&var_gr1., horizon=12); %end; * 3yr Growth; %do i=1 %to %nwords(&growth_vars.); %let var_gr3 = %scan(&growth_vars., &i, %str(' ')); %var_growth(var_gr=&var_gr3., horizon = 36); %end; /* Change Scaled by Asset Characteristics */ %let ch_asset_vars = che invt rect ppegt ivao ivst intan /* Assets - Individual Items */ dlc ap txp dltt txditc /* Liabilities - Individual Items*/ coa_x col_x cowc_x ncoa_x ncol_x nncoa_x /* Operating Assets/Liabilities */ oa_x ol_x /* Operating Assets/Liabilities */ fna_x fnl_x nfna_x /* Financial Assets/Liabilities */ gp_x ebitda_x ebit_x ope_x ni_x nix_x dp /* Income Statement*/ fincf_x ocf_x fcf_x nwc_x /* Aggreagted Cash Flow */ eqnetis_x dltnetis_x dstnetis_x dbnetis_x netis_x /* Financing Cash Flow */ eqnpo_x txt /* Tax Change */ eqbb_x eqis_x div_x eqpo_x /* Financing Cash Flow */ capx be_x ; * 1yr Change Scaled by Assets; %do i=1 %to %nwords(&ch_asset_vars.); %let var_gr1a = %scan(&ch_asset_vars., &i, %str(' ')); %chg_to_assets(var_gra = &var_gr1a., horizon = 12); %end; * 3yr Change Scaled by Assets; %do i=1 %to %nwords(&ch_asset_vars.); %let var_gr3a = %scan(&ch_asset_vars., &i, %str(' ')); %chg_to_assets(var_gra = &var_gr3a., horizon = 36); %end; /* Investment Measure */ capx_at = 
capx/at_x; rd_at = xrd/at_x; /* Non-Recurring Items */ spi_at = spi/at_x; xido_at = xido_x/at_x; nri_at = (spi+xido_x)/at_x; /* Non-Recurring Items */ /*Profitability Ratios and Rates of Return*/ * Profit Margins; gp_sale = gp_x/sale_x; /* Gross Profit Margin*/ ebitda_sale = ebitda_x/sale_x; /* Operating Profit Margin before Depreciation */ ebit_sale = ebit_x/sale_x; /* Operating profit Margin after Depreciation */ pi_sale = pi_x/sale_x; /* Pretax Profit Margin */ ni_sale = ni_x/sale_x; /* Net Profit Margin Before XI */ nix_sale = ni/sale_x; /* Net Profit Margin */ ocf_sale = ocf_x/sale_x; /* Cash Flow Margin */ fcf_sale = fcf_x/sale_x; * Return on Assets; gp_at = gp_x/at_x; ebitda_at = ebitda_x/at_x; ebit_at = ebit_x/at_x; fi_at = fi_x/at_x; cop_at = cop_x/at_x; ni_at = ni_x/at_x; * Return on Book Equity; ope_be = ope_x/be_x; ni_be = ni_x/be_x; nix_be = nix_x/be_x; ocf_be = ocf_x/be_x; fcf_be = fcf_x/be_x; * Return on Invested Book Capital; gp_bev = gp_x/bev_x; ebitda_bev = ebitda_x/bev_x; ebit_bev = ebit_x/bev_x; /* Pre tax Return on Book Enterprise Value */ fi_bev = fi_x/bev_x; /* ROIC */ cop_bev = cop_x/bev_x; /* Cash Based Operating Profit to Invested Capital */ * Return on Physical Capital; gp_ppen = gp_x/ppent; ebitda_ppen = ebitda_x/ppent; fcf_ppen = fcf_x/ppent; * Issuance Variables; fincf_at = fincf_x / at_x; netis_at = netis_x / at_x; eqnetis_at = eqnetis_x / at_x; eqis_at = eqis_x / at_x; dbnetis_at = dbnetis_x / at_x; dltnetis_at = dltnetis_x / at_x; dstnetis_at = dstnetis_x / at_x; /* Equity Payout */ eqnpo_at = eqnpo_x / at_x; eqbb_at = eqbb_x / at_x; div_at = div_x / at_x; * Accruals; oaccruals_at = oacc_x/at_x; /* Operating Accruals */ oaccruals_ni = oacc_x/abs(nix_x); /* Percent Operating Accruals */ taccruals_at = tacc_x/at_x; /* Total Accruals */ taccruals_ni = tacc_x/abs(nix_x); /* Percent Total Accruals */ noa_at = noa_x/lag12(at_x); /* Net Operating Asset to Total Assets*/ if count <= 12 or lag12(at_x) <= 0 then do; noa_at = .; end; 
/*Capitalization/Leverage Ratios Book*/ be_bev = be_x/bev_x; /* Common Equity as % of Book Enterprise Value*/ debt_bev = debt_x/bev_x; /* Total Debt as % of Book Enterprise Value*/ cash_bev = che/bev_x; /* Cash and Short-Term Investments to Book Enterprise Value */ pstk_bev = pstk_x/bev_x; /* Prefered Stock to Book Enterprise Value */ debtlt_bev = dltt/bev_x; /* Long-term debt as % of Book Enterprise Value */ debtst_bev = dlc/bev_x; /* Short-term debt as % of Book Enterprise Value */ /*Financial Soundness Ratios*/ int_debt = xint/debt_x; /* Interest as % of average total debt*/ int_debtlt = xint/dltt; /* Interest as % of average long-term debt*/ ebitda_debt = ebitda_x/debt_x; /* Ebitda to total debt*/ profit_cl = ebitda_x/cl_x; /* Profit before D&A to current liabilities*/ ocf_cl = ocf_x/cl_x; /* Operating cash flow to current liabilities*/ ocf_debt = ocf_x/debt_x; /* Operating cash flow to total debt*/ cash_lt = che/lt; /* Cash balance to Total Liabilities*/ inv_act = invt/act; /*inventory as % of current assets*/ rec_act = rect/act; /*receivables as % of current assets*/ debtst_debt = dlc/debt_x; /*short term term as % of total debt*/ cl_lt = cl_x/lt; /*current liabilities as % of total liabilities*/ debtlt_debt = dltt/debt_x; /*long-term debt as % of total liabilities*/ lt_ppen = lt/ppent; /*total liabilities to total tangible assets*/ debtlt_be = dltt/be_x; /*long-term debt to book equity*/ opex_at = opex_x/at_x; /* Operating Leverage ala Novy-Marx (2011) */ nwc_at = nwc_x/at_x; if ocf_x>0 then fcf_ocf = fcf_x/ocf_x; /*Free Cash Flow/Operating Cash Flow*/ /*Solvency Ratios*/ debt_at = debt_x/at_x; /*Debt-to-assets*/ debt_be = debt_x/be_x; /*debt to shareholders' equity ratio*/ ebit_int = ebit_x/xint; /*interest coverage ratio*/ /*Liquidity Ratios*/ inv_days = %mean_year(invt)/cogs * 365; /* Days Inventory Outstanding */ rec_days = %mean_year(rect)/sale_x * 365; /* Days Sales Outstanding */ ap_days = %mean_year(ap)/cogs * 365; /* Days Accounts Payable 
Outstanding */ if count<=12 then do; array var_liq inv_days rec_days ap_days; do over var_liq; var_liq=.; end; end; cash_conversion = inv_days + rec_days - ap_days; /* Cash Conversion Cycle*/ if cash_conversion<0 then cash_conversion =.; if cl_x>0 then do; cash_cl = che/cl_x; /* Cash Ratio*/ caliq_cl = caliq_x/cl_x; /* Quick Ratio (acid test)*/ ca_cl = ca_x/cl_x; /* Current Ratio*/ end; /*Activity/Efficiency Ratios*/ inv_turnover = cogs/%mean_year(invt); /* Inventory Turnover */ at_turnover = sale_x/%mean_year(at_x); /* Asset Turnover */ rec_turnover = sale_x/%mean_year(rect); /* Receivables Turnover */ ap_turnover = (cogs+dif12(invt))/%mean_year(ap); /* Account Payables Turnover */ if count<=12 then do; array var_turn inv_turnover at_turnover rec_turnover ap_turnover; do over var_turn; var_turn=.; end; end; /*Miscallenous Ratios*/ adv_sale = xad/sale_x; /*advertising as % of sales*/ staff_sale = xlr/sale_x; /*labor expense as % of sales*/ sale_bev = sale_x/bev_x; /*sale per $ Book Enterprise Value*/ rd_sale = xrd/sale_x; sale_be = sale_x/be_x; /*sales per $ total stockholders' equity*/ if coalesce(nix_x, ni_x)>0 then div_ni = div_x/nix_x; /*Dividend payout ratio. 
THEIS: I added ib as a possibility*/ if nwc_x>0 then sale_nwc = sale_x/nwc_x; /*sales per $ working capital*/ if pi_x>0 then tax_pi = txt/pi_x; /*effective tax rate*/ /* NEW VARIABLES */ cash_at = che / at_x; if at_x <= 0 then cash_at = .; * Employees based variables; ni_emp = ni_x / emp; if emp <= 0 then ni_emp = .; sale_emp = sale_x / emp; if emp <= 0 then sale_emp = .; sale_emp_gr1 = sale_emp / lag12(sale_emp) - 1; /* Labor force efficiency */ if count <= 12 or lag12(sale_emp) <= 0 then sale_emp_gr1 = .; emp_gr1 = (emp - lag12(emp)) / (0.5 * emp + 0.5 * lag12(emp)); if count <= 12 or emp_gr1 = 0 or (0.5 * emp + 0.5 * lag12(emp)) = 0 then emp_gr1 = .; * Number of Consecutive Earnings Increases; ni_inc = ni_x > lag12(ni_x); if missing(ni_x) or missing(lag12(ni_x)) then ni_inc = .; ni_inc8q = 0; no_decrease = 1; %do q = 0 %to 7; %let ql = %sysevalf(&q.*3); if lag&ql.(ni_inc) = 1 and no_decrease = 1 then ni_inc8q = ni_inc8q + 1; else no_decrease = 0; %end; n_ni_inc = %apply_to_lastq(x = not missing(ni_inc), _qtrs = 8, func = sum); if missing(ni_inc) or n_ni_inc ^= 8 or count < 33 then ni_inc8q = .; drop no_decrease n_ni_inc; * 1yr Change Scaled by Lagged Assets; %let ch_asset_lag_vars = noa_x ppeinv_x ; %do i=1 %to %nwords(&ch_asset_lag_vars.); %let var_gr1al = %scan(&ch_asset_lag_vars., &i, %str(' ')); %let name_gr1al = %sysfunc(tranwrd(&var_gr1al, _x, %str())); /* Remove '_x' from var name */ &name_gr1al._gr1a = (&var_gr1al-lag12(&var_gr1al))/lag12(at_x); if count<=12 or lag12(at_x)<=0 then &name_gr1al._gr1a = .; %end; * 1yr Change Scaled by Average Assets; %let ch_asset_avg_vars = lnoa_x ; %do i=1 %to %nwords(&ch_asset_avg_vars.); %let var_gr1aa = %scan(&ch_asset_avg_vars., &i, %str(' ')); %let name_gr1aa = %sysfunc(tranwrd(&var_gr1aa, _x, %str())); /* Remove '_x' from var name */ &name_gr1aa._gr1a = (&var_gr1aa-lag12(&var_gr1aa))/(at_x + lag12(at_x)); if count<=12 or (at_x + lag12(at_x))<=0 then &name_gr1aa._gr1a = .; %end; * CAPEX growth over 2 years; 
%var_growth(var_gr=capx, horizon=24); * Quarterly Profitability Measures; saleq_gr1 = sale_qtr / lag12(sale_qtr) - 1; if count <= 12 or lag12(sale_qtr) < 0 then saleq_gr1 = .; niq_be = ni_qtr / lag3(be_x); if count <= 3 or lag3(be_x) < 0 then niq_be = .; niq_at = ni_qtr / lag3(at_x); if count <= 3 or lag3(at_x) < 0 then niq_at = .; niq_be_chg1 = niq_be - lag12(niq_be); niq_at_chg1 = niq_at - lag12(niq_at); if count <= 12 then do; niq_be_chg1 = .; niq_at_chg1 = .; end; * R&D capital-to-assets; rd5_at = (xrd + lag12(xrd)*0.8 + lag24(xrd)*0.6 + lag36(xrd)*0.4 + lag48(xrd)*0.2) / at_x; if count <= 48 or at_x <= 0 then rd5_at = .; * Abarbanell and Bushee (1998); %chg_to_exp(var_ce = sale_x); %chg_to_exp(var_ce = invt); %chg_to_exp(var_ce = rect); %chg_to_exp(var_ce = gp_x); %chg_to_exp(var_ce = xsga); dsale_dinv = sale_ce - invt_ce; dsale_drec = sale_ce - rect_ce; dgp_dsale = gp_ce - sale_ce; dsale_dsga = sale_ce - xsga_ce; drop sale_ce invt_ce rect_ce gp_ce xsga_ce; * Earnings and Revenue 'Surpise'; %standardized_unexpected(var=sale_qtr, qtrs = 8, qtrs_min = 6); %standardized_unexpected(var=ni_qtr, qtrs = 8, qtrs_min = 6); * Abnormal Corporate Investment; __capex_sale = capx / sale_x; if sale_x <= 0 then __capx_sale = .; capex_abn = __capex_sale / ((lag12(__capex_sale) + lag24(__capex_sale) + lag36(__capex_sale)) / 3) - 1; if count <= 36 then capex_abn = .; drop __capex_sale; /* Profit scaled by lagged */ op_atl1 = op_x / lag12(at_x); if count <= 12 or lag12(at_x) <= 0 then op_atl1 = .; gp_atl1 = gp_x / lag12(at_x); if count <= 12 or lag12(at_x) <= 0 then gp_atl1 = .; ope_bel1 = ope_x / lag12(be_x); if count <= 12 or lag12(be_x) <= 0 then ope_bel1 = .; cop_atl1 = cop_x / lag12(at_x); if count <= 12 or lag12(at_x) <= 0 then cop_atl1 = .; /* Profitability Measures*/ pi_nix = pi_x / nix_x; if pi_x <= 0 or nix_x <= 0 then pi_nix = .; ocf_at = ocf_x / at_x; op_at = op_x / at_x; if at_x <= 0 then do; ocf_at = .; op_at = .; end; ocf_at_chg1 = ocf_at - lag12(ocf_at); if count 
<= 12 then ocf_at_chg1 = .; /* Book Leverage */ at_be = at_x / be_x; /* Volatility Quarterly Items */ __ocfq_saleq = ocf_qtr / sale_qtr; __niq_saleq = ni_qtr / sale_qtr; if sale_qtr <= 0 then do; __ocfq_saleq = .; __niq_saleq = .; end; __roeq = ni_qtr / be_x; if be_x <= 0 then __roeq = .; %volq(name = ocfq_saleq_std, var = __ocfq_saleq, qtrs = 16, qtrs_min = 8); %volq(name = niq_saleq_std, var = __niq_saleq, qtrs = 16, qtrs_min = 8); %volq(name = roeq_be_std, var = __roeq, qtrs = 20, qtrs_min = 12); drop __ocfq_saleq __niq_saleq __roeq; /* Volatility Annual Items*/ __roe = ni_x / be_x; if be_x <= 0 then __roe = .; %vola(name = roe_be_std, var = __roe, yrs = 5, yrs_min = 5); drop __roe; /* Asset Tangibility */ tangibility = (che + 0.715 * rect + 0.547 * invt + 0.535 * ppegt) / at_x; * Earnings Smoothness; %earnings_variability(esm_h=5); * Asset Liquidity; aliq_at = aliq_x / lag12(at_x); if count <= 12 or lag12(at_x) <= 0 then aliq_at = .; * Equity Duration Helper Variables; %equity_duration_cd(horizon=10, r=0.12, roe_mean=0.12, roe_ar1=0.57, g_mean=0.06, g_ar1=0.24); * Pitroski F-Score; %pitroski_f(name = f_score); * Ohlson (1980) O-score; %ohlson_o(name = o_score); * Altman (1968) Z-score; %altman_z(name = z_score); * Intrinsic ROE based value from Frankel and Lee (1998); %intrinsic_value(name = intrinsic_value, r=0.12); * Kaplan-Zingales Index; %kz_index(name=kz_index); * 5 year ratio change (For quality minus junk variables); %chg_var1_to_var2(name=gpoa_ch5, var1=gp_x, var2=at_x, horizon=60); %chg_var1_to_var2(name=roe_ch5, var1=ni_x, var2=be_x, horizon=60); %chg_var1_to_var2(name=roa_ch5, var1=ni_x, var2=at_x, horizon=60); %chg_var1_to_var2(name=cfoa_ch5, var1=ocf_x, var2=at_x, horizon=60); %chg_var1_to_var2(name=gmar_ch5, var1=gp_x, var2=sale_x, horizon=60); /* Delete Helper Variables */ drop count; run; /* Create earningspersistence */ %earnings_persistence(out=earnings_pers, data=__chars5, __n=5, __min=5); proc sql; create table __chars6 as select a.*, 
b.ni_ar1, b.ni_ivol from __chars5 as a left join earnings_pers as b on a.gvkey = b.gvkey and a.curcd=b.curcd and a.datadate=b.datadate; quit; /* Keep only dates with accounting data */ data __chars7; set __chars6; where data_available=1; run; /* Expand by Public Availability */ * Would be great to change start_date to filling_date or some derivative which was a function of fqtr; proc sort data=__chars7; by gvkey descending datadate; run; data __chars8; set __chars7; by gvkey; start_date = intnx('month', datadate, &lag_to_public.,'e'); format start_date YYMMDDN8.; next_start_date = lag(start_date); if first.gvkey then next_start_date=.; end_date = min(intnx('month', next_start_date, -1, 'e'), intnx('month', datadate, &max_data_lag., 'e')); format end_date YYMMDDN8.; drop next_start_date; run; %expand(data=__chars8, out=__chars9, id_vars=gvkey, start_date=start_date, end_date=end_date, freq='month', new_date_name=public_date); /* Convert All Raw (non-scaled) Variables to USD [2]*/ %compustat_fx(out=__fx); proc sql; create table __chars10 as select a.*, b.fx from __chars9 as a left join __fx as b on a.curcd=b.curcdd and a.public_date=b.date; quit; data __chars11; set __chars10; array var_raw assets sales book_equity net_income; do over var_raw; var_raw = var_raw*fx; end; drop curcd; run; /* Create Ratios using both Accounting and Market Values */ * Note that valuation ratios are created at the company level; proc sql; create table __me_data1 as select distinct gvkey, eom, me_company /* Include id for join with daily std */ from &me_data. 
where not missing(gvkey) and primary_sec=1 and not missing(me_company) and common=1 and obs_main=1 /* Notice, exch_main is not a requirement */ group by gvkey, eom having me_company=max(me_company); quit; proc sql; create table __chars12 as select a.*, b.me_company from __chars11 as a left join __me_data1 as b on a.gvkey=b.gvkey and a.public_date=b.eom; quit; proc sort data=__chars12 out=__chars13 nodupkey; by gvkey public_date; run; /*THEIS: Global-> No duplicates US-> 3464 duplicate observations where deleted*/ data __chars14; set __chars13; /* Prepare Data */ mev = me_company+netdebt_x*fx; /* Enterprise Value (in Dollars) */ mat = at_x * fx - be_x * fx + me_company; /* Market Asset Value */ if mev <= 0 then mev = .; if me_company <= 0 then me_company = .; if mat <= 0 then mat = .; /* Characteristics Scaled by Market Equity */ %let me_vars = at_x be_x debt_x netdebt_x che sale_x gp_x ebitda_x ebit_x ope_x ni_x nix_x cop_x ocf_x fcf_x div_x eqbb_x eqis_x eqpo_x eqnpo_x eqnetis_x xrd; %do i=1 %to %nwords(&me_vars.); %let var_me = %scan(&me_vars., &i, %str(' ')); %let name_me = %sysfunc(tranwrd(&var_me., _x, %str())); /* Remove '_x' from var name */ &name_me._me = (&var_me.*fx)/me_company; %end; ival_me = (intrinsic_value*fx) / me_company; /* Characteristics Scaled by Market Enterprise Value */ %let mev_vars = at_x bev_x ppent be_x che sale_x gp_x ebitda_x ebit_x ope_x ni_x nix_x cop_x ocf_x fcf_x debt_x pstk_x dltt dlc dltnetis_x dstnetis_x dbnetis_x netis_x fincf_x; %do i=1 %to %nwords(&mev_vars.); %let var_mev = %scan(&mev_vars., &i, %str(' ')); %let name_mev = %sysfunc(tranwrd(&var_mev., _x, %str())); /* Remove '_x' from var name */ &name_mev._mev = (&var_mev.*fx)/mev; %end; /* Characteristics Scaled by Market Assets */ aliq_mat = aliq_x * fx / lag12(mat); if gvkey ^= lag12(gvkey) then aliq_mat = .; /* Size Measure */ enterprise_value = mev; /* Equity Duration */ eq_dur = ed_cd_w * fx / me_company + ed_constant * (me_company - ed_cd * fx) / me_company; if ed_err 
= 1 or eq_dur <= 0 then eq_dur = .; run; /* Format Output */ proc sql noprint; select name into :col_names separated by ' ' from dictionary.columns where libname=upcase("work") and memname = upcase("__chars14"); quit; data __chars15; set __chars14; %do i=1 %to %nwords(&col_names.); %let old_name = %scan(&col_names., &i, %str(' ')); %let new_name = %sysfunc(prxchange(s/xrd/rd/, 1, &old_name.)); /* Replace XRD with RD */ %let new_name = %sysfunc(prxchange(s/xsga/sga/, 1, &new_name.)); /* Replace XSGA with SGA */ %let new_name = %sysfunc(prxchange(s/dlc/debtst/, 1, &new_name.)); /* Replace DLC with DEBTST */ %let new_name = %sysfunc(prxchange(s/dltt/debtlt/, 1, &new_name.)); /* Replace DLTT with DEBTLT */ %let new_name = %sysfunc(prxchange(s/oancf/ocf/, 1, &new_name.)); /* Replace OANCF with OCF */ %let new_name = %sysfunc(prxchange(s/ppegt/ppeg/, 1, &new_name.)); /* Replace PPEGT with PPEG */ %let new_name = %sysfunc(prxchange(s/ppent/ppen/, 1, &new_name.)); /* Replace PPENT with PPEN */ %let new_name = %sysfunc(prxchange(s/che/cash/, 1, &new_name.)); /* Replace CHE with CASH */ %let new_name = %sysfunc(prxchange(s/invt/inv/, 1, &new_name.)); /* Replace INVT with INV */ %let new_name = %sysfunc(prxchange(s/rect/rec/, 1, &new_name.)); /* Replace RECT with REC */ %let new_name = %sysfunc(prxchange(s/txt/tax/, 1, &new_name.)); /* Replace TXT with TAX */ %let new_name = %sysfunc(prxchange(s/ivao/lti/, 1, &new_name.)); /* Replace IVAO with LTI */ %let new_name = %sysfunc(prxchange(s/ivst/sti/, 1, &new_name.)); /* Replace IVST with STI */ %let new_name = %sysfunc(prxchange(s/sale_qtr/saleq/, 1, &new_name.)); /* Replace SALE_QTR with SALEQ */ %let new_name = %sysfunc(prxchange(s/ni_qtr/niq/, 1, &new_name.)); /* Replace SALE_QTR with SALEQ */ %let new_name = %sysfunc(prxchange(s/ocf_qtr/ocfq/, 1, &new_name.)); /* Replace SALE_QTR with SALEQ */ rename &old_name. 
= &new_name.; %end; run; * Reorder and Keep only Selected Columns; data __chars16; retain source gvkey datadate public_date assets sales book_equity net_income enterprise_value; set __chars15; keep source gvkey public_date datadate &__keep_vars.; run; * Add suffix if specified; %if %length(&suffix.)>0 %then %do; data __chars16; set __chars16; %do i=1 %to %nwords(&__keep_vars.); %let var_x = %scan(&__keep_vars., &i, %str(' ')); rename &var_x. = &var_x.&suffix.; %end; rename datadate=datadate&suffix.; run; %end; proc sort nodupkey data=__chars16 out=&out.; by gvkey public_date; run; proc delete data= __chars3 __chars4 __chars5 __chars6 __chars7 __chars8 __chars9 __chars10 __chars11 __chars12 __chars13 __chars14 __chars15 __chars16 __me_data __me_data1 __fx earnings_pers; run; %mend create_acc_chars; /* Combine Characteristics from Annual and Quarterly Data */ %macro combine_ann_qtr_chars(out=, ann_data=, qtr_data=, __char_vars=, q_suffix=); proc sql; create table __acc_chars1 as select a.*, b.* from &ann_data. as a left join &qtr_data. as b on a.gvkey=b.gvkey and a.public_date=b.public_date; quit; /* Substitute Annual Characteristic for Quarterly if Quarterly is more recent */ data __acc_chars2; set __acc_chars1; %do i=1 %to %nwords(&__char_vars.); %let ann_var = %scan(&__char_vars., &i.); %let qtr_var = &ann_var.&q_suffix.; if missing(&ann_var.) or (not missing(&qtr_var.) and datadate&q_suffix. > datadate) then /* Didn't include the first part before! */ &ann_var. 
= &qtr_var.; drop &qtr_var.; %end; drop datadate datadate&q_suffix.; /* We can no longer be sure which items accounting dates refer to */ run; proc sort nodupkey data=__acc_chars2 out=&out; by gvkey public_date; run; proc delete data=__acc_chars1 __acc_chars2; run; %mend combine_ann_qtr_chars; ================================================ FILE: GlobalFactors/char_macros.sas ================================================ /* MACROS USING COMPOSITE DATA */ /* MACRO: MISPRICING_FACTORS - Based on the paper by Yuan and Stambaugh (2016) - Currently, the distress probability anomaly is not implemented. - I use fractional ranks i.e. ranks from 0 to 1. I think they use absolute ranks in the paper. */ %macro mispricing_factors(out=, data=, min_stks=, min_fcts=3); proc sql; create table chars1 as select id, eom, excntry, chcsho_12m, eqnpo_12m, oaccruals_at, noa_at, at_gr1, ppeinv_gr1a, o_score, ret_12_1, gp_at, niq_at from &data. where common=1 and primary_sec=1 and obs_main=1 and exch_main = 1 and not missing(ret_exc) and not missing(me) order by excntry, eom; quit; %let __vars = chcsho_12m eqnpo_12m oaccruals_at noa_at at_gr1 ppeinv_gr1a o_score ret_12_1 gp_at niq_at; %let __direction = -1 1 -1 -1 -1 -1 -1 1 1 1; %do i=1 %to 10; %let __v = %scan(&__vars., &i, %str(' ')); %let __d = %scan(&__direction., &i, %str(' ')); %if &__d. = 1 %then %do; %let __sort=; %end; %else %do; %let __sort=descending; %end; proc sql; create table __subset as select * from chars1 group by excntry, eom having count(&__v.) >= &min_stks.; quit; proc rank data=__subset out = __ranks(keep=excntry id eom rank_&__v.) &__sort. ties=mean f; by excntry eom; var &__v.; ranks rank_&__v.; run; proc sql; create table chars%eval(&i.+1) as select a.*, b.rank_&__v. from chars&i. 
as a left join __ranks as b on a.id=b.id and a.eom=b.eom; quit; %end; data &out.; set chars11; mispricing_perf = mean(rank_o_score, rank_ret_12_1, rank_gp_at, rank_niq_at); if missing(rank_o_score) + missing(rank_ret_12_1) + missing(rank_gp_at) + missing(rank_niq_at) > &min_fcts. then mispricing_perf = .; mispricing_mgmt = mean(rank_chcsho_12m, rank_eqnpo_12m, rank_oaccruals_at, rank_noa_at, rank_at_gr1, rank_ppeinv_gr1a); if missing(rank_chcsho_12m) + missing(rank_eqnpo_12m) + missing(rank_oaccruals_at) + missing(rank_noa_at) + missing(rank_at_gr1) + missing(rank_ppeinv_gr1a) > &min_fcts. then mispricing_mgmt = .; keep id eom mispricing_perf mispricing_mgmt; run; %mend; * MACRO: QUALITY MINUS JUNK - Based on the paper by Asness, Frazzini and Pedersen (2018) - I deviate slightly from the original paper in the variable construction - The most clear deviation is the way the growth variables are created. ; %macro quality_minus_junk(out=, data=, min_stks=); /* Helper Macro */ %macro z_ranks(out=, data=, var=, min=, sort=); proc sql; create table __subset as select * from &data. group by excntry, eom having count(&var.) >= &min.; quit; proc rank data=__subset out = __ranks(keep=excntry id eom rank_&var.) &sort. ties=mean; by excntry eom; var &var.; ranks rank_&var.; run; proc sql; create table &out. as select excntry, id, eom, (rank_&var. - mean(rank_&var.)) / std(rank_&var.) as z_&var. from __ranks where not missing(rank_&var.) group by excntry, eom; quit; proc delete data=__subset __ranks; run; %mend; proc sql; create table qmj1 as select id, eom, excntry, coalesce(roeq_be_std*2, roe_be_std) as __evol, /* I multiply the quarterly measure by sqrt(4)=2 to reflect that quarterly measures are less volatile than the annual measure. Empirically, this seems to be a reasonable approximation although perhaps a slightly higher multiplied could be used e.g. 
2.5*/ gp_at, ni_be, ni_at, ocf_at, gp_sale, oaccruals_at, gpoa_ch5, roe_ch5, roa_ch5, cfoa_ch5, gmar_ch5, betabab_1260d, debt_at, o_score, z_score from &data. where common=1 and primary_sec=1 and obs_main=1 and exch_main=1 and not missing(ret_exc) and not missing(me) order by excntry, eom; quit; %let z_vars = gp_at ni_be ni_at ocf_at gp_sale oaccruals_at gpoa_ch5 roe_ch5 roa_ch5 cfoa_ch5 gmar_ch5 betabab_1260d debt_at o_score z_score __evol; %let direction = 1 1 1 1 1 -1 1 1 1 1 1 -1 -1 -1 1 -1; %do i=1 %to 16; %let __v = %scan(&z_vars., &i, %str(' ')); %let __d = %scan(&direction., &i, %str(' ')); %if &__d. = 1 %then %do; %let __sort=; %end; %else %do; %let __sort=descending; %end; %z_ranks(out=__z, data=qmj1, var = &__v., min=&min_stks., sort=&__sort.); proc sql; create table qmj%eval(&i.+1) as select a.*, b.z_&__v. from qmj&i. as a left join __z as b on a.id=b.id and a.eom=b.eom; quit; %if &i.>1 %then %do; proc delete data=qmj&i.; run; %end; %end; data qmj18; set qmj17; __prof = mean(z_gp_at, z_ni_be, z_ni_at, z_ocf_at, z_gp_sale, z_oaccruals_at); __growth = mean(z_gpoa_ch5, z_roe_ch5, z_roa_ch5, z_cfoa_ch5, z_gmar_ch5); __safety = mean(z_betabab_1260d, z_debt_at, z_o_score, z_z_score, z___evol); keep excntry id eom __prof __growth __safety; run; %z_ranks(out=__prof, data=qmj18, var = __prof, min=&min_stks., sort=); %z_ranks(out=__growth, data=qmj18, var = __growth, min=&min_stks., sort=); %z_ranks(out=__safety, data=qmj18, var = __safety, min=&min_stks., sort=); proc sql; create table qmj19 as select a.excntry, a.id, a.eom, b.z___prof as qmj_prof, c.z___growth as qmj_growth, d.z___safety as qmj_safety from qmj18 as a left join __prof as b on a.excntry=b.excntry and a.id=b.id and a.eom=b.eom left join __growth as c on a.excntry=c.excntry and a.id=c.id and a.eom=c.eom left join __safety as d on a.excntry=d.excntry and a.id=d.id and a.eom=d.eom; quit; /* QMJ SCORE! 
*/ data qmj20; set qmj19; __qmj = (qmj_prof + qmj_growth + qmj_safety) / 3; * Missing if any of subcomponents are missing; run; %z_ranks(out=__qmj, data=qmj20, var=__qmj, min = &min_stks., sort=); proc sql; create table &out. as select a.excntry, a.id, a.eom, a.qmj_prof, a.qmj_growth, a.qmj_safety, b.z___qmj as qmj from qmj20 as a left join __qmj as b on a.excntry=b.excntry and a.id=b.id and a.eom=b.eom; quit; proc delete data=qmj1 qmj17 qmj18 qmj19 qmj20; run; %mend; /* MACRO USING RETURN DATA -------------------------------------------------------*/ * MACRO: BIDASK_HL ------------------------- - Corwin-Schultz High-Low Bid-ask Estimator - Heavily inspired by Shane Corwins code: http://sites.nd.edu/scorwin/files/2019/12/Sample-SAS-Program.pdf - Primary change: I adjust prices for stock splits - Arguments: * OUT: Output dataset containing estimates of average monthly bid-ask spread and return volatility * DATA: Input dataset with high and low prices * __min_obs: Minimum amount of daily observations required to compute monthly estimates; %macro bidask_hl(out=, data=, __min_obs=); proc sql; create table __dsf1 as select a.id, a.date, a.eom, a.bidask, a.tvol, a.prc / a.adjfct as prc, a.prc_high / a.adjfct as prc_high, a.prc_low / a.adjfct as prc_low /* Adjust price for stocks splits! */ from &data. 
as a left join scratch.market_returns_daily as b
  on a.excntry=b.excntry and a.date=b.date
  where not missing(b.mkt_vw_exc) /* This ensures that we look at trading days */
  order by id, date;
quit;

* Cleaning data;
/* Screen out invalid daily high/low ranges; prc_low_r/prc_high_r retain the last
   valid range per security (dropped from the output). */
data __dsf2(drop=prc_low_r prc_high_r);
  retain prc_low_r prc_high_r;
  set __dsf1;
  by id date eom;
  * Keep initial values;
  prc_low_in = prc_low;
  prc_high_in = prc_high;
  hlreset = 0;
  * Initial Screens;
  /* Discard the range when the quote is a bid/ask average (bidask=1), the range is
     degenerate (high=low), a price is non-positive, or trading volume is zero. */
  if bidask = 1 or prc_low=prc_high or prc_low<=0 or prc_high<=0 or tvol=0 then do;
    prc_high = .;
    prc_low = .;
  end;
  /* Replace bad/missing price with previous day range */
  if first.id then do; prc_low_r = .; prc_high_r = .; end;
  * Reset retained high and low volume;
  /* NOTE(review): the next statement is garbled -- text between a '<' and a '>'
     appears to have been stripped from this extract, deleting one or more
     statements (the unmatched trailing 'end;' below confirms a lost 'do' block).
     Judging from the surviving hlreset=3 branch and the Corwin-Schultz sample
     program this section should (1) refresh prc_low_r/prc_high_r from a valid
     range, (2) substitute the retained range when today's is bad (hlreset=1/2),
     and (3) shift the range when prc exceeds the retained high (hlreset=3).
     Restore from the upstream repository before running. */
  if 0 prc_high_r then do;
    prc_low = prc_low_r + (prc - prc_high_r);
    prc_high = prc;
    hlreset = 3;
  end;
  end;
  /* Final data screen after H/L reset */
  if prc_low ^= 0 and prc_high/prc_low > 8 then do;
    prc_low = .;
    prc_high = .;
  end;
run;

/* Adjust for overnight returns */
data __dsf3;
  set __dsf2;
  retadj = 0;
  prc_low_t = prc_low;
  prc_high_t = prc_high;
  prc_low_l1 = lag(prc_low);
  prc_high_l1 = lag(prc_high);
  prc_l1 = lag(prc);
  /* Never look back across securities */
  if id ^= lag(id) then do; prc_low_l1 = .; prc_high_l1 = .; prc_l1 = .; end;
  /* NOTE(review): garbled condition -- presumably 'if prc_l1<prc_low and prc_l1>0'
     by symmetry with the mirror branch below; confirm against upstream. */
  if prc_l10 then do;
    * Adjust when prior close is below current low;
    prc_high_t=prc_high-(prc_low-prc_l1);
    prc_low_t=prc_l1;
    retadj=1;
  end;
  if prc_l1>prc_high and prc_l1>0 then do;
    * Adjust when prior close is above current high;
    prc_high_t=prc_l1;
    prc_low_t=prc_low+(prc_l1-prc_high);
    retadj=2;
  end;
run;

/* Calculate daily high/low bid-ask spread*/
data __dsf4;
  set __dsf3;
  pi=constant('PI');
  k2 = sqrt(8/pi);
  const = 3-2*sqrt(2);
  /* Two-day high/low feeding the gamma term of the estimator */
  prc_high_2d=max(prc_high_t,prc_high_l1);
  prc_low_2d=min(prc_low_t,prc_low_l1);
  if prc_low_t>0 and prc_low_l1>0 then beta = (log(prc_high_t/prc_low_t))**2+(log(prc_high_l1/prc_low_l1))**2;
  if prc_low_2d>0 then gamma = (log(prc_high_2d/prc_low_2d))**2;
  alpha = (sqrt(2*beta)-sqrt(beta))/const - sqrt(gamma/const);
  * Calculate spread with missing set to zero;
  spread =
2*(exp(alpha)-1)/(1+exp(alpha)); spread_0 = max(spread,0); * Set negative spread estimates to zero; if spread = . then spread_0 = .; * Calculate daily volatillity; sigma = ((sqrt(beta/2)-sqrt(beta)))/(k2*const)+sqrt(gamma/(k2*k2*const)); sigma_0 = max(sigma,0); * Set negative sigma estimates to zero; if sigma= . then sigma_0 = .; run; /* Monthly bid-ask estimates */ proc sql; create table &out. as select id, eom, mean(spread_0) as bidaskhl_21d, mean(sigma_0) as rvolhl_21d from __dsf4 group by id, eom having count(spread_0) > &__min_obs.; quit; proc delete data=__dsf1 __dsf2 __dsf3 __dsf4; run; %mend; * MACRO: SEASONALITY - Caclulates annual and non-annual seasonality measures following Heston and Sadka (2008) - Specifically, calculate the average return over annual and non-annual lags within the specified start and end dates - Within a given year, the annual lag is lag11 and the non-annual lags are lag0-lag10. - For return predictability, the information should be used to form portfolios at the end of lag0 - Said differently, the seasonality variables should be lagged 1 period relative to returns ; %macro seasonality(start_year=, end_year=); * Return over all lags; __all_ret = 0; __all_n = 0; %do i = %eval((&start_year.-1) * 12) %to %eval(&end_year. * 12 - 1); __all_ret = __all_ret + lag&i(ret_x); __all_n = __all_n + 1; %end; * Return over annual lags; __an_ret = 0; __an_n = 0; %do i = %eval(&start_year.) %to &end_year.; %let __al = %eval(&i. * 12 - 1); __an_ret = __an_ret + lag&__al.(ret_x); __an_n = __an_n + 1; %end; * Return over non-annual lags; __na_ret = __all_ret - __an_ret; __na_n = __all_n - __an_n; * Create Variables; seas_&start_year._&end_year.an = __an_ret / __an_n; seas_&start_year._&end_year.na = __na_ret / __na_n; if count < %eval(&end_year. 
* 12) then do;
    seas_&start_year._&end_year.an = .;
    seas_&start_year._&end_year.na = .;
  end;
  drop __all_ret __all_n __an_ret __an_n __na_ret __na_n;
%mend;

/* MACRO USING ACCOUNTING DATA -------------------------------------------------------*/

* Create Growth in Variable over horizon;
/* Percentage growth of &var_gr. over &horizon. months. Missing during the
   warm-up window (count <= &horizon.) or when the lagged base is non-positive. */
%macro var_growth(var_gr=, horizon=); /* Horizon is in months */
  %let name_gr = %sysfunc(tranwrd(&var_gr., _x, %str())); /* Remove '_x' from var name */
  %let name_gr = &name_gr._gr%sysevalf(&horizon./12); /* Add gr and horizon in years to name */
  &name_gr. = &var_gr./lag&horizon.(&var_gr.)-1;
  if count<=&horizon. or lag&horizon.(&var_gr.)<=0 then &name_gr. = .;
%mend;

* Change in Variable over Horizon Scaled by Assets;
/* Change in &var_gra. over &horizon. months, scaled by current total assets. */
%macro chg_to_assets(var_gra=, horizon=); /* Horizon is in months */
  %let name_gra = %sysfunc(tranwrd(&var_gra., _x, %str())); /* Remove '_x' from var name */
  %let name_gra = &name_gra._gr%sysevalf(&horizon./12); /* Add gr and horizon in years to name */
  %let name_gra = &name_gra.a; /* Add 'a' in the end*/
  &name_gra. = (&var_gra.-lag&horizon.(&var_gra.))/at_x;
  if count<=&horizon. or at_x<=0 then &name_gra. = .;
%mend;

* Ratio Change;
/* Change in the ratio &var1./&var2. over &horizon. months; missing when the
   denominator is non-positive or during the warm-up window. */
%macro chg_var1_to_var2(name=, var1=, var2=, horizon=);
  __x = &var1. / &var2.;
  if &var2. <= 0 then __x=.;
  &name. = (__x - lag&horizon.(__x));
  /* FIX: was 'if count <= horizon' -- a bare (non-existent) data-set variable
     instead of the macro parameter, so the condition was never true and the
     warm-up guard against cross-firm lag contamination did not apply. */
  if count <= &horizon. then &name. = .;
  drop __x;
%mend;

* Change to expectations (Abarnell and Bushee, 1998);
/* Deviation of &var_ce. from its expectation, the average of the 12m and 24m lags. */
%macro chg_to_exp(var_ce=);
  %let name_ce = %sysfunc(tranwrd(&var_ce., _x, %str())); /* Remove '_x' from var name */
  %let name_ce = &name_ce._ce;
  __expect = (lag12(&var_ce.) + lag24(&var_ce.))/2;
  &name_ce. = &var_ce. / (__expect) - 1;
  if count <= 24 or __expect <= 0 then &name_ce. = .;
  drop __expect;
%mend;

* Standardized Unexpected Realization;
* Uses the specification in Jegadeesh and Livnat (2006);
%macro standardized_unexpected(var=, qtrs=, qtrs_min=);
  %let name = %sysfunc(tranwrd(&var., _x, %str())); /* Remove '_x' from var name */
  %let name = &name._su;
  __chg = &var.
- lag12(&var.); /* 12-month-lagged change of the quarterly item */
  __chg_mean = %apply_to_lastq(x = __chg, _qtrs = &qtrs., func = mean);
  __chg_std = %apply_to_lastq(x = __chg, _qtrs = &qtrs., func = std);
  __chg_n = %apply_to_lastq(x = not missing(__chg), _qtrs = &qtrs., func = sum);
  /* Require more than &qtrs_min. non-missing quarterly changes */
  if __chg_n <= &qtrs_min. then do;
    __chg_mean = .;
    __chg_std = .;
  end;
  &name. = (&var. - (lag12(&var.) + lag3(__chg_mean) )) / lag3(__chg_std); /* This is the correct one*/
  if count <= %eval(12 + &qtrs.*3) then &name. = .;
  drop __chg __chg_mean __chg_std __chg_n;
%mend;

* Volatility of Quarterly Data;
/* Rolling std of &var. over the last &qtrs. quarters; missing unless at least
   &qtrs_min. quarterly observations are available and the warm-up window
   (count <= (&qtrs.-1)*3 months) has passed. */
%macro volq(name=, var=, qtrs=, qtrs_min=);
  __n = %apply_to_lastq(x = not missing(&var.), _qtrs = &qtrs., func = sum);
  &name. = %apply_to_lastq(x = &var., _qtrs = &qtrs., func = std);
  if count <= %eval((&qtrs.-1)*3) or __n < &qtrs_min. then &name. = .;
  drop __n;
%mend;

* Volatility of Annual Data;
/* Rolling std of &var. over the last &yrs. annual observations (data kept at
   monthly frequency); requires at least &yrs_min. non-missing values. */
%macro vola(name=, var=, yrs=, yrs_min=);
  __n = %apply_to_lasty(x = not missing(&var.), yrs = &yrs., func = sum);
  &name. = %apply_to_lasty(x = &var., yrs = &yrs., func = std);
  if count <= %eval((&yrs.-1)*12) or __n < &yrs_min. then &name. = .;
  drop __n;
%mend;

* Earnings Smoothness;
/* std(ROA) / std(cash-flow ROA) over the last &esm_h. years, both scaled by
   12-month-lagged total assets; missing without &esm_h. observations of each. */
%macro earnings_variability(esm_h=);
  __roa = ni_x / lag12(at_x);
  __croa = ocf_x / lag12(at_x);
  __roa_n = %apply_to_lasty(x= not missing(__roa), yrs=&esm_h., func=sum);
  __croa_n = %apply_to_lasty(x= not missing(__croa), yrs=&esm_h., func=sum);
  __roa_std = %apply_to_lasty(x=__roa, yrs=&esm_h., func=std);
  __croa_std = %apply_to_lasty(x=__croa, yrs=&esm_h., func=std);
  earnings_variability = __roa_std / __croa_std;
  if count <= %eval(&esm_h. * 12) or __croa_std <= 0 or __roa_n < &esm_h. or __croa_n < &esm_h.
then earnings_variability = .; drop __roa __croa __roa_n __croa_n __roa_std __croa_std; %mend; /* Equity Duration: Forecast of Cash Distribution */ %macro equity_duration_cd(horizon=, r=, roe_mean=, roe_ar1=, g_mean=, g_ar1=); * Create Initial Variables; __roe0 = ni_x / lag12(be_x); __g0 = sale_x / lag12(sale_x) - 1; __be0 = be_x; if count <= 12 or lag12(be_x) <= 1 then __roe0 = .; /* Use 1 million to avoid bad estimates from a small denominator */ if count <= 12 or lag12(sale_x) <= 1 then __g0 = .; /* Use 1 million to avoid bad estimates from a small denominator */ * Forecast Cash Distributions; %let roe_c = &roe_mean.*(1 - &roe_ar1.); %let g_c = &g_mean.*(1 - &g_ar1.); %do i = 1 %to &horizon.; %let j = %eval(&i.-1); __roe&i. = &roe_c. + &roe_ar1. * __roe&j.; __g&i. = &g_c. + &g_ar1. * __g&j.; __be&i. = __be&j. * (1 + __g&i.); __cd&i. = __be&j. * (__roe&i. - __g&i.); %end; * Create Duration Helper Variables; ed_constant = &horizon. + (1 + &r.) / &r.; ed_cd_w = 0; ed_cd = 0; ed_err = 0; %do t = 1 %to &horizon.; ed_cd_w = ed_cd_w + &t. * __cd&t. / (1 + &r.)**&t.; ed_cd = ed_cd + __cd&t. / (1 + &r.)**&t.; if __be&t. 
< 0 then ed_err = 1; %end; drop __roe: __g: __be: __cd:; %mend; * Pitroski (2000) Fundamental Score; %macro pitroski_f(name=); __f_roa = ni_x / lag12(at_x); if count <= 12 or lag12(at_x) <= 0 then __f_roa = .; __f_croa = ocf_x / lag12(at_x); if count <= 12 or lag12(at_x) <= 0 then __f_croa = .; __f_droa = __f_roa - lag12(__f_roa); if count <= 12 then __f_droa = .; __f_acc = __f_croa - __f_roa; __f_lev = dltt / at_x - lag12(dltt / at_x); if count <= 12 or at_x <= 0 or lag12(at_x) <= 0 then __f_lev = .; __f_liq = ca_x / cl_x - lag12(ca_x / cl_x); if count <= 12 or cl_x <= 0 or lag12(cl_x) <= 0 then __f_liq = .; __f_eqis = eqis_x; __f_gm = gp_x / sale_x - lag12(gp_x / sale_x); if count <= 12 or sale_x <= 0 or lag12(sale_x) <= 0 then __f_gm = .; __f_aturn = sale_x / lag12(at_x) - lag12(sale_x) / lag24(at_x); if count <= 24 or lag12(at_x) <= 0 or lag24(at_x) <= 0 then __f_aturn = .; &name. = (__f_roa > 0) + (__f_croa > 0) + (__f_droa > 0) + (__f_acc > 0) + (__f_lev < 0) + (__f_liq > 0) + (coalesce(__f_eqis, 0) = 0) + /* Set __f_eqis to zero if missing. This greatly expands coverage and seems like a reasonable approximation */ (__f_gm > 0) + (__f_aturn > 0); * Only allow __f_eqis to be missing; if missing(__f_roa) or missing(__f_croa) or missing(__f_droa) or missing(__f_acc) or missing(__f_lev) or missing(__f_liq) or missing(__f_gm) or missing(__f_aturn) then &name. 
= .; drop __f_:; %mend; * Ohlson (1980) O-score; %macro ohlson_o(name=); * Create Helpers; __o_lat = log(at_x); __o_lev = debt_x / at_x; __o_wc = (ca_x - cl_x) / at_x; __o_roe = nix_x / at_x; if at_x <= 0 then do; __o_lat = .; __o_lev = .; __o_wc = .; __o_roe = .; end; __o_cacl = cl_x / ca_x; if ca_x <= 0 then __o_cacl = .; __o_ffo = (pi_x + dp) / lt; if lt <= 0 then __o_ffo = .; __o_neg_eq = lt > at_x; if missing(lt) or missing(at_x) then __o_neg_eq = .; __o_neg_earn = (nix_x < 0 and lag12(nix_x) < 0); if count <= 12 or missing(nix_x) or missing(lag12(nix_x)) then __o_neg_earn = .; __o_nich = (nix_x - lag12(nix_x)) / (abs(nix_x) + abs(lag12(nix_x))); if count <= 12 or (abs(nix_x) + abs(lag12(nix_x))) = 0 then __o_nich = .; * Create O-score; &name. = -1.32 - 0.407 * __o_lat + 6.03 * __o_lev - 1.43 * __o_wc + 0.076 * __o_cacl - 1.72 * __o_neg_eq - 2.37 * __o_roe - 1.83 * __o_ffo + 0.285 * __o_neg_earn - 0.52 * __o_nich; %mend; * Altman (1968) Z-score; %macro altman_z(name=); * Create Helpers; __z_wc = (ca_x - cl_x) / at_x; __z_re = re / at_x; __z_eb = ebitda_x / at_x; __z_sa = sale_x / at_x; if at_x <= 0 then do; __z_wc = .; __z_re = .; __z_eb = .; __z_sa = .; end; __z_me = me_fiscal / lt; if lt <= 0 then __z_me = .; * Create Temporary Z-score; &name. = 1.2 * __z_wc + 1.4 * __z_re + 3.3 * __z_eb + 0.6 * __z_me + 1.0 * __z_sa; drop __z: %mend; * Intrinsic ROE based value from Frankel and Lee (1998); %macro intrinsic_value(name=, r=); __iv_po = div_x/nix_x; if nix_x <= 0 then __iv_po = div_x / (at_x * 0.06); __iv_roe = nix_x / ((be_x + lag12(be_x)) / 2); if count <= 12 or (be_x + lag12(be_x)) <= 0 then __iv_roe = .; __iv_be1 = (1 + (1 - __iv_po) * __iv_roe) * be_x; &name. = be_x + (__iv_roe - &r.) / (1 + &r.) * be_x + (__iv_roe - &r.) / ((1 + &r.) * &r.) * __iv_be1; * If Intrinsic value is Non-Positive, set to missing; if &name. <= 0 then &name. 
= .; drop __iv:; %mend;

* Kaplan-Zingales Index;
/* Financial-constraints index. Components are scaled by 12-month lagged net
   PP&E (or assets) and set to missing when the denominator is non-positive or
   the lag is unavailable. */
%macro kz_index(name=);
	* Create Helper Variables;
	__kz_cf = (ni_x + dp) / lag12(ppent); /* cash flow over lagged PP&E */
	if count <= 12 or lag12(ppent)<=0 then __kz_cf = .;
	__kz_q = (at_x + me_fiscal - be_x) / at_x; /* Tobin's q proxy */
	if at_x <= 0 then __kz_q = .;
	__kz_db = debt_x / (debt_x + seq_x); /* book leverage */
	if (debt_x + seq_x) = 0 then __kz_db = .;
	__kz_dv = div_x / lag12(ppent); /* dividends over lagged PP&E */
	if count <= 12 or lag12(ppent)<=0 then __kz_dv = .;
	__kz_cs = che / lag12(ppent); /* cash holdings over lagged PP&E */
	if count <= 12 or lag12(ppent)<=0 then __kz_cs = .;
	* Create Variable;
	&name. = - 1.002 * __kz_cf + 0.283 * __kz_q + 3.139 * __kz_db - 39.368 * __kz_dv - 1.315 * __kz_cs;
%mend;

/* Earnings Predictability/Persistence */
* I scale net income by total assets to account for issuance activity.;
/* For each firm-month, runs a rolling AR(1) regression of ni/at on its
   12-month lag over the past __n years, requiring at least __min observations.
   Outputs per gvkey/curcd/datadate:
     ni_ar1  - AR(1) slope (earnings persistence)
     ni_ivol - residual standard deviation rescaled from the df = n-2 RMSE
               towards an n-1 denominator (earnings predictability)
   Month-ends are processed in &__n.*12 groups so that each pass estimates on
   non-overlapping calculation dates. */
%macro earnings_persistence(out=, data=, __n=, __min=);
	%let __months = %eval(&__n. * 12);
	proc sort data=&data. out=__acc1; by gvkey curcd datadate; run;
	/* Within-firm/currency observation counter used to validate 12-month lags */
	data __acc2;
		set __acc1;
		by gvkey curcd;
		retain count;
		if first.curcd then count = 1; else count = count+1;
	run;
	data __acc3;
		set __acc2;
		__ni_at = ni_x / at_x;
		if at_x <= 0 then __ni_at = .;
		__ni_at_l1 = lag12(__ni_at);
		if count<=12 then __ni_at_l1 =.;
	run;
	proc sql;
		create table __acc4 as
		select gvkey, curcd, datadate, __ni_at, __ni_at_l1
		from __acc3
		where not missing(__ni_at) and not missing(__ni_at_l1);
	quit;
	proc sql;
		create table month_ends as
		select distinct datadate from __acc4 order by datadate;
	quit;
	* Divide data into __n groups;
	proc sql;
		create table dates_apply as
		select *, mod(monotonic(), &__months.) as grp from month_ends;
	quit;
	* Helper macro: If first group, save &new. as &base. otherwise, append &new. to &base.;
	%macro save_or_append(base=, new=);
		%if &__grp. = 0 %then %do;
			data &base.; set &new.; run;
		%end;
		%else %do;
			proc append base=&base. data=&new.; run;
		%end;
	%mend;
	%do __grp=0 %to %eval(&__months. - 1);
		%put ############### GROUP %eval(&__grp.+1) out of &__months. ###############;
		* Prepare data;
		proc sql;
			create table calc_dates as
			select a.datadate, b.datadate as calc_date
			from dates_apply as a
			left join dates_apply(where=(grp = &__grp.)) as b
			on a.datadate > intnx("year", b.datadate, -&__n., "e") and a.datadate <= b.datadate
				and month(a.datadate) = month(b.datadate); /* month(*) ensures annual lags*/
		quit;
		proc sql;
			create table calc_data as
			select a.*, b.calc_date
			from __acc4 as a
			left join calc_dates as b
			on a.datadate = b.datadate
			where not missing(b.calc_date)
			group by a.gvkey, a.curcd, b.calc_date
			having count(*) >= &__min.
			order by a.gvkey, b.calc_date;
		quit;
		proc reg data=calc_data outest=__earn_pers1 edf NOPRINT;
			by gvkey curcd calc_date;
			model __ni_at=__ni_at_l1;
		run;
		proc sql;
			create table __earn_pers2 as
			select gvkey, curcd, calc_date as datadate,
				__ni_at_l1 as ni_ar1, /* outest stores the slope under the regressor's name */
				sqrt(_rmse_**2 * _edf_ / (_edf_ + 1)) as ni_ivol
			from __earn_pers1
			where (_edf_ + 2) >= &__min.; /* _edf_ + 2 = number of observations (intercept + slope) */
		quit;
		%save_or_append(base=op_ep, new=__earn_pers2);
	%end;
	proc sort data=op_ep out=&out. nodup; by gvkey curcd datadate; run;
	proc delete data= __acc1 __acc2 __acc3 __acc4 dates_apply calc_dates calc_data month_ends __earn_pers1 __earn_pers2 op_ep; run;
%mend;

/* MACRO - FIRM AGE */
/* Firm age in months, measured from the earliest of: first CRSP monthly
   observation (by permco), first Compustat annual report minus one year, and
   first Compustat security-month observation minus one year. Falls back to the
   first in-sample eom per id when none of those sources are available. */
%macro firm_age(data=, out=);
	* CRSP first observation;
	proc sql;
		create table crsp_age1 as
		select permco, min(date) as crsp_first format=YYMMDDN8.
		from crsp.msf
		group by permco;
	quit;
	* Compustat accounting first observation;
	proc sql;
		create table comp_acc_age1 as
		select gvkey, datadate from comp.funda
		outer union corr
		select gvkey, datadate from comp.g_funda;
		create table comp_acc_age2 as
		select gvkey, min(datadate) as comp_acc_first format=YYMMDDN8.
		from comp_acc_age1 group by gvkey;
		update comp_acc_age2 set comp_acc_first = intnx('year', comp_acc_first, -1, 'e'); /* When submitting an annual report, the firm must have existed for at least 1 year*/
	quit;
	* Compustat return first obs;
	proc sql;
		create table comp_ret_age1 as
		select gvkey, datadate from comp.secm
		outer union corr
		select gvkey, datadate from comp.g_secd where monthend=1;
		create table comp_ret_age2 as
		select gvkey, min(datadate) as comp_ret_first format=YYMMDDN8.
		from comp_ret_age1 group by gvkey;
		update comp_ret_age2 set comp_ret_first = intnx('year', comp_ret_first, -1, 'e'); /* When submitting an annual report, the firm must have existed for at least 1 year*/
	quit;
	* Add to Dataset;
	proc sql;
		create table comb1 as
		select a.id, a.eom, min(b.crsp_first, c.comp_acc_first, d.comp_ret_first) as first_obs format=YYMMDDN8.
		from &data. as a
		left join crsp_age1 as b on a.permco=b.permco
		left join comp_acc_age2 as c on a.gvkey=c.gvkey
		left join comp_ret_age2 as d on a.gvkey=d.gvkey;
		create table comb2 as
		select *, min(eom) as first_alt format=YYMMDDN8. /* A few (0.3% of all obs) north american observations don't have observations in comp.secm so this is a fall back option*/
		from comb1 group by id;
		create table comb3 as
		select *, intck ('month', min(first_obs, first_alt), eom) as age from comb2;
		alter table comb3 drop first_obs, first_alt;
	quit;
	* Output;
	proc sort data=comb3 out=&out.; by id eom; run;
%mend;

================================================ FILE: GlobalFactors/ind_identification.sas ================================================

* MACRO: FF_IND_CLASS Add variable matching 4-digit SIC identifiers to Fama-French industry identifiers Arguments: data: name of input dataset that includes 4-digit SIC codes under name 'sic' ff_grps: number of industry portfolios for Fama-French identifiers OUT: name of output dataset;
%macro ff_ind_class(data=, ff_grps=, out=);
	%if &ff_grps = 38 %then %do;
		proc sql;
			/* French identifies "Other" as "almost nothing", so no firms are identified as "other" https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/det_38_ind_port.html*/
			create table &out.
			/* Maps 4-digit SIC ranges to the 38 Fama-French industries; unmatched
			   codes (including group 38, "Other" - see comment above) get missing */
			as select *, case
				when 100 <= sic <= 999 then 1
				when 1000 <= sic <= 1299 then 2
				when 1300 <= sic <= 1399 then 3
				when 1400 <= sic <= 1499 then 4
				when 1500 <= sic <= 1799 then 5
				when 2000 <= sic <= 2099 then 6
				when 2100 <= sic <= 2199 then 7
				when 2200 <= sic <= 2299 then 8
				when 2300 <= sic <= 2399 then 9
				when 2400 <= sic <= 2499 then 10
				when 2500 <= sic <= 2599 then 11
				when 2600 <= sic <= 2661 then 12
				when 2700 <= sic <= 2799 then 13
				when 2800 <= sic <= 2899 then 14
				when 2900 <= sic <= 2999 then 15
				when 3000 <= sic <= 3099 then 16
				when 3100 <= sic <= 3199 then 17
				when 3200 <= sic <= 3299 then 18
				when 3300 <= sic <= 3399 then 19
				when 3400 <= sic <= 3499 then 20
				when 3500 <= sic <= 3599 then 21
				when 3600 <= sic <= 3699 then 22
				when 3700 <= sic <= 3799 then 23
				when 3800 <= sic <= 3879 then 24
				when 3900 <= sic <= 3999 then 25
				when 4000 <= sic <= 4799 then 26
				when 4800 <= sic <= 4829 then 27
				when 4830 <= sic <= 4899 then 28
				when 4900 <= sic <= 4949 then 29
				when 4950 <= sic <= 4959 then 30
				when 4960 <= sic <= 4969 then 31
				when 4970 <= sic <= 4979 then 32
				when 5000 <= sic <= 5199 then 33
				when 5200 <= sic <= 5999 then 34
				when 6000 <= sic <= 6999 then 35
				when 7000 <= sic <= 8999 then 36
				when 9000 <= sic <= 9999 then 37
				else . end as ff38
			from &data.;
		run;
	%end;
	%else %if &ff_grps. = 49 %then %do;
		proc sql;
			create table &out.
			/* Maps 4-digit SIC codes to the 49 Fama-French industries, following
			   Ken French's published SIC range definitions */
			as select *, case
				when sic = 2048 or 100 <= sic <= 299 or 700 <= sic <= 799 or 910 <= sic <= 919 then 1
				when sic in (2095, 2098, 2099) or 2000 <= sic <= 2046 or 2050 <= sic <= 2063 or 2070 <= sic <= 2079 or 2090 <= sic <= 2092 then 2
				when sic in (2086, 2087, 2096, 2097) or 2064 <= sic <= 2068 then 3
				when sic = 2080 or 2082 <= sic <= 2085 then 4
				when 2100 <= sic <= 2199 then 5
				when sic in (3732, 3930, 3931) or 920 <= sic <= 999 or 3650 <= sic <= 3652 or 3940 <= sic <= 3949 then 6
				when sic in (7840, 7841, 7900, 7910, 7911, 7980) or 7800 <= sic <= 7833 or 7920 <= sic <= 7933 or 7940 <= sic <= 7949 or 7990 <= sic <= 7999 then 7
				when sic in (2770, 2771) or 2700 <= sic <= 2749 or 2780 <= sic <= 2799 then 8
				when sic in (2047, 2391, 2392, 3160, 3161, 3229, 3260, 3262, 3263, 3269, 3230, 3231, 3750, 3751, 3800, 3860, 3861, 3910, 3911, 3914, 3915, 3991, 3995) or 2510 <= sic <= 2519 or 2590 <= sic <= 2599 or 2840 <= sic <= 2844 or 3170 <= sic <= 3172 or 3190 <= sic <= 3199 or 3630 <= sic <= 3639 or 3870 <= sic <= 3873 or 3960 <= sic <= 3962 then 9
				when sic in (3020, 3021, 3130, 3131, 3150, 3151) or 2300 <= sic <= 2390 or 3100 <= sic <= 3111 or 3140 <= sic <= 3149 or 3963 <= sic <= 3965 then 10
				when 8000 <= sic <= 8099 then 11
				when sic in (3693, 3850, 3851) or 3840 <= sic <= 3849 then 12
				when sic in (2830, 2831) or 2833 <= sic <= 2836 then 13
				when 2800 <= sic <= 2829 or 2850 <= sic <= 2879 or 2890 <= sic <= 2899 then 14
				when sic in (3031, 3041) or 3050 <= sic <= 3053 or 3060 <= sic <= 3099 then 15
				when 2200 <= sic <= 2284 or 2290 <= sic <= 2295 or 2297 <= sic <= 2299 or 2393 <= sic <= 2395 or 2397 <= sic <= 2399 then 16
				when sic in (2660, 2661, 3200, 3210, 3211, 3240, 3241, 3261, 3264, 3280, 3281, 3446, 3996) or 800 <= sic <= 899 or 2400 <= sic <= 2439 or 2450 <= sic <= 2459 or 2490 <= sic <= 2499 or 2950 <= sic <= 2952 or 3250 <= sic <= 3259 or 3270 <= sic <= 3275 or 3290 <= sic <= 3293 or 3295 <= sic <= 3299 or 3420 <= sic <= 3429 or 3430 <= sic <= 3433 or 3440 <= sic <= 3442
				or 3448 <= sic <= 3452 or 3490 <= sic <= 3499 then 17
				when 1500 <= sic <= 1511 or 1520 <= sic <= 1549 or 1600 <= sic <= 1799 then 18
				when sic = 3300 or 3310 <= sic <= 3317 or 3320 <= sic <= 3325 or 3330 <= sic <= 3341 or 3350 <= sic <= 3357 or 3360 <= sic <= 3379 or 3390 <= sic <= 3399 then 19
				when sic in (3400, 3443, 3444) or 3460 <= sic <= 3479 then 20
				when sic in (3538, 3585, 3586) or 3510 <= sic <= 3536 or 3540 <= sic <= 3569 or 3580 <= sic <= 3582 or 3589 <= sic <= 3599 then 21
				when sic in (3600, 3620, 3621, 3648, 3649, 3660, 3699) or 3610 <= sic <= 3613 or 3623 <= sic <= 3629 or 3640 <= sic <= 3646 or 3690 <= sic <= 3692 then 22
				when sic in (2296, 2396, 3010, 3011, 3537, 3647, 3694, 3700, 3710, 3711, 3799) or 3713 <= sic <= 3716 or 3790 <= sic <= 3792 then 23
				when sic in (3720, 3721, 3728, 3729) or 3723 <= sic <= 3725 then 24
				when sic in (3730, 3731) or 3740 <= sic <= 3743 then 25
				when sic = 3795 or 3760 <= sic <= 3769 or 3480 <= sic <= 3489 then 26
				when 1040 <= sic <= 1049 then 27
				when 1000 <= sic <= 1039 or 1050 <= sic <= 1119 or 1400 <= sic <= 1499 then 28
				when 1200 <= sic <= 1299 then 29
				when sic in (1300, 1389) or 1310 <= sic <= 1339 or 1370 <= sic <= 1382 or 2900 <= sic <= 2912 or 2990 <= sic <= 2999 then 30
				when sic in (4900, 4910, 4911, 4939) or 4920 <= sic <= 4925 or 4930 <= sic <= 4932 or 4940 <= sic <= 4942 then 31
				when sic in (4800, 4899) or 4810 <= sic <= 4813 or 4820 <= sic <= 4822 or 4830 <= sic <= 4841 or 4880 <= sic <= 4892 then 32
				when sic in (7020, 7021, 7200, 7230, 7231, 7240, 7241, 7250, 7251, 7395, 7500, 7600, 7620, 7622, 7623, 7640, 7641) or 7030 <= sic <= 7033 or 7210 <= sic <= 7212 or 7214 <= sic <= 7217 or 7219 <= sic <= 7221 or 7260 <= sic <= 7299 or 7520 <= sic <= 7549 or 7629 <= sic <= 7631 or 7690 <= sic <= 7699 or 8100 <= sic <= 8499 or 8600 <= sic <= 8699 or 8800 <= sic <= 8899 or 7510 <= sic <= 7515 then 33
				when sic in (3993, 7218, 7300, 7374, 7396, 7397, 7399, 7519, 8700, 8720, 8721) or 2750 <= sic <= 2759 or 7310 <= sic <=
7342 or 7349 <= sic <= 7353 or 7359 <= sic <= 7369 or 7376 <= sic <= 7385 or 7389 <= sic <= 7394 or 8710 <= sic <= 8713 or 8730 <= sic <= 8734 or 8740 <= sic <= 8748 or 8900 <= sic <= 8911 or 8920 <= sic <= 8999 or 4220 <= sic <= 4229 then 34 when sic = 3695 or 3570 <= sic <= 3579 or 3680 <= sic <= 3689 then 35 when sic = 7375 or 7370 <= sic <= 7373 then 36 when sic in (3622, 3810, 3812) or 3661 <= sic <= 3666 or 3669 <= sic <= 3679 then 37 when sic = 3811 or 3820 <= sic <= 3827 or 3829 <= sic <= 3839 then 38 when sic in (2760, 2761) or 2520 <= sic <= 2549 or 2600 <= sic <= 2639 or 2670 <= sic <= 2699 or 3950 <= sic <= 3955 then 39 when sic in (3220, 3221) or 2440 <= sic <= 2449 or 2640 <= sic <= 2659 or 3410 <= sic <= 3412 then 40 when sic in (4100. 4130, 4131, 4150, 4151, 4230, 4231, 4780, 4789) or 4000 <= sic <= 4013 or 4040 <= sic <= 4049 or 4110 <= sic <= 4121 or 4140 <= sic <= 4142 or 4170 <= sic <= 4173 or 4190 <= sic <= 4200 or 4210 <= sic <= 4219 or 4240 <= sic <= 4249 or 4400 <= sic <= 4700 or 4710 <= sic <= 4712 or 4720 <= sic <= 4749 or 4782 <= sic <= 4785 then 41 when sic in (5000, 5099, 5100) or 5010 <= sic <= 5015 or 5020 <= sic <= 5023 or 5030 <= sic <= 5060 or 5063 <= sic <= 5065 or 5070 <= sic <= 5078 or 5080 <= sic <= 5088 or 5090 <= sic <= 5094 or 5110 <= sic <= 5113 or 5120 <= sic <= 5122 or 5130 <= sic <= 5172 or 5180 <= sic <= 5182 or 5190 <= sic <= 5199 then 42 when sic in (5200, 5250, 5251, 5260, 5261, 5270, 5271, 5300, 5310, 5311, 5320, 5330, 5331, 5334, 5900, 5999) or 5210 <= sic <= 5231 or 5340 <= sic <= 5349 or 5390 <= sic <= 5400 or 5410 <= sic <= 5412 or 5420 <= sic <= 5469 or 5490 <= sic <= 5500 or 5510 <= sic <= 5579 or 5590 <= sic <= 5700 or 5710 <= sic <= 5722 or 5730 <= sic <= 5736 or 5750 <= sic <= 5799 or 5910 <= sic <= 5912 or 5920 <= sic <= 5932 or 5940 <= sic <= 5990 or 5992 <= sic <= 5995 then 43 when sic in (7000, 7213) or 5800 <= sic <= 5829 or 5890 <= sic <= 5899 or 7010 <= sic <= 7019 or 7040 <= sic <= 7049 then 44 when 
sic = 6000 or 6010 <= sic <= 6036 or 6040 <= sic <= 6062 or 6080 <= sic <= 6082 or 6090 <= sic <= 6100 or 6110 <= sic <= 6113 or 6120 <= sic <= 6179 or 6190 <= sic <= 6199 then 45
				when sic in (6300, 6350, 6351, 6360, 6361) or 6310 <= sic <= 6331 or 6370 <= sic <= 6379 or 6390 <= sic <= 6411 then 46
				when sic in (6500, 6510, 6540, 6541, 6610, 6611) or 6512 <= sic <= 6515 or 6517 <= sic <= 6532 or 6550 <= sic <= 6553 or 6590 <= sic <= 6599 then 47
				when sic in (6700, 6798, 6799) or 6200 <= sic <= 6299 or 6710 <= sic <= 6726 or 6730 <= sic <= 6733 or 6740 <= sic <= 6779 or 6790 <= sic <= 6795 then 48
				when sic in (4970, 4971, 4990, 4991) or 4950 <= sic <= 4961 then 49
				else . end as ff49
			from &data.;
		run;
	%end;
%mend;

* MACRO: CRSP_INDUSTRY Create daily historical SIC and NAICS industry identifiers dataset from CRSP data Arguments: out: name of output dataset;
/* Expands each crsp.dsenames validity window [namedt, nameendt] into one row
   per calendar day. Missing identifiers are temporarily coded -999 so the
   day-expansion data step carries them along, then recoded to missing. */
%macro crsp_industry(out=);
	/* Pull distinct date ranges and identifiers from CRSP datasets */
	proc sql;
		create table permno0 as
		select distinct permno, permco, namedt, nameendt, siccd as sic,
			input(naics, 6.0) as naics/* NAICS codes can't have leading zeroes so this should be okay*/
		from crsp.dsenames
		order by permno, namedt, nameendt;
	run;
	/* Alter missing industry identifiers (sic = 0 also treated as missing) */
	data permno1;
		set permno0;
		if missing(sic) then sic = -999;
		if sic = 0 then sic = -999;
		if missing(naics) then naics = -999;
	run;
	/* Find date distance for date ranges */
	data permno2; set permno1; permno_diff = intck('day', namedt, nameendt, 'd'); run;
	proc sort data = permno2; by permno namedt nameendt; run;
	/* Create new rows between valid dates */
	data permno3;
		set permno2;
		output;
		n = 0;
		if permno_diff > 0 then do;
			do until(n = permno_diff);
				namedt = intnx('day', namedt, 1);
				n + 1;
				output;
			end;
		end;
		drop nameendt permno_diff n;
	run;
	/* Get ready for output: recode the -999 sentinels back to missing */
	data permno4;
		set permno3;
		if sic = -999 then sic = .;
		if naics = -999 then naics = .;
		date = namedt;
		drop namedt;
		format date yymmddn8.;
	run;
	proc sort data= permno4 out= &out. nodup; by permno date; run;
	proc delete data = permno0 permno1 permno2 permno3 permno4; run;
%mend;

* MACRO: COMP_SIC_NAICS Create a daily historical SIC and NAICS industry identifiers dataset using NA and global annual reports Arguments: OUT: name of output dataset;
/* Coalesces annual SIC/NAICS codes from comp.funda (NA) and comp.g_funda
   (global), then extends each report date forward daily until the day before
   the firm's next report (the most recent report covers only its own date). */
%macro comp_sic_naics(OUT =, ff_num =);
	proc sql;
		/* Retrieve NA identifiers */
		create table comp1 as
		select distinct gvkey, datadate, sich as sic, naicsh as naics from COMP.FUNDA;
	run;
	/* Fix error of gvkey code 175650 */
	data comp2;
		set comp1;
		if gvkey = "175650" and datadate = '31DEC2005'd and missing(naics) then delete;
	run;
	proc sql;
		/* Retrieve global identifiers */
		create table comp3 as
		select distinct gvkey, datadate, sich as sic, naicsh as naics from COMP.G_FUNDA;
		/* Join global and NA data */
		create table comp4 as
		select a.gvkey as gvkeya, a.datadate as datea, a.sic as sica, a.naics as naicsa,
			b.gvkey as gvkeyb, b.datadate as dateb, b.sic as sicb, b.naics as naicsb
		from comp2 as a
		full join comp3 as b
		on a.gvkey = b.gvkey and a.datadate = b.datadate;
	run;
	/* Coalesce NA and global */
	data comp5;
		set comp4;
		gvkey = put(coalesce(gvkeya, gvkeyb), $z6.);
		date = coalesce(datea, dateb);
		sic = coalesce(sica, sicb);
		naics = coalesce(naicsa, naicsb);
		format date yymmddn8.;
		drop gvkeya gvkeyb datea dateb sica sicb naicsa naicsb;
	run;
	/* Sort descending so lag(date) below picks up the firm's NEXT report date */
	proc sort data = comp5; by gvkey descending date; run;
	/* Add valid date to in order to extend to daily observation */
	data comp6;
		set comp5;
		by gvkey;
		valid_to = intnx('day', lag(date), -1); /* day before the next report */
		if FIRST.gvkey then do; /* latest report per firm: valid only on its own date */
			valid_to = date;
		end;
		format valid_to yymmddn8.;
	run;
	/* Re-sort */
	proc sort data = comp6; by gvkey date valid_to; run;
	/* Find date distance for date ranges */
	data comp7; set comp6; comp_diff = intck('day', date, valid_to, 'd'); run;
	proc sort data = comp7; by gvkey date valid_to; run;
	/* Create new rows between valid dates */
	data comp8;
		set comp7;
		output;
		n = 0;
		if comp_diff > 0 and comp_diff ne . then do;
			do until(n = comp_diff);
				date = intnx('day', date, 1);
				n + 1;
				output;
			end;
		end;
		drop valid_to comp_diff n;
	run;
	proc sort data= comp8 out= &out. nodup; by gvkey date; run;
	proc delete data = comp1 comp2 comp3 comp4 comp5 comp6 comp7 comp8; run;
%mend;

* MACRO: COMP_HGICS Create a daily historical gics dataset from COMPUSTAT, either from the NA or global dataset Arguments: lib: COMPUSTAT library from which to pull historical gics data (CO_HGICS if NA, G_CO_HGICS if global) OUT: name of output dataset;
/* Expands each [indfrom, indthru] GICS validity window into daily rows; an
   open-ended indthru on a firm's last record is closed at today's date. */
%macro COMP_HGICS(lib =, out =);
	/* Pull historical gics data */
	proc sql;
		create table gic1 as
		select distinct gvkey, indfrom, indthru, gsubind as gics
		from comp.&lib.
		where not missing(gvkey);
	run;
	proc sort data = gic1; by gvkey indfrom; run;
	/* Alter missing gics */
	data gic2; set gic1; by gvkey; if missing(gics) then gics = -999; run;
	/* Adjust indthru: close the last, still-open window at today's date */
	data gic3;
		set gic2;
		by gvkey indfrom indthru;
		if LAST.gvkey and indthru = . then indthru = today();
	run;
	/* Estimate difference between indfrom and indthru */
	data gic4; set gic3; gic_diff = intck('days', indfrom, indthru); run;
	proc sort data = gic4; by gvkey indfrom indthru; run;
	/* Add rows to create daily data */
	data gic5;
		set gic4;
		by gvkey;
		output;
		n = 0;
		if gic_diff > 0 and gic_diff ne . then do;
			do until(n = gic_diff);
				indfrom = intnx('day', indfrom, 1);
				n + 1;
				output;
			end;
		end;
	run;
	data gic6;
		set gic5;
		date = indfrom;
		format date yymmddn8.;
		drop indfrom indthru gic_diff n;
	run;
	proc sort data= gic6 out=&OUT nodup; by gvkey date; run;
	proc delete data = gic1 gic2 gic3 gic4 gic5 gic6; run;
%mend COMP_HGICS;

/* MACRO: HGICS JOIN Join NA and global daily historical gics data from COMPUSTAT Argument: OUT: name of output dataset */
%macro HGICS_JOIN(out=);
	/* Construct NA and global historical gics data */
	%comp_hgics(lib = co_hgic, OUT = na_hgics);
	%comp_hgics(lib = g_co_hgic, OUT = g_hgics);
	proc sql;
		create table gjoin1 as
		select a.gvkey as na_gvkey, a.gics as na_gics, a.date as na_date,
			b.gvkey as g_gvkey, b.gics as g_gics, b.date as g_date
		from na_hgics as a
		full join g_hgics as b
		on a.gvkey = b.gvkey and a.date = b.date;
	/* Coalesce NA and global (the data step implicitly terminates proc sql) */
	data gjoin2;
		set gjoin1;
		gvkey = put(coalesce(na_gvkey, g_gvkey), $z6.);
		date = coalesce(na_date, g_date);
		gics = coalesce(na_gics, g_gics);
		format date yymmddn8.;
		drop na_gvkey na_date na_gics g_gvkey g_date g_gics;
	run;
	proc sort data = gjoin2 out= &out nodup; by gvkey date; run;
	proc delete data = na_hgics g_hgics gjoin1 gjoin2; run;
%mend;

* MACRO: COMP_INDUSTRY Join SIC and NAICS industry identifiers to GICS identifiers constructed from COMPUSTAT data Arguments: OUT: name of output dataset;
/* Merges the daily GICS and SIC/NAICS panels, then fills any calendar gaps
   between coverage spells with rows whose identifiers are all missing so the
   output is a contiguous daily panel per gvkey. */
%macro comp_industry(out=);
	/* Construct datasets */
	%hgics_join(out=comp_gics);
	%comp_sic_naics(out=comp_other);
	/* Join datasets */
	data join1; merge comp_gics comp_other; by gvkey date; run;
	proc sort data = join1 nodupkey; by gvkey date; run;
	/* Check for gaps in coverage */
	data join2;
		set join1;
		by gvkey date;
		lagdate = lag(date);
		date_1 = intnx('day', date, -1);
		gap = 0;
		format lagdate yymmddn8.
date_1 yymmddn8.;
	run;
	/* gap = 1 when the previous row is not the previous calendar day */
	data join3;
		set join2;
		by gvkey date;
		if not FIRST.gvkey and lagdate ne date_1 then gap = 1;
	run;
	/* Create rows for gaps in coverage with all indicators as missing */
	proc sql; create table gap1 as select * from join3 where gap = 1; run;
	/* Size of gap */
	data gap2; set gap1; diff = intck('days', lagdate, date); run;
	/* Add rows to create daily data: fill backwards from the first row after
	   the gap, setting all identifiers to missing for the inserted days */
	data gap3;
		set gap2;
		by gvkey date;
		output;
		n = 0;
		if gap = 1 then do;
			do until(n = diff - 1);
				date = intnx('day', date, -1);
				gics = .; sic = .; naics = .;
				n + 1;
				output;
			end;
		end;
		drop lagdate date_1 gap diff n;
	run;
	proc sort data = gap3; by gvkey date; run;
	/* Join added rows to original daily data */
	data joined1; merge join1 gap3; by gvkey date; run;
	proc sort data = joined1 out= &out. nodup; by gvkey date; run;
	proc delete data = comp_gics comp_other join1 join2 join3 gap1 gap2 gap3 joined1; run;
%mend;

================================================ FILE: GlobalFactors/main.sas ================================================

/* Clean working environment */
proc delete data = _all_ ; run ;

***************************************************************************
* Manual Inputs
***************************************************************************;
* Assign scratch and project folder names;
%let scratch_folder = /scratch/INSTITUTION/FOLDER;
%let project_folder = ~/Global Data;
* Set defaults;
%let delete_temp = 1; * Should temporary files be deleted?;
%let save_csv = 1; * Should the main data set be saved country-by-country in a .csv format?;
%let save_daily_ret = 1; * Save daily stocks returns country-by-country in a .csv format?;
%let save_monthly_ret = 1; * Save monthly stocks returns in a .csv format;
%let end_date = '31DEC2024'd; * Date of last observation: CRSP data is only updated annually, so we keep this updating frequency for consistency. Should be incremented every time there is an update to the CRSP database;

***************************************************************************
* Libraries and Functions
***************************************************************************;
* Libraries;
options dlcreatedir;
libname scratch "&scratch_folder.";
libname project "&project_folder.";
* Project macros;
%include "&project_folder./project_macros.sas";
%include "&project_folder./char_macros.sas";
%include "&project_folder./market_chars.sas";
%include "&project_folder./accounting_chars.sas";
%include "&project_folder./ind_identification.sas";

*****************************************************************************
* Create Return Data
****************************************************************************;
%prepare_comp_sf(freq=both);
%clean_comp_msf(data=comp_msf); * Delete obvious data errors (work-in-progress);
%prepare_crsp_sf(freq=d);
%prepare_crsp_sf(freq=m);
%combine_crsp_comp_sf(out_msf=world_msf1, out_dsf=scratch.world_dsf, crsp_msf=crsp_msf, comp_msf=comp_msf, crsp_dsf=crsp_dsf, comp_dsf=comp_dsf);
proc delete data=comp_dsf crsp_dsf comp_msf crsp_msf; run;

*****************************************************************************
* Add Industry Codes
*****************************************************************************;
%crsp_industry(out=crsp_ind);
%comp_industry(out=comp_ind);
proc sql;
	create table world_msf2 as
	select a.*, b.gics as gics, coalesce(b.sic, c.sic) as sic, coalesce(b.naics, c.naics) as naics
	from world_msf1 as a
	left join comp_ind as b on a.gvkey=b.gvkey and a.eom=b.date
	left join crsp_ind as c on a.permco=c.permco and a.permno=c.permno and a.eom=c.date;
quit;
proc delete data=world_msf1 crsp_ind comp_ind; run; * Prefer COMPUSTAT to CRSP;
* Add a column 'ff49' with Fama-French industry classification;
%ff_ind_class(data=world_msf2, ff_grps=49, out=world_msf3);
* Size cutoffs;
%nyse_size_cutoffs(data=world_msf3, out=scratch.nyse_cutoffs);
* Classify stocks into size groups (NYSE breakpoints: p80/p50/p20/p1);
proc sql;
	create table scratch.world_msf as
	select case
		when missing(a.me) then ('')
		when a.me >= b.nyse_p80 then 'mega'
		when a.me >= b.nyse_p50 then 'large'
		when a.me >= b.nyse_p20 then 'small'
		when a.me >= b.nyse_p1 then 'micro'
		else 'nano'
	end as size_grp, a.*
	from world_msf3 as a
	left join scratch.nyse_cutoffs as b on a.eom=b.eom;
quit;
proc delete data=world_msf2 world_msf3; run;
* Return cutoffs;
%return_cutoffs(data=scratch.world_msf, freq=m, out=scratch.return_cutoffs, crsp_only=0);
%return_cutoffs(data=scratch.world_dsf, freq=d, out=scratch.return_cutoffs_daily, crsp_only=0);

*****************************************************************************
* Market Returns
****************************************************************************;
%market_returns(out = scratch.market_returns, data=scratch.world_msf, freq=m, wins_comp=1, wins_data=scratch.return_cutoffs, cap_data=scratch.nyse_cutoffs);
%market_returns(out = scratch.market_returns_daily, data=scratch.world_dsf, freq=d, wins_comp=1, wins_data=scratch.return_cutoffs_daily, cap_data=scratch.nyse_cutoffs);

*****************************************************************************
* Create Characteristics Based on Accounting Data
****************************************************************************;
%standardized_accounting_data(coverage='world', convert_to_usd=1, me_data = scratch.world_msf, include_helpers_vars=1, start_date='31DEC1949'd);
%create_acc_chars(data=acc_std_ann, out=achars_world, lag_to_public=4, max_data_lag=18, __keep_vars=&acc_chars., me_data=scratch.world_msf, suffix=);
%create_acc_chars(data=acc_std_qtr, out=qchars_world, lag_to_public=4, max_data_lag=18, __keep_vars=&acc_chars., me_data=scratch.world_msf, suffix=_qitem);
%combine_ann_qtr_chars(out=scratch.acc_chars_world, ann_data=achars_world, qtr_data=qchars_world, __char_vars=&acc_chars., q_suffix=_qitem);

*****************************************************************************
* Create Characteristics Based on Monthly Market Data
****************************************************************************;
%market_chars_monthly(out=scratch.market_chars_m, data=scratch.world_msf, market_ret=scratch.market_returns, local_currency=0);
* Free up space;
proc datasets library=work kill nolist; quit;

*****************************************************************************
* Combine Returns, Accounting and Monthly Market Data
****************************************************************************;
proc sql;
	create table world_data_prelim as
	select a.*, b.*, c.*
	from scratch.world_msf as a
	left join scratch.market_chars_m as b on a.id=b.id and a.eom=b.eom
	left join scratch.acc_chars_world as c on a.gvkey=c.gvkey and a.eom=c.public_date;
	alter table world_data_prelim drop div_tot, div_cash, div_spc, public_date, source;
quit;
%if &delete_temp.=1 %then %do;
	proc delete data= scratch.market_chars_m scratch.acc_chars_world; run;
%end;

*****************************************************************************
* Asset Pricing Factors
****************************************************************************;
* Create monthly and daily factors from FF3 and HXZ4;
%ap_factors(out=scratch.ap_factors_daily, freq=d, sf=scratch.world_dsf, mchars=world_data_prelim, mkt=scratch.market_returns_daily, min_stocks_bp=10, min_stocks_pf=3);
%ap_factors(out=scratch.ap_factors_monthly, freq=m, sf=scratch.world_msf, mchars=world_data_prelim, mkt=scratch.market_returns, min_stocks_bp=10, min_stocks_pf=3);

*****************************************************************************
* Factor based on combined data
****************************************************************************;
%firm_age(data=scratch.world_msf, out=scratch.firm_age);
%mispricing_factors(out=scratch.mp_factors, data=world_data_prelim, min_stks=10, min_fcts=3);
%market_beta(out=scratch.beta_60m, data=scratch.world_msf, fcts=scratch.ap_factors_monthly, __n=60, __min=36);
%residual_momentum(out=scratch.resmom_ff3, data=scratch.world_msf, fcts=scratch.ap_factors_monthly, type=ff3, __n =36, __min=24, incl=12 6, skip=1 1);
* Free up space;
proc datasets library=work nolist;
	delete _all_ / memtype=data; /* Deletes all datasets */
	protect world_data_prelim; /* Prevents world_data_prelim from being deleted */
quit;

*****************************************************************************
* Create Characteristics Based on Daily Market Data
****************************************************************************;
%bidask_hl(out=scratch.corwin_schultz, data=scratch.world_dsf, __min_obs=10);
%prepare_daily(data=scratch.world_dsf, fcts=scratch.ap_factors_daily);
%roll_apply_daily(out=scratch.roll_21d, __n=1, __min=15, fcts=scratch.ap_factors_daily, __month_ends=month_ends, sfx =_21d, __stats= rvol rmax skew capm_ext ff3 hxz4 dimsonbeta zero_trades);
%roll_apply_daily(out=scratch.roll_126d, __n=6, __min=60, fcts=scratch.ap_factors_daily, __month_ends=month_ends, sfx =_126d, __stats= zero_trades turnover dolvol ami);
%roll_apply_daily(out=scratch.roll_252d, __n=12, __min=120, fcts=scratch.ap_factors_daily, __month_ends=month_ends, sfx =_252d, __stats= rvol capm downbeta zero_trades prc_to_high mktvol);
%roll_apply_daily(out=scratch.roll_1260d, __n=60, __min=750, fcts=scratch.ap_factors_daily, __month_ends=month_ends, sfx =_1260d, __stats= mktcorr);
%finish_daily_chars(out=scratch.market_chars_d);
%if &delete_temp.=1 %then %do;
	proc delete data= scratch.corwin_schultz scratch.roll_21d scratch.roll_126d scratch.roll_252d scratch.roll_1260d scratch.ap_factors_daily scratch.ap_factors_monthly ; run;
%end;
* Free up space;
proc datasets library=work nolist;
	delete _all_ / memtype=data; /* Deletes all datasets */
	protect world_data_prelim; /* Prevents world_data_prelim from being deleted */
quit;

*****************************************************************************
* Combine all characteristics and build final dataset
****************************************************************************;
proc sql;
	create table world_data3 as
	select a.*, b.beta_60m, b.ivol_capm_60m, c.resff3_12_1, d.resff3_6_1, e.mispricing_mgmt, e.mispricing_perf, f.*, g.age
	from world_data_prelim as a
	left join scratch.beta_60m as b on a.id=b.id and a.eom=b.eom
	left join scratch.resmom_ff3_12_1 as c on a.id=c.id and a.eom=c.eom
	left join scratch.resmom_ff3_6_1 as d on a.id=d.id and a.eom=d.eom
	left join scratch.mp_factors as e on a.id=e.id and a.eom=e.eom
	left join scratch.market_chars_d as f on a.id=f.id and a.eom=f.eom
	left join scratch.firm_age as g on a.id=g.id and a.eom=g.eom;
quit;
* Add Quality minus Junk;
%quality_minus_junk(out=scratch.qmj, data=world_data3, min_stks=10);
proc sql;
	create table world_data4 as
	select a.*, b.qmj, b.qmj_prof, b.qmj_growth, b.qmj_safety
	from world_data3 as a
	left join scratch.qmj as b on a.excntry=b.excntry and a.id=b.id and a.eom=b.eom;
quit;
* Reorder Variables (RETAIN before SET fixes column order in the output);
data world_data5;
	retain id date eom source_crsp size_grp obs_main exch_main primary_sec gvkey iid permno permco excntry curcd fx common comp_tpci crsp_shrcd comp_exchg crsp_exchcd gics sic naics ff49 adjfct shares me me_company prc prc_local dolvol ret ret_local ret_exc ret_lag_dif ret_exc_lead1m market_equity enterprise_value book_equity assets sales net_income;
	set world_data4;
run;
* Delete Temporary Files;
%if &delete_temp.=1 %then %do;
	proc delete data= world_data_prelim scratch.beta_60m scratch.qmj scratch.resmom_ff3_12_1 scratch.resmom_ff3_6_1 scratch.mp_factors scratch.firm_age scratch.market_chars_d; run;
%end;
* Save combined data;
proc sort data=world_data5 out=scratch.world_data nodup; by id eom; run;

*****************************************************************************
* Create Output in .csv Format for Download
****************************************************************************;
* Create Output Folder;
options dlcreatedir;
libname op "&scratch_folder./output";
option nonotes;
* Small Files;
proc export data=scratch.market_returns_daily outfile="&scratch_folder./output/market_returns_daily.csv" dbms=CSV replace; run;
proc export data=scratch.market_returns outfile="&scratch_folder./output/market_returns.csv" dbms=CSV replace; run;
proc export data=scratch.nyse_cutoffs outfile="&scratch_folder./output/nyse_cutoffs.csv" dbms=CSV replace; run;
proc export data=scratch.return_cutoffs outfile="&scratch_folder./output/return_cutoffs.csv" dbms=CSV replace; run;
proc export data=scratch.return_cutoffs_daily outfile="&scratch_folder./output/return_cutoffs_daily.csv" dbms=CSV replace; run;
option notes;
* Save main data as .csv files by country;
%if &save_csv.=1 %then %do;
	%save_main_data_csv(out=Characteristics, data=scratch.world_data, path=&scratch_folder./output, end_date=&end_date.);
%end;
* Save daily return data as .csv files by country;
%if &save_daily_ret.=1 %then %do;
	%save_daily_ret_csv(out=Daily Returns, data=scratch.world_dsf, path=&scratch_folder./output, end_date=&end_date.);
%end;
* Save monthly return data as .csv files by country;
%if &save_monthly_ret.=1 %then %do;
	%save_monthly_ret_csv(out=world_ret_monthly, data=scratch.world_msf, path=&scratch_folder./output, end_date=&end_date.);
%end;
* Delete Temporary Files;
%if &delete_temp.=1 %then %do;
	proc delete data= scratch.market_returns_daily scratch.market_returns scratch.nyse_cutoffs scratch.return_cutoffs scratch.return_cutoffs_daily scratch.world_dsf scratch.world_msf scratch.world_data; run;
%end;

================================================ FILE: GlobalFactors/market_chars.sas ================================================

* Market Chars: Monthly;
%let monthly_chars=
	/* Market Based Size Variables */
	market_equity
	/* Total Dividend Paid to Market Equity */
	div1m_me div3m_me div6m_me div12m_me
/* Special Dividend Paid to Market Equity */ divspc1m_me divspc12m_me /* Change in Shares Outstanding */ chcsho_1m chcsho_3m chcsho_6m chcsho_12m /* Net Equity Payout */ eqnpo_1m eqnpo_3m eqnpo_6m eqnpo_12m /* Momentum/Reversal */ ret_1_0 ret_2_0 ret_3_0 ret_3_1 ret_6_0 ret_6_1 ret_9_0 ret_9_1 ret_12_0 ret_12_1 ret_12_7 ret_18_1 ret_24_1 ret_24_12 ret_36_1 ret_36_12 ret_48_1 ret_48_12 ret_60_1 ret_60_12 ret_60_36 /* Seasonality */ seas_1_1an seas_2_5an seas_6_10an seas_11_15an seas_16_20an seas_1_1na seas_2_5na seas_6_10na seas_11_15na seas_16_20na ; %put ### In total %nwords(&monthly_chars.) monthly characteristics will be created ###; %macro market_chars_monthly(out=, data=, market_ret=, local_currency=); %if &local_currency=1 %then %do; %let ret_var = ret_local; %end; %if &local_currency=0 %then %do; %let ret_var = ret; %end; /* Helper macro: Apply function lag0 to lag &n. */ %macro apply_to_lastn(x=, _n=, func=); %let mv = &func.(&x.; %do _i=1 %to &_n.-1; %let mv = &mv., lag&_i.(&x.); %end; %let mv = &mv.); &mv.; %mend apply_to_lastn; /* Get Important Variables */ proc sql; create table __monthly_chars1 as select a.id, a.date, a.eom, a.me, a.shares, a.adjfct, a.prc, a.ret, a.ret_local, a.&ret_var. as ret_x, a.div_tot, a.div_cash, a.div_spc, a.dolvol, a.ret_lag_dif, (a.ret_local = 0) as ret_zero, a.ret_exc, b.mkt_vw_exc /* Currently, Excess return is in USD because we lack RF for most markets */ from &data. as a left join &market_ret. 
as b on a.excntry=b.excntry and a.eom=b.eom order by a.id, a.eom; quit; * Ensure that there is a lag of 1 month between each obs; proc sql; create table __stock_coverage as select id, min(eom) as start_date, max(eom) as end_date from __monthly_chars1 group by id; quit; %expand(data=__stock_coverage, out=__full_range, id_vars=id, start_date=start_date, end_date=end_date, freq='month', new_date_name=eom); proc sql; create table __monthly_chars2 as select a.id, a.eom, missing(b.id) as obs_miss, b.me, b.shares, b.adjfct, b.prc, b.ret, b.ret_local, b.ret_x, b.ret_lag_dif, b.div_tot, b.div_cash, b.div_spc, b.dolvol, b.ret_zero, b.ret_exc, b.mkt_vw_exc from __full_range as a left join __monthly_chars1 as b on a.id=b.id and a.eom=b.eom order by id, eom; quit; * Cummulative Return Index; data __monthly_chars3; set __monthly_chars2; by id; retain ri_x; /* Local or USD depending on &local_currency.*/ retain ri; /* USD */ retain count; if first.id then do; ri_x = sum(1, ret_x); /* Most will have missing return for the first observation. 
In that case this evaluates to 1*/ ri = sum(1, ret); count = 1; end; else do; ri_x = ri_x*sum(1, ret_x); /* By using sum instead of 1+ret missing returns are set to 0 */ ri = ri*sum(1, ret); count = count+1; end; run; /* Set non-standard returns to missing */ data __monthly_chars4; set __monthly_chars3; ret_miss = missing(ret_x) or ret_lag_dif^=1; /* We set returns with more than one month between price observations to missing (only impact Compustat data) */ if ret_miss = 1 then do; ret_x = .; ret=.; ret_local=.; ret_exc =.; mkt_vw_exc = .; end; drop obs_miss ret_zero ret_lag_dif; run; /* Create variables */ proc sort nodup data=__monthly_chars4; by id eom; run; %macro temp(); data __monthly_chars5; set __monthly_chars4; by id eom; /* Market Equity */ market_equity = me; /* Dividend to Price */ %let div_range = 1 3 6 12; /* 24, 36*/ %do i=1 %to %sysfunc(countw(&div_range.)); %let n = %scan(&div_range., &i.); div_sum = %apply_to_lastn(x=div_tot*shares, _n=&n., func=sum); div&n.m_me = div_sum/me; if count < &n. then div&n.m_me = .; drop div_sum; %end; /* Special Dividends */ %let div_spc_range = 1 12; %do i=1 %to %sysfunc(countw(&div_spc_range.)); %let n = %scan(&div_spc_range., &i.); div_spc_sum = %apply_to_lastn(x=div_spc*shares, _n=&n., func=sum); divspc&n.m_me = div_spc_sum/me; if count < &n. then divspc&n.m_me = .; drop div_spc_sum; %end; /* Change in Shares Outstanding (Market Based Proxy for Net Share Issuance)*/ %let chcsho_lags = 1 3 6 12; %do i=1 %to %sysfunc(countw(&chcsho_lags.)); %let chcsho_lag = %scan(&chcsho_lags.,&i.); chcsho_&chcsho_lag.m = (shares*adjfct)/lag&chcsho_lag.(shares*adjfct)-1; if count <= &chcsho_lag. then chcsho_&chcsho_lag.m=.; %end; /* Net Equity Payout (Market based stock buyback+dividend-stock issuance)*/ %let eqnpo_lags = 1 3 6 12; %do i=1 %to %sysfunc(countw(&eqnpo_lags.)); %let eqnpo_lag = %scan(&eqnpo_lags.,&i.); eqnpo_&eqnpo_lag.m = log(ri/lag&eqnpo_lag.(ri))-log(me/lag&eqnpo_lag.(me)); if count <= &eqnpo_lag. 
then eqnpo_&eqnpo_lag.m=.; %end; /* Momentum/Reversal */ %let from_lags = 1 2 3 3 6 6 9 9 12 12 12 18 24 24 36 36 48 48 60 60 60; %let to_lags = 0 0 0 1 0 1 0 1 0 1 7 1 1 12 1 12 12 1 1 12 36; %do j=1 %to %sysfunc(countw(&from_lags.)); %let from = %scan(&from_lags., &j.); %let to = %scan(&to_lags., &j.); ret_&from._&to. = lag&to.(ri_x)/lag&from.(ri_x)-1; if count <= &from. or missing(lag&to.(ret_x)) then /* Require the last return observation to be non-missing */ ret_&from._&to.=.; %end; /* Seasonality: Heston and Sadka (2008) */ %seasonality(start_year=1, end_year=1); %seasonality(start_year=2, end_year=5); %seasonality(start_year=6, end_year=10); %seasonality(start_year=11, end_year=15); %seasonality(start_year=16, end_year=20); /* Drop Uneccesary Variables */ drop me shares adjfct shares adjfct prc ret ret_local ret_x div_tot div_cash div_spc dolvol ret_exc mkt_vw_exc ret_miss ri_x ri count; run; %mend; %temp(); proc sort data=__monthly_chars5 out=&out.; by id eom; run; proc delete data=__stock_coverage __full_range __monthly_chars1 __monthly_chars2 __monthly_chars3 __monthly_chars4 __monthly_chars5; run; %mend; /* Calculate CAPM beta over a rolling window */ %macro market_beta(out=, data=, fcts=, __n =, __min=); proc sql; create table __msf1 as select a.id, a.eom, a.ret_exc, a.ret_lag_dif, b.mktrf from &data. as a left join &fcts. as b on a.excntry=b.excntry and a.eom=b.eom where a.ret_local^=0 and not missing(a.ret_exc) and a.ret_lag_dif=1 and not missing(b.mktrf); quit; %winsorize_own(inset=__msf1, outset=__msf2, sortvar=eom, vars=ret_exc, perc_low=0.1, perc_high=99.9); /* Winsorize returns at 0.1% and 99.9% */ proc sort data=__msf2; by id eom; run; proc sql; create table month_ends as select distinct eom from __msf2 order by eom; quit; * Divide data into __n groups; proc sql; create table dates_apply as select *, mod(monotonic(), &__n.) as grp from month_ends; quit; * Helper macro: If first group, save &new. as &base. otherwise, append &new. 
to &base.; %macro save_or_append(base=, new=); %if &__grp. = 0 %then %do; data &base.; set &new.; run; %end; %else %do; proc append base=&base. data=&new.; run; %end; %mend; %do __grp=0 %to %eval(&__n. - 1); %put ############### GROUP %eval(&__grp.+1) out of &__n. ###############; * Prepare data; proc sql; create table calc_dates as select a.eom, b.eom as calc_date from dates_apply as a left join dates_apply(where=(grp = &__grp.)) as b on a.eom > intnx("month", b.eom, -&__n., "e") and a.eom <= b.eom; quit; proc sql; create table calc_data as select a.*, b.calc_date from __msf2 as a left join calc_dates as b on a.eom = b.eom where not missing(b.calc_date) group by a.id, b.calc_date having count(*) >= &__min. order by a.id, b.calc_date; quit; proc reg data=calc_data outest=__capm1 edf NOPRINT; by id calc_date; model ret_exc=mktrf; run; proc sql; create table __capm2 as select id, calc_date as eom, mktrf as beta_&__n.m, sqrt(_rmse_**2 * _edf_ / (_edf_ + 1)) as ivol_capm_&__n.m from __capm1 where (_edf_ + 2) >= &__min.; quit; %save_or_append(base=op_capm, new=__capm2); %end; proc sort data=op_capm out=&out. nodup; by id eom; run; proc delete data=op_capm; run; %mend; /* MACRO: RESIDUAL MOMENTUM -------------- - Rolling regressions over &__n. months. used to calculate residual momentum. - Currently I have only implemented FF3 but I could easily extend it to CAPM and HXZ4 - Residual momentum is typically calculated over a shorter horizon than that used to estimate parameters. The number of months to include return data from, is indicated by &incl. The number of months to skip within that period, is indicated by &skip. Both incl and skip can be list, but they need to be of the same length, */ %macro residual_momentum(out=, data=, fcts=, type=, __n =, __min=, incl=, skip=); /* first_lag and last_lag can be a list but must have equal length. 
Type in (market, ff3, hxz4) */ proc sql; create table __msf1 as select a.id, a.eom, a.ret_exc, a.ret_lag_dif, b.mktrf, b.hml, b.smb_ff, b.roe, b.inv, b.smb_hxz from &data. as a left join &fcts. as b on a.excntry=b.excntry and a.eom=b.eom where a.ret_local^=0 and not missing(a.ret_exc) and not missing(b.mktrf) and ret_lag_dif=1; quit; %winsorize_own(inset=__msf1, outset=__msf2, sortvar=eom, vars=ret_exc, perc_low=0.1, perc_high=99.9); /* Winsorize returns at 0.1% and 99.9% */ proc sort data=__msf2; by id eom; run; proc sql; create table month_ends as select distinct eom from __msf2 order by eom; quit; * Divide data into __n groups; proc sql; create table dates_apply as select *, mod(monotonic(), &__n.) as grp from month_ends; quit; * Helper macro: If first group, save &new. as &base. otherwise, append &new. to &base.; %macro save_or_append(base=, new=); %if &__grp. = 0 %then %do; data &base.; set &new.; run; %end; %else %do; proc append base=&base. data=&new.; run; %end; %mend; %do __grp=0 %to %eval(&__n. - 1); %put ############### GROUP %eval(&__grp.+1) out of &__n. ###############; * Prepare data; proc sql; create table calc_dates as select a.eom, b.eom as calc_date from dates_apply as a left join dates_apply(where=(grp = &__grp.)) as b on a.eom > intnx("month", b.eom, -&__n., "e") and a.eom <= b.eom; quit; proc sql; create table calc_data as select a.*, b.calc_date from __msf2 as a left join calc_dates as b on a.eom = b.eom where not missing(b.calc_date) group by a.id, b.calc_date having count(*) >= &__min. 
order by a.id, b.calc_date; quit; * Fama and French (1993) 3 factor model; %if %sysfunc(find(&type., ff3)) >= 1 %then %do; proc reg data=calc_data(where=(not missing(hml) and not missing(smb_ff))) NOPRINT; by id calc_date; model ret_exc=mktrf smb_ff hml; output out=__ff3_res1 residual=res; run; %do __i=1 %to %nwords(&incl.); %let __in = %scan(&incl., &__i., %str(' ')); %let __sk = %scan(&skip., &__i., %str(' ')); proc sql; create table __ff3_res2 as select *, (eom > intnx("month", calc_date, -&__in., "e") and eom <= intnx("month", calc_date, -&__sk., "e")) as incl /* Incl=1 --> obs is included in momentum calculation */ from __ff3_res1 group by id, calc_date having count(res) >= &__min. order by id, calc_date, eom; create table __ff3_res3 as select id, calc_date as eom, mean(res) / std(res) as resff3_&__in._&__sk. from __ff3_res2 where incl = 1 group by id, calc_date; quit; %save_or_append(base=op_&__in._&__sk., new=__ff3_res3); %end; %end; %end; /* Output */ %do __i=1 %to %nwords(&incl.); %let __in = %scan(&incl., &__i., %str(' ')); %let __sk = %scan(&skip., &__i., %str(' ')); proc sort data=op_&__in._&__sk. out=&out._&__in._&__sk. nodup; by id eom; run; proc delete data=op_&__in._&__sk.; run; %end; %mend; /* MACRO FOR DAILY CHARS ---------------------------------------------- */ %macro prepare_daily(data=, fcts=); /* Start timer */ %let __prep_start = %sysfunc(datetime()); /* Prepare stock level data */ proc sql; create table dsf1 as select a.excntry, a.id, a.date, a.eom, a.prc / a.adjfct as prc_adj, a.ret, a.ret_exc, a.dolvol as dolvol_d, a.shares, a.tvol, b.mktrf, b.hml, b.smb_ff, b.roe, b.inv, b.smb_hxz, a.ret_lag_dif, a.bidask, sum(a.ret_local = 0) as zero_obs /* Some firms have almost inclusively zero returns. These should be excluded */ from &data. as a left join &fcts. 
as b on a.excntry = b.excntry and a.date = b.date where not missing(b.mktrf) /* not missing mktrf ensures that we look at trading days*/ group by a.id, a.eom; update dsf1 set ret_exc = ., ret = . where ret_lag_dif > 14; /* Only used returns based on prices that are not more than two weeks old */ alter table dsf1 drop ret_lag_dif, bidask; quit; proc sort data=dsf1; by id date; run; * Create lead/lagged market returns (For dimson beta); proc sql; create table mkt_lead_lag1 as select excntry, date, intnx('month',date,0,'E') as eom format=YYMMDDN8., mktrf from &fcts. order by excntry, date desc; quit; data mkt_lead_lag2; set mkt_lead_lag1; mktrf_ld1 = lag(mktrf); if excntry ^= lag(excntry) or eom ^= lag(eom) then mktrf_ld1 = .; /* Eom condition is to avoid look-ahead bias */ run; proc sort data=mkt_lead_lag2 out=mkt_lead_lag3; by excntry date; run; data mkt_lead_lag4; set mkt_lead_lag3; mktrf_lg1 = lag(mktrf); if excntry ^= lag(excntry) then mktrf_lg1 = .; run; * Overlapping returns used to calculate correlation; data corr_data; set dsf1; ret_exc_3l = ret_exc + lag(ret_exc) + lag2(ret_exc); mkt_exc_3l = mktrf + lag(mktrf) + lag2(mktrf); if id ^= lag2(id) then do; ret_exc_3l = .; mkt_exc_3l = .; end; keep id eom zero_obs ret_exc_3l mkt_exc_3l; run; *Unique Month Ends; proc sql; create table month_ends as select distinct eom from dsf1 order by eom; quit; /* Stop timer */ data _null_; dur = datetime() - &__prep_start; put 30*'-' / ' PREPARING DAILY DATA TOOK:' dur time13.2 / 30*'-'; run; %mend; * MACRO: ROLL APPLY DAILY ------------------------- - Apply &__stats. functions to rolling windows of data. - The idea is to iteratively apply the functions to &__n. different splits of the data. The output of each function is a stock id-eom pair plus the calculated characteristics - The currently implemented &__stats. 
are: * rvol, rmax, skew, capm, capm_ext, ff3, hxz4, dimsonbeta, downbeta, zero_trades, turnover, dolvol, ami, prc_to_high, mktcorr, mktvol - Arguments: * OUT: Output dataset in a long format with all the requested characteristics * ...; %macro roll_apply_daily(out=, __n=, __min=, fcts=, __month_ends=, sfx =,__stats=); /* Create stats over rolling __n months. stats in (rvol, rmax, skew, capm, capm_ext, ff3, hxz4, dimsonbeta, downbeta, zero_trades, turnover, dolvol, ami, prc_to_high, mktcorr, mktvol) */ /* Start timer */ %let __roll_start = %sysfunc(datetime()); * Divide data into __n groups; proc sql; create table dates_apply as select *, mod(monotonic(), &__n.) as grp from &__month_ends.; quit; * Helper: If first group, save &new. as &base. otherwise, append &new. to &base.; %macro save_or_append(base=, new=); %if &__grp. = 0 %then %do; data &base.; set &new.; run; %end; %else %do; proc append base=&base. data=&new.; run; %end; %mend; * Drop unneccesary columns for faster join; data __input; set dsf1; run; %if %sysfunc(find(&__stats., ff3)) = 0 %then %do; proc sql; alter table __input drop hml, smb_ff; quit; %end; %if %sysfunc(find(&__stats., hxz4)) = 0 %then %do; proc sql; alter table __input drop roe, inv, smb_hxz; quit; %end; %if %sysfunc(find(&__stats., turnover)) = 0 and %sysfunc(find(&__stats., ami)) = 0 and %sysfunc(find(&__stats., zero_trades)) = 0 and %sysfunc(find(&__stats., dolvol)) = 0%then %do; proc sql; alter table __input drop dolvol_d, shares, tvol; quit; %end; %if %sysfunc(find(&__stats., prc_to_high)) = 0 %then %do; proc sql; alter table __input drop prc_adj; quit; %end; * Apply __stats to each group; %do __grp=0 %to %eval(&__n. 
- 1); * Prepare data; proc sql; create table calc_dates as select a.eom, b.eom as calc_date from dates_apply as a left join dates_apply(where=(grp = &__grp.)) as b on a.eom > intnx("month", b.eom, -&__n., "e") and a.eom <= b.eom; quit; * Not neccesary if mktcorr is the only stat; %if %nwords(&__stats.)>1 or %sysfunc(find(&__stats., mktcorr))=0 %then %do; proc sql; /* Used for volume variables */ create table calc_data_raw as select a.*, b.calc_date from __input as a left join calc_dates as b on a.eom = b.eom where not missing(b.calc_date) order by a.id, b.calc_date; /* Used for return variables */ create table calc_data_screen as select * from calc_data_raw where not missing(ret_exc) and zero_obs < 10 /* We exclude stock-months with 10 or more zero returns */ group by id, calc_date having count(ret_exc) >= &__min.; quit; %end; * Return Volatility; %if %sysfunc(find(&__stats., rvol)) >= 1 %then %do; proc sql; create table __rvol as select id, calc_date as eom, std(ret_exc) as rvol&sfx. from calc_data_screen group by id, calc_date having count(ret_exc) >= &__min.; quit; %save_or_append(base=op_rvol, new=__rvol); %end; * Maximum Return; %if %sysfunc(find(&__stats., rmax)) >= 1 %then %do; proc rank data= calc_data_screen out = __rmax1 descending; by id calc_date; var ret; ranks ret_rank; run; proc sql; create table __rmax2 as select id, calc_date as eom, mean(ret) as rmax5&sfx., max(ret) as rmax1&sfx. from __rmax1 where ret_rank<=5 group by id, calc_date; quit; %save_or_append(base=op_rmax, new=__rmax2); %end; * Return Skewness; %if %sysfunc(find(&__stats., skew)) >= 1 %then %do; proc means data=calc_data_screen skewness noprint; by id calc_date; var ret_exc; output out = __skew1 skewness = rskew&sfx.; run; proc sql; create table __skew2 as select id, calc_date as eom, rskew&sfx. 
from __skew1 where _freq_ >= &__min.; quit; %save_or_append(base=op_skew, new=__skew2); %end; * Price-to-high; %if %sysfunc(find(&__stats., prc_to_high)) >= 1 %then %do; proc sql; create table __prc_high as select id, calc_date as eom, prc_adj / max(prc_adj) as prc_highprc&sfx. from calc_data_screen group by id, calc_date having date = max(date) and count(prc_adj) >= &__min.; quit; %save_or_append(base=op_prc_high, new=__prc_high); %end; * Amihud (2002); %if %sysfunc(find(&__stats., ami)) >= 1 %then %do; proc sql; create table __ami as select id, calc_date as eom, mean(abs(ret) / dolvol_d) *1e6 as ami&sfx. from calc_data_screen group by id, calc_date having count(dolvol_d) >= &__min.; quit; %save_or_append(base=op_ami, new=__ami); %end; * CAPM regression (beta + ivol); %if %sysfunc(find(&__stats., capm)) >= 1 and %sysfunc(find(&__stats., capm_ext)) = 0 %then %do; proc reg data=calc_data_screen outest=__capm1 edf NOPRINT; by id calc_date; model ret_exc=mktrf; run; proc sql; create table __capm2 as select id, calc_date as eom, mktrf as beta&sfx., sqrt(_rmse_**2 * _edf_ / (_edf_ + 1)) as ivol_capm&sfx. from __capm1 where (_edf_ + 2) >= &__min.; quit; %save_or_append(base=op_capm, new=__capm2); %end; * CAPM regression extended (beta + ivol + iskew + coskewness); %if %sysfunc(find(&__stats., capm_ext)) >= 1 %then %do; proc reg data=calc_data_screen outest=__capm_ext1 edf NOPRINT; by id calc_date; model ret_exc=mktrf; output out=__capm_ext_res residual=res; /* Including the output statement increases the time by a factor of 3. It's neccesary to compute skewness */ run; proc sql; create table __capm_ext2 as select id, calc_date as eom, mktrf as beta&sfx., sqrt(_rmse_**2 * _edf_ / (_edf_ + 1)) as ivol_capm&sfx. 
from __capm_ext1 where (_edf_ + 2) >= &__min.; quit; * Idiosyncratic skewness; proc means data=__capm_ext_res skewness noprint; by id calc_date; var res; output out = __capm_ext_skew(where=(_freq_ >= &__min.)) skewness = iskew_capm&sfx.; run; * Coskewness; proc sql; create table __capm_ext_coskew1 as select id, calc_date, res, mktrf - mean(mktrf) as mktrf_dm from __capm_ext_res group by id, calc_date; create table __capm_ext_coskew2 as select id, calc_date, mean(res * mktrf_dm**2) / (sqrt(mean(res**2)) * mean(mktrf_dm**2) ) as coskew&sfx. from __capm_ext_coskew1 group by id, calc_date having count(res) >= &__min.; quit; proc sql; create table __capm_ext3 as select a.*, b.iskew_capm&sfx., c.coskew&sfx. from __capm_ext2 as a left join __capm_ext_skew as b on a.id=b.id and a.eom=b.calc_date left join __capm_ext_coskew2 as c on a.id=c.id and a.eom=c.calc_date; quit; %save_or_append(base=op_capm_ext, new=__capm_ext3); %end; * Fama and French (1993) 3 factor model; %if %sysfunc(find(&__stats., ff3)) >= 1 %then %do; proc reg data=calc_data_screen(where=(not missing(hml) and not missing(smb_ff))) outest=__ff31 edf NOPRINT; by id calc_date; model ret_exc=mktrf smb_ff hml; output out=__ff3_res residual=res; /* Including the output statement increases the time by a factor of 3. It's neccesary to compute skewness */ run; proc sql; create table __ff32 as select id, calc_date as eom, sqrt(_rmse_**2 * _edf_ / (_edf_ + 1)) as ivol_ff3&sfx. from __ff31 where (_edf_ + 4) >= &__min.; quit; * Idiosyncratic skewness; proc means data=__ff3_res skewness noprint; by id calc_date; var res; output out = __ff3_skew(where=(_freq_ >= &__min.)) skewness = iskew_ff3&sfx.; run; proc sql; create table __ff33 as select a.*, b.iskew_ff3&sfx. 
from __ff32 as a left join __ff3_skew as b on a.id=b.id and a.eom=b.calc_date; quit; %save_or_append(base=op_ff3, new=__ff33); %end; * Hou, Xue and Zhang (2015) 4 factor model; %if %sysfunc(find(&__stats., hxz4)) >= 1 %then %do; proc reg data=calc_data_screen(where=(not missing(roe) and not missing(inv) and not missing(smb_hxz))) outest=__hxz41 edf NOPRINT; by id calc_date; model ret_exc=mktrf smb_hxz roe inv; output out=__hxz4_res residual=res; /* Including the output statement increases the time by a factor of 3. It's neccesary to compute skewness */ run; proc sql; create table __hxz42 as select id, calc_date as eom, sqrt(_rmse_**2 * _edf_ / (_edf_ + 1)) as ivol_hxz4&sfx. from __hxz41 where (_edf_ + 5) >= &__min.; quit; * Idiosyncratic skewness; proc means data=__hxz4_res skewness noprint; by id calc_date; var res; output out = __hxz4_skew(where=(_freq_ >= &__min.)) skewness = iskew_hxz4&sfx.; run; proc sql; create table __hxz43 as select a.*, b.iskew_hxz4&sfx. from __hxz42 as a left join __hxz4_skew as b on a.id=b.id and a.eom=b.calc_date; quit; %save_or_append(base=op_hxz4, new=__hxz43); %end; * Dimson beta; %if %sysfunc(find(&__stats., dimsonbeta)) >= 1 %then %do; proc sql; create table __dimson1 as select a.excntry, a.id, a.date, a.eom, a.ret_exc, a.mktrf, b.mktrf_lg1, b.mktrf_ld1 from calc_data_screen as a left join mkt_lead_lag4 as b on a.excntry = b.excntry and a.date = b.date where not missing(b.mktrf_lg1) and not missing(b.mktrf_ld1); create table __dimson2 as select * from __dimson1 group by id, eom having count(*) >= (&__min. - 1); /* Minus one to reflect the fact that there is one less available observation due to the need of avoiding lookahead bias. */ quit; proc reg data=__dimson2 outest=__dimson3 edf NOPRINT; by id eom; model ret_exc=mktrf mktrf_lg1 mktrf_ld1; run; data __dimson4; set __dimson3; beta_dimson&sfx. 
= mktrf + mktrf_lg1 + mktrf_ld1; keep id eom beta_dimson&sfx.; run; %save_or_append(base=op_dimson, new=__dimson4); %end; * Downside beta; %if %sysfunc(find(&__stats., downbeta)) >= 1 %then %do; proc reg data=calc_data_screen(where=(mktrf < 0)) outest=__downbeta1 edf NOPRINT; by id calc_date; model ret_exc=mktrf; run; proc sql; create table __downbeta2 as select id, calc_date as eom, mktrf as betadown&sfx. from __downbeta1 where (_edf_ + 2) >= (&__min. / 2); /* Use convention that we require half as many obs for downside beta */ quit; %save_or_append(base=op_downbeta, new=__downbeta2); %end; * Number of zero trades with turnover as tiebreaker; %if %sysfunc(find(&__stats., zero_trades)) >= 1 %then %do; proc sql; create table __zero_trades1 as select id, calc_date as eom, mean(tvol=0) * 21 as zero_trades, mean(tvol / (shares * 1e6)) as turnover from calc_data_raw where not missing(tvol) group by id, calc_date having count(tvol) >= &__min. order by eom; quit; proc rank data=__zero_trades1(where=(not missing(zero_trades) and not missing(turnover))) out = __zero_trades2 descending ties=mean f; /* f means that we use fractional ranks i.e. between 0 and 1*/ by eom; var turnover; ranks rank_turnover; run; proc sql; create table __zero_trades3 as select id, eom, zero_trades + rank_turnover / 100 as zero_trades&sfx. /* Divide by 100 to ensure that turnover only acts as a tie breaker (1/365*21=0.0833)*/ from __zero_trades2; quit; %save_or_append(base=op_zero_trades, new=__zero_trades3); %end; * Turnover; %if %sysfunc(find(&__stats., turnover)) >= 1 %then %do; proc sql; create table __turnover1 as select id, date, calc_date, tvol / (shares * 1e6) as turnover_d from calc_data_raw; create table __turnover2 as select id, calc_date as eom, mean(turnover_d) as turnover&sfx., std(turnover_d) / (calculated turnover&sfx.) as turnover_var&sfx. 
from __turnover1 group by id, calc_date having count(turnover_d) >= &__min.; quit; %save_or_append(base=op_turnover, new=__turnover2); %end; * Dollar Volume; %if %sysfunc(find(&__stats., dolvol)) >= 1 %then %do; proc sql; create table __dolvol as select id, calc_date as eom, mean(dolvol_d) as dolvol&sfx., std(dolvol_d) / (calculated dolvol&sfx.) as dolvol_var&sfx. from calc_data_raw group by id, calc_date having count(dolvol_d) >= &__min.; quit; %save_or_append(base=op_dolvol, new=__dolvol); %end; * Correlation to Market; %if %sysfunc(find(&__stats., mktcorr)) >= 1 %then %do; proc sql; create table __corr_data1 as select a.*, b.calc_date from corr_data as a left join calc_dates as b on a.eom = b.eom where not missing(b.calc_date) and not missing(ret_exc_3l) and zero_obs < 10 order by a.id, b.calc_date; create table __corr_data2 as select * from __corr_data1 group by id, calc_date having count(ret_exc_3l) >= &__min. and count(mkt_exc_3l) >= &__min.; quit; proc corr data = __corr_data2 outp=__corr1 noprint nomiss ; by id calc_date; var ret_exc_3l mkt_exc_3l; run; proc sql; create table __corr2 as select id, calc_date as eom, ret_exc_3l as corr&sfx. from __corr1 where _type_='CORR' and _name_ = 'mkt_exc_3l'; quit; %save_or_append(base=op_corr, new=__corr2); %end; * Market Volatility (separately for each stock); %if %sysfunc(find(&__stats., mktvol)) >= 1 %then %do; proc sql; create table __mktvol as select id, calc_date as eom, std(mktrf) as __mktvol&sfx. 
from calc_data_screen
    group by id, calc_date
    having count(ret_exc) >= &__min.;
quit;
%save_or_append(base=op_mktvol, new=__mktvol);
%end;
* NAME;
%if %sysfunc(find(&__stats., NAME)) >= 1 %then %do;
    %save_or_append(base=, new=);
%end;
/* Closes the enclosing %do of the roll-apply macro (its %macro statement is earlier in this file) */
%end;
* Make all observations into a dataset by transposing and appending;
/* Collect the names of all WORK datasets created above (prefix "op_") into one macro-variable list */
proc sql noprint;
    select memname into :op_datasets separated by " "
    from dictionary.tables
    where lowcase(libname)="work" and prxmatch("/^op\_/i", memname) > 0;
quit;
* Initialize dataset to append to;
/* Empty shell with the target schema (id, eom, stat, value); stop; prevents any rows from being output */
data &out.;
    format id 9.0 eom YYMMDDN8. stat $char20. value 16.8;
    stop;
run;
/* For each op_* dataset: sort, transpose to long (id, eom, stat, value), append to &out., then delete */
%do k=1 %to %nwords(&op_datasets.);
    %let __dt = %scan(&op_datasets., &k., %str(' '));
    proc sort data=&__dt.; by id eom; run;
    proc transpose data=&__dt. out=__op(rename=(col1=value)) name=stat ;
        by id eom;
    run;
    proc append base=&out. data=__op force; run;
    proc delete data=&__dt.; run;
%end;
/* Stop timer */
data _null_;
    dur = datetime() - &__roll_start;
    put 30*'-' / ' DAILY ROLL APPLY TOOK:' dur time13.2 / 30*'-';
run;
%mend;
/* MACRO: FINISH DAILY CHARS*/
/* Combine the rolling-window characteristic datasets (21d/126d/252d/1260d, long format in
   scratch.roll_*) with the Corwin-Schultz bid-ask estimates into one wide dataset of daily
   characteristics per (id, eom), adds two derived characteristics, and saves to &out. */
%macro finish_daily_chars(out=);
    * Make bidask into a long format;
    proc transpose data=scratch.corwin_schultz out=bidask(rename=(col1=value)) name=stat;
        by id eom;
    run;
    * Combine all roll chars;
    data daily_chars1; set scratch.roll_21d; run;
    proc append base=daily_chars1 data=scratch.roll_126d; run;
    proc append base=daily_chars1 data=scratch.roll_252d; run;
    proc append base=daily_chars1 data=scratch.roll_1260d; run;
    proc append base=daily_chars1 data=bidask force; run;
    proc sort data=daily_chars1 nodup; by id eom; run;
    /* Long -> wide: one column per stat name */
    proc transpose data = daily_chars1 out= daily_chars2(drop=_name_);
        by id eom;
        id stat;
        var value;
    run;
    /* Derived characteristics: betabab_1260d combines the 1260d market correlation with the
       stock-to-market volatility ratio (betting-against-beta style beta — presumably; built
       from corr_1260d * rvol_252d / __mktvol_252d), and rmax5_rvol_21d scales the max-return
       measure by volatility. __mktvol_252d is only an input and is dropped afterwards. */
    proc sql;
        create table daily_chars3 as
        select *, corr_1260d * rvol_252d/__mktvol_252d as betabab_1260d,
               rmax5_21d / rvol_252d as rmax5_rvol_21d
        from daily_chars2;
        alter table daily_chars3 drop __mktvol_252d;
    quit;
    proc sort data=daily_chars3 out=&out.; by id eom; run;
%mend;
================================================
FILE: GlobalFactors/portfolios.R ================================================ library(lubridate) library(tidyverse) library(data.table) # How To -------------------- # Paths # - data_path: Set to path with global characteristics data and if daily_pf==T should also contain a folder with daily stock returns. # - output_path: Set to desired output folder. # - legacy_path: Set to folder if you want to maintain legacy version. If not, set to NULL. # Countries # - countries: Choose the countries where portfolio returns are created. Default: All countries in data_path/Characteristics # Characteristics # - chars: Characteristics to create portfolios from. Can be any column from the global characteristics dataset. # Portfolio Settings # - settings: Choose how to create portfolios. For more information, see description for the portfolios() function # User Input ----------------------- # Paths data_path <- "../../Data" output_path <- "../../PaperFactors" legacy_path <- "../../Legacy" # Countries countries <- list.files(path = paste0(data_path, "/Characteristics")) %>% str_remove(".csv") # Chars chars <- c( "age", "aliq_at", "aliq_mat", "ami_126d", "at_be", "at_gr1", "at_me", "at_turnover", "be_gr1a", "be_me", "beta_60m", "beta_dimson_21d", "betabab_1260d", "betadown_252d", "bev_mev", "bidaskhl_21d", "capex_abn", "capx_gr1", "capx_gr2", "capx_gr3", "cash_at", "chcsho_12m", "coa_gr1a", "col_gr1a", "cop_at", "cop_atl1", "corr_1260d", "coskew_21d", "cowc_gr1a", "dbnetis_at", "debt_gr3", "debt_me", "dgp_dsale", "div12m_me", "dolvol_126d", "dolvol_var_126d", "dsale_dinv", "dsale_drec", "dsale_dsga", "earnings_variability", "ebit_bev", "ebit_sale", "ebitda_mev", "emp_gr1", "eq_dur", "eqnetis_at", "eqnpo_12m", "eqnpo_me", "eqpo_me", "f_score", "fcf_me", "fnl_gr1a", "gp_at", "gp_atl1", "ival_me", "inv_gr1", "inv_gr1a", "iskew_capm_21d", "iskew_ff3_21d", "iskew_hxz4_21d", "ivol_capm_21d", "ivol_capm_252d", "ivol_ff3_21d", "ivol_hxz4_21d", "kz_index", "lnoa_gr1a", "lti_gr1a", 
"market_equity", "mispricing_mgmt", "mispricing_perf", "ncoa_gr1a", "ncol_gr1a", "netdebt_me", "netis_at", "nfna_gr1a", "ni_ar1", "ni_be", "ni_inc8q", "ni_ivol", "ni_me", "niq_at", "niq_at_chg1", "niq_be", "niq_be_chg1", "niq_su", "nncoa_gr1a", "noa_at", "noa_gr1a", "o_score", "oaccruals_at", "oaccruals_ni", "ocf_at", "ocf_at_chg1", "ocf_me", "ocfq_saleq_std", "op_at", "op_atl1", "ope_be", "ope_bel1", "opex_at", "pi_nix", "ppeinv_gr1a", "prc", "prc_highprc_252d", "qmj", "qmj_growth", "qmj_prof", "qmj_safety", "rd_me", "rd_sale", "rd5_at", "resff3_12_1", "resff3_6_1", "ret_1_0", "ret_12_1", "ret_12_7", "ret_3_1", "ret_6_1", "ret_60_12", "ret_9_1", "rmax1_21d", "rmax5_21d", "rmax5_rvol_21d", "rskew_21d", "rvol_21d", "sale_bev", "sale_emp_gr1", "sale_gr1", "sale_gr3", "sale_me", "saleq_gr1", "saleq_su", "seas_1_1an", "seas_1_1na", "seas_11_15an", "seas_11_15na", "seas_16_20an", "seas_16_20na", "seas_2_5an", "seas_2_5na", "seas_6_10an", "seas_6_10na", "sti_gr1a", "taccruals_at", "taccruals_ni", "tangibility", "tax_gr1a", "turnover_126d", "turnover_var_126d", "z_score", "zero_trades_126d", "zero_trades_21d", "zero_trades_252d" ) # Portfolio settings settings <- list( end_date = as.Date("2023-12-31"), pfs = 3, source = c("CRSP", "COMPUSTAT"), wins_ret = T, bps = "non_mc", bp_min_n = 10, cmp = list( us = T, int = F ), signals = list( us = F, int = F, standardize = T, weight = "vw_cap" ), regional_pfs = list( ret_type = "vw_cap", # Type of return to use for regional factors country_excl = c("ZWE", "VEN"), # Countries are excluded due to data issues country_weights = "market_cap", # How to weight countries? 
In ("market_cap", "stocks", "ew") stocks_min = 5, # Minimum amount of stocks in each side of the portfolios months_min = 5 * 12, # Minimum amount of observations a factor needs to be included countries_min = 3 # Minimum number of countries necessary in a regional portfolio ), daily_pf = T, ind_pf = T ) # Portfolio Function ------------- portfolios <- function( data_path, excntry, chars, source = c("CRSP", "COMPUSTAT"), # Use data from "CRSP", "Compustat" or both: c("CRSP", "COMPUSTAT") wins_ret = T, # Should Compustat returns be winsorized at the 0.1% and 99.9% of CRSP returns? pfs, # Number of portfolios bps, # What should breakpoints be based on? Non-Microcap stocks ("non_mc") or NYSE stocks "nyse" bp_min_n, # Minimum number of stocks used for breakpoints cmp = F, # Create characteristics managed size portfolios? signals = F, # Create portfolio signals? signals_standardize = F, # Map chars to [-0.5, +0.5]?, signals_w = "vw_cap", # Weighting for signals: in c("ew", "vw", "vw_cap") nyse_size_cutoffs, # Data frame with NYSE size breakpoints daily_pf= F, # Should daily return be estimated ind_pf = F, # Should industry portfolio returns be estimated ret_cutoffs = NULL, # Data frame for monthly winsorization. Neccesary when wins_ret=T ret_cutoffs_daily = NULL # Data frame for daily winsorization. 
Neccesary when wins_ret=T and daily_pf=T ) { # Characteristic Data data <- fread(paste0(data_path, "/Characteristics/", excntry, ".csv"), select = c("id", "eom", "source_crsp", "comp_exchg", "crsp_exchcd", "size_grp", "ret_exc", "ret_exc_lead1m", "me", "gics", "ff49", chars), colClasses = c("eom"="character")) data[, eom := eom %>% lubridate::fast_strptime(format = "%Y%m%d") %>% as.Date()] # ME CAP data <- nyse_size_cutoffs[, .(eom, nyse_p80)][data, on = "eom"] data[, me_cap := pmin(me, nyse_p80)][, nyse_p80 := NULL] # Screens if (length(source) == 1) { if (source == "CRSP") { data <- data[source_crsp == 1] } if (source == "COMPUSTAT") { data <- data[source_crsp == 0] } } data <- data[!is.na(size_grp) & !is.na(me) & !is.na(ret_exc_lead1m)] # The ret_exc_lead1m screen assumes that investor knew at the beginning of the month that the security would delist. # Daily Returns if (daily_pf) { daily <- fread(paste0(data_path, "/Daily Returns/", excntry, ".csv"), colClasses = c("date"="character"), select = c("id", "date", "ret_exc")); gc() daily[, date := date %>% lubridate::fast_strptime(format = "%Y%m%d") %>% as.Date()] daily[, eom_lag1 := floor_date(date, unit="month")-1] } # Winsorize Returns? 
if (wins_ret) {
  # Join CRSP-based monthly return cutoffs (0.1% / 99.9% percentiles) onto the data by eom.
  # ret_cutoffs carries eom_lag1 because the screen applies to the leaded return ret_exc_lead1m.
  data <- ret_cutoffs[, .("eom" = eom_lag1, "p001"=ret_exc_0_1, "p999"=ret_exc_99_9)][data, on = "eom"]
  # Winsorize only non-CRSP (Compustat, source_crsp == 0) observations at the CRSP tails
  data[source_crsp == 0 & ret_exc_lead1m > p999, ret_exc_lead1m := p999]
  data[source_crsp == 0 & ret_exc_lead1m < p001, ret_exc_lead1m := p001]
  data[, c("source_crsp", "p001", "p999") := NULL]
  if (daily_pf) {
    # Daily cutoffs are keyed by (year, month) rather than eom
    daily[, year := year(date)]
    daily[, month := month(date)]
    daily <- ret_cutoffs_daily[, .(year, month, "p001"=ret_exc_0_1, "p999"=ret_exc_99_9)][daily, on = .(year, month)]
    daily[id>99999 & ret_exc > p999, ret_exc := p999] # Only winsorize Compustat data, id for CRSP is 5 digits, Compustat is 9: source_crsp == 0
    daily[id>99999 & ret_exc < p001, ret_exc := p001]
    daily[, c("p001", "p999", "year", "month") := NULL]
  }
}
# Standardize to [-0.5, +0.5] interval (for signals)
if (signals_standardize & signals) {
  # Cross-sectional rank within each month (NAs kept as NA), then rescale so the
  # largest rank maps to +0.5 (and low values toward -0.5)
  data[, (chars) := lapply(.SD, function(x) frank(x, ties.method = "min", na.last = "keep")), .SDcols = chars, by = eom]
  data[, (chars) := lapply(.SD, as.numeric), .SDcols = chars]
  data[, (chars) := lapply(.SD, function(x) x / max(x, na.rm=T) - 0.5), .SDcols = chars, by = eom]
}
# Industry Portfolios
if (ind_pf) {
  # NOTE(review): excntry here is the function's scalar country argument, recycled
  # into a constant column (it is not among the columns read by fread above)
  ind_data <- data[!is.na(gics), .(eom, gics, excntry, ret_exc_lead1m, me, me_cap)]
  # Get first 2 digits of GICS code for industry groups
  ind_data[, gics := as.numeric(substr(ind_data$gics, 1, 2))]
  # Equal-, value- and capped-value-weighted industry returns per GICS group and month
  ind_gics <- ind_data[, .(
    n = .N,
    ret_ew = mean(ret_exc_lead1m),
    ret_vw = sum(ret_exc_lead1m * me) / sum(me),
    ret_vw_cap = sum(ret_exc_lead1m * me_cap) / sum(me_cap)
  ), by = .(gics, eom)][, excntry := str_to_upper(excntry)]
  # Lead month to match using leaded returns
  ind_gics[, eom := ceiling_date(eom+1, unit = "month")-1]
  ind_gics <- ind_gics[n >= bp_min_n]
  # Estimate industry portfolios by Fama-French portfolios for US data
  if (excntry == "usa"){
    ind_data <- data[!is.na(ff49), .(eom, ff49, ret_exc_lead1m, me, me_cap)]
    ind_ff49 <- ind_data[, .(
      n = .N,
      ret_ew = mean(ret_exc_lead1m),
      ret_vw = sum(ret_exc_lead1m * me) / sum(me),
      ret_vw_cap =
sum(ret_exc_lead1m * me_cap) / sum(me_cap) ), by = .(ff49, eom)][, excntry := str_to_upper(excntry)] ind_ff49[, eom := ceiling_date(eom+1, unit = "month")-1] ind_ff49 <- ind_ff49[n >= bp_min_n] } } # Prepare output list output <- list() # Apply Portfolio Function to Each Characteristic char_pfs <- chars %>% lapply(function(x) { op <- list() print(paste0(" " , x, ": ", match(x, chars), " out of ", length(chars))) data[, var := as.double(get(x))] # Unless we need to compute signals, limit size of data if(!signals) { sub <- data[!is.na(var), .(id, eom, var, size_grp, ret_exc_lead1m, me, me_cap, crsp_exchcd, comp_exchg)] } else { sub <- data[!is.na(var)] } # Portfolio Assignment if (bps == "nyse") { sub[, bp_stock := (crsp_exchcd == 1 & is.na(comp_exchg)) | (comp_exchg == 11 & is.na(crsp_exchcd))] } if (bps == "non_mc") { sub[, bp_stock := (size_grp %in% c("mega", "large", "small"))] } sub[, bp_n := sum(bp_stock), by = eom] sub <- sub[bp_n >= bp_min_n] # require at least 10 stocks for break points if (nrow(sub) != 0) { sub[, cdf := ecdf(var[bp_stock == T])(var), by = eom] sub[, min_cdf := min(cdf), by = eom] sub[cdf == min_cdf, cdf := 0.00000001] # To ensure that the lowest value is in portfolio 1 sub[, pf := ceiling(cdf*pfs), by = eom] sub[pf == 0, pf := 1] # Happens when non-bp stocks extend beyond bp stock range # Returns op$pf_returns <- sub[, .( characteristic = x, n = .N, signal = median(var), ret_ew = mean(ret_exc_lead1m), ret_vw = sum(ret_exc_lead1m * me) / sum(me), ret_vw_cap = sum(ret_exc_lead1m * me_cap) / sum(me_cap) ), by = .(pf, eom)] op$pf_returns[, eom := ceiling_date(eom+1, unit = "month")-1] # Reflect the fact that returns are leaded # Signals if (signals) { if (signals_w == "ew") { sub[, w := 1/.N, by = .(pf, eom)] } if (signals_w == "vw") { sub[, w := me / sum(me), .(pf, eom)] } if (signals_w == "vw_cap") { sub[, w := me_cap / sum(me_cap), .(pf, eom)] } sub[, (chars) := lapply(.SD, function(x) if_else(is.na(x), 0, x)), .SDcols = chars] # Set missing 
to median of 0 pf_signals <- sub[, lapply(.SD, function(x) sum(w * x)), .SDcols = chars, by = .(pf, eom)] pf_signals[, characteristic := x] pf_signals[, eom := ceiling_date(eom+1, unit = "month")-1] # Reflect the fact that returns are leaded op$signals <- pf_signals } # Daily Portfolios if (daily_pf) { # Keep weights constant throughout month weights <- sub[, .(id, w_ew = 1/.N, w_vw = me/sum(me), w_vw_cap = me_cap/sum(me_cap)), by = .(eom, pf)] daily_sub <- weights[daily, on = .(id, eom=eom_lag1)][!is.na(pf) & !is.na(ret_exc)] op$pf_daily <- daily_sub[, .( n = .N, ret_ew = sum(w_ew*ret_exc), ret_vw = sum(w_vw*ret_exc), ret_vw_cap = sum(w_vw_cap*ret_exc) ), by = .(pf, date)][, characteristic := x] } # Output return(op) } }) output$pf_returns <- char_pfs %>% lapply(function(x) x$pf_returns) %>% rbindlist() if (daily_pf) { output$pf_daily <- char_pfs %>% lapply(function(x) x$pf_daily) %>% rbindlist() } if (ind_pf) { output$gics_returns <- ind_gics if (excntry == "usa") { output$ff49_returns <- copy(ind_ff49) } } if (nrow(output$pf_returns) != 0) { output$pf_returns[, excntry := str_to_upper(excntry)] if (daily_pf) { output$pf_daily[, excntry := str_to_upper(unique(output$pf_returns[, excntry]))] } if (signals) { output$signals <- char_pfs %>% lapply(function(x) x$signals) %>% rbindlist() output$signals[, excntry := str_to_upper(excntry)] } } # Characteristic Managed Portfolios if (cmp) { output$cmp <- chars %>% lapply(function(x) { print(paste0(" CMP - " , x, ": ", match(x, chars), " out of ", length(chars))) data[, var := get(x)] sub <- data[!is.na(var), .(eom, var, size_grp, ret_exc_lead1m)] sub[, p_rank := frank(var, na.last=NA, ties.method = "average") / (.N + 1), by = .(size_grp, eom)] # Notice tie method sub[, p_rank_dev := p_rank - mean(p_rank), by = .(size_grp, eom)] sub[, weight := p_rank_dev / (sum(abs(p_rank_dev)) / 2), by = .(size_grp, eom)] # 1 unit invested in each of the long and short pf cmp <- sub[, .( excntry = excntry, characteristic = x, n_stocks = 
.N, ret_weighted = sum(ret_exc_lead1m * weight), signal_weighted = sum(var * weight), sd_var = sd(var) ), by = .(size_grp, eom)] cmp <- cmp[sd_var != 0][, sd_var := NULL] cmp[, eom := ceiling_date(eom+1, unit = "month")-1] # Reflect the fact that returns are leaded return(cmp) }) %>% rbindlist() output$cmp[, excntry := str_to_upper(excntry)] } # Output return(output) } # Extract Neccesary Information -------------------- # Factor Details char_info <- readxl::read_xlsx("Factor Details.xlsx", sheet = "details", range = "A1:N300") %>% select("characteristic"=abr_jkp, direction) %>% filter(!is.na(characteristic)) %>% mutate(direction = direction %>% as.integer) %>% setDT() # Country Classification country_classification <- readxl::read_xlsx("Country Classification.xlsx", sheet = "countries", range = "A1:I200") %>% select(excntry, msci_development, region) %>% filter(!is.na(excntry) & !(excntry %in% settings$regional_pfs$country_excl)) %>% setDT() regions <- tibble( name = c("developed", "emerging", "frontier", "world", "world_ex_us"), country_codes = list( country_classification[msci_development == "developed" & excntry != "USA"]$excntry, country_classification[msci_development == "emerging"]$excntry, country_classification[msci_development == "frontier"]$excntry, country_classification$excntry, country_classification[excntry != "USA"]$excntry ), countries_min = c(rep(settings$regional_pfs$countries_min, 3), 1, 3) ) # Cluster Labels cluster_labels <- fread("Cluster Labels.csv") # NYSE Cutoff nyse_size_cutoffs <- fread(paste0(data_path, "/nyse_cutoffs.csv"), colClasses = c("eom"="character")) nyse_size_cutoffs[, eom := as.Date(eom, format = "%Y%m%d")] # CRSP Return Cutoffs ret_cutoffs <- fread(paste0(data_path, "/return_cutoffs.csv"), colClasses = c("eom"="character")) ret_cutoffs[, eom := as.Date(eom, format = "%Y%m%d")] ret_cutoffs[, eom_lag1 := floor_date(eom, unit = "month") - 1] # Because we use ret_exc_lead1m if (settings$daily_pf) { ret_cutoffs_daily <- 
fread(paste0(data_path, "/return_cutoffs_daily.csv")) } # Market market <- fread(paste0(data_path, "/market_returns.csv"), colClasses = c("eom"="character")) market[, eom := eom %>% as.Date("%Y%m%d")] if (settings$daily_pf) { market_daily <- fread(paste0(data_path, "/market_returns_daily.csv"), colClasses = c("date"="character")) market_daily[, date := date %>% as.Date("%Y%m%d")] } # Create Portfolios ----------------------- portfolio_data <- countries %>% lapply(function(ex) { print(paste0(ex, ": ", match(ex, countries), " out of ", length(countries))) portfolios( data_path = data_path, excntry = ex, chars = chars, source = settings$source, wins_ret = settings$wins_ret, pfs=settings$pfs, bps=settings$bps, bp_min_n=settings$bp_min_n, cmp = if_else(ex == "usa", settings$cmp$us, settings$cmp$int), signals=if_else(ex == "usa", settings$signals$us, settings$signals$int), signals_standardize=settings$signals$standardize, signals_w=settings$signals$weight, nyse_size_cutoffs = nyse_size_cutoffs, daily_pf = settings$daily_pf, ind_pf = settings$ind_pf, ret_cutoffs = ret_cutoffs, ret_cutoffs_daily = ret_cutoffs_daily ) }) # Daily Data if (settings$daily_pf) { # Daily Portfolio Returns pf_daily <- portfolio_data %>% lapply(function(x) x$pf_daily) %>% rbindlist() pf_daily %>% setorder(excntry, characteristic, pf, date) # Daily Long-Short Factors hml_daily <- pf_daily[, .( pfs = sum(pf == settings$pfs) + sum(pf == 1), n_stocks = n[pf==settings$pfs] + n[pf==1], n_stocks_min = as.integer(min(n[pf==settings$pfs], n[pf==1])), ret_ew = ret_ew[pf==settings$pfs] - ret_ew[pf==1], ret_vw = ret_vw[pf==settings$pfs] - ret_vw[pf==1], ret_vw_cap = ret_vw_cap[pf==settings$pfs] - ret_vw_cap[pf==1] ), .(excntry, characteristic, date)] hml_daily <- hml_daily[pfs == 2][, pfs := NULL] hml_daily %>% setorder(excntry, characteristic, date) lms_daily <- char_info[hml_daily, on = "characteristic"] resign_cols <- c("ret_ew", "ret_vw", "ret_vw_cap") lms_daily[, (resign_cols) := lapply(.SD, function(x) 
x*direction), .SDcols=resign_cols] } # Monthly Portfolio Returns pf_returns <- portfolio_data %>% lapply(function(x) x$pf_returns) %>% rbindlist() pf_returns <- pf_returns %>% select(excntry, characteristic, pf, eom, n, signal, ret_ew, ret_vw, ret_vw_cap) pf_returns %>% setorder(excntry, characteristic, pf, eom) # GICS Returns if (settings$ind_pf) { gics_returns <- portfolio_data %>% lapply(function(x) x$gics_returns) %>% rbindlist() gics_returns %>% setorder(excntry, gics, eom) ff49_returns <- portfolio_data[[which(countries == "usa")]]$ff49_returns ff49_returns %>% setorder(excntry, ff49, eom) } # Create HML Returns hml_returns <- pf_returns[, .( pfs = sum(pf == settings$pfs) + sum(pf == 1), signal = signal[pf==settings$pfs] - signal[pf==1], n_stocks = n[pf==settings$pfs] + n[pf==1], n_stocks_min = min(n[pf==settings$pfs], n[pf==1]), ret_ew = ret_ew[pf==settings$pfs] - ret_ew[pf==1], ret_vw = ret_vw[pf==settings$pfs] - ret_vw[pf==1], ret_vw_cap = ret_vw_cap[pf==settings$pfs] - ret_vw_cap[pf==1] ), .(excntry, characteristic, eom)] hml_returns <- hml_returns[pfs == 2][, pfs := NULL] hml_returns %>% setorder(excntry, characteristic, eom) # Create Long-Short Factors [Sign Returns to be consistent with original paper] lms_returns <- char_info[hml_returns, on = "characteristic"] resign_cols <- c("signal", "ret_ew", "ret_vw", "ret_vw_cap") lms_returns[, (resign_cols) := lapply(.SD, function(x) x*direction), .SDcols=resign_cols] # Extract Signals (TBD) # Extract CMP returns cmp_returns <- portfolio_data %>% lapply(function(x) x$cmp) %>% rbindlist() cmp_returns <- cmp_returns %>% select(excntry, characteristic, size_grp, eom, n_stocks, signal_weighted, ret_weighted) # Cluster portfolios --------------- cluster_pfs <- cluster_labels[lms_returns, on = .(characteristic)][, .( n_factors = .N, ret_ew = mean(ret_ew), ret_vw = mean(ret_vw), ret_vw_cap = mean(ret_vw_cap) ), by = .(excntry, cluster, eom)] if (settings$daily_pf) { cluster_pfs_daily <- cluster_labels[lms_daily, on = 
.(characteristic)][, .(
    n_factors = .N,
    ret_ew = mean(ret_ew),
    ret_vw = mean(ret_vw),
    ret_vw_cap = mean(ret_vw_cap)
  ), by = .(excntry, cluster, date)]
}
# Regional Portfolios ------------------------------------------------
# regional_data: aggregate country-level long-short factor (or cluster) returns into a
# regional portfolio.
#   data          - data.table of country-level returns (needs excntry, n_stocks_min,
#                   direction, ret_ew/ret_vw/ret_vw_cap, and the date/char columns)
#   mkt           - market returns per country (needs excntry, the date column, mkt_vw_exc,
#                   and the weighting inputs me_lag1 / stocks)
#   date_col      - name of the date column ("eom" for monthly, "date" for daily)
#   char_col      - name of the factor identifier column ("characteristic" or "cluster")
#   countries     - character vector of country codes included in the region
#   weighting     - "market_cap", "stocks" or "ew" country weighting
#   countries_min - minimum number of countries per (factor, date) observation
#   periods_min   - minimum number of periods a factor needs to be kept
#   stocks_min    - minimum n_stocks_min per country-level observation
# Returns a data.table keyed by (char_col, date_col) with country-weighted returns.
regional_data <- function(data, mkt, date_col, char_col, countries, weighting, countries_min, periods_min, stocks_min) {
  # Determine Country Weights
  weights <- mkt[, .(excntry, get(date_col), mkt_vw_exc, "country_weight" = case_when(
    weighting == "market_cap" ~ me_lag1,
    weighting == "stocks" ~ as.double(stocks),
    weighting == "ew" ~ 1)
  )]
  # get(date_col) comes out as an unnamed column "V2"; give it the generic name used below.
  # NOTE(review): setnames/setorder modify by reference, so piping without reassignment still mutates.
  weights %>% setnames(old="V2", new="date_col")
  # Portfolio Return
  pf <- data[excntry %in% countries & n_stocks_min >= stocks_min]
  pf %>% setnames(old=c(date_col, char_col), new = c("date_col", "char_col"))
  pf <- weights[pf, on = .(excntry, date_col)]
  # Country-weighted averages across countries, per factor and date
  pf <- pf[!is.na(mkt_vw_exc), .(
    n_countries = .N,
    direction = unique(direction),
    ret_ew = sum(ret_ew*country_weight) / sum(country_weight),
    ret_vw = sum(ret_vw*country_weight) / sum(country_weight),
    ret_vw_cap = sum(ret_vw_cap*country_weight) / sum(country_weight),
    mkt_vw_exc = sum(mkt_vw_exc * country_weight) / sum(country_weight)
  ), by = .(char_col, date_col)]
  # Minimum Requirement: Countries
  pf <- pf[n_countries >= countries_min]
  # Minimum Requirement: Months
  pf[, periods := .N, by = .(char_col)]
  pf <- pf[periods >= periods_min][, periods := NULL]
  pf %>% setorder(char_col, date_col)
  # Restore the caller's original column names before returning
  pf %>% setnames(old = c("date_col", "char_col"), new = c(date_col, char_col))
  return(pf)
}
# Regional Factors
# Build one regional long-short factor dataset per row of `regions`
regional_pfs <- 1:nrow(regions) %>% lapply(function(i) {
  info <- regions[i, ]
  reg_pf <- lms_returns %>%
    regional_data(mkt=market, countries = unlist(info$country_codes), date_col = "eom",
                  char_col = "characteristic", weighting = settings$regional_pfs$country_weights,
                  countries_min = info$countries_min,
                  periods_min = settings$regional_pfs$months_min,
                  stocks_min = settings$regional_pfs$stocks_min)
  reg_pf %>%
    mutate(region = info$name) %>%
    select(region, characteristic,
direction, eom, n_countries, ret_ew, ret_vw, ret_vw_cap, mkt_vw_exc) }) %>% rbindlist() if (settings$daily_pf) { regional_pfs_daily <- 1:nrow(regions) %>% lapply(function(i) { info <- regions[i, ] reg_pf <- lms_daily %>% regional_data(mkt=market_daily, countries = unlist(info$country_codes), date_col = "date", char_col = "characteristic", weighting = settings$regional_pfs$country_weights, countries_min = info$countries_min, periods_min = settings$regional_pfs$months_min*21, stocks_min = settings$regional_pfs$stocks_min) reg_pf %>% mutate(region = info$name) %>% select(region, characteristic, direction, date, n_countries, ret_ew, ret_vw, ret_vw_cap, mkt_vw_exc) }) %>% rbindlist() } # Regional Cluster Portfolios regional_clusters <- 1:nrow(regions) %>% lapply(function(i) { info <- regions[i, ] reg_pf <- cluster_pfs %>% rename("n_stocks_min"=n_factors) %>% mutate(direction = NA_real_) %>% # Hack to make the function applicable regional_data(mkt=market, countries = unlist(info$country_codes), date_col = "eom", char_col = "cluster", weighting = settings$regional_pfs$country_weights, countries_min = info$countries_min, periods_min = settings$regional_pfs$months_min, stocks_min = 1) reg_pf %>% mutate(region = info$name) %>% select(region, cluster, eom, n_countries, ret_ew, ret_vw, ret_vw_cap, mkt_vw_exc) }) %>% rbindlist() if (settings$daily_pf) { regional_clusters_daily <- 1:nrow(regions) %>% lapply(function(i) { info <- regions[i, ] reg_pf <- cluster_pfs_daily %>% rename("n_stocks_min"=n_factors) %>% mutate(direction = NA_real_) %>% # Hack to make the function applicable regional_data(mkt=market_daily, countries = unlist(info$country_codes), date_col = "date", char_col = "cluster", weighting = settings$regional_pfs$country_weights, countries_min = info$countries_min, periods_min = settings$regional_pfs$months_min*21, stocks_min = 1) reg_pf %>% mutate(region = info$name) %>% select(region, cluster, date, n_countries, ret_ew, ret_vw, ret_vw_cap, mkt_vw_exc) }) %>% 
rbindlist() } # Save ---------------- if(!is.null(legacy_path)) { # Save Time Stamped Files folder <- paste0(legacy_path, "/Past Portfolios/", Sys.Date()) dir.create(folder) settings %>% saveRDS(file = paste0(folder, "/settings.RDS")) market[eom <= settings$end_date] %>% fwrite(file = paste0(folder, "/market_returns.csv")) market_daily[date <= settings$end_date] %>% fwrite(file = paste0(folder, "/market_returns_daily.csv")) hml_returns[eom <= settings$end_date] %>% fwrite(file = paste0(folder, "/hml.csv")) cmp_returns[eom <= settings$end_date] %>% fwrite(file = paste0(folder, "/cmp.csv")) if (settings$daily_pf) { lms_daily[date <= settings$end_date] %>% fwrite(file = paste0(folder, "/lms_daily.csv")) } if (settings$ind_pf) { gics_returns[eom <= settings$end_date] %>% fwrite(file = paste0(folder, "/industry_gics.csv")) if (nrow(ff49_returns) != 0) { ff49_returns[eom <= settings$end_date] %>% fwrite(file = paste0(folder, "/industry_ff49.csv")) } } } # Save Most Recent Files market[eom <= settings$end_date] %>% fwrite(file = paste0(output_path, "/market_returns.csv")) pf_returns[eom <= settings$end_date] %>% fwrite(file = paste0(output_path, "/pfs.csv")) hml_returns[eom <= settings$end_date] %>% fwrite(file = paste0(output_path, "/hml.csv")) lms_returns[eom <= settings$end_date] %>% fwrite(file = paste0(output_path, "/lms.csv")) cmp_returns[eom <= settings$end_date] %>% fwrite(file = paste0(output_path, "/cmp.csv")) cluster_pfs[eom <= settings$end_date] %>% fwrite(file = paste0(output_path, "/clusters.csv")) if (settings$daily_pf) { market_daily[date <= settings$end_date] %>% fwrite(file = paste0(output_path, "/market_returns_daily.csv")) pf_daily[date <= settings$end_date] %>% fwrite(file = paste0(output_path, "/pfs_daily.csv")) hml_daily[date <= settings$end_date] %>% fwrite(file = paste0(output_path, "/hml_daily.csv")) lms_daily[date <= settings$end_date] %>% fwrite(file = paste0(output_path, "/lms_daily.csv")) cluster_pfs_daily[date <= settings$end_date] %>% 
fwrite(file = paste0(output_path, "/clusters_daily.csv")) } if (settings$ind_pf) { gics_returns[eom <= settings$end_date] %>% fwrite(file = paste0(output_path, "/industry_gics.csv")) if (nrow(ff49_returns) != 0) { ff49_returns[eom <= settings$end_date] %>% fwrite(file = paste0(output_path, "/industry_ff49.csv")) } } # Regional Factors reg_folder <- paste0(output_path, "/Regional Factors") if (!dir.exists(reg_folder)) { dir.create(reg_folder) } for (reg in unique(regional_pfs$region)) { regional_pfs[eom <= settings$end_date & region %in% reg] %>% fwrite(file = paste0(reg_folder, "/", str_to_sentence(reg), ".csv")) } if (settings$daily_pf) { reg_folder_daily <- paste0(output_path, "/Regional Factors Daily") if (!dir.exists(reg_folder_daily)) { dir.create(reg_folder_daily) } for (reg in unique(regional_pfs_daily$region)) { regional_pfs_daily[date <= settings$end_date & region %in% reg] %>% fwrite(file = paste0(reg_folder_daily, "/", str_to_sentence(reg), ".csv")) } } # Regional Clusters reg_folder <- paste0(output_path, "/Regional Clusters") if (!dir.exists(reg_folder)) { dir.create(reg_folder) } for (reg in unique(regional_clusters$region)) { regional_clusters[eom <= settings$end_date & region %in% reg] %>% fwrite(file = paste0(reg_folder, "/", str_to_sentence(reg), ".csv")) } if (settings$daily_pf) { reg_folder_daily <- paste0(output_path, "/Regional Clusters Daily") if (!dir.exists(reg_folder_daily)) { dir.create(reg_folder_daily) } for (reg in unique(regional_clusters_daily$region)) { regional_clusters_daily[date <= settings$end_date & region %in% reg] %>% fwrite(file = paste0(reg_folder_daily, "/", str_to_sentence(reg), ".csv")) } } # Save Long/Short Factors by Country cnt_folder <- paste0(output_path, "/Country Factors") if (!dir.exists(cnt_folder)) { dir.create(cnt_folder) } for (exc in unique(lms_returns$excntry)) { lms_returns[eom <= settings$end_date & excntry==exc] %>% fwrite(file = paste0(cnt_folder, "/", exc, ".csv")) } if (settings$daily_pf) { 
cnt_folder_daily <- paste0(output_path, "/Country Factors Daily") if (!dir.exists(cnt_folder_daily)) { dir.create(cnt_folder_daily) } for (exc in unique(lms_daily$excntry)) { lms_daily[date <= settings$end_date & excntry==exc] %>% fwrite(file = paste0(cnt_folder_daily, "/", exc, ".csv")) } } # Save supplementary information nyse_size_cutoffs %>% fwrite(file = paste0(output_path, "/nyse_cutoffs.csv")) ret_cutoffs %>% fwrite(file = paste0(output_path, "/return_cutoffs.csv")) if (settings$daily_pf) { ret_cutoffs_daily %>% fwrite(file = paste0(output_path, "/return_cutoffs_daily.csv")) } ================================================ FILE: GlobalFactors/project_macros.sas ================================================ ************************************************************* * HELPER MACROS ************************************************************ ; * Winsorize_own: Flexible version of WRDS %winsorize macro Only difference currently, is the explicit specification of perc_low and perc_high. This allows for winsorizing in each end and also to specify winsorization below 1% ; %macro winsorize_own(inset=, outset=, sortvar=, vars=, perc_low=1, perc_high=99, trim=0); /* List of all variables */ %let vars = %sysfunc(compbl(&vars)); %let nvars = %nwords(&vars); /* Display Output */ %put ### START.; /* Trimming / Winsorization Options */ %if &trim=0 %then %put ### Winsorization; %else %put ### Trimming; %put ### Number of Variables: &nvars; %put ### List of Variables: &vars; options nonotes; /* Ranking within &sortvar levels */ %put ### Sorting... ; proc sort data=&inset; by &sortvar; run; /* 2-tail winsorization/trimming */ %let var2 = %sysfunc(tranwrd(&vars,%str( ),%str(__ )))__; %let var_p1 = %sysfunc(tranwrd(&vars,%str( ),%str(__&perc_low )))__&perc_low ; %let var_p2 = %sysfunc(tranwrd(&vars,%str( ),%str(__&perc_high )))__&perc_high ; /* Theis: Handle naming if winsorization < 1%. 
In this case, digits cause problems */ %let var_p1 = %sysfunc(prxchange(s/\./_/, -1, &var_p1.)); /* Replace . with _ */ %let var_p2 = %sysfunc(prxchange(s/\./_/, -1, &var_p2.)); /* Replace . with _ */ /* Calculate upper and lower percentiles */ proc univariate data=&inset noprint; by &sortvar; var &vars; output out=_perc pctlpts=&perc_low &perc_high pctlpre=&var2; run; %if &trim=1 %then %let condition = %str(if myvars(i)>=perct2(i) or myvars(i)<=perct1(i) then myvars(i)=. ); %else %let condition = %str(myvars(i)=min(perct2(i),max(perct1(i),myvars(i))) ); %if &trim=0 %then %put ### Winsorizing at &perc_low.% and &perc_high.%... ; %else %put ### Trimming at &perc_low.% and &perc_high.%... ; /* Save output with trimmed/winsorized variables */ data &outset; merge &inset (in=a) _perc; by &sortvar; if a; array myvars {&nvars} &vars; array perct1 {&nvars} &var_p1; array perct2 {&nvars} &var_p2; do i = 1 to &nvars; if not missing(myvars(i)) then do; &condition; end; end; drop i &var_p1 &var_p2; run; /* House Cleaning */ proc sql; drop table _perc; quit; options notes; %put ### DONE . ; %put ; %mend winsorize_own; /* MACRO: RETURN CUTOFFS The output of the macro is the 0.1%, 1%, 99% and 99.9% percentile of excess returns which can later be used for winsorization The reason why this procedure is necesary is that we output the country country-by-country, which makes across country winsorization difficult. 
*/
/* Compute per-period tail percentiles (0.1 / 1 / 99 / 99.9) of ret, ret_local and ret_exc,
   used later to winsorize returns across countries.
     data=      input security-month/day dataset
     freq=      m (monthly, by eom) or d (daily, by year/month)
     out=       output dataset with one row per period and the percentile columns
     crsp_only= 1 restricts the sample to CRSP observations (source_crsp=1) */
%macro return_cutoffs(data=, freq=, out=, crsp_only=);
    /* Resolve the date variable and by-group for the chosen frequency */
    %if &freq.=m %then %do; %let date_var = eom; %let by_vars = eom; %end;
    %if &freq.=d %then %do; %let date_var = date; %let by_vars = year month; %end;
    /* Base sample: main, primary, common securities with non-missing excess return;
       ZWE is excluded (data issues) */
    %if &crsp_only.=1 %then %do;
        proc sort data=&data.(where=(source_crsp=1 and common=1 and obs_main=1 and exch_main=1 and primary_sec=1 and excntry ^= 'ZWE' and not missing(ret_exc))) out=base;
            by &date_var.;
        run;
    %end;
    %if &crsp_only.=0 %then %do;
        proc sort data=&data.(where=(common=1 and obs_main=1 and exch_main=1 and primary_sec=1 and excntry ^= 'ZWE' and not missing(ret_exc))) out=base;
            by &date_var.;
        run;
    %end;
    /* Daily data is grouped by calendar month */
    %if &freq.=d %then %do;
        data base; set base; year=year(date); month=month(date); run;
    %end;
    /* For each return type, compute percentiles per period; the first type initializes
       &out., subsequent types are merged on as extra columns */
    %let ret_types = ret ret_local ret_exc;
    %do i=1 %to %sysfunc(countw(&ret_types.));
        %let ret_type = %scan(&ret_types., &i.);
        proc univariate data=base noprint;
            by &by_vars.;
            var &ret_type.;
            output out=cutoffs n=n pctlpts=0.1 1 99 99.9 pctlpre=&ret_type._;
        run;
        %if &i.=1 %then %do;
            data &out.; set cutoffs; run;
        %end;
        %else %do;
            %if &freq.=m %then %do;
                proc sql;
                    create table &out. as
                    select a.*, b.&ret_type._0_1, b.&ret_type._1, b.&ret_type._99, b.&ret_type._99_9
                    from &out. as a left join cutoffs as b on a.eom=b.eom;
                quit;
            %end;
            %if &freq.=d %then %do;
                proc sql;
                    create table &out. as
                    select a.*, b.&ret_type._0_1, b.&ret_type._1, b.&ret_type._99, b.&ret_type._99_9
                    from &out.
as a left join cutoffs as b on a.year=b.year and a.month=b.month; quit; %end; %end; %end; proc delete data= cutoffs base; run; %mend; /* MACRO: NYSE SIZE CUTOFFS Used for determining size groups and me cap weights */ %macro nyse_size_cutoffs(data=, out=); proc sort data=&data.(where=(crsp_exchcd=1 and obs_main = 1 and exch_main = 1 and primary_sec = 1 and common = 1 and not missing(me))) out=nyse_stocks; by eom; run; proc means data=nyse_stocks noprint; by eom; var me; output out=&out.(drop=_type_ _freq_) N=n P1=nyse_p1 p20 = nyse_p20 P50=nyse_p50 p80 = nyse_p80; run; proc delete data=nyse_stocks; run; %mend; /* Flexible version of WRDS populate function which can also do daily frequency */ %macro populate_own(inset=, outset=, datevar=, idvar=, datename=, forward_max=, period=); /* Period in ('day', 'month') */ /* Start Macro*/ %put ; %put ### START. Populating Data --> Note that duplicate idvar and datevar will be removed; /* nodupkey sort necessary */ proc sort data=&inset. out=__temp nodupkey; by &idvar descending &datevar.; run; options nonotes; /* Populate Dates */ /* FORWARD_MAX is the Regular Periodicity or the Forward Population Intervals */ %let nid = %nwords(&idvar.); %let id2 = %scan(&idvar.,&nid.,%str( )); data &outset. ; format &datename. YYMMDDN8.; set __temp; by &idvar.; &datename. =&datevar.; output; following = lag(&datename.); forward_max = intnx('month', &datevar., &forward_max.,'e'); if first.&id2 then following = .; n = intck(&period.,&datename., min(following, forward_max)); do i=1 to n-1; &datename. = intnx(&period.,&datename. ,1,"E"); output; end; drop following forward_max n i; run; proc sort data=&outset. nodupkey; by &idvar. &datevar. &datename.; run; /* House Cleaning */ proc sql; drop table __temp; quit; options notes; %put ### DONE . Dataset &OUTSET. with &period. Frequency Generated ; %put ; %MEND populate_own; /* Generic expand in a data set with a start date column and an end date column. 
*/ %macro expand(data=, out=, id_vars=, start_date=, end_date=, freq='day', new_date_name=date); /*freq in ('day', 'month')*/ data __expanded; set &data.; format &new_date_name. YYMMDDN8.; do i = 0 to intck(&freq., &start_date., &end_date.); &new_date_name. = intnx(&freq., &start_date., i, 'e'); output; end; drop &start_date. &end_date. i; run; proc sort data=__expanded out=&out nodupkey; by &id_vars. &new_date_name.; run; proc delete data=__expanded; run; %mend expand; /* USD to Foreign FX Conversion Rate from Compustat*/ %macro compustat_fx(out=); data usd_curcdd; curcdd='USD'; datadate=input(put(19500101,8.),yymmdd8.); fx=1; format datadate yymmddn8.; run; /* comp.exrt_dly only starts in 1982 and since we convert to USD we know that the fx for USD is 1 */ proc sql; create table __fx1 as select distinct a.tocurd as curcdd , a.datadate, b.exratd/a.exratd as fx /*fx is quoted as x/USD so to go from x to USD do x*fx*/ from comp.exrt_dly a , comp.exrt_dly b where a.fromcurd = 'GBP' and b.tocurd = 'USD' /*b.exratd is always from GBP to USD, a.exratd is from GBP to currency X*/ and a.fromcurd = b.fromcurd and a.datadate = b.datadate; quit; data __fx2; set __fx1 usd_curcdd; run; proc sort data = __fx2; by curcdd descending datadate; run ; /* Carry forward fx observations in case gaps*/ data __fx3; format date YYMMDDN8.; set __fx2; by curcdd; date = datadate; output; following = lag(date); if first.curcdd then following = date+1; n = following-date; do i=1 to n-1; date = date+1; output; end; drop datadate following n i; run; proc sort data=__fx3 out=&out nodupkey; by curcdd date; run; proc delete data=usd_curcdd __fx1 __fx2 __fx3; run; %mend compustat_fx; ********************************************************************************************************************** * MACRO - Add Primary Security from Compustat * ********************************************************************************************************************** The macro expects a data set with 
gvkey and iid. It then returns the same dataset with a column "primary_sec" which is 1 when a security is the primary
security in the US, Canada or internationally. Importantly, a given company (gvkey) can potentially have up to 3
primary securities although it happens very rarely. In the full Compustat monthly dataset, of all gvkey-eom pairs we have
0 primary securities: 1.82%
1 primary securities: 95.10%
2 primary securities: 3.08%
3 primary securities: 0.01%
;
%macro add_primary_sec(data=, out=, date_var=);
  /* Pull the time-varying primary-issue indicators (ROW from global, USA/CAN
     from North America security history) */
  proc sql;
    create table __prihistrow as
    select gvkey, itemvalue as prihistrow, effdate, thrudate
    from comp.g_sec_history where item = 'PRIHISTROW';
  quit;
  proc sql;
    create table __prihistusa as
    select gvkey, itemvalue as prihistusa, effdate, thrudate
    from comp.sec_history where item = 'PRIHISTUSA';
  quit;
  proc sql;
    create table __prihistcan as
    select gvkey, itemvalue as prihistcan, effdate, thrudate
    from comp.sec_history where item = 'PRIHISTCAN';
  quit;
  /* Header (current) primary-issue indicators as fallback */
  proc sql;
    create table __header as
    select gvkey, prirow, priusa, prican from comp.company
    outer union corr
    select gvkey, prirow, priusa, prican from comp.g_company;
  quit;
  proc sort data=__header nodupkey; by gvkey; run; /* Only one duplicate (gvkey=254381)*/
  proc sql;
    create table __data1 as
    select distinct a.*,
      coalesce(b.prihistrow, e.prirow) as prihistrow,
      coalesce(c.prihistusa, e.priusa) as prihistusa,
      coalesce(d.prihistcan, e.prican) as prihistcan
      /* Prefer historical primary sec indicator, if this is not available, use header.
         Distinct avoids around 400 dup observations due to gvkey=213098 having overlapping prihistrow in g_sec_history */
    from &data. as a
    left join __prihistrow as b
      on a.gvkey=b.gvkey and a.&date_var.>=b.effdate and (a.&date_var.<=b.thrudate or missing(b.thrudate))
    left join __prihistusa as c
      on a.gvkey=c.gvkey and a.&date_var.>=c.effdate and (a.&date_var.<=c.thrudate or missing(c.thrudate))
    left join __prihistcan as d
      on a.gvkey=d.gvkey and a.&date_var.>=d.effdate and (a.&date_var.<=d.thrudate or missing(d.thrudate))
    left join __header as e
      on a.gvkey=e.gvkey;
    create table __data2 as
    select *, (not missing(iid) and (iid=prihistrow or iid=prihistusa or iid=prihistcan)) as primary_sec
      /* If the security id is identified as primary in either USA, CANADA or Rest of the World it is deemed a primary security */
    from __data1;
  quit;
  data &out.;
    set __data2(drop = prihistrow prihistusa prihistcan);
    /* note: no run; here - the following PROC DELETE implicitly ends this DATA step */
  proc delete data=__prihistrow __prihistusa __prihistcan __header __data1 __data2; run;
%mend add_primary_sec;

/* MACRO - COMPUSTAT EXCHANGES */
* This macro returns a dataset with classifications of individual exchanges in terms of country
  and an indicator of whether it's a normal or special exchange ;
%macro comp_exchanges(out=);
  /* Exchange Classification */
  /* What about 153? . Could consider excluding 111 since its the same securities traded on 110.
     This is just due to a rule in Thailand limiting the ownership of foreign investors in Thailand https://thaishares.com/nvdr/*/
  %let special_exchanges = (
    0, 1, 2, 3, 4, 15, 16, 17, 18, 21, /* US exchanges not in NYSE, Amex and NASDAQ */
    13, 19, 20, 127,
    150, /* AIAF Mercado De Renta Fija --> Spanish exchange for trading debt securities */
    157, 229, 263, 269, 281, 283, 290, 320, 326, 341, 342, 347, 348,
    349, /* BATS Chi-X Europe --> Trades stocks from various european exchanges. Should we keep it?*/
    352) /* CHI-X Australia --> Only Trades securities listed on ASX (exchg=106). Should we keep it?*/
  ;
  /* Determine Country of Exchange (Note that we assume that this is constant through time) */
  proc sql;
    create table __ex_country1 as
    select distinct exchg, excntry from comp.g_security
    outer union corr
    select distinct exchg, excntry from comp.security;
    /* Exchanges listing securities from more than one country are tagged 'multi_national' */
    create table __ex_country2 as
    select distinct exchg,
      case when count(excntry)>1 then 'multi_national' /*, calculated count > 1 as multi_national*/ else excntry end as excntry
    from __ex_country1
    where not missing(excntry) and not missing(exchg)
    group by exchg;
    create table __ex_country3 as
    select a.*, b.exchgdesc
    from __ex_country2 as a left join comp.r_ex_codes as b on a.exchg=b.exchgcd;
    create table &out. as
    select *, (excntry ^= 'multi_national' and exchg not in &special_exchanges.) as exch_main
    from __ex_country3;
  quit;
  proc delete data= __ex_country1 __ex_country2 __ex_country3; run;
%mend comp_exchanges;

**********************************************************************************************************************
* US - Data From CRSP
*********************************************************************************************************************
;
%macro prepare_crsp_sf(freq=m);
  /* freq in ('d', 'm') for daily and monthly. Returns crsp_msf if p=m and crsp_dsf if p=d*/
  /* CRSP with Company Information*/
  proc sql;
    create table __crsp_sf1 as
    select a.permno, a.permco, a.date,
      (a.prc < 0) as bidask,            /* negative prc marks a bid-ask average */
      abs(a.prc) as prc,
      a.shrout/1000 as shrout,          /* shares in millions */
      calculated prc * calculated shrout as me,
      a.ret, a.retx, a.cfacshr, a.vol,
      case when a.prc > 0 and a.askhi > 0 then a.askhi else . end as prc_high, /* Highest price when prc is not the bid-ask average (see WRDS CRSP dsf askhi documentation) */
      case when a.prc > 0 and a.bidlo > 0 then a.bidlo else . end as prc_low, /* Lowest price when prc is not the bid-ask average (see WRDS CRSP dsf bidlo documentation) */
      b.shrcd, b.exchcd,
      c.gvkey, c.liid as iid, /*http://www.crsp.org/products/documentation/crspccmlink-security-link-history*/
      b.exchcd in (1, 2, 3) as exch_main
    from crsp.&freq.sf as a
    left join crsp.&freq.senames as b
      on a.permno=b.permno and a.date>=namedt and a.date<=b.nameendt
    left join crsp.ccmxpf_lnkhist as c
      on a.permno=c.lpermno and (a.date>=c.linkdt or missing(c.linkdt)) and (a.date<=c.linkenddt or missing(c.linkenddt)) and c.linktype in ('LC', 'LU', 'LS');
  quit;
  /* Adjust trading volume following Gao and Ritter (2010)*/
  /* NOTE(review): the last boundary uses '<' so 31DEC2003 itself gets no 1.6 divisor,
     while the 2001 boundary uses '<='; Gao and Ritter apply the adjustment through
     year-end 2003 -- confirm whether '<=' was intended */
  proc sql;
    update __crsp_sf1
    set vol = case when date < '01FEB2001'd then vol / 2
                   when date <= '31DEC2001'd then vol / 1.8
                   when date < '31DEC2003'd then vol / 1.6
                   else vol end
    where exchcd = 3;
  quit;
  /* Add dividend and dollar volume */
  proc sort data=__crsp_sf1; by permno date; run;
  data __crsp_sf2;
    set __crsp_sf1;
    by permno;
    dolvol = abs(prc) * vol;
    div_tot = (ret-retx)*lag(prc)*(cfacshr/lag(cfacshr)); /* The CFACSHR part is to put it on the per share basis of the current date */
    if first.permno then div_tot=.;
  run;
  /* Incorporate Delisting Returns */
  %if &freq.=d %then %do;
    proc sql;
      create table __crsp_sf3 as
      select a.*, b.dlret, b.dlstcd
      from __crsp_sf2 as a left join crsp.&freq.sedelist as b
      on a.permno=b.permno and a.date=b.dlstdt;
    quit;
  %end;
  %if &freq.=m %then %do;
    proc sql;
      create table __crsp_sf3 as
      select a.*, b.dlret, b.dlstcd
      from __crsp_sf2 as a left join crsp.&freq.sedelist as b
      on a.permno=b.permno and year(a.date)=year(b.dlstdt) and month(a.date)=month(b.dlstdt);
    quit;
  %end;
  data __crsp_sf4;
    set __crsp_sf3;
    if missing(dlret) and (dlstcd=500 or (520<=dlstcd<=584)) then dlret=-0.3; /*If delisting is missing and is for performance related reasons. Set to -30%. This is relevant to 155 observations only*/
    if missing(ret) and not missing(dlret) then ret=0;
    ret= (1+ret)*(1+coalesce(dlret, 0))-1; /*If missing set to zero*/
    drop dlret dlstcd;
  run;
  * Excess Return;
  %if &freq.=d %then %let scale=21; /* monthly risk-free rate spread over ~21 trading days */
  %if &freq.=m %then %let scale=1;
  proc sql;
    create table __crsp_sf5 as
    select a.*, a.ret-coalesce(b.t30ret, c.rf)/&scale. as ret_exc /* I prefer crsp.mcti but FF has monthly updates */
    from __crsp_sf4 as a
    left join crsp.mcti as b on year(a.date)=year(b.caldt) and month(a.date)=month(b.caldt)
    left join ff.factors_monthly as c on year(a.date)=year(c.date) and month(a.date)=month(c.date);
  quit;
  * Company Market Equity;
  proc sql;
    create table __crsp_sf6 as
    select *, sum(me) as me_company
    from __crsp_sf5
    group by permco, date;
  quit;
  * Make volume comparable across daily and monthly set (see WRDS CRSP vol documentation);
  %if &freq.=m %then %do;
    proc sql;
      update __crsp_sf6 set vol = vol*100, dolvol = dolvol*100;
    quit;
  %end;
  proc sort nodupkey data=__crsp_sf6; by permno date; run; /*In monthly file: Two duplicates 15075-20180131 and 86812-20190731 In daily file 13 obs*/
  data crsp_&freq.sf;
    set __crsp_sf6;
  proc delete data= __crsp_sf1 __crsp_sf2 __crsp_sf3 __crsp_sf4 __crsp_sf5 __crsp_sf6;
%mend prepare_crsp_sf;

**********************************************************************************************************************
* World - Data From Compustat
*********************************************************************************************************************
;
%macro prepare_comp_sf(freq=);
  /* freq in (d, m, both) */
  /* SECD has a lot of missing CSHOC. Therefore we use information from Accounting Statements. This is not a problem for g_secd*/
  %let comp_cond = indfmt='INDL' and datafmt='STD' and popsrc='D' and consol='C';
  proc sql;
    create table __firm_shares1 as
    select gvkey, datadate, cshoq as csho_fund, ajexq as ajex_fund
    from comp.fundq where &comp_cond. and not missing(cshoq) and not missing(ajexq)
    outer union corr
    select gvkey, datadate, csho as csho_fund, ajex as ajex_fund
    from comp.funda where &comp_cond. and not missing(csho) and not missing(ajex);
  quit;
  /* Carry fundamentals-based share counts forward daily for up to 12 months */
  %populate_own(inset=__firm_shares1, outset=__firm_shares2, datevar=datadate, idvar=gvkey, datename=ddate, forward_max=12, period = 'day');
  proc sql;
    create table __comp_dsf_na as
    select a.gvkey, a.iid, a.datadate, a.tpci, a.exchg, a.prcstd, a.curcdd,
      a.prccd as prc_local, a.ajexdi,
      case when a.prcstd^=5 then a.prchd else . end as prc_high_lcl, /* Highest price when prc is not the bid-ask average (see WRDS Compustat secd prchd documentation) */
      case when a.prcstd^=5 then a.prcld else . end as prc_low_lcl, /* Lowest price when prc is not the bid-ask average (see WRDS Compustat data guide) */
      cshtrd,
      coalesce(a.cshoc/1e6, b.csho_fund*b.ajex_fund/a.ajexdi) as cshoc, /* Prefer cshoc but if missing choose shares outstanding from accounting statement adjusted for issuance activity*/
      (a.prccd/a.ajexdi*a.trfd) as ri_local,
      a.curcddv, a.div, a.divd, a.divsp /* Dividend Variables */
    from comp.secd as a
    left join __firm_shares2 as b on a.gvkey=b.gvkey and a.datadate=b.ddate;
    /* Adjust trading volume of NASDAQ stocks following Gao and Ritter (2010)*/
    /* NOTE(review): as in prepare_crsp_sf, '<' excludes 31DEC2003 itself -- confirm intended */
    update __comp_dsf_na
    set cshtrd = case when datadate < '01FEB2001'd then cshtrd / 2
                      when datadate <= '31DEC2001'd then cshtrd / 1.8
                      when datadate < '31DEC2003'd then cshtrd / 1.6
                      else cshtrd end
    where exchg = 14;
    create table __comp_dsf_global as
    select gvkey, iid, datadate, tpci, exchg, prcstd, curcdd,
      prccd/qunit as prc_local, ajexdi, cshoc/1e6 as cshoc,
      case when prcstd^=5 then prchd/qunit else . end as prc_high_lcl, /* Highest price when prc is not the bid-ask average (see WRDS Compustat secd prchd documentation) */
      case when prcstd^=5 then prcld/qunit else . end as prc_low_lcl, /* Lowest price when prc is not the bid-ask average (see WRDS Compustat data guide) */
      cshtrd,
      ((calculated prc_local)/ajexdi*trfd) as ri_local,
      curcddv, div, divd, divsp
    from comp.g_secd;
    create table __comp_dsf1 as
    select * from __comp_dsf_na
    outer union corr
    select * from __comp_dsf_global;
  quit;
  /* Add FX */
  %compustat_fx(out=fx);
  proc sql;
    create table __comp_dsf2 as
    select a.*, b.fx as fx, c.fx as fx_div
    from __comp_dsf1 as a
    left join fx as b on a.curcdd=b.curcdd and a.datadate=b.date
    left join fx as c on a.curcddv=c.curcdd and a.datadate=c.date;
  quit;
  data __comp_dsf3;
    set __comp_dsf2;
    /* Price Adjustment */
    prc = prc_local*fx;
    prc_high = prc_high_lcl*fx;
    prc_low = prc_low_lcl*fx;
    me = prc*cshoc;
    dolvol = cshtrd*prc;
    ri = ri_local*fx;
    /* Dividend Adjustment (set to zero if missing)*/
    div_tot = coalesce(div, 0)*fx_div;
    div_cash = coalesce(divd, 0)*fx_div;
    div_spc = coalesce(divsp, 0)*fx_div;
    drop div divd divsp fx_div curcddv prc_high_lcl prc_low_lcl;
  run;
  * Create Daily, Monthly or Both Datasets;
  %if &freq. = m or &freq. = d %then %let iter_max = 1;
  %if &freq. = both %then %let iter_max = 2;
  %do iter=1 %to &iter_max.;
    %if &freq. = m or &freq. = d %then %let freq_use = &freq.; /* Necessary because of the case where both daily AND monthly datasets are created */
    %if &freq. = both and &iter. = 1 %then %let freq_use = d;
    %if &freq. = both and &iter. = 2 %then %let freq_use = m;
    %if &freq_use.=m %then %do;
      /* Aggregate the daily file to monthly; split-adjust within month before
         summing/extremizing, then re-scale with month-end ajexdi */
      proc sql;
        create table __comp_msf1 as
        select *, intnx('month', datadate,0,'E') as eom format=YYMMDDN8.,
          max(max(prc_high/ajexdi), max(prc/ajexdi))*ajexdi as prc_highm,
          min(min(prc_low/ajexdi), min(prc/ajexdi)) * ajexdi as prc_lowm,
          sum(div_tot/ajexdi)*ajexdi as div_totm,
          sum(div_cash/ajexdi)*ajexdi as div_cashm,
          sum(div_spc/ajexdi)*ajexdi as div_spcm,
          sum(cshtrd/ajexdi)*ajexdi as cshtrm,
          sum(dolvol) as dolvolm
        from __comp_dsf3
        group by gvkey, iid, calculated eom;
        create table __comp_msf2 as
        select * from __comp_msf1(drop= cshtrd div_tot div_cash div_spc dolvol prc_high prc_low)
        where not missing(prc_local) and not missing(curcdd) and prcstd in (3, 4, 10) /* This is rather important when choosing last.eom later. Otherwise we sometimes have an end of month observation which is a dividend distribution rather than a trading day */
        order by gvkey, iid, eom, datadate;
      quit;
      data __comp_msf2;
        set __comp_msf2;
        rename div_totm=div_tot div_cashm=div_cash div_spcm=div_spc dolvolm=dolvol prc_highm=prc_high prc_lowm=prc_low;
      run;
      /* Choose Last observation in Month */
      data __comp_msf3;
        set __comp_msf2;
        by gvkey iid eom;
        if last.eom;
      run;
      /* Add Information from SECM */
      proc sql;
        create table __comp_secm1 as
        select a.gvkey, a.iid, a.datadate, intnx('month', a.datadate,0,'E') as eom format=YYMMDDN8.,
          a.tpci, a.exchg, a.curcdm as curcdd, a.prccm as prc_local, a.prchm as prc_high, a.prclm as prc_low, a.ajexm as ajexdi,
          coalesce(a.cshom/1e6, a.csfsm/1e3, a.cshoq, b.csho_fund*b.ajex_fund/a.ajexm) as cshoc, /* Notice again that I impute with shares from accounting statements in case it is missing*/
          a.dvpsxm, a.cshtrm, a.curcddvm,
          a.prccm/a.ajexm*a.trfm as ri_local, /*ri_local = local return index [1]*/
          c.fx as fx, d.fx as fx_div
        from comp.secm as a
        left join __firm_shares2 as b on a.gvkey=b.gvkey and a.datadate=b.ddate
        left join fx as c on a.curcdm=c.curcdd and a.datadate=c.date
        left join fx as d on a.curcddvm=d.curcdd and a.datadate=d.date;
        /* NOTE(review): same Gao-Ritter boundary remark as above ('<' on 31DEC2003) */
        update __comp_secm1
        set cshtrm = case when datadate < '01FEB2001'd then cshtrm / 2
                          when datadate <= '31DEC2001'd then cshtrm / 1.8
                          when datadate < '31DEC2003'd then cshtrm / 1.6
                          else cshtrm end
        where exchg = 14;
      quit;
      data __comp_secm2;
        set __comp_secm1;
        /* Price Adjustment */
        if curcdd='USD' then fx=1;
        prc = prc_local*fx;
        prc_high = prc_high*fx;
        prc_low = prc_low*fx;
        me = prc*cshoc;
        dolvol = cshtrm*prc;
        ri = ri_local*fx;
        /* Dividend Adjustment*/
        if curcddvm='USD' then fx_div=1;
        div_tot = dvpsxm*fx_div;
        div_cash = .; /* Not available in SECM*/
        div_spc = .; /* Not available in SECM*/
        drop dvpsxm fx_div curcddvm;
      run;
      %let common_vars=gvkey, iid, datadate, eom, tpci, exchg, curcdd, prc_local, prc_high, prc_low, ajexdi, cshoc, ri_local, fx, prc, me, cshtrm, dolvol, ri, div_tot, div_cash, div_spc;
      proc sql;
        create table __comp_msf4 as
        select &common_vars., prcstd, 'secd' as source from __comp_msf3
        union
        select &common_vars., 10 as prcstd, 'secm' as source from __comp_secm2; /* This ensures that all observations are kept in the __returns step. Erroneous prcstd codes have been screened out */
        create table __comp_msf5 as
        select * from __comp_msf4
        group by gvkey, iid, eom
        having count(*)=1 | (count(*)=2 and source='secd'); /* If a security has an observation in both SECD and SECM. Prefer the observation from SECD*/
      quit;
      proc sort nodupkey data=__comp_msf5(drop=source) out=__comp_msf6; by gvkey iid eom; run; /* DUPLICATES should always be 0!*/
      proc delete data=__comp_msf1 __comp_msf2 __comp_msf3 __comp_msf4 __comp_msf5 __comp_secm1 __comp_secm2; run;
      %let base=__comp_msf6;
      %let period = 'month';
      %let out = comp_msf;
    %end;
    %if &freq_use.=d %then %do;
      %let base=__comp_dsf3;
      %let period = 'day';
      %let out = comp_dsf;
    %end;
    /* Compute Returns */
    proc sort nodupkey data= &base. out=__comp_sf1; by gvkey iid datadate; run; /* Very important to know if there are any duplicates!!*/
    data __returns;
      set __comp_sf1(where = (not missing(ri) and prcstd in (3, 4, 10))); /* The screen is important, see [1] */
      by gvkey iid;
      ret = ri/lag(ri)-1;
      ret_local = ri_local/lag(ri_local)-1;
      ret_lag_dif = intck(&period., lag(datadate), datadate); /* gap (in periods) since previous return observation */
      if first.iid then do;
        ret=.;
        ret_local=.;
        ret_lag_dif=.;
      end;
      /* Handle situations where currency code changes */
      if first.iid=0 and curcdd^=lag(curcdd) then do;
        ret_local = ret;
      end;
      keep gvkey iid datadate ret ret_local ret_lag_dif;
    run;
    /* Handling Delisting */
    data __sec_info; set comp.security comp.g_security; run; /* Combine SECURITY and G_SECURITY*/
    data __delist1;
      set __returns(where=(not missing(ret_local) and ret_local^=0)); /* Take delisting date to be last day of trading with non missing and non zero return [2]*/
      by gvkey iid datadate;
      if last.iid;
    run;
    proc sql;
      create table __delist2 as
      select a.gvkey, a.iid, a.datadate, b.secstat, b.dlrsni
      from __delist1 as a left join __sec_info as b on a.gvkey=b.gvkey and a.iid=b.iid;
      /* dlrsni 02/03 are performance-related deletions: assign -30 pct delisting return */
      create table __delist3 as
      select gvkey, iid, datadate as date_delist,
        case when dlrsni in ('02', '03') then -0.3 else 0 end as dlret
      from __delist2
      where secstat='I';
    quit;
    * Incorporate Delisting Return;
    proc sql;
      create table __comp_sf2 as
      select a.*, b.ret, b.ret_local, b.ret_lag_dif, c.date_delist, c.dlret
      from &base as a
      left join __returns as b on a.gvkey=b.gvkey and a.iid=b.iid and a.datadate=b.datadate
      left join __delist3 as c on a.gvkey=c.gvkey and a.iid=c.iid;
    quit;
    data __comp_sf3;
      set __comp_sf2;
      where datadate<=date_delist or missing(date_delist); /* In a sample of 104,377 this removes 1,434 obs*/
      if datadate=date_delist then do;
        ret = (1+ret)*(1+dlret)-1;
        ret_local = (1+ret_local)*(1+dlret)-1;
      end;
      drop ri ri_local date_delist dlret;
    run;
    /* Excess Return */
    %if &freq_use.=d %then %let scale=21;
    %if &freq_use.=m %then %let scale=1;
    proc sql;
      create table __comp_sf4 as
      select a.*, a.ret-coalesce(b.t30ret, c.rf)/&scale. as ret_exc /* I prefer crsp.mcti but FF has monthly updates */
      from __comp_sf3 as a
      left join crsp.mcti as b on year(a.datadate)=year(b.caldt) and month(a.datadate)=month(b.caldt)
      left join ff.factors_monthly as c on year(a.datadate)=year(c.date) and month(a.datadate)=month(c.date);
    quit;
    /* Add Exchange Information*/
    %comp_exchanges(out=__exchanges);
    proc sql;
      create table __comp_sf5 as
      select a.*, b.excntry, b.exch_main
      from __comp_sf4 as a left join __exchanges as b on a.exchg=b.exchg;
    quit;
    /* Add Primary Security Indicator?*/
    %add_primary_sec(data=__comp_sf5, out=__comp_sf6, date_var=datadate);
    /* Output */
    proc sort nodupkey data=__comp_sf6 out=&out.; by gvkey iid datadate; run;
  %end;
  /* proc freq data=comp_dsf; tables ret_day_dif; run;
     Check: 97.2% of non missing ret_day_dif are <=3. 98.8% are <=4; 99.75% are <=10.
     This might be a reasonable general cutoff for setting returns to 0 */
  proc delete data=__firm_shares1 __firm_shares2 fx __comp_dsf_na __comp_dsf_global __comp_dsf1 __comp_dsf2 __comp_dsf3 __returns __sec_info __delist1 __delist2 __delist3 __comp_sf1 __comp_sf2 __comp_sf3 __comp_sf4 __comp_sf5 __comp_sf6 __exchanges &base.; run;
%mend prepare_comp_sf;

/* COMBINE CRSP AND COMPUSTAT WITH CRSP PREFERENCE*/
%macro combine_crsp_comp_sf(out_msf=, out_dsf=, crsp_msf=, comp_msf=, crsp_dsf=, comp_dsf=);
  /* Stacks the CRSP (US) and Compustat (world) security files into harmonized
     monthly (&out_msf.) and daily (&out_dsf.) datasets. For CRSP, id=permno;
     for Compustat a numeric id is built from a share-class prefix + gvkey + iid digits.
     obs_main flags the preferred record when CRSP and Compustat overlap. */
  /* Monthly Files */
  proc sql;
    create table __msf_world1 as
    select permno as id, PERMNO as permno, PERMCO as permco, GVKEY as gvkey, iid,
      'USA' as excntry length=3, exch_main,
      (shrcd in (10, 11, 12)) as common, 1 as primary_sec, bidask,
      shrcd as crsp_shrcd, exchcd as crsp_exchcd, '' as comp_tpci, . as comp_exchg,
      'USD' as curcd, 1 as fx, date, intnx('month',date,0,'E') as eom format=YYMMDDN8.,
      cfacshr as adjfct, shrout as shares, me, me_company,
      prc, prc as prc_local, prc_high, prc_low, dolvol, vol as tvol,
      RET as ret, ret as ret_local, ret_exc, 1 as ret_lag_dif,
      div_tot, . as div_cash, . as div_spc, 1 as source_crsp
    from &crsp_msf.
    outer union corr
    select case when prxmatch("/W/", iid) then input(cats('3', gvkey, substr(iid, 1, 2)), 9.0)
                when prxmatch("/C/", iid) then input(cats('2', gvkey, substr(iid, 1, 2)), 9.0)
                else input(cats('1', gvkey, substr(iid, 1, 2)), 9.0) /* IID characters can only take 3 possible values: http://zeerovery.nl/blogfiles/Comp-IID.pdf*/
           end as id,
      . as permno, . as permco, gvkey, iid, excntry, exch_main,
      (tpci='0') as common, primary_sec, (prcstd = 4) as bidask,
      . as crsp_shrcd, . as crsp_exchcd, tpci as comp_tpci, exchg as comp_exchg,
      curcdd as curcd, fx, datadate as date, eom,
      ajexdi as adjfct, cshoc as shares, me, me as me_company,
      prc, prc_local, prc_high, prc_low, dolvol, cshtrm as tvol,
      ret_local, ret, ret_exc, ret_lag_dif,
      div_tot, div_cash, div_spc, 0 as source_crsp
    from &comp_msf.;
  quit;
  /* Add Excess Return over Month t+1*/
  proc sort data=__msf_world1 ; by id descending eom; run;
  data __msf_world2;
    set __msf_world1;
    /* Descending sort, so lag(ret_exc) is next month's excess return; only valid
       when the next record is the same id one month ahead (ret_lag_dif=1) */
    ret_exc_lead1m = lag(ret_exc);
    if lag(id)^=id or lag(ret_lag_dif)^=1 then ret_exc_lead1m = .;
  run;
  /* Daily Files */
  proc sql;
    create table __dsf_world1 as
    select permno as id, 'USA' as excntry length=3, exch_main,
      (shrcd in (10, 11, 12)) as common, 1 as primary_sec, bidask,
      'USD' as curcd, 1 as fx, DATE as date format=YYMMDDN8., intnx('month',DATE,0,'E') as eom format=YYMMDDN8.,
      cfacshr as adjfct, shrout as shares, me, dolvol, vol as tvol,
      prc, prc_high, prc_low, ret as ret_local, RET as ret, ret_exc, 1 as ret_lag_dif,
      1 as source_crsp /* More memory efficient with integer instead of character vector*/
    from &crsp_dsf.
    outer union corr
    select case when prxmatch("/W/", iid) then input(cats('3', gvkey, substr(iid, 1, 2)), 9.0)
                when prxmatch("/C/", iid) then input(cats('2', gvkey, substr(iid, 1, 2)), 9.0)
                else input(cats('1', gvkey, substr(iid, 1, 2)), 9.0) /* IID characters can only take 3 possible values: http://zeerovery.nl/blogfiles/Comp-IID.pdf*/
           end as id,
      excntry, exch_main, (tpci='0') as common, primary_sec, (prcstd = 4) as bidask,
      curcdd as curcd, fx, datadate as date, intnx('month',datadate,0,'E') as eom format=YYMMDDN8.,
      ajexdi as adjfct, cshoc as shares, me, dolvol, cshtrd as tvol,
      prc, prc_high, prc_low, ret_local, ret, ret_exc, ret_lag_dif,
      0 as source_crsp
    from &comp_dsf.;
  quit;
  /* Choose the main observation based on monthly data */
  * If multiple observations for the same GVKEY-IID, Then choose CRSP as the main observation;
  proc sql;
    create table __obs_main as
    select id, gvkey, iid, eom,
      (count(gvkey) in (0, 1) or (count(gvkey)>1 and source_crsp=1)) as obs_main
    from __msf_world2
    group by gvkey, iid, eom;
    create table __msf_world3 as
    select a.*, b.obs_main
    from __msf_world2 as a left join __obs_main as b on a.id = b.id and a.eom = b.eom;
    create table __dsf_world2 as
    select a.*, b.obs_main
    from __dsf_world1 as a left join __obs_main as b on a.id = b.id and a.eom = b.eom;
  quit;
  proc sort data=__msf_world3 out=&out_msf. nodupkey; by id eom; run;
  proc sort data=__dsf_world2 out=&out_dsf. nodupkey; by id date; run;
  proc delete data= __msf_world1 __msf_world2 __msf_world3 __dsf_world1 __dsf_world2 __obs_main; run;
%mend combine_crsp_comp_sf;

* MACRO: CLEAN_COMP_MSF
  - Remove obvious Compustat data errors by setting return to missing.
  - Currently only implemented for monthly Compustat file, should be expanded to daily file
;
%macro clean_comp_msf(data=);
  /* Hard-coded security-month pairs with known erroneous returns */
  proc sql;
    update &data.
    set ret=., ret_local=., ret_exc=.
    where gvkey = '002137' and iid = '01C' and eom in ('31DEC1983'd, '31JAN1984'd);
    update &data.
    set ret=., ret_local=., ret_exc=.
    where gvkey = '013633' and iid = '01W' and eom in ('28FEB1995'd);
  quit;
%mend;

* MACRO: MARKET RETURNS
  If wins_comp=1, need to supply wins_data as well.
;
%macro market_returns(out=, data=, freq=m, wins_comp=1, wins_data=, cap_data=);
  /* Country-level market returns (value-weighted, capped value-weighted and
     equal-weighted; USD, local and excess versions), weighted by lagged ME.
     - wins_comp=1: winsorize Compustat-sourced returns at the cutoffs in &wins_data.
     - cap_data: NYSE size cutoffs; ME is capped at the 80th percentile for the
       capped value weights */
  %if &freq.=d %then %do;
    %let dt_col = date;
    %let max_date_lag = 14;
  %end;
  %if &freq.=m %then %do;
    %let dt_col = eom;
    %let max_date_lag = 1;
  %end;
  /* Create Index Data */
  proc sql;
    create table updated_data as
    select a.*, b.nyse_p80
    from &data. as a left join &cap_data. as b on a.eom = b.eom;
  quit;
  data updated_data;
    set updated_data;
    if me <= nyse_p80 then me_cap = me;
    else me_cap = nyse_p80;
    drop nyse_p80;
  run;
  /* Create Index Data */
  proc sql;
    create table __common_stocks1 as
    select distinct source_crsp, id, date, eom, excntry, obs_main, exch_main, primary_sec, common, ret_lag_dif, me, me_cap, dolvol, ret, ret_local, ret_exc
    from updated_data
    order by id, &dt_col.;
  quit;
  data __common_stocks2;
    set __common_stocks1;
    by id;
    me_lag1 = lag(me);
    me_cap_lag1 = lag(me_cap);
    dolvol_lag1 = lag(dolvol);
    if first.id then do;
      me_lag1 = .;
      me_cap_lag1 = .;
      dolvol_lag1 = .;
    end;
  run;
  %if &wins_comp. = 1 %then %do;
    %if &freq.=m %then %do;
      proc sql;
        create table __common_stocks3 as
        select a.*, b.ret_exc_0_1, b.ret_exc_99_9, b.ret_0_1, b.ret_99_9, b.ret_local_0_1, b.ret_local_99_9
        from __common_stocks2 as a left join &wins_data. as b on a.eom=b.eom;
      quit;
    %end;
    %if &freq.=d %then %do;
      proc sql;
        create table __common_stocks3 as
        select a.*, b.ret_exc_0_1, b.ret_exc_99_9, b.ret_0_1, b.ret_99_9, b.ret_local_0_1, b.ret_local_99_9
        from __common_stocks2 as a left join &wins_data. as b on year(a.date)=b.year and month(a.date)=b.month;
      quit;
    %end;
    /* Winsorization is applied to Compustat observations only (source_crsp=0) */
    proc sql;
      * Winsorize returns;
      update __common_stocks3 set ret = ret_99_9 where ret > ret_99_9 and source_crsp = 0 and not missing(ret);
      update __common_stocks3 set ret = ret_0_1 where ret < ret_0_1 and source_crsp = 0 and not missing(ret);
      * Winsorize local returns;
      update __common_stocks3 set ret_local = ret_local_99_9 where ret_local > ret_local_99_9 and source_crsp = 0 and not missing(ret_local);
      update __common_stocks3 set ret_local = ret_local_0_1 where ret_local < ret_local_0_1 and source_crsp = 0 and not missing(ret_local);
      * Winsorize excess returns;
      update __common_stocks3 set ret_exc = ret_exc_99_9 where ret_exc > ret_exc_99_9 and source_crsp = 0 and not missing(ret_exc);
      update __common_stocks3 set ret_exc = ret_exc_0_1 where ret_exc < ret_exc_0_1 and source_crsp = 0 and not missing(ret_exc);
      alter table __common_stocks3 drop ret_exc_0_1, ret_exc_99_9, ret_0_1, ret_99_9, ret_local_0_1, ret_local_99_9;
    quit;
  %end;
  %if &wins_comp. = 0 %then %do;
    data __common_stocks3;
      set __common_stocks2;
    run;
  %end;
  proc sql;
    create table mkt1 as
    select excntry, &dt_col., count(*) as stocks,
      sum(me_lag1) as me_lag1, sum(me_cap_lag1) as me_cap_lag1, sum(dolvol_lag1) as dolvol_lag1,
      sum(ret_local*me_lag1)/(calculated me_lag1) as mkt_vw_lcl,
      sum(ret_local*me_cap_lag1)/(calculated me_cap_lag1) as mkt_vw_cap_lcl,
      mean(ret_local) as mkt_ew_lcl,
      sum(ret*me_lag1)/(calculated me_lag1) as mkt_vw,
      sum(ret*me_cap_lag1)/(calculated me_cap_lag1) as mkt_vw_cap,
      mean(ret) as mkt_ew,
      sum(ret_exc*me_lag1)/(calculated me_lag1) as mkt_vw_exc,
      sum(ret_exc*me_cap_lag1)/(calculated me_cap_lag1) as mkt_vw_cap_exc,
      mean(ret_exc) as mkt_ew_exc
    from __common_stocks3
    where obs_main = 1 and exch_main = 1 and primary_sec = 1 and common = 1 and ret_lag_dif <= &max_date_lag. and not missing(me_lag1) and not missing(ret_local)
    group by excntry, &dt_col.;
  quit;
  %if &freq.=m %then %do;
    data &out.;
      set mkt1;
    run;
  %end;
  %if &freq.=d %then %do;
    proc sql;
      create table &out. as
      select * from mkt1
      group by excntry, year(date), month(date)
      having stocks / max(stocks) >= 0.25; /* With less than 25% of stocks trading, it's likely that the date is not an official trading date */
    quit;
  %end;
  proc delete data= __common_stocks1 __common_stocks2 __common_stocks3 mkt1; run;
%mend;

* MACRO: AP_FACTORS
  - This macro creates the factors from the 3-factors model of Fama and French (1993) as well as the factors
    from the 4-factor of Hou, Xue and Zhang (2015). Factors other than market and small minus big, are created
    using an unconditional double sort on sort and the underlying characteristics following the methodology of
    Fama and French (1993). Breakpoints are based on all non-microcap stocks within a country.
  Arguments
  - out: Name of output dataset
  - freq: In (d, m) i.e. either daily or monthly
  - sf: Dataset of stock returns at the chosen frequency
  - mchars: Dataset of characteristics with monthly frequency
  - mkt: dataset with market returns. This is also used to identify trading days.
  - min_stocks_bp: Minimum number of stocks used to create breakpoints.
  - min_stocks_pf: Minimum number of stocks available in the beginning of a month, to create a valid portfolio.
;
%macro ap_factors(out=, freq=, sf=, mchars=, mkt=, min_stocks_bp=, min_stocks_pf=);
  %if &freq.=d %then %do;
    /* Daily return data */
    proc sql;
      create table world_sf1 as
      select excntry, id, date, eom, ret_exc
      from &sf.
where ret_lag_dif <= 5 and not missing(ret_exc); * Impose a maximum lag of 5 days between return calculation; quit; %winsorize_own(inset=world_sf1, outset=world_sf2, sortvar=eom, vars=ret_exc, perc_low=0.1, perc_high=99.9); /* Winsorize Returns at 0.1% (Note I winsorize by month) */ %let __date_col = date; %end; %if &freq.=m %then %do; /* Monthly return data */ proc sql; create table world_sf1 as select excntry, id, eom, ret_exc from &sf. where ret_lag_dif = 1 and not missing(ret_exc); quit; %winsorize_own(inset=world_sf1, outset=world_sf2, sortvar=eom, vars=ret_exc, perc_low=0.1, perc_high=99.9); /* Winsorize Returns at 0.1% */ %let __date_col = eom; %end; /* Sorting Variables */ proc sql; create table base1 as select id, eom, size_grp, excntry, me, market_equity, be_me, at_gr1, niq_be, source_crsp, exch_main, obs_main, common, comp_exchg, crsp_exchcd, primary_sec, ret_lag_dif from &mchars.; quit; proc sort data=base1 out=base2; by id eom; run; %macro temp(); /* Lag variables used at portfolio rebalacing */ %let cols_lag = comp_exchg crsp_exchcd exch_main obs_main common primary_sec excntry size_grp me be_me at_gr1 niq_be; data base3; set base2; by id eom; %do i=1 %to %nwords(&cols_lag.); %let col = %scan(&cols_lag., &i, %str(' ')); &col._l = lag(&col.); if id ^= lag(id) or source_crsp ^= lag(source_crsp) or intck("month", lag(eom), eom)^=1 then &col._l = .; drop &col.; %end; run; %mend; %temp(); /* Screens */ proc sql; create table base4 as select *, case when missing(size_grp_l) then '' when size_grp_l in ('large', 'mega') then 'big' else 'small' end as size_pf from base3 where obs_main_l = 1 and exch_main_l = 1 and common_l = 1 and primary_sec_l = 1 and ret_lag_dif = 1 and not missing(me_l) order by excntry_l, size_grp_l, eom; quit; /* Factors: Three-by-two sort on var and size Fama-French Style*/ %macro sort_ff_style(out=, char=, freq=, min_stocks_bp=, min_stocks_pf=); * Breakpoints (based on NYSE stocks in the US and non-microcap stocks outside of the US); 
proc sql; create table bp_stocks as select * from base4 where ((size_grp_l in ('small', 'large', 'mega') and excntry_l ^= 'USA') or ((crsp_exchcd_l=1 or comp_exchg_l=11) and excntry_l = 'USA')) and not missing(&char._l) order by eom, excntry_l; quit; proc means data=bp_stocks noprint; by eom excntry_l; var &char._l; output out=bps(drop=_type_ _freq_) N=n P30 = bp_p30 P70=bp_p70; run; * Create weights by end of month; proc sql; create table weights1 as select a.excntry_l, a.id, a.eom, a.size_pf, a.me_l, case when a.&char._l >= b.bp_p70 then 'high' when a.&char._l >= b.bp_p30 then 'mid' else 'low' end as char_pf from base4 as a left join bps as b on a.excntry_l = b.excntry_l and a.eom = b.eom where b.n >= &min_stocks_bp. and not missing(a.&char._l) and size_pf^=''; create table weights2 as select *, me_l / sum(me_l) as w from weights1 group by excntry_l, size_pf, char_pf, eom having count(*) >= &min_stocks_pf.; quit; * Match with return data; proc sql; create table returns as select a.*, b.w, b.size_pf, b.char_pf from world_sf2 as a inner join weights2 as b on a.id = b.id and a.eom=b.eom and a.excntry=b.excntry_l; quit; * Create portfolio returns; proc sql; create table pfs1 as select "&char." as characteristic, excntry, size_pf, char_pf, &__date_col., sum(ret_exc*w) as ret_exc from returns group by excntry, size_pf, char_pf, &__date_col.; quit; proc sort data=pfs1 out = pfs2; by characteristic excntry &__date_col.; run; proc transpose data=pfs2 delimiter=_ out=pfs3(drop=_name_); by characteristic excntry &__date_col.; var ret_exc; id size_pf char_pf; run; data &out.; set pfs3; lms = (small_high + big_high) / 2 - (small_low + big_low) / 2; smb = (small_high + small_mid + small_low) / 3 - (big_high + big_mid + big_low) / 3; keep characteristic excntry &__date_col. 
lms smb; run; %mend; /* Create Individual Factors */ %sort_ff_style(out=book_to_market, char=be_me, min_stocks_bp = &min_stocks_bp., min_stocks_pf = &min_stocks_pf.); %sort_ff_style(out=asset_growth, char=at_gr1, min_stocks_bp = &min_stocks_bp., min_stocks_pf = &min_stocks_pf.); %sort_ff_style(out=roeq, char=niq_be, min_stocks_bp = &min_stocks_bp., min_stocks_pf = &min_stocks_pf.); /* Fama and French (1993) */ data ff; set book_to_market; rename lms = hml; rename smb = smb_ff; run; /* Hou, Xue and Zhang (2015) */ data hxz1; set asset_growth roeq; run; proc transpose data=hxz1 out=hxz2; by characteristic excntry &__date_col.; var lms smb; run; proc sort data=hxz2 out = hxz3; by excntry &__date_col.; run; proc transpose data=hxz3 delimiter=_ out=hxz4(drop=_name_); by excntry &__date_col.; var col1; id characteristic _name_; run; data hxz; set hxz4; rename niq_be_lms = roe; smb_hxz = (at_gr1_smb + niq_be_smb) / 2; inv = -at_gr1_lms; run; /* Factor Dataset */ proc sql; create table &out. as select a.excntry, a.&__date_col., a.mkt_vw_exc as mktrf, b.hml, b.smb_ff, c.roe, c.inv, c.smb_hxz from &mkt. as a left join ff as b on a.excntry = b.excntry and a.&__date_col. = b.&__date_col. left join hxz as c on a.excntry = c.excntry and a.&__date_col. = c.&__date_col.; quit; %mend; * MACRO: FILE DELETE; %macro file_delete(file); %let rc= %sysfunc(filename(fref,&file)); %let rc= %sysfunc(fdelete(&fref)); %mend; * MACRO: SAVE_MAIN_DATA_CSV - The macro saves the main data as separate .csv files by country. - By main data we mean data for common stocks that are the primary security of the underlying firm with non-missing return and lagged market equity Arguments - out: Name of the output Zip file (will be saved in &path.) - data: should be the path to the main sas dataset - path: path where data is stored. 
Should be a scratch directory ; %macro save_main_data_csv(out=, data=, path=, end_date=); * Lagged me data; data main_data1; set &data.; me_lag1 = lag(me); if id ^= lag(id) or intck("month", lag(eom), eom)^=1 then me_lag1 = .; run; * Reorder Variables; data main_data2; retain id date eom source_crsp size_grp obs_main exch_main primary_sec gvkey iid permno permco excntry curcd fx common comp_tpci crsp_shrcd comp_exchg crsp_exchcd adjfct shares me me_lag1; set main_data1; run; * Screen; proc sql; create table main_data3 as select * from main_data2 where primary_sec = 1 and common = 1 and obs_main = 1 and exch_main = 1 and eom <= &end_date.; /*removed not missing(me_lag1) and and ret_lag_dif = 1 and not missing(ret_exc)*/ quit; proc sql noprint; select distinct lowcase(excntry) into :countries separated by ' ' from main_data3; quit; /* Create country .csv files */ %do i=1 %to %nwords(&countries.); %put ################ "&path./&__c..csv" ########################; option nonotes; %let __c = %scan(&countries., &i., %str(' ')); proc export data=main_data3(where=(excntry = upcase("&__c."))) outfile="&path./&__c..csv" dbms=CSV replace; run; option notes; %end; * Zip file for easier download; ods package (newzip) open nopf; %do i=1 %to %nwords(&countries.); %let __c = %scan(&countries., &i., %str(' ')); ods package (newzip) add file="&path./&__c..csv"; %end; ods package (newzip) publish archive properties ( archive_name="&out..zip" archive_path= "&path." ); ods package(newzip) close; /* Delete intermidiate .csv files */ %do i=1 %to %nwords(&countries.); %let __c = %scan(&countries., &i., %str(' ')); %file_delete(file=&path./&__c..csv); %end; proc delete data = main_data1 main_data2 main_data3; run; %mend; * MACRO: SAVE_DAILY_RET_CSV - The macro saves the daily return data as a separate .csv file country-by-country. Arguments - out: Name of the output Zip file (will be saved in &path.) - data: should be the path to world_dsf - path: path where data is stored. 
Should be a scratch directory - end_date: restricts the output to till the end_date ; %macro save_daily_ret_csv(out=, data=, path=, end_date=); data daily; set &data.; where date <= &end_date.; keep excntry id date source_crsp me ret ret_exc; run; proc sql noprint; select distinct lowcase(excntry) into :countries separated by ' ' from daily; quit; /* Create country .csv files */ option nonotes; %do i=1 %to %nwords(&countries.); %let __c = %scan(&countries., &i., %str(' ')); %put ################ "&path./&__c..csv" ########################; proc export data=daily(where=(excntry = upcase("&__c."))) outfile="&path./&__c..csv" dbms=CSV replace; run; %end; option notes; * Zip file for easier download; ods package (newzip) open nopf; %do i=1 %to %nwords(&countries.); %let __c = %scan(&countries., &i., %str(' ')); ods package (newzip) add file="&path./&__c..csv"; %end; ods package (newzip) publish archive properties ( archive_name="&out..zip" archive_path= "&path." ); ods package(newzip) close; /* Delete intermidiate .csv files */ %do i=1 %to %nwords(&countries.); %let __c = %scan(&countries., &i., %str(' ')); %file_delete(file=&path./&__c..csv); %end; proc delete data = daily; run; %mend; * MACRO: SAVE_MONTHLY_RET_CSV - The macro saves the monthly return data as a .csv across all countries. Arguments - out: Name of the output Zip file (will be saved in &path.) - data: should be the path to world_msf - path: path where data is stored. 
Should be a scratch directory - end_date: restricts the output to till the end_date ; %macro save_monthly_ret_csv(out=, data=, path=, end_date=); data monthly; set &data.; where eom <= &end_date.; keep excntry id source_crsp eom me ret_exc ret ret_local; run; proc export data=monthly outfile="&path./world_ret_monthly.csv" dbms=CSV replace; run; * Zip file for easier download; ods package (newzip) open nopf; ods package (newzip) add file="&path./world_ret_monthly.csv"; ods package (newzip) publish archive properties ( archive_name="&out..zip" archive_path= "&path." ); ods package(newzip) close; * Delete intermidiate .csv files; %file_delete(file=&path./world_ret_monthly.csv); proc delete data = monthly; run; %mend; /* Footnotes */* [1]: This screen ensures that the return is always computed between two days with a price. The ret_day_dif lets the user check if it's within a reasonable range. Compustat have some observations whre prc and curcdd is missing and div and curcddiv is not missing. In other words dividend ex-date occuring outside trading days should not be used to compute returns. Further, the screen on prcstd ensures that returns are computed with non-stale prices. Specifically prcstd in (3, 4, 10) ensures that the price is taken from observable market data. The main excluded prcstd is 5, which is "No prices available, last actual price was carried forward". This is prevalent in the G_SECD where it accounts for 14.5% of the observations. [2]: Compustat also include a security inactivation date called 'dldtei'. In a subsample of 300 inactive securities, The difference in months between the date I take to be the delisting date and dldtei is 0 for 67% and 1 for 14%. The rest of the observations are scattered from -432 to 317. I think this validates my choice but is is not completely clear. 
; ================================================ FILE: README.md ================================================ ## Overview This repository contains the code used for the paper [Is There a Replication Crisis in Finance?](https://onlinelibrary.wiley.com/doi/10.1111/jofi.13249) by Jensen, Kelly and Pedersen (2023). Please cite this paper if you are using the code or data: ``` @article{JensenKellyPedersen2023, author = {Jensen, Theis Ingerslev and Kelly, Bryan and Pedersen, Lasse Heje}, title = {Is There a Replication Crisis in Finance?}, journal = {The Journal of Finance}, volume = {78}, number = {5}, pages = {2465-2518}, year = {2023} } ``` Follow this [link](https://www.dropbox.com/sh/61j1v0sieq9z210/AACdJ68fs5_eT_eJMunwMBWia?dl=0) for detailed documentation of the data sets. The code consists of the following two self-contained components: - [GlobalFactors](https://github.com/bkelly-lab/ReplicationCrisis/tree/master/GlobalFactors) is a folder with code that creates data sets of global stock returns, firm characteristics, and global long-short factors. __Note that the data can be downloaded without running the code__. The global stock return and firm characteristics can be downloaded from WRDS ([link](https://wrds-www.wharton.upenn.edu/pages/get-data/contributed-data-forms/global-factor-data/)). The long-short factor returns used in the paper can be downloaded [here](https://www.dropbox.com/sh/wcrjok1qyxtrasi/AABZ90GDCUvIzDzijt8Qoo3ha?dl=0), and the latest version of the factor returns is available via the weblink below. In addition, we keep a folder with the latest versions of the factor returns and additional data such as the underlying portfolios, market returns, and industry returns ([link](https://www.dropbox.com/sh/xq278bryrj0qf9s/AABUTvTGok91kakyL07LKyQoa?dl=0)). - [Analysis](https://github.com/bkelly-lab/ReplicationCrisis/tree/master/Analysis) is a folder that contains the analysis in the paper, including all figures and tables.
This folder takes the global factors as input (either the ones that can be downloaded or the ones that you construct yourself). See also the website [https://JKPfactors.com/](https://JKPfactors.com/), where the most recent long-short factors can be downloaded using a simple drop-down menu.